
Evaluate

BaseModel

Bases: Protocol

Protocol for pydantic BaseModel to ensure compatibility with context

Source code in python/scouter/evaluate/_evaluate.pyi
class BaseModel(Protocol):
    """Protocol for pydantic BaseModel to ensure compatibility with context"""

    def model_dump(self) -> Dict[str, Any]:
        """Dump the model as a dictionary"""

    def model_dump_json(self) -> str:
        """Dump the model as a JSON string"""

    def __str__(self) -> str:
        """String representation of the model"""

__str__()

String representation of the model

Source code in python/scouter/evaluate/_evaluate.pyi
def __str__(self) -> str:
    """String representation of the model"""

model_dump()

Dump the model as a dictionary

Source code in python/scouter/evaluate/_evaluate.pyi
def model_dump(self) -> Dict[str, Any]:
    """Dump the model as a dictionary"""

model_dump_json()

Dump the model as a JSON string

Source code in python/scouter/evaluate/_evaluate.pyi
def model_dump_json(self) -> str:
    """Dump the model as a JSON string"""

EvaluationConfig

Configuration options for LLM evaluation.

Source code in python/scouter/evaluate/_evaluate.pyi
class EvaluationConfig:
    """Configuration options for LLM evaluation."""

    def __init__(
        self,
        embedder: Optional[Embedder] = None,
        embedding_targets: Optional[List[str]] = None,
        compute_similarity: bool = False,
        cluster: bool = False,
        compute_histograms: bool = False,
    ):
        """
        Initialize the EvaluationConfig with optional parameters.

        Args:
            embedder (Optional[Embedder]):
                Optional Embedder instance to use for generating embeddings for similarity-based metrics.
                If not provided, no embeddings will be generated.
            embedding_targets (Optional[List[str]]):
                Optional list of context keys to generate embeddings for. If not provided, embeddings will
                be generated for all string fields in the record context.
            compute_similarity (bool):
                Whether to compute similarity between embeddings. Default is False.
            cluster (bool):
                Whether to perform clustering on the embeddings. Default is False.
            compute_histograms (bool):
                Whether to compute histograms for all calculated features (metrics, embeddings, similarities).
                Default is False.
        """

__init__(embedder=None, embedding_targets=None, compute_similarity=False, cluster=False, compute_histograms=False)

Initialize the EvaluationConfig with optional parameters.

Parameters:

embedder (Optional[Embedder], default: None)
    Optional Embedder instance to use for generating embeddings for similarity-based metrics. If not provided, no embeddings will be generated.

embedding_targets (Optional[List[str]], default: None)
    Optional list of context keys to generate embeddings for. If not provided, embeddings will be generated for all string fields in the record context.

compute_similarity (bool, default: False)
    Whether to compute similarity between embeddings.

cluster (bool, default: False)
    Whether to perform clustering on the embeddings.

compute_histograms (bool, default: False)
    Whether to compute histograms for all calculated features (metrics, embeddings, similarities).
Source code in python/scouter/evaluate/_evaluate.pyi
def __init__(
    self,
    embedder: Optional[Embedder] = None,
    embedding_targets: Optional[List[str]] = None,
    compute_similarity: bool = False,
    cluster: bool = False,
    compute_histograms: bool = False,
):
    """
    Initialize the EvaluationConfig with optional parameters.

    Args:
        embedder (Optional[Embedder]):
            Optional Embedder instance to use for generating embeddings for similarity-based metrics.
            If not provided, no embeddings will be generated.
        embedding_targets (Optional[List[str]]):
            Optional list of context keys to generate embeddings for. If not provided, embeddings will
            be generated for all string fields in the record context.
        compute_similarity (bool):
            Whether to compute similarity between embeddings. Default is False.
        cluster (bool):
            Whether to perform clustering on the embeddings. Default is False.
        compute_histograms (bool):
            Whether to compute histograms for all calculated features (metrics, embeddings, similarities).
            Default is False.
    """

LLMEvalMetric

Defines an LLM eval metric to use when evaluating LLMs

Source code in python/scouter/evaluate/_evaluate.pyi
class LLMEvalMetric:
    """Defines an LLM eval metric to use when evaluating LLMs"""

    def __init__(self, name: str, prompt: Prompt):
        """
        Initialize an LLMEvalMetric to use for evaluating LLMs. This is
        most commonly used in conjunction with `evaluate_llm` where LLM inputs
        and responses can be evaluated against a variety of user-defined metrics.

        Args:
            name (str):
                Name of the metric
            prompt (Prompt):
                Prompt to use for the metric. For example, a user may create
                an accuracy analysis prompt or a query reformulation analysis prompt.
        """

    def __str__(self) -> str:
        """
        String representation of the LLMEvalMetric
        """

__init__(name, prompt)

Initialize an LLMEvalMetric to use for evaluating LLMs. This is most commonly used in conjunction with evaluate_llm where LLM inputs and responses can be evaluated against a variety of user-defined metrics.

Parameters:

name (str, required)
    Name of the metric.

prompt (Prompt, required)
    Prompt to use for the metric. For example, a user may create an accuracy analysis prompt or a query reformulation analysis prompt.
Source code in python/scouter/evaluate/_evaluate.pyi
def __init__(self, name: str, prompt: Prompt):
    """
    Initialize an LLMEvalMetric to use for evaluating LLMs. This is
    most commonly used in conjunction with `evaluate_llm` where LLM inputs
    and responses can be evaluated against a variety of user-defined metrics.

    Args:
        name (str):
            Name of the metric
        prompt (Prompt):
            Prompt to use for the metric. For example, a user may create
            an accuracy analysis prompt or a query reformulation analysis prompt.
    """

__str__()

String representation of the LLMEvalMetric

Source code in python/scouter/evaluate/_evaluate.pyi
def __str__(self) -> str:
    """
    String representation of the LLMEvalMetric
    """

LLMEvalRecord

LLM record containing context tied to a Large Language Model interaction that is used to evaluate LLM responses.

Examples:

>>> record = LLMEvalRecord(
...     id="123",
...     context={
...         "input": "What is the capital of France?",
...         "response": "Paris is the capital of France.",
...     },
... )
>>> print(record.context["input"])
What is the capital of France?
Source code in python/scouter/evaluate/_evaluate.pyi
class LLMEvalRecord:
    """LLM record containing context tied to a Large Language Model interaction
    that is used to evaluate LLM responses.


    Examples:
        >>> record = LLMEvalRecord(
        ...     id="123",
        ...     context={
        ...         "input": "What is the capital of France?",
        ...         "response": "Paris is the capital of France.",
        ...     },
        ... )
        >>> print(record.context["input"])
        What is the capital of France?
    """

    def __init__(
        self,
        context: Context,
        id: Optional[str] = None,
    ) -> None:
        """Creates a new LLM record to associate with an `LLMDriftProfile`.
        The record is sent to the `Scouter` server via the `ScouterQueue` and is
        then used to inject context into the evaluation prompts.

        Args:
            context:
                Additional context information as a dictionary or a pydantic BaseModel. During evaluation,
                this will be merged with the input and response data and passed to the assigned
                evaluation prompts. If your evaluation prompts expect additional context via
                bound variables (e.g., `${foo}`), you can pass that here as key-value pairs,
                e.g. {"foo": "bar"}.
            id:
                Unique identifier for the record. If not provided, a new UUID will be generated.
                This is helpful when joining evaluation results back to the original request.

        Raises:
            TypeError: If context is not a dict or a pydantic BaseModel.

        """

    @property
    def context(self) -> Dict[str, Any]:
        """Get the contextual information.

        Returns:
            The context data as a Python object (deserialized from JSON).
        """

context property

Get the contextual information.

Returns:

Dict[str, Any]
    The context data as a Python object (deserialized from JSON).

__init__(context, id=None)

Creates a new LLM record to associate with an LLMDriftProfile. The record is sent to the Scouter server via the ScouterQueue and is then used to inject context into the evaluation prompts.

Parameters:

context (Context, required)
    Additional context information as a dictionary or a pydantic BaseModel. During evaluation, this will be merged with the input and response data and passed to the assigned evaluation prompts. If your evaluation prompts expect additional context via bound variables (e.g., ${foo}), you can pass that here as key-value pairs, e.g. {"foo": "bar"}.

id (Optional[str], default: None)
    Unique identifier for the record. If not provided, a new UUID will be generated. This is helpful when joining evaluation results back to the original request.

Raises:

TypeError
    If context is not a dict or a pydantic BaseModel.

Source code in python/scouter/evaluate/_evaluate.pyi
def __init__(
    self,
    context: Context,
    id: Optional[str] = None,
) -> None:
    """Creates a new LLM record to associate with an `LLMDriftProfile`.
    The record is sent to the `Scouter` server via the `ScouterQueue` and is
    then used to inject context into the evaluation prompts.

    Args:
        context:
            Additional context information as a dictionary or a pydantic BaseModel. During evaluation,
            this will be merged with the input and response data and passed to the assigned
            evaluation prompts. If your evaluation prompts expect additional context via
            bound variables (e.g., `${foo}`), you can pass that here as key-value pairs,
            e.g. {"foo": "bar"}.
        id:
            Unique identifier for the record. If not provided, a new UUID will be generated.
            This is helpful when joining evaluation results back to the original request.

    Raises:
        TypeError: If context is not a dict or a pydantic BaseModel.

    """

LLMEvalResults

Defines the results of an LLM eval metric

Source code in python/scouter/evaluate/_evaluate.pyi
class LLMEvalResults:
    """Defines the results of an LLM eval metric"""

    def __getitem__(self, key: str) -> LLMEvalTaskResult:
        """Get the task results for a specific record ID. A RuntimeError will be raised if the record ID does not exist."""

    def __str__(self):
        """String representation of the LLMEvalResults"""

    def to_dataframe(self, polars: bool = False) -> Any:
        """
        Convert the results to a Pandas or Polars DataFrame.

        Args:
            polars (bool):
                Whether to return a Polars DataFrame. If False, a Pandas DataFrame will be returned.

        Returns:
            DataFrame:
                A Pandas or Polars DataFrame containing the results.
        """

    def model_dump_json(self) -> str:
        """Dump the results as a JSON string"""

    @staticmethod
    def model_validate_json(json_string: str) -> "LLMEvalResults":
        """Validate and create an LLMEvalResults instance from a JSON string

        Args:
            json_string (str):
                JSON string to validate and create the LLMEvalResults instance from.
        """

    @property
    def errored_tasks(self) -> List[str]:
        """Get a list of record IDs that had errors during evaluation"""

    @property
    def histograms(self) -> Optional[Dict[str, Histogram]]:
        """Get histograms for all calculated features (metrics, embeddings, similarities)"""

errored_tasks property

Get a list of record IDs that had errors during evaluation

histograms property

Get histograms for all calculated features (metrics, embeddings, similarities)

__getitem__(key)

Get the task results for a specific record ID. A RuntimeError will be raised if the record ID does not exist.

Source code in python/scouter/evaluate/_evaluate.pyi
def __getitem__(self, key: str) -> LLMEvalTaskResult:
    """Get the task results for a specific record ID. A RuntimeError will be raised if the record ID does not exist."""

__str__()

String representation of the LLMEvalResults

Source code in python/scouter/evaluate/_evaluate.pyi
def __str__(self):
    """String representation of the LLMEvalResults"""

model_dump_json()

Dump the results as a JSON string

Source code in python/scouter/evaluate/_evaluate.pyi
def model_dump_json(self) -> str:
    """Dump the results as a JSON string"""

model_validate_json(json_string) staticmethod

Validate and create an LLMEvalResults instance from a JSON string

Parameters:

json_string (str, required)
    JSON string to validate and create the LLMEvalResults instance from.
Source code in python/scouter/evaluate/_evaluate.pyi
@staticmethod
def model_validate_json(json_string: str) -> "LLMEvalResults":
    """Validate and create an LLMEvalResults instance from a JSON string

    Args:
        json_string (str):
            JSON string to validate and create the LLMEvalResults instance from.
    """

to_dataframe(polars=False)

Convert the results to a Pandas or Polars DataFrame.

Parameters:

polars (bool, default: False)
    Whether to return a Polars DataFrame. If False, a Pandas DataFrame will be returned.

Returns:

DataFrame (Any)
    A Pandas or Polars DataFrame containing the results.

Source code in python/scouter/evaluate/_evaluate.pyi
def to_dataframe(self, polars: bool = False) -> Any:
    """
    Convert the results to a Pandas or Polars DataFrame.

    Args:
        polars (bool):
            Whether to return a Polars DataFrame. If False, a Pandas DataFrame will be returned.

    Returns:
        DataFrame:
            A Pandas or Polars DataFrame containing the results.
    """

LLMEvalTaskResult

Eval Result for a specific evaluation

Source code in python/scouter/evaluate/_evaluate.pyi
class LLMEvalTaskResult:
    """Eval Result for a specific evaluation"""

    @property
    def id(self) -> str:
        """Get the record id associated with this result"""

    @property
    def metrics(self) -> Dict[str, Score]:
        """Get the list of metrics"""

    @property
    def embedding(self) -> Dict[str, List[float]]:
        """Get embeddings of embedding targets"""

embedding property

Get embeddings of embedding targets

id property

Get the record id associated with this result

metrics property

Get the list of metrics
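
A sketch of per-record access, assuming `results` came from evaluate_llm (see below) and a record with id "request-123" was evaluated:

task = results["request-123"]  # raises RuntimeError for unknown record IDs
print(task.id)
for name, score in task.metrics.items():  # Dict[str, Score]
    print(name, score)
# Embeddings are only present when an Embedder was configured.
print(list(task.embedding))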

evaluate_llm(records, metrics, config=None)

Evaluate LLM responses using the provided evaluation metrics.

Parameters:

records (List[LLMEvalRecord], required)
    List of LLM evaluation records to evaluate.

metrics (List[LLMEvalMetric], required)
    List of LLMEvalMetric instances to use for evaluation.

config (Optional[EvaluationConfig], default: None)
    Optional EvaluationConfig instance to configure evaluation options.

Returns:

LLMEvalResults

Source code in python/scouter/evaluate/_evaluate.pyi
def evaluate_llm(
    records: List[LLMEvalRecord],
    metrics: List[LLMEvalMetric],
    config: Optional[EvaluationConfig] = None,
) -> LLMEvalResults:
    """
    Evaluate LLM responses using the provided evaluation metrics.

    Args:
        records (List[LLMEvalRecord]):
            List of LLM evaluation records to evaluate.
        metrics (List[LLMEvalMetric]):
            List of LLMEvalMetric instances to use for evaluation.
        config (Optional[EvaluationConfig]):
            Optional EvaluationConfig instance to configure evaluation options.

    Returns:
        LLMEvalResults
    """