# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

from typing import TYPE_CHECKING, Dict, List, Optional
from datetime import datetime
from typing_extensions import Literal

from pydantic import Field as FieldInfo

from .._models import BaseModel

# Public names of this module; these are what `from <module> import *` exports.
__all__ = ["Evaluation", "ModelInput"]


class ModelInput(BaseModel):
    # Inputs given to the LLM to generate the output: a required `user_prompt`,
    # an optional `context`, and arbitrary additional properties (see the
    # `__pydantic_extra__` declaration at the bottom of the class).

    user_prompt: str
    """The user prompt used to generate the output."""

    context: Optional[str] = None
    """Optional context supplied to the LLM when generating the output."""

    # Both branches below declare the same `__pydantic_extra__` attribute. The
    # `TYPE_CHECKING` branch is only seen by static type checkers; the `else`
    # branch is the one executed at runtime.
    if TYPE_CHECKING:
        # Some versions of Pydantic <2.8.0 have a bug and don’t allow assigning a
        # value to this field, so for compatibility we avoid doing it at runtime.
        __pydantic_extra__: Dict[str, object] = FieldInfo(init=False)  # pyright: ignore[reportIncompatibleVariableOverride]

        # Stub to indicate that arbitrary properties are accepted.
        # To access properties that are not valid identifiers you can use `getattr`, e.g.
        # `getattr(obj, '$type')`
        def __getattr__(self, attr: str) -> object: ...
    else:
        # Runtime declaration: annotation only, with no value assigned (see the
        # compatibility note in the branch above).
        __pydantic_extra__: Dict[str, object]


class Evaluation(BaseModel):
    # One evaluation record: its ID and status, the model input/output pair being
    # evaluated and the run mode, plus optional timing, cost, error, progress and
    # result details.
    #
    # Field declaration order is kept as generated. Fields in the first group are
    # declared without a default; every field in the second group is `Optional`
    # and defaults to `None`.

    # --- Fields declared without a default ---

    eval_id: str
    """A unique evaluation ID."""

    evaluation_status: Literal["in_progress", "completed", "canceled", "queued", "failed"]
    """Status of the evaluation."""

    # Named `api_model_input` in Python but aliased to `model_input`.
    # NOTE(review): presumably renamed because Pydantic reserves the `model_`
    # attribute prefix — confirm against the generator's conventions.
    api_model_input: ModelInput = FieldInfo(alias="model_input")
    """A dictionary of inputs sent to the LLM to generate output.

    The dictionary must contain a `user_prompt` field and an optional `context`
    field. Additional properties are allowed.
    """

    # Same alias pattern as `api_model_input`: aliased to `model_output`.
    api_model_output: str = FieldInfo(alias="model_output")
    """Output generated by the LLM to be evaluated."""

    run_mode: Literal["precision_plus", "precision", "smart", "economy"]
    """Run mode for the evaluation.

    The run mode allows the user to optimize for speed, accuracy, and cost by
    determining which models are used to evaluate the event.
    """

    # --- Optional fields; each defaults to None ---

    created_at: Optional[datetime] = None
    """The time the evaluation was created in UTC."""

    end_timestamp: Optional[datetime] = None
    """The time the evaluation completed in UTC."""

    error_message: Optional[str] = None
    """Description of the error causing the evaluation to fail, if any."""

    error_timestamp: Optional[datetime] = None
    """The time the error causing the evaluation to fail was recorded."""

    # Free-form mapping; the value types are not constrained here (`object`).
    evaluation_result: Optional[Dict[str, object]] = None
    """
    Evaluation result consisting of average scores and rationales for each of the
    evaluated guardrail metrics.
    """

    evaluation_total_cost: Optional[float] = None
    """Total cost of the evaluation."""

    # Each list element is restricted to one of the literal metric names below.
    guardrail_metrics: Optional[
        List[
            Literal[
                "correctness",
                "completeness",
                "instruction_adherence",
                "context_adherence",
                "ground_truth_adherence",
                "comprehensive_safety",
            ]
        ]
    ] = None
    """
    An array of guardrail metrics that the model input and output pair will be
    evaluated on.
    """

    # Aliased to `model_used`; the default of None is passed through FieldInfo
    # because the field also needs the alias.
    api_model_used: Optional[str] = FieldInfo(alias="model_used", default=None)
    """Model ID used to generate the output, like `gpt-4o` or `o3`."""

    modified_at: Optional[datetime] = None
    """The most recent time the evaluation was modified in UTC."""

    nametag: Optional[str] = None
    """An optional, user-defined tag for the evaluation."""

    progress: Optional[int] = None
    """Evaluation progress.

    Values range between 0 and 100; 100 corresponds to a completed
    `evaluation_status`.
    """

    start_timestamp: Optional[datetime] = None
    """The time the evaluation started in UTC."""
