
evals

patronus.evals

evaluators

Evaluator

Base Evaluator Class

evaluate abstractmethod
evaluate(*args, **kwargs) -> Optional[EvaluationResult]

Synchronous version of the evaluate method. When inheriting directly from the Evaluator class, it is permitted to change the parameter signature; the return type should stay unchanged.

Source code in src/patronus/evals/evaluators.py
@abc.abstractmethod
def evaluate(self, *args, **kwargs) -> Optional[EvaluationResult]:
    """
    Synchronous version of evaluate method.
    When inheriting directly from Evaluator class it's permitted to change parameters signature.
    Return type should stay unchanged.
    """

AsyncEvaluator

Bases: Evaluator

evaluate abstractmethod async
evaluate(*args, **kwargs) -> Optional[EvaluationResult]

Asynchronous version of the evaluate method. When inheriting directly from the Evaluator class, it is permitted to change the parameter signature; the return type should stay unchanged.

Source code in src/patronus/evals/evaluators.py
@abc.abstractmethod
async def evaluate(self, *args, **kwargs) -> Optional[EvaluationResult]:
    """
    Asynchronous version of evaluate method.
    When inheriting directly from Evaluator class it's permitted to change parameters signature.
    Return type should stay unchanged.
    """

StructuredEvaluator

Bases: Evaluator

Base for structured evaluators

AsyncStructuredEvaluator

Bases: AsyncEvaluator

Base for async structured evaluators
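The structured bases are not documented in detail here; as an assumption, their evaluate method accepts the shared keyword arguments shown for RemoteEvaluator.evaluate below (task_input, task_output, gold_answer, and so on). A sketch under that assumption:

from typing import Optional

from patronus.evals import EvaluationResult
from patronus.evals.evaluators import StructuredEvaluator

class ExactMatch(StructuredEvaluator):
    # Assumed signature: structured evaluators take the shared keyword
    # arguments (task_output, gold_answer, ...) used by RemoteEvaluator.evaluate.
    def evaluate(self, *, task_output: str, gold_answer: str, **kwargs) -> Optional[EvaluationResult]:
        return EvaluationResult(pass_=task_output.strip() == gold_answer.strip())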

RemoteEvaluator

RemoteEvaluator(
    evaluator_id_or_alias: str,
    criteria: Optional[str] = None,
    *,
    tags: Optional[dict[str, str]] = None,
    explain_strategy: Literal[
        "never", "on-fail", "on-success", "always"
    ] = "always",
    criteria_config: Optional[dict[str, Any]] = None,
    allow_update: bool = False,
    max_attempts: int = 3,
    api_: Optional[PatronusAPIClient] = None,
)

Bases: RemoteEvaluatorMixin, StructuredEvaluator

Synchronous remote evaluator

Source code in src/patronus/evals/evaluators.py
def __init__(
    self,
    evaluator_id_or_alias: str,
    criteria: Optional[str] = None,
    *,
    tags: Optional[dict[str, str]] = None,
    explain_strategy: typing.Literal["never", "on-fail", "on-success", "always"] = "always",
    criteria_config: Optional[dict[str, typing.Any]] = None,
    allow_update: bool = False,
    max_attempts: int = 3,
    api_: Optional[PatronusAPIClient] = None,
):
    self.evaluator_id_or_alias = evaluator_id_or_alias
    self.criteria = criteria
    self.tags = tags or {}
    self.explain_strategy = explain_strategy
    self.criteria_config = criteria_config
    self.allow_update = allow_update
    self.max_attempts = max_attempts
    self._api = api_
evaluate
evaluate(
    *,
    system_prompt: Optional[str] = None,
    task_context: Union[list[str], str, None] = None,
    task_attachments: Union[list[Any], None] = None,
    task_input: Optional[str] = None,
    task_output: Optional[str] = None,
    gold_answer: Optional[str] = None,
    task_metadata: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> EvaluationResult

Evaluates data using remote Patronus Evaluator

Source code in src/patronus/evals/evaluators.py
def evaluate(
    self,
    *,
    system_prompt: Optional[str] = None,
    task_context: Union[list[str], str, None] = None,
    task_attachments: Union[list[Any], None] = None,
    task_input: Optional[str] = None,
    task_output: Optional[str] = None,
    gold_answer: Optional[str] = None,
    task_metadata: Optional[typing.Dict[str, typing.Any]] = None,
    **kwargs: Any,
) -> EvaluationResult:
    """Evaluates data using remote Patronus Evaluator"""
    kws = {
        "system_prompt": system_prompt,
        "task_context": task_context,
        "task_attachments": task_attachments,
        "task_input": task_input,
        "task_output": task_output,
        "gold_answer": gold_answer,
        "task_metadata": task_metadata,
        **kwargs,
    }
    log_id = get_current_log_id(bound_arguments=kws)

    attrs = get_context_evaluation_attributes()
    tags = {**self.tags}
    if t := attrs["tags"]:
        tags.update(t)
    tags = merge_tags(tags, kwargs.get("tags"), attrs["experiment_tags"])
    if tags:
        kws["tags"] = tags
    if did := attrs["dataset_id"]:
        kws["dataset_id"] = did
    if sid := attrs["dataset_sample_id"]:
        kws["dataset_sample_id"] = sid

    resp = retry()(self._evaluate)(log_id=log_id, **kws)
    return self._translate_response(resp)
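
A usage sketch based on the signatures above. The evaluator and criteria identifiers are placeholders; substitute ones available in your Patronus account, and initialize the SDK first so results are recorded:

import patronus
from patronus.evals.evaluators import RemoteEvaluator

patronus.init()

# Placeholder evaluator/criteria identifiers.
check = RemoteEvaluator("judge", "patronus:is-concise", explain_strategy="on-fail")
result = check.evaluate(
    task_input="Summarize the report in one sentence.",
    task_output="The report covers Q3 revenue growth and hiring plans.",
)
print(result.pass_, result.score)
print(result.explanation)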

AsyncRemoteEvaluator

AsyncRemoteEvaluator(
    evaluator_id_or_alias: str,
    criteria: Optional[str] = None,
    *,
    tags: Optional[dict[str, str]] = None,
    explain_strategy: Literal[
        "never", "on-fail", "on-success", "always"
    ] = "always",
    criteria_config: Optional[dict[str, Any]] = None,
    allow_update: bool = False,
    max_attempts: int = 3,
    api_: Optional[PatronusAPIClient] = None,
)

Bases: RemoteEvaluatorMixin, AsyncStructuredEvaluator

Asynchronous remote evaluator

Source code in src/patronus/evals/evaluators.py
def __init__(
    self,
    evaluator_id_or_alias: str,
    criteria: Optional[str] = None,
    *,
    tags: Optional[dict[str, str]] = None,
    explain_strategy: typing.Literal["never", "on-fail", "on-success", "always"] = "always",
    criteria_config: Optional[dict[str, typing.Any]] = None,
    allow_update: bool = False,
    max_attempts: int = 3,
    api_: Optional[PatronusAPIClient] = None,
):
    self.evaluator_id_or_alias = evaluator_id_or_alias
    self.criteria = criteria
    self.tags = tags or {}
    self.explain_strategy = explain_strategy
    self.criteria_config = criteria_config
    self.allow_update = allow_update
    self.max_attempts = max_attempts
    self._api = api_
evaluate async
evaluate(
    *,
    system_prompt: Optional[str] = None,
    task_context: Union[list[str], str, None] = None,
    task_attachments: Union[list[Any], None] = None,
    task_input: Optional[str] = None,
    task_output: Optional[str] = None,
    gold_answer: Optional[str] = None,
    task_metadata: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> EvaluationResult

Evaluates data using remote Patronus Evaluator

Source code in src/patronus/evals/evaluators.py
async def evaluate(
    self,
    *,
    system_prompt: Optional[str] = None,
    task_context: Union[list[str], str, None] = None,
    task_attachments: Union[list[Any], None] = None,
    task_input: Optional[str] = None,
    task_output: Optional[str] = None,
    gold_answer: Optional[str] = None,
    task_metadata: Optional[typing.Dict[str, typing.Any]] = None,
    **kwargs: Any,
) -> EvaluationResult:
    """Evaluates data using remote Patronus Evaluator"""
    kws = {
        "system_prompt": system_prompt,
        "task_context": task_context,
        "task_attachments": task_attachments,
        "task_input": task_input,
        "task_output": task_output,
        "gold_answer": gold_answer,
        "task_metadata": task_metadata,
        **kwargs,
    }
    log_id = get_current_log_id(bound_arguments=kws)

    attrs = get_context_evaluation_attributes()
    tags = {**self.tags}
    if t := attrs["tags"]:
        tags.update(t)
    tags = merge_tags(tags, kwargs.get("tags"), attrs["experiment_tags"])
    if tags:
        kws["tags"] = tags
    if did := attrs["dataset_id"]:
        kws["dataset_id"] = did
    if sid := attrs["dataset_sample_id"]:
        kws["dataset_sample_id"] = sid

    resp = await retry()(self._evaluate)(log_id=log_id, **kws)
    return self._translate_response(resp)
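
The asynchronous variant is awaited; a sketch mirroring the one above (identifiers remain placeholders):

import asyncio

import patronus
from patronus.evals.evaluators import AsyncRemoteEvaluator

async def main():
    check = AsyncRemoteEvaluator("judge", "patronus:is-concise")  # placeholder identifiers
    result = await check.evaluate(
        task_input="Summarize the report in one sentence.",
        task_output="The report covers Q3 revenue growth and hiring plans.",
    )
    result.pretty_print()

patronus.init()
asyncio.run(main())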

get_current_log_id

get_current_log_id(
    bound_arguments: dict[str, Any],
) -> Optional[LogID]

Return the log_id for the given arguments in the current context. Returns None if there is no context, which most likely means the SDK is not initialized.

Source code in src/patronus/evals/evaluators.py
def get_current_log_id(bound_arguments: dict[str, Any]) -> Optional[LogID]:
    """
    Return log_id for given arguments in current context.
    Returns None if there is no context - most likely SDK is not initialized.
    """
    eval_group = _ctx_evaluation_log_group.get(None)
    if eval_group is None:
        return None
    log_id = eval_group.find_log(bound_arguments)
    if log_id is None:
        raise ValueError("Log not found for provided arguments")
    return log_id

bundled_eval

bundled_eval(
    span_name: str = "Evaluation bundle",
    attributes: Optional[dict[str, str]] = None,
)

Start a span that automatically bundles evaluations.

Evaluations are bundled based on the arguments passed to the evaluators called inside the context manager.

The following example would create two bundles:

  • first with arguments x=10, y=20
  • second with arguments spam="abc123"
with bundled_eval():
    foo_evaluator(x=10, y=20)
    bar_evaluator(x=10, y=20)
    tar_evaluator(spam="abc123")
Source code in src/patronus/evals/evaluators.py
@contextlib.contextmanager
def bundled_eval(span_name: str = "Evaluation bundle", attributes: Optional[dict[str, str]] = None):
    """
    Start a span that would automatically bundle evaluations.

    Evaluations are passed by arguments passed to the evaluators called inside the context manager.

    The following example would create two bundles:

    - first with arguments `x=10, y=20`
    - second with arguments `spam="abc123"`

    ```python
    with bundled_eval():
        foo_evaluator(x=10, y=20)
        bar_evaluator(x=10, y=20)
        tar_evaluator(spam="abc123")
    ```

    """
    tracer = context.get_tracer_or_none()
    if tracer is None:
        yield
        return

    attributes = {
        **(attributes or {}),
        Attributes.span_type.value: SpanTypes.eval.value,
    }
    with tracer.start_as_current_span(span_name, attributes=attributes):
        with _start_evaluation_log_group():
            yield
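
For a self-contained sketch, the evaluators called inside the context manager can be plain functions wrapped with the @evaluator decorator documented below (function names here are illustrative):

import patronus
from patronus import evaluator
from patronus.evals.evaluators import bundled_eval

patronus.init()

@evaluator()
def exact_match(expected: str, actual: str) -> bool:
    return expected == actual

@evaluator()
def case_insensitive_match(expected: str, actual: str) -> bool:
    return expected.strip().lower() == actual.strip().lower()

# Both calls receive the same arguments, so they are recorded as a single bundle.
with bundled_eval():
    exact_match(expected="Paris", actual="paris")
    case_insensitive_match(expected="Paris", actual="paris")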

evaluator

evaluator(
    _fn: Optional[Callable[..., Any]] = None,
    *,
    evaluator_id: Union[
        str, Callable[[], str], None
    ] = None,
    criteria: Union[str, Callable[[], str], None] = None,
    metric_name: Optional[str] = None,
    metric_description: Optional[str] = None,
    is_method: bool = False,
    span_name: Optional[str] = None,
    log_none_arguments: bool = False,
    **kwargs: Any,
) -> typing.Callable[..., typing.Any]

Decorator for creating functional-style evaluators that log execution and results.

This decorator works with both synchronous and asynchronous functions. It doesn't modify the function's return value, but records it after converting it to an EvaluationResult.

Evaluators can return different types which are automatically converted to EvaluationResult objects:

  • bool: True/False indicating pass/fail.
  • float/int: Numerical scores (typically between 0 and 1).
  • str: Text output categorizing the result.
  • EvaluationResult: Complete evaluation with scores, explanations, etc.
  • None: Indicates evaluation was skipped and no result will be recorded.

Evaluation results are exported in the background without blocking execution. The SDK must be initialized with patronus.init() for evaluations to be recorded, though decorated functions will still execute even without initialization.

The evaluator integrates with a context-based system to identify and handle shared evaluation logging and tracing spans.

Example:

from patronus import init, evaluator
from patronus.evals import EvaluationResult

# Initialize the SDK to record evaluations
init()

# Simple evaluator function
@evaluator()
def exact_match(actual: str, expected: str) -> bool:
    return actual.strip() == expected.strip()

# More complex evaluator with detailed result
@evaluator()
def semantic_match(actual: str, expected: str) -> EvaluationResult:
    similarity = calculate_similarity(actual, expected)  # Your similarity function
    return EvaluationResult(
        score=similarity,
        pass_=similarity > 0.8,
        text_output="High similarity" if similarity > 0.8 else "Low similarity",
        explanation=f"Calculated similarity: {similarity}"
    )

# Use the evaluators
result = exact_match("Hello world", "Hello world")
print(f"Match: {result}")  # Output: Match: True

Parameters:

Name Type Description Default
_fn Optional[Callable[..., Any]]

The function to be decorated.

None
evaluator_id Union[str, Callable[[], str], None]

Name for the evaluator. Defaults to the function name (or the class name in the case of class-based evaluators).

None
criteria Union[str, Callable[[], str], None]

Name of the criteria used by the evaluator. Specifying criteria is only recommended in more complex evaluator setups where the evaluation algorithm changes depending on the criteria (think strategy pattern).

None
metric_name Optional[str]

Name for the evaluation metric. Defaults to evaluator_id value.

None
metric_description Optional[str]

The description of the metric used for evaluation. If not provided, the docstring of the wrapped function is used for this value.

None
is_method bool

Whether the wrapped function is a method. This value is used to determine whether to remove the "self" argument from the log. It also allows dynamic evaluator_id and criteria discovery based on the get_evaluator_id() and get_criteria() methods. User code usually shouldn't set this, as long as user-defined class-based evaluators inherit from the library-provided Evaluator base classes.

False
span_name Optional[str]

Name of the span to represent this evaluation in the tracing system. Defaults to None, in which case a default name is generated based on the evaluator.

None
log_none_arguments bool

Controls whether arguments with None values are included in log output. This setting affects only logging behavior and has no impact on function execution. Note: Only applies to top-level arguments. For nested structures like dictionaries, None values will always be logged regardless of this setting.

False
**kwargs Any

Additional keyword arguments that may be passed to the decorator or its internal methods.

{}

Returns:

Name Type Description
Callable Callable[..., Any]

Returns the decorated function with additional evaluation behavior, suitable for synchronous or asynchronous usage.

Note

For evaluations that need to be compatible with experiments, consider using StructuredEvaluator or AsyncStructuredEvaluator classes instead.

Source code in src/patronus/evals/evaluators.py
def evaluator(
    _fn: Optional[typing.Callable[..., typing.Any]] = None,
    *,
    evaluator_id: Union[str, typing.Callable[[], str], None] = None,
    criteria: Union[str, typing.Callable[[], str], None] = None,
    metric_name: Optional[str] = None,
    metric_description: Optional[str] = None,
    is_method: bool = False,
    span_name: Optional[str] = None,
    log_none_arguments: bool = False,
    **kwargs: typing.Any,
) -> typing.Callable[..., typing.Any]:
    """
    Decorator for creating functional-style evaluators that log execution and results.

    This decorator works with both synchronous and asynchronous functions. The decorator doesn't
    modify the function's return value, but records it after converting to an EvaluationResult.

    Evaluators can return different types which are automatically converted to `EvaluationResult` objects:

    * `bool`: `True`/`False` indicating pass/fail.
    * `float`/`int`: Numerical scores (typically between 0-1).
    * `str`: Text output categorizing the result.
    * [EvaluationResult][patronus.evals.types.EvaluationResult]: Complete evaluation with scores, explanations, etc.
    * `None`: Indicates evaluation was skipped and no result will be recorded.

    Evaluation results are exported in the background without blocking execution. The SDK must be
    initialized with `patronus.init()` for evaluations to be recorded, though decorated functions
    will still execute even without initialization.

    The evaluator integrates with a context-based system to identify and handle shared evaluation
    logging and tracing spans.

    **Example:**

    ```python
    from patronus import init, evaluator
    from patronus.evals import EvaluationResult

    # Initialize the SDK to record evaluations
    init()

    # Simple evaluator function
    @evaluator()
    def exact_match(actual: str, expected: str) -> bool:
        return actual.strip() == expected.strip()

    # More complex evaluator with detailed result
    @evaluator()
    def semantic_match(actual: str, expected: str) -> EvaluationResult:
        similarity = calculate_similarity(actual, expected)  # Your similarity function
        return EvaluationResult(
            score=similarity,
            pass_=similarity > 0.8,
            text_output="High similarity" if similarity > 0.8 else "Low similarity",
            explanation=f"Calculated similarity: {similarity}"
        )

    # Use the evaluators
    result = exact_match("Hello world", "Hello world")
    print(f"Match: {result}")  # Output: Match: True
    ```

    Args:
        _fn: The function to be decorated.
        evaluator_id: Name for the evaluator.
            Defaults to function name (or class name in case of class based evaluators).
        criteria: Name of the criteria used by the evaluator.
            The use of the criteria is only recommended in more complex evaluator setups
            where evaluation algorithm changes depending on a criteria (think strategy pattern).
        metric_name: Name for the evaluation metric. Defaults to evaluator_id value.
        metric_description: The description of the metric used for evaluation.
            If not provided then the docstring of the wrapped function is used for this value.
        is_method: Whether the wrapped function is a method.
            This value is used to determine whether to remove "self" argument from the log.
            It also allows for dynamic evaluator_id and criteria discovery
            based on `get_evaluator_id()` and `get_criteria_id()` methods.
            User-code usually shouldn't use it as long as user defined class-based evaluators inherit from
            the library provided Evaluator base classes.
        span_name: Name of the span to represent this evaluation in the tracing system.
            Defaults to None, in which case a default name is generated based on the evaluator.
        log_none_arguments: Controls whether arguments with None values are included in log output.
            This setting affects only logging behavior and has no impact on function execution.
            Note: Only applies to top-level arguments. For nested structures like dictionaries,
            None values will always be logged regardless of this setting.
        **kwargs: Additional keyword arguments that may be passed to the decorator or its internal methods.

    Returns:
        Callable: Returns the decorated function with additional evaluation behavior, suitable for
            synchronous or asynchronous usage.

    Note:
        For evaluations that need to be compatible with experiments, consider using
        [StructuredEvaluator][patronus.evals.evaluators.StructuredEvaluator] or
        [AsyncStructuredEvaluator][patronus.evals.evaluators.AsyncStructuredEvaluator] classes instead.

    """
    if _fn is not None:
        return evaluator()(_fn)

    def decorator(fn):
        fn_sign = inspect.signature(fn)

        def _get_eval_id():
            return (callable(evaluator_id) and evaluator_id()) or evaluator_id or fn.__name__

        def _get_criteria():
            return (callable(criteria) and criteria()) or criteria or None

        def _prep(*fn_args, **fn_kwargs):
            bound_args = fn_sign.bind(*fn_args, **fn_kwargs)
            arguments_to_log = _as_applied_argument(fn_sign, bound_args)
            bound_args.apply_defaults()
            self_key_name = None
            instance = None
            if is_method:
                self_key_name = next(iter(fn_sign.parameters.keys()))
                instance = bound_args.arguments[self_key_name]

            eval_id = None
            eval_criteria = None
            if isinstance(instance, Evaluator):
                eval_id = instance.get_evaluator_id()
                eval_criteria = instance.get_criteria()

            if eval_id is None:
                eval_id = _get_eval_id()
            if eval_criteria is None:
                eval_criteria = _get_criteria()

            met_name = metric_name or eval_id
            met_description = metric_description or inspect.getdoc(fn) or None

            disable_export = isinstance(instance, RemoteEvaluatorMixin) and instance._disable_export

            return PrepEval(
                span_name=span_name,
                evaluator_id=eval_id,
                criteria=eval_criteria,
                metric_name=met_name,
                metric_description=met_description,
                self_key_name=self_key_name,
                arguments=arguments_to_log,
                disable_export=disable_export,
            )

        attributes = {
            Attributes.span_type.value: SpanTypes.eval.value,
            GenAIAttributes.operation_name.value: OperationNames.eval.value,
        }

        @functools.wraps(fn)
        async def wrapper_async(*fn_args, **fn_kwargs):
            ctx = context.get_current_context_or_none()
            if ctx is None:
                return await fn(*fn_args, **fn_kwargs)

            prep = _prep(*fn_args, **fn_kwargs)

            start = time.perf_counter()
            try:
                with start_span(prep.display_name(), attributes=attributes):
                    with _get_or_start_evaluation_log_group() as log_group:
                        log_id = log_group.log(
                            logger=ctx.pat_logger,
                            is_method=is_method,
                            self_key_name=prep.self_key_name,
                            bound_arguments=prep.arguments,
                            log_none_arguments=log_none_arguments,
                        )
                        ret = await fn(*fn_args, **fn_kwargs)
            except Exception as e:
                ctx.logger.exception(f"Evaluator raised an exception: {e}")
                raise e
            if prep.disable_export:
                return ret
            elapsed = time.perf_counter() - start
            handle_eval_output(
                ctx=ctx,
                log_id=log_id,
                evaluator_id=prep.evaluator_id,
                criteria=prep.criteria,
                metric_name=prep.metric_name,
                metric_description=prep.metric_description,
                ret_value=ret,
                duration=datetime.timedelta(seconds=elapsed),
                qualname=fn.__qualname__,
            )
            return ret

        @functools.wraps(fn)
        def wrapper_sync(*fn_args, **fn_kwargs):
            ctx = context.get_current_context_or_none()
            if ctx is None:
                return fn(*fn_args, **fn_kwargs)

            prep = _prep(*fn_args, **fn_kwargs)

            start = time.perf_counter()
            try:
                with start_span(prep.display_name(), attributes=attributes):
                    with _get_or_start_evaluation_log_group() as log_group:
                        log_id = log_group.log(
                            logger=ctx.pat_logger,
                            is_method=is_method,
                            self_key_name=prep.self_key_name,
                            bound_arguments=prep.arguments,
                            log_none_arguments=log_none_arguments,
                        )
                        ret = fn(*fn_args, **fn_kwargs)
            except Exception as e:
                ctx.logger.exception("Evaluation failed")
                raise e
            if prep.disable_export:
                return ret
            elapsed = time.perf_counter() - start
            handle_eval_output(
                ctx=ctx,
                log_id=log_id,
                evaluator_id=prep.evaluator_id,
                criteria=prep.criteria,
                metric_name=prep.metric_name,
                metric_description=prep.metric_description,
                ret_value=ret,
                duration=datetime.timedelta(seconds=elapsed),
                qualname=fn.__qualname__,
            )
            return ret

        def _set_attrs(wrapper: Any):
            wrapper._pat_evaluator = True

            # _pat_evaluator_id and _pat_criteria_id may be a bit misleading since
            # may not be correct since actually values for evaluator_id and criteria
            # are dynamically dispatched for class-based evaluators.
            # These values will be correct for function evaluators though.
            wrapper._pat_evaluator_id = _get_eval_id()
            wrapper._pat_criteria = _get_criteria()

        if inspect.iscoroutinefunction(fn):
            _set_attrs(wrapper_async)
            return wrapper_async
        else:
            _set_attrs(wrapper_sync)
            return wrapper_sync

    return decorator

types

EvaluationResult

Bases: BaseModel

Container for evaluation outcomes including score, pass/fail status, explanations, and metadata.

This class stores complete evaluation results with numeric scores, boolean pass/fail statuses, textual outputs, explanations, and arbitrary metadata. Evaluator functions can return instances of this class directly or return simpler types (bool, float, str) which will be automatically converted to EvaluationResult objects during recording.

Attributes:

Name Type Description
score Optional[float]

Score of the evaluation. Can be any numerical value, though typically ranges from 0 to 1, where 1 represents the best possible score.

pass_ Optional[bool]

Whether the evaluation is considered to pass or fail.

text_output Optional[str]

Text output of the evaluation. Usually used for a discrete, human-readable evaluation category or as a label for the score value.

metadata Optional[dict[str, Any]]

Arbitrary JSON-serializable metadata about the evaluation.

explanation Optional[str]

Human-readable explanation of the evaluation.

tags Optional[dict[str, str]]

Key-value pair metadata.

dataset_id Optional[str]

ID of the dataset associated with the evaluated sample.

dataset_sample_id Optional[str]

ID of the sample within the dataset associated with the evaluated sample.

evaluation_duration Optional[timedelta]

Duration of the evaluation. If the value is not set, the @evaluator decorator and the Evaluator classes will set it automatically.

explanation_duration Optional[timedelta]

Duration of the evaluation explanation.

format
format() -> str

Format the evaluation result into a readable summary.

Source code in src/patronus/evals/types.py
def format(self) -> str:
    """
    Format the evaluation result into a readable summary.
    """
    md = self.model_dump(exclude_none=True, mode="json")
    return yaml.dump(md)
pretty_print
pretty_print(file=None) -> None

Pretty prints the formatted content to the specified file or standard output.

Source code in src/patronus/evals/types.py
def pretty_print(self, file=None) -> None:
    """
    Pretty prints the formatted content to the specified file or standard output.
    """
    f = self.format()
    print(f, file=file)
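
Putting it together, a short sketch constructing an EvaluationResult from the fields documented above and printing the formatted summary:

from patronus.evals import EvaluationResult

result = EvaluationResult(
    score=0.92,
    pass_=True,
    text_output="High similarity",
    explanation="Cosine similarity between the outputs was 0.92.",
    metadata={"embedding_model": "example-model"},  # arbitrary JSON-serializable metadata
    tags={"dataset": "smoke-test"},
)

print(result.format())  # YAML-style summary of the non-empty fields
result.pretty_print()   # prints the same summary to stdout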