From 7bb085cf499a070351768e3d2bd181877aef731c Mon Sep 17 00:00:00 2001 From: benomahony Date: Thu, 16 Oct 2025 16:23:05 +0100 Subject: [PATCH 1/4] Add generate_dataset_from_agent --- pydantic_evals/pydantic_evals/generation.py | 118 ++++++++++++++++++-- tests/evals/test_dataset.py | 9 ++ 2 files changed, 118 insertions(+), 9 deletions(-) diff --git a/pydantic_evals/pydantic_evals/generation.py b/pydantic_evals/pydantic_evals/generation.py index 8a5c52c8d3..c2ee542608 100644 --- a/pydantic_evals/pydantic_evals/generation.py +++ b/pydantic_evals/pydantic_evals/generation.py @@ -6,26 +6,25 @@ from __future__ import annotations +import json +import re from collections.abc import Sequence from pathlib import Path -from typing import Any +from typing import Any, TypeVar -from pydantic import ValidationError -from typing_extensions import TypeVar +from pydantic import BaseModel, ValidationError from pydantic_ai import Agent, models from pydantic_ai._utils import strip_markdown_fences -from pydantic_evals import Dataset +from pydantic_evals import Case, Dataset from pydantic_evals.evaluators.evaluator import Evaluator -__all__ = ('generate_dataset',) +__all__ = ('generate_dataset', 'generate_dataset_for_agent') InputsT = TypeVar('InputsT', default=Any) """Generic type for the inputs to the task being evaluated.""" - OutputT = TypeVar('OutputT', default=Any) """Generic type for the expected output of the task being evaluated.""" - MetadataT = TypeVar('MetadataT', default=Any) """Generic type for the metadata associated with the task being evaluated.""" @@ -40,7 +39,6 @@ async def generate_dataset( extra_instructions: str | None = None, ) -> Dataset[InputsT, OutputT, MetadataT]: """Use an LLM to generate a dataset of test cases, each consisting of input, expected output, and metadata. - This function creates a properly structured dataset with the specified input, output, and metadata types. It uses an LLM to attempt to generate realistic test cases that conform to the types' schemas. @@ -72,7 +70,6 @@ async def generate_dataset( output_type=str, retries=1, ) - result = await agent.run(extra_instructions or 'Please generate the object.') output = strip_markdown_fences(result.output) try: @@ -83,3 +80,106 @@ async def generate_dataset( if path is not None: result.to_file(path, custom_evaluator_types=custom_evaluator_types) # pragma: no cover return result + + +async def generate_dataset_for_agent( + agent: Agent[Any, OutputT], + *, + inputs_type: type[InputsT] = str, # type: ignore + metadata_type: type[MetadataT] | None = None, + path: Path | str | None = None, + model: models.Model | models.KnownModelName = 'openai:gpt-4o', + n_examples: int = 3, + extra_instructions: str | None = None, +) -> Dataset[InputsT, OutputT, MetadataT]: + """Generate evaluation cases by running inputs through a target agent. + + Generates diverse inputs and metadata using an LLM, then runs them through the agent + to produce realistic expected outputs for evaluation. + + Args: + agent: Pydantic AI agent to extract outputs from. + inputs_type: Type of inputs the agent expects. Defaults to str. + metadata_type: Type for metadata. Defaults to None (uses NoneType). + path: Optional path to save the generated dataset. + model: Pydantic AI model to use for generation. Defaults to 'gpt-4o'. + n_examples: Number of examples to generate. Defaults to 3. + extra_instructions: Optional additional instructions for the LLM. + + Returns: + A properly structured Dataset object with generated test cases. 
+ + Raises: + ValidationError: If the LLM's response cannot be parsed. + """ + # Get output schema with proper type handling + # Check if it's a Pydantic model class (not an instance) before calling model_json_schema + output_schema: str + if isinstance(agent.output_type, type) and issubclass(agent.output_type, BaseModel): + output_schema = str(agent.output_type.model_json_schema()) + else: + # For other types (str, custom output specs, etc.), just use string representation + output_schema = str(agent.output_type) + + # Get inputs schema with proper type handling + inputs_schema: str + if issubclass(inputs_type, BaseModel): + inputs_schema = str(inputs_type.model_json_schema()) + else: + inputs_schema = str(inputs_type) + + generation_prompt = ( + f'Generate {n_examples} test case inputs for an agent.\n\n' + f'The agent accepts inputs of type: {inputs_schema}\n' + f'The agent produces outputs of type: {output_schema}\n\n' + f'Return a JSON array of objects with "name" (optional string), "inputs" (matching the input type), ' + f'and "metadata" (optional, any additional context).\n' + f'You must not include any characters in your response before the opening [ of the JSON array, or after the closing ].' + + (f'\n\n{extra_instructions}' if extra_instructions else '') + ) + + gen_agent = Agent( + model, + system_prompt=generation_prompt, + output_type=str, + retries=1, + ) + + result = await gen_agent.run('Please generate the test case inputs and metadata.') + output = strip_markdown_fences(result.output).strip() + + try: + if not output: + raise ValueError('Empty output after stripping markdown fences') + + # Additional cleanup in case strip_markdown_fences didn't catch everything + # Remove markdown code blocks with optional language identifier + output = re.sub(r'^```(?:json)?\s*\n?', '', output) + output = re.sub(r'\n?```\s*$', '', output) + output = output.strip() + + inputs_metadata = json.loads(output) + cases = [] + + for i, item in enumerate(inputs_metadata): + agent_result = await agent.run(item['inputs']) + cases.append( + Case( + name=item.get('name', f'case-{i}'), + inputs=item['inputs'], + expected_output=agent_result.output, + metadata=item.get('metadata'), + ) + ) + + result_dataset = Dataset(cases=cases, evaluators=[]) + + except (json.JSONDecodeError, KeyError, ValidationError) as e: # pragma: no cover + print(f'Raw response from model:\n{result.output}\n') + print(f'After stripping markdown fences:\n{output}') + raise e + + if path is not None: + result_dataset.to_file(path) + + return result_dataset diff --git a/tests/evals/test_dataset.py b/tests/evals/test_dataset.py index d2b2abc008..db0d155a33 100644 --- a/tests/evals/test_dataset.py +++ b/tests/evals/test_dataset.py @@ -1470,6 +1470,15 @@ def test_import_generate_dataset(): assert generate_dataset +def test_import_generate_dataset_from_agent(): + # this function is tough to test in an interesting way outside an example... + # this at least ensures importing it doesn't fail. 
+    # TODO: Add an "example" that actually makes use of this functionality
+    from pydantic_evals.generation import generate_dataset_for_agent
+
+    assert generate_dataset_for_agent
+
+
 def test_evaluate_non_serializable_inputs():
     @dataclass
     class MyInputs:

From 95dbb2b60ecadc6b51cec4430a96c7c1e527834f Mon Sep 17 00:00:00 2001
From: benomahony
Date: Thu, 16 Oct 2025 16:29:53 +0100
Subject: [PATCH 2/4] Add docs

---
 docs/evals.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/docs/evals.md b/docs/evals.md
index 68ec8184ee..d00c2efe5d 100644
--- a/docs/evals.md
+++ b/docs/evals.md
@@ -766,6 +766,52 @@ async def main():
 
 _(This example is complete, it can be run "as is" — you'll need to add `asyncio.run(main(answer))` to run `main`)_
 
+### Generating from an Existing Agent
+
+If you already have an agent, you can use [`generate_evals_from_agent`][pydantic_evals.generation.generate_evals_from_agent] to automatically extract types from the agent and generate test cases. This is simpler than `generate_dataset` because you don't need to manually specify the dataset type or generic parameters.
+
+```python {title="generate_from_agent_example.py"}
+from pydantic import BaseModel
+from pydantic_ai import Agent
+
+from pydantic_evals.generation import generate_evals_from_agent
+
+
+class AnswerOutput(BaseModel):
+    """Model for expected answer outputs."""
+
+    answer: str
+    confidence: float
+
+
+agent = Agent(  # (1)!
+    'openai:gpt-4o',
+    output_type=AnswerOutput,
+    system_prompt='You are a helpful assistant that answers questions about world geography.',
+)
+
+
+async def main():
+    dataset = await generate_evals_from_agent(  # (2)!
+        agent=agent,
+        n_examples=3,
+        model='openai:gpt-4o',
+        path='agent_test_cases.json',
+        extra_instructions='Generate questions about world capitals and landmarks.',
+    )
+    print(f'Generated {len(dataset.cases)} test cases')
+```
+
+1. Create an agent with a defined output type and system prompt.
+2. Generate test cases by extracting types from the agent. The function will:
+    - Use an LLM to generate diverse input prompts based on the agent's configuration
+    - Run each input through the actual agent to get real outputs
+    - Save the inputs and outputs as test cases
+
+This approach ensures your test cases use realistic outputs from your actual agent, rather than having an LLM imagine what the outputs should be.
+
+_(This example is complete, it can be run "as is" — you'll need to add `asyncio.run(main())` to run `main`)_
+
 ## Integration with Logfire
 
 Pydantic Evals is implemented using OpenTelemetry to record traces of the evaluation process. These traces contain all

From a796233d6da5505e326ed0f4e4521ad22658c1b8 Mon Sep 17 00:00:00 2001
From: benomahony
Date: Thu, 16 Oct 2025 16:38:41 +0100
Subject: [PATCH 3/4] Fix typing for lower python versions

---
 docs/evals.md                               |  6 +++---
 pydantic_evals/pydantic_evals/generation.py | 20 +++++++++++---------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/docs/evals.md b/docs/evals.md
index d00c2efe5d..be84378307 100644
--- a/docs/evals.md
+++ b/docs/evals.md
@@ -768,13 +768,13 @@ _(This example is complete, it can be run "as is" — you'll need to add `asynci
 
 ### Generating from an Existing Agent
 
-If you already have an agent, you can use [`generate_evals_from_agent`][pydantic_evals.generation.generate_evals_from_agent] to automatically extract types from the agent and generate test cases. This is simpler than `generate_dataset` because you don't need to manually specify the dataset type or generic parameters.
+If you already have an agent, you can use [`generate_dataset_for_agent`][pydantic_evals.generation.generate_dataset_for_agent] to automatically extract types from the agent and generate test cases. This is simpler than `generate_dataset` because you don't need to manually specify the dataset type or generic parameters.
 
 ```python {title="generate_from_agent_example.py"}
 from pydantic import BaseModel
 from pydantic_ai import Agent
 
-from pydantic_evals.generation import generate_evals_from_agent
+from pydantic_evals.generation import generate_dataset_for_agent
 
 
 class AnswerOutput(BaseModel):
@@ -792,7 +792,7 @@ agent = Agent(  # (1)!
 
 
 async def main():
-    dataset = await generate_evals_from_agent(  # (2)!
+    dataset = await generate_dataset_for_agent(  # (2)!
         agent=agent,
         n_examples=3,
         model='openai:gpt-4o',
diff --git a/pydantic_evals/pydantic_evals/generation.py b/pydantic_evals/pydantic_evals/generation.py
index c2ee542608..fcc8f1ff80 100644
--- a/pydantic_evals/pydantic_evals/generation.py
+++ b/pydantic_evals/pydantic_evals/generation.py
@@ -10,9 +10,10 @@
 import re
 from collections.abc import Sequence
 from pathlib import Path
-from typing import Any, TypeVar
+from typing import Any, cast
 
 from pydantic import BaseModel, ValidationError
+from typing_extensions import TypeVar
 
 from pydantic_ai import Agent, models
 from pydantic_ai._utils import strip_markdown_fences
@@ -39,6 +40,7 @@ async def generate_dataset(
     extra_instructions: str | None = None,
 ) -> Dataset[InputsT, OutputT, MetadataT]:
     """Use an LLM to generate a dataset of test cases, each consisting of input, expected output, and metadata.
+
     This function creates a properly structured dataset with the specified input, output, and metadata types.
     It uses an LLM to attempt to generate realistic test cases that conform to the types' schemas.
 
@@ -85,7 +87,7 @@ async def generate_dataset(
 async def generate_dataset_for_agent(
     agent: Agent[Any, OutputT],
     *,
-    inputs_type: type[InputsT] = str,  # type: ignore
+    inputs_type: type[InputsT] = str,  # type: ignore[assignment]
     metadata_type: type[MetadataT] | None = None,
     path: Path | str | None = None,
     model: models.Model | models.KnownModelName = 'openai:gpt-4o',
@@ -123,7 +125,7 @@ async def generate_dataset_for_agent(
 
     # Get inputs schema with proper type handling
     inputs_schema: str
-    if issubclass(inputs_type, BaseModel):
+    if isinstance(inputs_type, type) and issubclass(inputs_type, BaseModel):
         inputs_schema = str(inputs_type.model_json_schema())
     else:
         inputs_schema = str(inputs_type)
@@ -158,21 +160,21 @@ async def generate_dataset_for_agent(
         output = re.sub(r'\n?```\s*$', '', output)
         output = output.strip()
 
-        inputs_metadata = json.loads(output)
-        cases = []
+        inputs_metadata: list[dict[str, Any]] = json.loads(output)
+        cases: list[Case[InputsT, OutputT, MetadataT]] = []
 
         for i, item in enumerate(inputs_metadata):
             agent_result = await agent.run(item['inputs'])
             cases.append(
                 Case(
                     name=item.get('name', f'case-{i}'),
-                    inputs=item['inputs'],
-                    expected_output=agent_result.output,
-                    metadata=item.get('metadata'),
+                    inputs=cast(InputsT, item['inputs']),
+                    expected_output=cast(OutputT, agent_result.output),
+                    metadata=cast(MetadataT, item.get('metadata')),
                 )
             )
 
-        result_dataset = Dataset(cases=cases, evaluators=[])
+        result_dataset: Dataset[InputsT, OutputT, MetadataT] = Dataset(cases=cases, evaluators=[])
 
     except (json.JSONDecodeError, KeyError, ValidationError) as e:  # pragma: no cover
         print(f'Raw response from model:\n{result.output}\n')

From b70edef52e334c39e187dd1d7cc0157ac30c696a Mon Sep 17 00:00:00 2001
From: benomahony
Date: Thu, 16 Oct 2025 17:04:19 +0100
Subject: [PATCH 4/4] WIP Fixing docs and more typing

---
 docs/evals.md                               | 2 +-
 pydantic_evals/pydantic_evals/generation.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/evals.md b/docs/evals.md
index be84378307..581f0017a3 100644
--- a/docs/evals.md
+++ b/docs/evals.md
@@ -772,8 +772,8 @@ If you already have an agent, you can use [`generate_dataset_for_agent`][pydanti
 ```python {title="generate_from_agent_example.py"}
 from pydantic import BaseModel
-from pydantic_ai import Agent
 
+from pydantic_ai import Agent
 from pydantic_evals.generation import generate_dataset_for_agent
 
 
diff --git a/pydantic_evals/pydantic_evals/generation.py b/pydantic_evals/pydantic_evals/generation.py
index fcc8f1ff80..b5f59994ea 100644
--- a/pydantic_evals/pydantic_evals/generation.py
+++ b/pydantic_evals/pydantic_evals/generation.py
@@ -87,7 +87,7 @@ async def generate_dataset(
 async def generate_dataset_for_agent(
     agent: Agent[Any, OutputT],
     *,
-    inputs_type: type[InputsT] = str,  # type: ignore[assignment]
+    inputs_type: type[InputsT] = str,
     metadata_type: type[MetadataT] | None = None,
     path: Path | str | None = None,
     model: models.Model | models.KnownModelName = 'openai:gpt-4o',
@@ -121,7 +121,7 @@ async def generate_dataset_for_agent(
         output_schema = str(agent.output_type.model_json_schema())
     else:
         # For other types (str, custom output specs, etc.), just use string representation
-        output_schema = str(agent.output_type)
+        output_schema = str(agent.output_type)  # type: ignore[arg-type]
 
     # Get inputs schema with proper type handling
     inputs_schema: str
@@ -169,7 +169,7 @@ async def generate_dataset_for_agent(
                 Case(
                     name=item.get('name', f'case-{i}'),
                     inputs=cast(InputsT, item['inputs']),
-                    expected_output=cast(OutputT, agent_result.output),
+                    expected_output=agent_result.output,
                     metadata=cast(MetadataT, item.get('metadata')),
                 )
             )
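
Below is a minimal usage sketch of the helper added by this series. It is not taken from the diff itself: it assumes the default `openai:gpt-4o` model, a plain-text agent, and that the generated dataset is then scored via `Dataset.evaluate`; the `geography_agent` name, the prompt text, and the choice of `EqualsExpected` are illustrative only.

```python
import asyncio

from pydantic_ai import Agent
from pydantic_evals.evaluators import EqualsExpected
from pydantic_evals.generation import generate_dataset_for_agent

# Illustrative agent; any Agent works, including ones with a structured output_type.
geography_agent = Agent(
    'openai:gpt-4o',
    system_prompt='Answer questions about world geography concisely.',
)


async def main():
    # An LLM proposes diverse inputs, each input is run through `geography_agent`,
    # and the agent's real outputs are stored as the expected outputs.
    dataset = await generate_dataset_for_agent(geography_agent, n_examples=3)

    # The result is an ordinary pydantic_evals Dataset, so it can be evaluated as usual.
    dataset.add_evaluator(EqualsExpected())

    async def task(inputs: str) -> str:
        result = await geography_agent.run(inputs)
        return result.output

    report = await dataset.evaluate(task)
    report.print(include_input=True, include_output=True)


if __name__ == '__main__':
    asyncio.run(main())
```

Exact-match scoring only makes sense for fairly deterministic agents; for free-form outputs, swapping in `LLMJudge` or a custom evaluator is the more typical choice.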