diff --git a/docs/evals.md b/docs/evals.md
index 68ec8184ee..581f0017a3 100644
--- a/docs/evals.md
+++ b/docs/evals.md
@@ -766,6 +766,52 @@ async def main():
 _(This example is complete, it can be run "as is" — you'll need to add `asyncio.run(main(answer))` to run `main`)_
 
+### Generating from an Existing Agent
+
+If you already have an agent, you can use [`generate_dataset_for_agent`][pydantic_evals.generation.generate_dataset_for_agent] to automatically extract types from the agent and generate test cases. This is simpler than `generate_dataset` because you don't need to manually specify the dataset type or generic parameters.
+
+```python {title="generate_from_agent_example.py"}
+from pydantic import BaseModel
+
+from pydantic_ai import Agent
+from pydantic_evals.generation import generate_dataset_for_agent
+
+
+class AnswerOutput(BaseModel):
+    """Model for expected answer outputs."""
+
+    answer: str
+    confidence: float
+
+
+agent = Agent(  # (1)!
+    'openai:gpt-4o',
+    output_type=AnswerOutput,
+    system_prompt='You are a helpful assistant that answers questions about world geography.',
+)
+
+
+async def main():
+    dataset = await generate_dataset_for_agent(  # (2)!
+        agent=agent,
+        n_examples=3,
+        model='openai:gpt-4o',
+        path='agent_test_cases.json',
+        extra_instructions='Generate questions about world capitals and landmarks.',
+    )
+    print(f'Generated {len(dataset.cases)} test cases')
+```
+
+1. Create an agent with a defined output type and system prompt.
+2. Generate test cases by extracting types from the agent. The function will:
+    - Use an LLM to generate diverse input prompts based on the agent's configuration
+    - Run each input through the actual agent to get real outputs
+    - Save the inputs and outputs as test cases
+
+This approach ensures your test cases use realistic outputs from your actual agent, rather than having an LLM imagine what the outputs should be.
+
+_(This example is complete, it can be run "as is" — you'll need to add `asyncio.run(main())` to run `main`)_
+
 ## Integration with Logfire
 
 Pydantic Evals is implemented using OpenTelemetry to record traces of the evaluation process.
 These traces contain all
diff --git a/pydantic_evals/pydantic_evals/generation.py b/pydantic_evals/pydantic_evals/generation.py
index 8a5c52c8d3..b5f59994ea 100644
--- a/pydantic_evals/pydantic_evals/generation.py
+++ b/pydantic_evals/pydantic_evals/generation.py
@@ -6,26 +6,26 @@
 
 from __future__ import annotations
 
+import json
+import re
 from collections.abc import Sequence
 from pathlib import Path
-from typing import Any
+from typing import Any, cast
 
-from pydantic import ValidationError
+from pydantic import BaseModel, ValidationError
 from typing_extensions import TypeVar
 
 from pydantic_ai import Agent, models
 from pydantic_ai._utils import strip_markdown_fences
-from pydantic_evals import Dataset
+from pydantic_evals import Case, Dataset
 from pydantic_evals.evaluators.evaluator import Evaluator
 
-__all__ = ('generate_dataset',)
+__all__ = ('generate_dataset', 'generate_dataset_for_agent')
 
 InputsT = TypeVar('InputsT', default=Any)
 """Generic type for the inputs to the task being evaluated."""
-
 OutputT = TypeVar('OutputT', default=Any)
 """Generic type for the expected output of the task being evaluated."""
-
 MetadataT = TypeVar('MetadataT', default=Any)
 """Generic type for the metadata associated with the task being evaluated."""
 
@@ -72,7 +72,6 @@ async def generate_dataset(
         output_type=str,
         retries=1,
     )
-
     result = await agent.run(extra_instructions or 'Please generate the object.')
     output = strip_markdown_fences(result.output)
     try:
@@ -83,3 +82,106 @@ async def generate_dataset(
     if path is not None:
         result.to_file(path, custom_evaluator_types=custom_evaluator_types)  # pragma: no cover
     return result
+
+
+async def generate_dataset_for_agent(
+    agent: Agent[Any, OutputT],
+    *,
+    inputs_type: type[InputsT] = str,
+    metadata_type: type[MetadataT] | None = None,
+    path: Path | str | None = None,
+    model: models.Model | models.KnownModelName = 'openai:gpt-4o',
+    n_examples: int = 3,
+    extra_instructions: str | None = None,
+) -> Dataset[InputsT, OutputT, MetadataT]:
+    """Generate evaluation cases by running inputs through a target agent.
+
+    Generates diverse inputs and metadata using an LLM, then runs them through the agent
+    to produce realistic expected outputs for evaluation.
+
+    Args:
+        agent: Pydantic AI agent to extract outputs from.
+        inputs_type: Type of inputs the agent expects. Defaults to str.
+        metadata_type: Type for metadata. Defaults to None (uses NoneType).
+        path: Optional path to save the generated dataset.
+        model: Pydantic AI model to use for generation. Defaults to 'openai:gpt-4o'.
+        n_examples: Number of examples to generate. Defaults to 3.
+        extra_instructions: Optional additional instructions for the LLM.
+
+    Returns:
+        A properly structured Dataset object with generated test cases.
+
+    Raises:
+        ValidationError: If the LLM's response cannot be parsed.
+    """
+    # Get output schema with proper type handling
+    # Check if it's a Pydantic model class (not an instance) before calling model_json_schema
+    output_schema: str
+    if isinstance(agent.output_type, type) and issubclass(agent.output_type, BaseModel):
+        output_schema = str(agent.output_type.model_json_schema())
+    else:
+        # For other types (str, custom output specs, etc.), just use string representation
+        output_schema = str(agent.output_type)  # type: ignore[arg-type]
+
+    # Get inputs schema with proper type handling
+    inputs_schema: str
+    if isinstance(inputs_type, type) and issubclass(inputs_type, BaseModel):
+        inputs_schema = str(inputs_type.model_json_schema())
+    else:
+        inputs_schema = str(inputs_type)
+
+    generation_prompt = (
+        f'Generate {n_examples} test case inputs for an agent.\n\n'
+        f'The agent accepts inputs of type: {inputs_schema}\n'
+        f'The agent produces outputs of type: {output_schema}\n\n'
+        f'Return a JSON array of objects with "name" (optional string), "inputs" (matching the input type), '
+        f'and "metadata" (optional, any additional context).\n'
+        f'You must not include any characters in your response before the opening [ of the JSON array, or after the closing ].'
+        + (f'\n\n{extra_instructions}' if extra_instructions else '')
+    )
+
+    gen_agent = Agent(
+        model,
+        system_prompt=generation_prompt,
+        output_type=str,
+        retries=1,
+    )
+
+    result = await gen_agent.run('Please generate the test case inputs and metadata.')
+    output = strip_markdown_fences(result.output).strip()
+
+    try:
+        if not output:
+            raise ValueError('Empty output after stripping markdown fences')
+
+        # Additional cleanup in case strip_markdown_fences didn't catch everything
+        # Remove markdown code blocks with optional language identifier
+        output = re.sub(r'^```(?:json)?\s*\n?', '', output)
+        output = re.sub(r'\n?```\s*$', '', output)
+        output = output.strip()
+
+        inputs_metadata: list[dict[str, Any]] = json.loads(output)
+        cases: list[Case[InputsT, OutputT, MetadataT]] = []
+
+        for i, item in enumerate(inputs_metadata):
+            agent_result = await agent.run(item['inputs'])
+            cases.append(
+                Case(
+                    name=item.get('name', f'case-{i}'),
+                    inputs=cast(InputsT, item['inputs']),
+                    expected_output=agent_result.output,
+                    metadata=cast(MetadataT, item.get('metadata')),
+                )
+            )
+
+        result_dataset: Dataset[InputsT, OutputT, MetadataT] = Dataset(cases=cases, evaluators=[])
+
+    except (json.JSONDecodeError, KeyError, ValidationError) as e:  # pragma: no cover
+        print(f'Raw response from model:\n{result.output}\n')
+        print(f'After stripping markdown fences:\n{output}')
+        raise e
+
+    if path is not None:
+        result_dataset.to_file(path)
+
+    return result_dataset
diff --git a/tests/evals/test_dataset.py b/tests/evals/test_dataset.py
index 1c7baf20c7..bc7ed51746 100644
--- a/tests/evals/test_dataset.py
+++ b/tests/evals/test_dataset.py
@@ -1470,6 +1470,15 @@ def test_import_generate_dataset():
     assert generate_dataset
 
 
+def test_import_generate_dataset_from_agent():
+    # this function is tough to test in an interesting way outside an example...
+    # this at least ensures importing it doesn't fail.
+    # TODO: Add an "example" that actually makes use of this functionality
+    from pydantic_evals.generation import generate_dataset_for_agent
+
+    assert generate_dataset_for_agent
+
+
 def test_evaluate_non_serializable_inputs():
     @dataclass
     class MyInputs:
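A rough usage sketch, not part of the diff above: once `generate_dataset_for_agent` has written `agent_test_cases.json`, the dataset could be reloaded and used to evaluate the agent. The `agent` and `AnswerOutput` names are assumed to come from the `generate_from_agent_example.py` docs example, and the sketch relies on the pydantic_evals `Dataset.from_file` / `evaluate_sync` APIs and the `IsInstance` evaluator.

```python
from typing import Any

from pydantic_evals import Dataset
from pydantic_evals.evaluators import IsInstance

# Assumed: the docs example module above defines `agent` and `AnswerOutput`.
from generate_from_agent_example import AnswerOutput, agent


async def answer_question(question: str) -> AnswerOutput:
    # Wrap the agent as a plain task function so the dataset can evaluate it.
    result = await agent.run(question)
    return result.output


def main():
    # Reload the cases that generate_dataset_for_agent wrote to disk.
    dataset = Dataset[str, AnswerOutput, Any].from_file('agent_test_cases.json')

    # The generated dataset has no evaluators; attach a simple structural check.
    dataset.evaluators.append(IsInstance(type_name='AnswerOutput'))

    report = dataset.evaluate_sync(answer_question)
    report.print(include_input=True, include_output=True)


if __name__ == '__main__':
    main()
```

Since the expected outputs were captured from the same agent, this mostly guards against regressions; attaching stricter evaluators (for example `LLMJudge`) could be a natural next step.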