**docs/evals.md** (46 additions, 0 deletions)

@@ -766,6 +766,52 @@ async def main():

_(This example is complete, it can be run "as is" — you'll need to add `asyncio.run(main(answer))` to run `main`)_

### Generating from an Existing Agent

If you already have an agent, you can use [`generate_dataset_for_agent`][pydantic_evals.generation.generate_dataset_for_agent] to automatically extract types from the agent and generate test cases. This is simpler than `generate_dataset` because you don't need to manually specify the dataset type or generic parameters.

```python {title="generate_from_agent_example.py"}
from pydantic import BaseModel

from pydantic_ai import Agent
from pydantic_evals.generation import generate_dataset_for_agent


class AnswerOutput(BaseModel):
    """Model for expected answer outputs."""

    answer: str
    confidence: float


agent = Agent(  # (1)!
    'openai:gpt-4o',
    output_type=AnswerOutput,
    system_prompt='You are a helpful assistant that answers questions about world geography.',
)


async def main():
    dataset = await generate_dataset_for_agent(  # (2)!
        agent=agent,
        n_examples=3,
        model='openai:gpt-4o',
        path='agent_test_cases.json',
        extra_instructions='Generate questions about world capitals and landmarks.',
    )
    print(f'Generated {len(dataset.cases)} test cases')
```

1. Create an agent with a defined output type and system prompt.
2. Generate test cases by extracting types from the agent. The function will:
    - Use an LLM to generate diverse input prompts based on the agent's configuration
    - Run each input through the actual agent to get real outputs
    - Save the inputs and outputs as test cases

This approach ensures your test cases use realistic outputs from your actual agent, rather than having an LLM imagine what the outputs should be. Because the expected outputs simply capture the agent's current behaviour, review them (and adjust where needed) before treating them as ground truth.

_(This example is complete, it can be run "as is" — you'll need to add `asyncio.run(main())` to run `main`)_
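
If you passed `path`, you can reload the saved dataset to review the generated cases. A minimal sketch, reusing the `agent_test_cases.json` file from the example above:

```python
from pydantic_evals import Dataset

# Reload the dataset written by generate_dataset_for_agent.
dataset = Dataset.from_file('agent_test_cases.json')

for case in dataset.cases:
    # Inspect each captured input/output pair; edit the file if an
    # expected output isn't what the agent *should* produce.
    print(case.name, case.inputs, case.expected_output)
```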

## Integration with Logfire

Pydantic Evals is implemented using OpenTelemetry to record traces of the evaluation process. These traces contain all

**pydantic_evals/pydantic_evals/generation.py** (109 additions, 7 deletions)

@@ -6,26 +6,26 @@

from __future__ import annotations

import json
import re
from collections.abc import Sequence
from pathlib import Path
from typing import Any
from typing import Any, cast

from pydantic import ValidationError
from pydantic import BaseModel, ValidationError
from typing_extensions import TypeVar

from pydantic_ai import Agent, models
from pydantic_ai._utils import strip_markdown_fences
from pydantic_evals import Dataset
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators.evaluator import Evaluator

__all__ = ('generate_dataset',)
__all__ = ('generate_dataset', 'generate_dataset_for_agent')

InputsT = TypeVar('InputsT', default=Any)
"""Generic type for the inputs to the task being evaluated."""

OutputT = TypeVar('OutputT', default=Any)
"""Generic type for the expected output of the task being evaluated."""

MetadataT = TypeVar('MetadataT', default=Any)
"""Generic type for the metadata associated with the task being evaluated."""

@@ -72,7 +72,6 @@ async def generate_dataset(
        output_type=str,
        retries=1,
    )

    result = await agent.run(extra_instructions or 'Please generate the object.')
    output = strip_markdown_fences(result.output)
    try:
@@ -83,3 +82,106 @@
    if path is not None:
        result.to_file(path, custom_evaluator_types=custom_evaluator_types)  # pragma: no cover
    return result


async def generate_dataset_for_agent(
    agent: Agent[Any, OutputT],
    *,
    inputs_type: type[InputsT] = str,
    metadata_type: type[MetadataT] | None = None,
    path: Path | str | None = None,
    model: models.Model | models.KnownModelName = 'openai:gpt-4o',
    n_examples: int = 3,
    extra_instructions: str | None = None,
) -> Dataset[InputsT, OutputT, MetadataT]:
"""Generate evaluation cases by running inputs through a target agent.
Generates diverse inputs and metadata using an LLM, then runs them through the agent
to produce realistic expected outputs for evaluation.
Args:
agent: Pydantic AI agent to extract outputs from.
inputs_type: Type of inputs the agent expects. Defaults to str.
metadata_type: Type for metadata. Defaults to None (uses NoneType).
path: Optional path to save the generated dataset.
model: Pydantic AI model to use for generation. Defaults to 'gpt-4o'.
n_examples: Number of examples to generate. Defaults to 3.
extra_instructions: Optional additional instructions for the LLM.
Returns:
A properly structured Dataset object with generated test cases.
Raises:
ValidationError: If the LLM's response cannot be parsed.
"""
# Get output schema with proper type handling
> **Collaborator:** I don't think this is going to cover a lot of real-world cases. We'll need a way to actually get an agent's complete output schema: #3076 (comment).
>
> **Author:** Yes, this would be way better! Are you on that or open for a contribution?
>
> **Collaborator:** @benomahony I just created an issue for it: #3225. Definitely open to a contribution!

    # Check if it's a Pydantic model class (not an instance) before calling model_json_schema
    output_schema: str
    if isinstance(agent.output_type, type) and issubclass(agent.output_type, BaseModel):
        output_schema = str(agent.output_type.model_json_schema())
    else:
        # For other types (str, custom output specs, etc.), just use string representation
        output_schema = str(agent.output_type)  # type: ignore[arg-type]

    # Get inputs schema with proper type handling
    inputs_schema: str
    if isinstance(inputs_type, type) and issubclass(inputs_type, BaseModel):
        inputs_schema = str(inputs_type.model_json_schema())
    else:
        inputs_schema = str(inputs_type)

    generation_prompt = (
        f'Generate {n_examples} test case inputs for an agent.\n\n'
        f'The agent accepts inputs of type: {inputs_schema}\n'
        f'The agent produces outputs of type: {output_schema}\n\n'
> **Collaborator:** Is this needed at all?

        f'Return a JSON array of objects with "name" (optional string), "inputs" (matching the input type), '
> **Collaborator:** Can we use a structured output_type built using inputs_type instead?

_(A rough sketch of this idea appears after this file's diff.)_

f'and "metadata" (optional, any additional context).\n'
> **Collaborator:** It's unclear to the model what it should use metadata for.
>
> **Author:** Yeah I didn't know how to make this work generically. Maybe we can discard for now?
>
> **Collaborator:** Ok by me

        f'You must not include any characters in your response before the opening [ of the JSON array, or after the closing ].'
        + (f'\n\n{extra_instructions}' if extra_instructions else '')
    )

    gen_agent = Agent(
        model,
        system_prompt=generation_prompt,
        output_type=str,
        retries=1,
    )

    result = await gen_agent.run('Please generate the test case inputs and metadata.')
    output = strip_markdown_fences(result.output).strip()

    try:
        if not output:
            raise ValueError('Empty output after stripping markdown fences')

        # Additional cleanup in case strip_markdown_fences didn't catch everything
> **Collaborator:** Should we extend strip_markdown_fences?
>
> **Author:** @DouweM I've moved that into a separate PR: #3222. This now works with newlines and whitespace.

        # Remove markdown code blocks with optional language identifier
        output = re.sub(r'^```(?:json)?\s*\n?', '', output)
        output = re.sub(r'\n?```\s*$', '', output)
        output = output.strip()

        inputs_metadata: list[dict[str, Any]] = json.loads(output)
        cases: list[Case[InputsT, OutputT, MetadataT]] = []

        for i, item in enumerate(inputs_metadata):
            agent_result = await agent.run(item['inputs'])
            cases.append(
                Case(
                    name=item.get('name', f'case-{i}'),
                    inputs=cast(InputsT, item['inputs']),
                    expected_output=agent_result.output,
> **Collaborator:** This seems odd, as the goal is typically to define expected outputs manually and then evaluate whether the agent run matches them. Now we're just assuming the agent's current responses are the desired responses. I could see that being useful to get a dataset going, but then we'd want to document that you should verify and modify the expected outputs yourself.
>
> **Author:** So my use case is to bootstrap a dataset for a very complex model (https://github.com/bitol-io/open-data-contract-standard/blob/main/schema/odcs-json-schema-latest.json -> 600 lines of pydantic). Creating the structure by hand is painful so I thought a nice helper method would help!
>
> **Collaborator:** I like the idea of a helper method for creating cases, we just can't assume the agent's current output will actually be the expected output, so we should make it clear the user is meant to review this manually.

                    metadata=cast(MetadataT, item.get('metadata')),
                )
            )

        result_dataset: Dataset[InputsT, OutputT, MetadataT] = Dataset(cases=cases, evaluators=[])

    except (json.JSONDecodeError, KeyError, ValidationError) as e:  # pragma: no cover
        print(f'Raw response from model:\n{result.output}\n')
        print(f'After stripping markdown fences:\n{output}')
        raise e

    if path is not None:
        result_dataset.to_file(path)

    return result_dataset
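
As a point of reference for the structured `output_type` discussion above, here is one rough sketch of how the generation agent could return typed cases directly instead of a JSON string. The names `GeneratedCase` and `build_generation_agent` are hypothetical, and this assumes Pydantic AI accepts a parametrized `list[...]` as `output_type`; it is not code from this PR.

```python
from typing import Any, Generic, TypeVar

from pydantic import BaseModel

from pydantic_ai import Agent

InputsT = TypeVar('InputsT')


class GeneratedCase(BaseModel, Generic[InputsT]):
    """One generated test case, typed against the target agent's input type."""

    inputs: InputsT
    name: str | None = None
    metadata: dict[str, Any] | None = None


def build_generation_agent(model: str, inputs_type: type) -> Agent[None, Any]:
    """Build a generation agent whose output is a list of structured cases."""
    # Returning structured cases means no JSON parsing or markdown-fence
    # stripping is needed on the caller's side.
    return Agent(
        model,
        output_type=list[GeneratedCase[inputs_type]],  # type: ignore[valid-type]
        system_prompt='Generate diverse, realistic test case inputs for the target agent.',
    )
```

Awaiting `build_generation_agent('openai:gpt-4o', str).run(...)` would then give `result.output` as a list of `GeneratedCase` instances, removing the manual `json.loads` step in `generate_dataset_for_agent`.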

**tests/evals/test_dataset.py** (9 additions, 0 deletions)

@@ -1470,6 +1470,15 @@ def test_import_generate_dataset():
    assert generate_dataset


def test_import_generate_dataset_from_agent():
    # this function is tough to test in an interesting way outside an example...
    # this at least ensures importing it doesn't fail.
    # TODO: Add an "example" that actually makes use of this functionality
> **Collaborator:** We do need a real test. What makes it tough to test?
>
> **Author:** Honestly I saw the test above and copied it ;) Will take a look at tests for both.

    from pydantic_evals.generation import generate_dataset_for_agent

    assert generate_dataset_for_agent
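
Responding to the review thread above, a test along these lines could exercise the new helper without real LLM calls. This is only a sketch, assuming pydantic_ai's `FunctionModel` and an anyio-based async test setup:

```python
import pytest

from pydantic_ai import Agent
from pydantic_ai.messages import ModelMessage, ModelResponse, TextPart
from pydantic_ai.models.function import AgentInfo, FunctionModel
from pydantic_evals.generation import generate_dataset_for_agent


@pytest.mark.anyio
async def test_generate_dataset_for_agent_with_function_model():
    def gen_model(messages: list[ModelMessage], info: AgentInfo) -> ModelResponse:
        # Canned "LLM" output for the generation agent: one test case input.
        return ModelResponse(parts=[TextPart(content='[{"name": "capital", "inputs": "What is the capital of France?"}]')])

    def target_model(messages: list[ModelMessage], info: AgentInfo) -> ModelResponse:
        # Canned output for the target agent whose behaviour we want to capture.
        return ModelResponse(parts=[TextPart(content='Paris')])

    target_agent = Agent(FunctionModel(target_model), output_type=str)

    dataset = await generate_dataset_for_agent(
        target_agent,
        model=FunctionModel(gen_model),
        n_examples=1,
    )

    assert len(dataset.cases) == 1
    assert dataset.cases[0].inputs == 'What is the capital of France?'
    assert dataset.cases[0].expected_output == 'Paris'
```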


def test_evaluate_non_serializable_inputs():
    @dataclass
    class MyInputs: