From 7bb085cf499a070351768e3d2bd181877aef731c Mon Sep 17 00:00:00 2001 From: benomahony Date: Thu, 16 Oct 2025 16:23:05 +0100 Subject: [PATCH 1/4] Add generate_dataset_from_agent --- pydantic_evals/pydantic_evals/generation.py | 118 ++++++++++++++++++-- tests/evals/test_dataset.py | 9 ++ 2 files changed, 118 insertions(+), 9 deletions(-) diff --git a/pydantic_evals/pydantic_evals/generation.py b/pydantic_evals/pydantic_evals/generation.py index 8a5c52c8d3..c2ee542608 100644 --- a/pydantic_evals/pydantic_evals/generation.py +++ b/pydantic_evals/pydantic_evals/generation.py @@ -6,26 +6,25 @@ from __future__ import annotations +import json +import re from collections.abc import Sequence from pathlib import Path -from typing import Any +from typing import Any, TypeVar -from pydantic import ValidationError -from typing_extensions import TypeVar +from pydantic import BaseModel, ValidationError from pydantic_ai import Agent, models from pydantic_ai._utils import strip_markdown_fences -from pydantic_evals import Dataset +from pydantic_evals import Case, Dataset from pydantic_evals.evaluators.evaluator import Evaluator -__all__ = ('generate_dataset',) +__all__ = ('generate_dataset', 'generate_dataset_for_agent') InputsT = TypeVar('InputsT', default=Any) """Generic type for the inputs to the task being evaluated.""" - OutputT = TypeVar('OutputT', default=Any) """Generic type for the expected output of the task being evaluated.""" - MetadataT = TypeVar('MetadataT', default=Any) """Generic type for the metadata associated with the task being evaluated.""" @@ -40,7 +39,6 @@ async def generate_dataset( extra_instructions: str | None = None, ) -> Dataset[InputsT, OutputT, MetadataT]: """Use an LLM to generate a dataset of test cases, each consisting of input, expected output, and metadata. - This function creates a properly structured dataset with the specified input, output, and metadata types. It uses an LLM to attempt to generate realistic test cases that conform to the types' schemas. @@ -72,7 +70,6 @@ async def generate_dataset( output_type=str, retries=1, ) - result = await agent.run(extra_instructions or 'Please generate the object.') output = strip_markdown_fences(result.output) try: @@ -83,3 +80,106 @@ async def generate_dataset( if path is not None: result.to_file(path, custom_evaluator_types=custom_evaluator_types) # pragma: no cover return result + + +async def generate_dataset_for_agent( + agent: Agent[Any, OutputT], + *, + inputs_type: type[InputsT] = str, # type: ignore + metadata_type: type[MetadataT] | None = None, + path: Path | str | None = None, + model: models.Model | models.KnownModelName = 'openai:gpt-4o', + n_examples: int = 3, + extra_instructions: str | None = None, +) -> Dataset[InputsT, OutputT, MetadataT]: + """Generate evaluation cases by running inputs through a target agent. + + Generates diverse inputs and metadata using an LLM, then runs them through the agent + to produce realistic expected outputs for evaluation. + + Args: + agent: Pydantic AI agent to extract outputs from. + inputs_type: Type of inputs the agent expects. Defaults to str. + metadata_type: Type for metadata. Defaults to None (uses NoneType). + path: Optional path to save the generated dataset. + model: Pydantic AI model to use for generation. Defaults to 'gpt-4o'. + n_examples: Number of examples to generate. Defaults to 3. + extra_instructions: Optional additional instructions for the LLM. + + Returns: + A properly structured Dataset object with generated test cases. 
+ + Raises: + ValidationError: If the LLM's response cannot be parsed. + """ + # Get output schema with proper type handling + # Check if it's a Pydantic model class (not an instance) before calling model_json_schema + output_schema: str + if isinstance(agent.output_type, type) and issubclass(agent.output_type, BaseModel): + output_schema = str(agent.output_type.model_json_schema()) + else: + # For other types (str, custom output specs, etc.), just use string representation + output_schema = str(agent.output_type) + + # Get inputs schema with proper type handling + inputs_schema: str + if issubclass(inputs_type, BaseModel): + inputs_schema = str(inputs_type.model_json_schema()) + else: + inputs_schema = str(inputs_type) + + generation_prompt = ( + f'Generate {n_examples} test case inputs for an agent.\n\n' + f'The agent accepts inputs of type: {inputs_schema}\n' + f'The agent produces outputs of type: {output_schema}\n\n' + f'Return a JSON array of objects with "name" (optional string), "inputs" (matching the input type), ' + f'and "metadata" (optional, any additional context).\n' + f'You must not include any characters in your response before the opening [ of the JSON array, or after the closing ].' + + (f'\n\n{extra_instructions}' if extra_instructions else '') + ) + + gen_agent = Agent( + model, + system_prompt=generation_prompt, + output_type=str, + retries=1, + ) + + result = await gen_agent.run('Please generate the test case inputs and metadata.') + output = strip_markdown_fences(result.output).strip() + + try: + if not output: + raise ValueError('Empty output after stripping markdown fences') + + # Additional cleanup in case strip_markdown_fences didn't catch everything + # Remove markdown code blocks with optional language identifier + output = re.sub(r'^```(?:json)?\s*\n?', '', output) + output = re.sub(r'\n?```\s*$', '', output) + output = output.strip() + + inputs_metadata = json.loads(output) + cases = [] + + for i, item in enumerate(inputs_metadata): + agent_result = await agent.run(item['inputs']) + cases.append( + Case( + name=item.get('name', f'case-{i}'), + inputs=item['inputs'], + expected_output=agent_result.output, + metadata=item.get('metadata'), + ) + ) + + result_dataset = Dataset(cases=cases, evaluators=[]) + + except (json.JSONDecodeError, KeyError, ValidationError) as e: # pragma: no cover + print(f'Raw response from model:\n{result.output}\n') + print(f'After stripping markdown fences:\n{output}') + raise e + + if path is not None: + result_dataset.to_file(path) + + return result_dataset diff --git a/tests/evals/test_dataset.py b/tests/evals/test_dataset.py index d2b2abc008..db0d155a33 100644 --- a/tests/evals/test_dataset.py +++ b/tests/evals/test_dataset.py @@ -1470,6 +1470,15 @@ def test_import_generate_dataset(): assert generate_dataset +def test_import_generate_dataset_from_agent(): + # this function is tough to test in an interesting way outside an example... + # this at least ensures importing it doesn't fail. 
+    # TODO: Add an "example" that actually makes use of this functionality
+    from pydantic_evals.generation import generate_dataset_for_agent
+
+    assert generate_dataset_for_agent
+
+
 def test_evaluate_non_serializable_inputs():
     @dataclass
     class MyInputs:

From 95dbb2b60ecadc6b51cec4430a96c7c1e527834f Mon Sep 17 00:00:00 2001
From: benomahony
Date: Thu, 16 Oct 2025 16:29:53 +0100
Subject: [PATCH 2/4] Add docs

---
 docs/evals.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/docs/evals.md b/docs/evals.md
index 68ec8184ee..d00c2efe5d 100644
--- a/docs/evals.md
+++ b/docs/evals.md
@@ -766,6 +766,52 @@ async def main():
 
 _(This example is complete, it can be run "as is" — you'll need to add `asyncio.run(main(answer))` to run `main`)_
 
+### Generating from an Existing Agent
+
+If you already have an agent, you can use [`generate_evals_from_agent`][pydantic_evals.generation.generate_evals_from_agent] to automatically extract types from the agent and generate test cases. This is simpler than `generate_dataset` because you don't need to manually specify the dataset type or generic parameters.
+
+```python {title="generate_from_agent_example.py"}
+from pydantic import BaseModel
+from pydantic_ai import Agent
+
+from pydantic_evals.generation import generate_evals_from_agent
+
+
+class AnswerOutput(BaseModel):
+    """Model for expected answer outputs."""
+
+    answer: str
+    confidence: float
+
+
+agent = Agent(  # (1)!
+    'openai:gpt-4o',
+    output_type=AnswerOutput,
+    system_prompt='You are a helpful assistant that answers questions about world geography.',
+)
+
+
+async def main():
+    dataset = await generate_evals_from_agent(  # (2)!
+        agent=agent,
+        n_examples=3,
+        model='openai:gpt-4o',
+        path='agent_test_cases.json',
+        extra_instructions='Generate questions about world capitals and landmarks.',
+    )
+    print(f'Generated {len(dataset.cases)} test cases')
+```
+
+1. Create an agent with a defined output type and system prompt.
+2. Generate test cases by extracting types from the agent. The function will:
+    - Use an LLM to generate diverse input prompts based on the agent's configuration
+    - Run each input through the actual agent to get real outputs
+    - Save the inputs and outputs as test cases
+
+This approach ensures your test cases use realistic outputs from your actual agent, rather than having an LLM imagine what the outputs should be.
+
+_(This example is complete, it can be run "as is" — you'll need to add `asyncio.run(main())` to run `main`)_
+
 ## Integration with Logfire
 
 Pydantic Evals is implemented using OpenTelemetry to record traces of the evaluation process. These traces contain all

From a796233d6da5505e326ed0f4e4521ad22658c1b8 Mon Sep 17 00:00:00 2001
From: benomahony
Date: Thu, 16 Oct 2025 16:38:41 +0100
Subject: [PATCH 3/4] Fix typing for lower python versions

---
 docs/evals.md                               |  6 +++---
 pydantic_evals/pydantic_evals/generation.py | 20 +++++++++++---------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/docs/evals.md b/docs/evals.md
index d00c2efe5d..be84378307 100644
--- a/docs/evals.md
+++ b/docs/evals.md
@@ -768,13 +768,13 @@ _(This example is complete, it can be run "as is" — you'll need to add `asynci
 
 ### Generating from an Existing Agent
 
-If you already have an agent, you can use [`generate_evals_from_agent`][pydantic_evals.generation.generate_evals_from_agent] to automatically extract types from the agent and generate test cases. This is simpler than `generate_dataset` because you don't need to manually specify the dataset type or generic parameters.
+If you already have an agent, you can use [`generate_dataset_for_agent`][pydantic_evals.generation.generate_dataset_for_agent] to automatically extract types from the agent and generate test cases. This is simpler than `generate_dataset` because you don't need to manually specify the dataset type or generic parameters.
 
 ```python {title="generate_from_agent_example.py"}
 from pydantic import BaseModel
 from pydantic_ai import Agent
 
-from pydantic_evals.generation import generate_evals_from_agent
+from pydantic_evals.generation import generate_dataset_for_agent
 
 
 class AnswerOutput(BaseModel):
@@ -792,7 +792,7 @@ agent = Agent(  # (1)!
 
 
 async def main():
-    dataset = await generate_evals_from_agent(  # (2)!
+    dataset = await generate_dataset_for_agent(  # (2)!
         agent=agent,
         n_examples=3,
         model='openai:gpt-4o',
diff --git a/pydantic_evals/pydantic_evals/generation.py b/pydantic_evals/pydantic_evals/generation.py
index c2ee542608..fcc8f1ff80 100644
--- a/pydantic_evals/pydantic_evals/generation.py
+++ b/pydantic_evals/pydantic_evals/generation.py
@@ -10,9 +10,10 @@
 import re
 from collections.abc import Sequence
 from pathlib import Path
-from typing import Any, TypeVar
+from typing import Any, cast
 
 from pydantic import BaseModel, ValidationError
+from typing_extensions import TypeVar
 
 from pydantic_ai import Agent, models
 from pydantic_ai._utils import strip_markdown_fences
@@ -39,6 +40,7 @@ async def generate_dataset(
     extra_instructions: str | None = None,
 ) -> Dataset[InputsT, OutputT, MetadataT]:
     """Use an LLM to generate a dataset of test cases, each consisting of input, expected output, and metadata.
+
     This function creates a properly structured dataset with the specified input, output, and metadata types.
     It uses an LLM to attempt to generate realistic test cases that conform to the types' schemas.
 
@@ -85,7 +87,7 @@ async def generate_dataset(
 async def generate_dataset_for_agent(
     agent: Agent[Any, OutputT],
     *,
-    inputs_type: type[InputsT] = str,  # type: ignore
+    inputs_type: type[InputsT] = str,  # type: ignore[assignment]
     metadata_type: type[MetadataT] | None = None,
     path: Path | str | None = None,
     model: models.Model | models.KnownModelName = 'openai:gpt-4o',
@@ -123,7 +125,7 @@ async def generate_dataset_for_agent(
 
     # Get inputs schema with proper type handling
     inputs_schema: str
-    if issubclass(inputs_type, BaseModel):
+    if isinstance(inputs_type, type) and issubclass(inputs_type, BaseModel):
         inputs_schema = str(inputs_type.model_json_schema())
     else:
         inputs_schema = str(inputs_type)
@@ -158,21 +160,21 @@ async def generate_dataset_for_agent(
         output = re.sub(r'\n?```\s*$', '', output)
         output = output.strip()
 
-        inputs_metadata = json.loads(output)
-        cases = []
+        inputs_metadata: list[dict[str, Any]] = json.loads(output)
+        cases: list[Case[InputsT, OutputT, MetadataT]] = []
 
         for i, item in enumerate(inputs_metadata):
             agent_result = await agent.run(item['inputs'])
             cases.append(
                 Case(
                     name=item.get('name', f'case-{i}'),
-                    inputs=item['inputs'],
-                    expected_output=agent_result.output,
-                    metadata=item.get('metadata'),
+                    inputs=cast(InputsT, item['inputs']),
+                    expected_output=cast(OutputT, agent_result.output),
+                    metadata=cast(MetadataT, item.get('metadata')),
                 )
             )
 
-        result_dataset = Dataset(cases=cases, evaluators=[])
+        result_dataset: Dataset[InputsT, OutputT, MetadataT] = Dataset(cases=cases, evaluators=[])
 
     except (json.JSONDecodeError, KeyError, ValidationError) as e:  # pragma: no cover
         print(f'Raw response from model:\n{result.output}\n')

From b70edef52e334c39e187dd1d7cc0157ac30c696a Mon Sep 17 00:00:00 2001
From: benomahony
Date: Thu, 16 Oct 2025 17:04:19 +0100
Subject: [PATCH 4/4] WIP Fixing docs and more typing

---
 docs/evals.md                               | 2 +-
 pydantic_evals/pydantic_evals/generation.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/evals.md b/docs/evals.md
index be84378307..581f0017a3 100644
--- a/docs/evals.md
+++ b/docs/evals.md
@@ -772,8 +772,8 @@ If you already have an agent, you can use [`generate_dataset_for_agent`][pydanti
 ```python {title="generate_from_agent_example.py"}
 from pydantic import BaseModel
-from pydantic_ai import Agent
 
+from pydantic_ai import Agent
 from pydantic_evals.generation import generate_dataset_for_agent
 
 
diff --git a/pydantic_evals/pydantic_evals/generation.py b/pydantic_evals/pydantic_evals/generation.py
index fcc8f1ff80..b5f59994ea 100644
--- a/pydantic_evals/pydantic_evals/generation.py
+++ b/pydantic_evals/pydantic_evals/generation.py
@@ -87,7 +87,7 @@ async def generate_dataset(
 async def generate_dataset_for_agent(
     agent: Agent[Any, OutputT],
     *,
-    inputs_type: type[InputsT] = str,  # type: ignore[assignment]
+    inputs_type: type[InputsT] = str,
     metadata_type: type[MetadataT] | None = None,
     path: Path | str | None = None,
     model: models.Model | models.KnownModelName = 'openai:gpt-4o',
@@ -121,7 +121,7 @@ async def generate_dataset_for_agent(
         output_schema = str(agent.output_type.model_json_schema())
     else:
         # For other types (str, custom output specs, etc.), just use string representation
-        output_schema = str(agent.output_type)
+        output_schema = str(agent.output_type)  # type: ignore[arg-type]
 
     # Get inputs schema with proper type handling
     inputs_schema: str
@@ -169,7 +169,7 @@ async def generate_dataset_for_agent(
                 Case(
                     name=item.get('name', f'case-{i}'),
                     inputs=cast(InputsT, item['inputs']),
-                    expected_output=cast(OutputT, agent_result.output),
+                    expected_output=agent_result.output,
                     metadata=cast(MetadataT, item.get('metadata')),
                 )
             )
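
Below is a minimal usage sketch of the helper added by this series. It is not taken from the diff itself: it assumes the default `openai:gpt-4o` model, a plain-text agent, and that the generated dataset is then scored via `Dataset.evaluate`; the `geography_agent` name, the prompt text, and the choice of `EqualsExpected` are illustrative only.

```python
import asyncio

from pydantic_ai import Agent
from pydantic_evals.evaluators import EqualsExpected
from pydantic_evals.generation import generate_dataset_for_agent

# Illustrative agent; any Agent works, including ones with a structured output_type.
geography_agent = Agent(
    'openai:gpt-4o',
    system_prompt='Answer questions about world geography concisely.',
)


async def main():
    # An LLM proposes diverse inputs, each input is run through `geography_agent`,
    # and the agent's real outputs are stored as the expected outputs.
    dataset = await generate_dataset_for_agent(geography_agent, n_examples=3)

    # The result is an ordinary pydantic_evals Dataset, so it can be evaluated as usual.
    dataset.add_evaluator(EqualsExpected())

    async def task(inputs: str) -> str:
        result = await geography_agent.run(inputs)
        return result.output

    report = await dataset.evaluate(task)
    report.print(include_input=True, include_output=True)


if __name__ == '__main__':
    asyncio.run(main())
```

Exact-match scoring only makes sense for fairly deterministic agents; for free-form outputs, swapping in `LLMJudge` or a custom evaluator is the more typical choice.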