DRAFT: Add generate_dataset_from_agent to Pydantic Evals #3187
base: main
Changes from all commits: 7bb085c, 95dbb2b, a796233, b70edef, 8eb6f05
Changes to the generation module (`pydantic_evals.generation`):

@@ -6,26 +6,26 @@
 from __future__ import annotations

 import json
 import re
 from collections.abc import Sequence
 from pathlib import Path
-from typing import Any
+from typing import Any, cast

-from pydantic import ValidationError
+from pydantic import BaseModel, ValidationError
 from typing_extensions import TypeVar

 from pydantic_ai import Agent, models
 from pydantic_ai._utils import strip_markdown_fences
-from pydantic_evals import Dataset
+from pydantic_evals import Case, Dataset
 from pydantic_evals.evaluators.evaluator import Evaluator

-__all__ = ('generate_dataset',)
+__all__ = ('generate_dataset', 'generate_dataset_for_agent')

 InputsT = TypeVar('InputsT', default=Any)
 """Generic type for the inputs to the task being evaluated."""

 OutputT = TypeVar('OutputT', default=Any)
 """Generic type for the expected output of the task being evaluated."""

 MetadataT = TypeVar('MetadataT', default=Any)
 """Generic type for the metadata associated with the task being evaluated."""

@@ -72,7 +72,6 @@ async def generate_dataset(
         output_type=str,
         retries=1,
     )
-
     result = await agent.run(extra_instructions or 'Please generate the object.')
     output = strip_markdown_fences(result.output)
     try:

@@ -83,3 +82,106 @@ async def generate_dataset(
     if path is not None:
         result.to_file(path, custom_evaluator_types=custom_evaluator_types)  # pragma: no cover
     return result
+
+
+async def generate_dataset_for_agent(
+    agent: Agent[Any, OutputT],
+    *,
+    inputs_type: type[InputsT] = str,
+    metadata_type: type[MetadataT] | None = None,
+    path: Path | str | None = None,
+    model: models.Model | models.KnownModelName = 'openai:gpt-4o',
+    n_examples: int = 3,
+    extra_instructions: str | None = None,
+) -> Dataset[InputsT, OutputT, MetadataT]:
+    """Generate evaluation cases by running inputs through a target agent.
+
+    Generates diverse inputs and metadata using an LLM, then runs them through the agent
+    to produce realistic expected outputs for evaluation.
+
+    Args:
+        agent: Pydantic AI agent to extract outputs from.
+        inputs_type: Type of inputs the agent expects. Defaults to str.
+        metadata_type: Type for metadata. Defaults to None (uses NoneType).
+        path: Optional path to save the generated dataset.
+        model: Pydantic AI model to use for generation. Defaults to 'openai:gpt-4o'.
+        n_examples: Number of examples to generate. Defaults to 3.
+        extra_instructions: Optional additional instructions for the LLM.
+
+    Returns:
+        A properly structured Dataset object with generated test cases.
+
+    Raises:
+        ValidationError: If the LLM's response cannot be parsed.
+    """
+    # Get output schema with proper type handling
> **Review comment:** I don't think this is going to cover a lot of real-world cases. We'll need a way to actually get an agent's complete output schema: #3076 (comment).
>
> **Reply:** Yes, this would be way better! Are you on that or open for a contribution?
>
> **Reply:** @benomahony I just created an issue for it: #3225. Definitely open to a contribution!
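To illustrate the concern, here is a small sketch (the `SupportTicket` and `Escalation` models are invented for illustration): only a plain `BaseModel` output type takes the `model_json_schema()` branch below, while a union-style output spec falls through to the bare `str()` representation, which tells the generation model very little about the expected output.

```python
from pydantic import BaseModel

from pydantic_ai import Agent


class SupportTicket(BaseModel):
    title: str
    priority: int


class Escalation(BaseModel):
    reason: str


# A single BaseModel output type would take the model_json_schema() branch below.
simple_agent = Agent('openai:gpt-4o', output_type=SupportTicket)

# A union-style output spec (a list of possible output types) is not a BaseModel subclass,
# so the prompt would only receive something like "[<class 'SupportTicket'>, <class 'Escalation'>]".
union_agent = Agent('openai:gpt-4o', output_type=[SupportTicket, Escalation])
```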
| # Check if it's a Pydantic model class (not an instance) before calling model_json_schema | ||
| output_schema: str | ||
| if isinstance(agent.output_type, type) and issubclass(agent.output_type, BaseModel): | ||
| output_schema = str(agent.output_type.model_json_schema()) | ||
| else: | ||
| # For other types (str, custom output specs, etc.), just use string representation | ||
| output_schema = str(agent.output_type) # type: ignore[arg-type] | ||
|
|
||
| # Get inputs schema with proper type handling | ||
| inputs_schema: str | ||
| if isinstance(inputs_type, type) and issubclass(inputs_type, BaseModel): | ||
| inputs_schema = str(inputs_type.model_json_schema()) | ||
| else: | ||
| inputs_schema = str(inputs_type) | ||
|
|
||
| generation_prompt = ( | ||
| f'Generate {n_examples} test case inputs for an agent.\n\n' | ||
| f'The agent accepts inputs of type: {inputs_schema}\n' | ||
| f'The agent produces outputs of type: {output_schema}\n\n' | ||
|
> **Review comment:** Is this needed at all?
+        f'Return a JSON array of objects with "name" (optional string), "inputs" (matching the input type), '
> **Review comment:** Can we use a structured […]
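One reading of this (truncated) suggestion is to give the generation agent a structured `output_type` instead of asking for raw JSON and parsing it by hand. A hedged sketch under that assumption, with an invented `CaseSpec`/`CaseList` schema:

```python
from typing import Any

from pydantic import BaseModel

from pydantic_ai import Agent


class CaseSpec(BaseModel):
    """One generated test case."""

    name: str | None = None
    inputs: Any = None
    metadata: dict[str, Any] | None = None


class CaseList(BaseModel):
    cases: list[CaseSpec]


# The generation agent would then return validated CaseSpec objects directly,
# so json.loads, strip_markdown_fences, and the regex cleanup below would not be needed.
gen_agent = Agent(
    'openai:gpt-4o',
    output_type=CaseList,
    system_prompt='Generate diverse test case inputs for the target agent.',
)
```

With structured output, the "no characters before/after the JSON array" instruction in the prompt would also become unnecessary, since validation is handled by the output machinery.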
| f'and "metadata" (optional, any additional context).\n' | ||
|
> **Review comment:** It's unclear to the model what it should use metadata for.
>
> **Reply:** Yeah, I didn't know how to make this work generically. Maybe we can discard it for now?
>
> **Reply:** Ok by me.
+        f'You must not include any characters in your response before the opening [ of the JSON array, or after the closing ].'
+        + (f'\n\n{extra_instructions}' if extra_instructions else '')
+    )
+
+    gen_agent = Agent(
+        model,
+        system_prompt=generation_prompt,
+        output_type=str,
+        retries=1,
+    )
+
+    result = await gen_agent.run('Please generate the test case inputs and metadata.')
+    output = strip_markdown_fences(result.output).strip()
+
+    try:
+        if not output:
+            raise ValueError('Empty output after stripping markdown fences')
+
+        # Additional cleanup in case strip_markdown_fences didn't catch everything
> **Review comment:** Should we extend […]
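For reference, a standalone sketch of the extra cleanup being discussed, using only the standard library (the sample response string is invented):

```python
import re

# Simulate a model response wrapped in a fenced ```json block.
fence = '`' * 3
raw = f'{fence}json\n[{{"name": "case-0", "inputs": "hello"}}]\n{fence}'

# Same cleanup as in the function body: strip an opening fence with an
# optional language tag, then a trailing fence, then surrounding whitespace.
cleaned = re.sub(r'^```(?:json)?\s*\n?', '', raw.strip())
cleaned = re.sub(r'\n?```\s*$', '', cleaned)
print(cleaned.strip())  # [{"name": "case-0", "inputs": "hello"}]
```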
+        # Remove markdown code blocks with optional language identifier
+        output = re.sub(r'^```(?:json)?\s*\n?', '', output)
+        output = re.sub(r'\n?```\s*$', '', output)
+        output = output.strip()
+
+        inputs_metadata: list[dict[str, Any]] = json.loads(output)
+        cases: list[Case[InputsT, OutputT, MetadataT]] = []
+
+        for i, item in enumerate(inputs_metadata):
+            agent_result = await agent.run(item['inputs'])
+            cases.append(
+                Case(
+                    name=item.get('name', f'case-{i}'),
+                    inputs=cast(InputsT, item['inputs']),
+                    expected_output=agent_result.output,
> **Review comment:** This seems odd, as the goal is typically to define expected outputs manually and then evaluate whether the agent run matches them. Now we're just assuming the agent's current responses are the desired responses. I could see that being useful to get a dataset going, but then we'd want to document that you should verify and modify the expected outputs yourself.
>
> **Reply:** My use case is to bootstrap a dataset for a very complex model (https://github.com/bitol-io/open-data-contract-standard/blob/main/schema/odcs-json-schema-latest.json -> 600 lines of Pydantic). Creating the structure by hand is painful, so I thought a nice helper method would help!
>
> **Reply:** I like the idea of a helper method for creating cases, we just can't assume the agent's current output will actually be the expected output, so we should make it clear the user is meant to review this manually.
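Following the reviewer's point, a hedged sketch of the intended workflow (the agent definition and file name are placeholders): treat the generated `expected_output` values as a starting point, review them by hand, and only then save the dataset.

```python
from pydantic_ai import Agent
from pydantic_evals.generation import generate_dataset_for_agent

contract_agent = Agent('openai:gpt-4o', system_prompt='Draft data contracts from user requests.')


async def bootstrap_cases() -> None:
    # The generated expected outputs are just the agent's current answers, not ground truth.
    dataset = await generate_dataset_for_agent(contract_agent, n_examples=5)

    # Manual review pass: inspect (and correct) each case before committing it.
    for case in dataset.cases:
        print(case.name, case.inputs, case.expected_output)

    # Persist only once the expected outputs have been verified.
    dataset.to_file('contract_cases.yaml')
```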
+                    metadata=cast(MetadataT, item.get('metadata')),
+                )
+            )
+
+        result_dataset: Dataset[InputsT, OutputT, MetadataT] = Dataset(cases=cases, evaluators=[])
+
+    except (json.JSONDecodeError, KeyError, ValidationError) as e:  # pragma: no cover
+        print(f'Raw response from model:\n{result.output}\n')
+        print(f'After stripping markdown fences:\n{output}')
+        raise e
+
+    if path is not None:
+        result_dataset.to_file(path)
+
+    return result_dataset
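For orientation, a minimal end-to-end sketch of how the generated dataset could feed into an evaluation run (the agent, the `answer` task function, and the use of the built-in `EqualsExpected` evaluator are assumptions, not part of this PR):

```python
import asyncio

from pydantic_ai import Agent
from pydantic_evals.evaluators import EqualsExpected
from pydantic_evals.generation import generate_dataset_for_agent

support_agent = Agent('openai:gpt-4o', system_prompt='Answer support questions briefly.')


async def answer(question: str) -> str:
    # The task under evaluation: just run the same agent on the case inputs.
    result = await support_agent.run(question)
    return result.output


async def main() -> None:
    # Bootstrap cases from the agent itself, then evaluate the task against them.
    dataset = await generate_dataset_for_agent(support_agent, n_examples=3, path='support_cases.yaml')
    dataset.evaluators.append(EqualsExpected())
    report = await dataset.evaluate(answer)
    report.print()


if __name__ == '__main__':
    asyncio.run(main())
```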
Changes to the tests:

@@ -1470,6 +1470,15 @@ def test_import_generate_dataset():
     assert generate_dataset


+def test_import_generate_dataset_from_agent():
+    # this function is tough to test in an interesting way outside an example...
+    # this at least ensures importing it doesn't fail.
+    # TODO: Add an "example" that actually makes use of this functionality
> **Review comment:** We do need a real test. What makes it tough to test?
>
> **Reply:** Honestly I saw the test above and copied it ;) Will take a look at tests for both.
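In that spirit, one possible direction for a real test, sketched with `TestModel` so no network calls are needed (the `custom_output_text` usage and the anyio marker follow the repo's usual test setup as I understand it; treat this as an assumption, not a finished test):

```python
import json

import pytest

from pydantic_ai import Agent
from pydantic_ai.models.test import TestModel
from pydantic_evals.generation import generate_dataset_for_agent


@pytest.mark.anyio
async def test_generate_dataset_for_agent_with_test_model():
    # The target agent always replies with TestModel's canned text.
    target_agent = Agent(TestModel(custom_output_text='canned answer'))

    # The generation model returns a fixed JSON array of case inputs.
    generation_model = TestModel(
        custom_output_text=json.dumps([{'name': 'case-a', 'inputs': 'What is 2 + 2?'}])
    )

    dataset = await generate_dataset_for_agent(target_agent, model=generation_model, n_examples=1)

    assert len(dataset.cases) == 1
    assert dataset.cases[0].name == 'case-a'
    assert dataset.cases[0].inputs == 'What is 2 + 2?'
    assert dataset.cases[0].expected_output == 'canned answer'
```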
+    from pydantic_evals.generation import generate_dataset_for_agent
+
+    assert generate_dataset_for_agent
+
+
 def test_evaluate_non_serializable_inputs():
     @dataclass
     class MyInputs: