113 changes: 113 additions & 0 deletions pydantic_evals/pydantic_evals/tournament.py
@@ -0,0 +1,113 @@
from __future__ import annotations as _annotations

import textwrap
from enum import Enum

from pydantic import BaseModel, Field

from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider
from pydantic_ai.settings import ModelSettings


class EvalPlayer(BaseModel):
"""Player for the Bradley-Terry algorithm."""

idx: int = Field(..., description='unique identifier for the player')
item: str = Field(..., description='item to be scored')
score: float | None = Field(default=None, description='Bradley-Terry strength score for the item')

class GameResult(str, Enum):
"""Possible outcomes of an evaluation game."""
A = 'A'
B = 'B'

evaluation_instructions = """
You are presented with a question and two possible answers, A and B. Evaluate carefully whether answer A or answer B is the better reply;
these are your only two options. Your evaluations contribute to Bradley-Terry scores across multiple items, so consistency and
objectivity are critical for reliable rankings. Each comparison should be independent but internally consistent.

<EXAMPLES>
Example 1:
<QUESTION> Which of the two ice cream flavours below is more creative? </QUESTION>
<A> Vanilla </A>
<B> Pickled Citrus Ribbon </B>
Expected output:
{
    "response": "B"
}

Example 2:
<QUESTION> Which search query shows more genuine curiosity? </QUESTION>
<A> effect of ocean acidification feedback loops on Arctic methane release </A>
<B> climate change effects </B>
Expected output:
{
    "response": "A"
}

Example 3:
<QUESTION> Which reply is more insulting? </QUESTION>
<A> Your argument lacks logical coherence and fails to address the core issue at hand. </A>
<B> That's an interesting perspective, though I see it differently. </B>
Expected output:
{
    "response": "A"
}
</EXAMPLES>

<REQUIREMENTS>
1. Consider the question carefully. What aspects are important for the answer?
2. Think about answer A. Is it a good answer to the question? Why (not)?
3. Think about answer B. Is it a good answer to the question? Why (not)?
4. Make a decision based on your analysis.
</REQUIREMENTS>

<OUTPUT_FORMAT>
You must respond with valid JSON containing exactly one field called "response" with value "A" or "B":

{
    "response": "A"
}
or
{
    "response": "B"
}

Do NOT include explanations, reasoning, or any other fields.
</OUTPUT_FORMAT>
"""

evaluation_agent = Agent(
    model=OpenAIChatModel(
        model_name='qwen2.5:72b',
        provider=OpenAIProvider(base_url='http://localhost:11434/v1'),
    ),
    output_type=GameResult,
    system_prompt=evaluation_instructions,
    retries=5,
    instrument=True,
)

class EvalGame(BaseModel):
"""A game between two EvalPlayers."""
criterion: str = Field(..., description='evaluation criterion on which players should be judged')

    async def run(self, players: tuple[EvalPlayer, EvalPlayer], agent: Agent, model_settings: ModelSettings) -> tuple[int, int]:
        """Run a single A-vs-B comparison and return the pair (winner_idx, loser_idx)."""
        prompt = textwrap.dedent(f"""
            <QUESTION> {self.criterion} </QUESTION>
            <A> {players[0].item} </A>
            <B> {players[1].item} </B>
        """)

        async with agent:
            result = await agent.run(
                user_prompt=prompt,
                model_settings=model_settings,
            )

        if result.output == GameResult.A:
            return (players[0].idx, players[1].idx)
        else:
            return (players[1].idx, players[0].idx)
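
Not part of this diff, but for context on how the pieces fit together: EvalGame.run returns a (winner_idx, loser_idx) tuple, and EvalPlayer carries an optional Bradley-Terry score field, so a later step presumably fits strengths from many such outcomes. Below is a minimal sketch of that fitting step using the standard MM (Zermelo) update; the function name fit_bradley_terry is hypothetical and not defined anywhere in this PR.

from collections import defaultdict


def fit_bradley_terry(outcomes: list[tuple[int, int]], n_iters: int = 100) -> dict[int, float]:
    """Estimate Bradley-Terry strengths from (winner_idx, loser_idx) outcomes via the MM update."""
    wins: defaultdict[int, int] = defaultdict(int)  # total wins per player
    pair_games: defaultdict[tuple[int, int], int] = defaultdict(int)  # games per unordered pair
    players: set[int] = set()
    for winner, loser in outcomes:
        wins[winner] += 1
        pair_games[(min(winner, loser), max(winner, loser))] += 1
        players.update((winner, loser))

    strength = {p: 1.0 for p in players}
    for _ in range(n_iters):
        updated: dict[int, float] = {}
        for p in players:
            # Sum over every opponent p has faced: games / (strength_p + strength_opponent).
            denom = sum(
                n / max(strength[p] + strength[a if b == p else b], 1e-9)
                for (a, b), n in pair_games.items()
                if p in (a, b)
            )
            updated[p] = wins[p] / denom if denom else strength[p]
        total = sum(updated.values()) or 1.0
        strength = {p: s / total for p, s in updated.items()}  # normalise each round for stability
    return strength

The returned values could then be written back onto EvalPlayer.score to produce a ranking; note that a player that never wins collapses to a score of 0 under this unregularised sketch.
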
15 changes: 15 additions & 0 deletions tests/conftest.py
@@ -25,6 +25,7 @@
import pydantic_ai.models
from pydantic_ai import Agent, BinaryContent
from pydantic_ai.models import Model, cached_async_http_client
from pydantic_evals.tournament import EvalPlayer

__all__ = (
    'IsDatetime',
@@ -552,3 +553,17 @@ def generate_snapshot_id(node_id: str) -> str:
        return f'{node_id}:{i}'

    return mocker.patch('pydantic_graph.nodes.generate_snapshot_id', side_effect=generate_snapshot_id)


@pytest.fixture
def ice_cream_players() -> list[EvalPlayer]:
"""
Provide a list of EvalPlayer instances with ice cream flavours.
"""
return [
EvalPlayer(idx=0, item='vanilla'),
EvalPlayer(idx=1, item='chocolate'),
EvalPlayer(idx=2, item='strawberry'),
EvalPlayer(idx=3, item='peach'),
EvalPlayer(idx=4, item='toasted rice & miso caramel ice cream'),
]
44 changes: 44 additions & 0 deletions tests/evals/test_tournament.py
@@ -0,0 +1,44 @@
from __future__ import annotations as _annotations

import pytest

from pydantic_ai.settings import ModelSettings
from pydantic_evals.tournament import EvalGame, EvalPlayer, evaluation_agent

MODEL_SETTINGS = ModelSettings(
    temperature=0.0,  # Model needs to be deterministic for VCR recording to work.
    timeout=300,
)

def test_evalplayer() -> None:
"""
Test the EvalPlayer class.
"""

player = EvalPlayer(
idx=42,
item='toasted rice & miso caramel ice cream',
)
assert player.idx == 42
assert player.item == 'toasted rice & miso caramel ice cream'


@pytest.mark.anyio
async def test_evalgame(ice_cream_players: list[EvalPlayer]) -> None:
"""
Test the EvalGame class.
"""

game = EvalGame(criterion='Which of the two ice cream flavours A or B is more creative?')
assert game.criterion == 'Which of the two ice cream flavours A or B is more creative?'

result = await game.run(
players=(ice_cream_players[0], ice_cream_players[4]),
agent=evaluation_agent,
model_settings=MODEL_SETTINGS,
)

assert isinstance(result, tuple)
assert len(result) == 2
assert all(isinstance(r, int) for r in result)
assert result[0] == 4 # Toasted rice & miso caramel ice cream flavour is more creative.
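
A natural follow-up (not included in this PR) would be a round-robin driver that plays every unordered pair of players and collects the outcome tuples for a Bradley-Terry fit. A rough sketch, reusing the agent and model settings from the test above; the helper name run_round_robin is hypothetical.

from itertools import combinations

from pydantic_ai.settings import ModelSettings
from pydantic_evals.tournament import EvalGame, EvalPlayer, evaluation_agent


async def run_round_robin(
    players: list[EvalPlayer],
    criterion: str,
    model_settings: ModelSettings,
) -> list[tuple[int, int]]:
    """Play every unordered pair once and return the (winner_idx, loser_idx) tuples."""
    game = EvalGame(criterion=criterion)
    outcomes: list[tuple[int, int]] = []
    for a, b in combinations(players, 2):  # n * (n - 1) / 2 games, fixed A/B order within each pair
        outcomes.append(
            await game.run(players=(a, b), agent=evaluation_agent, model_settings=model_settings)
        )
    return outcomes

For n players this runs n(n-1)/2 comparisons sequentially; the games could also be gathered concurrently, and each pair could be replayed with A and B swapped to reduce position bias, though neither is shown here.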