113 changes: 113 additions & 0 deletions pydantic_evals/pydantic_evals/tournament.py
@@ -0,0 +1,113 @@
from __future__ import annotations as _annotations

import textwrap
from enum import Enum

from pydantic import BaseModel, Field

from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider
from pydantic_ai.settings import ModelSettings


class EvalPlayer(BaseModel):
"""Player for the Bradley-Terry algorithm."""

idx: int = Field(..., description='unique identifier for the player')
item: str = Field(..., description='item to be scored')
score: float | None = Field(default=None, description='Bradley-Terry strength score for the item')

class GameResult(str, Enum):
"""Possible outcomes of an evaluation game."""
A = 'A'
B = 'B'

evaluation_instructions = """
You are presented with a question and two possible answers, A and B. Evaluate carefully whether answer A or answer B is the better reply;
these are your only two options. Your evaluations contribute to Bradley-Terry scores across multiple items, so consistency and
objectivity are critical for reliable rankings. Each comparison should be independent but internally consistent.

<EXAMPLES>
Example 1:
<QUESTION> Which of the two ice cream flavours below is more creative? </QUESTION>
<A> Vanilla </A>
<B> Pickled Citrus Ribbon </B>
Expected output:
{
    "response": "B"
}

Example 2:
<QUESTION> Which search query shows more genuine curiosity? </QUESTION>
<A> effect of ocean acidification feedback loops on Arctic methane release </A>
<B> climate change effects </B>
Expected output:
{
    "response": "A"
}

Example 3:
<QUESTION> Which reply is more insulting? </QUESTION>
<A> Your argument lacks logical coherence and fails to address the core issue at hand. </A>
<B> That's an interesting perspective, though I see it differently. </B>
Expected output:
{
    "response": "A"
}
</EXAMPLES>

<REQUIREMENTS>
1. Consider the question carefully. What aspects are important for the answer?
2. Think about answer A. Is it a good answer to the question? Why (not)?
3. Think about answer B. Is it a good answer to the question? Why (not)?
4. Make a decision based on your analysis.
</REQUIREMENTS>

<OUTPUT_FORMAT>
You must respond with valid JSON containing exactly one field called "response" with value "A" or "B":

{
    "response": "A"
}
or
{
    "response": "B"
}

Do NOT include explanations, reasoning, or any other fields.
</OUTPUT_FORMAT>
"""

evaluation_agent = Agent(
    model=OpenAIChatModel(
        model_name='qwen2.5:72b',
        provider=OpenAIProvider(base_url='http://localhost:11434/v1'),
    ),
    output_type=GameResult,
    system_prompt=evaluation_instructions,
    retries=5,
    instrument=True,
)

class EvalGame(BaseModel):
"""A game between two EvalPlayers."""
criterion: str = Field(..., description='evaluation criterion on which players should be judged')

    async def run(self, players: tuple[EvalPlayer, EvalPlayer], agent: Agent, model_settings: ModelSettings) -> tuple[int, int]:
        """Run a single A-vs-B comparison and return the pair (winner_idx, loser_idx)."""
        prompt = textwrap.dedent(f"""
            <QUESTION> {self.criterion} </QUESTION>
            <A> {players[0].item} </A>
            <B> {players[1].item} </B>
        """)

        async with agent:
            result = await agent.run(
                user_prompt=prompt,
                model_settings=model_settings,
            )

        if result.output == GameResult.A:
            return (players[0].idx, players[1].idx)
        else:
            return (players[1].idx, players[0].idx)
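
Not part of this diff, but for context on how the pieces fit together: EvalGame.run returns a (winner_idx, loser_idx) tuple, and EvalPlayer carries an optional Bradley-Terry score field, so a later step presumably fits strengths from many such outcomes. Below is a minimal sketch of that fitting step using the standard MM (Zermelo) update; the function name fit_bradley_terry is hypothetical and not defined anywhere in this PR.

from collections import defaultdict


def fit_bradley_terry(outcomes: list[tuple[int, int]], n_iters: int = 100) -> dict[int, float]:
    """Estimate Bradley-Terry strengths from (winner_idx, loser_idx) outcomes via the MM update."""
    wins: defaultdict[int, int] = defaultdict(int)  # total wins per player
    pair_games: defaultdict[tuple[int, int], int] = defaultdict(int)  # games per unordered pair
    players: set[int] = set()
    for winner, loser in outcomes:
        wins[winner] += 1
        pair_games[(min(winner, loser), max(winner, loser))] += 1
        players.update((winner, loser))

    strength = {p: 1.0 for p in players}
    for _ in range(n_iters):
        updated: dict[int, float] = {}
        for p in players:
            # Sum over every opponent p has faced: games / (strength_p + strength_opponent).
            denom = sum(
                n / max(strength[p] + strength[a if b == p else b], 1e-9)
                for (a, b), n in pair_games.items()
                if p in (a, b)
            )
            updated[p] = wins[p] / denom if denom else strength[p]
        total = sum(updated.values()) or 1.0
        strength = {p: s / total for p, s in updated.items()}  # normalise each round for stability
    return strength

The returned values could then be written back onto EvalPlayer.score to produce a ranking; note that a player that never wins collapses to a score of 0 under this unregularised sketch.
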
15 changes: 15 additions & 0 deletions tests/conftest.py
@@ -25,6 +25,7 @@
import pydantic_ai.models
from pydantic_ai import Agent, BinaryContent
from pydantic_ai.models import Model, cached_async_http_client
from pydantic_evals.tournament import EvalPlayer

__all__ = (
    'IsDatetime',
@@ -552,3 +553,17 @@ def generate_snapshot_id(node_id: str) -> str:
        return f'{node_id}:{i}'

    return mocker.patch('pydantic_graph.nodes.generate_snapshot_id', side_effect=generate_snapshot_id)


@pytest.fixture
def ice_cream_players() -> list[EvalPlayer]:
"""
Provide a list of EvalPlayer instances with ice cream flavours.
"""
return [
EvalPlayer(idx=0, item='vanilla'),
EvalPlayer(idx=1, item='chocolate'),
EvalPlayer(idx=2, item='strawberry'),
EvalPlayer(idx=3, item='peach'),
EvalPlayer(idx=4, item='toasted rice & miso caramel ice cream'),
]
44 changes: 44 additions & 0 deletions tests/evals/test_tournament.py
@@ -0,0 +1,44 @@
from __future__ import annotations as _annotations

import pytest

from pydantic_ai.settings import ModelSettings
from pydantic_evals.tournament import EvalGame, EvalPlayer, evaluation_agent

MODEL_SETTINGS = ModelSettings(
    temperature=0.0,  # Model needs to be deterministic for VCR recording to work.
    timeout=300,
)

def test_evalplayer() -> None:
"""
Test the EvalPlayer class.
"""

player = EvalPlayer(
idx=42,
item='toasted rice & miso caramel ice cream',
)
assert player.idx == 42
assert player.item == 'toasted rice & miso caramel ice cream'


@pytest.mark.anyio
async def test_evalgame(ice_cream_players: list[EvalPlayer]) -> None:
"""
Test the EvalGame class.
"""

game = EvalGame(criterion='Which of the two ice cream flavours A or B is more creative?')
assert game.criterion == 'Which of the two ice cream flavours A or B is more creative?'

result = await game.run(
players=(ice_cream_players[0], ice_cream_players[4]),
agent=evaluation_agent,
model_settings=MODEL_SETTINGS,
)

assert isinstance(result, tuple)
assert len(result) == 2
assert all(isinstance(r, int) for r in result)
assert result[0] == 4 # Toasted rice & miso caramel ice cream flavour is more creative.
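
A natural follow-up (not included in this PR) would be a round-robin driver that plays every unordered pair of players and collects the outcome tuples for a Bradley-Terry fit. A rough sketch, reusing the agent and model settings from the test above; the helper name run_round_robin is hypothetical.

from itertools import combinations

from pydantic_ai.settings import ModelSettings
from pydantic_evals.tournament import EvalGame, EvalPlayer, evaluation_agent


async def run_round_robin(
    players: list[EvalPlayer],
    criterion: str,
    model_settings: ModelSettings,
) -> list[tuple[int, int]]:
    """Play every unordered pair once and return the (winner_idx, loser_idx) tuples."""
    game = EvalGame(criterion=criterion)
    outcomes: list[tuple[int, int]] = []
    for a, b in combinations(players, 2):  # n * (n - 1) / 2 games, fixed A/B order within each pair
        outcomes.append(
            await game.run(players=(a, b), agent=evaluation_agent, model_settings=model_settings)
        )
    return outcomes

For n players this runs n(n-1)/2 comparisons sequentially; the games could also be gathered concurrently, and each pair could be replayed with A and B swapped to reduce position bias, though neither is shown here.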