166 changes: 64 additions & 102 deletions dspy/evaluate/auto_evaluation.py
@@ -1,102 +1,64 @@
from dspy.predict.chain_of_thought import ChainOfThought
from dspy.primitives import Module
from dspy.signatures import InputField, OutputField, Signature


class SemanticRecallPrecision(Signature):
    """
    Compare a system's response to the ground truth to compute its recall and precision.
    If asked to reason, enumerate key ideas in each response, and whether they are present in the other response.
    """

    question: str = InputField()
    ground_truth: str = InputField()
    system_response: str = InputField()
    recall: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
    precision: float = OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")


class DecompositionalSemanticRecallPrecision(Signature):
    """
    Compare a system's response to the ground truth to compute recall and precision of key ideas.
    You will first enumerate key ideas in each response, discuss their overlap, and then report recall and precision.
    """

    question: str = InputField()
    ground_truth: str = InputField()
    system_response: str = InputField()
    ground_truth_key_ideas: str = OutputField(desc="enumeration of key ideas in the ground truth")
    system_response_key_ideas: str = OutputField(desc="enumeration of key ideas in the system response")
    discussion: str = OutputField(desc="discussion of the overlap between ground truth and system response")
    recall: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
    precision: float = OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")


def f1_score(precision, recall):
    precision, recall = max(0.0, min(1.0, precision)), max(0.0, min(1.0, recall))
    return 0.0 if precision + recall == 0 else 2 * (precision * recall) / (precision + recall)


class SemanticF1(Module):
    def __init__(self, threshold=0.66, decompositional=False):
        self.threshold = threshold

        if decompositional:
            self.module = ChainOfThought(DecompositionalSemanticRecallPrecision)
        else:
            self.module = ChainOfThought(SemanticRecallPrecision)

    def forward(self, example, pred, trace=None):
        scores = self.module(question=example.question, ground_truth=example.response, system_response=pred.response)
        score = f1_score(scores.precision, scores.recall)

        return score if trace is None else score >= self.threshold



###########


class AnswerCompleteness(Signature):
    """
    Estimate the completeness of a system's responses, against the ground truth.
    You will first enumerate key ideas in each response, discuss their overlap, and then report completeness.
    """

    question: str = InputField()
    ground_truth: str = InputField()
    system_response: str = InputField()
    ground_truth_key_ideas: str = OutputField(desc="enumeration of key ideas in the ground truth")
    system_response_key_ideas: str = OutputField(desc="enumeration of key ideas in the system response")
    discussion: str = OutputField(desc="discussion of the overlap between ground truth and system response")
    completeness: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")



class AnswerGroundedness(Signature):
    """
    Estimate the groundedness of a system's responses, against real retrieved documents written by people.
    You will first enumerate whatever non-trivial or check-worthy claims are made in the system response, and then
    discuss the extent to which some or all of them can be deduced from the retrieved context and basic commonsense.
    """

    question: str = InputField()
    retrieved_context: str = InputField()
    system_response: str = InputField()
    system_response_claims: str = OutputField(desc="enumeration of non-trivial or check-worthy claims in the system response")
    discussion: str = OutputField(desc="discussion of how supported the claims are by the retrieved context")
    groundedness: float = OutputField(desc="fraction (out of 1.0) of system response supported by the retrieved context")


class CompleteAndGrounded(Module):
    def __init__(self, threshold=0.66):
        self.threshold = threshold
        self.completeness_module = ChainOfThought(AnswerCompleteness)
        self.groundedness_module = ChainOfThought(AnswerGroundedness)

    def forward(self, example, pred, trace=None):
        completeness = self.completeness_module(question=example.question, ground_truth=example.response, system_response=pred.response)
        groundedness = self.groundedness_module(question=example.question, retrieved_context=pred.context, system_response=pred.response)
        score = f1_score(groundedness.groundedness, completeness.completeness)

        return score if trace is None else score >= self.threshold
# dspy.SemanticF1 and dspy.CompleteAndGrounded

DSPy offers automatic evaluation modules that programmatically assess prediction quality, using LM judges to score semantic similarity as well as information completeness and groundedness.

## dspy.SemanticF1

Measures semantic similarity between a predicted response and the ground truth by asking an LM judge to score the recall and precision of key ideas, then combining them into an F1 score. With `decompositional=True`, the judge first enumerates the key ideas in each response and discusses their overlap before scoring.

```python
import dspy
from dspy.datasets import HotPotQA
from dspy.evaluate import SemanticF1

dspy.settings.configure(lm=dspy.LM('openai/gpt-4o-mini'))

dataset = HotPotQA(train_seed=2024, train_size=500)
module = dspy.ChainOfThought("question -> response")

# SemanticF1 reads `example.question`, `example.response` (the ground truth), and `pred.response`,
# so map the HotPotQA `answer` field onto a `response` field.
raw = dataset.train[0]
example = dspy.Example(question=raw.question, response=raw.answer).with_inputs("question")
pred = module(question=example.question)

# Initialize the metric and score the prediction
metric = SemanticF1(threshold=0.7, decompositional=False)
score = metric(example, pred)
```
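
For batch evaluation, the metric can be passed to `dspy.Evaluate`. Below is a minimal sketch that reuses `module` and `metric` from above; the `devset` is a hypothetical list of examples carrying `question` and `response` fields.

```python
import dspy

# Hypothetical devset: each example provides `question` (input) and `response` (ground truth).
devset = [
    dspy.Example(
        question="What is the capital of Belgium?",
        response="Brussels is the capital of Belgium.",
    ).with_inputs("question"),
]

evaluator = dspy.Evaluate(devset=devset, metric=metric, num_threads=4, display_progress=True)
evaluator(module)  # runs `module` on every example and averages the SemanticF1 score
```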

## dspy.CompleteAndGrounded

Evaluates both answer completeness (against the ground truth) and factual groundedness (against the retrieved evidence), then combines the two scores with the same F1-style harmonic mean used by `SemanticF1`.

```python
from dspy.evaluate import CompleteAndGrounded

# `example` needs `question` and `response` (ground truth); the prediction needs `response`
# and `context` (retrieved passages), so `module` is assumed to be a retrieval-augmented program.
metric = CompleteAndGrounded(threshold=0.66)
score = metric(example, module(question=example.question))
```
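
As in the source above, the two sub-scores are combined with the same clamped harmonic-mean formula as `f1_score`. A standalone sketch of that combination follows; the helper name is illustrative, not part of the API.

```python
def combined_score(completeness: float, groundedness: float) -> float:
    # Clamp both sub-scores to [0.0, 1.0], then take their harmonic mean (F1),
    # mirroring f1_score(groundedness.groundedness, completeness.completeness)
    # in CompleteAndGrounded.forward.
    completeness = max(0.0, min(1.0, completeness))
    groundedness = max(0.0, min(1.0, groundedness))
    if completeness + groundedness == 0:
        return 0.0
    return 2 * completeness * groundedness / (completeness + groundedness)
```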

## API Reference

<!-- START_API_REF -->
::: dspy.SemanticF1
    handler: python
    options:
        show_source: true
        show_root_heading: true
        heading_level: 2
        docstring_style: google
        show_root_full_path: true
        show_object_full_path: false
        separate_signature: false
        inherited_members: true
:::
<!-- END_API_REF -->

<!-- START_API_REF -->
::: dspy.CompleteAndGrounded
    handler: python
    options:
        show_source: true
        show_root_heading: true
        heading_level: 2
        docstring_style: google
        show_root_full_path: true
        show_object_full_path: false
        separate_signature: false
        inherited_members: true
:::
<!-- END_API_REF -->
56 changes: 28 additions & 28 deletions dspy/evaluate/metrics.py
@@ -8,25 +8,26 @@
from dspy.dsp.utils.utils import print_message


def EM(prediction, answers_list): # noqa: N802
def EM(prediction: str, answers_list: list[str]) -> float: # noqa: N802
"""Returns max exact match score between prediction and any reference in answers_list."""
assert isinstance(answers_list, list)

return max(em_score(prediction, ans) for ans in answers_list)


def F1(prediction, answers_list): # noqa: N802
def F1(prediction: str, answers_list: list[str]) -> float: # noqa: N802
"""Returns maximal token-level F1 between prediction and references."""
assert isinstance(answers_list, list)

return max(f1_score(prediction, ans) for ans in answers_list)


def HotPotF1(prediction, answers_list): # noqa: N802
def HotPotF1(prediction: str, answers_list: list[str]) -> float: # noqa: N802
"""Returns maximal F1 specifically for HotpotQA-style QA."""
assert isinstance(answers_list, list)

return max(hotpot_f1_score(prediction, ans) for ans in answers_list)


def normalize_text(s):
def normalize_text(s: str) -> str:
"""Normalize string by unicode normalization, strip articles and punctuation, and lowercase."""
s = unicodedata.normalize("NFD", s)

def remove_articles(text):
@@ -45,16 +46,19 @@ def lower(text):
return white_space_fix(remove_articles(remove_punc(lower(s))))


def em_score(prediction, ground_truth):
def em_score(prediction: str, ground_truth: str) -> bool:
"""Exact string match after normalization."""
return normalize_text(prediction) == normalize_text(ground_truth)


# See: https://github.com/hotpotqa/hotpot/blob/master/hotpot_evaluate_v1.py
# See: https://rajpurkar.github.io/SQuAD-explorer/ under Evaluation Script
# See: QReCC's


def f1_score(prediction, ground_truth):
def f1_score(prediction: str, ground_truth: str) -> float:
"""Token-level F1 overlap (precision, recall, F1).
Returns 0 if there is no overlap.
"""
prediction_tokens = normalize_text(prediction).split()
ground_truth_tokens = normalize_text(ground_truth).split()

@@ -66,52 +70,50 @@ def f1_score(prediction, ground_truth):
print_message("\n#> F1 Metric: Rare edge case of len(prediction_tokens) == len(ground_truth_tokens) == 0.\n")

if num_same == 0:
return 0
return 0.0

precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)

return f1


def hotpot_f1_score(prediction, ground_truth):
def hotpot_f1_score(prediction: str, ground_truth: str) -> float:
"""HotpotQA F1 with special handling for yes/no answers."""
normalized_prediction = normalize_text(prediction)
normalized_ground_truth = normalize_text(ground_truth)

if normalized_prediction in ["yes", "no", "noanswer"] and normalized_prediction != normalized_ground_truth:
return 0
return 0.0
if normalized_ground_truth in ["yes", "no", "noanswer"] and normalized_prediction != normalized_ground_truth:
return 0
return 0.0

prediction_tokens = normalized_prediction.split()
ground_truth_tokens = normalized_ground_truth.split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
return 0.0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1


def precision_score(prediction, ground_truth):
def precision_score(prediction: str, ground_truth: str) -> float:
"""Token-level precision (ignoring recall)."""
prediction_tokens = normalize_text(prediction).split()
ground_truth_tokens = normalize_text(ground_truth).split()

common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())

if len(prediction_tokens) == len(ground_truth_tokens) == 0:
# Unlike most tasks, QReCC and SQuAD-2.0 assign 1.0 in this edge case. We don't for uniformity.
print_message("\n#> Precision Metric: Rare edge case of len(prediction_tokens) == len(ground_truth_tokens) == 0.\n")

if num_same == 0:
return 0

return 0.0
precision = 1.0 * num_same / len(prediction_tokens)

return precision


@@ -129,28 +131,26 @@ def passage_has_answers(passage: str, answers: list[str]) -> bool:
return any(passage_has_answers(psg, answers) for psg in passages)


def _answer_match(prediction, answers, frac=1.0):
def _answer_match(prediction: str, answers: list[str], frac: float = 1.0) -> bool:
"""Returns True if the prediction matches any of the answers."""

if frac >= 1.0:
return EM(prediction, answers)

return F1(prediction, answers) >= frac


def answer_exact_match(example, pred, trace=None, frac=1.0):
def answer_exact_match(example, pred, trace=None, frac: float = 1.0) -> bool:
"""Default metric: Checks if gold answer exactly matches model prediction, for str or list[str]."""
if isinstance(example.answer, str):
return _answer_match(pred.answer, [example.answer], frac=frac)
elif isinstance(example.answer, list):
return _answer_match(pred.answer, example.answer, frac=frac)

raise ValueError(f"Invalid answer type: {type(example.answer)}")


def answer_passage_match(example, pred, trace=None):
def answer_passage_match(example, pred, trace=None) -> bool:
"""For RAG systems, checks if gold answer is present in any retrieved context passage."""
if isinstance(example.answer, str):
return _passage_match(pred.context, [example.answer])
elif isinstance(example.answer, list):
return _passage_match(pred.context, example.answer)

raise ValueError(f"Invalid answer type: {type(example.answer)}")
56 changes: 20 additions & 36 deletions dspy/predict/refine.py
@@ -39,6 +39,26 @@ class OfferFeedback(Signature):


class Refine(Module):
"""
Refines a module by running it up to `N` times with different temperatures and returns the best prediction, as defined by the reward_fn, or the first prediction that passes the threshold. After each attempt (except the final one), `Refine` automatically generates detailed feedback about the module's performance and uses this feedback as hints for subsequent runs, creating an iterative refinement process.

Example:
```python
import dspy
# Use a chain-of-thought QA module as the base
qa = dspy.ChainOfThought("question -> answer")
# Define a reward function that checks for one-word answers
def one_word_answer(args, pred):
return 1.0 if len(pred.answer.split()) == 1 else 0.0
# Create the refined module
best_of_3 = dspy.Refine(module=qa, N=3, reward_fn=one_word_answer, threshold=1.0)
# Use the refined module
result = best_of_3(question="What is the capital of Belgium?").answer
# Returns: Brussels
```

By default, `Refine` will try to run the base module up to N times until the threshold is met. If the module encounters an error, it will keep going up to N failed attempts. You can adjust this behavior with the `fail_count` argument to control the number of computation attempts allowed before raising an error.
"""
def __init__(
self,
module: Module,
@@ -47,42 +67,6 @@ def __init__(
threshold: float,
fail_count: int | None = None,
):
"""
Refines a module by running it up to N times with different temperatures and returns the best prediction.

This module runs the provided module multiple times with varying temperature settings and selects
either the first prediction that exceeds the specified threshold or the one with the highest reward.
If no prediction meets the threshold, it automatically generates feedback to improve future predictions.


Args:
module (Module): The module to refine.
N (int): The number of times to run the module. must
reward_fn (Callable): The reward function.
threshold (float): The threshold for the reward function.
fail_count (Optional[int], optional): The number of times the module can fail before raising an error

Example:
```python
import dspy

dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini"))

# Define a QA module with chain of thought
qa = dspy.ChainOfThought("question -> answer")

# Define a reward function that checks for one-word answers
def one_word_answer(args, pred):
return 1.0 if len(pred.answer.split()) == 1 else 0.0

# Create a refined module that tries up to 3 times
best_of_3 = dspy.Refine(module=qa, N=3, reward_fn=one_word_answer, threshold=1.0)

# Use the refined module
result = best_of_3(question="What is the capital of Belgium?").answer
# Returns: Brussels
```
"""
self.module = module
self.reward_fn = lambda *args: reward_fn(*args) # to prevent this from becoming a parameter
self.threshold = threshold