166 changes: 64 additions & 102 deletions dspy/evaluate/auto_evaluation.py
@@ -1,102 +1,64 @@
from dspy.predict.chain_of_thought import ChainOfThought
from dspy.primitives import Module
from dspy.signatures import InputField, OutputField, Signature


class SemanticRecallPrecision(Signature):
    """
    Compare a system's response to the ground truth to compute its recall and precision.
    If asked to reason, enumerate key ideas in each response, and whether they are present in the other response.
    """

    question: str = InputField()
    ground_truth: str = InputField()
    system_response: str = InputField()
    recall: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
    precision: float = OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")


class DecompositionalSemanticRecallPrecision(Signature):
    """
    Compare a system's response to the ground truth to compute recall and precision of key ideas.
    You will first enumerate key ideas in each response, discuss their overlap, and then report recall and precision.
    """

    question: str = InputField()
    ground_truth: str = InputField()
    system_response: str = InputField()
    ground_truth_key_ideas: str = OutputField(desc="enumeration of key ideas in the ground truth")
    system_response_key_ideas: str = OutputField(desc="enumeration of key ideas in the system response")
    discussion: str = OutputField(desc="discussion of the overlap between ground truth and system response")
    recall: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
    precision: float = OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")


def f1_score(precision, recall):
    precision, recall = max(0.0, min(1.0, precision)), max(0.0, min(1.0, recall))
    return 0.0 if precision + recall == 0 else 2 * (precision * recall) / (precision + recall)


class SemanticF1(Module):
    def __init__(self, threshold=0.66, decompositional=False):
        self.threshold = threshold

        if decompositional:
            self.module = ChainOfThought(DecompositionalSemanticRecallPrecision)
        else:
            self.module = ChainOfThought(SemanticRecallPrecision)

    def forward(self, example, pred, trace=None):
        scores = self.module(question=example.question, ground_truth=example.response, system_response=pred.response)
        score = f1_score(scores.precision, scores.recall)

        return score if trace is None else score >= self.threshold



###########


class AnswerCompleteness(Signature):
    """
    Estimate the completeness of a system's responses, against the ground truth.
    You will first enumerate key ideas in each response, discuss their overlap, and then report completeness.
    """

    question: str = InputField()
    ground_truth: str = InputField()
    system_response: str = InputField()
    ground_truth_key_ideas: str = OutputField(desc="enumeration of key ideas in the ground truth")
    system_response_key_ideas: str = OutputField(desc="enumeration of key ideas in the system response")
    discussion: str = OutputField(desc="discussion of the overlap between ground truth and system response")
    completeness: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")



class AnswerGroundedness(Signature):
    """
    Estimate the groundedness of a system's responses, against real retrieved documents written by people.
    You will first enumerate whatever non-trivial or check-worthy claims are made in the system response, and then
    discuss the extent to which some or all of them can be deduced from the retrieved context and basic commonsense.
    """

    question: str = InputField()
    retrieved_context: str = InputField()
    system_response: str = InputField()
    system_response_claims: str = OutputField(desc="enumeration of non-trivial or check-worthy claims in the system response")
    discussion: str = OutputField(desc="discussion of how supported the claims are by the retrieved context")
    groundedness: float = OutputField(desc="fraction (out of 1.0) of system response supported by the retrieved context")


class CompleteAndGrounded(Module):
    def __init__(self, threshold=0.66):
        self.threshold = threshold
        self.completeness_module = ChainOfThought(AnswerCompleteness)
        self.groundedness_module = ChainOfThought(AnswerGroundedness)

    def forward(self, example, pred, trace=None):
        completeness = self.completeness_module(question=example.question, ground_truth=example.response, system_response=pred.response)
        groundedness = self.groundedness_module(question=example.question, retrieved_context=pred.context, system_response=pred.response)
        score = f1_score(groundedness.groundedness, completeness.completeness)

        return score if trace is None else score >= self.threshold
# dspy.SemanticF1 and dspy.CompleteAndGrounded

DSPy offers automatic evaluation modules that programmatically assess prediction quality, using LM judges to score semantic similarity as well as information completeness and groundedness.

## dspy.SemanticF1

Measures semantic similarity between a predicted response and the ground truth by asking an LM judge to score the recall and precision of key ideas, then combining them into an F1 score. With `decompositional=True`, the judge first enumerates the key ideas in each response and discusses their overlap before scoring.

```python
import dspy
from dspy.datasets import HotPotQA
from dspy.evaluate import SemanticF1

dspy.settings.configure(lm=dspy.LM('openai/gpt-4o-mini'))

dataset = HotPotQA(train_seed=2024, train_size=500)
module = dspy.ChainOfThought("question -> response")

# SemanticF1 reads `example.question`, `example.response` (the ground truth), and `pred.response`,
# so map the HotPotQA `answer` field onto a `response` field.
raw = dataset.train[0]
example = dspy.Example(question=raw.question, response=raw.answer).with_inputs("question")
pred = module(question=example.question)

# Initialize the metric and score the prediction
metric = SemanticF1(threshold=0.7, decompositional=False)
score = metric(example, pred)
```
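
For batch evaluation, the metric can be passed to `dspy.Evaluate`. Below is a minimal sketch that reuses `module` and `metric` from above; the `devset` is a hypothetical list of examples carrying `question` and `response` fields.

```python
import dspy

# Hypothetical devset: each example provides `question` (input) and `response` (ground truth).
devset = [
    dspy.Example(
        question="What is the capital of Belgium?",
        response="Brussels is the capital of Belgium.",
    ).with_inputs("question"),
]

evaluator = dspy.Evaluate(devset=devset, metric=metric, num_threads=4, display_progress=True)
evaluator(module)  # runs `module` on every example and averages the SemanticF1 score
```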

## dspy.CompleteAndGrounded

Evaluates both answer completeness (against the ground truth) and factual groundedness (against the retrieved evidence), then combines the two scores with the same F1-style harmonic mean used by `SemanticF1`.

```python
from dspy.evaluate import CompleteAndGrounded

# `example` needs `question` and `response` (ground truth); the prediction needs `response`
# and `context` (retrieved passages), so `module` is assumed to be a retrieval-augmented program.
metric = CompleteAndGrounded(threshold=0.66)
score = metric(example, module(question=example.question))
```
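
As in the source above, the two sub-scores are combined with the same clamped harmonic-mean formula as `f1_score`. A standalone sketch of that combination follows; the helper name is illustrative, not part of the API.

```python
def combined_score(completeness: float, groundedness: float) -> float:
    # Clamp both sub-scores to [0.0, 1.0], then take their harmonic mean (F1),
    # mirroring f1_score(groundedness.groundedness, completeness.completeness)
    # in CompleteAndGrounded.forward.
    completeness = max(0.0, min(1.0, completeness))
    groundedness = max(0.0, min(1.0, groundedness))
    if completeness + groundedness == 0:
        return 0.0
    return 2 * completeness * groundedness / (completeness + groundedness)
```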

## API Reference

<!-- START_API_REF -->
::: dspy.SemanticF1
    handler: python
    options:
        show_source: true
        show_root_heading: true
        heading_level: 2
        docstring_style: google
        show_root_full_path: true
        show_object_full_path: false
        separate_signature: false
        inherited_members: true
:::
<!-- END_API_REF -->

<!-- START_API_REF -->
::: dspy.CompleteAndGrounded
    handler: python
    options:
        show_source: true
        show_root_heading: true
        heading_level: 2
        docstring_style: google
        show_root_full_path: true
        show_object_full_path: false
        separate_signature: false
        inherited_members: true
:::
<!-- END_API_REF -->
56 changes: 28 additions & 28 deletions dspy/evaluate/metrics.py
@@ -8,25 +8,26 @@
from dspy.dsp.utils.utils import print_message


def EM(prediction, answers_list): # noqa: N802
def EM(prediction: str, answers_list: list[str]) -> float: # noqa: N802
"""Returns max exact match score between prediction and any reference in answers_list."""
assert isinstance(answers_list, list)

return max(em_score(prediction, ans) for ans in answers_list)


def F1(prediction, answers_list): # noqa: N802
def F1(prediction: str, answers_list: list[str]) -> float: # noqa: N802
"""Returns maximal token-level F1 between prediction and references."""
assert isinstance(answers_list, list)

return max(f1_score(prediction, ans) for ans in answers_list)


def HotPotF1(prediction, answers_list): # noqa: N802
def HotPotF1(prediction: str, answers_list: list[str]) -> float: # noqa: N802
"""Returns maximal F1 specifically for HotpotQA-style QA."""
assert isinstance(answers_list, list)

return max(hotpot_f1_score(prediction, ans) for ans in answers_list)


def normalize_text(s):
def normalize_text(s: str) -> str:
"""Normalize string by unicode normalization, strip articles and punctuation, and lowercase."""
s = unicodedata.normalize("NFD", s)

def remove_articles(text):
@@ -45,16 +46,19 @@ def lower(text):
return white_space_fix(remove_articles(remove_punc(lower(s))))


def em_score(prediction, ground_truth):
def em_score(prediction: str, ground_truth: str) -> bool:
"""Exact string match after normalization."""
return normalize_text(prediction) == normalize_text(ground_truth)


# See: https://github.com/hotpotqa/hotpot/blob/master/hotpot_evaluate_v1.py
# See: https://rajpurkar.github.io/SQuAD-explorer/ under Evaluation Script
# See: QReCC's


def f1_score(prediction, ground_truth):
def f1_score(prediction: str, ground_truth: str) -> float:
"""Token-level F1 overlap (precision, recall, F1).
Returns 0 if there is no overlap.
"""
prediction_tokens = normalize_text(prediction).split()
ground_truth_tokens = normalize_text(ground_truth).split()

@@ -66,52 +70,50 @@ def f1_score(prediction, ground_truth):
print_message("\n#> F1 Metric: Rare edge case of len(prediction_tokens) == len(ground_truth_tokens) == 0.\n")

if num_same == 0:
return 0
return 0.0

precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)

return f1


def hotpot_f1_score(prediction, ground_truth):
def hotpot_f1_score(prediction: str, ground_truth: str) -> float:
"""HotpotQA F1 with special handling for yes/no answers."""
normalized_prediction = normalize_text(prediction)
normalized_ground_truth = normalize_text(ground_truth)

if normalized_prediction in ["yes", "no", "noanswer"] and normalized_prediction != normalized_ground_truth:
return 0
return 0.0
if normalized_ground_truth in ["yes", "no", "noanswer"] and normalized_prediction != normalized_ground_truth:
return 0
return 0.0

prediction_tokens = normalized_prediction.split()
ground_truth_tokens = normalized_ground_truth.split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
return 0.0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1


def precision_score(prediction, ground_truth):
def precision_score(prediction: str, ground_truth: str) -> float:
"""Token-level precision (ignoring recall)."""
prediction_tokens = normalize_text(prediction).split()
ground_truth_tokens = normalize_text(ground_truth).split()

common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())

if len(prediction_tokens) == len(ground_truth_tokens) == 0:
# Unlike most tasks, QReCC and SQuAD-2.0 assign 1.0 in this edge case. We don't for uniformity.
print_message("\n#> Precision Metric: Rare edge case of len(prediction_tokens) == len(ground_truth_tokens) == 0.\n")

if num_same == 0:
return 0

return 0.0
precision = 1.0 * num_same / len(prediction_tokens)

return precision


@@ -129,28 +131,26 @@ def passage_has_answers(passage: str, answers: list[str]) -> bool:
return any(passage_has_answers(psg, answers) for psg in passages)


def _answer_match(prediction, answers, frac=1.0):
def _answer_match(prediction: str, answers: list[str], frac: float = 1.0) -> bool:
"""Returns True if the prediction matches any of the answers."""

if frac >= 1.0:
return EM(prediction, answers)

return F1(prediction, answers) >= frac


def answer_exact_match(example, pred, trace=None, frac=1.0):
def answer_exact_match(example, pred, trace=None, frac: float = 1.0) -> bool:
"""Default metric: Checks if gold answer exactly matches model prediction, for str or list[str]."""
if isinstance(example.answer, str):
return _answer_match(pred.answer, [example.answer], frac=frac)
elif isinstance(example.answer, list):
return _answer_match(pred.answer, example.answer, frac=frac)

raise ValueError(f"Invalid answer type: {type(example.answer)}")


def answer_passage_match(example, pred, trace=None):
def answer_passage_match(example, pred, trace=None) -> bool:
"""For RAG systems, checks if gold answer is present in any retrieved context passage."""
if isinstance(example.answer, str):
return _passage_match(pred.context, [example.answer])
elif isinstance(example.answer, list):
return _passage_match(pred.context, example.answer)

raise ValueError(f"Invalid answer type: {type(example.answer)}")
56 changes: 20 additions & 36 deletions dspy/predict/refine.py
@@ -39,6 +39,26 @@ class OfferFeedback(Signature):


class Refine(Module):
"""
Refines a module by running it up to `N` times with different temperatures and returns the best prediction, as defined by the reward_fn, or the first prediction that passes the threshold. After each attempt (except the final one), `Refine` automatically generates detailed feedback about the module's performance and uses this feedback as hints for subsequent runs, creating an iterative refinement process.

Example:
```python
import dspy
# Use a chain-of-thought QA module as the base
qa = dspy.ChainOfThought("question -> answer")
# Define a reward function that checks for one-word answers
def one_word_answer(args, pred):
return 1.0 if len(pred.answer.split()) == 1 else 0.0
# Create the refined module
best_of_3 = dspy.Refine(module=qa, N=3, reward_fn=one_word_answer, threshold=1.0)
# Use the refined module
result = best_of_3(question="What is the capital of Belgium?").answer
# Returns: Brussels
```

By default, `Refine` will try to run the base module up to N times until the threshold is met. If the module encounters an error, it will keep going up to N failed attempts. You can adjust this behavior with the `fail_count` argument to control the number of computation attempts allowed before raising an error.
"""
def __init__(
self,
module: Module,
@@ -47,42 +67,6 @@ def __init__(
threshold: float,
fail_count: int | None = None,
):
"""
Refines a module by running it up to N times with different temperatures and returns the best prediction.

This module runs the provided module multiple times with varying temperature settings and selects
either the first prediction that exceeds the specified threshold or the one with the highest reward.
If no prediction meets the threshold, it automatically generates feedback to improve future predictions.


Args:
module (Module): The module to refine.
N (int): The number of times to run the module. must
reward_fn (Callable): The reward function.
threshold (float): The threshold for the reward function.
fail_count (Optional[int], optional): The number of times the module can fail before raising an error

Example:
```python
import dspy

dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini"))

# Define a QA module with chain of thought
qa = dspy.ChainOfThought("question -> answer")

# Define a reward function that checks for one-word answers
def one_word_answer(args, pred):
return 1.0 if len(pred.answer.split()) == 1 else 0.0

# Create a refined module that tries up to 3 times
best_of_3 = dspy.Refine(module=qa, N=3, reward_fn=one_word_answer, threshold=1.0)

# Use the refined module
result = best_of_3(question="What is the capital of Belgium?").answer
# Returns: Brussels
```
"""
self.module = module
self.reward_fn = lambda *args: reward_fn(*args) # to prevent this from becoming a parameter
self.threshold = threshold