diff --git a/dspy/datasets/math.py b/dspy/datasets/math.py
index 09c4e916ab..8b0062e699 100644
--- a/dspy/datasets/math.py
+++ b/dspy/datasets/math.py
@@ -1,8 +1,28 @@
 import random
 import re
 
-
 class MATH:
+    """
+    Dataset wrapper around DigitalLearningGmbH/MATH-lighteval for math question answering in DSPy.
+    The class loads, shuffles, and splits the dataset into train, dev, and test splits for use in
+    program development and evaluation workflows. The `metric` method checks mathematical equivalence
+    between gold and predicted answers using the official math_equivalence library.
+
+    Args:
+        subset (str): Name of the MATH-lighteval configuration to load (e.g., 'algebra' or 'all').
+
+    Attributes:
+        train (list[dspy.Example]): Training set with input 'question' and labels 'reasoning' and 'answer'.
+        dev (list[dspy.Example]): Development set.
+        test (list[dspy.Example]): Test set.
+
+    Example:
+        >>> import dspy
+        >>> from dspy.datasets.math import MATH
+        >>> math_ds = MATH('algebra')
+        >>> for example in math_ds.train[:3]:
+        ...     print(example.question)
+        ...     print(example.answer)
+        ...
+        >>> score = math_ds.metric(example, prediction)
+    """
     def __init__(self, subset):
         from datasets import load_dataset
@@ -25,6 +45,16 @@ def __init__(self, subset):
         self.train, self.dev, self.test = dataset[:size], dataset[size : 2 * size], dataset[2 * size :]
 
     def metric(self, example, pred, trace=None):
+        """
+        Math equivalence metric: checks whether the predicted answer matches the gold answer up to
+        mathematical equivalence, using Hendrycks's math_equivalence package
+        (see https://github.com/hendrycks/math).
+
+        Args:
+            example (dspy.Example): Example carrying the gold answer.
+            pred (dspy.Example): Prediction with an `answer` field.
+            trace (Any, optional): Not used.
+
+        Returns:
+            bool: True if the answers are mathematically equivalent, else False.
+        """
         try:
             import math_equivalence
         except ImportError:
@@ -34,6 +64,10 @@ def metric(self, example, pred, trace=None):
 
 
 def extract_answer(s):
+    """
+    Extract the final boxed answer from a LaTeX-formatted solution string.
+    Returns the text inside the first \\boxed{...} expression, or None if no box is found.
+    """
     start = s.find("\\boxed{")
     if start == -1:
         return None
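The new `MATH` docstring shows standalone iteration over the splits; a quick sketch of how the wrapper and its `metric` plug into `dspy.Evaluate` may also be useful. This snippet is not part of the diff: the model name and slice size are placeholders, 'algebra' is one of the MATH-lighteval configurations, and `MATH.metric` requires the math_equivalence package from hendrycks/math to be installed.

```python
# Sketch only: evaluating a one-step CoT program on the dev split produced by the wrapper.
import dspy
from dspy.datasets.math import MATH

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # placeholder model

math_ds = MATH("algebra")                          # loads, shuffles, and splits the subset
program = dspy.ChainOfThought("question -> answer")

evaluate = dspy.Evaluate(
    devset=math_ds.dev[:50],      # small slice to keep the run cheap
    metric=math_ds.metric,        # math_equivalence-based check documented in this diff
    num_threads=8,
    display_progress=True,
)
evaluate(program)
```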
diff --git a/dspy/propose/utils.py b/dspy/propose/utils.py
index 8bd720a23a..858d84ef7d 100644
--- a/dspy/propose/utils.py
+++ b/dspy/propose/utils.py
@@ -2,18 +2,6 @@
 import json
 import re
 
-import dspy
-
-try:
-    from IPython.core.magics.code import extract_symbols
-except ImportError:
-    # Won't be able to read code from jupyter notebooks
-    extract_symbols = None
-
-from dspy.predict.parameter import Parameter
-from dspy.teleprompt.utils import get_signature, new_getfile
-
-
 def strip_prefix(text):
     pattern = r"^[\*\s]*(([\w\'\-]+\s+){0,4}[\w\'\-]+):\s*"
     modified_text = re.sub(pattern, "", text)
diff --git a/dspy/teleprompt/bettertogether.py b/dspy/teleprompt/bettertogether.py
index d1154f9ae4..64235ed738 100644
--- a/dspy/teleprompt/bettertogether.py
+++ b/dspy/teleprompt/bettertogether.py
@@ -14,6 +14,7 @@
 )
 from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch
 from dspy.teleprompt.teleprompt import Teleprompter
+from dspy.teleprompt.gepa import GEPA
 
 logger = logging.getLogger(__name__)
 
@@ -37,15 +38,16 @@ def __init__(self,
         # a BootstrapFinetune without a metric, say, if there aren't labels
         # available for the training data. Should this be noted somewhere?
         # TODO: We should re-consider if the metric should be required.
+        # GEPA is now accepted both as a prompt optimizer and as a weight optimizer.
         self.prompt_optimizer = prompt_optimizer if prompt_optimizer else BootstrapFewShotWithRandomSearch(metric=metric)
         self.weight_optimizer = weight_optimizer if weight_optimizer else BootstrapFinetune(metric=metric)
 
-        is_supported_prompt = isinstance(self.prompt_optimizer, BootstrapFewShotWithRandomSearch)
-        is_supported_weight = isinstance(self.weight_optimizer, BootstrapFinetune)
+        is_supported_prompt = isinstance(self.prompt_optimizer, (BootstrapFewShotWithRandomSearch, GEPA))
+        is_supported_weight = isinstance(self.weight_optimizer, (BootstrapFinetune, GEPA))
         if not is_supported_prompt or not is_supported_weight:
             raise ValueError(
                 "The BetterTogether optimizer only supports the following optimizers for now: BootstrapFinetune, "
-                "BootstrapFewShotWithRandomSearch."
+                "BootstrapFewShotWithRandomSearch, GEPA."
             )
 
         self.rng = random.Random(seed)
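A sketch of the pairing this change enables, with GEPA on the prompt side and BootstrapFinetune on the weight side. It is not taken from the diff: GEPA's constructor arguments (`auto`, `reflection_lm`) and its five-argument metric convention are assumptions about its usual interface, and the `strategy` string follows BetterTogether's existing 'p'/'w' notation.

```python
# Sketch only: GEPA as the prompt optimizer inside BetterTogether.
import dspy
from dspy.teleprompt import BetterTogether, BootstrapFinetune
from dspy.teleprompt.gepa import GEPA

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # placeholder model

def answer_match(example, pred, trace=None, pred_name=None, pred_trace=None):
    # GEPA typically passes the two extra arguments; BootstrapFinetune only uses the
    # first three, so one metric can serve both optimizers.
    return float(example.answer == pred.answer)

trainset = [  # toy data; a real run needs many more examples
    dspy.Example(question="What is 7 * 6?", answer="42").with_inputs("question"),
    dspy.Example(question="What is 9 + 8?", answer="17").with_inputs("question"),
]

program = dspy.ChainOfThought("question -> answer")

optimizer = BetterTogether(
    metric=answer_match,
    prompt_optimizer=GEPA(metric=answer_match, auto="light",
                          reflection_lm=dspy.LM("openai/gpt-4o")),  # assumed GEPA arguments
    weight_optimizer=BootstrapFinetune(metric=answer_match),
)
optimized = optimizer.compile(student=program, trainset=trainset, strategy="p -> w")  # assumed strategy string
```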
diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py
index 360a096117..e34e482850 100644
--- a/dspy/teleprompt/mipro_optimizer_v2.py
+++ b/dspy/teleprompt/mipro_optimizer_v2.py
@@ -109,16 +109,8 @@ def compile(
         view_data_batch_size: int = 10,
         tip_aware_proposer: bool = True,
         fewshot_aware_proposer: bool = True,
-        requires_permission_to_run: bool | None = None,  # deprecated
         provide_traceback: bool | None = None,
     ) -> Any:
-        if requires_permission_to_run == False:
-            logger.warning(
-                "'requires_permission_to_run' is deprecated and will be removed in a future version."
-            )
-        elif requires_permission_to_run == True:
-            raise ValueError("User confirmation is removed from MIPROv2. Please remove the 'requires_permission_to_run' argument.")
-
         effective_max_errors = (
             self.max_errors
             if self.max_errors is not None
@@ -629,7 +621,7 @@ def _log_minibatch_eval(
         logger.info(f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}.")
 
         minibatch_scores = ", ".join([f"{s['score']}" for s in score_data if not s["full_eval"]])
-        logger.info(f"Minibatch scores so far: {'[' + minibatch_scores + ']'}")
+        logger.info(f"Minibatch scores so far: [{minibatch_scores}]")
         full_eval_scores = ", ".join([f"{s['score']}" for s in score_data if s["full_eval"]])
         trajectory = "[" + full_eval_scores + "]"
         logger.info(f"Full eval scores so far: {trajectory}")
@@ -662,7 +654,7 @@ def _log_normal_eval(
         logger.info(f"Score: {score} with parameters {chosen_params}.")
 
         full_eval_scores = ", ".join([f"{s['score']}" for s in score_data if s["full_eval"]])
-        logger.info(f"Scores so far: {'[' + full_eval_scores + ']'}")
+        logger.info(f"Scores so far: [{full_eval_scores}]")
         logger.info(f"Best score so far: {best_score}")
         logger.info(f"{'=' * len(f'===== Trial {trial.number + 1} / {num_trials} =====')}\n\n")
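Because the argument is removed from the signature rather than kept as a deprecated no-op, code that still passes `requires_permission_to_run` will now fail with a `TypeError`. A sketch of the updated call follows; it is not part of the diff, the model name and toy data are placeholders, and `auto="light"` is one of MIPROv2's presets.

```python
# Sketch only: MIPROv2.compile after this change -- no permission flag at all.
import dspy
from dspy.teleprompt import MIPROv2

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # placeholder model

def exact_match(example, pred, trace=None):
    return example.answer == pred.answer

trainset = [  # toy data; a real run needs far more examples
    dspy.Example(question="What is 2 + 2?", answer="4").with_inputs("question"),
    dspy.Example(question="What is 3 * 5?", answer="15").with_inputs("question"),
]

teleprompter = MIPROv2(metric=exact_match, auto="light")
optimized = teleprompter.compile(
    dspy.ChainOfThought("question -> answer"),
    trainset=trainset,
    # requires_permission_to_run=False,   # passing this now raises TypeError
)
```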
diff --git a/dspy/teleprompt/random_search.py b/dspy/teleprompt/random_search.py
index c6447e8330..232dde686d 100644
--- a/dspy/teleprompt/random_search.py
+++ b/dspy/teleprompt/random_search.py
@@ -1,30 +1,36 @@
 import random
 
 import dspy
 from dspy.evaluate.evaluate import Evaluate
 from dspy.teleprompt.teleprompt import Teleprompter
 
 from .bootstrap import BootstrapFewShot
 from .vanilla import LabeledFewShot
 
-# TODO: Don't forget dealing with the raw demos.
-# TODO: Deal with the (pretty common) case of having a metric for filtering and a separate metric for eval.
-# The metric itself may tell though by the presence of trace.
-
-# TODO: This function should take a max_budget and max_teacher_budget. That's in the number of program calls.
-# In this case, max_student_budget is max_budget - max_teacher_budget.
-# For max_teacher_budget, this will just limit the total number of things we bootstrap.
-# This can end up implicitly defining the number of candidate programs (i.e., stop when runs out). Cap at 16.
-# For max_student_budget, this will be a more upfront calculation.
-# Right now, it can also just induce the number of candidate programs. Later, it could be used more interestingly
-# for selective early stopping.
-# Progressive elimination sounds about right: after 50 examples, drop bottom third, after 100, another third, etc.
-# until only 3--5 are left for the end. Could also be systematic and add (earlier) stopping based on error bounds.
-# In general, though, the early filtering is just saying: either there are some really bad ones, or some really really
-# good ones, or most things are pretty close. In all of these cases, dropping the bottom third is not going to hurt.
-
-
 class BootstrapFewShotWithRandomSearch(Teleprompter):
+    """
+    A teleprompter that bootstraps multiple candidate sets of demonstrations and selects the best
+    DSPy program based on evaluation scores.
+
+    It samples up to `num_candidate_programs` candidate demonstration sets by resampling and shuffling
+    the training data, optionally bootstrapping further demonstrations. It then evaluates each candidate
+    program with the provided `metric` on a validation set, tracks scores (respecting the error limit),
+    and returns the best-performing program.
+
+    Args:
+        metric (Callable): Evaluation metric used to score candidate programs.
+        teacher_settings (dict): Settings for the teacher module, passed to the underlying BootstrapFewShot.
+        max_bootstrapped_demos (int): Maximum number of bootstrapped demonstrations per predictor.
+        max_labeled_demos (int): Maximum number of labeled demonstrations from the training set per predictor.
+        max_rounds (int): Maximum rounds of bootstrapping per candidate program.
+        num_candidate_programs (int): Number of candidate sets/programs to generate and evaluate.
+        num_threads (int): Number of threads used for parallelized scoring (optional).
+        max_errors (int): Maximum number of scoring errors allowed before halting (defaults to the global
+            DSPy max_errors setting).
+        stop_at_score (float): Stop early once a candidate reaches this score or higher.
+        metric_threshold (float): Optionally require metric >= threshold to accept a bootstrapped demo.
+
+    Example:
+        ```python
+        from dspy.teleprompt import BootstrapFewShotWithRandomSearch
+
+        teleprompter = BootstrapFewShotWithRandomSearch(
+            metric=custom_metric,
+            num_candidate_programs=8,
+            max_bootstrapped_demos=3,
+            max_labeled_demos=4,
+            num_threads=4,
+            stop_at_score=0.95,
+            max_errors=15,
+        )
+        optimized_program = teleprompter.compile(student=qa_module, trainset=train_examples)
+        ```
+    """
+
     def __init__(
         self,
         metric,
@@ -55,8 +61,22 @@ def __init__(
         print(f"Will attempt to bootstrap {self.num_candidate_sets} candidate sets.")
 
     def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None, labeled_sample=True):
+        """
+        Compile and optimize a DSPy program by bootstrapping and evaluating multiple candidate programs.
+
+        Args:
+            student (Module): The DSPy program to optimize.
+            teacher (Module): Teacher program used to bootstrap demos (optional; defaults to the student).
+            trainset (list): Training examples.
+            valset (list): Evaluation data for scoring candidates (defaults to the trainset).
+            restrict: Optionally restrict which candidate seeds are run (advanced usage).
+            labeled_sample (bool): If True, sample labeled demos from the trainset.
+
+        Returns:
+            Module: The best-performing compiled program, with all evaluated candidates attached.
+        """
         self.trainset = trainset
-        self.valset = valset or trainset  # TODO: FIXME: Note this choice.
+        self.valset = valset or trainset
 
         effective_max_errors = self.max_errors if self.max_errors is not None else dspy.settings.max_errors
@@ -120,9 +140,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
             )
 
             result = evaluate(program)
 
-            score, subscores = result.score, [output[2] for output in result.results]
-
-            all_subscores.append(subscores)
+            score = result.score
 
             if len(scores) == 0 or score > max(scores):
@@ -139,22 +157,11 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
                 print(f"Stopping early because score {score} is >= stop_at_score {self.stop_at_score}")
                 break
 
-        # To best program, attach all program candidates in decreasing average score
+        # Attach all evaluated candidate programs to the best-performing program, sorted by decreasing score.
         best_program.candidate_programs = score_data
         best_program.candidate_programs = sorted(
             best_program.candidate_programs, key=lambda x: x["score"], reverse=True
         )
 
         print(f"{len(best_program.candidate_programs)} candidate programs found.")
-
         return best_program
-
-
-# sample between 4 and 10 examples from traces
-# TODO: FIXME: The max number of demos should be determined in part by the LM's tokenizer + max_length.
-# This does require executing the program, or at least the predictor.
-# # # # # # (Actually we can just combine the token counts of the traces, when formatted via signature/adapter).
-# Alternatively, we can keep track of the (zero-shot) number of tokens when we bootstrap.
-# As another option, we can just try a wide range and handle failures as penalties on the score.
-# The number "24" of traces to collect can also be affected. If we only need 3x10, some overlap is ok.
-# We can also consider having short_demos and long_demos.
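Continuing the class docstring's example above, a sketch of how a caller can inspect the candidates that `compile` now attaches to the returned program. The names `teleprompter`, `qa_module`, and the example lists are the hypothetical ones from that docstring; each attached entry is one of the score-sorted `score_data` dicts, so it exposes at least the 'score' key used for sorting.

```python
# Sketch only: inspecting the candidates attached by compile().
best = teleprompter.compile(student=qa_module, trainset=train_examples, valset=val_examples)

# Candidates are sorted in decreasing score order, so the first entry mirrors the returned program.
for rank, candidate in enumerate(best.candidate_programs, start=1):
    print(f"#{rank}: score={candidate['score']}")
```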