36 changes: 35 additions & 1 deletion dspy/datasets/math.py
@@ -1,8 +1,28 @@
import random
import re


class MATH:
"""
    MATH is a dataset wrapper for the DigitalLearningGmbH/MATH-lighteval dataset, intended for math question-answering tasks in DSPy. On construction it loads, shuffles, and splits the data into train, dev, and test sets for use in program development and evaluation workflows. The `metric` method checks mathematical equivalence between gold and predicted answers using the official math_equivalence library.

Args:
subset (str): Subset to load (typically a split such as 'test').

Attributes:
        train (list[dspy.Example]): Training set with input 'question' and labels 'reasoning' and 'answer'.
dev (list[dspy.Example]): Development set.
test (list[dspy.Example]): Test set.

Example:
>>> import dspy
>>> from dspy.datasets.math import MATH
>>> math_ds = MATH('test')
>>> for example in math_ds.train[:3]:
... print(example.question)
... print(example.answer)
...
        >>> score = math_ds.metric(example, prediction)  # `prediction` is any object with an 'answer' field
"""
def __init__(self, subset):
from datasets import load_dataset

@@ -25,6 +45,16 @@ def __init__(self, subset):
self.train, self.dev, self.test = dataset[:size], dataset[size : 2 * size], dataset[2 * size :]

def metric(self, example, pred, trace=None):
"""
Math equivalence metric: checks whether the predicted answer matches the gold answer up to mathematical equivalence using Hendrycks's math_equivalence package (see: https://github.com/hendrycks/math).

Args:
example (dspy.Example): Example with the gold answer.
pred (dspy.Example): Prediction with answer field.
trace (any, optional): Not used.
Returns:
bool: True if answers are mathematically equivalent, else False.
"""
try:
import math_equivalence
except ImportError:
@@ -34,6 +64,10 @@ def metric(self, example, pred, trace=None):


def extract_answer(s):
"""
Extracts the final boxed answer from a LaTeX-formatted solution string.
    Returns the content of the \\boxed{...} expression, or None if no boxed answer is present.
"""
start = s.find("\\boxed{")
if start == -1:
return None
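A brief usage sketch of the wrapper and its `metric`, assuming a configured LM (the model name below is illustrative), a simple `question -> answer` program, and the `math_equivalence` package being installed:

```python
import dspy
from dspy.datasets.math import MATH, extract_answer

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # illustrative model name

math_ds = MATH("test")                            # loads, shuffles, and splits the dataset
program = dspy.ChainOfThought("question -> answer")

example = math_ds.dev[0]
prediction = program(question=example.question)
print(math_ds.metric(example, prediction))        # True if the answers are mathematically equivalent

# The module-level helper pulls the boxed answer out of a LaTeX solution string.
print(extract_answer("Thus the result is $\\boxed{42}$."))  # -> 42
```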
12 changes: 0 additions & 12 deletions dspy/propose/utils.py
@@ -2,18 +2,6 @@
import json
import re

import dspy

try:
from IPython.core.magics.code import extract_symbols
except ImportError:
# Won't be able to read code from jupyter notebooks
extract_symbols = None

from dspy.predict.parameter import Parameter
from dspy.teleprompt.utils import get_signature, new_getfile


def strip_prefix(text):
pattern = r"^[\*\s]*(([\w\'\-]+\s+){0,4}[\w\'\-]+):\s*"
modified_text = re.sub(pattern, "", text)
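As context for the regex in `strip_prefix`, a minimal sketch of what the pattern removes: a leading label of up to five words followed by a colon, along with surrounding asterisks and whitespace. The standalone reimplementation below is an assumption in that it only applies the substitution; the rest of the function body is not shown here and may do further cleanup.

```python
import re

def strip_prefix_sketch(text):
    # Same pattern as strip_prefix: optional leading '*'/whitespace, then up to
    # five words ending in ':', then trailing whitespace.
    pattern = r"^[\*\s]*(([\w\'\-]+\s+){0,4}[\w\'\-]+):\s*"
    return re.sub(pattern, "", text)

print(strip_prefix_sketch("**Proposed Instruction: Answer the question concisely."))
# -> Answer the question concisely.
```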
8 changes: 5 additions & 3 deletions dspy/teleprompt/bettertogether.py
@@ -14,6 +14,7 @@
)
from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch
from dspy.teleprompt.teleprompt import Teleprompter
from dspy.teleprompt.gepa import GEPA

logger = logging.getLogger(__name__)

@@ -37,15 +38,16 @@ def __init__(self,
# a BootstrapFinetune without a metric, say, if there aren't labels
# available for the training data. Should this be noted somewhere?
# TODO: We should re-consider if the metric should be required.
        # GEPA is now supported as both a prompt optimizer and a weight optimizer.
self.prompt_optimizer = prompt_optimizer if prompt_optimizer else BootstrapFewShotWithRandomSearch(metric=metric)
self.weight_optimizer = weight_optimizer if weight_optimizer else BootstrapFinetune(metric=metric)

is_supported_prompt = isinstance(self.prompt_optimizer, BootstrapFewShotWithRandomSearch)
is_supported_weight = isinstance(self.weight_optimizer, BootstrapFinetune)
is_supported_prompt = isinstance(self.prompt_optimizer, (BootstrapFewShotWithRandomSearch, GEPA))
is_supported_weight = isinstance(self.weight_optimizer, (BootstrapFinetune, GEPA))
if not is_supported_prompt or not is_supported_weight:
raise ValueError(
"The BetterTogether optimizer only supports the following optimizers for now: BootstrapFinetune, "
"BootstrapFewShotWithRandomSearch."
"BootstrapFewShotWithRandomSearch, GEPA."
)

self.rng = random.Random(seed)
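A hedged sketch of how the widened check can be exercised, pairing GEPA for prompt optimization with BootstrapFinetune for weights. GEPA's exact constructor arguments (the `auto` budget, an optional reflection LM) and the extended metric signature are assumptions here and may need adjusting:

```python
import dspy
from dspy.teleprompt import BetterTogether, BootstrapFinetune
from dspy.teleprompt.gepa import GEPA

# Metric with the extra optional arguments GEPA may pass alongside the usual
# (example, pred, trace) triple; the other optimizers simply ignore them.
def metric(example, pred, trace=None, pred_name=None, pred_trace=None):
    return example.answer == pred.answer

optimizer = BetterTogether(
    metric=metric,
    prompt_optimizer=GEPA(metric=metric, auto="light"),  # assumed GEPA arguments
    weight_optimizer=BootstrapFinetune(metric=metric),
)
# optimized = optimizer.compile(student=program, trainset=trainset)
```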
1 change: 1 addition & 0 deletions dspy/teleprompt/bootstrap_finetune.py
@@ -320,3 +320,4 @@ def kill_lms(program: Module):
lms = get_unique_lms(program)
for lm in lms:
lm.kill()
12 changes: 2 additions & 10 deletions dspy/teleprompt/mipro_optimizer_v2.py
@@ -109,16 +109,8 @@ def compile(
view_data_batch_size: int = 10,
tip_aware_proposer: bool = True,
fewshot_aware_proposer: bool = True,
requires_permission_to_run: bool | None = None, # deprecated
provide_traceback: bool | None = None,
) -> Any:
if requires_permission_to_run == False:
logger.warning(
"'requires_permission_to_run' is deprecated and will be removed in a future version."
)
elif requires_permission_to_run == True:
raise ValueError("User confirmation is removed from MIPROv2. Please remove the 'requires_permission_to_run' argument.")

effective_max_errors = (
self.max_errors
if self.max_errors is not None
@@ -629,7 +621,7 @@ def _log_minibatch_eval(

logger.info(f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}.")
minibatch_scores = ", ".join([f"{s['score']}" for s in score_data if not s["full_eval"]])
logger.info(f"Minibatch scores so far: {'[' + minibatch_scores + ']'}")
logger.info(f"Minibatch scores so far: '[' + minibatch_scores + '']")
full_eval_scores = ", ".join([f"{s['score']}" for s in score_data if s["full_eval"]])
trajectory = "[" + full_eval_scores + "]"
logger.info(f"Full eval scores so far: {trajectory}")
@@ -662,7 +654,7 @@ def _log_normal_eval(

logger.info(f"Score: {score} with parameters {chosen_params}.")
full_eval_scores = ", ".join([f"{s['score']}" for s in score_data if s["full_eval"]])
logger.info(f"Scores so far: {'[' + full_eval_scores + ']'}")
logger.info(f"Scores so far: '[' + full_eval_scores + '']")
logger.info(f"Best score so far: {best_score}")
logger.info(f"{'=' * len(f'===== Trial {trial.number + 1} / {num_trials} =====')}\n\n")

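A minimal reminder of the f-string behavior the log lines above rely on: only expressions inside braces are interpolated, so the bracketed score list can be written directly into the format string.

```python
minibatch_scores = ", ".join(str(s) for s in [0.42, 0.57, 0.61])
print(f"Minibatch scores so far: [{minibatch_scores}]")
# -> Minibatch scores so far: [0.42, 0.57, 0.61]
```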
89 changes: 48 additions & 41 deletions dspy/teleprompt/random_search.py
@@ -1,30 +1,36 @@
import random

import dspy
from dspy.evaluate.evaluate import Evaluate
from dspy.teleprompt.teleprompt import Teleprompter

from .bootstrap import BootstrapFewShot
from .vanilla import LabeledFewShot

# TODO: Don't forget dealing with the raw demos.
# TODO: Deal with the (pretty common) case of having a metric for filtering and a separate metric for eval.
# The metric itself may tell though by the presence of trace.

# TODO: This function should take a max_budget and max_teacher_budget. That's in the number of program calls.
# In this case, max_student_budget is max_budget - max_teacher_budget.
# For max_teacher_budget, this will just limit the total number of things we bootstrap.
# This can end up implicitly defining the number of candidate programs (i.e., stop when runs out). Cap at 16.
# For max_student_budget, this will be a more upfront calculation.
# Right now, it can also just induce the number of candidate programs. Later, it could be used more interestingly
# for selective early stopping.
# Progressive elimination sounds about right: after 50 examples, drop bottom third, after 100, another third, etc.
# until only 3--5 are left for the end. Could also be systematic and add (earlier) stopping based on error bounds.
# In general, though, the early filtering is just saying: either there are some really bad ones, or some really really
# good ones, or most things are pretty close. In all of these cases, dropping the bottom third is not going to hurt.


class BootstrapFewShotWithRandomSearch(Teleprompter):
"""
An experimental teleprompter that bootstraps multiple candidate sets of demonstrations and selects the best DSPy program based on evaluation scores.

This teleprompter samples up to `num_candidate_programs` candidate demonstration sets by resampling and shuffling the training data, optionally bootstrapping further demonstrations. It then evaluates each candidate program using the provided `metric` on a validation set, tracks scores (with support for error limits), and returns the best-performing program.

Args:
metric (Callable): Evaluation metric used to score candidate programs.
teacher_settings (dict): Settings for the teacher module, passed to underlying BootstrapFewShot.
max_bootstrapped_demos (int): Maximum number of bootstrapped demonstrations per predictor.
max_labeled_demos (int): Maximum labeled demonstrations from the training set per predictor.
max_rounds (int): Maximum rounds of bootstrapping (per candidate program).
num_candidate_programs (int): Number of candidate sets/programs to generate and evaluate.
num_threads (int): Number of threads to use for parallelized scoring (optional).
max_errors (int): Maximum allowed scoring errors before halting (defaults to DSPy global max_errors).
stop_at_score (float): Early stopping when a candidate achieves this score or higher.
metric_threshold (float): Optionally require metric >= threshold to accept a bootstrapped demo.

Example:
```python
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
teleprompter = BootstrapFewShotWithRandomSearch(
metric=custom_metric,
num_candidate_programs=8,
max_bootstrapped_demos=3,
max_labeled_demos=4,
num_threads=4,
stop_at_score=0.95,
max_errors=15
)
optimized_program = teleprompter.compile(student=qa_module, trainset=train_examples)
```
"""
def __init__(
self,
metric,
@@ -55,8 +61,22 @@ def __init__(
print(f"Will attempt to bootstrap {self.num_candidate_sets} candidate sets.")

def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None, labeled_sample=True):
"""
Compile and optimize a DSPy program by bootstrapping and evaluating multiple candidate programs.

Args:
student (Module): The DSPy program to optimize.
            teacher (Module): Teacher program used to bootstrap demos (optional; defaults to the student).
trainset (list): List of training examples.
valset (list): Evaluation data for scoring candidates (defaults to trainset).
restrict: Optionally restrict candidate seeds (advanced usage).
labeled_sample (bool): If True, sample labeled demos from trainset.

Returns:
Module: The best performing compiled program with candidate history.
"""
self.trainset = trainset
self.valset = valset or trainset # TODO: FIXME: Note this choice.
self.valset = valset or trainset

effective_max_errors = self.max_errors if self.max_errors is not None else dspy.settings.max_errors

@@ -120,9 +140,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
)

result = evaluate(program)

score, subscores = result.score, [output[2] for output in result.results]

all_subscores.append(subscores)

if len(scores) == 0 or score > max(scores):
@@ -139,22 +157,11 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
print(f"Stopping early because score {score} is >= stop_at_score {self.stop_at_score}")
break

# To best program, attach all program candidates in decreasing average score
# Attach all evaluated candidate programs to the best performing program, sorted by score
best_program.candidate_programs = score_data
best_program.candidate_programs = sorted(
best_program.candidate_programs, key=lambda x: x["score"], reverse=True
)

print(f"{len(best_program.candidate_programs)} candidate programs found.")

return best_program


# sample between 4 and 10 examples from traces
# TODO: FIXME: The max number of demos should be determined in part by the LM's tokenizer + max_length.
# This does require executing the program, or at least the predictor.
# # # # # # (Actually we can just combine the token counts of the traces, when formatted via signature/adapter).
# Alternatively, we can keep track of the (zero-shot) number of tokens when we bootstrap.
# As another option, we can just try a wide range and handle failures as penalties on the score.
# The number "24" of traces to collect can also be affected. If we only need 3x10, some overlap is ok.
# We can also consider having short_demos and long_demos.
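The `custom_metric` referenced in the BootstrapFewShotWithRandomSearch docstring example follows the usual DSPy metric contract: a callable taking `(example, pred, trace=None)` and returning a bool or float. A minimal sketch, with the `answer` field name assumed:

```python
def custom_metric(example, pred, trace=None):
    # Exact match on an assumed `answer` field. When called during bootstrapping
    # (trace is not None), the boolean acts as a pass/fail filter for candidate demos.
    return example.answer.strip().lower() == pred.answer.strip().lower()
```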