diff --git a/.coverage b/.coverage index 7ac1d3f..44c7c72 100644 Binary files a/.coverage and b/.coverage differ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 22c8d88..f08867b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,6 +14,11 @@ repos: rev: 6.0.0 hooks: - id: flake8 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.14.6 + hooks: + - id: ruff-check + args: [ --fix ] - repo: https://github.com/pycqa/isort rev: 5.12.0 hooks: diff --git a/README.md b/README.md index 9288857..8f33e84 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ -![promptolution](https://github.com/user-attachments/assets/84c050bd-61a1-4f2e-bc4e-874d9b4a69af) ![Coverage](https://img.shields.io/badge/Coverage-91%25-brightgreen) [![CI](https://github.com/finitearth/promptolution/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/finitearth/promptolution/actions/workflows/ci.yml) @@ -7,104 +6,93 @@ ![Python Versions](https://img.shields.io/badge/Python%20Versions-β‰₯3.10-blue) [![Getting Started](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/finitearth/promptolution/blob/main/tutorials/getting_started.ipynb) -Promptolution is a library that provides a modular and extensible framework for implementing prompt tuning for single tasks and larger experiments. It offers a user-friendly interface to assemble the core components for various prompt optimization tasks. +![promptolution](https://github.com/user-attachments/assets/84c050bd-61a1-4f2e-bc4e-874d9b4a69af) + +

+<!-- institution logos: lmu_logo, mcml, ellis_logo, uni_freiburg_color, tum_logo -->
+

+ +## πŸš€ What is Promptolution? -This project was developed by [Timo Heiß](https://www.linkedin.com/in/timo-heiss/), [Moritz Schlager](https://www.linkedin.com/in/moritz-schlager/) and [Tom Zehle](https://www.linkedin.com/in/tom-zehle/) as part of a study program at LMU Munich. +**Promptolution** is a unified, modular framework for prompt optimization built for researchers and advanced practitioners who want full control over their experimental setup. Unlike end-to-end application frameworks with high abstraction, promptolution focuses exclusively on the optimization stage, providing a clean, transparent, and extensible API. It allows for simple prompt optimization for one task up to large-scale reproducible benchmark experiments. + +promptolution_framework + +### Key Features -## Installation +* Implementation of many current prompt optimizers out of the box. +* Unified LLM backend supporting API-based models, Local LLMs, and vLLM clusters. +* Built-in response caching to save costs and parallelized inference for speed. +* Detailed logging and token usage tracking for granular post-hoc analysis. -Use pip to install our library: +Have a look at our [Release Notes](https://finitearth.github.io/promptolution/release-notes/) for the latest updates to promptolution. + +## πŸ“¦ Installation ``` pip install promptolution[api] ``` -If you want to run your prompt optimization locally, either via transformers or vLLM, consider running: +Local inference via vLLM or transformers: ``` pip install promptolution[vllm,transformers] ``` -Alternatively, clone the repository, run +From source: ``` +git clone https://github.com/finitearth/promptolution.git +cd promptolution poetry install ``` -to install the necessary dependencies. You might need to install [pipx](https://pipx.pypa.io/stable/installation/) and [poetry](https://python-poetry.org/docs/) first. - -## Usage - -To get started right away, take a look at our [getting started notebook](https://github.com/finitearth/promptolution/blob/main/tutorials/getting_started.ipynb) and our [other demos and tutorials](https://github.com/finitearth/promptolution/blob/main/tutorials). -For more details, a comprehensive **documentation** with API reference is availabe at https://finitearth.github.io/promptolution/. +## πŸ”§ Quickstart -### Featured Optimizers +Start with the **Getting Started tutorial**: +[https://github.com/finitearth/promptolution/blob/main/tutorials/getting_started.ipynb](https://github.com/finitearth/promptolution/blob/main/tutorials/getting_started.ipynb) -| **Name** | **Paper** | **init prompts** | **Exploration** | **Costs** | **Parallelizable** | **Utilizes Fewshot Examples** | -| :-----------: | :----------------------------------------------: | :--------------: | :-------------: | :-------: | :-------------------: | :---------------------------: | -| `CAPO` | [Zehle et al.](https://arxiv.org/abs/2504.16005) | _required_ | πŸ‘ | πŸ’² | βœ… | βœ… | -| `EvoPromptDE` | [Guo et al.](https://arxiv.org/abs/2309.08532) | _required_ | πŸ‘ | πŸ’²πŸ’² | βœ… | ❌ | -| `EvoPromptGA` | [Guo et al.](https://arxiv.org/abs/2309.08532) | _required_ | πŸ‘ | πŸ’²πŸ’² | βœ… | ❌ | -| `OPRO` | [Yang et al.](https://arxiv.org/abs/2309.03409) | _optional_ | πŸ‘Ž | πŸ’²πŸ’² | ❌ | ❌ | +Full docs: +[https://finitearth.github.io/promptolution/](https://finitearth.github.io/promptolution/) -### Core Components - -- `Task`: Encapsulates initial prompts, dataset features, targets, and evaluation methods. 
-- `Predictor`: Implements the prediction logic, interfacing between the `Task` and `LLM` components. -- `LLM`: Unifies the process of obtaining responses from language models, whether locally hosted or accessed via API. -- `Optimizer`: Implements prompt optimization algorithms, utilizing the other components during the optimization process. - -### Key Features -- Modular and object-oriented design -- Extensible architecture -- Easy-to-use interface for assembling experiments -- Parallelized LLM requests for improved efficiency -- Integration with langchain for standardized LLM API calls -- Detailed logging and callback system for optimization analysis +## 🧠 Featured Optimizers -## Changelog +| **Name** | **Paper** | **Init prompts** | **Exploration** | **Costs** | **Parallelizable** | **Few-shot** | +| ---- | ---- | ---- |---- |---- | ----|---- | +| `CAPO` | [Zehle et al., 2025](https://arxiv.org/abs/2504.16005) | required | πŸ‘ | πŸ’² | βœ… | βœ… | +| `EvoPromptDE` | [Guo et al., 2023](https://arxiv.org/abs/2309.08532) | required | πŸ‘ | πŸ’²πŸ’² | βœ… | ❌ | +| `EvoPromptGA` | [Guo et al., 2023](https://arxiv.org/abs/2309.08532) | required | πŸ‘ | πŸ’²πŸ’² | βœ… | ❌ | +| `OPRO` | [Yang et al., 2023](https://arxiv.org/abs/2309.03409) | optional | πŸ‘Ž | πŸ’²πŸ’² | ❌ | ❌ | -Release notes for each version of the library can be found [here](https://finitearth.github.io/promptolution/release-notes/) +## πŸ— Components -## Contributing +* **`Task`** – Manages the dataset, evaluation metrics, and subsampling. +* **`Predictor`** – Defines how to extract the answer from the model's response. +* **`LLM`** – A unified interface handling inference, token counting, and concurrency. +* **`Optimizer`** – The core component that implements the algorithms that refine prompts. +* **`ExperimentConfig`** – A configuration abstraction to streamline and parametrize large-scale scientific experiments. -The first step to contributing is to open an issue describing the bug, feature, or enhancements. Ensure the issue is clearly described, assigned, and properly tagged. All work should be linked to an open issue. +## 🀝 Contributing -### Code Style and Linting +Open an issue β†’ create a branch β†’ PR β†’ CI β†’ review β†’ merge. +Branch naming: `feature/...`, `fix/...`, `chore/...`, `refactor/...`. -We use Black for code formatting, Flake8 for linting, pydocstyle for docstring conventions (Google format), and isort to sort imports. All these checks are enforced via pre-commit hooks, which automatically run on every commit. Install the pre-commit hooks to ensure that all checks run automatically: +Please ensure to use pre-commit, which assists with keeping the code quality high: ``` pre-commit install -``` - -To run all checks manually: - -``` pre-commit run --all-files ``` - -### Branch Protection and Merging Guidelines - -- The main branch is protected. No direct commits are allowed for non-administrators. -- Rebase your branch on main before opening a pull request. -- All contributions must be made on dedicated branches linked to specific issues. -- Name the branch according to {prefix}/{description} with one of the prefixes fix, feature, chore, or refactor. -- A pull request must have at least one approval from a code owner before it can be merged into main. -- CI checks must pass before a pull request can be merged. -- New releases will only be created by code owners. - -### Testing - -We use pytest to run tests, and coverage to track code coverage. 
Tests automatically run on pull requests and pushes to the main branch, but please ensure they also pass locally before pushing! -To run the tests with coverage locally, use the following commands or your IDE's test runner: +We encourage every contributor to also write tests, that automatically check if the implementation works as expected: ``` poetry run python -m coverage run -m pytest -``` - -To see the coverage report run: -``` poetry run python -m coverage report ``` + +Developed by **Timo Heiß**, **Moritz Schlager**, and **Tom Zehle** (LMU Munich, MCML, ELLIS, TUM, Uni Freiburg). diff --git a/docs/index.md b/docs/index.md index c562b8c..5496305 100644 --- a/docs/index.md +++ b/docs/index.md @@ -29,5 +29,6 @@ Or clone our GitHub repository: - [Optimizers](api/optimizers.md) - [Predictors](api/predictors.md) - [Tasks](api/tasks.md) -- [Callbacks](api/callbacks.md) -- [Config](api/config.md) +- [Helpers](api/helpers.md) +- [Utils](api/utils.md) +- [Exemplar Selectors](api/examplar_selectors.md) diff --git a/docs/release-notes/v2.2.0.md b/docs/release-notes/v2.2.0.md new file mode 100644 index 0000000..8724a41 --- /dev/null +++ b/docs/release-notes/v2.2.0.md @@ -0,0 +1,13 @@ +## Release v2.2.0 +### What's changed + +#### Added features: +* Extended interface of APILLM allowing to pass kwargs to the API +* Improve asynchronous parallelization of LLM calls shortening inference times +* Introduced a `Prompt` class to encapsulate instructions and few-shot examples + +#### Further changes: +* Improved error handling +* Improved task-description infusion mechanism for meta-prompts + +**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/2.1.0...v2.2.0) diff --git a/mkdocs.yml b/mkdocs.yml index 57cde7a..ac377fb 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -47,6 +47,7 @@ nav: - Home: index.md - Release Notes: - Overview: release-notes.md + - v2.2.0: release-notes/v2.2.0.md - v2.1.0: release-notes/v2.1.0.md - v2.0.1: release-notes/v2.0.1.md - v2.0.0: release-notes/v2.0.0.md diff --git a/promptolution/exemplar_selectors/__init__.py b/promptolution/exemplar_selectors/__init__.py index 62e6c9a..e948a3a 100644 --- a/promptolution/exemplar_selectors/__init__.py +++ b/promptolution/exemplar_selectors/__init__.py @@ -2,3 +2,8 @@ from promptolution.exemplar_selectors.random_search_selector import RandomSearchSelector from promptolution.exemplar_selectors.random_selector import RandomSelector + +__all__ = [ + "RandomSelector", + "RandomSearchSelector", +] diff --git a/promptolution/exemplar_selectors/base_exemplar_selector.py b/promptolution/exemplar_selectors/base_exemplar_selector.py index bb2ee21..5d77647 100644 --- a/promptolution/exemplar_selectors/base_exemplar_selector.py +++ b/promptolution/exemplar_selectors/base_exemplar_selector.py @@ -5,6 +5,8 @@ from typing import TYPE_CHECKING, Optional +from promptolution.utils.prompt import Prompt + if TYPE_CHECKING: # pragma: no cover from promptolution.predictors.base_predictor import BasePredictor from promptolution.tasks.base_task import BaseTask @@ -33,11 +35,11 @@ def __init__(self, task: "BaseTask", predictor: "BasePredictor", config: Optiona config.apply_to(self) @abstractmethod - def select_exemplars(self, prompt: str, n_examples: int = 5) -> str: + def select_exemplars(self, prompt: Prompt, n_examples: int = 5) -> Prompt: """Select exemplars based on the given prompt. Args: - prompt (str): The input prompt to base the exemplar selection on. + prompt (Prompt): The input prompt to base the exemplar selection on. 
n_examples (int, optional): The number of exemplars to select. Defaults to 5. Returns: diff --git a/promptolution/exemplar_selectors/random_search_selector.py b/promptolution/exemplar_selectors/random_search_selector.py index 7a88b08..b8cb6ee 100644 --- a/promptolution/exemplar_selectors/random_search_selector.py +++ b/promptolution/exemplar_selectors/random_search_selector.py @@ -1,6 +1,7 @@ """Random search exemplar selector.""" from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector +from promptolution.utils.prompt import Prompt class RandomSearchSelector(BaseExemplarSelector): @@ -10,7 +11,7 @@ class RandomSearchSelector(BaseExemplarSelector): evaluates their performance, and selects the best performing set. """ - def select_exemplars(self, prompt: str, n_trials: int = 5) -> str: + def select_exemplars(self, prompt: Prompt, n_trials: int = 5) -> Prompt: """Select exemplars using a random search strategy. This method generates multiple sets of random examples, evaluates their performance @@ -21,7 +22,7 @@ def select_exemplars(self, prompt: str, n_trials: int = 5) -> str: n_trials (int, optional): The number of random trials to perform. Defaults to 5. Returns: - str: The best performing prompt, which includes the original prompt and the selected exemplars. + Prompt: The best performing prompt, which includes the original prompt and the selected exemplars. """ best_score = 0.0 best_prompt = prompt @@ -30,7 +31,7 @@ def select_exemplars(self, prompt: str, n_trials: int = 5) -> str: _, seq = self.task.evaluate( prompt, self.predictor, eval_strategy="subsample", return_seq=True, return_agg_scores=False ) - prompt_with_examples = "\n\n".join([prompt] + [seq[0][0]]) + "\n\n" + prompt_with_examples = Prompt(prompt.instruction, [seq[0][0]]) # evaluate prompts as few shot prompt score = self.task.evaluate(prompt_with_examples, self.predictor, eval_strategy="subsample")[0] if score > best_score: diff --git a/promptolution/exemplar_selectors/random_selector.py b/promptolution/exemplar_selectors/random_selector.py index a6a4b72..7b0ae0f 100644 --- a/promptolution/exemplar_selectors/random_selector.py +++ b/promptolution/exemplar_selectors/random_selector.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, List, Optional from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector +from promptolution.utils.prompt import Prompt if TYPE_CHECKING: # pragma: no cover from promptolution.predictors.base_predictor import BasePredictor @@ -37,18 +38,18 @@ def __init__( self.desired_score = desired_score super().__init__(task, predictor, config) - def select_exemplars(self, prompt: str, n_examples: int = 5) -> str: + def select_exemplars(self, prompt: Prompt, n_examples: int = 5) -> Prompt: """Select exemplars using a random selection strategy. This method generates random examples and selects those that are evaluated as correct (score == self.desired_score) until the desired number of exemplars is reached. Args: - prompt (str): The input prompt to base the exemplar selection on. + prompt (Prompt): The input prompt to base the exemplar selection on. n_examples (int, optional): The number of exemplars to select. Defaults to 5. Returns: - str: A new prompt that includes the original prompt and the selected exemplars. + Prompt: A new prompt that includes the original prompt and the selected exemplars. 
""" examples: List[str] = [] while len(examples) < n_examples: @@ -59,4 +60,4 @@ def select_exemplars(self, prompt: str, n_examples: int = 5) -> str: seq = seqs[0][0] if score == self.desired_score: examples.append(seq) - return "\n\n".join([prompt] + examples) + "\n\n" + return Prompt(prompt.instruction, examples) diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 2594609..a25c008 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -1,10 +1,11 @@ """Helper functions for the usage of the libary.""" - -from typing import TYPE_CHECKING, Callable, List, Literal, Optional +from typing import TYPE_CHECKING, Callable, List, Literal, Optional, Union, cast from promptolution.tasks.judge_tasks import JudgeTask from promptolution.tasks.reward_tasks import RewardTask +from promptolution.utils.prompt import Prompt +from promptolution.utils.prompt_creation import create_prompts_from_task_description if TYPE_CHECKING: # pragma: no cover from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector @@ -28,17 +29,8 @@ from promptolution.optimizers.evoprompt_de import EvoPromptDE from promptolution.optimizers.evoprompt_ga import EvoPromptGA from promptolution.optimizers.opro import OPRO -from promptolution.optimizers.templates import ( - CAPO_CROSSOVER_TEMPLATE, - CAPO_MUTATION_TEMPLATE, - EVOPROMPT_DE_TEMPLATE, - EVOPROMPT_DE_TEMPLATE_TD, - EVOPROMPT_GA_TEMPLATE, - EVOPROMPT_GA_TEMPLATE_TD, - OPRO_TEMPLATE, - OPRO_TEMPLATE_TD, -) -from promptolution.predictors.classifier import FirstOccurrenceClassifier, MarkerBasedClassifier +from promptolution.predictors.first_occurrence_predictor import FirstOccurrencePredictor +from promptolution.predictors.maker_based_predictor import MarkerBasedPredictor from promptolution.tasks.classification_tasks import ClassificationTask from promptolution.utils.logging import get_logger @@ -59,12 +51,13 @@ def run_experiment(df: pd.DataFrame, config: "ExperimentConfig") -> pd.DataFrame train_df = df.sample(frac=0.8, random_state=42) test_df = df.drop(train_df.index) prompts = run_optimization(train_df, config) - df_prompt_scores = run_evaluation(test_df, config, prompts) + prompts_str = [p.construct_prompt() for p in prompts] + df_prompt_scores = run_evaluation(test_df, config, prompts_str) return df_prompt_scores -def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[str]: +def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[Prompt]: """Run the optimization phase of the experiment. Configures all LLMs (downstream, meta, and judge) to use @@ -74,12 +67,18 @@ def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[str]: config (Config): Configuration object for the experiment. Returns: - List[str]: The optimized list of prompts. + List[Prompt]: The optimized list of prompts. """ llm = get_llm(config=config) predictor = get_predictor(llm, config=config) - config.task_description = (config.task_description or "") + " " + (predictor.extraction_description or "") + if getattr(config, "prompts") is None: + initial_prompts = create_prompts_from_task_description( + task_description=config.task_description, + llm=llm, + ) + config.prompts = [Prompt(p) for p in initial_prompts] + if config.optimizer == "capo" and (config.eval_strategy is None or "block" not in config.eval_strategy): logger.warning("πŸ“Œ CAPO requires block evaluation strategy. 
Setting it to 'sequential_block'.") config.eval_strategy = "sequential_block" @@ -94,14 +93,15 @@ def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[str]: logger.warning("πŸ”₯ Starting optimization...") prompts = optimizer.optimize(n_steps=config.n_steps) - if hasattr(config, "prepend_exemplars") and config.prepend_exemplars: + if hasattr(config, "posthoc_exemplar_selection") and config.posthoc_exemplar_selection: selector = get_exemplar_selector(config.exemplar_selector, task, predictor) prompts = [selector.select_exemplars(p, n_examples=config.n_exemplars) for p in prompts] - return prompts -def run_evaluation(df: pd.DataFrame, config: "ExperimentConfig", prompts: List[str]) -> pd.DataFrame: +def run_evaluation( + df: pd.DataFrame, config: "ExperimentConfig", prompts: Union[List[Prompt], List[str]] +) -> pd.DataFrame: """Run the evaluation phase of the experiment. Configures all LLMs (downstream, meta, and judge) to use @@ -119,8 +119,13 @@ def run_evaluation(df: pd.DataFrame, config: "ExperimentConfig", prompts: List[s task = get_task(df, config, judge_llm=llm) predictor = get_predictor(llm, config=config) logger.warning("πŸ“Š Starting evaluation...") + if isinstance(prompts[0], str): + str_prompts = cast(List[str], prompts) + prompts = [Prompt(p) for p in str_prompts] + else: + str_prompts = [p.construct_prompt() for p in cast(List[Prompt], prompts)] scores = task.evaluate(prompts, predictor, eval_strategy="full") - df = pd.DataFrame(dict(prompt=prompts, score=scores)) + df = pd.DataFrame(dict(prompt=str_prompts, score=scores)) df = df.sort_values("score", ascending=False, ignore_index=True) return df @@ -220,50 +225,27 @@ def get_optimizer( ValueError: If an unknown optimizer type is specified """ final_optimizer = optimizer or (config.optimizer if config else None) - final_task_description = task_description or (config.task_description if config else None) + if config is None: + config = ExperimentConfig() + if task_description is not None: + config.task_description = task_description if final_optimizer == "capo": - crossover_template = ( - CAPO_CROSSOVER_TEMPLATE.replace("", final_task_description) - if final_task_description - else CAPO_CROSSOVER_TEMPLATE - ) - mutation_template = ( - CAPO_MUTATION_TEMPLATE.replace("", final_task_description) - if final_task_description - else CAPO_MUTATION_TEMPLATE - ) - return CAPO( predictor=predictor, meta_llm=meta_llm, task=task, - crossover_template=crossover_template, - mutation_template=mutation_template, config=config, ) if final_optimizer == "evopromptde": - template = ( - EVOPROMPT_DE_TEMPLATE_TD.replace("", final_task_description) - if final_task_description - else EVOPROMPT_DE_TEMPLATE - ) - return EvoPromptDE(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) + return EvoPromptDE(predictor=predictor, meta_llm=meta_llm, task=task, config=config) if final_optimizer == "evopromptga": - template = ( - EVOPROMPT_GA_TEMPLATE_TD.replace("", final_task_description) - if final_task_description - else EVOPROMPT_GA_TEMPLATE - ) - return EvoPromptGA(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) + return EvoPromptGA(predictor=predictor, meta_llm=meta_llm, task=task, config=config) if final_optimizer == "opro": - template = ( - OPRO_TEMPLATE_TD.replace("", final_task_description) if final_task_description else OPRO_TEMPLATE - ) - return OPRO(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) + 
return OPRO(predictor=predictor, meta_llm=meta_llm, task=task, config=config) raise ValueError(f"Unknown optimizer: {final_optimizer}") @@ -296,23 +278,23 @@ def get_predictor(downstream_llm=None, type: "PredictorType" = "marker", *args, """Factory function to create and return a predictor instance. This function supports three types of predictors: - 1. FirstOccurrenceClassifier: A predictor that classifies based on first occurrence of the label. - 2. MarkerBasedClassifier: A predictor that classifies based on a marker. + 1. FirstOccurrencePredictor: A predictor that classifies based on first occurrence of the label. + 2. MarkerBasedPredictor: A predictor that classifies based on a marker. Args: downstream_llm: The language model to use for prediction. type (Literal["first_occurrence", "marker"]): The type of predictor to create: - - "first_occurrence" for FirstOccurrenceClassifier - - "marker" (default) for MarkerBasedClassifier + - "first_occurrence" for FirstOccurrencePredictor + - "marker" (default) for MarkerBasedPredictor *args: Variable length argument list passed to the predictor constructor. **kwargs: Arbitrary keyword arguments passed to the predictor constructor. Returns: - An instance of FirstOccurrenceClassifier or MarkerBasedClassifier. + An instance of FirstOccurrencePredictor or MarkerBasedPredictor. """ if type == "first_occurrence": - return FirstOccurrenceClassifier(downstream_llm, *args, **kwargs) + return FirstOccurrencePredictor(downstream_llm, *args, **kwargs) elif type == "marker": - return MarkerBasedClassifier(downstream_llm, *args, **kwargs) + return MarkerBasedPredictor(downstream_llm, *args, **kwargs) else: raise ValueError(f"Invalid predictor type: '{type}'") diff --git a/promptolution/llms/__init__.py b/promptolution/llms/__init__.py index 7fd7b97..8110f87 100644 --- a/promptolution/llms/__init__.py +++ b/promptolution/llms/__init__.py @@ -1,6 +1,11 @@ """Module for Large Language Models.""" - from promptolution.llms.api_llm import APILLM from promptolution.llms.local_llm import LocalLLM from promptolution.llms.vllm import VLLM + +__all__ = [ + "APILLM", + "LocalLLM", + "VLLM", +] diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index 093478e..c6971a6 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -1,141 +1,241 @@ """Module to interface with various language models through their respective APIs.""" -try: - import asyncio +import asyncio +import threading +from concurrent.futures import TimeoutError as FuturesTimeout - from openai import AsyncOpenAI - from openai.types.chat import ChatCompletion, ChatCompletionMessageParam +from openai import AsyncOpenAI +from openai.types.chat import ChatCompletion - import_successful = True -except ImportError: - import_successful = False - - -from typing import TYPE_CHECKING, Dict, List, Optional +from typing import Any, Dict, List, Optional from promptolution.llms.base_llm import BaseLLM - -if TYPE_CHECKING: # pragma: no cover - from promptolution.utils.config import ExperimentConfig - +from promptolution.utils.config import ExperimentConfig from promptolution.utils.logging import get_logger logger = get_logger(__name__) -async def _invoke_model( - prompt: str, - system_prompt: str, - max_tokens: int, - model_id: str, - client: AsyncOpenAI, - semaphore: asyncio.Semaphore, - max_retries: int = 20, - retry_delay: float = 5, -) -> ChatCompletion: - async with semaphore: - messages: List[ChatCompletionMessageParam] = [ - {"role": "system", "content": system_prompt}, 
- {"role": "user", "content": prompt}, - ] - - for attempt in range(max_retries + 1): # +1 for the initial attempt - try: - response = await client.chat.completions.create( - model=model_id, - messages=messages, - max_tokens=max_tokens, - ) - return response - except Exception as e: - if attempt < max_retries: - # Calculate exponential backoff with jitter - logger.warning( - f"⚠️ API call failed (attempt {attempt + 1} / {max_retries + 1}): {str(e)}. " - f"Retrying in {retry_delay:.2f} seconds..." - ) - await asyncio.sleep(retry_delay) - else: - # Log the final failure and re-raise the exception - logger.error(f"❌ API call failed after {max_retries + 1} attempts: {str(e)}") - raise # Re-raise the exception after all retries fail - raise RuntimeError("Failed to get response after multiple retries.") - - class APILLM(BaseLLM): - """A class to interface with language models through their respective APIs. - - This class provides a unified interface for making API calls to language models - using the OpenAI client library. It handles rate limiting through semaphores - and supports both synchronous and asynchronous operations. - - Attributes: - model_id (str): Identifier for the model to use. - client (AsyncOpenAI): The initialized API client. - max_tokens (int): Maximum number of tokens in model responses. - semaphore (asyncio.Semaphore): Semaphore to limit concurrent API calls. - """ + """Persistent asynchronous LLM wrapper using a background event loop.""" def __init__( self, api_url: Optional[str] = None, model_id: Optional[str] = None, api_key: Optional[str] = None, - max_concurrent_calls: int = 50, - max_tokens: int = 512, + max_concurrent_calls: int = 32, + max_tokens: int = 4096, + call_timeout_s: float = 200.0, # per request + gather_timeout_s: float = 500.0, # whole batch + max_retries: int = 5, + retry_base_delay_s: float = 1, + client_kwargs: Optional[Dict[str, Any]] = None, + call_kwargs: Optional[Dict[str, Any]] = None, config: Optional["ExperimentConfig"] = None, ) -> None: - """Initialize the APILLM with a specific model and API configuration. + """Initialize the APILLM. Args: - api_url (str): The base URL for the API endpoint. - model_id (str): Identifier for the model to use. - api_key (str, optional): API key for authentication. Defaults to None. - max_concurrent_calls (int, optional): Maximum number of concurrent API calls. Defaults to 50. - max_tokens (int, optional): Maximum number of tokens in model responses. Defaults to 512. - config (ExperimentConfig, optional): Configuration for the LLM, overriding defaults. - - Raises: - ImportError: If required libraries are not installed. + api_url (Optional[str]): Base URL for the API endpoint. + model_id (Optional[str]): Identifier of the model to call. Must be set. + api_key (Optional[str]): API key/token for authentication. + max_concurrent_calls (int): Maximum number of concurrent API calls. + max_tokens (int): Default maximum number of tokens in model responses. + call_timeout_s (float): Per-call timeout in seconds. + gather_timeout_s (float): Timeout in seconds for the entire batch. + max_retries (int): Number of retry attempts per prompt in addition to the initial call. + retry_base_delay_s (float): Base delay in seconds for exponential backoff between retries. + client_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments passed to `AsyncOpenAI(...)`. + call_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments passed to `client.chat.completions.create(...)`. 
+ config (Optional[ExperimentConfig]): Configuration for the LLM, overriding defaults. """ - if not import_successful: - raise ImportError( - "Could not import at least one of the required libraries: openai, asyncio. " - "Please ensure they are installed in your environment." - ) - self.api_url = api_url self.model_id = model_id self.api_key = api_key - self.max_concurrent_calls = max_concurrent_calls self.max_tokens = max_tokens + self.call_timeout_s = call_timeout_s + self.gather_timeout_s = gather_timeout_s + self.max_retries = max_retries + self.retry_base_delay_s = retry_base_delay_s + + # extra kwargs + self._client_kwargs: Dict[str, Any] = dict(client_kwargs or {}) + self._call_kwargs: Dict[str, Any] = dict(call_kwargs or {}) + self.max_concurrent_calls = max_concurrent_calls super().__init__(config=config) - self.client = AsyncOpenAI(base_url=self.api_url, api_key=self.api_key) - self.semaphore = asyncio.Semaphore(self.max_concurrent_calls) + + # --- persistent loop + semaphore --- + self._loop = asyncio.new_event_loop() + self._sem = asyncio.Semaphore(self.max_concurrent_calls) + + def _run_loop() -> None: + """Run the background event loop forever.""" + asyncio.set_event_loop(self._loop) + self._loop.run_forever() + + self._thread = threading.Thread(target=_run_loop, name="APILLMLoop", daemon=True) + self._thread.start() + + # Create client once; can still be customised via client_kwargs. + self.client = AsyncOpenAI( + base_url=self.api_url, + api_key=self.api_key, + timeout=self.call_timeout_s, + **self._client_kwargs, + ) + + # ---------- async bits that run inside the loop ---------- + async def _ainvoke_once(self, prompt: str, system_prompt: str) -> ChatCompletion: + """Perform a single API call with a per-call timeout. + + Args: + prompt (str): User prompt content. + system_prompt (str): System-level instructions for the model. + + Returns: + ChatCompletion: Raw completion response from the API. + + Raises: + asyncio.TimeoutError: If the call exceeds `call_timeout_s`. + Exception: Any exception raised by the underlying client call. + """ + messages = [ + {"role": "system", "content": str(system_prompt)}, + {"role": "user", "content": str(prompt)}, + ] + + # base kwargs; user can override via call_kwargs + kwargs: Dict[str, Any] = { + "model": self.model_id, + "messages": messages, + "max_tokens": self.max_tokens, + } + kwargs.update(self._call_kwargs) + + async with self._sem: + # per-call timeout enforces failure instead of hang + return await asyncio.wait_for( + self.client.chat.completions.create(**kwargs), + timeout=self.call_timeout_s, + ) + + async def _ainvoke_with_retries(self, prompt: str, system_prompt: str) -> str: + """Invoke the model with retries and exponential backoff. + + Args: + prompt (str): User prompt content. + system_prompt (str): System-level instructions for the model. + + Returns: + str: The message content of the first choice in the completion. + + Raises: + Exception: The last exception encountered after all retries are exhausted. 
+ """ + last_err: Optional[Exception] = None + for attempt in range(self.max_retries + 1): + try: + r = await self._ainvoke_once(prompt, system_prompt) + content = r.choices[0].message.content + if content is None: + raise RuntimeError("Empty content from model") + return content + except Exception as e: + last_err = e + if attempt < self.max_retries: + delay = self.retry_base_delay_s * (2**attempt) + logger.error( + f"LLM call failed ({attempt + 1}/{self.max_retries + 1}): β€” retrying in {delay}s", exc_info=e + ) + await asyncio.sleep(delay) + assert last_err is not None + raise last_err + + async def _aget_batch(self, prompts: List[str], system_prompts: List[str]) -> List[str]: + """Execute a batch of prompts concurrently and collect responses. + + Args: + prompts (List[str]): List of user prompts. + system_prompts (List[str]): List of system prompts; must match `prompts` in length. + + Returns: + List[str]: List of model outputs. For failed entries, an empty string is inserted. + + Raises: + TimeoutError: If the entire batch exceeds `gather_timeout_s`. + RuntimeError: If any of the tasks fails; the first exception is propagated. + """ + tasks = [asyncio.create_task(self._ainvoke_with_retries(p, s)) for p, s in zip(prompts, system_prompts)] + + try: + results = await asyncio.wait_for( + asyncio.gather(*tasks, return_exceptions=True), + timeout=self.gather_timeout_s, + ) + except asyncio.TimeoutError: + for t in tasks: + t.cancel() + raise TimeoutError(f"LLM batch timed out after {self.gather_timeout_s}s") + + outs: List[str] = [] + first_exc: Optional[BaseException] = None + for r in results: + if isinstance(r, BaseException): + if first_exc is None: + first_exc = r + outs.append("") + else: + outs.append(r) + + if first_exc: + for t in tasks: + if not t.done(): + t.cancel() + raise RuntimeError(f"LLM batch failed: {first_exc}") from first_exc + + return outs + + # ---------- sync API used by the threads ---------- + def _submit(self, coro): + """Submit a coroutine to the background event loop. + + Args: + coro: Coroutine object to be scheduled on the loop. + + Returns: + concurrent.futures.Future: Future representing the coroutine result. + """ + return asyncio.run_coroutine_threadsafe(coro, self._loop) def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[str]: - # Setup for async execution in sync context + """Synchronously obtain responses for a batch of prompts. + + This is the main entrypoint used by external callers. It handles system + prompt broadcasting and delegates the actual work to the async batch + execution on the background loop. + + Args: + prompts (List[str]): List of user prompts. + system_prompts (List[str]): List of system prompts. If a single system + prompt is provided and multiple prompts are given, the system + prompt is broadcast to all prompts. Otherwise, the list is + normalized to match the length of `prompts`. + + Returns: + List[str]: List of model responses corresponding to `prompts`. + + Raises: + TimeoutError: If waiting on the batch future exceeds `gather_timeout_s + 5.0`. + Exception: Any underlying error from the async batch execution. 
+ """ + fut = self._submit(self._aget_batch(prompts, system_prompts)) try: - loop = asyncio.get_running_loop() - except RuntimeError: # 'get_running_loop' raises a RuntimeError if there is no running loop - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - responses = loop.run_until_complete(self._get_response_async(prompts, system_prompts)) - return responses - - async def _get_response_async(self, prompts: List[str], system_prompts: List[str]) -> List[str]: - assert self.model_id is not None, "model_id must be set" - tasks = [ - _invoke_model(prompt, system_prompt, self.max_tokens, self.model_id, self.client, self.semaphore) - for prompt, system_prompt in zip(prompts, system_prompts) - ] - messages = await asyncio.gather(*tasks) - responses = [] - for message in messages: - response = message.choices[0].message.content - if response is None: - raise ValueError("Received None response from the API.") - responses.append(response) - return responses + r = fut.result(timeout=self.gather_timeout_s + 5.0) + return r + except FuturesTimeout: + fut.cancel() + raise TimeoutError(f"LLM batch (future) timed out after {self.gather_timeout_s + 5.0}s") + except Exception: + raise diff --git a/promptolution/llms/base_llm.py b/promptolution/llms/base_llm.py index 2fe43f9..2007a10 100644 --- a/promptolution/llms/base_llm.py +++ b/promptolution/llms/base_llm.py @@ -9,8 +9,8 @@ from promptolution.utils.config import ExperimentConfig from transformers import PreTrainedTokenizer -from promptolution.optimizers.templates import DEFAULT_SYS_PROMPT from promptolution.utils.logging import get_logger +from promptolution.utils.templates import DEFAULT_SYS_PROMPT logger = get_logger(__name__) @@ -42,7 +42,7 @@ def __init__(self, config: Optional["ExperimentConfig"] = None): # Initialize token counters self.input_token_count = 0 self.output_token_count = 0 - self.tokenizer: Optional[PreTrainedTokenizer] = None + self.tokenizer: Optional["PreTrainedTokenizer"] = None def get_token_count(self) -> Dict[str, int]: """Get the current count of input and output tokens. diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 1df5121..f22ff52 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -1,10 +1,10 @@ """Module for running language models locally using the vLLM library.""" - from typing import TYPE_CHECKING, Any, Dict, List, Optional if TYPE_CHECKING: # pragma: no cover from promptolution.utils.config import ExperimentConfig + from transformers import PreTrainedTokenizer from promptolution.llms.base_llm import BaseLLM @@ -14,7 +14,6 @@ try: from transformers import AutoTokenizer # type: ignore - from transformers import PreTrainedTokenizer from vllm import LLM, SamplingParams imports_successful = True @@ -38,7 +37,7 @@ class VLLM(BaseLLM): update_token_count: Update the token count based on the given inputs and outputs. 
""" - tokenizer: PreTrainedTokenizer + tokenizer: "PreTrainedTokenizer" def __init__( self, diff --git a/promptolution/optimizers/__init__.py b/promptolution/optimizers/__init__.py index 47f78a3..4b7a7db 100644 --- a/promptolution/optimizers/__init__.py +++ b/promptolution/optimizers/__init__.py @@ -1,23 +1,13 @@ """Module for prompt optimizers.""" - from promptolution.optimizers.capo import CAPO from promptolution.optimizers.evoprompt_de import EvoPromptDE from promptolution.optimizers.evoprompt_ga import EvoPromptGA from promptolution.optimizers.opro import OPRO -from promptolution.optimizers.templates import ( - CAPO_CROSSOVER_TEMPLATE, - CAPO_DOWNSTREAM_TEMPLATE, - CAPO_FEWSHOT_TEMPLATE, - CAPO_MUTATION_TEMPLATE, - DEFAULT_SYS_PROMPT, - EVOPROMPT_DE_TEMPLATE, - EVOPROMPT_DE_TEMPLATE_TD, - EVOPROMPT_GA_TEMPLATE, - EVOPROMPT_GA_TEMPLATE_TD, - OPRO_TEMPLATE, - OPRO_TEMPLATE_TD, - PROMPT_CREATION_TEMPLATE, - PROMPT_CREATION_TEMPLATE_TD, - PROMPT_VARIATION_TEMPLATE, -) + +__all__ = [ + "CAPO", + "EvoPromptDE", + "EvoPromptGA", + "OPRO", +] diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py index ded87e5..7264f6f 100644 --- a/promptolution/optimizers/base_optimizer.py +++ b/promptolution/optimizers/base_optimizer.py @@ -1,6 +1,5 @@ """Base module for optimizers in the promptolution library.""" - from abc import ABC, abstractmethod from typing import TYPE_CHECKING, List, Literal, Optional @@ -12,6 +11,7 @@ from promptolution.utils.callbacks import BaseCallback from promptolution.utils.logging import get_logger +from promptolution.utils.prompt import Prompt logger = get_logger(__name__) @@ -49,7 +49,7 @@ def __init__( config (ExperimentConfig, optional): Configuration for the optimizer, overriding defaults. """ # Set up optimizer state - self.prompts: List[str] = initial_prompts or [] + self.prompts: List[Prompt] = [Prompt(p) for p in initial_prompts] if initial_prompts else [] self.task = task self.callbacks: List["BaseCallback"] = callbacks or [] self.predictor = predictor @@ -60,7 +60,7 @@ def __init__( self.config = config - def optimize(self, n_steps: int) -> List[str]: + def optimize(self, n_steps: int) -> List[Prompt]: """Perform the optimization process. This method should be implemented by concrete optimizer classes to define @@ -82,8 +82,7 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts = self._step() except Exception as e: # exit training loop and gracefully fail - logger.error(f"β›” Error during optimization step: {e}") - logger.error("⚠️ Exiting optimization loop.") + logger.error("β›” Error during optimization step! ⚠️ Exiting optimization loop.", exc_info=e) break # Callbacks at the end of each step @@ -105,7 +104,7 @@ def _pre_optimization_loop(self) -> None: pass @abstractmethod - def _step(self) -> List[str]: + def _step(self) -> List[Prompt]: """Perform a single optimization step. 
This method should be implemented by concrete optimizer classes to define @@ -129,3 +128,15 @@ def _on_train_end(self) -> None: """Call all registered callbacks at the end of the entire optimization process.""" for callback in self.callbacks: callback.on_train_end(self) + + def _initialize_meta_template(self, template: str) -> str: + task_description = getattr(self.task, "task_description") + extraction_description = getattr(self.predictor, "extraction_description") + if self.config is not None and getattr(self.config, "task_description") is not None: + task_description = self.config.task_description + if task_description is None: + logger.warning("Task description is not provided. Please make sure to include relevant task details.") + task_description = "" + if extraction_description is not None: + task_description += "\n" + extraction_description + return template.replace("", task_description) diff --git a/promptolution/optimizers/capo.py b/promptolution/optimizers/capo.py index bcfa275..3c5955a 100644 --- a/promptolution/optimizers/capo.py +++ b/promptolution/optimizers/capo.py @@ -1,15 +1,12 @@ """Implementation of the CAPO (Cost-Aware Prompt Optimization) algorithm.""" import random -from itertools import compress import numpy as np import pandas as pd from typing import TYPE_CHECKING, Any, List, Optional, Tuple -from promptolution.utils.formatting import extract_from_tag - if TYPE_CHECKING: # pragma: no cover from promptolution.utils.callbacks import BaseCallback from promptolution.llms.base_llm import BaseLLM @@ -19,52 +16,16 @@ from promptolution.utils.test_statistics import TestStatistics from promptolution.optimizers.base_optimizer import BaseOptimizer -from promptolution.optimizers.templates import ( - CAPO_CROSSOVER_TEMPLATE, - CAPO_DOWNSTREAM_TEMPLATE, - CAPO_FEWSHOT_TEMPLATE, - CAPO_MUTATION_TEMPLATE, -) +from promptolution.utils.formatting import extract_from_tag from promptolution.utils.logging import get_logger +from promptolution.utils.prompt import Prompt, sort_prompts_by_scores +from promptolution.utils.templates import CAPO_CROSSOVER_TEMPLATE, CAPO_FEWSHOT_TEMPLATE, CAPO_MUTATION_TEMPLATE from promptolution.utils.test_statistics import get_test_statistic_func from promptolution.utils.token_counter import get_token_counter logger = get_logger(__name__) -class CAPOPrompt: - """Represents a prompt consisting of an instruction and few-shot examples.""" - - def __init__(self, instruction_text: str, few_shots: List[str]) -> None: - """Initializes the Prompt with an instruction and associated examples. - - Args: - instruction_text (str): The instruction or prompt text. - few_shots (List[str]): List of examples as string. - """ - self.instruction_text = instruction_text.strip() - self.few_shots = few_shots - - def construct_prompt(self) -> str: - """Constructs the full prompt string by replacing placeholders in the template with the instruction and formatted examples. - - Returns: - str: The constructed prompt string. - """ - few_shot_str = "\n\n".join(self.few_shots).strip() - prompt = ( - CAPO_DOWNSTREAM_TEMPLATE.replace("", self.instruction_text) - .replace("", few_shot_str) - .replace("\n\n\n\n", "\n\n") # replace extra newlines if no few shots are provided - .strip() - ) - return prompt - - def __str__(self) -> str: - """Returns the string representation of the prompt.""" - return self.construct_prompt() - - class CAPO(BaseOptimizer): """CAPO: Cost-Aware Prompt Optimization. 
@@ -80,6 +41,8 @@ def __init__( task: "BaseTask", meta_llm: "BaseLLM", initial_prompts: Optional[List[str]] = None, + crossover_template: Optional[str] = None, + mutation_template: Optional[str] = None, crossovers_per_iter: int = 4, upper_shots: int = 5, max_n_blocks_eval: int = 10, @@ -89,8 +52,6 @@ def __init__( check_fs_accuracy: bool = True, create_fs_reasoning: bool = True, df_few_shots: Optional[pd.DataFrame] = None, - crossover_template: Optional[str] = None, - mutation_template: Optional[str] = None, callbacks: Optional[List["BaseCallback"]] = None, config: Optional["ExperimentConfig"] = None, ) -> None: @@ -101,6 +62,8 @@ def __init__( task (BaseTask): The task instance containing dataset and description. meta_llm (BaseLLM): The meta language model for crossover/mutation. initial_prompts (List[str]): Initial prompt instructions. + crossover_template (str, optional): Template for crossover instructions. + mutation_template (str, optional): Template for mutation instructions. crossovers_per_iter (int): Number of crossover operations per iteration. upper_shots (int): Maximum number of few-shot examples per prompt. p_few_shot_reasoning (float): Probability of generating llm-reasoning for few-shot examples, instead of simply using input-output pairs. @@ -113,17 +76,12 @@ def __init__( create_fs_reasoning (bool): Whether to create reasoning for few-shot examples using the downstream model, instead of simply using input-output pairs from the few shots DataFrame. Default is True. df_few_shots (pd.DataFrame): DataFrame containing few-shot examples. If None, will pop 10% of datapoints from task. - crossover_template (str, optional): Template for crossover instructions. - mutation_template (str, optional): Template for mutation instructions. callbacks (List[Callable], optional): Callbacks for optimizer events. config (ExperimentConfig, optional): Configuration for the optimizer. """ self.meta_llm = meta_llm self.downstream_llm = predictor.llm - self.crossover_template = crossover_template or CAPO_CROSSOVER_TEMPLATE - self.mutation_template = mutation_template or CAPO_MUTATION_TEMPLATE - self.crossovers_per_iter = crossovers_per_iter self.upper_shots = upper_shots self.max_n_blocks_eval = max_n_blocks_eval @@ -136,8 +94,11 @@ def __init__( self.check_fs_accuracy = check_fs_accuracy self.create_fs_reasoning = create_fs_reasoning - self.scores: List[float] = [] super().__init__(predictor, task, initial_prompts, callbacks, config) + + self.crossover_template = self._initialize_meta_template(crossover_template or CAPO_CROSSOVER_TEMPLATE) + self.mutation_template = self._initialize_meta_template(mutation_template or CAPO_MUTATION_TEMPLATE) + self.df_few_shots = df_few_shots if df_few_shots is not None else task.pop_datapoints(frac=0.1) if self.max_n_blocks_eval > self.task.n_blocks: logger.warning( @@ -145,6 +106,11 @@ def __init__( f" Setting max_n_blocks_eval to {self.task.n_blocks}." ) self.max_n_blocks_eval = self.task.n_blocks + if "block" not in self.task.eval_strategy: + logger.warning( + f"ℹ️ CAPO requires 'block' in the eval_strategy, but got {self.task.eval_strategy}. Setting eval_strategy to 'sequential_block'." 
+ ) + self.task.eval_strategy = "sequential_block" self.population_size = len(self.prompts) if hasattr(self.predictor, "begin_marker") and hasattr(self.predictor, "end_marker"): @@ -154,7 +120,7 @@ def __init__( self.target_begin_marker = "" self.target_end_marker = "" - def _initialize_population(self, initial_prompts: List[str]) -> List[CAPOPrompt]: + def _initialize_population(self, initial_prompts: List[Prompt]) -> List[Prompt]: """Initializes the population of Prompt objects from initial instructions. Args: @@ -164,10 +130,10 @@ def _initialize_population(self, initial_prompts: List[str]) -> List[CAPOPrompt] List[Prompt]: Initialized population of prompts with few-shot examples. """ population = [] - for instruction_text in initial_prompts: + for prompt in initial_prompts: num_examples = random.randint(0, self.upper_shots) - few_shots = self._create_few_shot_examples(instruction_text, num_examples) - population.append(CAPOPrompt(instruction_text, few_shots)) + few_shots = self._create_few_shot_examples(prompt.instruction, num_examples) + population.append(Prompt(prompt.instruction, few_shots)) return population @@ -202,18 +168,18 @@ def _create_few_shot_examples(self, instruction: str, num_examples: int) -> List # Check which predictions are correct and get a single one per example for j in range(num_examples): # Process and clean up the generated sequences - seqs[j] = seqs[j].replace(sample_inputs[j], "").strip() + seqs[j] = seqs[j].replace(sample_inputs[j], "", 1).strip() # Check if the prediction is correct and add reasoning if so if preds[j] == sample_targets[j] or not self.check_fs_accuracy: few_shots[j] = CAPO_FEWSHOT_TEMPLATE.replace("", sample_inputs[j]).replace("", seqs[j]) return few_shots - def _crossover(self, parents: List[CAPOPrompt]) -> List[CAPOPrompt]: + def _crossover(self, parents: List[Prompt]) -> List[Prompt]: """Performs crossover among parent prompts to generate offsprings. Args: - parents (List[CAPOPrompt]): List of parent prompts. + parents (List[Prompt]): List of parent prompts. Returns: List[Prompt]: List of new offsprings after crossover. @@ -223,8 +189,8 @@ def _crossover(self, parents: List[CAPOPrompt]) -> List[CAPOPrompt]: for _ in range(self.crossovers_per_iter): mother, father = random.sample(parents, 2) crossover_prompt = ( - self.crossover_template.replace("", mother.instruction_text) - .replace("", father.instruction_text) + self.crossover_template.replace("", mother.instruction) + .replace("", father.instruction) .strip() ) # collect all crossover prompts then pass them bundled to the meta llm (speedup) @@ -239,22 +205,22 @@ def _crossover(self, parents: List[CAPOPrompt]) -> List[CAPOPrompt]: offsprings = [] for instruction, examples in zip(child_instructions, offspring_few_shots): instruction = extract_from_tag(instruction, "", "") - offsprings.append(CAPOPrompt(instruction, examples)) + offsprings.append(Prompt(instruction, examples)) return offsprings - def _mutate(self, offsprings: List[CAPOPrompt]) -> List[CAPOPrompt]: + def _mutate(self, offsprings: List[Prompt]) -> List[Prompt]: """Apply mutation to offsprings to generate new candidate prompts. Args: - offsprings (List[CAPOPrompt]): List of offsprings to mutate. + offsprings (List[Prompt]): List of offsprings to mutate. Returns: List[Prompt]: List of mutated prompts. 
""" # collect all mutation prompts then pass them bundled to the meta llm (speedup) mutation_prompts = [ - self.mutation_template.replace("", prompt.instruction_text) for prompt in offsprings + self.mutation_template.replace("", prompt.instruction) for prompt in offsprings ] new_instructions = self.meta_llm.get_response(mutation_prompts) @@ -273,15 +239,15 @@ def _mutate(self, offsprings: List[CAPOPrompt]) -> List[CAPOPrompt]: new_few_shots = prompt.few_shots random.shuffle(new_few_shots) - mutated.append(CAPOPrompt(new_instruction, new_few_shots)) + mutated.append(Prompt(new_instruction, new_few_shots)) return mutated - def _do_racing(self, candidates: List[CAPOPrompt], k: int) -> List[CAPOPrompt]: + def _do_racing(self, candidates: List[Prompt], k: int) -> Tuple[List[Prompt], List[float]]: """Perform the racing (selection) phase by comparing candidates based on their evaluation scores using the provided test statistic. Args: - candidates (List[CAPOPrompt]): List of candidate prompts. + candidates (List[Prompt]): List of candidate prompts. k (int): Number of survivors to retain. Returns: @@ -292,9 +258,7 @@ def _do_racing(self, candidates: List[CAPOPrompt], k: int) -> List[CAPOPrompt]: i = 0 while len(candidates) > k and i < self.max_n_blocks_eval: # new_scores shape: (n_candidates, n_samples) - new_scores: List[float] = self.task.evaluate( - [c.construct_prompt() for c in candidates], self.predictor, return_agg_scores=False - ) + new_scores: List[float] = self.task.evaluate(candidates, self.predictor, return_agg_scores=False) # subtract length penalty prompt_lengths = np.array([self.token_counter(c.construct_prompt()) for c in candidates]) @@ -315,40 +279,57 @@ def _do_racing(self, candidates: List[CAPOPrompt], k: int) -> List[CAPOPrompt]: # Sum along rows to get number of better scores for each candidate n_better = np.sum(comparison_matrix, axis=1) - # Create mask for survivors and filter candidates - survivor_mask = n_better < k - candidates = list(compress(candidates, survivor_mask)) - block_scores = list(compress(block_scores, survivor_mask)) + candidates, block_scores = filter_survivors(candidates, block_scores, mask=n_better < k) i += 1 self.task.increment_block_idx() - avg_scores = self.task.evaluate( - [c.construct_prompt() for c in candidates], self.predictor, eval_strategy="evaluated" - ) - order = np.argsort(-np.array(avg_scores))[:k] - candidates = [candidates[i] for i in order] - self.scores = [avg_scores[i] for i in order] + avg_scores = self.task.evaluate(candidates, self.predictor, eval_strategy="evaluated") + prompts, avg_scores = sort_prompts_by_scores(candidates, avg_scores, top_k=k) - return candidates + return prompts, avg_scores def _pre_optimization_loop(self) -> None: - self.prompt_objects = self._initialize_population(self.prompts) - self.prompts = [p.construct_prompt() for p in self.prompt_objects] - self.max_prompt_length = max(self.token_counter(p) for p in self.prompts) if self.prompts else 1 + self.prompts = self._initialize_population(self.prompts) + self.max_prompt_length = ( + max(self.token_counter(p.construct_prompt()) for p in self.prompts) if self.prompts else 1 + ) self.task.reset_block_idx() - def _step(self) -> List[str]: + def _step(self) -> List[Prompt]: """Perform a single optimization step. Returns: - List[str]: The optimized list of prompts after the step. + List[Prompt]: The optimized list of prompts after the step. 
""" - offsprings = self._crossover(self.prompt_objects) + offsprings = self._crossover(self.prompts) mutated = self._mutate(offsprings) - combined = self.prompt_objects + mutated + combined = self.prompts + mutated - self.prompt_objects = self._do_racing(combined, self.population_size) - self.prompts = [p.construct_prompt() for p in self.prompt_objects] + self.prompts, self.scores = self._do_racing(combined, self.population_size) return self.prompts + + +def filter_survivors( + candidates: List[Prompt], scores: List[List[float]], mask: Any +) -> Tuple[List[Prompt], List[List[float]]]: + """Filter candidates and scores based on a boolean mask. + + Args: + candidates (List[Prompt]): List of candidate prompts. + scores (List[List[float]]): Corresponding scores for the candidates. + mask (Any): Boolean mask indicating which candidates to keep. + + Returns: + Tuple[List[Prompt], List[List[float]]]: Filtered candidates and their scores. + """ + assert len(candidates) == len(mask), "Length of candidates, and mask must be the same." + assert all( + len(candidates) == len(score) for score in scores + ), "Each score list must have the same length as candidates." + + filtered_candidates = [c for c, m in zip(candidates, mask) if m] + filtered_scores = [[s for s, m in zip(score, mask) if m] for score in scores] + + return filtered_candidates, filtered_scores diff --git a/promptolution/optimizers/evoprompt_de.py b/promptolution/optimizers/evoprompt_de.py index 0412d5d..f6e701a 100644 --- a/promptolution/optimizers/evoprompt_de.py +++ b/promptolution/optimizers/evoprompt_de.py @@ -1,12 +1,14 @@ """Module for EvoPromptDE optimizer.""" -import numpy as np +import random -from typing import TYPE_CHECKING, Any, List, Optional +from typing import TYPE_CHECKING, List, Optional from promptolution.optimizers.base_optimizer import BaseOptimizer from promptolution.utils.formatting import extract_from_tag +from promptolution.utils.prompt import Prompt, sort_prompts_by_scores +from promptolution.utils.templates import EVOPROMPT_DE_TEMPLATE_TD if TYPE_CHECKING: # pragma: no cover from promptolution.llms.base_llm import BaseLLM @@ -42,27 +44,26 @@ def __init__( self, predictor: "BasePredictor", task: "BaseTask", - prompt_template: str, meta_llm: "BaseLLM", initial_prompts: Optional[List[str]] = None, + prompt_template: Optional[str] = None, donor_random: bool = False, callbacks: Optional[List["BaseCallback"]] = None, config: Optional["ExperimentConfig"] = None, ) -> None: """Initialize the EvoPromptDE optimizer.""" - self.prompt_template = prompt_template self.donor_random = donor_random self.meta_llm = meta_llm super().__init__( predictor=predictor, task=task, initial_prompts=initial_prompts, callbacks=callbacks, config=config ) + self.prompt_template = self._initialize_meta_template(prompt_template or EVOPROMPT_DE_TEMPLATE_TD) def _pre_optimization_loop(self) -> None: self.scores = self.task.evaluate(self.prompts, self.predictor, return_agg_scores=True) - self.prompts = [prompt for _, prompt in sorted(zip(self.scores, self.prompts), reverse=True)] - self.scores = sorted(self.scores, reverse=True) + self.prompts, self.scores = sort_prompts_by_scores(self.prompts, self.scores) - def _step(self) -> List[str]: + def _step(self) -> List[Prompt]: """Perform the optimization process for a specified number of steps. This method iteratively improves the prompts using a differential evolution strategy. @@ -71,7 +72,7 @@ def _step(self) -> List[str]: Returns: - List[str]: The optimized list of prompts after all steps. 
+ List[Prompt]: The optimized list of prompts after all steps. """ cur_best = self.prompts[0] meta_prompts = [] @@ -80,22 +81,23 @@ def _step(self) -> List[str]: old_prompt = self.prompts[i] candidates = [prompt for prompt in self.prompts if prompt != old_prompt] - a, b, c = np.random.choice(candidates, size=3, replace=False) + a, b, c = random.sample(candidates, k=3) if not self.donor_random: c = cur_best meta_prompt = ( - self.prompt_template.replace("", old_prompt) - .replace("", a) - .replace("", b) - .replace("", c) + self.prompt_template.replace("", old_prompt.construct_prompt()) + .replace("", a.construct_prompt()) + .replace("", b.construct_prompt()) + .replace("", c.construct_prompt()) ) meta_prompts.append(meta_prompt) - child_prompts = self.meta_llm.get_response(meta_prompts) - child_prompts = extract_from_tag(child_prompts, "", "") + child_instructions = self.meta_llm.get_response(meta_prompts) + child_instructions = extract_from_tag(child_instructions, "", "") + child_prompts = [Prompt(p) for p in child_instructions] child_scores = self.task.evaluate(child_prompts, self.predictor, return_agg_scores=True) @@ -104,7 +106,6 @@ def _step(self) -> List[str]: self.prompts[i] = child_prompts[i] self.scores[i] = child_scores[i] - self.prompts = [prompt for _, prompt in sorted(zip(self.scores, self.prompts), reverse=True)] - self.scores = sorted(self.scores, reverse=True) + self.prompts, self.scores = sort_prompts_by_scores(self.prompts, self.scores) return self.prompts diff --git a/promptolution/optimizers/evoprompt_ga.py b/promptolution/optimizers/evoprompt_ga.py index 91cc6a7..9a0b4e3 100644 --- a/promptolution/optimizers/evoprompt_ga.py +++ b/promptolution/optimizers/evoprompt_ga.py @@ -3,9 +3,11 @@ import numpy as np -from typing import TYPE_CHECKING, Any, List, Optional +from typing import TYPE_CHECKING, List, Optional from promptolution.optimizers.base_optimizer import BaseOptimizer +from promptolution.utils.prompt import Prompt, sort_prompts_by_scores +from promptolution.utils.templates import EVOPROMPT_GA_TEMPLATE_TD if TYPE_CHECKING: # pragma: no cover from promptolution.llms.base_llm import BaseLLM @@ -48,43 +50,39 @@ def __init__( self, predictor: "BasePredictor", task: "BaseTask", - prompt_template: str, meta_llm: "BaseLLM", initial_prompts: Optional[List[str]] = None, + prompt_template: Optional[str] = None, selection_mode: str = "wheel", callbacks: Optional[List["BaseCallback"]] = None, config: Optional["ExperimentConfig"] = None, ) -> None: """Initialize the EvoPromptGA optimizer.""" - self.prompt_template = prompt_template self.meta_llm = meta_llm self.selection_mode = selection_mode super().__init__( predictor=predictor, initial_prompts=initial_prompts, task=task, callbacks=callbacks, config=config ) + self.prompt_template = self._initialize_meta_template(prompt_template or EVOPROMPT_GA_TEMPLATE_TD) + assert self.selection_mode in ["random", "wheel", "tour"], "Invalid selection mode." 
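Both EvoPrompt variants above now rank and truncate their populations through `sort_prompts_by_scores` (added later in this diff under `promptolution/utils/prompt.py`). A minimal usage sketch of that helper, inferred from the implementation shown below and not part of the patch itself; the prompts and scores are invented for illustration:

```
from promptolution.utils.prompt import Prompt, sort_prompts_by_scores

# Hypothetical population and aggregated scores, purely for illustration.
population = [Prompt("Classify the sentiment:"), Prompt("Label the text:"), Prompt("Tag the review:")]
scores = [0.61, 0.84, 0.47]

# Same call pattern as EvoPromptGA._step: sort descending by score and keep the best top_k.
survivors, survivor_scores = sort_prompts_by_scores(population, scores, top_k=2)

assert [p.instruction for p in survivors] == ["Label the text:", "Classify the sentiment:"]
assert survivor_scores == [0.84, 0.61]
```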
def _pre_optimization_loop(self) -> None: self.scores = self.task.evaluate(self.prompts, self.predictor, return_agg_scores=True) - # sort prompts by score - self.prompts = [prompt for _, prompt in sorted(zip(self.scores, self.prompts), reverse=True)] - self.scores = sorted(self.scores, reverse=True) + self.prompts, self.scores = sort_prompts_by_scores(self.prompts, self.scores) - def _step(self) -> List[str]: + def _step(self) -> List[Prompt]: new_prompts = self._crossover(self.prompts, self.scores) - prompts = self.prompts + new_prompts - new_scores = self.task.evaluate(new_prompts, self.predictor, return_agg_scores=True) + prompts = self.prompts + new_prompts scores = self.scores + new_scores - # sort scores and prompts - self.prompts = [prompt for _, prompt in sorted(zip(scores, prompts), reverse=True)][: len(self.prompts)] - self.scores = sorted(scores, reverse=True)[: len(self.prompts)] + self.prompts, self.scores = sort_prompts_by_scores(prompts, scores, top_k=len(self.prompts)) return self.prompts - def _crossover(self, prompts: List[str], scores: List[float]) -> List[str]: + def _crossover(self, prompts: List[Prompt], scores: List[float]) -> List[Prompt]: """Perform crossover operation to generate new child prompts. This method selects parent prompts based on the chosen selection mode, @@ -123,10 +121,12 @@ def _crossover(self, prompts: List[str], scores: List[float]) -> List[str]: parent_1 = group_1[np.argmax([self.scores[self.prompts.index(p)] for p in group_1])] parent_2 = group_2[np.argmax([self.scores[self.prompts.index(p)] for p in group_2])] + parent_1, parent_2 = parent_1.construct_prompt(), parent_2.construct_prompt() meta_prompt = self.prompt_template.replace("", parent_1).replace("", parent_2) meta_prompts.append(meta_prompt) - child_prompts = self.meta_llm.get_response(meta_prompts) - child_prompts = extract_from_tag(child_prompts, "", "") + child_instructions = self.meta_llm.get_response(meta_prompts) + child_instructions = extract_from_tag(child_instructions, "", "") + child_prompts = [Prompt(p) for p in child_instructions] return child_prompts diff --git a/promptolution/optimizers/opro.py b/promptolution/optimizers/opro.py index 864da31..e7b9048 100644 --- a/promptolution/optimizers/opro.py +++ b/promptolution/optimizers/opro.py @@ -3,11 +3,12 @@ import numpy as np -from typing import TYPE_CHECKING, Any, List, Optional +from typing import TYPE_CHECKING, List, Optional from promptolution.optimizers.base_optimizer import BaseOptimizer -from promptolution.optimizers.templates import OPRO_TEMPLATE from promptolution.utils.formatting import extract_from_tag +from promptolution.utils.prompt import Prompt, sort_prompts_by_scores +from promptolution.utils.templates import OPRO_TEMPLATE if TYPE_CHECKING: # pragma: no cover from promptolution.llms.base_llm import BaseLLM @@ -55,13 +56,13 @@ def __init__( config: "ExperimentConfig" overwriting default parameters """ self.meta_llm = meta_llm - self.meta_prompt_template = prompt_template or OPRO_TEMPLATE self.max_num_instructions = max_num_instructions self.num_instructions_per_step = num_instructions_per_step self.num_few_shots = num_few_shots super().__init__( predictor=predictor, task=task, initial_prompts=initial_prompts, callbacks=callbacks, config=config ) + self.meta_prompt_template = self._initialize_meta_template(prompt_template or OPRO_TEMPLATE) def _sample_examples(self) -> str: """Sample few-shot examples from the dataset. 
@@ -87,7 +88,7 @@ def _format_instructions(self) -> str: return "".join([f"text:\n{prompt}\nscore: {int(100 * round(score, 2))}\n\n" for prompt, score in sorted_pairs]) - def _add_prompt_and_score(self, prompt: str, score: float) -> None: + def _add_prompt_and_score(self, prompt: Prompt, score: float) -> None: """Add a prompt and its score to the lists, maintaining max length. Args: @@ -101,17 +102,15 @@ def _add_prompt_and_score(self, prompt: str, score: float) -> None: self.scores.append(score) # Keep only the top-performing prompts if we exceed the maximum number of instructions - keep_indices = np.argsort(self.scores)[-self.max_num_instructions :] - self.prompts = [self.prompts[i] for i in keep_indices] - self.scores = [self.scores[i] for i in keep_indices] + self.prompts, self.scores = sort_prompts_by_scores(self.prompts, self.scores, top_k=self.max_num_instructions) def _pre_optimization_loop(self): - self.scores = list(self.task.evaluate(self.prompts, self.predictor)) + self.scores = self.task.evaluate(self.prompts, self.predictor) self.meta_prompt = self.meta_prompt_template.replace("", self._format_instructions()).replace( "", self._sample_examples() ) - def _step(self) -> List[str]: + def _step(self) -> List[Prompt]: duplicate_prompts = 0 for _ in range(self.num_instructions_per_step): generation_seed = np.random.randint(0, int(1e9)) @@ -119,7 +118,8 @@ def _step(self) -> List[str]: response = self.meta_llm.get_response([self.meta_prompt])[0] - prompt = extract_from_tag(response, "", "") + instruction = extract_from_tag(response, "", "") + prompt = Prompt(instruction) if prompt in self.prompts: duplicate_prompts += 1 diff --git a/promptolution/predictors/__init__.py b/promptolution/predictors/__init__.py index ddc9595..8751335 100644 --- a/promptolution/predictors/__init__.py +++ b/promptolution/predictors/__init__.py @@ -1,4 +1,9 @@ """Module for LLM predictors.""" +from promptolution.predictors.first_occurrence_predictor import FirstOccurrencePredictor +from promptolution.predictors.maker_based_predictor import MarkerBasedPredictor -from promptolution.predictors.classifier import FirstOccurrenceClassifier, MarkerBasedClassifier +__all__ = [ + "FirstOccurrencePredictor", + "MarkerBasedPredictor", +] diff --git a/promptolution/predictors/base_predictor.py b/promptolution/predictors/base_predictor.py index a345872..292d56d 100644 --- a/promptolution/predictors/base_predictor.py +++ b/promptolution/predictors/base_predictor.py @@ -10,6 +10,7 @@ if TYPE_CHECKING: # pragma: no cover from promptolution.utils.config import ExperimentConfig + PredictorType = Literal["first_occurrence", "marker"] diff --git a/promptolution/predictors/first_occurrence_predictor.py b/promptolution/predictors/first_occurrence_predictor.py new file mode 100644 index 0000000..7e84f0c --- /dev/null +++ b/promptolution/predictors/first_occurrence_predictor.py @@ -0,0 +1,65 @@ +"""Module for the FirstOccurrencePredictor.""" + +from typing import TYPE_CHECKING, List, Optional + +from promptolution.predictors.base_predictor import BasePredictor + +if TYPE_CHECKING: # pragma: no cover + from promptolution.llms.base_llm import BaseLLM + from promptolution.utils.config import ExperimentConfig + + +class FirstOccurrencePredictor(BasePredictor): + """A predictor class for classification tasks using language models. + + This class takes a language model and a list of classes, and provides a method + to predict classes for given prompts and input data. 
The class labels are extracted + by matching the words in the prediction with the list of valid class labels. + The first occurrence of a valid class label in the prediction is used as the predicted class. + If no valid class label is found, the first class label in the list is used as the default prediction. + + Attributes: + llm: The language model used for generating predictions. + classes (List[str]): The list of valid class labels. + config (ExperimentConfig, optional): Configuration for the classifier, overriding defaults. + + Inherits from: + BasePredictor: The base class for predictors in the promptolution library. + """ + + def __init__(self, llm: "BaseLLM", classes: List[str], config: Optional["ExperimentConfig"] = None) -> None: + """Initialize the FirstOccurrencePredictor. + + Args: + llm: The language model to use for predictions. + classes (List[str]): The list of valid class labels. + config (ExperimentConfig, optional): Configuration for the classifier, overriding defaults. + """ + assert all([c.islower() for c in classes]), "Class labels should be lowercase." + self.classes = classes + + self.extraction_description = ( + f"The task is to classify the texts into one of those classes: {', '.join(classes)}." + "The first occurrence of a valid class label in the prediction is used as the predicted class." + ) + + super().__init__(llm, config) + + def _extract_preds(self, preds: List[str]) -> List[str]: + """Extract class labels from the predictions, based on the list of valid class labels. + + Args: + preds: The raw predictions from the language model. + """ + result = [] + for pred in preds: + predicted_class = self.classes[0] # use first class as default pred + for word in pred.split(): + word = "".join([c for c in word if c.isalnum()]).lower() + if word in self.classes: + predicted_class = word + break + + result.append(predicted_class) + + return result diff --git a/promptolution/predictors/classifier.py b/promptolution/predictors/maker_based_predictor.py similarity index 50% rename from promptolution/predictors/classifier.py rename to promptolution/predictors/maker_based_predictor.py index 2a4fa00..bf5dbcb 100644 --- a/promptolution/predictors/classifier.py +++ b/promptolution/predictors/maker_based_predictor.py @@ -1,9 +1,6 @@ -"""Module for classification predictors.""" +"""Module for the MarkerBasedPredictor.""" - -import numpy as np - -from typing import TYPE_CHECKING, Any, List, Optional +from typing import TYPE_CHECKING, List, Optional from promptolution.predictors.base_predictor import BasePredictor from promptolution.utils.formatting import extract_from_tag @@ -13,64 +10,8 @@ from promptolution.utils.config import ExperimentConfig -class FirstOccurrenceClassifier(BasePredictor): - """A predictor class for classification tasks using language models. - - This class takes a language model and a list of classes, and provides a method - to predict classes for given prompts and input data. The class labels are extracted - by matching the words in the prediction with the list of valid class labels. - The first occurrence of a valid class label in the prediction is used as the predicted class. - If no valid class label is found, the first class label in the list is used as the default prediction. - - Attributes: - llm: The language model used for generating predictions. - classes (List[str]): The list of valid class labels. - config (ExperimentConfig, optional): Configuration for the classifier, overriding defaults. 
-
-    Inherits from:
-        BasePredictor: The base class for predictors in the promptolution library.
-    """
-
-    def __init__(self, llm: "BaseLLM", classes: List[str], config: Optional["ExperimentConfig"] = None) -> None:
-        """Initialize the FirstOccurrenceClassifier.
-
-        Args:
-            llm: The language model to use for predictions.
-            classes (List[str]): The list of valid class labels.
-            config (ExperimentConfig, optional): Configuration for the classifier, overriding defaults.
-        """
-        assert all([c.islower() for c in classes]), "Class labels should be lowercase."
-        self.classes = classes
-
-        self.extraction_description = (
-            f"The task is to classify the texts into one of those classes: {', '.join(classes)}."
-            "The first occurrence of a valid class label in the prediction is used as the predicted class."
-        )
-
-        super().__init__(llm, config)
-
-    def _extract_preds(self, preds: List[str]) -> List[str]:
-        """Extract class labels from the predictions, based on the list of valid class labels.
-
-        Args:
-            preds: The raw predictions from the language model.
-        """
-        result = []
-        for pred in preds:
-            predicted_class = self.classes[0]  # use first class as default pred
-            for word in pred.split():
-                word = "".join([c for c in word if c.isalnum()]).lower()
-                if word in self.classes:
-                    predicted_class = word
-                    break
-
-            result.append(predicted_class)
-
-        return result
-
-
-class MarkerBasedClassifier(BasePredictor):
-    """A predictor class for classification tasks using language models.
+class MarkerBasedPredictor(BasePredictor):
+    """A predictor class for tasks using language models.
 
     This class takes a language model and a list of classes, and provides a method
     to predict classes for given prompts and input data. The class labels are extracted.
@@ -92,7 +33,7 @@ def __init__(
         end_marker: str = "",
         config: Optional["ExperimentConfig"] = None,
     ) -> None:
-        """Initialize the MarkerBasedClassifier.
+        """Initialize the MarkerBasedPredictor.
 
         Args:
             llm: The language model to use for predictions.
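The label-extraction rule applied by `FirstOccurrencePredictor` is small enough to sketch standalone. The snippet below mirrors the logic of `_extract_preds` shown above; it is an illustrative sketch, not library code, and the class labels and raw outputs are invented:

```
from typing import List


def first_occurrence_label(raw_output: str, classes: List[str]) -> str:
    """Return the first valid class label mentioned in a raw LLM output.

    Falls back to classes[0] when no label is found, matching the behaviour
    of FirstOccurrencePredictor._extract_preds in the diff above.
    """
    for word in raw_output.split():
        cleaned = "".join(c for c in word if c.isalnum()).lower()
        if cleaned in classes:
            return cleaned
    return classes[0]


labels = ["positive", "neutral", "negative"]
assert first_occurrence_label("The review is clearly Positive!", labels) == "positive"
assert first_occurrence_label("Hard to say.", labels) == "positive"  # no match: falls back to the first class
```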
diff --git a/promptolution/tasks/__init__.py b/promptolution/tasks/__init__.py index 7222256..825dbad 100644 --- a/promptolution/tasks/__init__.py +++ b/promptolution/tasks/__init__.py @@ -1,3 +1,11 @@ """Module for task-related functions and classes.""" from promptolution.tasks.classification_tasks import ClassificationTask +from promptolution.tasks.judge_tasks import JudgeTask +from promptolution.tasks.reward_tasks import RewardTask + +__all__ = [ + "ClassificationTask", + "JudgeTask", + "RewardTask", +] diff --git a/promptolution/tasks/base_task.py b/promptolution/tasks/base_task.py index 2e7fa89..2f1c164 100644 --- a/promptolution/tasks/base_task.py +++ b/promptolution/tasks/base_task.py @@ -6,7 +6,9 @@ import numpy as np import pandas as pd -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union, overload +from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union, overload + +from promptolution.utils.prompt import Prompt if TYPE_CHECKING: # pragma: no cover from promptolution.predictors.base_predictor import BasePredictor @@ -103,7 +105,7 @@ def subsample(self, eval_strategy: "EvalStrategy" = None) -> Tuple[List[str], Li def _prepare_batch( self, - prompts: List[str], + prompts: List[Prompt], xs: List[str], ys: List[str], eval_strategy: Literal["full", "subsample", "sequential_block", "random_block", "evaluated"] = "full", @@ -117,14 +119,14 @@ def _prepare_batch( keys_to_predict = [] for prompt in prompts: for x, y in zip(xs, ys): - cache_key = (prompt, x, str(y)) + cache_key = (prompt.construct_prompt(), x, str(y)) if cache_key not in self.eval_cache: keys_to_predict.append(cache_key) return keys_to_predict def _collect_results_from_cache( self, - prompts: List[str], + prompts: List[Prompt], xs: List[str], ys: List[str], return_agg_scores: bool, @@ -140,8 +142,11 @@ def _collect_results_from_cache( datapoint_scores = [] datapoint_seqs = [] for x, y in zip(xs, ys): - cache_key = (prompt, x, y) - datapoint_scores.append(self.eval_cache[cache_key]) + cache_key = (prompt.construct_prompt(), x, y) + datapoint_score = self.eval_cache.get(cache_key) + if datapoint_score is None: + continue + datapoint_scores.append(datapoint_score) if return_seq: datapoint_seqs.append(self.seq_cache.get(cache_key, "")) scores.append(datapoint_scores) @@ -165,7 +170,7 @@ def _evaluate(self, xs: List[str], ys: List[str], preds: List[str]) -> List[floa @overload def evaluate( self, - prompts: List[str], + prompts: List[Prompt], predictor: "BasePredictor", system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[True] = True, @@ -177,7 +182,7 @@ def evaluate( @overload def evaluate( self, - prompts: List[str], + prompts: List[Prompt], predictor: "BasePredictor", system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[False] = False, @@ -189,7 +194,7 @@ def evaluate( @overload def evaluate( self, - prompts: List[str], + prompts: List[Prompt], predictor: "BasePredictor", system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[False] = False, @@ -201,7 +206,7 @@ def evaluate( @overload def evaluate( self, - prompts: str, + prompts: Prompt, predictor: "BasePredictor", system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[True] = True, @@ -213,7 +218,7 @@ def evaluate( @overload def evaluate( self, - prompts: str, + prompts: Prompt, predictor: "BasePredictor", system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[False] = False, @@ 
-225,7 +230,7 @@ def evaluate( @overload def evaluate( self, - prompts: str, + prompts: Prompt, predictor: "BasePredictor", system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[False] = False, @@ -236,7 +241,7 @@ def evaluate( def evaluate( self, - prompts: Union[str, List[str]], + prompts: Union[Prompt, List[Prompt]], predictor: "BasePredictor", system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: bool = True, @@ -253,7 +258,7 @@ def evaluate( seqs: List[str] = [] - prompts = [prompts] if isinstance(prompts, str) else prompts + prompts = [prompts] if isinstance(prompts, Prompt) else prompts eval_strategy = eval_strategy or self.eval_strategy xs, ys = self.subsample(eval_strategy=eval_strategy) batches = self._prepare_batch(prompts, xs, ys, eval_strategy=eval_strategy) diff --git a/promptolution/tasks/judge_tasks.py b/promptolution/tasks/judge_tasks.py index c53742f..2570458 100644 --- a/promptolution/tasks/judge_tasks.py +++ b/promptolution/tasks/judge_tasks.py @@ -1,16 +1,15 @@ """Module for judge tasks.""" -import numpy as np import pandas as pd -from typing import TYPE_CHECKING, List, Literal, Optional, Union +from typing import TYPE_CHECKING, List, Optional -from promptolution.llms.base_llm import BaseLLM from promptolution.tasks.base_task import BaseTask from promptolution.utils.formatting import extract_from_tag from promptolution.utils.logging import get_logger if TYPE_CHECKING: # pragma: no cover + from promptolution.llms.base_llm import BaseLLM from promptolution.tasks.base_task import EvalStrategy from promptolution.utils.config import ExperimentConfig @@ -132,7 +131,7 @@ def _evaluate(self, xs: List[str], ys: List[str], preds: List[str]) -> List[floa judge_responses = self.judge_llm.get_response(prompts) scores_str = extract_from_tag(judge_responses, "", "") scores = [] - for score_str, judge_response in zip(scores_str, judge_responses): + for score_str in scores_str: try: # only numeric chars, - or . are allowed score_str = "".join(filter(lambda c: c.isdigit() or c in "-.", score_str)) @@ -141,7 +140,7 @@ def _evaluate(self, xs: List[str], ys: List[str], preds: List[str]) -> List[floa score = (score - self.min_score) / (self.max_score - self.min_score) score = max(0.0, min(1.0, score)) except ValueError: - logger.warning(f"Failed to parse score '{score}' as float. Defaulting to a score 0.0.") + logger.warning(f"Failed to parse score '{score_str}' as float. 
Defaulting to a score 0.0.") score = 0.0 scores.append(score) diff --git a/promptolution/utils/__init__.py b/promptolution/utils/__init__.py index eba584c..1163f63 100644 --- a/promptolution/utils/__init__.py +++ b/promptolution/utils/__init__.py @@ -1,6 +1,5 @@ """Module for utility functions and classes.""" - from promptolution.utils.callbacks import ( BestPromptCallback, FileOutputCallback, @@ -10,6 +9,61 @@ ) from promptolution.utils.config import ExperimentConfig from promptolution.utils.logging import get_logger, setup_logging -from promptolution.utils.prompt_creation import create_prompt_variation, create_prompts_from_samples +from promptolution.utils.prompt import Prompt, sort_prompts_by_scores +from promptolution.utils.prompt_creation import ( + create_prompt_variation, + create_prompts_from_samples, + create_prompts_from_task_description, +) +from promptolution.utils.templates import ( + CAPO_CROSSOVER_TEMPLATE, + CAPO_FEWSHOT_TEMPLATE, + CAPO_MUTATION_TEMPLATE, + DEFAULT_SYS_PROMPT, + DOWNSTREAM_TEMPLATE, + EVOPROMPT_DE_TEMPLATE, + EVOPROMPT_DE_TEMPLATE_TD, + EVOPROMPT_GA_TEMPLATE, + EVOPROMPT_GA_TEMPLATE_TD, + OPRO_TEMPLATE, + OPRO_TEMPLATE_TD, + PROMPT_CREATION_TEMPLATE, + PROMPT_CREATION_TEMPLATE_TD, + PROMPT_VARIATION_TEMPLATE, +) from promptolution.utils.test_statistics import TestStatistics, get_test_statistic_func, paired_t_test from promptolution.utils.token_counter import get_token_counter + +__all__ = [ + "BestPromptCallback", + "FileOutputCallback", + "LoggerCallback", + "ProgressBarCallback", + "TokenCountCallback", + "ExperimentConfig", + "get_logger", + "setup_logging", + "Prompt", + "sort_prompts_by_scores", + "create_prompt_variation", + "create_prompts_from_samples", + "create_prompts_from_task_description", + "CAPO_CROSSOVER_TEMPLATE", + "CAPO_FEWSHOT_TEMPLATE", + "CAPO_MUTATION_TEMPLATE", + "DEFAULT_SYS_PROMPT", + "DOWNSTREAM_TEMPLATE", + "EVOPROMPT_DE_TEMPLATE", + "EVOPROMPT_DE_TEMPLATE_TD", + "EVOPROMPT_GA_TEMPLATE", + "EVOPROMPT_GA_TEMPLATE_TD", + "OPRO_TEMPLATE", + "OPRO_TEMPLATE_TD", + "PROMPT_CREATION_TEMPLATE", + "PROMPT_CREATION_TEMPLATE_TD", + "PROMPT_VARIATION_TEMPLATE", + "TestStatistics", + "get_test_statistic_func", + "paired_t_test", + "get_token_counter", +] diff --git a/promptolution/utils/callbacks.py b/promptolution/utils/callbacks.py index 083f749..98129e2 100644 --- a/promptolution/utils/callbacks.py +++ b/promptolution/utils/callbacks.py @@ -155,7 +155,7 @@ def on_step_end(self, optimizer: "BaseOptimizer") -> bool: "output_tokens": [optimizer.predictor.llm.output_token_count] * len(optimizer.prompts), "time": [datetime.now().timestamp()] * len(optimizer.prompts), "score": optimizer.scores, - "prompt": optimizer.prompts, + "prompt": [str(p) for p in optimizer.prompts], } ) diff --git a/promptolution/utils/prompt.py b/promptolution/utils/prompt.py new file mode 100644 index 0000000..d660e49 --- /dev/null +++ b/promptolution/utils/prompt.py @@ -0,0 +1,72 @@ +"""Module defining the Prompt class and related utilities.""" + +from typing import List, Optional, Tuple + +from promptolution.utils.templates import DOWNSTREAM_TEMPLATE, DOWNSTREAM_TEMPLATE_W_FEWSHOTS + + +class Prompt: + """Represents a prompt consisting of an instruction and few-shot examples.""" + + def __init__( + self, instruction: str, few_shots: Optional[List[str]] = None, downstream_template: Optional[str] = None + ) -> None: + """Initializes the Prompt with an instruction and associated examples. + + Args: + instruction (str): The instruction or prompt text. 
+ few_shots (List[str]): List of examples as string. + downstream_template (str, optional): Template for formatting the full prompt. + """ + self.instruction = instruction.strip() + self.few_shots = few_shots or [] + if downstream_template is None: + if self.few_shots: + downstream_template = DOWNSTREAM_TEMPLATE_W_FEWSHOTS + else: + downstream_template = DOWNSTREAM_TEMPLATE + self.downstream_template = downstream_template + + def construct_prompt(self) -> str: + """Constructs the full prompt string by replacing placeholders in the template with the instruction and formatted examples. + + Returns: + str: The constructed prompt string. + """ + few_shot_str = "\n\n".join(self.few_shots).strip() + prompt = ( + self.downstream_template.replace("", self.instruction) + .replace("", few_shot_str) + .replace("\n\n\n\n", "\n\n") # replace extra newlines if no few shots are provided + .strip() + ) + return prompt + + def __str__(self) -> str: + """Returns the string representation of the prompt.""" + return self.construct_prompt() + + +def sort_prompts_by_scores( + prompts: List[Prompt], scores: List[float], top_k: Optional[int] = None +) -> Tuple[List[Prompt], List[float]]: + """Sorts prompts based on their associated scores in descending order. + + Args: + prompts (List[Prompt]): List of Prompt objects. + scores (List[float]): Corresponding list of scores. + top_k (Optional[int]): If provided, limits the result to the top_k prompts. Defaults to None (returns all). + + Returns: + Tuple[List[Prompt], List[float]]: A tuple containing prompts sorted by scores in descending order and their corresponding sorted scores. + """ + assert len(prompts) == len(scores), "Prompts and scores must have the same length." + + sorted_prompts = [prompt for score, prompt in sorted(zip(scores, prompts), reverse=True, key=lambda x: x[0])] + sorted_scores = sorted(scores, reverse=True) + + if top_k is not None: + sorted_prompts = sorted_prompts[:top_k] + sorted_scores = sorted_scores[:top_k] + + return sorted_prompts, sorted_scores diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 77082db..fd0087d 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -1,22 +1,28 @@ """Utility functions for prompt creation.""" +import json import numpy as np from typing import TYPE_CHECKING, List, Optional, Union from promptolution.utils.formatting import extract_from_tag +from promptolution.utils.logging import get_logger if TYPE_CHECKING: # pragma: no cover from promptolution.llms.base_llm import BaseLLM from promptolution.tasks.base_task import BaseTask -from promptolution.optimizers.templates import ( +from promptolution.tasks.classification_tasks import ClassificationTask +from promptolution.utils.templates import ( PROMPT_CREATION_TEMPLATE, + PROMPT_CREATION_TEMPLATE_FROM_TASK_DESCRIPTION, PROMPT_CREATION_TEMPLATE_TD, PROMPT_VARIATION_TEMPLATE, + default_prompts, ) -from promptolution.tasks.classification_tasks import ClassificationTask + +logger = get_logger(__name__) def create_prompt_variation( @@ -99,9 +105,9 @@ def create_prompts_from_samples( # sample xs: List[str] = [] ys: List[str] = [] - for label, num_samples in zip(unique_labels, samples_per_class): + for label, n_per_class in zip(unique_labels, samples_per_class): indices = np.where(task.ys == label)[0] - indices = np.random.choice(indices, n_samples, replace=False) + indices = np.random.choice(indices, n_per_class, replace=False) xs.extend(task.xs[indices]) 
ys.extend(task.ys[indices]) @@ -119,3 +125,44 @@ def create_prompts_from_samples( prompts = extract_from_tag(prompts, "", "") return prompts + + +def create_prompts_from_task_description( + task_description: str, + llm: "BaseLLM", + meta_prompt: Optional[str] = None, + n_prompts: int = 10, + n_retries: int = 3, +) -> List[str]: + """Generate a set of prompts from a given task description. + + Args: + task_description (str): The description of the task to generate prompts for. + llm (BaseLLM): The language model to use for generating the prompts. + meta_prompt (str): The meta prompt to use for generating the prompts. + If None, a default meta prompt is used. + n_prompts (int): The number of prompts to generate. + n_retries (int): The number of retries to attempt if prompt generation fails. + """ + if meta_prompt is None: + meta_prompt = PROMPT_CREATION_TEMPLATE_FROM_TASK_DESCRIPTION + + meta_prompt = meta_prompt.replace("", task_description).replace("", str(n_prompts)) + final_prompts = None + for _ in range(n_retries): + prompts_str = llm.get_response(meta_prompt)[0] + try: + prompts = json.loads(prompts_str) + assert isinstance(prompts, list) and all(isinstance(p, str) for p in prompts) and len(prompts) == n_prompts + final_prompts = prompts + break + except (json.JSONDecodeError, AssertionError): + logger.warning("Failed to parse prompts JSON, retrying...") + + if final_prompts is None: + logger.error( + f"Failed to generate prompts from task description after {n_retries} retries, returning default prompts." + ) + final_prompts = default_prompts[:n_prompts] + + return final_prompts diff --git a/promptolution/optimizers/templates.py b/promptolution/utils/templates.py similarity index 67% rename from promptolution/optimizers/templates.py rename to promptolution/utils/templates.py index aaa1f63..60d55d4 100644 --- a/promptolution/optimizers/templates.py +++ b/promptolution/utils/templates.py @@ -138,8 +138,18 @@ The instruction was""" +PROMPT_CREATION_TEMPLATE_FROM_TASK_DESCRIPTION = """Please create diverse system prompts for the following task, not using any placeholders, working universally, for any datapoint-specific instructions following each system prompt. -CAPO_DOWNSTREAM_TEMPLATE = """ +Task: + +Explicitly state the expected format above by repeating its exact character sequence verbatim in every prompt if applicable. + +Create overall prompts within json format, meaning strings inside quotations marks ("") as an array. Do not response with anything else. Start the array with [ and end with ]. Separate each prompt by a comma, and do not use quotation marks inside the prompts.""" + + +DOWNSTREAM_TEMPLATE = "" + +DOWNSTREAM_TEMPLATE_W_FEWSHOTS = """ @@ -164,3 +174,31 @@ Return the new prompt in the following format: new prompt""" + + +default_prompts = [ + "Give me your response within tags.", + "Please provide a thoughtful answer to my question and wrap your response in tags so I can easily identify it.", + "I need your expertise on this matter. Kindly structure your response within tags for better readability.", + "Analyze the following and present your findings enclosed in tags.", + "Consider this inquiry carefully. Your comprehensive response should be formatted within tags to facilitate extraction.", + "Respond succinctly. Ensure all content appears between and markers.", + "Would you mind addressing this request? Please place your entire response inside formatting.", + "I'm seeking your insights on a particular topic. 
Kindly ensure that your complete analysis is contained within tags for my convenience.", + "Examine this query thoroughly and deliver your conclusions. All output must be encapsulated in notation for processing purposes.", + "Help me understand this subject better. Your explanation should begin with and conclude with to maintain proper structure.", + "I require information on the following. Please format your response with tags at the beginning and end for clarity.", + "Contemplate this scenario and offer your perspective. Remember to enclose all content within tags as per requirements.", + "Elaborate on this concept, making sure to wrap the entirety of your explanation in markers for systematic review.", + "Describe your approach to this situation. Be thorough yet concise, and place your complete response between and tags.", + "Share your knowledge on this matter. Your entire response should be presented within tags to facilitate proper integration into my workflow.", + "Let's think step by step. Your answer should be enclosed within tags.", + "Provide a detailed response to the following question, ensuring that all information is contained within tags for easy extraction.", + "Kindly address the following topic, formatting your entire response between and markers for clarity and organization.", + "Offer your insights on this issue, making sure to encapsulate your full response within tags for seamless processing.", + "Delve into this subject and present your findings, ensuring that all content is wrapped in notation for systematic analysis.", + "Illuminate this topic with your expertise, formatting your complete explanation within tags for straightforward comprehension.", + "Provide your perspective on this matter, ensuring that your entire response is contained within tags for efficient review.", + "Analyze the following scenario and deliver your conclusions, making sure to enclose all output in markers for clarity.", + "Help me grasp this concept better by structuring your explanation between and tags for proper formatting.", +] diff --git a/promptolution/utils/test_statistics.py b/promptolution/utils/test_statistics.py index d0de2d3..dd9b5ff 100644 --- a/promptolution/utils/test_statistics.py +++ b/promptolution/utils/test_statistics.py @@ -6,7 +6,7 @@ import numpy as np from scipy.stats import ttest_rel -from typing import Any, Callable, List, Literal +from typing import Callable, List, Literal TestStatistics = Literal["paired_t_test"] diff --git a/promptolution/utils/token_counter.py b/promptolution/utils/token_counter.py index c19c815..422e277 100644 --- a/promptolution/utils/token_counter.py +++ b/promptolution/utils/token_counter.py @@ -27,7 +27,7 @@ def get_token_counter(llm: "BaseLLM") -> Callable[[str], int]: """ if llm.tokenizer is not None: - tokenizer: PreTrainedTokenizer = llm.tokenizer + tokenizer: "PreTrainedTokenizer" = llm.tokenizer return lambda x: len(tokenizer.encode(x)) else: logger.warning("⚠️ The LLM does not have a tokenizer. 
Using simple token count.") diff --git a/pyproject.toml b/pyproject.toml index 7b63b20..6a6999c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,14 +8,15 @@ readme = "README.md" [tool.poetry.dependencies] python = ">=3.10,<3.13" numpy = ">=1.26.0, <3.0.0" -pandas = "^2.2.2" -tqdm = "^4.66.5" -scikit-learn = "^1.5.2" -fastparquet = "^2024.11.0" -openai = {version = "^1.0.0", optional = true} -requests = {version = "^2.31.0", optional = true} -vllm = {version = "^0.10.1.1", optional = true} -transformers = {version = "^4.48.0", optional = true} +pandas = ">=2.2.2" +tqdm = ">=4.66.5" +scikit-learn = ">=1.5.2" +fastparquet = ">=2024.11.0" +openai = {version = ">=1.0.0", optional = true} +requests = {version = ">=2.31.0", optional = true} +vllm = {version = ">=0.10.1.1", optional = true} +transformers = {version = ">=4.48.0", optional = true} +scipy = ">=1.15" [tool.poetry.extras] api = ["openai", "requests"] @@ -25,41 +26,41 @@ transformers = ["transformers"] [tool.poetry.group.api] optional = true [tool.poetry.group.api.dependencies] -openai = "^1.0.0" -requests = "^2.31.0" +openai = ">=1.0.0" +requests = ">=2.31.0" [tool.poetry.group.vllm] optional = true [tool.poetry.group.vllm.dependencies] -vllm = "^0.10.1.1" +vllm = ">=0.10.1.1" [tool.poetry.group.transformers] optional = true [tool.poetry.group.transformers.dependencies] -transformers = "^4.48.0" +transformers = ">=4.48.0" [tool.poetry.group.dev.dependencies] -black = "^24.4.2" -flake8 = "^7.1.0" -isort = "^5.13.2" -pre-commit = "^3.7.1" -ipykernel = "^6.29.5" -mypy = "^1.8.0" +black = ">=24.4.2" +flake8 = ">=7.1.0" +isort = ">=5.13.2" +pre-commit = ">=3.7.1" +ipykernel = ">=6.29.5" +mypy = ">=1.8.0" [tool.poetry.group.test.dependencies] -pytest = "^8.3.5" -pytest-cov = "^6.1.1" -openai = "^1.0.0" -requests = "^2.31.0" -vllm = "^0.10.1.1" -transformers = "^4.48.0" +pytest = ">=8.3.5" +pytest-cov = ">=6.1.1" +openai = ">=1.0.0" +requests = ">=2.31.0" +vllm = "==0.10.1.1" +transformers = ">=4.48.0" [tool.poetry.group.docs.dependencies] -mkdocs = "^1.6.1" -mkdocs-material = "^9.5.39" -mkdocstrings = {version = "^0.26.1", extras = ["python"]} -jupyter = "^1.1.1" -nbconvert = "^7.16.6" +mkdocs = ">=1.6.1" +mkdocs-material = ">=9.5.39" +mkdocstrings = {version = ">=0.26.1", extras = ["python"]} +jupyter = ">=1.1.1" +nbconvert = ">=7.16.6" [build-system] requires = ["poetry-core"] diff --git a/tests/helpers/test_helpers.py b/tests/helpers/test_helpers.py index 03231a6..d39ec38 100644 --- a/tests/helpers/test_helpers.py +++ b/tests/helpers/test_helpers.py @@ -10,6 +10,7 @@ from promptolution.helpers import run_evaluation, run_experiment, run_optimization from promptolution.utils import ExperimentConfig +from promptolution.utils.prompt import Prompt @pytest.fixture @@ -39,7 +40,7 @@ def experiment_config(): predictor_name="first_occurrence", classes=["positive", "neutral", "negative"], n_steps=2, - prepend_exemplars=False, + posthoc_exemplar_selection=False, ) @@ -54,7 +55,7 @@ def experiment_config_with_exemplars(): predictor_name="first_occurrence", classes=["positive", "neutral", "negative"], n_steps=2, - prepend_exemplars=True, + posthoc_exemplar_selection=True, exemplar_selector="random", n_exemplars=2, ) @@ -196,6 +197,8 @@ def test_run_evaluation(mock_get_task, mock_get_predictor, mock_get_llm, sample_ "Is this text positive, negative, or neutral?", ] + prompts = [Prompt(p) for p in prompts] + # Now this will work because mock_task is a MagicMock mock_task.evaluate.return_value = np.array([0.8, 0.7, 0.9]) @@ -225,14 +228,15 @@ def 
test_run_evaluation(mock_get_task, mock_get_predictor, mock_get_llm, sample_ def test_run_experiment(mock_run_evaluation, mock_run_optimization, sample_df, experiment_config): """Test the run_experiment function.""" # Set up mocks - optimized_prompts = [ + optimized_prompts_strs = [ "Classify this as positive or negative:", "Determine the sentiment (positive/negative/neutral):", ] + optimized_prompts = [Prompt(p) for p in optimized_prompts_strs] mock_run_optimization.return_value = optimized_prompts # Create a sample results DataFrame - eval_results = pd.DataFrame({"prompt": optimized_prompts, "score": [0.8, 0.7]}) + eval_results = pd.DataFrame({"prompt": optimized_prompts_strs, "score": [0.8, 0.7]}) mock_run_evaluation.return_value = eval_results # Run the function @@ -256,7 +260,7 @@ def test_run_experiment(mock_run_evaluation, mock_run_optimization, sample_df, e assert len(train_df) + len(test_df) == len(sample_df) # Verify the prompts were passed to evaluation - assert mock_run_evaluation.call_args[0][2] == optimized_prompts + assert mock_run_evaluation.call_args[0][2] == optimized_prompts_strs def test_helpers_integration(sample_df, experiment_config): @@ -286,7 +290,8 @@ def test_helpers_integration(sample_df, experiment_config): mock_get_optimizer.return_value = mock_optimizer # Set up optimizer to return prompts - optimized_prompts = ["Classify sentiment:", "Determine if positive/negative:"] + optimized_prompts_str = ["Classify sentiment:", "Determine if positive/negative:"] + optimized_prompts = [Prompt(p) for p in optimized_prompts_str] mock_optimizer.optimize.return_value = optimized_prompts # Run the experiment @@ -295,7 +300,8 @@ def test_helpers_integration(sample_df, experiment_config): # Verify results assert isinstance(result, pd.DataFrame) assert len(result) == 2 - assert all(p in result["prompt"].values for p in optimized_prompts) + print([p in result["prompt"].values for p in optimized_prompts_str]) + assert all(p in result["prompt"].values for p in optimized_prompts_str) # Verify optimization was called mock_optimizer.optimize.assert_called_once() diff --git a/tests/mocks/mock_predictor.py b/tests/mocks/mock_predictor.py index 445794a..36f11d0 100644 --- a/tests/mocks/mock_predictor.py +++ b/tests/mocks/mock_predictor.py @@ -2,7 +2,7 @@ import numpy as np -from typing import List, Optional, Tuple +from typing import List, Optional from promptolution.llms.base_llm import BaseLLM from promptolution.predictors.base_predictor import BasePredictor diff --git a/tests/mocks/mock_task.py b/tests/mocks/mock_task.py index b5e1d14..9aeb46c 100644 --- a/tests/mocks/mock_task.py +++ b/tests/mocks/mock_task.py @@ -2,7 +2,6 @@ from unittest.mock import MagicMock -import numpy as np import pandas as pd from typing import List @@ -39,7 +38,7 @@ def __init__(self, predetermined_scores=None): self.x_column = "x" self.y_column = "y" # Default attributes similar to ClassificationTask - self.description = "Mock classification task" + self.task_description = "Mock classification task" self.classes = ["positive", "neutral", "negative"] self.initial_prompts = ["Classify:", "Determine:"] self.n_blocks = 10 diff --git a/tests/optimizers/test_capo.py b/tests/optimizers/test_capo.py index 466c92b..305f290 100644 --- a/tests/optimizers/test_capo.py +++ b/tests/optimizers/test_capo.py @@ -4,31 +4,9 @@ from tests.mocks.mock_task import MockTask -from promptolution.optimizers.capo import CAPO, CAPOPrompt - - -def test_capo_prompt_initialization(): - """Test that CAPOPrompt initializes correctly.""" - 
instruction = "Classify the sentiment of the text." - few_shots = ["Example 1: Positive", "Example 2: Negative"] - prompt = CAPOPrompt(instruction, few_shots) - - # Verify attributes - assert prompt.instruction_text == instruction - assert prompt.few_shots == few_shots - - -def test_capo_prompt_construct_prompt(): - """Test the construct_prompt method of CAPOPrompt.""" - instruction = "Classify the sentiment of the text." - few_shots = ["Example 1: Positive", "Example 2: Negative"] - prompt = CAPOPrompt(instruction, few_shots) - - # Get the constructed prompt - constructed = prompt.construct_prompt() - - # Verify the prompt contains the instruction - assert instruction in constructed +from promptolution.optimizers.capo import CAPO +from promptolution.utils.prompt import Prompt +from promptolution.utils.templates import CAPO_CROSSOVER_TEMPLATE, CAPO_MUTATION_TEMPLATE def test_capo_initialization(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_df): @@ -67,11 +45,11 @@ def mock_create_few_shot_examples(instruction, num_examples): # Control randomness with patch("random.randint", return_value=2): - population = optimizer._initialize_population(initial_prompts) + population = optimizer._initialize_population([Prompt(p) for p in initial_prompts]) # Verify population was created assert len(population) == len(initial_prompts) - assert all(isinstance(p, CAPOPrompt) for p in population) + assert all(isinstance(p, Prompt) for p in population) def test_capo_step(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_df): @@ -86,25 +64,26 @@ def test_capo_step(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mo ) # Create mock prompt objects - mock_prompts = [CAPOPrompt("Instruction 1", ["Example 1"]), CAPOPrompt("Instruction 2", ["Example 2"])] + mock_prompts = [Prompt("Instruction 1", ["Example 1"]), Prompt("Instruction 2", ["Example 2"])] optimizer.prompt_objects = mock_prompts # Mock the internal methods to avoid complexity - mock_offspring = [CAPOPrompt("Offspring", ["Example"])] + mock_offspring = [Prompt("Offspring", ["Example"])] optimizer._crossover = lambda x: mock_offspring - mock_mutated = [CAPOPrompt("Mutated", ["Example"])] + mock_mutated = [Prompt("Mutated", ["Example"])] optimizer._mutate = lambda x: mock_mutated - mock_survivors = [CAPOPrompt("Survivor 1", ["Example"]), CAPOPrompt("Survivor 2", ["Example"])] - optimizer._do_racing = lambda x, k: mock_survivors + mock_survivors = [Prompt("Survivor 1", ["Example"]), Prompt("Survivor 2", ["Example"])] + mock_scores = [0.9, 0.8] + optimizer._do_racing = lambda x, k: (mock_survivors, mock_scores) # Call _step result = optimizer._step() # Verify results assert len(result) == 2 # Should match population_size - assert all(isinstance(p, str) for p in result) + assert all(isinstance(p, Prompt) for p in result) def test_capo_optimize(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_df): @@ -169,9 +148,7 @@ def test_crossover(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mo crossovers_per_iter=5, ) - offsprings = optimizer._crossover( - [CAPOPrompt("Instruction 1", ["Example 1"]), CAPOPrompt("Instruction 2", ["Example 2"])] - ) + offsprings = optimizer._crossover([Prompt("Instruction 1", ["Example 1"]), Prompt("Instruction 2", ["Example 2"])]) assert len(offsprings) == 5 @@ -184,9 +161,7 @@ def test_mutate(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_ df_few_shots=mock_df, ) - mutated = optimizer._mutate( - [CAPOPrompt("Instruction 1", ["Example 1"]), 
CAPOPrompt("Instruction 2", ["Example 2"])] - ) + mutated = optimizer._mutate([Prompt("Instruction 1", ["Example 1"]), Prompt("Instruction 2", ["Example 2"])]) assert len(mutated) == 2 @@ -200,11 +175,59 @@ def test_do_racing(mock_meta_llm, mock_predictor, initial_prompts, mock_df): df_few_shots=pd.concat([mock_df] * 5, ignore_index=True), ) optimizer._pre_optimization_loop() - survivors = optimizer._do_racing( - [CAPOPrompt("good instruction", ["Example 1"]), CAPOPrompt("better instruction", ["Example 2"])], 1 + survivors, scores = optimizer._do_racing( + [Prompt("good instruction", ["Example 1"]), Prompt("better instruction", ["Example 2"])], 1 ) assert len(survivors) == 1 - assert "better instruction" in survivors[0].instruction_text + assert len(scores) == 1 + + assert "better instruction" in survivors[0].instruction assert mock_task.reset_block_idx.call_count == 2 assert mock_task.increment_block_idx.call_count == 3 + + +def test_capo_crossover_prompt(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_df): + """Test that when _crossover is called, the mock_meta_llm received a call with the correct meta prompt.""" + optimizer = CAPO( + predictor=mock_predictor, + task=mock_task, + meta_llm=mock_meta_llm, + initial_prompts=initial_prompts, + df_few_shots=mock_df, + ) + + mother = Prompt("Classify the sentiment of the text.", ["Input: I love this! Output: Positive"]) + father = Prompt("Determine if the review is positive or negative.", ["Input: This is terrible. Output: Negative"]) + optimizer._crossover([mother, father]) + + full_task_desc = mock_task.task_description + "\n" + optimizer.predictor.extraction_description + + expected_meta_prompt = ( + CAPO_CROSSOVER_TEMPLATE.replace("", mother.instruction) + .replace("", father.instruction) + .replace("", full_task_desc) + ) + + assert str(mock_meta_llm.call_history[0]["prompts"][0]) == expected_meta_prompt + + +def test_capo_mutate_prompt(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_df): + """Test that when _mutate is called, the mock_meta_llm received a call with the correct meta prompt.""" + optimizer = CAPO( + predictor=mock_predictor, + task=mock_task, + meta_llm=mock_meta_llm, + initial_prompts=initial_prompts, + df_few_shots=mock_df, + ) + full_task_desc = mock_task.task_description + "\n" + optimizer.predictor.extraction_description + + parent = Prompt("Classify the sentiment of the text.", ["Input: I love this! 
Output: Positive"]) + optimizer._mutate([parent]) + + expected_meta_prompt = CAPO_MUTATION_TEMPLATE.replace("", parent.instruction).replace( + "", full_task_desc + ) + + assert mock_meta_llm.call_history[0]["prompts"][0] == expected_meta_prompt diff --git a/tests/optimizers/test_evoprompt_de.py b/tests/optimizers/test_evoprompt_de.py index 8c01adb..cdd2a31 100644 --- a/tests/optimizers/test_evoprompt_de.py +++ b/tests/optimizers/test_evoprompt_de.py @@ -1,9 +1,7 @@ from unittest.mock import patch -import numpy as np -import pytest - from promptolution.optimizers import EvoPromptDE +from promptolution.utils.prompt import Prompt def test_evoprompt_de_initialization(mock_meta_llm, initial_prompts, mock_task, mock_predictor): @@ -20,7 +18,7 @@ def test_evoprompt_de_initialization(mock_meta_llm, initial_prompts, mock_task, # Only verify the essential properties assert optimizer.prompt_template == "Create a new prompt from: , , , " assert not optimizer.donor_random - assert optimizer.prompts == initial_prompts + assert [p.instruction for p in optimizer.prompts] == initial_prompts def test_evoprompt_de_pre_optimization_loop(mock_meta_llm, initial_prompts, mock_task, mock_predictor): @@ -53,8 +51,8 @@ def test_evoprompt_de_step(mock_meta_llm, initial_prompts, mock_task, mock_predi ) # Set up initial state - optimizer.prompts = initial_prompts - optimizer.scores = [0.8, 0.7, 0.6, 0.5, 0.4] # First prompt is best + optimizer.prompts = [Prompt(p) for p in initial_prompts] + optimizer.scores = [0.8, 0.7, 0.6, 0.5] # First prompt is best # Control randomness with patch("numpy.random.choice") as mock_choice: diff --git a/tests/optimizers/test_evoprompt_ga.py b/tests/optimizers/test_evoprompt_ga.py index a1c3160..724a400 100644 --- a/tests/optimizers/test_evoprompt_ga.py +++ b/tests/optimizers/test_evoprompt_ga.py @@ -1,6 +1,7 @@ from unittest.mock import patch from promptolution.optimizers import EvoPromptGA +from promptolution.utils.prompt import Prompt def test_evoprompt_ga_initialization(mock_meta_llm, initial_prompts, mock_task, experiment_config, mock_predictor): @@ -18,7 +19,7 @@ def test_evoprompt_ga_initialization(mock_meta_llm, initial_prompts, mock_task, # Verify only essential properties assert optimizer.prompt_template == "Combine these prompts to create a better one: and ." 
     assert optimizer.selection_mode == "random"
-    assert optimizer.prompts == initial_prompts
+    assert [p.instruction for p in optimizer.prompts] == initial_prompts


 def test_evoprompt_ga_crossover(mock_meta_llm, initial_prompts, mock_task, experiment_config, mock_predictor):
@@ -34,7 +35,7 @@ def test_evoprompt_ga_crossover(mock_meta_llm, initial_prompts, mock_task, exper
     )

     # Set up state for testing
-    optimizer.prompts = initial_prompts
+    optimizer.prompts = [Prompt(p) for p in initial_prompts]
     optimizer.scores = [0.8, 0.7, 0.6, 0.5, 0.4]

     # Control randomness
@@ -63,8 +64,8 @@ def test_evoprompt_ga_step(mock_meta_llm, initial_prompts, mock_task, experiment
     )

     # Set up state for testing
-    optimizer.prompts = initial_prompts
-    optimizer.scores = [0.8, 0.7, 0.6, 0.5, 0.4]
+    optimizer.prompts = [Prompt(p) for p in initial_prompts]
+    optimizer.scores = [0.8, 0.7, 0.6, 0.5]

     # Control randomness
     with patch("numpy.random.choice") as mock_choice:
diff --git a/tests/optimizers/test_opro.py b/tests/optimizers/test_opro.py
index 9910442..5dd5385 100644
--- a/tests/optimizers/test_opro.py
+++ b/tests/optimizers/test_opro.py
@@ -3,6 +3,7 @@
 import numpy as np

 from promptolution.optimizers import OPRO
+from promptolution.utils.prompt import Prompt


 def test_opro_initialization(mock_meta_llm, initial_prompts, mock_task, mock_predictor):
@@ -23,7 +24,7 @@ def test_opro_initialization(mock_meta_llm, initial_prompts, mock_task, mock_pre
     assert optimizer.max_num_instructions == 10
     assert optimizer.num_instructions_per_step == 4
     assert optimizer.num_few_shots == 2
-    assert optimizer.prompts == initial_prompts
+    assert [p.instruction for p in optimizer.prompts] == initial_prompts


 def test_opro_sample_examples(mock_meta_llm, initial_prompts, mock_task, mock_predictor):
@@ -62,7 +63,7 @@ def test_opro_format_instructions(mock_meta_llm, initial_prompts, mock_task, moc
     )

     # Set scores for testing
-    optimizer.prompts = initial_prompts
+    optimizer.prompts = [Prompt(p) for p in initial_prompts]
     optimizer.scores = [0.7, 0.9, 0.5, 0.8, 0.6]

     # Format instructions
@@ -109,7 +110,7 @@ def test_opro_step(mock_meta_llm, initial_prompts, mock_task, mock_predictor):
     )

     # Set up initial state
-    optimizer.prompts = initial_prompts
+    optimizer.prompts = [Prompt(p) for p in initial_prompts]
     optimizer.scores = [0.7, 0.6, 0.5, 0.8]
     optimizer.meta_prompt = "Meta prompt with instructions and examples"
diff --git a/tests/predictors/test_base_predictor.py b/tests/predictors/test_base_predictor.py
index 5ba537b..4bfeacd 100644
--- a/tests/predictors/test_base_predictor.py
+++ b/tests/predictors/test_base_predictor.py
@@ -1,7 +1,5 @@
 import numpy as np

-from tests.mocks.mock_predictor import MockPredictor
-

 def test_predictor_predict_flow(mock_predictor):
     """Test the basic prediction flow from prompt to final prediction."""
diff --git a/tests/predictors/test_classifiers.py b/tests/predictors/test_predictors.py
similarity index 80%
rename from tests/predictors/test_classifiers.py
rename to tests/predictors/test_predictors.py
index 54885b9..2f7e11f 100644
--- a/tests/predictors/test_classifiers.py
+++ b/tests/predictors/test_predictors.py
@@ -1,13 +1,13 @@
 import numpy as np
 import pytest

-from promptolution.helpers import FirstOccurrenceClassifier, MarkerBasedClassifier
+from promptolution.helpers import FirstOccurrencePredictor, MarkerBasedPredictor


 def test_first_occurrence_classifier(mock_downstream_llm, mock_df):
-    """Test the FirstOccurrenceClassifier."""
+    """Test the FirstOccurrencePredictor."""
     # Create classifier
-    classifier = FirstOccurrenceClassifier(llm=mock_downstream_llm, classes=mock_df["y"].values)
+    classifier = FirstOccurrencePredictor(llm=mock_downstream_llm, classes=mock_df["y"].values)

     # Test with multiple inputs
     xs = ["I love this product!", "I hate this product!", "This product is okay.", "ja ne"]
@@ -25,9 +25,9 @@ def test_first_occurrence_classifier(mock_downstream_llm, mock_df):


 def test_marker_based_classifier(mock_downstream_llm, mock_df):
-    """Test the MarkerBasedClassifier."""
+    """Test the MarkerBasedPredictor."""
     # Create classifier
-    classifier = MarkerBasedClassifier(
+    classifier = MarkerBasedPredictor(
         llm=mock_downstream_llm,
         classes=mock_df["y"].values,
         begin_marker="",
@@ -56,9 +56,9 @@ def test_marker_based_classifier(mock_downstream_llm, mock_df):


 def test_marker_based_without_classes(mock_downstream_llm):
-    """Test MarkerBasedClassifier without predefined classes."""
+    """Test MarkerBasedPredictor without predefined classes."""
     # Create classifier without classes
-    classifier = MarkerBasedClassifier(
+    predictor = MarkerBasedPredictor(
         llm=mock_downstream_llm,
         classes=None,  # No class restrictions
         begin_marker="",
@@ -70,7 +70,7 @@ def test_marker_based_without_classes(mock_downstream_llm):
     prompts = ["Classify:"] * len(xs)

     # Make predictions
-    predictions = classifier.predict(prompts, xs)
+    predictions = predictor.predict(prompts, xs)

     # Verify shape and content - should accept any value between markers
     assert len(predictions) == 4
@@ -83,7 +83,7 @@ def test_multiple_prompts_with_classifiers(mock_downstream_llm, mock_df):
     """Test using multiple prompts with classifiers."""
     # Create classifier
-    classifier = FirstOccurrenceClassifier(llm=mock_downstream_llm, classes=mock_df["y"].values)
+    classifier = FirstOccurrencePredictor(llm=mock_downstream_llm, classes=mock_df["y"].values)

     # Test with multiple prompts
     prompts = ["Classify:", "Classify:", "Rate:", "Rate:"]
@@ -103,7 +103,7 @@ def test_sequence_return_with_classifiers(mock_downstream_llm, mock_df):
     """Test return_seq parameter with classifiers."""
     # Create classifier
-    classifier = MarkerBasedClassifier(llm=mock_downstream_llm, classes=mock_df["y"].values)
+    classifier = MarkerBasedPredictor(llm=mock_downstream_llm, classes=mock_df["y"].values)

     # Test with return_seq=True
     prompts = ["Classify:"]
@@ -128,15 +128,15 @@ def test_invalid_class_labels(mock_downstream_llm):

     # Should raise an assertion error
     with pytest.raises(AssertionError):
-        FirstOccurrenceClassifier(llm=mock_downstream_llm, classes=invalid_classes)
+        FirstOccurrencePredictor(llm=mock_downstream_llm, classes=invalid_classes)

     with pytest.raises(AssertionError):
-        MarkerBasedClassifier(llm=mock_downstream_llm, classes=invalid_classes)
+        MarkerBasedPredictor(llm=mock_downstream_llm, classes=invalid_classes)


 def test_marker_based_missing_markers(mock_downstream_llm):
-    """Test MarkerBasedClassifier behavior when markers are missing."""
-    classifier = MarkerBasedClassifier(llm=mock_downstream_llm, classes=["will", "not", "be", "used"])
+    """Test MarkerBasedPredictor behavior when markers are missing."""
+    classifier = MarkerBasedPredictor(llm=mock_downstream_llm, classes=["will", "not", "be", "used"])

     # When markers are missing, it should default to first class
     prompts = ["Classify:"]
diff --git a/tests/tasks/test_classifications_tasks.py b/tests/tasks/test_classifications_tasks.py
index 9651e98..256a63d 100644
--- a/tests/tasks/test_classifications_tasks.py
+++ b/tests/tasks/test_classifications_tasks.py
@@ -3,6 +3,7 @@
 from sklearn.metrics import accuracy_score

 from promptolution.tasks import ClassificationTask
+from promptolution.utils.prompt import Prompt


 def test_classification_task_initialization(mock_df):
@@ -19,7 +20,7 @@ def test_task_evaluate(mock_classification_task_with_subsampling, mock_predictor
     """Test the evaluate method of ClassificationTask."""
-    prompts = ["Classify sentiment:"]
+    prompts = [Prompt("Classify sentiment:")]

     scores = mock_classification_task_with_subsampling.evaluate(prompts, mock_predictor)
     assert isinstance(scores, list)
@@ -27,6 +28,8 @@ def test_task_evaluate(mock_classification_task_with_subsampling, mock_predictor
     assert 0 <= scores[0] <= 1

     prompts = ["Classify sentiment:", "Rate the text:"]
+    prompts = [Prompt(p) for p in prompts]
+
     scores = mock_classification_task_with_subsampling.evaluate(prompts, mock_predictor)

     assert len(scores) == 2
@@ -35,7 +38,7 @@ def test_task_evaluate_with_subsampling(mock_classification_task_with_subsamplin
     """Test the evaluate method with subsampling."""
-    prompts = ["Classify sentiment:"]
+    prompts = [Prompt("Classify sentiment:")]

     scores = mock_classification_task_with_subsampling.evaluate(
         prompts,
@@ -62,7 +65,7 @@ def test_task_evaluate_with_subsampling(mock_classification_task_with_subsamplin

 def test_task_evaluate_with_return_seq(mock_classification_task_with_subsampling, mock_predictor):
     """Test the evaluate method with return_seq=True."""
-    prompts = ["Classify sentiment:"]
+    prompts = [Prompt("Classify sentiment:")]

     scores, seqs = mock_classification_task_with_subsampling.evaluate(
         prompts, mock_predictor, return_seq=True, return_agg_scores=False
@@ -79,7 +82,7 @@ def test_task_evaluate_with_system_prompts(
 ):
     """Test the evaluate method with system prompts."""

-    prompts = ["Classify sentiment:"]
+    prompts = [Prompt("Classify sentiment:")]
     system_prompts = ["Be concise"]

     scores = mock_classification_task_with_subsampling.evaluate(
@@ -126,7 +129,7 @@ def test_classification_task_evaluate_random_block(mock_df, mock_predictor):
         eval_strategy="random_block",
         seed=42,
     )
-    prompts = ["Classify sentiment:"]
+    prompts = [Prompt("Classify sentiment:")]

     evaluated_x_sets = []
     for _ in range(5):
@@ -151,7 +154,7 @@ def test_classification_task_evaluate_sequential_block(mock_df, mock_predictor):
         eval_strategy="sequential_block",
         seed=42,
     )
-    prompts = ["Classify sentiment:"]
+    prompts = [Prompt("Classify sentiment:")]

     task.reset_block_idx()
     assert task.block_idx == 0
diff --git a/tests/tasks/test_judge_task.py b/tests/tasks/test_judge_task.py
index 3698bb5..3cf0066 100644
--- a/tests/tasks/test_judge_task.py
+++ b/tests/tasks/test_judge_task.py
@@ -1,5 +1,7 @@
 import numpy as np

+from promptolution.utils.prompt import Prompt
+

 def test_judge_task_initialization(mock_judge_task_with_y, mock_judge_llm):
     """Test that JudgeTask initializes correctly with ground truth."""
@@ -50,6 +52,7 @@ def test_judge_task_construct_judge_prompt_without_ground_truth(mock_judge_task_
 def test_judge_task_evaluate_with_ground_truth(mock_judge_task_with_y, mock_predictor, mock_judge_llm):
     """Test the evaluate method of JudgeTask with ground truth and full evaluation."""
     prompts = ["Rate the sentiment:", "What is the sentiment?", "How would you classify this?"]
+    prompts = [Prompt(p) for p in prompts]

     mock_predictor.call_history = []
     mock_judge_llm.call_history = []
@@ -72,6 +75,7 @@ def test_judge_task_evaluate_with_ground_truth(mock_judge_task_with_y, mock_pred
 def test_judge_task_evaluate_no_ground_truth(mock_judge_task_no_y, mock_predictor, mock_judge_llm):
     """Test the evaluate method of JudgeTask without a y_column (no ground truth)."""
     prompts = ["Tell a funny joke:", "Make me laugh:", "What's a good joke?"]
+    prompts = [Prompt(p) for p in prompts]

     mock_predictor.call_history = []
     mock_judge_llm.call_history = []
@@ -86,6 +90,8 @@ def test_judge_task_evaluate_with_return_seq(mock_judge_task_with_y, mock_predictor):
     """Test the evaluate method with return_seq=True for JudgeTask."""
     prompts = ["Evaluate this text:", "What is the sentiment?", "How would you classify this?"]
+    prompts = [Prompt(p) for p in prompts]
+
     scores, seqs = mock_judge_task_with_y.evaluate(prompts, mock_predictor, return_seq=True, return_agg_scores=False)

     assert len(scores) == 3
diff --git a/tests/tasks/test_reward_tasks.py b/tests/tasks/test_reward_tasks.py
index c707da9..76e3545 100644
--- a/tests/tasks/test_reward_tasks.py
+++ b/tests/tasks/test_reward_tasks.py
@@ -1,4 +1,4 @@
-import numpy as np
+from promptolution.utils.prompt import Prompt


 def test_reward_task_initialization(mock_reward_task, simple_reward_function):
@@ -22,7 +22,7 @@ def test_reward_task_initialization_no_x_column(mock_reward_task_no_x_column, si

 def test_reward_task_evaluate_with_return_seq(mock_reward_task, mock_predictor):
     """Test the evaluate method with return_seq=True for RewardTask."""
-    prompts = ["Generate a short text:"]
+    prompts = [Prompt("Generate a short text:")]

     scores, seqs = mock_reward_task.evaluate(prompts, mock_predictor, return_seq=True, return_agg_scores=False)
diff --git a/tests/utils/test_prompt.py b/tests/utils/test_prompt.py
new file mode 100644
index 0000000..3dc90bb
--- /dev/null
+++ b/tests/utils/test_prompt.py
@@ -0,0 +1,41 @@
+from promptolution.utils.prompt import Prompt, sort_prompts_by_scores
+
+
+def test_prompt_initialization():
+    """Test that Prompt initializes correctly."""
+    instruction = "Classify the sentiment of the text."
+    few_shots = ["Example 1: Positive", "Example 2: Negative"]
+    prompt = Prompt(instruction, few_shots)
+
+    # Verify attributes
+    assert prompt.instruction == instruction
+    assert prompt.few_shots == few_shots
+
+
+def test_prompt_construct_prompt():
+    """Test the construct_prompt method of Prompt."""
+    instruction = "Classify the sentiment of the text."
+    few_shots = ["Example 1: Positive", "Example 2: Negative"]
+    prompt = Prompt(instruction, few_shots)
+
+    # Get the constructed prompt
+    constructed = prompt.construct_prompt()
+
+    # Verify the prompt contains the instruction
+    assert instruction in constructed
+
+
+def test_sort_prompts_by_scores():
+    """Test the sort_prompts_by_scores function."""
+    prompt1 = Prompt("Instruction 1", ["Example A"])
+    prompt2 = Prompt("Instruction 2", ["Example B"])
+    prompt3 = Prompt("Instruction 3", ["Example C"])
+
+    prompts = [prompt1, prompt2, prompt3]
+    scores = [0.75, 0.90, 0.60]
+
+    sorted_prompts, sorted_scores = sort_prompts_by_scores(prompts, scores)
+
+    # Verify sorting
+    assert sorted_prompts == [prompt2, prompt1, prompt3]
+    assert sorted_scores == [0.90, 0.75, 0.60]
diff --git a/tutorials/api_llm_demo.py b/tutorials/api_llm_demo.py
index d369a1b..8f7dcbd 100644
--- a/tutorials/api_llm_demo.py
+++ b/tutorials/api_llm_demo.py
@@ -8,13 +8,12 @@
 from promptolution.llms import APILLM
 from promptolution.optimizers import CAPO
-from promptolution.predictors import MarkerBasedClassifier
+from promptolution.predictors import MarkerBasedPredictor
 from promptolution.tasks import ClassificationTask
 from promptolution.utils import LoggerCallback

 logger = Logger(__name__)

-"""Run a test run for any of the implemented optimizers."""
 parser = argparse.ArgumentParser()
 parser.add_argument("--base-url", default="https://api.openai.com/v1")
 parser.add_argument("--model", default="gpt-4o-2024-08-06")
@@ -49,7 +48,7 @@
 downstream_llm = llm
 meta_llm = llm

-predictor = MarkerBasedClassifier(downstream_llm, classes=task.classes)
+predictor = MarkerBasedPredictor(downstream_llm, classes=task.classes)

 callbacks = [LoggerCallback(logger)]
diff --git a/tutorials/capo_demo.py b/tutorials/capo_demo.py
index a7cc53f..0a42335 100644
--- a/tutorials/capo_demo.py
+++ b/tutorials/capo_demo.py
@@ -8,7 +8,7 @@
 from promptolution.llms import APILLM
 from promptolution.optimizers import CAPO
-from promptolution.predictors import MarkerBasedClassifier
+from promptolution.predictors import MarkerBasedPredictor
 from promptolution.tasks import ClassificationTask
 from promptolution.utils import FileOutputCallback, LoggerCallback, TokenCountCallback
@@ -56,7 +56,7 @@
 downstream_llm = llm
 meta_llm = llm

-predictor = MarkerBasedClassifier(downstream_llm, classes=None)
+predictor = MarkerBasedPredictor(downstream_llm, classes=None)

 optimizer = CAPO(
     task=task,
diff --git a/tutorials/evoprompt_demo.py b/tutorials/evoprompt_demo.py
index 6568230..d15d5b0 100644
--- a/tutorials/evoprompt_demo.py
+++ b/tutorials/evoprompt_demo.py
@@ -2,14 +2,13 @@

 import argparse
-import random
 from logging import Logger

 from datasets import load_dataset

 from promptolution.llms import APILLM
 from promptolution.optimizers import EVOPROMPT_GA_TEMPLATE, EvoPromptGA
-from promptolution.predictors import MarkerBasedClassifier
+from promptolution.predictors import MarkerBasedPredictor
 from promptolution.tasks import ClassificationTask
 from promptolution.utils import FileOutputCallback, LoggerCallback, TokenCountCallback
@@ -60,7 +59,7 @@
 downstream_llm = llm
 meta_llm = llm

-predictor = MarkerBasedClassifier(downstream_llm, classes=task.classes)
+predictor = MarkerBasedPredictor(downstream_llm, classes=task.classes)

 optimizer = EvoPromptGA(
     task=task,
diff --git a/tutorials/opro_demo.py b/tutorials/opro_demo.py
index 2b6ea93..760759d 100644
--- a/tutorials/opro_demo.py
+++ b/tutorials/opro_demo.py
@@ -8,7 +8,7 @@
 from promptolution.llms import VLLM
 from promptolution.optimizers import OPRO, OPRO_TEMPLATE_TD
-from promptolution.predictors import MarkerBasedClassifier
+from promptolution.predictors import MarkerBasedPredictor
 from promptolution.tasks import ClassificationTask
 from promptolution.utils import FileOutputCallback, LoggerCallback, TokenCountCallback
@@ -58,7 +58,7 @@
 downstream_llm = llm
 meta_llm = llm

-predictor = MarkerBasedClassifier(downstream_llm, classes=task.classes)
+predictor = MarkerBasedPredictor(downstream_llm, classes=task.classes)

 optimizer = OPRO(
     task=task,