From c61cc375b8b9d32b2a6e8e53a9630bf9648df60a Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 21 Nov 2025 11:41:19 +0100 Subject: [PATCH 01/43] improving error logging in optimizer --- .coverage | Bin 69632 -> 69632 bytes promptolution/llms/vllm.py | 2 +- promptolution/optimizers/base_optimizer.py | 3 +- promptolution/utils/token_counter.py | 2 +- tutorials/aime_eval.py | 66 +++++++++++++++++++++ 5 files changed, 69 insertions(+), 4 deletions(-) create mode 100644 tutorials/aime_eval.py diff --git a/.coverage b/.coverage index 7ac1d3fe7f98765c01efabe28aca9702419b59dd..7be2815a8d6d0f0a80d2682dcfd6757b2dc53084 100644 GIT binary patch delta 1073 zcmXAodrVtZ9LIm>KHA=QDQzjEtZd>ex1?YhCJ^N@qM2;MQX-WNg+I2E47L$s)TM5{ zF!zU%P4EWx$TI2@GE!g!D`WD|GNbb^$8czZVbo+!Gn_608v)kqDW2SO&-eHFJx^|~ z5~7q4)scqSPJ0>6g41e?93hhUj}JP6JDy@E}7jb@WEwLBGv8Mb<;k8+AvPd>C4_1F#o6@H|Gb5qIEXtiU|vo^My zZDb*~oXunt*eHhTJ+)WuR@>Ag>VCCOtx;c5$0~8ORQlnHTrAk0? zDrkIU>@uD*9yacXCYnYYEA0i6pwBOO`9S2wNq=8?>5DxJ18)uO*}by!s>YDdN4EUV z>CarXTLV022RipP{xbc^lJ6FuhZ5~L`jp5lerro1Wv$**E!B$odBBghT#G+JP}UJY`d;a<$ac##2mEWanQaF zPQk+rz0=O?=k3AI0wN62+m!wPHXhOgrodvmxLlCcN)y&OZEZ_a+()r^z zjjIKnmTuMF7tAsb8FFx*)>CAIkuf9W4Q5rECF)8?OFrhO&kqkpXnV6*EAl22t5L&O zkH%LEX(yq>b)YHIyTz>h8GO8DeLPR-4qYrfx>fS8E?`dp2k;&AyUP0L>}QMa)}&bERjuQ5k4Z5$RN^* zF+>{SB}NmYh>?Vca1$!KjKFiUOyhqxqxeaxv1}H9IQI>eEF3x;8fMM0WmEK_8XYNZ#$T c=)?tCV&t_O<#L(ywpAYy`ksqEx?%MH2Q5xjQ2+n{ diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index 1df5121..f431878 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -38,7 +38,7 @@ class VLLM(BaseLLM): update_token_count: Update the token count based on the given inputs and outputs. """ - tokenizer: PreTrainedTokenizer + tokenizer: "PreTrainedTokenizer" def __init__( self, diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py index ded87e5..6726d2e 100644 --- a/promptolution/optimizers/base_optimizer.py +++ b/promptolution/optimizers/base_optimizer.py @@ -82,8 +82,7 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts = self._step() except Exception as e: # exit training loop and gracefully fail - logger.error(f"⛔ Error during optimization step: {e}") - logger.error("⚠️ Exiting optimization loop.") + logger.error(f"⛔ Error during optimization step! ⚠️ Exiting optimization loop.", exc_info=e) break # Callbacks at the end of each step diff --git a/promptolution/utils/token_counter.py b/promptolution/utils/token_counter.py index c19c815..422e277 100644 --- a/promptolution/utils/token_counter.py +++ b/promptolution/utils/token_counter.py @@ -27,7 +27,7 @@ def get_token_counter(llm: "BaseLLM") -> Callable[[str], int]: """ if llm.tokenizer is not None: - tokenizer: PreTrainedTokenizer = llm.tokenizer + tokenizer: "PreTrainedTokenizer" = llm.tokenizer return lambda x: len(tokenizer.encode(x)) else: logger.warning("⚠️ The LLM does not have a tokenizer. 
Using simple token count.") diff --git a/tutorials/aime_eval.py b/tutorials/aime_eval.py new file mode 100644 index 0000000..d369a1b --- /dev/null +++ b/tutorials/aime_eval.py @@ -0,0 +1,66 @@ +"""Test run for the Opro optimizer.""" + + +import argparse +from logging import Logger + +from datasets import load_dataset + +from promptolution.llms import APILLM +from promptolution.optimizers import CAPO +from promptolution.predictors import MarkerBasedClassifier +from promptolution.tasks import ClassificationTask +from promptolution.utils import LoggerCallback + +logger = Logger(__name__) + +"""Run a test run for any of the implemented optimizers.""" +parser = argparse.ArgumentParser() +parser.add_argument("--base-url", default="https://api.openai.com/v1") +parser.add_argument("--model", default="gpt-4o-2024-08-06") +# parser.add_argument("--base-url", default="https://api.deepinfra.com/v1/openai") +# parser.add_argument("--model", default="meta-llama/Meta-Llama-3-8B-Instruct") +# parser.add_argument("--base-url", default="https://api.anthropic.com/v1/") +# parser.add_argument("--model", default="claude-3-haiku-20240307") +parser.add_argument("--n-steps", type=int, default=2) +parser.add_argument("--token", default=None) +args = parser.parse_args() + +df = load_dataset("SetFit/ag_news", split="train", revision="main").to_pandas().sample(300) + +df["input"] = df["text"] +df["target"] = df["label_text"] + +task = ClassificationTask( + df, + task_description="The dataset contains news articles categorized into four classes: World, Sports, Business, and Tech. The task is to classify each news article into one of the four categories.", + x_column="input", + y_column="target", +) + +initial_prompts = [ + "Classify this news article as World, Sports, Business, or Tech. Provide your answer between and tags.", + "Read the following news article and determine which category it belongs to: World, Sports, Business, or Tech. Your classification must be placed between markers.", + "Your task is to identify whether this news article belongs to World, Sports, Business, or Tech news. Provide your classification between the markers .", + "Conduct a thorough analysis of the provided news article and classify it as belonging to one of these four categories: World, Sports, Business, or Tech. 
Your answer should be presented within markers.", +] + +llm = APILLM(api_url=args.base_url, model_id=args.model, api_key=args.token) +downstream_llm = llm +meta_llm = llm + +predictor = MarkerBasedClassifier(downstream_llm, classes=task.classes) + +callbacks = [LoggerCallback(logger)] + +optimizer = CAPO( + task=task, + predictor=predictor, + meta_llm=meta_llm, + initial_prompts=initial_prompts, + callbacks=callbacks, +) + +best_prompts = optimizer.optimize(n_steps=args.n_steps) + +logger.info(f"Optimized prompts: {best_prompts}") From a09de91fd5c8732d51670db7a262ce6355a49578 Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 21 Nov 2025 11:49:59 +0100 Subject: [PATCH 02/43] fix error messaging inside judge --- promptolution/optimizers/base_optimizer.py | 2 +- promptolution/tasks/judge_tasks.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py index 6726d2e..21b4ade 100644 --- a/promptolution/optimizers/base_optimizer.py +++ b/promptolution/optimizers/base_optimizer.py @@ -82,7 +82,7 @@ def optimize(self, n_steps: int) -> List[str]: self.prompts = self._step() except Exception as e: # exit training loop and gracefully fail - logger.error(f"⛔ Error during optimization step! ⚠️ Exiting optimization loop.", exc_info=e) + logger.error("⛔ Error during optimization step! ⚠️ Exiting optimization loop.", exc_info=e) break # Callbacks at the end of each step diff --git a/promptolution/tasks/judge_tasks.py b/promptolution/tasks/judge_tasks.py index c53742f..9a82e66 100644 --- a/promptolution/tasks/judge_tasks.py +++ b/promptolution/tasks/judge_tasks.py @@ -141,7 +141,7 @@ def _evaluate(self, xs: List[str], ys: List[str], preds: List[str]) -> List[floa score = (score - self.min_score) / (self.max_score - self.min_score) score = max(0.0, min(1.0, score)) except ValueError: - logger.warning(f"Failed to parse score '{score}' as float. Defaulting to a score 0.0.") + logger.warning(f"Failed to parse score '{score_str}' as float. 
Defaulting to a score 0.0.") score = 0.0 scores.append(score) From d4124d145befa88557bd1077f58bb7cf23cf0078 Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 21 Nov 2025 17:11:46 +0100 Subject: [PATCH 03/43] improve readablity --- promptolution/optimizers/capo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/promptolution/optimizers/capo.py b/promptolution/optimizers/capo.py index bcfa275..6b1a14b 100644 --- a/promptolution/optimizers/capo.py +++ b/promptolution/optimizers/capo.py @@ -318,7 +318,7 @@ def _do_racing(self, candidates: List[CAPOPrompt], k: int) -> List[CAPOPrompt]: # Create mask for survivors and filter candidates survivor_mask = n_better < k candidates = list(compress(candidates, survivor_mask)) - block_scores = list(compress(block_scores, survivor_mask)) + block_scores = [list(compress(bs, survivor_mask)) for bs in block_scores] i += 1 self.task.increment_block_idx() From 9a0ab28589bfd963cdef678b30739cf76b6bd1bd Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 21 Nov 2025 17:21:04 +0100 Subject: [PATCH 04/43] fail save for task cache --- promptolution/tasks/base_task.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/promptolution/tasks/base_task.py b/promptolution/tasks/base_task.py index 2e7fa89..896f400 100644 --- a/promptolution/tasks/base_task.py +++ b/promptolution/tasks/base_task.py @@ -141,7 +141,10 @@ def _collect_results_from_cache( datapoint_seqs = [] for x, y in zip(xs, ys): cache_key = (prompt, x, y) - datapoint_scores.append(self.eval_cache[cache_key]) + datapoint_score = self.eval_cache.get(cache_key) + if datapoint_score is None: + continue + datapoint_scores.append(datapoint_score) if return_seq: datapoint_seqs.append(self.seq_cache.get(cache_key, "")) scores.append(datapoint_scores) From 6e89ff961aff9ff956a82f3b533dd47581f315bc Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 21 Nov 2025 17:40:42 +0100 Subject: [PATCH 05/43] extend api llm interface --- promptolution/llms/api_llm.py | 331 +++++++++++++++++++++++----------- 1 file changed, 222 insertions(+), 109 deletions(-) diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index 093478e..fa662e5 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -1,141 +1,254 @@ """Module to interface with various language models through their respective APIs.""" -try: - import asyncio +import asyncio +import threading +from concurrent.futures import TimeoutError as FuturesTimeout - from openai import AsyncOpenAI - from openai.types.chat import ChatCompletion, ChatCompletionMessageParam +from openai import AsyncOpenAI +from openai.types.chat import ChatCompletion - import_successful = True -except ImportError: - import_successful = False - - -from typing import TYPE_CHECKING, Dict, List, Optional +from typing import Any, Dict, List, Optional from promptolution.llms.base_llm import BaseLLM - -if TYPE_CHECKING: # pragma: no cover - from promptolution.utils.config import ExperimentConfig - from promptolution.utils.logging import get_logger logger = get_logger(__name__) -async def _invoke_model( - prompt: str, - system_prompt: str, - max_tokens: int, - model_id: str, - client: AsyncOpenAI, - semaphore: asyncio.Semaphore, - max_retries: int = 20, - retry_delay: float = 5, -) -> ChatCompletion: - async with semaphore: - messages: List[ChatCompletionMessageParam] = [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": prompt}, - ] - - for attempt in range(max_retries + 1): # +1 for the initial 
attempt - try: - response = await client.chat.completions.create( - model=model_id, - messages=messages, - max_tokens=max_tokens, - ) - return response - except Exception as e: - if attempt < max_retries: - # Calculate exponential backoff with jitter - logger.warning( - f"⚠️ API call failed (attempt {attempt + 1} / {max_retries + 1}): {str(e)}. " - f"Retrying in {retry_delay:.2f} seconds..." - ) - await asyncio.sleep(retry_delay) - else: - # Log the final failure and re-raise the exception - logger.error(f"❌ API call failed after {max_retries + 1} attempts: {str(e)}") - raise # Re-raise the exception after all retries fail - raise RuntimeError("Failed to get response after multiple retries.") - - class APILLM(BaseLLM): - """A class to interface with language models through their respective APIs. - - This class provides a unified interface for making API calls to language models - using the OpenAI client library. It handles rate limiting through semaphores - and supports both synchronous and asynchronous operations. - - Attributes: - model_id (str): Identifier for the model to use. - client (AsyncOpenAI): The initialized API client. - max_tokens (int): Maximum number of tokens in model responses. - semaphore (asyncio.Semaphore): Semaphore to limit concurrent API calls. - """ + """Persistent asynchronous LLM wrapper using a background event loop.""" def __init__( self, api_url: Optional[str] = None, model_id: Optional[str] = None, api_key: Optional[str] = None, - max_concurrent_calls: int = 50, + max_concurrent_calls: int = 16, max_tokens: int = 512, - config: Optional["ExperimentConfig"] = None, + call_timeout_s: float = 30.0, # per request + gather_timeout_s: float = 120.0, # whole batch + max_retries: int = 2, + retry_base_delay_s: float = 0.5, + client_kwargs: Optional[Dict[str, Any]] = None, + call_kwargs: Optional[Dict[str, Any]] = None, ) -> None: - """Initialize the APILLM with a specific model and API configuration. + """Initialize the APILLM. - Args: - api_url (str): The base URL for the API endpoint. - model_id (str): Identifier for the model to use. - api_key (str, optional): API key for authentication. Defaults to None. - max_concurrent_calls (int, optional): Maximum number of concurrent API calls. Defaults to 50. - max_tokens (int, optional): Maximum number of tokens in model responses. Defaults to 512. - config (ExperimentConfig, optional): Configuration for the LLM, overriding defaults. + This class manages a dedicated asyncio event loop in a background thread, + reuses a single `AsyncOpenAI` client, and exposes a synchronous interface + for batch completions with timeouts and retries. - Raises: - ImportError: If required libraries are not installed. + Args: + api_url (Optional[str]): Base URL for the API endpoint. + model_id (Optional[str]): Identifier of the model to call. Must be set. + api_key (Optional[str]): API key/token for authentication. + max_concurrent_calls (int): Maximum number of concurrent API calls. + max_tokens (int): Default maximum number of tokens in model responses. + call_timeout_s (float): Per-call timeout in seconds. + gather_timeout_s (float): Timeout in seconds for the entire batch. + max_retries (int): Number of retry attempts per prompt in addition to the initial call. + retry_base_delay_s (float): Base delay in seconds for exponential backoff between retries. + client_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments passed to `AsyncOpenAI(...)`. 
+ call_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments passed to `client.chat.completions.create(...)`. """ - if not import_successful: - raise ImportError( - "Could not import at least one of the required libraries: openai, asyncio. " - "Please ensure they are installed in your environment." - ) + super().__init__(config=None) + + if not model_id: + raise ValueError("model_id must be set") self.api_url = api_url self.model_id = model_id self.api_key = api_key - self.max_concurrent_calls = max_concurrent_calls self.max_tokens = max_tokens + self.call_timeout_s = call_timeout_s + self.gather_timeout_s = gather_timeout_s + self.max_retries = max_retries + self.retry_base_delay_s = retry_base_delay_s + + # extra kwargs + self._client_kwargs: Dict[str, Any] = dict(client_kwargs or {}) + self._call_kwargs: Dict[str, Any] = dict(call_kwargs or {}) + + # --- persistent loop + semaphore --- + self._loop = asyncio.new_event_loop() + self._sem = asyncio.Semaphore(max_concurrent_calls) + + def _run_loop() -> None: + """Run the background event loop forever.""" + asyncio.set_event_loop(self._loop) + self._loop.run_forever() + + self._thread = threading.Thread(target=_run_loop, name="APILLMLoop", daemon=True) + self._thread.start() + + # Create client once; can still be customised via client_kwargs. + self.client = AsyncOpenAI( + base_url=self.api_url, + api_key=self.api_key, + timeout=self.call_timeout_s, + **self._client_kwargs, + ) + + # ---------- async bits that run inside the loop ---------- + async def _ainvoke_once(self, prompt: str, system_prompt: str) -> ChatCompletion: + """Perform a single API call with a per-call timeout. + + Args: + prompt (str): User prompt content. + system_prompt (str): System-level instructions for the model. + + Returns: + ChatCompletion: Raw completion response from the API. + + Raises: + asyncio.TimeoutError: If the call exceeds `call_timeout_s`. + Exception: Any exception raised by the underlying client call. + """ + messages = [ + {"role": "system", "content": str(system_prompt)}, + {"role": "user", "content": str(prompt)}, + ] + + # base kwargs; user can override via call_kwargs + kwargs: Dict[str, Any] = { + "model": self.model_id, + "messages": messages, + "max_tokens": self.max_tokens, + } + kwargs.update(self._call_kwargs) + + async with self._sem: + # per-call timeout enforces failure instead of hang + return await asyncio.wait_for( + self.client.chat.completions.create(**kwargs), + timeout=self.call_timeout_s, + ) + + async def _ainvoke_with_retries(self, prompt: str, system_prompt: str) -> str: + """Invoke the model with retries and exponential backoff. + + Args: + prompt (str): User prompt content. + system_prompt (str): System-level instructions for the model. - super().__init__(config=config) - self.client = AsyncOpenAI(base_url=self.api_url, api_key=self.api_key) - self.semaphore = asyncio.Semaphore(self.max_concurrent_calls) + Returns: + str: The message content of the first choice in the completion. + + Raises: + Exception: The last exception encountered after all retries are exhausted. 
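        Example:
            An illustrative note on the backoff schedule implied by the defaults:
            with max_retries=2 and retry_base_delay_s=0.5, a persistently failing
            prompt is attempted three times in total, sleeping 0.5 s after the first
            failure and 1.0 s after the second (retry_base_delay_s * 2**attempt)
            before the last exception is re-raised.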
+ """ + last_err: Optional[Exception] = None + for attempt in range(self.max_retries + 1): + try: + r = await self._ainvoke_once(prompt, system_prompt) + content = r.choices[0].message.content + if content is None: + raise RuntimeError("Empty content from model") + return content + except Exception as e: + last_err = e + if attempt < self.max_retries: + delay = self.retry_base_delay_s * (2**attempt) + logger.warning( + "meta LLM call failed (%d/%d): %s — retrying in %.2fs", + attempt + 1, + self.max_retries + 1, + e, + delay, + ) + await asyncio.sleep(delay) + assert last_err is not None + raise last_err + + async def _aget_batch(self, prompts: List[str], system_prompts: List[str]) -> List[str]: + """Execute a batch of prompts concurrently and collect responses. + + Args: + prompts (List[str]): List of user prompts. + system_prompts (List[str]): List of system prompts; must match `prompts` in length. + + Returns: + List[str]: List of model outputs. For failed entries, an empty string is inserted. + + Raises: + TimeoutError: If the entire batch exceeds `gather_timeout_s`. + RuntimeError: If any of the tasks fails; the first exception is propagated. + """ + tasks = [asyncio.create_task(self._ainvoke_with_retries(p, s)) for p, s in zip(prompts, system_prompts)] + + try: + results = await asyncio.wait_for( + asyncio.gather(*tasks, return_exceptions=True), + timeout=self.gather_timeout_s, + ) + except asyncio.TimeoutError: + for t in tasks: + t.cancel() + raise TimeoutError(f"Meta LLM batch timed out after {self.gather_timeout_s}s") + + outs: List[str] = [] + first_exc: Optional[BaseException] = None + for r in results: + if isinstance(r, BaseException): + if first_exc is None: + first_exc = r + outs.append("") + else: + outs.append(r) + + if first_exc: + for t in tasks: + if not t.done(): + t.cancel() + raise RuntimeError(f"Meta LLM batch failed: {first_exc}") from first_exc + + return outs + + # ---------- sync API used by the threads ---------- + def _submit(self, coro): + """Submit a coroutine to the background event loop. + + Args: + coro: Coroutine object to be scheduled on the loop. + + Returns: + concurrent.futures.Future: Future representing the coroutine result. + """ + return asyncio.run_coroutine_threadsafe(coro, self._loop) def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[str]: - # Setup for async execution in sync context + """Synchronously obtain responses for a batch of prompts. + + This is the main entrypoint used by external callers. It handles system + prompt broadcasting and delegates the actual work to the async batch + execution on the background loop. + + Args: + prompts (List[str]): List of user prompts. + system_prompts (List[str]): List of system prompts. If a single system + prompt is provided and multiple prompts are given, the system + prompt is broadcast to all prompts. Otherwise, the list is + normalized to match the length of `prompts`. + + Returns: + List[str]: List of model responses corresponding to `prompts`. + + Raises: + TimeoutError: If waiting on the batch future exceeds `gather_timeout_s + 5.0`. + Exception: Any underlying error from the async batch execution. 
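        Example:
            An illustrative sketch; the endpoint, model id and key are placeholders
            (the values mirror defaults used elsewhere in this series):

                llm = APILLM(
                    api_url="https://api.openai.com/v1",
                    model_id="gpt-4o-2024-08-06",
                    api_key="<API_KEY>",
                )
                # the single system prompt is broadcast to both user prompts
                answers = llm._get_response(
                    ["Summarize document A.", "Summarize document B."],
                    ["You are a concise assistant."],
                )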
+ """ + if len(system_prompts) == 1 and len(prompts) > 1: + system_prompts = system_prompts * len(prompts) + elif len(system_prompts) != len(prompts): + system_prompts = [system_prompts[0] if system_prompts else "You are a helpful assistant."] * len(prompts) + + fut = self._submit(self._aget_batch(prompts, system_prompts)) try: - loop = asyncio.get_running_loop() - except RuntimeError: # 'get_running_loop' raises a RuntimeError if there is no running loop - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - responses = loop.run_until_complete(self._get_response_async(prompts, system_prompts)) - return responses - - async def _get_response_async(self, prompts: List[str], system_prompts: List[str]) -> List[str]: - assert self.model_id is not None, "model_id must be set" - tasks = [ - _invoke_model(prompt, system_prompt, self.max_tokens, self.model_id, self.client, self.semaphore) - for prompt, system_prompt in zip(prompts, system_prompts) - ] - messages = await asyncio.gather(*tasks) - responses = [] - for message in messages: - response = message.choices[0].message.content - if response is None: - raise ValueError("Received None response from the API.") - responses.append(response) - return responses + r = fut.result(timeout=self.gather_timeout_s + 5.0) + logger.debug(f"🔥APILLM: obtained {len(r)} responses from model.") + return r + except FuturesTimeout: + fut.cancel() + raise TimeoutError(f"Meta LLM batch (future) timed out after {self.gather_timeout_s + 5.0}s") + except Exception: + raise From 32c93f6ae1830ef69f7c62290d8a00ba4581b6bb Mon Sep 17 00:00:00 2001 From: finitearth Date: Sat, 22 Nov 2025 13:01:46 +0100 Subject: [PATCH 06/43] bring --- promptolution/llms/api_llm.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index fa662e5..8216a6e 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional from promptolution.llms.base_llm import BaseLLM +from promptolution.utils.config import ExperimentConfig from promptolution.utils.logging import get_logger logger = get_logger(__name__) @@ -23,7 +24,7 @@ def __init__( api_url: Optional[str] = None, model_id: Optional[str] = None, api_key: Optional[str] = None, - max_concurrent_calls: int = 16, + max_concurrent_calls: int = 32, max_tokens: int = 512, call_timeout_s: float = 30.0, # per request gather_timeout_s: float = 120.0, # whole batch @@ -31,13 +32,10 @@ def __init__( retry_base_delay_s: float = 0.5, client_kwargs: Optional[Dict[str, Any]] = None, call_kwargs: Optional[Dict[str, Any]] = None, + config: Optional["ExperimentConfig"] = None, ) -> None: """Initialize the APILLM. - This class manages a dedicated asyncio event loop in a background thread, - reuses a single `AsyncOpenAI` client, and exposes a synchronous interface - for batch completions with timeouts and retries. - Args: api_url (Optional[str]): Base URL for the API endpoint. model_id (Optional[str]): Identifier of the model to call. Must be set. @@ -50,12 +48,8 @@ def __init__( retry_base_delay_s (float): Base delay in seconds for exponential backoff between retries. client_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments passed to `AsyncOpenAI(...)`. call_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments passed to `client.chat.completions.create(...)`. + config (Optional[ExperimentConfig]): Configuration for the LLM, overriding defaults. 
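        Example:
            An illustrative sketch; the endpoint, model id and key are placeholders,
            and temperature is shown only as a typical option forwarded through call_kwargs:

                llm = APILLM(
                    api_url="https://api.openai.com/v1",
                    model_id="gpt-4o-2024-08-06",
                    api_key="<API_KEY>",
                    max_concurrent_calls=8,
                    call_kwargs={"temperature": 0.0},
                )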
""" - super().__init__(config=None) - - if not model_id: - raise ValueError("model_id must be set") - self.api_url = api_url self.model_id = model_id self.api_key = api_key @@ -69,6 +63,8 @@ def __init__( self._client_kwargs: Dict[str, Any] = dict(client_kwargs or {}) self._call_kwargs: Dict[str, Any] = dict(call_kwargs or {}) + super().__init__(config=config) + # --- persistent loop + semaphore --- self._loop = asyncio.new_event_loop() self._sem = asyncio.Semaphore(max_concurrent_calls) @@ -237,15 +233,9 @@ def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[s TimeoutError: If waiting on the batch future exceeds `gather_timeout_s + 5.0`. Exception: Any underlying error from the async batch execution. """ - if len(system_prompts) == 1 and len(prompts) > 1: - system_prompts = system_prompts * len(prompts) - elif len(system_prompts) != len(prompts): - system_prompts = [system_prompts[0] if system_prompts else "You are a helpful assistant."] * len(prompts) - fut = self._submit(self._aget_batch(prompts, system_prompts)) try: r = fut.result(timeout=self.gather_timeout_s + 5.0) - logger.debug(f"🔥APILLM: obtained {len(r)} responses from model.") return r except FuturesTimeout: fut.cancel() From cef2370299f8297a25c3d934951f582a35b69aaa Mon Sep 17 00:00:00 2001 From: finitearth Date: Sat, 22 Nov 2025 13:01:46 +0100 Subject: [PATCH 07/43] bring --- .coverage | Bin 69632 -> 69632 bytes .../base_exemplar_selector.py | 6 +- .../random_search_selector.py | 7 +- .../exemplar_selectors/random_selector.py | 9 +- promptolution/helpers.py | 17 +-- promptolution/llms/api_llm.py | 25 ++-- promptolution/llms/base_llm.py | 4 +- promptolution/optimizers/__init__.py | 16 --- promptolution/optimizers/base_optimizer.py | 7 +- promptolution/optimizers/capo.py | 128 +++++++----------- promptolution/optimizers/evoprompt_de.py | 30 ++-- promptolution/optimizers/evoprompt_ga.py | 24 ++-- promptolution/optimizers/opro.py | 18 +-- promptolution/tasks/base_task.py | 26 ++-- promptolution/utils/__init__.py | 17 +++ promptolution/utils/prompt.py | 66 +++++++++ promptolution/utils/prompt_creation.py | 4 +- .../{optimizers => utils}/templates.py | 2 +- tests/optimizers/test_capo.py | 60 +++----- tests/optimizers/test_evoprompt_de.py | 7 +- tests/optimizers/test_evoprompt_ga.py | 9 +- tests/optimizers/test_opro.py | 7 +- tests/tasks/test_classifications_tasks.py | 15 +- tests/tasks/test_judge_task.py | 6 + tests/tasks/test_reward_tasks.py | 4 +- tests/utils/test_prompt.py | 41 ++++++ 26 files changed, 310 insertions(+), 245 deletions(-) create mode 100644 promptolution/utils/prompt.py rename promptolution/{optimizers => utils}/templates.py (99%) create mode 100644 tests/utils/test_prompt.py diff --git a/.coverage b/.coverage index 7be2815a8d6d0f0a80d2682dcfd6757b2dc53084..a9a49faea208e2dfd1d036d377be010141138f38 100644 GIT binary patch delta 4360 zcmeHKdsLKV8h_t!zW1A%?>aNw21FR->Tm~zv=V{14WKe1mjM|O&{ULJqbyq;sNHk+ z5-GhUbUEs_YRaambYinqY`0@;wszdwb`8zWQIw1{v{Oq8`+Of`YNz(E+P|7J=X}rQ zeV^y~-ucb@{GPTgY}*!gkKgQKUfL_wNUXR`EEYzDXN3&@6yL_j8vklsX;cjR3}yPu z`YwGI_Zhd5n?X*J)g*>}hizd4bT8|wbq3~-ObR}L%g`9w+1Xjl-5bxM_IQfV&!A6a z^Kx@bLw&Qmxp_sB=ixFtEz5S2sj#4$rsikUJ9GW$wQN5eM@>1asFG6+nvtH(F+&Q} zUhG>FPNG4J)U++R0*L}GQA;qCYbBAOm1^3Ci?LkQ6i}!8R4aj`u86J{5oc0Qu}tGH z1u)e%8kXlG;aW(w%AG?3m4@_OeHfSuAWI1i$+zIi^o!y=s^klFZN5xG!FQ+T8^{;Q zB+w=*P&d~NiA-4tyqcEe$+^V%;Kch@1k+C(d0?Oe1roN>-KF!S7SE~{sLCA#A#$}> zN-h+{rUzB~3QwzhNj;ra=oT8AU{OtgmKB#4(7Mk|%mRNXshFA^7OvcHqJq)_Wnv$! 
zuD5_qtv{PWNIJjBOs6;sGUP7vN(Bnbqr+|svq+|E9SvB5 zvMZQMQM2zV(wm!Bc$({5o9>6zRRY9;1BRO38h>soIu{y8J<1ZUi^O(z zmaye40I>7iOcj9AtpJRt^U9pWzMKX8Ma<9f;m?AnQpLH7%1)OLeh)wbmFIGV1!FpCc52H_}*%bQ4KH47)A=G$=5%EB234ut<>oYC1%8`mS5Y7ifD%enSYdy9dbmXX7Eyj$r4VKU zOs*SX(qpqPK(Of+AepMz_H=l$LfYaGiPFHf7irqtjvy>Uk@+;H*i0mCRqAk|x%C!U ziP{;0RYesRprSg=Q{FzK$CoX#(fJ}n>SOa zLCPn}apj1zRe3`BmEuy0lvzrwGD#knzmf;#eR7Zdh}Tbn(=8#?9fJRoDPcr^b>P8 zFSASoF=m>>yJk(+sek-)cEzWTLj~Kjwnts>W+x|t>8`|O zL*Xxmzf-dA*n?@;FFaY4wQuD3$ddo8UhbCw?zn`WYV8bIgy3f}$tCe%4U4zGe)irc z$Gg*J@OIDx?T(e{n?i-DAX}&QJenPjFdwHn;+%(V$f}P8*%W)!#>Idj$As9U;aL$) z_dg!PMSF} z9w8RBeL0TM!SQ7a#)sqOaPWkMFKaS+UOzy#C~}yphdHBj&s@bDUgrWQgPt_mDd-T} z?0+p@*R~Gz-Fvu_3kACzYLO*W^!G9SlexM6_mOeW^Hu1fx z|MZ)sYqPvKbl})MhYLK444Z87s({DF^r(NlLg6Gen#6~Q@XQeHkR7XVF(DgbX4`DD zO$@^e;0qKMKaSqW3gAIb;GKDIb{*+| z`b`|CXT8|y)US%S8cc17?MJ)Z?eRLp*2=@)=i_y+oZ-`KaXYrWr27*waIn|HS-$4| z2}pM0+$*-B*N_39#%+%fF@go_J?#Dms>j38na%H>{3T!;YMrPWLCu)c!4g@w%EZ8P z3S)l^MgJMmf%S2^Ou3!vH!Nr6-^lH~W|JV(^_z$>GgHlpGctPDZ@NFV)`bGF6ER!y zK_>O5Xo=7wcl>lMkwkM{OqdoQ+9HWE^y1aNh8TN$do-x+(a?+fw9X*qDL+n|B%u(q zLQDChwMu@XRSKEdLe<6u6zpcJB>qAsBn119)h>lAsPX&SrLu(asCjO0mk2RIT2tDoQ#m?NDh06rF%PzSekmS2DtXTvRSm{ zoLS{U8MNbE6`l^Q{0_P&kE8FL6KL|Fgs0KF2kUV<@Q7NH=4RWowIA=)pou!pn{g_z zo;d~|n@ORj3snpdm=%AMS>2#E#CR+5q1$RJeu(B^ z?k1~r+a$Pp;?Yi|ys9i#6!|rIk^J)<_wyY0|DEH;UcjNEa1<$f6qkHW-Xs^o-SK;A zs(3cB`R__IMEU%mn(`BabN(f=*PlE~ahN|5V+`@Di zCG)hu;Uj~QOifzXDGAq-RIp}))y0*kfS5?@&(C9Ole;dqYISF8i(Bi>dsP|1z{{_yPX!rmC delta 4049 zcmeHKdr(x@89(RlJ@@W??=I}JAdlrC@)BVYU#;RokcbGnD9cO1C@>Mx)i|b(Nr1HZ zqm2gfj1tp~P_3E)j4?DZ^${_TCZ=s3nzU0hBAu$Jj2Mk3jt{oqy%5Jb&GaApUl?ZZ z_d4JAopXQZe&5&B!8UcUZv@SpLzHf1fqX~aEf-2xrLEFr@vOK(j1k@!Y6X*dr`f|_ z zj0L+nqjpgw4!Br@E6R~@EcMTxOgk<|Fw0`-ft(r=4eCrI939GWl2rftau;3wSOt!v ze|@Z+MCz+Y)Qc}k)Sqi65g^Jl95&68=!sk#4yV@Hxm4#;W~pQh=zlh<55Zxh0XeF# z@1LdM&@G0n&K2c|IF4EeX3&>& zLI6f@6xZMw+Avs#qp5#E1#y+L&65pFI8^G)RY{bwcJ<&irxgVqNk{D<6>0^w&;O$ z&hh|Kv_)+5bi)i>S|7xUTE;eK8k*FhAgqAv0v$2{p4KZR)H76q#gSx(Y*+x-K6+r0 z4h$RSA*srqCl*h_G|eNi00~u330vQ+W?d^Yr42~?r}mllk+xsku5H#TwPGz%b83qE zoqAdQRQ;3smfEbYQESygb*dVznw1;Mm&%9AQDwKXRe3?FS1OfEMUl_SJ#v@4Q{E>3 zT7Fh;kW1xBa;&ULx1>So3#muyfREaS9;-!=8l9Oe7TlSIU+w;Cj<0QT#?ji_w==fa zpZtmuCW0n-;>n`+<<<-!<1$XR@8~P~{>wwt-zwY??gr5`ck0bkD~tDjMqe4X;=;GD z&JEn&I{a4ux+rJ5?lpbevg;>4I=B7Y((AM1+n2fNicg4?1{!Bt{+_As?I!z#M&ME= zB;EUHZ&rR`R{Y+mv8SR2SX(NH<5TkkQ|>MOzLH6Jg8n9fA(-<3mC~mKNd-Nt+qq(JTT`|~r*xj@vjxAs@v zIaXiY_+6dxzc*{%*`mQaLVtg4yg1`|lV*W6F3TGGFNb;58z|8@4FswthND~GsP7pxg71ks>&6Xy__M&}N&B-Qnx9@o4tI4K6^owp+!fPGfrmj4~ zo59p#w%u~`K-BW5ldAn?)yMazbp#+5dVAG|c`3brzh~S3n!Ue$zYh(rFULm?ywdvh z>l5A~Jf|<_u4DRAivx}yxK#FZ_U^m-o8+2?k?U3}{jep=l&Q8*eseus?FhA}Wye$L zm0ie=cOm8`qcP9_A8HS4>O!|D6ab^Mc~n)1#BmpyNZqYoiAbzaho1({OBW4lBNd56 z3w7w==ttcCXe*Oxj8I2c|6>xgP(O4ukG?*UU}+m(KUzc{s8PfwZ9_*#H^$MVIr1Z@ zS6Uqt@?-UiNN_KBL)$mR)0QFg|EgYnJ8^atNk`hNnnyjaZc-;Ieaf$t@$#qgD%mEz zBY7o3>=CO)NjNOj3wHAd=9Oj(|0Z9_-R0W3>Er_0MA8qm4ziSwVzC=a=!n793zoLi3RZDCJ#e84r$IllkbW>wMvn~; zl4?|M9YI9{mbNP@7CEC)7!Ov3^3o;#RnYXMJlwwE$cLH|*QNFcL5&j){YZQC?&a6- zw?}$Y^Kh^6A8i4V9%>q_15@bdL-zl6;Mn|Q?bgGcOO-`|FF-QtMB3Y0t){7eRG(0P z8ee`IUmoT7GQ1BvMY#cKomzo9q`s_9QNC8zD>3p3xdAH8e3>cFgmF?IIXOE1ytOS% z#xmoZ2vgFC4DL0vFayS^A6VmJtHcdsQtrs$2{)UmNN?Mu;7XDPVvjDKO#>IK%z_E` g$IGID5T-I!AK(_@@%qEZ8<%=P$^#}Ae)M+#8>;Cu5&!@I diff --git a/promptolution/exemplar_selectors/base_exemplar_selector.py b/promptolution/exemplar_selectors/base_exemplar_selector.py index bb2ee21..5d77647 100644 --- a/promptolution/exemplar_selectors/base_exemplar_selector.py +++ b/promptolution/exemplar_selectors/base_exemplar_selector.py @@ -5,6 
+5,8 @@ from typing import TYPE_CHECKING, Optional +from promptolution.utils.prompt import Prompt + if TYPE_CHECKING: # pragma: no cover from promptolution.predictors.base_predictor import BasePredictor from promptolution.tasks.base_task import BaseTask @@ -33,11 +35,11 @@ def __init__(self, task: "BaseTask", predictor: "BasePredictor", config: Optiona config.apply_to(self) @abstractmethod - def select_exemplars(self, prompt: str, n_examples: int = 5) -> str: + def select_exemplars(self, prompt: Prompt, n_examples: int = 5) -> Prompt: """Select exemplars based on the given prompt. Args: - prompt (str): The input prompt to base the exemplar selection on. + prompt (Prompt): The input prompt to base the exemplar selection on. n_examples (int, optional): The number of exemplars to select. Defaults to 5. Returns: diff --git a/promptolution/exemplar_selectors/random_search_selector.py b/promptolution/exemplar_selectors/random_search_selector.py index 7a88b08..b8cb6ee 100644 --- a/promptolution/exemplar_selectors/random_search_selector.py +++ b/promptolution/exemplar_selectors/random_search_selector.py @@ -1,6 +1,7 @@ """Random search exemplar selector.""" from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector +from promptolution.utils.prompt import Prompt class RandomSearchSelector(BaseExemplarSelector): @@ -10,7 +11,7 @@ class RandomSearchSelector(BaseExemplarSelector): evaluates their performance, and selects the best performing set. """ - def select_exemplars(self, prompt: str, n_trials: int = 5) -> str: + def select_exemplars(self, prompt: Prompt, n_trials: int = 5) -> Prompt: """Select exemplars using a random search strategy. This method generates multiple sets of random examples, evaluates their performance @@ -21,7 +22,7 @@ def select_exemplars(self, prompt: str, n_trials: int = 5) -> str: n_trials (int, optional): The number of random trials to perform. Defaults to 5. Returns: - str: The best performing prompt, which includes the original prompt and the selected exemplars. + Prompt: The best performing prompt, which includes the original prompt and the selected exemplars. 
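        Example:
            An illustrative sketch; `task` and `predictor` stand in for an existing
            BaseTask/BasePredictor pair:

                selector = RandomSearchSelector(task, predictor)
                best = selector.select_exemplars(Prompt("Classify the sentiment of the review."), n_trials=3)
                print(best.construct_prompt())  # instruction plus any selected exemplar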
""" best_score = 0.0 best_prompt = prompt @@ -30,7 +31,7 @@ def select_exemplars(self, prompt: str, n_trials: int = 5) -> str: _, seq = self.task.evaluate( prompt, self.predictor, eval_strategy="subsample", return_seq=True, return_agg_scores=False ) - prompt_with_examples = "\n\n".join([prompt] + [seq[0][0]]) + "\n\n" + prompt_with_examples = Prompt(prompt.instruction, [seq[0][0]]) # evaluate prompts as few shot prompt score = self.task.evaluate(prompt_with_examples, self.predictor, eval_strategy="subsample")[0] if score > best_score: diff --git a/promptolution/exemplar_selectors/random_selector.py b/promptolution/exemplar_selectors/random_selector.py index a6a4b72..7b0ae0f 100644 --- a/promptolution/exemplar_selectors/random_selector.py +++ b/promptolution/exemplar_selectors/random_selector.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, List, Optional from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector +from promptolution.utils.prompt import Prompt if TYPE_CHECKING: # pragma: no cover from promptolution.predictors.base_predictor import BasePredictor @@ -37,18 +38,18 @@ def __init__( self.desired_score = desired_score super().__init__(task, predictor, config) - def select_exemplars(self, prompt: str, n_examples: int = 5) -> str: + def select_exemplars(self, prompt: Prompt, n_examples: int = 5) -> Prompt: """Select exemplars using a random selection strategy. This method generates random examples and selects those that are evaluated as correct (score == self.desired_score) until the desired number of exemplars is reached. Args: - prompt (str): The input prompt to base the exemplar selection on. + prompt (Prompt): The input prompt to base the exemplar selection on. n_examples (int, optional): The number of exemplars to select. Defaults to 5. Returns: - str: A new prompt that includes the original prompt and the selected exemplars. + Prompt: A new prompt that includes the original prompt and the selected exemplars. 
""" examples: List[str] = [] while len(examples) < n_examples: @@ -59,4 +60,4 @@ def select_exemplars(self, prompt: str, n_examples: int = 5) -> str: seq = seqs[0][0] if score == self.desired_score: examples.append(seq) - return "\n\n".join([prompt] + examples) + "\n\n" + return Prompt(prompt.instruction, examples) diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 2594609..a39d7df 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -5,6 +5,7 @@ from promptolution.tasks.judge_tasks import JudgeTask from promptolution.tasks.reward_tasks import RewardTask +from promptolution.utils.prompt import Prompt if TYPE_CHECKING: # pragma: no cover from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector @@ -28,7 +29,10 @@ from promptolution.optimizers.evoprompt_de import EvoPromptDE from promptolution.optimizers.evoprompt_ga import EvoPromptGA from promptolution.optimizers.opro import OPRO -from promptolution.optimizers.templates import ( +from promptolution.predictors.classifier import FirstOccurrenceClassifier, MarkerBasedClassifier +from promptolution.tasks.classification_tasks import ClassificationTask +from promptolution.utils.logging import get_logger +from promptolution.utils.templates import ( CAPO_CROSSOVER_TEMPLATE, CAPO_MUTATION_TEMPLATE, EVOPROMPT_DE_TEMPLATE, @@ -38,9 +42,6 @@ OPRO_TEMPLATE, OPRO_TEMPLATE_TD, ) -from promptolution.predictors.classifier import FirstOccurrenceClassifier, MarkerBasedClassifier -from promptolution.tasks.classification_tasks import ClassificationTask -from promptolution.utils.logging import get_logger logger = get_logger(__name__) @@ -59,12 +60,13 @@ def run_experiment(df: pd.DataFrame, config: "ExperimentConfig") -> pd.DataFrame train_df = df.sample(frac=0.8, random_state=42) test_df = df.drop(train_df.index) prompts = run_optimization(train_df, config) - df_prompt_scores = run_evaluation(test_df, config, prompts) + prompts_str = [p.construct_prompt() for p in prompts] + df_prompt_scores = run_evaluation(test_df, config, prompts_str) return df_prompt_scores -def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[str]: +def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[Prompt]: """Run the optimization phase of the experiment. Configures all LLMs (downstream, meta, and judge) to use @@ -74,7 +76,7 @@ def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[str]: config (Config): Configuration object for the experiment. Returns: - List[str]: The optimized list of prompts. + List[Prompt]: The optimized list of prompts. 
""" llm = get_llm(config=config) predictor = get_predictor(llm, config=config) @@ -97,7 +99,6 @@ def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[str]: if hasattr(config, "prepend_exemplars") and config.prepend_exemplars: selector = get_exemplar_selector(config.exemplar_selector, task, predictor) prompts = [selector.select_exemplars(p, n_examples=config.n_exemplars) for p in prompts] - return prompts diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index fa662e5..e3fa699 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional from promptolution.llms.base_llm import BaseLLM +from promptolution.utils.config import ExperimentConfig from promptolution.utils.logging import get_logger logger = get_logger(__name__) @@ -23,7 +24,7 @@ def __init__( api_url: Optional[str] = None, model_id: Optional[str] = None, api_key: Optional[str] = None, - max_concurrent_calls: int = 16, + max_concurrent_calls: int = 32, max_tokens: int = 512, call_timeout_s: float = 30.0, # per request gather_timeout_s: float = 120.0, # whole batch @@ -31,13 +32,10 @@ def __init__( retry_base_delay_s: float = 0.5, client_kwargs: Optional[Dict[str, Any]] = None, call_kwargs: Optional[Dict[str, Any]] = None, + config: Optional["ExperimentConfig"] = None, ) -> None: """Initialize the APILLM. - This class manages a dedicated asyncio event loop in a background thread, - reuses a single `AsyncOpenAI` client, and exposes a synchronous interface - for batch completions with timeouts and retries. - Args: api_url (Optional[str]): Base URL for the API endpoint. model_id (Optional[str]): Identifier of the model to call. Must be set. @@ -50,12 +48,8 @@ def __init__( retry_base_delay_s (float): Base delay in seconds for exponential backoff between retries. client_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments passed to `AsyncOpenAI(...)`. call_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments passed to `client.chat.completions.create(...)`. + config (Optional[ExperimentConfig]): Configuration for the LLM, overriding defaults. """ - super().__init__(config=None) - - if not model_id: - raise ValueError("model_id must be set") - self.api_url = api_url self.model_id = model_id self.api_key = api_key @@ -69,9 +63,12 @@ def __init__( self._client_kwargs: Dict[str, Any] = dict(client_kwargs or {}) self._call_kwargs: Dict[str, Any] = dict(call_kwargs or {}) + self.max_concurrent_calls = max_concurrent_calls + super().__init__(config=config) + # --- persistent loop + semaphore --- self._loop = asyncio.new_event_loop() - self._sem = asyncio.Semaphore(max_concurrent_calls) + self._sem = asyncio.Semaphore(self.max_concurrent_calls) def _run_loop() -> None: """Run the background event loop forever.""" @@ -237,15 +234,9 @@ def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[s TimeoutError: If waiting on the batch future exceeds `gather_timeout_s + 5.0`. Exception: Any underlying error from the async batch execution. 
""" - if len(system_prompts) == 1 and len(prompts) > 1: - system_prompts = system_prompts * len(prompts) - elif len(system_prompts) != len(prompts): - system_prompts = [system_prompts[0] if system_prompts else "You are a helpful assistant."] * len(prompts) - fut = self._submit(self._aget_batch(prompts, system_prompts)) try: r = fut.result(timeout=self.gather_timeout_s + 5.0) - logger.debug(f"🔥APILLM: obtained {len(r)} responses from model.") return r except FuturesTimeout: fut.cancel() diff --git a/promptolution/llms/base_llm.py b/promptolution/llms/base_llm.py index 2fe43f9..2007a10 100644 --- a/promptolution/llms/base_llm.py +++ b/promptolution/llms/base_llm.py @@ -9,8 +9,8 @@ from promptolution.utils.config import ExperimentConfig from transformers import PreTrainedTokenizer -from promptolution.optimizers.templates import DEFAULT_SYS_PROMPT from promptolution.utils.logging import get_logger +from promptolution.utils.templates import DEFAULT_SYS_PROMPT logger = get_logger(__name__) @@ -42,7 +42,7 @@ def __init__(self, config: Optional["ExperimentConfig"] = None): # Initialize token counters self.input_token_count = 0 self.output_token_count = 0 - self.tokenizer: Optional[PreTrainedTokenizer] = None + self.tokenizer: Optional["PreTrainedTokenizer"] = None def get_token_count(self) -> Dict[str, int]: """Get the current count of input and output tokens. diff --git a/promptolution/optimizers/__init__.py b/promptolution/optimizers/__init__.py index 47f78a3..9f82a8f 100644 --- a/promptolution/optimizers/__init__.py +++ b/promptolution/optimizers/__init__.py @@ -5,19 +5,3 @@ from promptolution.optimizers.evoprompt_de import EvoPromptDE from promptolution.optimizers.evoprompt_ga import EvoPromptGA from promptolution.optimizers.opro import OPRO -from promptolution.optimizers.templates import ( - CAPO_CROSSOVER_TEMPLATE, - CAPO_DOWNSTREAM_TEMPLATE, - CAPO_FEWSHOT_TEMPLATE, - CAPO_MUTATION_TEMPLATE, - DEFAULT_SYS_PROMPT, - EVOPROMPT_DE_TEMPLATE, - EVOPROMPT_DE_TEMPLATE_TD, - EVOPROMPT_GA_TEMPLATE, - EVOPROMPT_GA_TEMPLATE_TD, - OPRO_TEMPLATE, - OPRO_TEMPLATE_TD, - PROMPT_CREATION_TEMPLATE, - PROMPT_CREATION_TEMPLATE_TD, - PROMPT_VARIATION_TEMPLATE, -) diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py index 21b4ade..a9d6ea9 100644 --- a/promptolution/optimizers/base_optimizer.py +++ b/promptolution/optimizers/base_optimizer.py @@ -12,6 +12,7 @@ from promptolution.utils.callbacks import BaseCallback from promptolution.utils.logging import get_logger +from promptolution.utils.prompt import Prompt logger = get_logger(__name__) @@ -49,7 +50,7 @@ def __init__( config (ExperimentConfig, optional): Configuration for the optimizer, overriding defaults. """ # Set up optimizer state - self.prompts: List[str] = initial_prompts or [] + self.prompts: List[Prompt] = [Prompt(p) for p in initial_prompts] if initial_prompts else [] self.task = task self.callbacks: List["BaseCallback"] = callbacks or [] self.predictor = predictor @@ -60,7 +61,7 @@ def __init__( self.config = config - def optimize(self, n_steps: int) -> List[str]: + def optimize(self, n_steps: int) -> List[Prompt]: """Perform the optimization process. This method should be implemented by concrete optimizer classes to define @@ -104,7 +105,7 @@ def _pre_optimization_loop(self) -> None: pass @abstractmethod - def _step(self) -> List[str]: + def _step(self) -> List[Prompt]: """Perform a single optimization step. 
This method should be implemented by concrete optimizer classes to define diff --git a/promptolution/optimizers/capo.py b/promptolution/optimizers/capo.py index 6b1a14b..a4556c1 100644 --- a/promptolution/optimizers/capo.py +++ b/promptolution/optimizers/capo.py @@ -19,52 +19,15 @@ from promptolution.utils.test_statistics import TestStatistics from promptolution.optimizers.base_optimizer import BaseOptimizer -from promptolution.optimizers.templates import ( - CAPO_CROSSOVER_TEMPLATE, - CAPO_DOWNSTREAM_TEMPLATE, - CAPO_FEWSHOT_TEMPLATE, - CAPO_MUTATION_TEMPLATE, -) from promptolution.utils.logging import get_logger +from promptolution.utils.prompt import Prompt, sort_prompts_by_scores +from promptolution.utils.templates import CAPO_CROSSOVER_TEMPLATE, CAPO_FEWSHOT_TEMPLATE, CAPO_MUTATION_TEMPLATE from promptolution.utils.test_statistics import get_test_statistic_func from promptolution.utils.token_counter import get_token_counter logger = get_logger(__name__) -class CAPOPrompt: - """Represents a prompt consisting of an instruction and few-shot examples.""" - - def __init__(self, instruction_text: str, few_shots: List[str]) -> None: - """Initializes the Prompt with an instruction and associated examples. - - Args: - instruction_text (str): The instruction or prompt text. - few_shots (List[str]): List of examples as string. - """ - self.instruction_text = instruction_text.strip() - self.few_shots = few_shots - - def construct_prompt(self) -> str: - """Constructs the full prompt string by replacing placeholders in the template with the instruction and formatted examples. - - Returns: - str: The constructed prompt string. - """ - few_shot_str = "\n\n".join(self.few_shots).strip() - prompt = ( - CAPO_DOWNSTREAM_TEMPLATE.replace("", self.instruction_text) - .replace("", few_shot_str) - .replace("\n\n\n\n", "\n\n") # replace extra newlines if no few shots are provided - .strip() - ) - return prompt - - def __str__(self) -> str: - """Returns the string representation of the prompt.""" - return self.construct_prompt() - - class CAPO(BaseOptimizer): """CAPO: Cost-Aware Prompt Optimization. @@ -136,8 +99,8 @@ def __init__( self.check_fs_accuracy = check_fs_accuracy self.create_fs_reasoning = create_fs_reasoning - self.scores: List[float] = [] super().__init__(predictor, task, initial_prompts, callbacks, config) + self.df_few_shots = df_few_shots if df_few_shots is not None else task.pop_datapoints(frac=0.1) if self.max_n_blocks_eval > self.task.n_blocks: logger.warning( @@ -154,7 +117,7 @@ def __init__( self.target_begin_marker = "" self.target_end_marker = "" - def _initialize_population(self, initial_prompts: List[str]) -> List[CAPOPrompt]: + def _initialize_population(self, initial_prompts: List[Prompt]) -> List[Prompt]: """Initializes the population of Prompt objects from initial instructions. Args: @@ -164,10 +127,10 @@ def _initialize_population(self, initial_prompts: List[str]) -> List[CAPOPrompt] List[Prompt]: Initialized population of prompts with few-shot examples. 
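        Example:
            An illustrative sketch of the returned objects; the instructions and the
            example string are placeholders (real few shots are rendered from
            CAPO_FEWSHOT_TEMPLATE):

                population = [
                    Prompt("Classify the news article.", ["Input: <article>\nOutput: Sports"]),
                    Prompt("Label the topic of the article.", []),  # zero few shots is possible
                ]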
""" population = [] - for instruction_text in initial_prompts: + for prompt in initial_prompts: num_examples = random.randint(0, self.upper_shots) - few_shots = self._create_few_shot_examples(instruction_text, num_examples) - population.append(CAPOPrompt(instruction_text, few_shots)) + few_shots = self._create_few_shot_examples(prompt.instruction, num_examples) + population.append(Prompt(prompt.instruction, few_shots)) return population @@ -209,11 +172,11 @@ def _create_few_shot_examples(self, instruction: str, num_examples: int) -> List return few_shots - def _crossover(self, parents: List[CAPOPrompt]) -> List[CAPOPrompt]: + def _crossover(self, parents: List[Prompt]) -> List[Prompt]: """Performs crossover among parent prompts to generate offsprings. Args: - parents (List[CAPOPrompt]): List of parent prompts. + parents (List[Prompt]): List of parent prompts. Returns: List[Prompt]: List of new offsprings after crossover. @@ -223,8 +186,8 @@ def _crossover(self, parents: List[CAPOPrompt]) -> List[CAPOPrompt]: for _ in range(self.crossovers_per_iter): mother, father = random.sample(parents, 2) crossover_prompt = ( - self.crossover_template.replace("", mother.instruction_text) - .replace("", father.instruction_text) + self.crossover_template.replace("", mother.instruction) + .replace("", father.instruction) .strip() ) # collect all crossover prompts then pass them bundled to the meta llm (speedup) @@ -239,22 +202,22 @@ def _crossover(self, parents: List[CAPOPrompt]) -> List[CAPOPrompt]: offsprings = [] for instruction, examples in zip(child_instructions, offspring_few_shots): instruction = extract_from_tag(instruction, "", "") - offsprings.append(CAPOPrompt(instruction, examples)) + offsprings.append(Prompt(instruction, examples)) return offsprings - def _mutate(self, offsprings: List[CAPOPrompt]) -> List[CAPOPrompt]: + def _mutate(self, offsprings: List[Prompt]) -> List[Prompt]: """Apply mutation to offsprings to generate new candidate prompts. Args: - offsprings (List[CAPOPrompt]): List of offsprings to mutate. + offsprings (List[Prompt]): List of offsprings to mutate. Returns: List[Prompt]: List of mutated prompts. """ # collect all mutation prompts then pass them bundled to the meta llm (speedup) mutation_prompts = [ - self.mutation_template.replace("", prompt.instruction_text) for prompt in offsprings + self.mutation_template.replace("", prompt.instruction) for prompt in offsprings ] new_instructions = self.meta_llm.get_response(mutation_prompts) @@ -273,15 +236,15 @@ def _mutate(self, offsprings: List[CAPOPrompt]) -> List[CAPOPrompt]: new_few_shots = prompt.few_shots random.shuffle(new_few_shots) - mutated.append(CAPOPrompt(new_instruction, new_few_shots)) + mutated.append(Prompt(new_instruction, new_few_shots)) return mutated - def _do_racing(self, candidates: List[CAPOPrompt], k: int) -> List[CAPOPrompt]: + def _do_racing(self, candidates: List[Prompt], k: int) -> Tuple[List[Prompt], List[float]]: """Perform the racing (selection) phase by comparing candidates based on their evaluation scores using the provided test statistic. Args: - candidates (List[CAPOPrompt]): List of candidate prompts. + candidates (List[Prompt]): List of candidate prompts. k (int): Number of survivors to retain. 
Returns: @@ -292,9 +255,7 @@ def _do_racing(self, candidates: List[CAPOPrompt], k: int) -> List[CAPOPrompt]: i = 0 while len(candidates) > k and i < self.max_n_blocks_eval: # new_scores shape: (n_candidates, n_samples) - new_scores: List[float] = self.task.evaluate( - [c.construct_prompt() for c in candidates], self.predictor, return_agg_scores=False - ) + new_scores: List[float] = self.task.evaluate(candidates, self.predictor, return_agg_scores=False) # subtract length penalty prompt_lengths = np.array([self.token_counter(c.construct_prompt()) for c in candidates]) @@ -315,40 +276,51 @@ def _do_racing(self, candidates: List[CAPOPrompt], k: int) -> List[CAPOPrompt]: # Sum along rows to get number of better scores for each candidate n_better = np.sum(comparison_matrix, axis=1) - # Create mask for survivors and filter candidates - survivor_mask = n_better < k - candidates = list(compress(candidates, survivor_mask)) - block_scores = [list(compress(bs, survivor_mask)) for bs in block_scores] + candidates, block_scores = filter_survivors(candidates, block_scores, mask=n_better < k) i += 1 self.task.increment_block_idx() - avg_scores = self.task.evaluate( - [c.construct_prompt() for c in candidates], self.predictor, eval_strategy="evaluated" - ) - order = np.argsort(-np.array(avg_scores))[:k] - candidates = [candidates[i] for i in order] - self.scores = [avg_scores[i] for i in order] + avg_scores = self.task.evaluate(candidates, self.predictor, eval_strategy="evaluated") + prompts, avg_scores = sort_prompts_by_scores(candidates, avg_scores, top_k=k) - return candidates + return prompts, avg_scores def _pre_optimization_loop(self) -> None: - self.prompt_objects = self._initialize_population(self.prompts) - self.prompts = [p.construct_prompt() for p in self.prompt_objects] - self.max_prompt_length = max(self.token_counter(p) for p in self.prompts) if self.prompts else 1 + self.prompts = self._initialize_population(self.prompts) + self.max_prompt_length = ( + max(self.token_counter(p.construct_prompt()) for p in self.prompts) if self.prompts else 1 + ) self.task.reset_block_idx() - def _step(self) -> List[str]: + def _step(self) -> List[Prompt]: """Perform a single optimization step. Returns: - List[str]: The optimized list of prompts after the step. + List[Prompt]: The optimized list of prompts after the step. """ - offsprings = self._crossover(self.prompt_objects) + offsprings = self._crossover(self.prompts) mutated = self._mutate(offsprings) - combined = self.prompt_objects + mutated + combined = self.prompts + mutated - self.prompt_objects = self._do_racing(combined, self.population_size) - self.prompts = [p.construct_prompt() for p in self.prompt_objects] + self.prompts, self.scores = self._do_racing(combined, self.population_size) return self.prompts + + +def filter_survivors( + candidates: List[Prompt], scores: List[List[float]], mask: Any +) -> Tuple[List[Prompt], List[List[float]]]: + """Filter candidates and scores based on a boolean mask. + + Args: + candidates (List[Prompt]): List of candidate prompts. + scores (List[List[float]]): Corresponding scores for the candidates. + mask (Any): Boolean mask indicating which candidates to keep. + + Returns: + Tuple[List[Prompt], List[List[float]]]: Filtered candidates and their scores. 
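    Example:
        An illustrative sketch with placeholder prompts and scores:

            kept, kept_scores = filter_survivors(
                [Prompt("a"), Prompt("b"), Prompt("c")],
                [[0.9, 0.8], [0.1, 0.2], [0.7, 0.6]],
                mask=[True, False, True],
            )
            # kept -> the first and third prompt; kept_scores -> [[0.9, 0.8], [0.7, 0.6]]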
+ """ + filtered_candidates = list(compress(candidates, mask)) + filtered_scores = list(compress(scores, mask)) + return filtered_candidates, filtered_scores diff --git a/promptolution/optimizers/evoprompt_de.py b/promptolution/optimizers/evoprompt_de.py index 0412d5d..05d8336 100644 --- a/promptolution/optimizers/evoprompt_de.py +++ b/promptolution/optimizers/evoprompt_de.py @@ -1,12 +1,13 @@ """Module for EvoPromptDE optimizer.""" -import numpy as np +import random -from typing import TYPE_CHECKING, Any, List, Optional +from typing import TYPE_CHECKING, List, Optional from promptolution.optimizers.base_optimizer import BaseOptimizer from promptolution.utils.formatting import extract_from_tag +from promptolution.utils.prompt import Prompt, sort_prompts_by_scores if TYPE_CHECKING: # pragma: no cover from promptolution.llms.base_llm import BaseLLM @@ -59,10 +60,9 @@ def __init__( def _pre_optimization_loop(self) -> None: self.scores = self.task.evaluate(self.prompts, self.predictor, return_agg_scores=True) - self.prompts = [prompt for _, prompt in sorted(zip(self.scores, self.prompts), reverse=True)] - self.scores = sorted(self.scores, reverse=True) + self.prompts, self.scores = sort_prompts_by_scores(self.prompts, self.scores) - def _step(self) -> List[str]: + def _step(self) -> List[Prompt]: """Perform the optimization process for a specified number of steps. This method iteratively improves the prompts using a differential evolution strategy. @@ -71,7 +71,7 @@ def _step(self) -> List[str]: Returns: - List[str]: The optimized list of prompts after all steps. + List[Prompt]: The optimized list of prompts after all steps. """ cur_best = self.prompts[0] meta_prompts = [] @@ -80,22 +80,23 @@ def _step(self) -> List[str]: old_prompt = self.prompts[i] candidates = [prompt for prompt in self.prompts if prompt != old_prompt] - a, b, c = np.random.choice(candidates, size=3, replace=False) + a, b, c = random.sample(candidates, k=3) if not self.donor_random: c = cur_best meta_prompt = ( - self.prompt_template.replace("", old_prompt) - .replace("", a) - .replace("", b) - .replace("", c) + self.prompt_template.replace("", old_prompt.construct_prompt()) + .replace("", a.construct_prompt()) + .replace("", b.construct_prompt()) + .replace("", c.construct_prompt()) ) meta_prompts.append(meta_prompt) - child_prompts = self.meta_llm.get_response(meta_prompts) - child_prompts = extract_from_tag(child_prompts, "", "") + child_instructions = self.meta_llm.get_response(meta_prompts) + child_instructions = extract_from_tag(child_instructions, "", "") + child_prompts = [Prompt(p) for p in child_instructions] child_scores = self.task.evaluate(child_prompts, self.predictor, return_agg_scores=True) @@ -104,7 +105,6 @@ def _step(self) -> List[str]: self.prompts[i] = child_prompts[i] self.scores[i] = child_scores[i] - self.prompts = [prompt for _, prompt in sorted(zip(self.scores, self.prompts), reverse=True)] - self.scores = sorted(self.scores, reverse=True) + self.prompts, self.scores = sort_prompts_by_scores(self.prompts, self.scores) return self.prompts diff --git a/promptolution/optimizers/evoprompt_ga.py b/promptolution/optimizers/evoprompt_ga.py index 91cc6a7..c7b66a4 100644 --- a/promptolution/optimizers/evoprompt_ga.py +++ b/promptolution/optimizers/evoprompt_ga.py @@ -3,9 +3,10 @@ import numpy as np -from typing import TYPE_CHECKING, Any, List, Optional +from typing import TYPE_CHECKING, List, Optional from promptolution.optimizers.base_optimizer import BaseOptimizer +from promptolution.utils.prompt import 
Prompt, sort_prompts_by_scores if TYPE_CHECKING: # pragma: no cover from promptolution.llms.base_llm import BaseLLM @@ -66,25 +67,20 @@ def __init__( def _pre_optimization_loop(self) -> None: self.scores = self.task.evaluate(self.prompts, self.predictor, return_agg_scores=True) - # sort prompts by score - self.prompts = [prompt for _, prompt in sorted(zip(self.scores, self.prompts), reverse=True)] - self.scores = sorted(self.scores, reverse=True) + self.prompts, self.scores = sort_prompts_by_scores(self.prompts, self.scores) - def _step(self) -> List[str]: + def _step(self) -> List[Prompt]: new_prompts = self._crossover(self.prompts, self.scores) - prompts = self.prompts + new_prompts - new_scores = self.task.evaluate(new_prompts, self.predictor, return_agg_scores=True) + prompts = self.prompts + new_prompts scores = self.scores + new_scores - # sort scores and prompts - self.prompts = [prompt for _, prompt in sorted(zip(scores, prompts), reverse=True)][: len(self.prompts)] - self.scores = sorted(scores, reverse=True)[: len(self.prompts)] + self.prompts, self.scores = sort_prompts_by_scores(prompts, scores, top_k=len(self.prompts)) return self.prompts - def _crossover(self, prompts: List[str], scores: List[float]) -> List[str]: + def _crossover(self, prompts: List[Prompt], scores: List[float]) -> List[Prompt]: """Perform crossover operation to generate new child prompts. This method selects parent prompts based on the chosen selection mode, @@ -123,10 +119,12 @@ def _crossover(self, prompts: List[str], scores: List[float]) -> List[str]: parent_1 = group_1[np.argmax([self.scores[self.prompts.index(p)] for p in group_1])] parent_2 = group_2[np.argmax([self.scores[self.prompts.index(p)] for p in group_2])] + parent_1, parent_2 = parent_1.construct_prompt(), parent_2.construct_prompt() meta_prompt = self.prompt_template.replace("", parent_1).replace("", parent_2) meta_prompts.append(meta_prompt) - child_prompts = self.meta_llm.get_response(meta_prompts) - child_prompts = extract_from_tag(child_prompts, "", "") + child_instructions = self.meta_llm.get_response(meta_prompts) + child_instructions = extract_from_tag(child_instructions, "", "") + child_prompts = [Prompt(p) for p in child_instructions] return child_prompts diff --git a/promptolution/optimizers/opro.py b/promptolution/optimizers/opro.py index 864da31..a37ed81 100644 --- a/promptolution/optimizers/opro.py +++ b/promptolution/optimizers/opro.py @@ -3,11 +3,12 @@ import numpy as np -from typing import TYPE_CHECKING, Any, List, Optional +from typing import TYPE_CHECKING, List, Optional from promptolution.optimizers.base_optimizer import BaseOptimizer -from promptolution.optimizers.templates import OPRO_TEMPLATE from promptolution.utils.formatting import extract_from_tag +from promptolution.utils.prompt import Prompt, sort_prompts_by_scores +from promptolution.utils.templates import OPRO_TEMPLATE if TYPE_CHECKING: # pragma: no cover from promptolution.llms.base_llm import BaseLLM @@ -87,7 +88,7 @@ def _format_instructions(self) -> str: return "".join([f"text:\n{prompt}\nscore: {int(100 * round(score, 2))}\n\n" for prompt, score in sorted_pairs]) - def _add_prompt_and_score(self, prompt: str, score: float) -> None: + def _add_prompt_and_score(self, prompt: Prompt, score: float) -> None: """Add a prompt and its score to the lists, maintaining max length. 
Args: @@ -101,17 +102,15 @@ def _add_prompt_and_score(self, prompt: str, score: float) -> None: self.scores.append(score) # Keep only the top-performing prompts if we exceed the maximum number of instructions - keep_indices = np.argsort(self.scores)[-self.max_num_instructions :] - self.prompts = [self.prompts[i] for i in keep_indices] - self.scores = [self.scores[i] for i in keep_indices] + self.prompts, self.scores = sort_prompts_by_scores(self.prompts, self.scores, top_k=self.max_num_instructions) def _pre_optimization_loop(self): - self.scores = list(self.task.evaluate(self.prompts, self.predictor)) + self.scores = self.task.evaluate(self.prompts, self.predictor) self.meta_prompt = self.meta_prompt_template.replace("", self._format_instructions()).replace( "", self._sample_examples() ) - def _step(self) -> List[str]: + def _step(self) -> List[Prompt]: duplicate_prompts = 0 for _ in range(self.num_instructions_per_step): generation_seed = np.random.randint(0, int(1e9)) @@ -119,7 +118,8 @@ def _step(self) -> List[str]: response = self.meta_llm.get_response([self.meta_prompt])[0] - prompt = extract_from_tag(response, "", "") + instruction = extract_from_tag(response, "", "") + prompt = Prompt(instruction) if prompt in self.prompts: duplicate_prompts += 1 diff --git a/promptolution/tasks/base_task.py b/promptolution/tasks/base_task.py index 896f400..a364ae3 100644 --- a/promptolution/tasks/base_task.py +++ b/promptolution/tasks/base_task.py @@ -8,6 +8,8 @@ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union, overload +from promptolution.optimizers.base_optimizer import Prompt + if TYPE_CHECKING: # pragma: no cover from promptolution.predictors.base_predictor import BasePredictor from promptolution.utils.config import ExperimentConfig @@ -103,7 +105,7 @@ def subsample(self, eval_strategy: "EvalStrategy" = None) -> Tuple[List[str], Li def _prepare_batch( self, - prompts: List[str], + prompts: List[Prompt], xs: List[str], ys: List[str], eval_strategy: Literal["full", "subsample", "sequential_block", "random_block", "evaluated"] = "full", @@ -117,14 +119,14 @@ def _prepare_batch( keys_to_predict = [] for prompt in prompts: for x, y in zip(xs, ys): - cache_key = (prompt, x, str(y)) + cache_key = (prompt.construct_prompt(), x, str(y)) if cache_key not in self.eval_cache: keys_to_predict.append(cache_key) return keys_to_predict def _collect_results_from_cache( self, - prompts: List[str], + prompts: List[Prompt], xs: List[str], ys: List[str], return_agg_scores: bool, @@ -140,7 +142,7 @@ def _collect_results_from_cache( datapoint_scores = [] datapoint_seqs = [] for x, y in zip(xs, ys): - cache_key = (prompt, x, y) + cache_key = (prompt.construct_prompt(), x, y) datapoint_score = self.eval_cache.get(cache_key) if datapoint_score is None: continue @@ -168,7 +170,7 @@ def _evaluate(self, xs: List[str], ys: List[str], preds: List[str]) -> List[floa @overload def evaluate( self, - prompts: List[str], + prompts: List[Prompt], predictor: "BasePredictor", system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[True] = True, @@ -180,7 +182,7 @@ def evaluate( @overload def evaluate( self, - prompts: List[str], + prompts: List[Prompt], predictor: "BasePredictor", system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[False] = False, @@ -192,7 +194,7 @@ def evaluate( @overload def evaluate( self, - prompts: List[str], + prompts: List[Prompt], predictor: "BasePredictor", system_prompts: Optional[Union[str, List[str]]] = None, 
return_agg_scores: Literal[False] = False, @@ -204,7 +206,7 @@ def evaluate( @overload def evaluate( self, - prompts: str, + prompts: Prompt, predictor: "BasePredictor", system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[True] = True, @@ -216,7 +218,7 @@ def evaluate( @overload def evaluate( self, - prompts: str, + prompts: Prompt, predictor: "BasePredictor", system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[False] = False, @@ -228,7 +230,7 @@ def evaluate( @overload def evaluate( self, - prompts: str, + prompts: Prompt, predictor: "BasePredictor", system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: Literal[False] = False, @@ -239,7 +241,7 @@ def evaluate( def evaluate( self, - prompts: Union[str, List[str]], + prompts: Union[Prompt, List[Prompt]], predictor: "BasePredictor", system_prompts: Optional[Union[str, List[str]]] = None, return_agg_scores: bool = True, @@ -256,7 +258,7 @@ def evaluate( seqs: List[str] = [] - prompts = [prompts] if isinstance(prompts, str) else prompts + prompts = [prompts] if isinstance(prompts, Prompt) else prompts eval_strategy = eval_strategy or self.eval_strategy xs, ys = self.subsample(eval_strategy=eval_strategy) batches = self._prepare_batch(prompts, xs, ys, eval_strategy=eval_strategy) diff --git a/promptolution/utils/__init__.py b/promptolution/utils/__init__.py index eba584c..11eb2ea 100644 --- a/promptolution/utils/__init__.py +++ b/promptolution/utils/__init__.py @@ -10,6 +10,23 @@ ) from promptolution.utils.config import ExperimentConfig from promptolution.utils.logging import get_logger, setup_logging +from promptolution.utils.prompt import Prompt, sort_prompts_by_scores from promptolution.utils.prompt_creation import create_prompt_variation, create_prompts_from_samples +from promptolution.utils.templates import ( + CAPO_CROSSOVER_TEMPLATE, + CAPO_FEWSHOT_TEMPLATE, + CAPO_MUTATION_TEMPLATE, + DEFAULT_SYS_PROMPT, + DOWNSTREAM_TEMPLATE, + EVOPROMPT_DE_TEMPLATE, + EVOPROMPT_DE_TEMPLATE_TD, + EVOPROMPT_GA_TEMPLATE, + EVOPROMPT_GA_TEMPLATE_TD, + OPRO_TEMPLATE, + OPRO_TEMPLATE_TD, + PROMPT_CREATION_TEMPLATE, + PROMPT_CREATION_TEMPLATE_TD, + PROMPT_VARIATION_TEMPLATE, +) from promptolution.utils.test_statistics import TestStatistics, get_test_statistic_func, paired_t_test from promptolution.utils.token_counter import get_token_counter diff --git a/promptolution/utils/prompt.py b/promptolution/utils/prompt.py new file mode 100644 index 0000000..e663a93 --- /dev/null +++ b/promptolution/utils/prompt.py @@ -0,0 +1,66 @@ +"""Module defining the Prompt class and related utilities.""" + +from typing import List, Optional, Tuple + +from promptolution.utils.templates import DOWNSTREAM_TEMPLATE + + +class Prompt: + """Represents a prompt consisting of an instruction and few-shot examples.""" + + def __init__( + self, instruction: str, few_shots: Optional[List[str]] = None, downstream_template: Optional[str] = None + ) -> None: + """Initializes the Prompt with an instruction and associated examples. + + Args: + instruction (str): The instruction or prompt text. + few_shots (List[str]): List of examples as string. + downstream_template (str, optional): Template for formatting the full prompt. 
+ """ + self.instruction = instruction.strip() + self.few_shots = few_shots or [] + self.downstream_template = downstream_template or DOWNSTREAM_TEMPLATE + + def construct_prompt(self) -> str: + """Constructs the full prompt string by replacing placeholders in the template with the instruction and formatted examples. + + Returns: + str: The constructed prompt string. + """ + few_shot_str = "\n\n".join(self.few_shots).strip() + prompt = ( + self.downstream_template.replace("", self.instruction) + .replace("", few_shot_str) + .replace("\n\n\n\n", "\n\n") # replace extra newlines if no few shots are provided + .strip() + ) + return prompt + + def __str__(self) -> str: + """Returns the string representation of the prompt.""" + return self.construct_prompt() + + +def sort_prompts_by_scores( + prompts: List[Prompt], scores: List[float], top_k: Optional[int] = None +) -> Tuple[List[Prompt], List[float]]: + """Sorts prompts based on their associated scores in descending order. + + Args: + prompts (List[Prompt]): List of Prompt objects. + scores (List[float]): Corresponding list of scores. + + Returns: + List[Prompt]: Prompts sorted by scores in descending order. + """ + assert len(prompts) == len(scores), "Prompts and scores must have the same length." + + sorted_prompts = [prompt for score, prompt in sorted(zip(scores, prompts), reverse=True, key=lambda x: x[0])] + sorted_scores = sorted(scores, reverse=True) + + if top_k is not None: + sorted_prompts = sorted_prompts[:top_k] + sorted_scores = sorted_scores[:top_k] + + return sorted_prompts, sorted_scores diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 77082db..0a5088a 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -11,12 +11,12 @@ from promptolution.llms.base_llm import BaseLLM from promptolution.tasks.base_task import BaseTask -from promptolution.optimizers.templates import ( +from promptolution.tasks.classification_tasks import ClassificationTask +from promptolution.utils.templates import ( PROMPT_CREATION_TEMPLATE, PROMPT_CREATION_TEMPLATE_TD, PROMPT_VARIATION_TEMPLATE, ) -from promptolution.tasks.classification_tasks import ClassificationTask def create_prompt_variation( diff --git a/promptolution/optimizers/templates.py b/promptolution/utils/templates.py similarity index 99% rename from promptolution/optimizers/templates.py rename to promptolution/utils/templates.py index aaa1f63..9afeac2 100644 --- a/promptolution/optimizers/templates.py +++ b/promptolution/utils/templates.py @@ -139,7 +139,7 @@ The instruction was""" -CAPO_DOWNSTREAM_TEMPLATE = """ +DOWNSTREAM_TEMPLATE = """ diff --git a/tests/optimizers/test_capo.py b/tests/optimizers/test_capo.py index 466c92b..834ba67 100644 --- a/tests/optimizers/test_capo.py +++ b/tests/optimizers/test_capo.py @@ -4,31 +4,8 @@ from tests.mocks.mock_task import MockTask -from promptolution.optimizers.capo import CAPO, CAPOPrompt - - -def test_capo_prompt_initialization(): - """Test that CAPOPrompt initializes correctly.""" - instruction = "Classify the sentiment of the text." - few_shots = ["Example 1: Positive", "Example 2: Negative"] - prompt = CAPOPrompt(instruction, few_shots) - - # Verify attributes - assert prompt.instruction_text == instruction - assert prompt.few_shots == few_shots - - -def test_capo_prompt_construct_prompt(): - """Test the construct_prompt method of CAPOPrompt.""" - instruction = "Classify the sentiment of the text." 
- few_shots = ["Example 1: Positive", "Example 2: Negative"] - prompt = CAPOPrompt(instruction, few_shots) - - # Get the constructed prompt - constructed = prompt.construct_prompt() - - # Verify the prompt contains the instruction - assert instruction in constructed +from promptolution.optimizers.capo import CAPO +from promptolution.utils.prompt import Prompt def test_capo_initialization(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_df): @@ -67,11 +44,11 @@ def mock_create_few_shot_examples(instruction, num_examples): # Control randomness with patch("random.randint", return_value=2): - population = optimizer._initialize_population(initial_prompts) + population = optimizer._initialize_population([Prompt(p) for p in initial_prompts]) # Verify population was created assert len(population) == len(initial_prompts) - assert all(isinstance(p, CAPOPrompt) for p in population) + assert all(isinstance(p, Prompt) for p in population) def test_capo_step(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_df): @@ -86,25 +63,26 @@ def test_capo_step(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mo ) # Create mock prompt objects - mock_prompts = [CAPOPrompt("Instruction 1", ["Example 1"]), CAPOPrompt("Instruction 2", ["Example 2"])] + mock_prompts = [Prompt("Instruction 1", ["Example 1"]), Prompt("Instruction 2", ["Example 2"])] optimizer.prompt_objects = mock_prompts # Mock the internal methods to avoid complexity - mock_offspring = [CAPOPrompt("Offspring", ["Example"])] + mock_offspring = [Prompt("Offspring", ["Example"])] optimizer._crossover = lambda x: mock_offspring - mock_mutated = [CAPOPrompt("Mutated", ["Example"])] + mock_mutated = [Prompt("Mutated", ["Example"])] optimizer._mutate = lambda x: mock_mutated - mock_survivors = [CAPOPrompt("Survivor 1", ["Example"]), CAPOPrompt("Survivor 2", ["Example"])] - optimizer._do_racing = lambda x, k: mock_survivors + mock_survivors = [Prompt("Survivor 1", ["Example"]), Prompt("Survivor 2", ["Example"])] + mock_scores = [0.9, 0.8] + optimizer._do_racing = lambda x, k: (mock_survivors, mock_scores) # Call _step result = optimizer._step() # Verify results assert len(result) == 2 # Should match population_size - assert all(isinstance(p, str) for p in result) + assert all(isinstance(p, Prompt) for p in result) def test_capo_optimize(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_df): @@ -169,9 +147,7 @@ def test_crossover(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mo crossovers_per_iter=5, ) - offsprings = optimizer._crossover( - [CAPOPrompt("Instruction 1", ["Example 1"]), CAPOPrompt("Instruction 2", ["Example 2"])] - ) + offsprings = optimizer._crossover([Prompt("Instruction 1", ["Example 1"]), Prompt("Instruction 2", ["Example 2"])]) assert len(offsprings) == 5 @@ -184,9 +160,7 @@ def test_mutate(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_ df_few_shots=mock_df, ) - mutated = optimizer._mutate( - [CAPOPrompt("Instruction 1", ["Example 1"]), CAPOPrompt("Instruction 2", ["Example 2"])] - ) + mutated = optimizer._mutate([Prompt("Instruction 1", ["Example 1"]), Prompt("Instruction 2", ["Example 2"])]) assert len(mutated) == 2 @@ -200,11 +174,13 @@ def test_do_racing(mock_meta_llm, mock_predictor, initial_prompts, mock_df): df_few_shots=pd.concat([mock_df] * 5, ignore_index=True), ) optimizer._pre_optimization_loop() - survivors = optimizer._do_racing( - [CAPOPrompt("good instruction", ["Example 1"]), CAPOPrompt("better instruction", ["Example 2"])], 1 + survivors, 
scores = optimizer._do_racing( + [Prompt("good instruction", ["Example 1"]), Prompt("better instruction", ["Example 2"])], 1 ) assert len(survivors) == 1 - assert "better instruction" in survivors[0].instruction_text + assert len(scores) == 1 + + assert "better instruction" in survivors[0].instruction assert mock_task.reset_block_idx.call_count == 2 assert mock_task.increment_block_idx.call_count == 3 diff --git a/tests/optimizers/test_evoprompt_de.py b/tests/optimizers/test_evoprompt_de.py index 8c01adb..41e4bfd 100644 --- a/tests/optimizers/test_evoprompt_de.py +++ b/tests/optimizers/test_evoprompt_de.py @@ -4,6 +4,7 @@ import pytest from promptolution.optimizers import EvoPromptDE +from promptolution.utils.prompt import Prompt def test_evoprompt_de_initialization(mock_meta_llm, initial_prompts, mock_task, mock_predictor): @@ -20,7 +21,7 @@ def test_evoprompt_de_initialization(mock_meta_llm, initial_prompts, mock_task, # Only verify the essential properties assert optimizer.prompt_template == "Create a new prompt from: , , , " assert not optimizer.donor_random - assert optimizer.prompts == initial_prompts + assert [p.instruction for p in optimizer.prompts] == initial_prompts def test_evoprompt_de_pre_optimization_loop(mock_meta_llm, initial_prompts, mock_task, mock_predictor): @@ -53,8 +54,8 @@ def test_evoprompt_de_step(mock_meta_llm, initial_prompts, mock_task, mock_predi ) # Set up initial state - optimizer.prompts = initial_prompts - optimizer.scores = [0.8, 0.7, 0.6, 0.5, 0.4] # First prompt is best + optimizer.prompts = [Prompt(p) for p in initial_prompts] + optimizer.scores = [0.8, 0.7, 0.6, 0.5] # First prompt is best # Control randomness with patch("numpy.random.choice") as mock_choice: diff --git a/tests/optimizers/test_evoprompt_ga.py b/tests/optimizers/test_evoprompt_ga.py index a1c3160..724a400 100644 --- a/tests/optimizers/test_evoprompt_ga.py +++ b/tests/optimizers/test_evoprompt_ga.py @@ -1,6 +1,7 @@ from unittest.mock import patch from promptolution.optimizers import EvoPromptGA +from promptolution.utils.prompt import Prompt def test_evoprompt_ga_initialization(mock_meta_llm, initial_prompts, mock_task, experiment_config, mock_predictor): @@ -18,7 +19,7 @@ def test_evoprompt_ga_initialization(mock_meta_llm, initial_prompts, mock_task, # Verify only essential properties assert optimizer.prompt_template == "Combine these prompts to create a better one: and ." 
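# --- Illustration (not part of the patch) -------------------------------------------------
# The assertions above change because the optimizers now store Prompt objects instead of
# raw strings: initial prompt strings are wrapped on construction, and the original text is
# recovered via .instruction. A minimal sketch of that invariant (example strings invented):
from promptolution.utils.prompt import Prompt

initial_prompts = ["Classify the sentiment:", "Rate the text:"]
wrapped = [Prompt(p) for p in initial_prompts]

assert [p.instruction for p in wrapped] == initial_prompts  # what the updated tests check
assert all(isinstance(p, Prompt) for p in wrapped)
# ------------------------------------------------------------------------------------------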
assert optimizer.selection_mode == "random" - assert optimizer.prompts == initial_prompts + assert [p.instruction for p in optimizer.prompts] == initial_prompts def test_evoprompt_ga_crossover(mock_meta_llm, initial_prompts, mock_task, experiment_config, mock_predictor): @@ -34,7 +35,7 @@ def test_evoprompt_ga_crossover(mock_meta_llm, initial_prompts, mock_task, exper ) # Set up state for testing - optimizer.prompts = initial_prompts + optimizer.prompts = [Prompt(p) for p in initial_prompts] optimizer.scores = [0.8, 0.7, 0.6, 0.5, 0.4] # Control randomness @@ -63,8 +64,8 @@ def test_evoprompt_ga_step(mock_meta_llm, initial_prompts, mock_task, experiment ) # Set up state for testing - optimizer.prompts = initial_prompts - optimizer.scores = [0.8, 0.7, 0.6, 0.5, 0.4] + optimizer.prompts = [Prompt(p) for p in initial_prompts] + optimizer.scores = [0.8, 0.7, 0.6, 0.5] # Control randomness with patch("numpy.random.choice") as mock_choice: diff --git a/tests/optimizers/test_opro.py b/tests/optimizers/test_opro.py index 9910442..5dd5385 100644 --- a/tests/optimizers/test_opro.py +++ b/tests/optimizers/test_opro.py @@ -3,6 +3,7 @@ import numpy as np from promptolution.optimizers import OPRO +from promptolution.utils.prompt import Prompt def test_opro_initialization(mock_meta_llm, initial_prompts, mock_task, mock_predictor): @@ -23,7 +24,7 @@ def test_opro_initialization(mock_meta_llm, initial_prompts, mock_task, mock_pre assert optimizer.max_num_instructions == 10 assert optimizer.num_instructions_per_step == 4 assert optimizer.num_few_shots == 2 - assert optimizer.prompts == initial_prompts + assert [p.instruction for p in optimizer.prompts] == initial_prompts def test_opro_sample_examples(mock_meta_llm, initial_prompts, mock_task, mock_predictor): @@ -62,7 +63,7 @@ def test_opro_format_instructions(mock_meta_llm, initial_prompts, mock_task, moc ) # Set scores for testing - optimizer.prompts = initial_prompts + optimizer.prompts = [Prompt(p) for p in initial_prompts] optimizer.scores = [0.7, 0.9, 0.5, 0.8, 0.6] # Format instructions @@ -109,7 +110,7 @@ def test_opro_step(mock_meta_llm, initial_prompts, mock_task, mock_predictor): ) # Set up initial state - optimizer.prompts = initial_prompts + optimizer.prompts = [Prompt(p) for p in initial_prompts] optimizer.scores = [0.7, 0.6, 0.5, 0.8] optimizer.meta_prompt = "Meta prompt with instructions and examples" diff --git a/tests/tasks/test_classifications_tasks.py b/tests/tasks/test_classifications_tasks.py index 9651e98..256a63d 100644 --- a/tests/tasks/test_classifications_tasks.py +++ b/tests/tasks/test_classifications_tasks.py @@ -3,6 +3,7 @@ from sklearn.metrics import accuracy_score from promptolution.tasks import ClassificationTask +from promptolution.utils.prompt import Prompt def test_classification_task_initialization(mock_df): @@ -19,7 +20,7 @@ def test_classification_task_initialization(mock_df): def test_task_evaluate(mock_classification_task_with_subsampling, mock_predictor): """Test the evaluate method of ClassificationTask.""" - prompts = ["Classify sentiment:"] + prompts = [Prompt("Classify sentiment:")] scores = mock_classification_task_with_subsampling.evaluate(prompts, mock_predictor) assert isinstance(scores, list) @@ -27,6 +28,8 @@ def test_task_evaluate(mock_classification_task_with_subsampling, mock_predictor assert 0 <= scores[0] <= 1 prompts = ["Classify sentiment:", "Rate the text:"] + prompts = [Prompt(p) for p in prompts] + scores = mock_classification_task_with_subsampling.evaluate(prompts, mock_predictor) assert 
len(scores) == 2 @@ -35,7 +38,7 @@ def test_task_evaluate(mock_classification_task_with_subsampling, mock_predictor def test_task_evaluate_with_subsampling(mock_classification_task_with_subsampling, mock_predictor): """Test the evaluate method with subsampling.""" - prompts = ["Classify sentiment:"] + prompts = [Prompt("Classify sentiment:")] scores = mock_classification_task_with_subsampling.evaluate( prompts, @@ -62,7 +65,7 @@ def test_task_evaluate_with_subsampling(mock_classification_task_with_subsamplin def test_task_evaluate_with_return_seq(mock_classification_task_with_subsampling, mock_predictor): """Test the evaluate method with return_seq=True.""" - prompts = ["Classify sentiment:"] + prompts = [Prompt("Classify sentiment:")] scores, seqs = mock_classification_task_with_subsampling.evaluate( prompts, mock_predictor, return_seq=True, return_agg_scores=False @@ -79,7 +82,7 @@ def test_task_evaluate_with_system_prompts( ): """Test the evaluate method with system prompts.""" - prompts = ["Classify sentiment:"] + prompts = [Prompt("Classify sentiment:")] system_prompts = ["Be concise"] scores = mock_classification_task_with_subsampling.evaluate( @@ -126,7 +129,7 @@ def test_classification_task_evaluate_random_block(mock_df, mock_predictor): eval_strategy="random_block", seed=42, ) - prompts = ["Classify sentiment:"] + prompts = [Prompt("Classify sentiment:")] evaluated_x_sets = [] for _ in range(5): @@ -151,7 +154,7 @@ def test_classification_task_evaluate_sequential_block(mock_df, mock_predictor): eval_strategy="sequential_block", seed=42, ) - prompts = ["Classify sentiment:"] + prompts = [Prompt("Classify sentiment:")] task.reset_block_idx() assert task.block_idx == 0 diff --git a/tests/tasks/test_judge_task.py b/tests/tasks/test_judge_task.py index 3698bb5..3cf0066 100644 --- a/tests/tasks/test_judge_task.py +++ b/tests/tasks/test_judge_task.py @@ -1,5 +1,7 @@ import numpy as np +from promptolution.utils.prompt import Prompt + def test_judge_task_initialization(mock_judge_task_with_y, mock_judge_llm): """Test that JudgeTask initializes correctly with ground truth.""" @@ -50,6 +52,7 @@ def test_judge_task_construct_judge_prompt_without_ground_truth(mock_judge_task_ def test_judge_task_evaluate_with_ground_truth(mock_judge_task_with_y, mock_predictor, mock_judge_llm): """Test the evaluate method of JudgeTask with ground truth and full evaluation.""" prompts = ["Rate the sentiment:", "What is the sentiment?", "How would you classify this?"] + prompts = [Prompt(p) for p in prompts] mock_predictor.call_history = [] mock_judge_llm.call_history = [] @@ -72,6 +75,7 @@ def test_judge_task_evaluate_with_ground_truth(mock_judge_task_with_y, mock_pred def test_judge_task_evaluate_no_ground_truth(mock_judge_task_no_y, mock_predictor, mock_judge_llm): """Test the evaluate method of JudgeTask without a y_column (no ground truth).""" prompts = ["Tell a funny joke:", "Make me laugh:", "What's a good joke?"] + prompts = [Prompt(p) for p in prompts] mock_predictor.call_history = [] mock_judge_llm.call_history = [] @@ -86,6 +90,8 @@ def test_judge_task_evaluate_no_ground_truth(mock_judge_task_no_y, mock_predicto def test_judge_task_evaluate_with_return_seq(mock_judge_task_with_y, mock_predictor): """Test the evaluate method with return_seq=True for JudgeTask.""" prompts = ["Evaluate this text:", "What is the sentiment?", "How would you classify this?"] + prompts = [Prompt(p) for p in prompts] + scores, seqs = mock_judge_task_with_y.evaluate(prompts, mock_predictor, return_seq=True, 
return_agg_scores=False) assert len(scores) == 3 diff --git a/tests/tasks/test_reward_tasks.py b/tests/tasks/test_reward_tasks.py index c707da9..76e3545 100644 --- a/tests/tasks/test_reward_tasks.py +++ b/tests/tasks/test_reward_tasks.py @@ -1,4 +1,4 @@ -import numpy as np +from promptolution.utils.prompt import Prompt def test_reward_task_initialization(mock_reward_task, simple_reward_function): @@ -22,7 +22,7 @@ def test_reward_task_initialization_no_x_column(mock_reward_task_no_x_column, si def test_reward_task_evaluate_with_return_seq(mock_reward_task, mock_predictor): """Test the evaluate method with return_seq=True for RewardTask.""" - prompts = ["Generate a short text:"] + prompts = [Prompt("Generate a short text:")] scores, seqs = mock_reward_task.evaluate(prompts, mock_predictor, return_seq=True, return_agg_scores=False) diff --git a/tests/utils/test_prompt.py b/tests/utils/test_prompt.py new file mode 100644 index 0000000..3dc90bb --- /dev/null +++ b/tests/utils/test_prompt.py @@ -0,0 +1,41 @@ +from promptolution.utils.prompt import Prompt, sort_prompts_by_scores + + +def test_prompt_initialization(): + """Test that Prompt initializes correctly.""" + instruction = "Classify the sentiment of the text." + few_shots = ["Example 1: Positive", "Example 2: Negative"] + prompt = Prompt(instruction, few_shots) + + # Verify attributes + assert prompt.instruction == instruction + assert prompt.few_shots == few_shots + + +def test_prompt_construct_prompt(): + """Test the construct_prompt method of Prompt.""" + instruction = "Classify the sentiment of the text." + few_shots = ["Example 1: Positive", "Example 2: Negative"] + prompt = Prompt(instruction, few_shots) + + # Get the constructed prompt + constructed = prompt.construct_prompt() + + # Verify the prompt contains the instruction + assert instruction in constructed + + +def test_sort_prompts_by_scores(): + """Test the sort_prompts_by_scores function.""" + prompt1 = Prompt("Instruction 1", ["Example A"]) + prompt2 = Prompt("Instruction 2", ["Example B"]) + prompt3 = Prompt("Instruction 3", ["Example C"]) + + prompts = [prompt1, prompt2, prompt3] + scores = [0.75, 0.90, 0.60] + + sorted_prompts, sorted_scores = sort_prompts_by_scores(prompts, scores) + + # Verify sorting + assert sorted_prompts == [prompt2, prompt1, prompt3] + assert sorted_scores == [0.90, 0.75, 0.60] From c7a202d4f9904e642f11f637019d40aac75eba7a Mon Sep 17 00:00:00 2001 From: finitearth Date: Sat, 22 Nov 2025 15:26:33 +0100 Subject: [PATCH 08/43] restrain dependency restrictions --- .coverage | Bin 69632 -> 69632 bytes pyproject.toml | 58 ++++++++++++++++++++++++------------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/.coverage b/.coverage index 52c605daa1e453dcae390a3d9490aee3902b568f..45f90aa9ad5a84befc7f4cdb13f54cf9459c1ae9 100644 GIT binary patch delta 118 zcmV-+0Ez#Apag)R1h58x0alX-fJi6C44Dj(42TST3~vl$3{wm~3?B>(3-}A+3)Bm` z3!Dpx3w;Z43s$oc5a0^4SBr1~2`nT80SQ_px;vB5j3faGv-FIY0Rm6PlZ}p?6F~?F Y4hjST2_6b6JK$UJs~kYPvpkQ)fQFGKHvj+t delta 116 zcmV-)0E_>Cpag)R1h58x0aud;fJi6D44Mp*42cYV3~&r&3{(t13?K{*3;7G;3)Ks| z3!Mvz3w{f63s=1.0.0", optional = true} +requests = {version = ">=2.31.0", optional = true} +vllm = {version = ">=0.10.1.1", optional = true} +transformers = {version = ">=4.48.0", optional = true} [tool.poetry.extras] api = ["openai", "requests"] @@ -25,41 +25,41 @@ transformers = ["transformers"] [tool.poetry.group.api] optional = true [tool.poetry.group.api.dependencies] -openai = "^1.0.0" -requests = "^2.31.0" +openai = ">=1.0.0" 
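# --- Illustration (not part of the patch) -------------------------------------------------
# Context for the pyproject.toml hunk above: Poetry's caret constraint "^1.0.0" expands to
# ">=1.0.0,<2.0.0", whereas the relaxed ">=1.0.0" also admits future major releases. A small
# check with the `packaging` library (assumed to be available) makes the difference visible:
from packaging.specifiers import SpecifierSet

caret = SpecifierSet(">=1.0.0,<2.0.0")   # what openai = "^1.0.0" pinned
relaxed = SpecifierSet(">=1.0.0")        # the new, less restrictive constraint

print("2.3.0" in caret)    # False - the caret pin would block a hypothetical openai 2.x
print("2.3.0" in relaxed)  # True  - the relaxed range allows it
# ------------------------------------------------------------------------------------------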
+requests = ">=2.31.0" [tool.poetry.group.vllm] optional = true [tool.poetry.group.vllm.dependencies] -vllm = "^0.10.1.1" +vllm = ">=0.10.1.1" [tool.poetry.group.transformers] optional = true [tool.poetry.group.transformers.dependencies] -transformers = "^4.48.0" +transformers = ">=4.48.0" [tool.poetry.group.dev.dependencies] -black = "^24.4.2" -flake8 = "^7.1.0" -isort = "^5.13.2" -pre-commit = "^3.7.1" -ipykernel = "^6.29.5" -mypy = "^1.8.0" +black = ">=24.4.2" +flake8 = ">=7.1.0" +isort = ">=5.13.2" +pre-commit = ">=3.7.1" +ipykernel = ">=6.29.5" +mypy = ">=1.8.0" [tool.poetry.group.test.dependencies] -pytest = "^8.3.5" -pytest-cov = "^6.1.1" -openai = "^1.0.0" -requests = "^2.31.0" -vllm = "^0.10.1.1" -transformers = "^4.48.0" +pytest = ">=8.3.5" +pytest-cov = ">=6.1.1" +openai = ">=1.0.0" +requests = ">=2.31.0" +vllm = ">=0.10.1.1" +transformers = ">=4.48.0" [tool.poetry.group.docs.dependencies] -mkdocs = "^1.6.1" -mkdocs-material = "^9.5.39" -mkdocstrings = {version = "^0.26.1", extras = ["python"]} -jupyter = "^1.1.1" -nbconvert = "^7.16.6" +mkdocs = ">=1.6.1" +mkdocs-material = ">=9.5.39" +mkdocstrings = {version = ">=0.26.1", extras = ["python"]} +jupyter = ">=1.1.1" +nbconvert = ">=7.16.6" [build-system] requires = ["poetry-core"] From b8134e29d43b63bf1c00ee98257fd9ace59064e4 Mon Sep 17 00:00:00 2001 From: finitearth Date: Sat, 22 Nov 2025 15:29:44 +0100 Subject: [PATCH 09/43] explicitly add scipy --- promptolution/utils/test_statistics.py | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/promptolution/utils/test_statistics.py b/promptolution/utils/test_statistics.py index d0de2d3..dd9b5ff 100644 --- a/promptolution/utils/test_statistics.py +++ b/promptolution/utils/test_statistics.py @@ -6,7 +6,7 @@ import numpy as np from scipy.stats import ttest_rel -from typing import Any, Callable, List, Literal +from typing import Callable, List, Literal TestStatistics = Literal["paired_t_test"] diff --git a/pyproject.toml b/pyproject.toml index 3146f22..0647c64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ openai = {version = ">=1.0.0", optional = true} requests = {version = ">=2.31.0", optional = true} vllm = {version = ">=0.10.1.1", optional = true} transformers = {version = ">=4.48.0", optional = true} +scipy = ">=1.15" [tool.poetry.extras] api = ["openai", "requests"] From 03a5ec7af243da1f8d29ba8533b7ab526b6a4c47 Mon Sep 17 00:00:00 2001 From: finitearth Date: Sat, 22 Nov 2025 15:32:28 +0100 Subject: [PATCH 10/43] replace only one occurence of 'input' for few shot examples --- promptolution/optimizers/capo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/promptolution/optimizers/capo.py b/promptolution/optimizers/capo.py index a4556c1..021dbab 100644 --- a/promptolution/optimizers/capo.py +++ b/promptolution/optimizers/capo.py @@ -165,7 +165,7 @@ def _create_few_shot_examples(self, instruction: str, num_examples: int) -> List # Check which predictions are correct and get a single one per example for j in range(num_examples): # Process and clean up the generated sequences - seqs[j] = seqs[j].replace(sample_inputs[j], "").strip() + seqs[j] = seqs[j].replace(sample_inputs[j], "", 1).strip() # Check if the prediction is correct and add reasoning if so if preds[j] == sample_targets[j] or not self.check_fs_accuracy: few_shots[j] = CAPO_FEWSHOT_TEMPLATE.replace("", sample_inputs[j]).replace("", seqs[j]) From bb146f1390b277fd8b1e5c8b663df81de5e68232 Mon Sep 17 00:00:00 2001 From: finitearth Date: Sat, 22 Nov 2025 
16:10:22 +0100 Subject: [PATCH 11/43] improve task description handling --- .coverage | Bin 69632 -> 69632 bytes promptolution/helpers.py | 38 ++++-------------- promptolution/optimizers/base_optimizer.py | 10 ++++- promptolution/optimizers/capo.py | 14 +++---- promptolution/optimizers/evoprompt_de.py | 5 ++- promptolution/optimizers/evoprompt_ga.py | 6 ++- promptolution/optimizers/opro.py | 2 +- tests/mocks/mock_task.py | 2 +- tests/optimizers/test_capo.py | 44 +++++++++++++++++++++ 9 files changed, 76 insertions(+), 45 deletions(-) diff --git a/.coverage b/.coverage index 45f90aa9ad5a84befc7f4cdb13f54cf9459c1ae9..64215bc3ebb237f7e6e9a4333895ce6fbf63e9be 100644 GIT binary patch delta 344 zcmV-e0jK_epag)R1h58x0a23&fFw(24o(g~4l@oO4h9YF4cQI34XO>I4T=qV4QdTg z4M7bW4Dt-b44Mp*42cYV3~&r&3{(t13>pjv3-b%y3(pI+3zrLn3wR4_3sJKX5Zelq zR*T6cBP|302}mt!(?3o>mYMvR?PU7VC6j(+<)8U8|87k`_lbV?1d~3DTOlYV1OW+B zCF%mu{|6ue_!0mczdZrKUjhIC0R8{~0N@|9y^KKte+sU^3S@e|7G(k!7X$$bE*H`Z zECT=l{rBo_6#xh@54Z~x76btaC>ByLU;qH6}1BV^Ysc~Jx?oum(`5gdIeVwzyY(Ej%a}hOK*Sx delta 339 zcmZozz|ydQWdmCSW7uT&1{LoNz5qT)K2tsgJ`UdZytjFG^RD4t!rQ}J!<)k!#Our} z!}Epb7|%?ei98)VH9Yw|$vnY4c0BStJlsFHA97#e-o-tKyMw!yyMQ}vv!K8OuE~Mj zC*@^zSr|EObXUEUUoL;zr|wLd(cc%7Jnc7qI{#;8{dTs=_C3)m+NwY`5vrG17yRdU zVEUlSU{+Vn)Ucn0fq~%", final_task_description) - if final_task_description - else CAPO_CROSSOVER_TEMPLATE - ) - mutation_template = ( - CAPO_MUTATION_TEMPLATE.replace("", final_task_description) - if final_task_description - else CAPO_MUTATION_TEMPLATE - ) - return CAPO( predictor=predictor, meta_llm=meta_llm, task=task, - crossover_template=crossover_template, - mutation_template=mutation_template, config=config, ) if final_optimizer == "evopromptde": - template = ( - EVOPROMPT_DE_TEMPLATE_TD.replace("", final_task_description) - if final_task_description - else EVOPROMPT_DE_TEMPLATE - ) - return EvoPromptDE(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) + return EvoPromptDE(predictor=predictor, meta_llm=meta_llm, task=task, config=config) if final_optimizer == "evopromptga": - template = ( - EVOPROMPT_GA_TEMPLATE_TD.replace("", final_task_description) - if final_task_description - else EVOPROMPT_GA_TEMPLATE - ) - return EvoPromptGA(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) + return EvoPromptGA(predictor=predictor, meta_llm=meta_llm, task=task, config=config) if final_optimizer == "opro": - template = ( - OPRO_TEMPLATE_TD.replace("", final_task_description) if final_task_description else OPRO_TEMPLATE - ) - return OPRO(predictor=predictor, meta_llm=meta_llm, task=task, prompt_template=template, config=config) + return OPRO(predictor=predictor, meta_llm=meta_llm, task=task, config=config) raise ValueError(f"Unknown optimizer: {final_optimizer}") diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py index a9d6ea9..42f432e 100644 --- a/promptolution/optimizers/base_optimizer.py +++ b/promptolution/optimizers/base_optimizer.py @@ -1,6 +1,5 @@ """Base module for optimizers in the promptolution library.""" - from abc import ABC, abstractmethod from typing import TYPE_CHECKING, List, Literal, Optional @@ -129,3 +128,12 @@ def _on_train_end(self) -> None: """Call all registered callbacks at the end of the entire optimization process.""" for callback in self.callbacks: callback.on_train_end(self) + + def _initialize_meta_template(self, template: str) -> str: + task_description = getattr(self.task, "task_description") + if 
self.config is not None and getattr(self.config, "task_description") is not None: + task_description = self.config.task_description + if task_description is None: + logger.warning("Task description is not provided. Please make sure to include relevant task details.") + task_description = "" + return template.replace("", task_description) diff --git a/promptolution/optimizers/capo.py b/promptolution/optimizers/capo.py index 021dbab..db476a0 100644 --- a/promptolution/optimizers/capo.py +++ b/promptolution/optimizers/capo.py @@ -43,6 +43,8 @@ def __init__( task: "BaseTask", meta_llm: "BaseLLM", initial_prompts: Optional[List[str]] = None, + crossover_template: Optional[str] = None, + mutation_template: Optional[str] = None, crossovers_per_iter: int = 4, upper_shots: int = 5, max_n_blocks_eval: int = 10, @@ -52,8 +54,6 @@ def __init__( check_fs_accuracy: bool = True, create_fs_reasoning: bool = True, df_few_shots: Optional[pd.DataFrame] = None, - crossover_template: Optional[str] = None, - mutation_template: Optional[str] = None, callbacks: Optional[List["BaseCallback"]] = None, config: Optional["ExperimentConfig"] = None, ) -> None: @@ -64,6 +64,8 @@ def __init__( task (BaseTask): The task instance containing dataset and description. meta_llm (BaseLLM): The meta language model for crossover/mutation. initial_prompts (List[str]): Initial prompt instructions. + crossover_template (str, optional): Template for crossover instructions. + mutation_template (str, optional): Template for mutation instructions. crossovers_per_iter (int): Number of crossover operations per iteration. upper_shots (int): Maximum number of few-shot examples per prompt. p_few_shot_reasoning (float): Probability of generating llm-reasoning for few-shot examples, instead of simply using input-output pairs. @@ -76,17 +78,12 @@ def __init__( create_fs_reasoning (bool): Whether to create reasoning for few-shot examples using the downstream model, instead of simply using input-output pairs from the few shots DataFrame. Default is True. df_few_shots (pd.DataFrame): DataFrame containing few-shot examples. If None, will pop 10% of datapoints from task. - crossover_template (str, optional): Template for crossover instructions. - mutation_template (str, optional): Template for mutation instructions. callbacks (List[Callable], optional): Callbacks for optimizer events. config (ExperimentConfig, optional): Configuration for the optimizer. 
""" self.meta_llm = meta_llm self.downstream_llm = predictor.llm - self.crossover_template = crossover_template or CAPO_CROSSOVER_TEMPLATE - self.mutation_template = mutation_template or CAPO_MUTATION_TEMPLATE - self.crossovers_per_iter = crossovers_per_iter self.upper_shots = upper_shots self.max_n_blocks_eval = max_n_blocks_eval @@ -101,6 +98,9 @@ def __init__( super().__init__(predictor, task, initial_prompts, callbacks, config) + self.crossover_template = self._initialize_meta_template(crossover_template or CAPO_CROSSOVER_TEMPLATE) + self.mutation_template = self._initialize_meta_template(mutation_template or CAPO_MUTATION_TEMPLATE) + self.df_few_shots = df_few_shots if df_few_shots is not None else task.pop_datapoints(frac=0.1) if self.max_n_blocks_eval > self.task.n_blocks: logger.warning( diff --git a/promptolution/optimizers/evoprompt_de.py b/promptolution/optimizers/evoprompt_de.py index 05d8336..f6e701a 100644 --- a/promptolution/optimizers/evoprompt_de.py +++ b/promptolution/optimizers/evoprompt_de.py @@ -8,6 +8,7 @@ from promptolution.optimizers.base_optimizer import BaseOptimizer from promptolution.utils.formatting import extract_from_tag from promptolution.utils.prompt import Prompt, sort_prompts_by_scores +from promptolution.utils.templates import EVOPROMPT_DE_TEMPLATE_TD if TYPE_CHECKING: # pragma: no cover from promptolution.llms.base_llm import BaseLLM @@ -43,20 +44,20 @@ def __init__( self, predictor: "BasePredictor", task: "BaseTask", - prompt_template: str, meta_llm: "BaseLLM", initial_prompts: Optional[List[str]] = None, + prompt_template: Optional[str] = None, donor_random: bool = False, callbacks: Optional[List["BaseCallback"]] = None, config: Optional["ExperimentConfig"] = None, ) -> None: """Initialize the EvoPromptDE optimizer.""" - self.prompt_template = prompt_template self.donor_random = donor_random self.meta_llm = meta_llm super().__init__( predictor=predictor, task=task, initial_prompts=initial_prompts, callbacks=callbacks, config=config ) + self.prompt_template = self._initialize_meta_template(prompt_template or EVOPROMPT_DE_TEMPLATE_TD) def _pre_optimization_loop(self) -> None: self.scores = self.task.evaluate(self.prompts, self.predictor, return_agg_scores=True) diff --git a/promptolution/optimizers/evoprompt_ga.py b/promptolution/optimizers/evoprompt_ga.py index c7b66a4..9a0b4e3 100644 --- a/promptolution/optimizers/evoprompt_ga.py +++ b/promptolution/optimizers/evoprompt_ga.py @@ -7,6 +7,7 @@ from promptolution.optimizers.base_optimizer import BaseOptimizer from promptolution.utils.prompt import Prompt, sort_prompts_by_scores +from promptolution.utils.templates import EVOPROMPT_GA_TEMPLATE_TD if TYPE_CHECKING: # pragma: no cover from promptolution.llms.base_llm import BaseLLM @@ -49,20 +50,21 @@ def __init__( self, predictor: "BasePredictor", task: "BaseTask", - prompt_template: str, meta_llm: "BaseLLM", initial_prompts: Optional[List[str]] = None, + prompt_template: Optional[str] = None, selection_mode: str = "wheel", callbacks: Optional[List["BaseCallback"]] = None, config: Optional["ExperimentConfig"] = None, ) -> None: """Initialize the EvoPromptGA optimizer.""" - self.prompt_template = prompt_template self.meta_llm = meta_llm self.selection_mode = selection_mode super().__init__( predictor=predictor, initial_prompts=initial_prompts, task=task, callbacks=callbacks, config=config ) + self.prompt_template = self._initialize_meta_template(prompt_template or EVOPROMPT_GA_TEMPLATE_TD) + assert self.selection_mode in ["random", "wheel", "tour"], "Invalid 
selection mode." def _pre_optimization_loop(self) -> None: diff --git a/promptolution/optimizers/opro.py b/promptolution/optimizers/opro.py index a37ed81..e7b9048 100644 --- a/promptolution/optimizers/opro.py +++ b/promptolution/optimizers/opro.py @@ -56,13 +56,13 @@ def __init__( config: "ExperimentConfig" overwriting default parameters """ self.meta_llm = meta_llm - self.meta_prompt_template = prompt_template or OPRO_TEMPLATE self.max_num_instructions = max_num_instructions self.num_instructions_per_step = num_instructions_per_step self.num_few_shots = num_few_shots super().__init__( predictor=predictor, task=task, initial_prompts=initial_prompts, callbacks=callbacks, config=config ) + self.meta_prompt_template = self._initialize_meta_template(prompt_template or OPRO_TEMPLATE) def _sample_examples(self) -> str: """Sample few-shot examples from the dataset. diff --git a/tests/mocks/mock_task.py b/tests/mocks/mock_task.py index b5e1d14..ecd717f 100644 --- a/tests/mocks/mock_task.py +++ b/tests/mocks/mock_task.py @@ -39,7 +39,7 @@ def __init__(self, predetermined_scores=None): self.x_column = "x" self.y_column = "y" # Default attributes similar to ClassificationTask - self.description = "Mock classification task" + self.task_description = "Mock classification task" self.classes = ["positive", "neutral", "negative"] self.initial_prompts = ["Classify:", "Determine:"] self.n_blocks = 10 diff --git a/tests/optimizers/test_capo.py b/tests/optimizers/test_capo.py index 834ba67..a3a2048 100644 --- a/tests/optimizers/test_capo.py +++ b/tests/optimizers/test_capo.py @@ -6,6 +6,7 @@ from promptolution.optimizers.capo import CAPO from promptolution.utils.prompt import Prompt +from promptolution.utils.templates import CAPO_CROSSOVER_TEMPLATE, CAPO_MUTATION_TEMPLATE def test_capo_initialization(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_df): @@ -184,3 +185,46 @@ def test_do_racing(mock_meta_llm, mock_predictor, initial_prompts, mock_df): assert mock_task.reset_block_idx.call_count == 2 assert mock_task.increment_block_idx.call_count == 3 + + +def test_capo_crossover_prompt(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_df): + """Test that when _crossover is called, the mock_meta_llm received a call with the correct meta prompt.""" + optimizer = CAPO( + predictor=mock_predictor, + task=mock_task, + meta_llm=mock_meta_llm, + initial_prompts=initial_prompts, + df_few_shots=mock_df, + ) + + mother = Prompt("Classify the sentiment of the text.", ["Input: I love this! Output: Positive"]) + father = Prompt("Determine if the review is positive or negative.", ["Input: This is terrible. Output: Negative"]) + optimizer._crossover([mother, father]) + + expected_meta_prompt = ( + CAPO_CROSSOVER_TEMPLATE.replace("", mother.instruction) + .replace("", father.instruction) + .replace("", mock_task.task_description) + ) + + assert mock_meta_llm.call_history[0]["prompts"][0] == expected_meta_prompt + + +def test_capo_mutate_prompt(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_df): + """Test that when _mutate is called, the mock_meta_llm received a call with the correct meta prompt.""" + optimizer = CAPO( + predictor=mock_predictor, + task=mock_task, + meta_llm=mock_meta_llm, + initial_prompts=initial_prompts, + df_few_shots=mock_df, + ) + + parent = Prompt("Classify the sentiment of the text.", ["Input: I love this! 
Output: Positive"]) + optimizer._mutate([parent]) + + expected_meta_prompt = CAPO_MUTATION_TEMPLATE.replace("", parent.instruction).replace( + "", mock_task.task_description + ) + + assert mock_meta_llm.call_history[0]["prompts"][0] == expected_meta_prompt From 527270383507617f549df387c29d9c2958c4861b Mon Sep 17 00:00:00 2001 From: finitearth Date: Sat, 22 Nov 2025 16:19:52 +0100 Subject: [PATCH 12/43] incoperate extraction description --- .coverage | Bin 69632 -> 69632 bytes promptolution/helpers.py | 1 - promptolution/optimizers/base_optimizer.py | 3 +++ tests/mocks/mock_task.py | 1 - tests/optimizers/test_capo.py | 7 +++++-- 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.coverage b/.coverage index 64215bc3ebb237f7e6e9a4333895ce6fbf63e9be..44c7c72d750d8c0618335cd34d0a10e261ad14b2 100644 GIT binary patch delta 273 zcmV+s0q*{Qpag)R1h58x0aKF)fFw(34o?n14m1uQ4hIeH4cZO54XX{K4T}wX4QmZi z4MGhY4D$@d44e#<42ukZ3~~%+3|0(53>*vz3-t@$3(*U=3z-Xr3wjG}3sbWZ5Zwxs zQH#(Xs?$GCKbD#NnC)cx(Iu09WaXdvGyiT)Ki89fZ32@&j9V!uB?JKpP$lXD(EkS@ z0r(OCi{G9A;4c9H004gg008h000000fCB)4lf;az392#(xZU0Y&>0In2uI4T=qV4QdTg z4M7bW4Dt-b44Mp*42cYV3~&r&3{(t13>pjv3-b%y3(pI+3zrLn3wR4_3sJKX5Zelq zQ;X0ZYSTYXKbD#NnC)cx(Iu09WaXdvGyiT)Klh1#_5_nbj9V%wB?JKpQYGpF(EkS@ z0r(OC8^1jPz+VCY008~~007`00000000#g70h7gytqE#n5OBM_1)vX;dySJ3(hDpD z008~>>TVSP2rv)03zHv?M>BE>FA@X+31AZ1D&Us^{r?XD0I+Xs2E_#Ux?kcIwF3O} Z^$K7;Pb+|z)r{JD1y>Hh0kfEnXo2~ List[Promp llm = get_llm(config=config) predictor = get_predictor(llm, config=config) - config.task_description = (config.task_description or "") + " " + (predictor.extraction_description or "") if config.optimizer == "capo" and (config.eval_strategy is None or "block" not in config.eval_strategy): logger.warning("📌 CAPO requires block evaluation strategy. Setting it to 'sequential_block'.") config.eval_strategy = "sequential_block" diff --git a/promptolution/optimizers/base_optimizer.py b/promptolution/optimizers/base_optimizer.py index 42f432e..7264f6f 100644 --- a/promptolution/optimizers/base_optimizer.py +++ b/promptolution/optimizers/base_optimizer.py @@ -131,9 +131,12 @@ def _on_train_end(self) -> None: def _initialize_meta_template(self, template: str) -> str: task_description = getattr(self.task, "task_description") + extraction_description = getattr(self.predictor, "extraction_description") if self.config is not None and getattr(self.config, "task_description") is not None: task_description = self.config.task_description if task_description is None: logger.warning("Task description is not provided. Please make sure to include relevant task details.") task_description = "" + if extraction_description is not None: + task_description += "\n" + extraction_description return template.replace("", task_description) diff --git a/tests/mocks/mock_task.py b/tests/mocks/mock_task.py index ecd717f..9aeb46c 100644 --- a/tests/mocks/mock_task.py +++ b/tests/mocks/mock_task.py @@ -2,7 +2,6 @@ from unittest.mock import MagicMock -import numpy as np import pandas as pd from typing import List diff --git a/tests/optimizers/test_capo.py b/tests/optimizers/test_capo.py index a3a2048..05fb6c9 100644 --- a/tests/optimizers/test_capo.py +++ b/tests/optimizers/test_capo.py @@ -201,10 +201,12 @@ def test_capo_crossover_prompt(mock_meta_llm, mock_predictor, initial_prompts, m father = Prompt("Determine if the review is positive or negative.", ["Input: This is terrible. 
Output: Negative"]) optimizer._crossover([mother, father]) + full_task_desc = mock_task.task_description + "\n" + optimizer.predictor.extraction_description + expected_meta_prompt = ( CAPO_CROSSOVER_TEMPLATE.replace("", mother.instruction) .replace("", father.instruction) - .replace("", mock_task.task_description) + .replace("", full_task_desc) ) assert mock_meta_llm.call_history[0]["prompts"][0] == expected_meta_prompt @@ -219,12 +221,13 @@ def test_capo_mutate_prompt(mock_meta_llm, mock_predictor, initial_prompts, mock initial_prompts=initial_prompts, df_few_shots=mock_df, ) + full_task_desc = mock_task.task_description + "\n" + optimizer.predictor.extraction_description parent = Prompt("Classify the sentiment of the text.", ["Input: I love this! Output: Positive"]) optimizer._mutate([parent]) expected_meta_prompt = CAPO_MUTATION_TEMPLATE.replace("", parent.instruction).replace( - "", mock_task.task_description + "", full_task_desc ) assert mock_meta_llm.call_history[0]["prompts"][0] == expected_meta_prompt From 2cbaedb028fc031445530b688211f36680eebcf8 Mon Sep 17 00:00:00 2001 From: finitearth Date: Tue, 25 Nov 2025 13:06:43 +0100 Subject: [PATCH 13/43] renaming predictors AND prompt creation --- promptolution/helpers.py | 17 ++--- promptolution/optimizers/capo.py | 3 +- promptolution/predictors/__init__.py | 3 +- promptolution/predictors/base_predictor.py | 1 + .../predictors/first_occurence_predictor.py | 65 +++++++++++++++++ ...classifier.py => maker_based_predictor.py} | 69 ++----------------- promptolution/tasks/judge_tasks.py | 2 +- promptolution/utils/prompt_creation.py | 33 ++++++++- promptolution/utils/templates.py | 6 ++ ...test_classifiers.py => test_predictors.py} | 28 ++++---- tutorials/aime_eval.py | 4 +- tutorials/api_llm_demo.py | 4 +- tutorials/capo_demo.py | 4 +- tutorials/evoprompt_demo.py | 4 +- tutorials/opro_demo.py | 4 +- 15 files changed, 146 insertions(+), 101 deletions(-) create mode 100644 promptolution/predictors/first_occurence_predictor.py rename promptolution/predictors/{classifier.py => maker_based_predictor.py} (50%) rename tests/predictors/{test_classifiers.py => test_predictors.py} (80%) diff --git a/promptolution/helpers.py b/promptolution/helpers.py index b917332..5ad6d5e 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -28,7 +28,8 @@ from promptolution.optimizers.evoprompt_de import EvoPromptDE from promptolution.optimizers.evoprompt_ga import EvoPromptGA from promptolution.optimizers.opro import OPRO -from promptolution.predictors.classifier import FirstOccurrenceClassifier, MarkerBasedClassifier +from promptolution.predictors.first_occurence_predictor import FirstOccurrencePredictor +from promptolution.predictors.maker_based_predictor import MarkerBasedPredictor from promptolution.tasks.classification_tasks import ClassificationTask from promptolution.utils.logging import get_logger from promptolution.utils.templates import ( @@ -272,23 +273,23 @@ def get_predictor(downstream_llm=None, type: "PredictorType" = "marker", *args, """Factory function to create and return a predictor instance. This function supports three types of predictors: - 1. FirstOccurrenceClassifier: A predictor that classifies based on first occurrence of the label. - 2. MarkerBasedClassifier: A predictor that classifies based on a marker. + 1. FirstOccurrencePredictor: A predictor that classifies based on first occurrence of the label. + 2. MarkerBasedPredictor: A predictor that classifies based on a marker. 
Args: downstream_llm: The language model to use for prediction. type (Literal["first_occurrence", "marker"]): The type of predictor to create: - - "first_occurrence" for FirstOccurrenceClassifier - - "marker" (default) for MarkerBasedClassifier + - "first_occurrence" for FirstOccurrencePredictor + - "marker" (default) for MarkerBasedPredictor *args: Variable length argument list passed to the predictor constructor. **kwargs: Arbitrary keyword arguments passed to the predictor constructor. Returns: - An instance of FirstOccurrenceClassifier or MarkerBasedClassifier. + An instance of FirstOccurrencePredictor or MarkerBasedPredictor. """ if type == "first_occurrence": - return FirstOccurrenceClassifier(downstream_llm, *args, **kwargs) + return FirstOccurrencePredictor(downstream_llm, *args, **kwargs) elif type == "marker": - return MarkerBasedClassifier(downstream_llm, *args, **kwargs) + return MarkerBasedPredictor(downstream_llm, *args, **kwargs) else: raise ValueError(f"Invalid predictor type: '{type}'") diff --git a/promptolution/optimizers/capo.py b/promptolution/optimizers/capo.py index db476a0..301dd80 100644 --- a/promptolution/optimizers/capo.py +++ b/promptolution/optimizers/capo.py @@ -8,8 +8,6 @@ from typing import TYPE_CHECKING, Any, List, Optional, Tuple -from promptolution.utils.formatting import extract_from_tag - if TYPE_CHECKING: # pragma: no cover from promptolution.utils.callbacks import BaseCallback from promptolution.llms.base_llm import BaseLLM @@ -19,6 +17,7 @@ from promptolution.utils.test_statistics import TestStatistics from promptolution.optimizers.base_optimizer import BaseOptimizer +from promptolution.utils.formatting import extract_from_tag from promptolution.utils.logging import get_logger from promptolution.utils.prompt import Prompt, sort_prompts_by_scores from promptolution.utils.templates import CAPO_CROSSOVER_TEMPLATE, CAPO_FEWSHOT_TEMPLATE, CAPO_MUTATION_TEMPLATE diff --git a/promptolution/predictors/__init__.py b/promptolution/predictors/__init__.py index ddc9595..cf9d5de 100644 --- a/promptolution/predictors/__init__.py +++ b/promptolution/predictors/__init__.py @@ -1,4 +1,5 @@ """Module for LLM predictors.""" -from promptolution.predictors.classifier import FirstOccurrenceClassifier, MarkerBasedClassifier +from promptolution.predictors.first_occurence_predictor import FirstOccurrencePredictor +from promptolution.predictors.maker_based_predictor import MarkerBasedPredictor diff --git a/promptolution/predictors/base_predictor.py b/promptolution/predictors/base_predictor.py index a345872..292d56d 100644 --- a/promptolution/predictors/base_predictor.py +++ b/promptolution/predictors/base_predictor.py @@ -10,6 +10,7 @@ if TYPE_CHECKING: # pragma: no cover from promptolution.utils.config import ExperimentConfig + PredictorType = Literal["first_occurrence", "marker"] diff --git a/promptolution/predictors/first_occurence_predictor.py b/promptolution/predictors/first_occurence_predictor.py new file mode 100644 index 0000000..27a28dc --- /dev/null +++ b/promptolution/predictors/first_occurence_predictor.py @@ -0,0 +1,65 @@ +"""Module for the FirstOccurencePredictor.""" + +from typing import TYPE_CHECKING, List, Optional + +from promptolution.predictors.base_predictor import BasePredictor + +if TYPE_CHECKING: # pragma: no cover + from promptolution.llms.base_llm import BaseLLM + from promptolution.utils.config import ExperimentConfig + + +class FirstOccurrencePredictor(BasePredictor): + """A predictor class for classification tasks using language models. 
+ + This class takes a language model and a list of classes, and provides a method + to predict classes for given prompts and input data. The class labels are extracted + by matching the words in the prediction with the list of valid class labels. + The first occurrence of a valid class label in the prediction is used as the predicted class. + If no valid class label is found, the first class label in the list is used as the default prediction. + + Attributes: + llm: The language model used for generating predictions. + classes (List[str]): The list of valid class labels. + config (ExperimentConfig, optional): Configuration for the classifier, overriding defaults. + + Inherits from: + BasePredictor: The base class for predictors in the promptolution library. + """ + + def __init__(self, llm: "BaseLLM", classes: List[str], config: Optional["ExperimentConfig"] = None) -> None: + """Initialize the FirstOccurrencePredictor. + + Args: + llm: The language model to use for predictions. + classes (List[str]): The list of valid class labels. + config (ExperimentConfig, optional): Configuration for the classifier, overriding defaults. + """ + assert all([c.islower() for c in classes]), "Class labels should be lowercase." + self.classes = classes + + self.extraction_description = ( + f"The task is to classify the texts into one of those classes: {', '.join(classes)}." + "The first occurrence of a valid class label in the prediction is used as the predicted class." + ) + + super().__init__(llm, config) + + def _extract_preds(self, preds: List[str]) -> List[str]: + """Extract class labels from the predictions, based on the list of valid class labels. + + Args: + preds: The raw predictions from the language model. + """ + result = [] + for pred in preds: + predicted_class = self.classes[0] # use first class as default pred + for word in pred.split(): + word = "".join([c for c in word if c.isalnum()]).lower() + if word in self.classes: + predicted_class = word + break + + result.append(predicted_class) + + return result diff --git a/promptolution/predictors/classifier.py b/promptolution/predictors/maker_based_predictor.py similarity index 50% rename from promptolution/predictors/classifier.py rename to promptolution/predictors/maker_based_predictor.py index 2a4fa00..bf5dbcb 100644 --- a/promptolution/predictors/classifier.py +++ b/promptolution/predictors/maker_based_predictor.py @@ -1,9 +1,6 @@ -"""Module for classification predictors.""" +"""Module for the MarkerBasedPredictor.""" - -import numpy as np - -from typing import TYPE_CHECKING, Any, List, Optional +from typing import TYPE_CHECKING, List, Optional from promptolution.predictors.base_predictor import BasePredictor from promptolution.utils.formatting import extract_from_tag @@ -13,64 +10,8 @@ from promptolution.utils.config import ExperimentConfig -class FirstOccurrenceClassifier(BasePredictor): - """A predictor class for classification tasks using language models. - - This class takes a language model and a list of classes, and provides a method - to predict classes for given prompts and input data. The class labels are extracted - by matching the words in the prediction with the list of valid class labels. - The first occurrence of a valid class label in the prediction is used as the predicted class. - If no valid class label is found, the first class label in the list is used as the default prediction. - - Attributes: - llm: The language model used for generating predictions. - classes (List[str]): The list of valid class labels. 
- config (ExperimentConfig, optional): Configuration for the classifier, overriding defaults. - - Inherits from: - BasePredictor: The base class for predictors in the promptolution library. - """ - - def __init__(self, llm: "BaseLLM", classes: List[str], config: Optional["ExperimentConfig"] = None) -> None: - """Initialize the FirstOccurrenceClassifier. - - Args: - llm: The language model to use for predictions. - classes (List[str]): The list of valid class labels. - config (ExperimentConfig, optional): Configuration for the classifier, overriding defaults. - """ - assert all([c.islower() for c in classes]), "Class labels should be lowercase." - self.classes = classes - - self.extraction_description = ( - f"The task is to classify the texts into one of those classes: {', '.join(classes)}." - "The first occurrence of a valid class label in the prediction is used as the predicted class." - ) - - super().__init__(llm, config) - - def _extract_preds(self, preds: List[str]) -> List[str]: - """Extract class labels from the predictions, based on the list of valid class labels. - - Args: - preds: The raw predictions from the language model. - """ - result = [] - for pred in preds: - predicted_class = self.classes[0] # use first class as default pred - for word in pred.split(): - word = "".join([c for c in word if c.isalnum()]).lower() - if word in self.classes: - predicted_class = word - break - - result.append(predicted_class) - - return result - - -class MarkerBasedClassifier(BasePredictor): - """A predictor class for classification tasks using language models. +class MarkerBasedPredictor(BasePredictor): + """A predictor class task using language models. This class takes a language model and a list of classes, and provides a method to predict classes for given prompts and input data. The class labels are extracted. @@ -92,7 +33,7 @@ def __init__( end_marker: str = "", config: Optional["ExperimentConfig"] = None, ) -> None: - """Initialize the MarkerBasedClassifier. + """Initialize the MarkerBasedPredictor. Args: llm: The language model to use for predictions. diff --git a/promptolution/tasks/judge_tasks.py b/promptolution/tasks/judge_tasks.py index 9a82e66..8f9fbd7 100644 --- a/promptolution/tasks/judge_tasks.py +++ b/promptolution/tasks/judge_tasks.py @@ -132,7 +132,7 @@ def _evaluate(self, xs: List[str], ys: List[str], preds: List[str]) -> List[floa judge_responses = self.judge_llm.get_response(prompts) scores_str = extract_from_tag(judge_responses, "", "") scores = [] - for score_str, judge_response in zip(scores_str, judge_responses): + for score_str in scores_str: try: # only numeric chars, - or . 
are allowed score_str = "".join(filter(lambda c: c.isdigit() or c in "-.", score_str)) diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 0a5088a..6f2992e 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -14,6 +14,7 @@ from promptolution.tasks.classification_tasks import ClassificationTask from promptolution.utils.templates import ( PROMPT_CREATION_TEMPLATE, + PROMPT_CREATION_TEMPLATE_FROM_TASK_DESCRIPTION, PROMPT_CREATION_TEMPLATE_TD, PROMPT_VARIATION_TEMPLATE, ) @@ -50,7 +51,7 @@ def create_prompts_from_samples( llm: "BaseLLM", meta_prompt: Optional[str] = None, n_samples: int = 3, - task_description: Optional[str] = None, + task_description: str = None, n_prompts: int = 1, get_uniform_labels: bool = False, ) -> List[str]: @@ -119,3 +120,33 @@ def create_prompts_from_samples( prompts = extract_from_tag(prompts, "", "") return prompts + + +def create_prompts_from_task_description( + task_description: str, + llm: "BaseLLM", + meta_prompt: Optional[str] = None, + n_prompts: int = 1, +) -> List[str]: + """Generate a set of prompts from a given task description. + + Args: + task_description (str): The description of the task to generate prompts for. + llm (BaseLLM): The language model to use for generating the prompts. + meta_prompt (str): The meta prompt to use for generating the prompts. + If None, a default meta prompt is used. + n_prompts (int): The number of prompts to generate. + + Returns: + List[str]: A list of generated prompts. + """ + if meta_prompt is None: + meta_prompt = PROMPT_CREATION_TEMPLATE_FROM_TASK_DESCRIPTION + + meta_prompt = meta_prompt.replace("", task_description) + + meta_prompts = [meta_prompt for _ in range(n_prompts)] + prompts = llm.get_response(meta_prompts) + prompts = extract_from_tag(prompts, "", "") + + return prompts diff --git a/promptolution/utils/templates.py b/promptolution/utils/templates.py index 70c4b74..a0a85a6 100644 --- a/promptolution/utils/templates.py +++ b/promptolution/utils/templates.py @@ -138,6 +138,12 @@ The instruction was""" +PROMPT_CREATION_TEMPLATE_FROM_TASK_DESCRIPTION = """Please create a prompt for the following task, not using any placeholders, working universally, for any datapoint-specific instructions following each system prompt. 
+ +Task: + +Explicitly state this expected format as part of the prompts.""" + DOWNSTREAM_TEMPLATE = "" diff --git a/tests/predictors/test_classifiers.py b/tests/predictors/test_predictors.py similarity index 80% rename from tests/predictors/test_classifiers.py rename to tests/predictors/test_predictors.py index 54885b9..2f7e11f 100644 --- a/tests/predictors/test_classifiers.py +++ b/tests/predictors/test_predictors.py @@ -1,13 +1,13 @@ import numpy as np import pytest -from promptolution.helpers import FirstOccurrenceClassifier, MarkerBasedClassifier +from promptolution.helpers import FirstOccurrencePredictor, MarkerBasedPredictor def test_first_occurrence_classifier(mock_downstream_llm, mock_df): - """Test the FirstOccurrenceClassifier.""" + """Test the FirstOccurrencePredictor.""" # Create classifier - classifier = FirstOccurrenceClassifier(llm=mock_downstream_llm, classes=mock_df["y"].values) + classifier = FirstOccurrencePredictor(llm=mock_downstream_llm, classes=mock_df["y"].values) # Test with multiple inputs xs = ["I love this product!", "I hate this product!", "This product is okay.", "ja ne"] @@ -25,9 +25,9 @@ def test_first_occurrence_classifier(mock_downstream_llm, mock_df): def test_marker_based_classifier(mock_downstream_llm, mock_df): - """Test the MarkerBasedClassifier.""" + """Test the MarkerBasedPredictor.""" # Create classifier - classifier = MarkerBasedClassifier( + classifier = MarkerBasedPredictor( llm=mock_downstream_llm, classes=mock_df["y"].values, begin_marker="", @@ -56,9 +56,9 @@ def test_marker_based_classifier(mock_downstream_llm, mock_df): def test_marker_based_without_classes(mock_downstream_llm): - """Test MarkerBasedClassifier without predefined classes.""" + """Test MarkerBasedPredictor without predefined classes.""" # Create classifier without classes - classifier = MarkerBasedClassifier( + predictor = MarkerBasedPredictor( llm=mock_downstream_llm, classes=None, # No class restrictions begin_marker="", @@ -70,7 +70,7 @@ def test_marker_based_without_classes(mock_downstream_llm): prompts = ["Classify:"] * len(xs) # Make predictions - predictions = classifier.predict(prompts, xs) + predictions = predictor.predict(prompts, xs) # Verify shape and content - should accept any value between markers assert len(predictions) == 4 @@ -83,7 +83,7 @@ def test_marker_based_without_classes(mock_downstream_llm): def test_multiple_prompts_with_classifiers(mock_downstream_llm, mock_df): """Test using multiple prompts with classifiers.""" # Create classifier - classifier = FirstOccurrenceClassifier(llm=mock_downstream_llm, classes=mock_df["y"].values) + classifier = FirstOccurrencePredictor(llm=mock_downstream_llm, classes=mock_df["y"].values) # Test with multiple prompts prompts = ["Classify:", "Classify:", "Rate:", "Rate:"] @@ -103,7 +103,7 @@ def test_multiple_prompts_with_classifiers(mock_downstream_llm, mock_df): def test_sequence_return_with_classifiers(mock_downstream_llm, mock_df): """Test return_seq parameter with classifiers.""" # Create classifier - classifier = MarkerBasedClassifier(llm=mock_downstream_llm, classes=mock_df["y"].values) + classifier = MarkerBasedPredictor(llm=mock_downstream_llm, classes=mock_df["y"].values) # Test with return_seq=True prompts = ["Classify:"] @@ -128,15 +128,15 @@ def test_invalid_class_labels(mock_downstream_llm): # Should raise an assertion error with pytest.raises(AssertionError): - FirstOccurrenceClassifier(llm=mock_downstream_llm, classes=invalid_classes) + FirstOccurrencePredictor(llm=mock_downstream_llm, 
classes=invalid_classes) with pytest.raises(AssertionError): - MarkerBasedClassifier(llm=mock_downstream_llm, classes=invalid_classes) + MarkerBasedPredictor(llm=mock_downstream_llm, classes=invalid_classes) def test_marker_based_missing_markers(mock_downstream_llm): - """Test MarkerBasedClassifier behavior when markers are missing.""" - classifier = MarkerBasedClassifier(llm=mock_downstream_llm, classes=["will", "not", "be", "used"]) + """Test MarkerBasedPredictor behavior when markers are missing.""" + classifier = MarkerBasedPredictor(llm=mock_downstream_llm, classes=["will", "not", "be", "used"]) # When markers are missing, it should default to first class prompts = ["Classify:"] diff --git a/tutorials/aime_eval.py b/tutorials/aime_eval.py index d369a1b..728e288 100644 --- a/tutorials/aime_eval.py +++ b/tutorials/aime_eval.py @@ -8,7 +8,7 @@ from promptolution.llms import APILLM from promptolution.optimizers import CAPO -from promptolution.predictors import MarkerBasedClassifier +from promptolution.predictors import MarkerBasedPredictor from promptolution.tasks import ClassificationTask from promptolution.utils import LoggerCallback @@ -49,7 +49,7 @@ downstream_llm = llm meta_llm = llm -predictor = MarkerBasedClassifier(downstream_llm, classes=task.classes) +predictor = MarkerBasedPredictor(downstream_llm, classes=task.classes) callbacks = [LoggerCallback(logger)] diff --git a/tutorials/api_llm_demo.py b/tutorials/api_llm_demo.py index d369a1b..728e288 100644 --- a/tutorials/api_llm_demo.py +++ b/tutorials/api_llm_demo.py @@ -8,7 +8,7 @@ from promptolution.llms import APILLM from promptolution.optimizers import CAPO -from promptolution.predictors import MarkerBasedClassifier +from promptolution.predictors import MarkerBasedPredictor from promptolution.tasks import ClassificationTask from promptolution.utils import LoggerCallback @@ -49,7 +49,7 @@ downstream_llm = llm meta_llm = llm -predictor = MarkerBasedClassifier(downstream_llm, classes=task.classes) +predictor = MarkerBasedPredictor(downstream_llm, classes=task.classes) callbacks = [LoggerCallback(logger)] diff --git a/tutorials/capo_demo.py b/tutorials/capo_demo.py index a7cc53f..0a42335 100644 --- a/tutorials/capo_demo.py +++ b/tutorials/capo_demo.py @@ -8,7 +8,7 @@ from promptolution.llms import APILLM from promptolution.optimizers import CAPO -from promptolution.predictors import MarkerBasedClassifier +from promptolution.predictors import MarkerBasedPredictor from promptolution.tasks import ClassificationTask from promptolution.utils import FileOutputCallback, LoggerCallback, TokenCountCallback @@ -56,7 +56,7 @@ downstream_llm = llm meta_llm = llm -predictor = MarkerBasedClassifier(downstream_llm, classes=None) +predictor = MarkerBasedPredictor(downstream_llm, classes=None) optimizer = CAPO( task=task, diff --git a/tutorials/evoprompt_demo.py b/tutorials/evoprompt_demo.py index 6568230..1d00369 100644 --- a/tutorials/evoprompt_demo.py +++ b/tutorials/evoprompt_demo.py @@ -9,7 +9,7 @@ from promptolution.llms import APILLM from promptolution.optimizers import EVOPROMPT_GA_TEMPLATE, EvoPromptGA -from promptolution.predictors import MarkerBasedClassifier +from promptolution.predictors import MarkerBasedPredictor from promptolution.tasks import ClassificationTask from promptolution.utils import FileOutputCallback, LoggerCallback, TokenCountCallback @@ -60,7 +60,7 @@ downstream_llm = llm meta_llm = llm -predictor = MarkerBasedClassifier(downstream_llm, classes=task.classes) +predictor = MarkerBasedPredictor(downstream_llm, 
classes=task.classes) optimizer = EvoPromptGA( task=task, diff --git a/tutorials/opro_demo.py b/tutorials/opro_demo.py index 2b6ea93..760759d 100644 --- a/tutorials/opro_demo.py +++ b/tutorials/opro_demo.py @@ -8,7 +8,7 @@ from promptolution.llms import VLLM from promptolution.optimizers import OPRO, OPRO_TEMPLATE_TD -from promptolution.predictors import MarkerBasedClassifier +from promptolution.predictors import MarkerBasedPredictor from promptolution.tasks import ClassificationTask from promptolution.utils import FileOutputCallback, LoggerCallback, TokenCountCallback @@ -58,7 +58,7 @@ downstream_llm = llm meta_llm = llm -predictor = MarkerBasedClassifier(downstream_llm, classes=task.classes) +predictor = MarkerBasedPredictor(downstream_llm, classes=task.classes) optimizer = OPRO( task=task, From 350b54e1ad61dd963c0ab6ed2ce33d42138c4a02 Mon Sep 17 00:00:00 2001 From: finitearth Date: Tue, 25 Nov 2025 14:02:41 +0100 Subject: [PATCH 14/43] update prompt creation --- promptolution/utils/prompt_creation.py | 17 +++++++---------- promptolution/utils/templates.py | 6 ++++-- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 6f2992e..5330301 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -1,5 +1,6 @@ """Utility functions for prompt creation.""" +import json import numpy as np @@ -100,9 +101,9 @@ def create_prompts_from_samples( # sample xs: List[str] = [] ys: List[str] = [] - for label, num_samples in zip(unique_labels, samples_per_class): + for label, n_per_class in zip(unique_labels, samples_per_class): indices = np.where(task.ys == label)[0] - indices = np.random.choice(indices, n_samples, replace=False) + indices = np.random.choice(indices, n_per_class, replace=False) xs.extend(task.xs[indices]) ys.extend(task.ys[indices]) @@ -134,19 +135,15 @@ def create_prompts_from_task_description( task_description (str): The description of the task to generate prompts for. llm (BaseLLM): The language model to use for generating the prompts. meta_prompt (str): The meta prompt to use for generating the prompts. - If None, a default meta prompt is used. + If None, a default meta prompt is used. n_prompts (int): The number of prompts to generate. - - Returns: - List[str]: A list of generated prompts. """ if meta_prompt is None: meta_prompt = PROMPT_CREATION_TEMPLATE_FROM_TASK_DESCRIPTION - meta_prompt = meta_prompt.replace("", task_description) + meta_prompt = meta_prompt.replace("", task_description).replace("", str(n_prompts)) - meta_prompts = [meta_prompt for _ in range(n_prompts)] - prompts = llm.get_response(meta_prompts) - prompts = extract_from_tag(prompts, "", "") + prompts_str = llm.get_response(meta_prompt)[0] + prompts = json.loads(prompts_str) return prompts diff --git a/promptolution/utils/templates.py b/promptolution/utils/templates.py index a0a85a6..2bdea0b 100644 --- a/promptolution/utils/templates.py +++ b/promptolution/utils/templates.py @@ -138,11 +138,13 @@ The instruction was""" -PROMPT_CREATION_TEMPLATE_FROM_TASK_DESCRIPTION = """Please create a prompt for the following task, not using any placeholders, working universally, for any datapoint-specific instructions following each system prompt. +PROMPT_CREATION_TEMPLATE_FROM_TASK_DESCRIPTION = """Please create diverse system prompts for the following task, not using any placeholders, working universally, for any datapoint-specific instructions following each system prompt. 
Task: -Explicitly state this expected format as part of the prompts.""" +Explicitly state the expected format above by repeating its exact character sequence verbatim in every prompt. + +Create overall prompts within quotes as an array. Do not response with anything else. Start the array with [ and end with ]. Separate each prompt by a comma.""" DOWNSTREAM_TEMPLATE = "" From 1d458408d2514176719a89830f0849ae65c94f8a Mon Sep 17 00:00:00 2001 From: finitearth Date: Tue, 25 Nov 2025 14:17:33 +0100 Subject: [PATCH 15/43] optimize capo survival --- promptolution/optimizers/capo.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/promptolution/optimizers/capo.py b/promptolution/optimizers/capo.py index 301dd80..944a4ce 100644 --- a/promptolution/optimizers/capo.py +++ b/promptolution/optimizers/capo.py @@ -320,6 +320,10 @@ def filter_survivors( Returns: Tuple[List[Prompt], List[List[float]]]: Filtered candidates and their scores. """ - filtered_candidates = list(compress(candidates, mask)) - filtered_scores = list(compress(scores, mask)) + assert len(candidates) == len(mask), "Length of candidates, scores, and mask must be the same." + assert all(len(score) == len(mask) for score in scores), "Length of candidates, scores, and mask must be the same." + + filtered_candidates = [c for c, m in zip(candidates, mask) if m] + filtered_scores = [[s for s, m in zip(score, mask) if m] for score in scores] + return filtered_candidates, filtered_scores From 5ad89dccbdf2525ee97ac6190c4a8e0ef85ee303 Mon Sep 17 00:00:00 2001 From: finitearth Date: Wed, 26 Nov 2025 14:10:05 +0100 Subject: [PATCH 16/43] improve fileoutput callback handling --- promptolution/utils/callbacks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/promptolution/utils/callbacks.py b/promptolution/utils/callbacks.py index 083f749..98129e2 100644 --- a/promptolution/utils/callbacks.py +++ b/promptolution/utils/callbacks.py @@ -155,7 +155,7 @@ def on_step_end(self, optimizer: "BaseOptimizer") -> bool: "output_tokens": [optimizer.predictor.llm.output_token_count] * len(optimizer.prompts), "time": [datetime.now().timestamp()] * len(optimizer.prompts), "score": optimizer.scores, - "prompt": optimizer.prompts, + "prompt": [str(p) for p in optimizer.prompts], } ) From 6b668a37a79defe116e90bf116f57cdaf1528893 Mon Sep 17 00:00:00 2001 From: Tom Zehle Date: Wed, 26 Nov 2025 23:16:28 +0100 Subject: [PATCH 17/43] Change vllm test-dependency version to exact match --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0647c64..6a6999c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ pytest = ">=8.3.5" pytest-cov = ">=6.1.1" openai = ">=1.0.0" requests = ">=2.31.0" -vllm = ">=0.10.1.1" +vllm = "==0.10.1.1" transformers = ">=4.48.0" [tool.poetry.group.docs.dependencies] From 2b087cae49c961b72d52e667ace705510c514cb6 Mon Sep 17 00:00:00 2001 From: finitearth Date: Thu, 27 Nov 2025 08:18:16 +0100 Subject: [PATCH 18/43] incoperate comments --- promptolution/llms/vllm.py | 3 +- tutorials/aime_eval.py | 66 -------------------------------------- 2 files changed, 1 insertion(+), 68 deletions(-) delete mode 100644 tutorials/aime_eval.py diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py index f431878..f22ff52 100644 --- a/promptolution/llms/vllm.py +++ b/promptolution/llms/vllm.py @@ -1,10 +1,10 @@ """Module for running language models locally using the vLLM library.""" - from typing import TYPE_CHECKING, Any, Dict, 
List, Optional if TYPE_CHECKING: # pragma: no cover from promptolution.utils.config import ExperimentConfig + from transformers import PreTrainedTokenizer from promptolution.llms.base_llm import BaseLLM @@ -14,7 +14,6 @@ try: from transformers import AutoTokenizer # type: ignore - from transformers import PreTrainedTokenizer from vllm import LLM, SamplingParams imports_successful = True diff --git a/tutorials/aime_eval.py b/tutorials/aime_eval.py deleted file mode 100644 index 728e288..0000000 --- a/tutorials/aime_eval.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Test run for the Opro optimizer.""" - - -import argparse -from logging import Logger - -from datasets import load_dataset - -from promptolution.llms import APILLM -from promptolution.optimizers import CAPO -from promptolution.predictors import MarkerBasedPredictor -from promptolution.tasks import ClassificationTask -from promptolution.utils import LoggerCallback - -logger = Logger(__name__) - -"""Run a test run for any of the implemented optimizers.""" -parser = argparse.ArgumentParser() -parser.add_argument("--base-url", default="https://api.openai.com/v1") -parser.add_argument("--model", default="gpt-4o-2024-08-06") -# parser.add_argument("--base-url", default="https://api.deepinfra.com/v1/openai") -# parser.add_argument("--model", default="meta-llama/Meta-Llama-3-8B-Instruct") -# parser.add_argument("--base-url", default="https://api.anthropic.com/v1/") -# parser.add_argument("--model", default="claude-3-haiku-20240307") -parser.add_argument("--n-steps", type=int, default=2) -parser.add_argument("--token", default=None) -args = parser.parse_args() - -df = load_dataset("SetFit/ag_news", split="train", revision="main").to_pandas().sample(300) - -df["input"] = df["text"] -df["target"] = df["label_text"] - -task = ClassificationTask( - df, - task_description="The dataset contains news articles categorized into four classes: World, Sports, Business, and Tech. The task is to classify each news article into one of the four categories.", - x_column="input", - y_column="target", -) - -initial_prompts = [ - "Classify this news article as World, Sports, Business, or Tech. Provide your answer between and tags.", - "Read the following news article and determine which category it belongs to: World, Sports, Business, or Tech. Your classification must be placed between markers.", - "Your task is to identify whether this news article belongs to World, Sports, Business, or Tech news. Provide your classification between the markers .", - "Conduct a thorough analysis of the provided news article and classify it as belonging to one of these four categories: World, Sports, Business, or Tech. 
Your answer should be presented within markers.", -] - -llm = APILLM(api_url=args.base_url, model_id=args.model, api_key=args.token) -downstream_llm = llm -meta_llm = llm - -predictor = MarkerBasedPredictor(downstream_llm, classes=task.classes) - -callbacks = [LoggerCallback(logger)] - -optimizer = CAPO( - task=task, - predictor=predictor, - meta_llm=meta_llm, - initial_prompts=initial_prompts, - callbacks=callbacks, -) - -best_prompts = optimizer.optimize(n_steps=args.n_steps) - -logger.info(f"Optimized prompts: {best_prompts}") From fa1ddc47692a39901e50a8973db1013bee1afdb0 Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 28 Nov 2025 09:53:02 +0100 Subject: [PATCH 19/43] incoperated comments --- .pre-commit-config.yaml | 5 ++++ promptolution/helpers.py | 23 ++++++++----------- promptolution/llms/api_llm.py | 8 +++---- promptolution/optimizers/capo.py | 7 +++--- promptolution/predictors/__init__.py | 8 +++++-- ...ictor.py => first_occurrence_predictor.py} | 2 +- promptolution/tasks/base_task.py | 4 ++-- promptolution/utils/prompt.py | 3 ++- promptolution/utils/prompt_creation.py | 4 ++-- tests/helpers/test_helpers.py | 4 ++-- 10 files changed, 37 insertions(+), 31 deletions(-) rename promptolution/predictors/{first_occurence_predictor.py => first_occurrence_predictor.py} (98%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 22c8d88..e0e22cf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,6 +14,11 @@ repos: rev: 6.0.0 hooks: - id: flake8 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.14.6 + hooks: + - id: ruff + args: [ --fix ] - repo: https://github.com/pycqa/isort rev: 5.12.0 hooks: diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 5ad6d5e..0548a42 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -1,6 +1,6 @@ """Helper functions for the usage of the libary.""" -from typing import TYPE_CHECKING, Callable, List, Literal, Optional +from typing import TYPE_CHECKING, Callable, List, Literal, Optional, Union, cast from promptolution.tasks.judge_tasks import JudgeTask from promptolution.tasks.reward_tasks import RewardTask @@ -28,20 +28,10 @@ from promptolution.optimizers.evoprompt_de import EvoPromptDE from promptolution.optimizers.evoprompt_ga import EvoPromptGA from promptolution.optimizers.opro import OPRO -from promptolution.predictors.first_occurence_predictor import FirstOccurrencePredictor +from promptolution.predictors.first_occurrence_predictor import FirstOccurrencePredictor from promptolution.predictors.maker_based_predictor import MarkerBasedPredictor from promptolution.tasks.classification_tasks import ClassificationTask from promptolution.utils.logging import get_logger -from promptolution.utils.templates import ( - CAPO_CROSSOVER_TEMPLATE, - CAPO_MUTATION_TEMPLATE, - EVOPROMPT_DE_TEMPLATE, - EVOPROMPT_DE_TEMPLATE_TD, - EVOPROMPT_GA_TEMPLATE, - EVOPROMPT_GA_TEMPLATE_TD, - OPRO_TEMPLATE, - OPRO_TEMPLATE_TD, -) logger = get_logger(__name__) @@ -95,13 +85,15 @@ def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[Promp logger.warning("🔥 Starting optimization...") prompts = optimizer.optimize(n_steps=config.n_steps) - if hasattr(config, "prepend_exemplars") and config.prepend_exemplars: + if hasattr(config, "posthoc_exemplar_selection") and config.posthoc_exemplar_selection: selector = get_exemplar_selector(config.exemplar_selector, task, predictor) prompts = [selector.select_exemplars(p, n_examples=config.n_exemplars) for p in prompts] return prompts 
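
A minimal sketch of the helper workflow these changes touch; the `ExperimentConfig` fields follow the test fixtures in this series, while the dataframe, endpoint, and model id are purely illustrative.

```python
# Minimal sketch of run_optimization / run_evaluation after this change.
# Config fields mirror the test fixtures in this series; data, endpoint and
# model id are placeholders rather than a recommended setup.
import pandas as pd

from promptolution.helpers import run_evaluation, run_optimization
from promptolution.utils import ExperimentConfig

df = pd.DataFrame(
    {
        "x": ["I love this!", "Terrible experience.", "It was fine."],
        "y": ["positive", "negative", "neutral"],
    }
)

config = ExperimentConfig(
    optimizer="capo",
    task_description="Classify sentiment as positive, neutral or negative.",
    prompts=["Classify the sentiment of the text."],
    predictor_name="marker",
    classes=["positive", "neutral", "negative"],
    n_steps=2,
    posthoc_exemplar_selection=False,
    api_url="https://api.openai.com/v1",  # illustrative
    model_id="gpt-4o-2024-08-06",  # illustrative
)

best_prompts = run_optimization(df, config)
# run_evaluation now accepts both Prompt objects and plain strings.
results = run_evaluation(df, config, best_prompts)
print(results.head())
```
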
-def run_evaluation(df: pd.DataFrame, config: "ExperimentConfig", prompts: List[str]) -> pd.DataFrame: +def run_evaluation( + df: pd.DataFrame, config: "ExperimentConfig", prompts: Union[List[Prompt], List[str]] +) -> pd.DataFrame: """Run the evaluation phase of the experiment. Configures all LLMs (downstream, meta, and judge) to use @@ -119,6 +111,9 @@ def run_evaluation(df: pd.DataFrame, config: "ExperimentConfig", prompts: List[s task = get_task(df, config, judge_llm=llm) predictor = get_predictor(llm, config=config) logger.warning("📊 Starting evaluation...") + if isinstance(prompts[0], str): + str_prompts = cast(List[str], prompts) + prompts = [Prompt(p) for p in str_prompts] scores = task.evaluate(prompts, predictor, eval_strategy="full") df = pd.DataFrame(dict(prompt=prompts, score=scores)) df = df.sort_values("score", ascending=False, ignore_index=True) diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index e3fa699..e3e0ab0 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -147,7 +147,7 @@ async def _ainvoke_with_retries(self, prompt: str, system_prompt: str) -> str: if attempt < self.max_retries: delay = self.retry_base_delay_s * (2**attempt) logger.warning( - "meta LLM call failed (%d/%d): %s — retrying in %.2fs", + "LLM call failed (%d/%d): %s — retrying in %.2fs", attempt + 1, self.max_retries + 1, e, @@ -181,7 +181,7 @@ async def _aget_batch(self, prompts: List[str], system_prompts: List[str]) -> Li except asyncio.TimeoutError: for t in tasks: t.cancel() - raise TimeoutError(f"Meta LLM batch timed out after {self.gather_timeout_s}s") + raise TimeoutError(f"LLM batch timed out after {self.gather_timeout_s}s") outs: List[str] = [] first_exc: Optional[BaseException] = None @@ -197,7 +197,7 @@ async def _aget_batch(self, prompts: List[str], system_prompts: List[str]) -> Li for t in tasks: if not t.done(): t.cancel() - raise RuntimeError(f"Meta LLM batch failed: {first_exc}") from first_exc + raise RuntimeError(f"LLM batch failed: {first_exc}") from first_exc return outs @@ -240,6 +240,6 @@ def _get_response(self, prompts: List[str], system_prompts: List[str]) -> List[s return r except FuturesTimeout: fut.cancel() - raise TimeoutError(f"Meta LLM batch (future) timed out after {self.gather_timeout_s + 5.0}s") + raise TimeoutError(f"LLM batch (future) timed out after {self.gather_timeout_s + 5.0}s") except Exception: raise diff --git a/promptolution/optimizers/capo.py b/promptolution/optimizers/capo.py index 944a4ce..4cdd8fd 100644 --- a/promptolution/optimizers/capo.py +++ b/promptolution/optimizers/capo.py @@ -1,7 +1,6 @@ """Implementation of the CAPO (Cost-Aware Prompt Optimization) algorithm.""" import random -from itertools import compress import numpy as np import pandas as pd @@ -320,8 +319,10 @@ def filter_survivors( Returns: Tuple[List[Prompt], List[List[float]]]: Filtered candidates and their scores. """ - assert len(candidates) == len(mask), "Length of candidates, scores, and mask must be the same." - assert all(len(score) == len(mask) for score in scores), "Length of candidates, scores, and mask must be the same." + assert len(candidates) == len(mask), "Length of candidates, and mask must be the same." + assert all( + len(candidates) == len(score) for score in scores + ), "Each score list must have the same length as candidates." 
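
A standalone sketch of the behavior these assertions guard, assuming `filter_survivors` remains importable as a module-level helper from `promptolution.optimizers.capo`; the prompts and scores are made up.

```python
# Standalone sketch: the mask filters candidates, and every per-block score
# list is filtered with the same mask so the two stay aligned.
from promptolution.optimizers.capo import filter_survivors
from promptolution.utils.prompt import Prompt

candidates = [Prompt("prompt a"), Prompt("prompt b"), Prompt("prompt c")]
scores = [[0.9, 0.4, 0.7], [0.8, 0.5, 0.6]]  # two evaluated blocks, one score per candidate
mask = [True, False, True]

survivors, surviving_scores = filter_survivors(candidates, scores, mask)
# survivors        -> [Prompt("prompt a"), Prompt("prompt c")]
# surviving_scores -> [[0.9, 0.7], [0.8, 0.6]]
```
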
filtered_candidates = [c for c, m in zip(candidates, mask) if m] filtered_scores = [[s for s, m in zip(score, mask) if m] for score in scores] diff --git a/promptolution/predictors/__init__.py b/promptolution/predictors/__init__.py index cf9d5de..8751335 100644 --- a/promptolution/predictors/__init__.py +++ b/promptolution/predictors/__init__.py @@ -1,5 +1,9 @@ """Module for LLM predictors.""" - -from promptolution.predictors.first_occurence_predictor import FirstOccurrencePredictor +from promptolution.predictors.first_occurrence_predictor import FirstOccurrencePredictor from promptolution.predictors.maker_based_predictor import MarkerBasedPredictor + +__all__ = [ + "FirstOccurrencePredictor", + "MarkerBasedPredictor", +] diff --git a/promptolution/predictors/first_occurence_predictor.py b/promptolution/predictors/first_occurrence_predictor.py similarity index 98% rename from promptolution/predictors/first_occurence_predictor.py rename to promptolution/predictors/first_occurrence_predictor.py index 27a28dc..7e84f0c 100644 --- a/promptolution/predictors/first_occurence_predictor.py +++ b/promptolution/predictors/first_occurrence_predictor.py @@ -1,4 +1,4 @@ -"""Module for the FirstOccurencePredictor.""" +"""Module for the FirstOccurrencePredictor.""" from typing import TYPE_CHECKING, List, Optional diff --git a/promptolution/tasks/base_task.py b/promptolution/tasks/base_task.py index a364ae3..2f1c164 100644 --- a/promptolution/tasks/base_task.py +++ b/promptolution/tasks/base_task.py @@ -6,9 +6,9 @@ import numpy as np import pandas as pd -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union, overload +from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union, overload -from promptolution.optimizers.base_optimizer import Prompt +from promptolution.utils.prompt import Prompt if TYPE_CHECKING: # pragma: no cover from promptolution.predictors.base_predictor import BasePredictor diff --git a/promptolution/utils/prompt.py b/promptolution/utils/prompt.py index 8ff3345..d660e49 100644 --- a/promptolution/utils/prompt.py +++ b/promptolution/utils/prompt.py @@ -55,9 +55,10 @@ def sort_prompts_by_scores( Args: prompts (List[Prompt]): List of Prompt objects. scores (List[float]): Corresponding list of scores. + top_k (Optional[int]): If provided, limits the result to the top_k prompts. Defaults to None (returns all). Returns: - List[Prompt]: Prompts sorted by scores in descending order. + Tuple[List[Prompt], List[float]]: A tuple containing prompts sorted by scores in descending order and their corresponding sorted scores. """ assert len(prompts) == len(scores), "Prompts and scores must have the same length." diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 5330301..67879de 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -52,7 +52,7 @@ def create_prompts_from_samples( llm: "BaseLLM", meta_prompt: Optional[str] = None, n_samples: int = 3, - task_description: str = None, + task_description: Optional[str] = None, n_prompts: int = 1, get_uniform_labels: bool = False, ) -> List[str]: @@ -127,7 +127,7 @@ def create_prompts_from_task_description( task_description: str, llm: "BaseLLM", meta_prompt: Optional[str] = None, - n_prompts: int = 1, + n_prompts: int = 10, ) -> List[str]: """Generate a set of prompts from a given task description. 
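
A hedged usage sketch of the updated prompt bootstrapping (task-description meta prompt, JSON-array response, `n_prompts` defaulting to 10); the endpoint, model id, and task description are placeholders.

```python
# Sketch of bootstrapping initial prompts from a task description alone.
# The meta LLM is asked to answer with a JSON array of prompt strings, which
# create_prompts_from_task_description parses into a Python list.
from promptolution.llms import APILLM
from promptolution.utils.prompt_creation import create_prompts_from_task_description

meta_llm = APILLM(api_url="https://api.openai.com/v1", model_id="gpt-4o-2024-08-06", api_key="...")

initial_prompts = create_prompts_from_task_description(
    task_description="Classify news articles into World, Sports, Business, or Tech.",
    llm=meta_llm,
    n_prompts=10,
)
for prompt in initial_prompts[:3]:
    print(prompt)
```
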
diff --git a/tests/helpers/test_helpers.py b/tests/helpers/test_helpers.py index 678cd9f..59328b1 100644 --- a/tests/helpers/test_helpers.py +++ b/tests/helpers/test_helpers.py @@ -40,7 +40,7 @@ def experiment_config(): predictor_name="first_occurrence", classes=["positive", "neutral", "negative"], n_steps=2, - prepend_exemplars=False, + posthoc_exemplar_selection=False, ) @@ -55,7 +55,7 @@ def experiment_config_with_exemplars(): predictor_name="first_occurrence", classes=["positive", "neutral", "negative"], n_steps=2, - prepend_exemplars=True, + posthoc_exemplar_selection=True, exemplar_selector="random", n_exemplars=2, ) From 275e5e6f07402d7b19b18a013ba90bc07fd8c0d7 Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 28 Nov 2025 09:55:03 +0100 Subject: [PATCH 20/43] specify ruff arguments --- .pre-commit-config.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e0e22cf..081d2ef 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,8 +17,9 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.14.6 hooks: - - id: ruff + - id: ruff-check args: [ --fix ] + - id: ruff-format - repo: https://github.com/pycqa/isort rev: 5.12.0 hooks: From a44bc485a248acf141353934f33b47334d9548a5 Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 28 Nov 2025 10:01:36 +0100 Subject: [PATCH 21/43] make ruff happy --- .pre-commit-config.yaml | 1 - promptolution/exemplar_selectors/__init__.py | 5 +++ promptolution/llms/__init__.py | 7 +++- promptolution/optimizers/__init__.py | 8 ++++- promptolution/tasks/__init__.py | 8 +++++ promptolution/utils/__init__.py | 34 +++++++++++++++++++- 6 files changed, 59 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 081d2ef..f08867b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,6 @@ repos: hooks: - id: ruff-check args: [ --fix ] - - id: ruff-format - repo: https://github.com/pycqa/isort rev: 5.12.0 hooks: diff --git a/promptolution/exemplar_selectors/__init__.py b/promptolution/exemplar_selectors/__init__.py index 62e6c9a..e948a3a 100644 --- a/promptolution/exemplar_selectors/__init__.py +++ b/promptolution/exemplar_selectors/__init__.py @@ -2,3 +2,8 @@ from promptolution.exemplar_selectors.random_search_selector import RandomSearchSelector from promptolution.exemplar_selectors.random_selector import RandomSelector + +__all__ = [ + "RandomSelector", + "RandomSearchSelector", +] diff --git a/promptolution/llms/__init__.py b/promptolution/llms/__init__.py index 7fd7b97..8110f87 100644 --- a/promptolution/llms/__init__.py +++ b/promptolution/llms/__init__.py @@ -1,6 +1,11 @@ """Module for Large Language Models.""" - from promptolution.llms.api_llm import APILLM from promptolution.llms.local_llm import LocalLLM from promptolution.llms.vllm import VLLM + +__all__ = [ + "APILLM", + "LocalLLM", + "VLLM", +] diff --git a/promptolution/optimizers/__init__.py b/promptolution/optimizers/__init__.py index 9f82a8f..4b7a7db 100644 --- a/promptolution/optimizers/__init__.py +++ b/promptolution/optimizers/__init__.py @@ -1,7 +1,13 @@ """Module for prompt optimizers.""" - from promptolution.optimizers.capo import CAPO from promptolution.optimizers.evoprompt_de import EvoPromptDE from promptolution.optimizers.evoprompt_ga import EvoPromptGA from promptolution.optimizers.opro import OPRO + +__all__ = [ + "CAPO", + "EvoPromptDE", + "EvoPromptGA", + "OPRO", +] diff --git 
a/promptolution/tasks/__init__.py b/promptolution/tasks/__init__.py index 7222256..825dbad 100644 --- a/promptolution/tasks/__init__.py +++ b/promptolution/tasks/__init__.py @@ -1,3 +1,11 @@ """Module for task-related functions and classes.""" from promptolution.tasks.classification_tasks import ClassificationTask +from promptolution.tasks.judge_tasks import JudgeTask +from promptolution.tasks.reward_tasks import RewardTask + +__all__ = [ + "ClassificationTask", + "JudgeTask", + "RewardTask", +] diff --git a/promptolution/utils/__init__.py b/promptolution/utils/__init__.py index 11eb2ea..ff81f03 100644 --- a/promptolution/utils/__init__.py +++ b/promptolution/utils/__init__.py @@ -1,6 +1,5 @@ """Module for utility functions and classes.""" - from promptolution.utils.callbacks import ( BestPromptCallback, FileOutputCallback, @@ -30,3 +29,36 @@ ) from promptolution.utils.test_statistics import TestStatistics, get_test_statistic_func, paired_t_test from promptolution.utils.token_counter import get_token_counter + +__all__ = [ + "BestPromptCallback", + "FileOutputCallback", + "LoggerCallback", + "ProgressBarCallback", + "TokenCountCallback", + "ExperimentConfig", + "get_logger", + "setup_logging", + "Prompt", + "sort_prompts_by_scores", + "create_prompt_variation", + "create_prompts_from_samples", + "CAPO_CROSSOVER_TEMPLATE", + "CAPO_FEWSHOT_TEMPLATE", + "CAPO_MUTATION_TEMPLATE", + "DEFAULT_SYS_PROMPT", + "DOWNSTREAM_TEMPLATE", + "EVOPROMPT_DE_TEMPLATE", + "EVOPROMPT_DE_TEMPLATE_TD", + "EVOPROMPT_GA_TEMPLATE", + "EVOPROMPT_GA_TEMPLATE_TD", + "OPRO_TEMPLATE", + "OPRO_TEMPLATE_TD", + "PROMPT_CREATION_TEMPLATE", + "PROMPT_CREATION_TEMPLATE_TD", + "PROMPT_VARIATION_TEMPLATE", + "TestStatistics", + "get_test_statistic_func", + "paired_t_test", + "get_token_counter", +] From 3fde9bd1ea279977ad687cf2ee17cfc601299146 Mon Sep 17 00:00:00 2001 From: Tom Zehle Date: Fri, 28 Nov 2025 10:25:42 +0100 Subject: [PATCH 22/43] first draft for read me --- README.md | 129 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 73 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index 9288857..1fa937a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ -![promptolution](https://github.com/user-attachments/assets/84c050bd-61a1-4f2e-bc4e-874d9b4a69af) ![Coverage](https://img.shields.io/badge/Coverage-91%25-brightgreen) [![CI](https://github.com/finitearth/promptolution/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/finitearth/promptolution/actions/workflows/ci.yml) @@ -7,104 +6,122 @@ ![Python Versions](https://img.shields.io/badge/Python%20Versions-≥3.10-blue) [![Getting Started](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/finitearth/promptolution/blob/main/tutorials/getting_started.ipynb) -Promptolution is a library that provides a modular and extensible framework for implementing prompt tuning for single tasks and larger experiments. It offers a user-friendly interface to assemble the core components for various prompt optimization tasks. +![promptolution](https://github.com/user-attachments/assets/84c050bd-61a1-4f2e-bc4e-874d9b4a69af) + + + +

+ + + + + +

+ + -This project was developed by [Timo Heiß](https://www.linkedin.com/in/timo-heiss/), [Moritz Schlager](https://www.linkedin.com/in/moritz-schlager/) and [Tom Zehle](https://www.linkedin.com/in/tom-zehle/) as part of a study program at LMU Munich. +## 🚀 What is Promptolution? -## Installation +**Promptolution** is a modular framework for *serious* prompt optimization — built for researchers who want full control over optimizers, datasets, evaluation, and logging. +Unlike end-to-end agent frameworks (DSPy, LangGraph…), Promptolution focuses **exclusively** on the prompt optimization phase, with clean abstractions, transparent internals, and an extensible API. -Use pip to install our library: +It supports: + +* single-task prompt optimization +* large-scale experiments +* local + API-based LLMs +* fast parallelization +* clean logs for reproducible research + +Developed by **Timo Heiß**, **Moritz Schlager**, and **Tom Zehle** (LMU Munich, MCML, ELLIS, TUM, Uni Freiburg). + + + +## 📦 Installation ``` pip install promptolution[api] ``` -If you want to run your prompt optimization locally, either via transformers or vLLM, consider running: +Local inference via vLLM or transformers: ``` pip install promptolution[vllm,transformers] ``` -Alternatively, clone the repository, run +From source: ``` +git clone https://github.com/finitearth/promptolution.git +cd promptolution poetry install ``` -to install the necessary dependencies. You might need to install [pipx](https://pipx.pypa.io/stable/installation/) and [poetry](https://python-poetry.org/docs/) first. -## Usage -To get started right away, take a look at our [getting started notebook](https://github.com/finitearth/promptolution/blob/main/tutorials/getting_started.ipynb) and our [other demos and tutorials](https://github.com/finitearth/promptolution/blob/main/tutorials). -For more details, a comprehensive **documentation** with API reference is availabe at https://finitearth.github.io/promptolution/. +## 🔧 Quickstart -### Featured Optimizers +Start with the **Getting Started tutorial**: +[https://github.com/finitearth/promptolution/blob/main/tutorials/getting_started.ipynb](https://github.com/finitearth/promptolution/blob/main/tutorials/getting_started.ipynb) -| **Name** | **Paper** | **init prompts** | **Exploration** | **Costs** | **Parallelizable** | **Utilizes Fewshot Examples** | -| :-----------: | :----------------------------------------------: | :--------------: | :-------------: | :-------: | :-------------------: | :---------------------------: | -| `CAPO` | [Zehle et al.](https://arxiv.org/abs/2504.16005) | _required_ | 👍 | 💲 | ✅ | ✅ | -| `EvoPromptDE` | [Guo et al.](https://arxiv.org/abs/2309.08532) | _required_ | 👍 | 💲💲 | ✅ | ❌ | -| `EvoPromptGA` | [Guo et al.](https://arxiv.org/abs/2309.08532) | _required_ | 👍 | 💲💲 | ✅ | ❌ | -| `OPRO` | [Yang et al.](https://arxiv.org/abs/2309.03409) | _optional_ | 👎 | 💲💲 | ❌ | ❌ | +Full docs: +[https://finitearth.github.io/promptolution/](https://finitearth.github.io/promptolution/) -### Core Components -- `Task`: Encapsulates initial prompts, dataset features, targets, and evaluation methods. -- `Predictor`: Implements the prediction logic, interfacing between the `Task` and `LLM` components. -- `LLM`: Unifies the process of obtaining responses from language models, whether locally hosted or accessed via API. -- `Optimizer`: Implements prompt optimization algorithms, utilizing the other components during the optimization process. 
-### Key Features +## 🧠 Featured Optimizers -- Modular and object-oriented design -- Extensible architecture -- Easy-to-use interface for assembling experiments -- Parallelized LLM requests for improved efficiency -- Integration with langchain for standardized LLM API calls -- Detailed logging and callback system for optimization analysis +| **Name** | **Paper** | **Init prompts** | **Exploration** | **Costs** | **Parallelizable** | **Few-shot** | +| ---- | ---- | ---- |---- |---- | ----|---- | +| `CAPO` | [Zehle et al., 2025](https://arxiv.org/abs/2504.16005) | required | 👍 | 💲 | ✅ | ✅ | +| `EvoPromptDE` | [Guo et al., 2023](https://arxiv.org/abs/2309.08532) | required | 👍 | 💲💲 | ✅ | ❌ | +| `EvoPromptGA` | [Guo et al., 2023](https://arxiv.org/abs/2309.08532) | required | 👍 | 💲💲 | ✅ | ❌ | +| `OPRO` | [Yang et al., 2023](https://arxiv.org/abs/2309.03409) | optional | 👎 | 💲💲 | ❌ | ❌ | -## Changelog -Release notes for each version of the library can be found [here](https://finitearth.github.io/promptolution/release-notes/) -## Contributing +## 🏗 Core Components -The first step to contributing is to open an issue describing the bug, feature, or enhancements. Ensure the issue is clearly described, assigned, and properly tagged. All work should be linked to an open issue. +* **Task** – wraps dataset fields, init prompts, evaluation. +* **Predictor** – runs predictions using your LLM backend. +* **LLM** – unified interface for OpenAI, HuggingFace, vLLM, etc. +* **Optimizer** – plug-and-play implementations of CAPO, GA/DE, OPRO, and your own custom ones. -### Code Style and Linting -We use Black for code formatting, Flake8 for linting, pydocstyle for docstring conventions (Google format), and isort to sort imports. All these checks are enforced via pre-commit hooks, which automatically run on every commit. Install the pre-commit hooks to ensure that all checks run automatically: -``` -pre-commit install -``` +## ⭐ Highlights -To run all checks manually: +* Modular, OOP design → easy customization +* Experiment-ready architecture +* Parallel LLM requests +* LangChain support +* JSONL logging, callbacks, detailed event traces +* Works from laptop to cluster -``` -pre-commit run --all-files -``` -### Branch Protection and Merging Guidelines -- The main branch is protected. No direct commits are allowed for non-administrators. -- Rebase your branch on main before opening a pull request. -- All contributions must be made on dedicated branches linked to specific issues. -- Name the branch according to {prefix}/{description} with one of the prefixes fix, feature, chore, or refactor. -- A pull request must have at least one approval from a code owner before it can be merged into main. -- CI checks must pass before a pull request can be merged. -- New releases will only be created by code owners. +## 📜 Changelog + +[https://finitearth.github.io/promptolution/release-notes/](https://finitearth.github.io/promptolution/release-notes/) + + -### Testing +## 🤝 Contributing -We use pytest to run tests, and coverage to track code coverage. Tests automatically run on pull requests and pushes to the main branch, but please ensure they also pass locally before pushing! -To run the tests with coverage locally, use the following commands or your IDE's test runner: +Open an issue → create a branch → PR → CI → review → merge. +Branch naming: `feature/...`, `fix/...`, `chore/...`, `refactor/...`. 
+ +### Code Style ``` -poetry run python -m coverage run -m pytest +pre-commit install +pre-commit run --all-files ``` -To see the coverage report run: +### Tests + ``` +poetry run python -m coverage run -m pytest poetry run python -m coverage report ``` +Just tell me — happy to tune it further. From a77ca76dfb2e4ac80a917383440085f2f7aafc17 Mon Sep 17 00:00:00 2001 From: Tom Zehle Date: Fri, 28 Nov 2025 10:28:51 +0100 Subject: [PATCH 23/43] Adjust image heights in README Reduced the height of images in the README from 45 to 40. --- README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1fa937a..42a30be 100644 --- a/README.md +++ b/README.md @@ -11,11 +11,11 @@

- - - - - + + + + +

@@ -124,4 +124,3 @@ pre-commit run --all-files poetry run python -m coverage run -m pytest poetry run python -m coverage report ``` -Just tell me — happy to tune it further. From 4eba7daa4d47dcec92ec9d14af53bc2d8022af03 Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 28 Nov 2025 13:01:11 +0100 Subject: [PATCH 24/43] allow for no init prompts in helper functions --- promptolution/helpers.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 0548a42..f50c8ea 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -5,6 +5,7 @@ from promptolution.tasks.judge_tasks import JudgeTask from promptolution.tasks.reward_tasks import RewardTask from promptolution.utils.prompt import Prompt +from promptolution.utils.prompt_creation import create_prompts_from_task_description if TYPE_CHECKING: # pragma: no cover from promptolution.exemplar_selectors.base_exemplar_selector import BaseExemplarSelector @@ -71,6 +72,13 @@ def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[Promp llm = get_llm(config=config) predictor = get_predictor(llm, config=config) + if getattr(config, "prompts") is None: + config.prompts = create_prompts_from_task_description( + task_description=config.task_description, + llm=llm, + n_prompts=config.n_initial_prompts, + ) + if config.optimizer == "capo" and (config.eval_strategy is None or "block" not in config.eval_strategy): logger.warning("📌 CAPO requires block evaluation strategy. Setting it to 'sequential_block'.") config.eval_strategy = "sequential_block" From eeea2701f289f5d8012304f2d4515100d2b2ee52 Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 28 Nov 2025 13:15:42 +0100 Subject: [PATCH 25/43] fix prompt type in helper --- promptolution/helpers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/promptolution/helpers.py b/promptolution/helpers.py index f50c8ea..712c2f6 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -73,11 +73,12 @@ def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[Promp predictor = get_predictor(llm, config=config) if getattr(config, "prompts") is None: - config.prompts = create_prompts_from_task_description( + initial_prompts = create_prompts_from_task_description( task_description=config.task_description, llm=llm, n_prompts=config.n_initial_prompts, ) + config.prompts = [Prompt(p) for p in initial_prompts] if config.optimizer == "capo" and (config.eval_strategy is None or "block" not in config.eval_strategy): logger.warning("📌 CAPO requires block evaluation strategy. 
Setting it to 'sequential_block'.") From e40bf4b982bc863566f68e87bcc2643890c3d7f3 Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 28 Nov 2025 13:19:42 +0100 Subject: [PATCH 26/43] relax none variables --- promptolution/helpers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/promptolution/helpers.py b/promptolution/helpers.py index 712c2f6..ddcb904 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -76,7 +76,6 @@ def run_optimization(df: pd.DataFrame, config: "ExperimentConfig") -> List[Promp initial_prompts = create_prompts_from_task_description( task_description=config.task_description, llm=llm, - n_prompts=config.n_initial_prompts, ) config.prompts = [Prompt(p) for p in initial_prompts] From 352e53d7dc1a16513add2b98c110774905b40b84 Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 28 Nov 2025 13:39:59 +0100 Subject: [PATCH 27/43] fix meta prompt --- promptolution/utils/templates.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/promptolution/utils/templates.py b/promptolution/utils/templates.py index 2bdea0b..9579540 100644 --- a/promptolution/utils/templates.py +++ b/promptolution/utils/templates.py @@ -142,9 +142,9 @@ Task: -Explicitly state the expected format above by repeating its exact character sequence verbatim in every prompt. +Explicitly state the expected format above by repeating its exact character sequence verbatim in every prompt if applicable. -Create overall prompts within quotes as an array. Do not response with anything else. Start the array with [ and end with ]. Separate each prompt by a comma.""" +Create overall prompts within json format, meaning strings inside quotes as an array. Do not response with anything else. Start the array with [ and end with ]. Separate each prompt by a comma, and do not use quotation marks inside the prompts.""" DOWNSTREAM_TEMPLATE = "" From 31dd222c8b0188bdfb834e3e1642b8e5cd4bccb4 Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 28 Nov 2025 13:48:32 +0100 Subject: [PATCH 28/43] relax api timemout constraints --- promptolution/llms/api_llm.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index e3e0ab0..8a58f63 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -26,10 +26,10 @@ def __init__( api_key: Optional[str] = None, max_concurrent_calls: int = 32, max_tokens: int = 512, - call_timeout_s: float = 30.0, # per request - gather_timeout_s: float = 120.0, # whole batch - max_retries: int = 2, - retry_base_delay_s: float = 0.5, + call_timeout_s: float = 200.0, # per request + gather_timeout_s: float = 500.0, # whole batch + max_retries: int = 5, + retry_base_delay_s: float = 1, client_kwargs: Optional[Dict[str, Any]] = None, call_kwargs: Optional[Dict[str, Any]] = None, config: Optional["ExperimentConfig"] = None, @@ -146,12 +146,8 @@ async def _ainvoke_with_retries(self, prompt: str, system_prompt: str) -> str: last_err = e if attempt < self.max_retries: delay = self.retry_base_delay_s * (2**attempt) - logger.warning( - "LLM call failed (%d/%d): %s — retrying in %.2fs", - attempt + 1, - self.max_retries + 1, - e, - delay, + logger.error( + f"LLM call failed ({attempt + 1}/{self.max_retries + 1}): — retrying in {delay}s", exc_info=e ) await asyncio.sleep(delay) assert last_err is not None From 52fc7997114eba9899356352868281182c36e50a Mon Sep 17 00:00:00 2001 From: finitearth Date: Fri, 28 Nov 2025 13:50:42 +0100 Subject: [PATCH 29/43] relax token 
constraints --- promptolution/llms/api_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py index 8a58f63..c6971a6 100644 --- a/promptolution/llms/api_llm.py +++ b/promptolution/llms/api_llm.py @@ -25,7 +25,7 @@ def __init__( model_id: Optional[str] = None, api_key: Optional[str] = None, max_concurrent_calls: int = 32, - max_tokens: int = 512, + max_tokens: int = 4096, call_timeout_s: float = 200.0, # per request gather_timeout_s: float = 500.0, # whole batch max_retries: int = 5, From e73e9dfe66df48d37c8d1c9e7ea9fda57c3893f3 Mon Sep 17 00:00:00 2001 From: Moritz Schlager <87517800+mo374z@users.noreply.github.com> Date: Sat, 29 Nov 2025 23:09:41 +0100 Subject: [PATCH 30/43] Revise README with updated images and details --- README.md | 66 ++++++++++++++++++------------------------------------- 1 file changed, 21 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 42a30be..74533af 100644 --- a/README.md +++ b/README.md @@ -11,31 +11,29 @@

- + - +

- ## 🚀 What is Promptolution? -**Promptolution** is a modular framework for *serious* prompt optimization — built for researchers who want full control over optimizers, datasets, evaluation, and logging. -Unlike end-to-end agent frameworks (DSPy, LangGraph…), Promptolution focuses **exclusively** on the prompt optimization phase, with clean abstractions, transparent internals, and an extensible API. - -It supports: +**Promptolution** is a unified, modular framework for prompt optimization — built for researchers and advanced practitioners who want full control over their experimental setup. Unlike end-to-end application frameworks with high abstraction, promptolution focuses exclusively on the optimization stage, providing a clean, transparent, and extensible API. -* single-task prompt optimization -* large-scale experiments -* local + API-based LLMs -* fast parallelization -* clean logs for reproducible research +promptolution_framework -Developed by **Timo Heiß**, **Moritz Schlager**, and **Tom Zehle** (LMU Munich, MCML, ELLIS, TUM, Uni Freiburg). +Key features include: +* Allowing for single-prompt optimization and large-scale, reproducible benchmark experiments. +* Implementation of many current prompt optimizers out of the box. +* Unified LLM backend supporting API-based models, Local LLMs, and vLLM clusters. +* Built-in response caching to save costs and parallelized inference for speed. +* Detailed logging and token usage tracking for granular post-hoc analysis. +Have a look at our [Release Notes](https://finitearth.github.io/promptolution/release-notes/) for the latest updates to promptolution. ## 📦 Installation @@ -57,8 +55,6 @@ cd promptolution poetry install ``` - - ## 🔧 Quickstart Start with the **Getting Started tutorial**: @@ -68,7 +64,6 @@ Full docs: [https://finitearth.github.io/promptolution/](https://finitearth.github.io/promptolution/) - ## 🧠 Featured Optimizers | **Name** | **Paper** | **Init prompts** | **Exploration** | **Costs** | **Parallelizable** | **Few-shot** | @@ -78,49 +73,30 @@ Full docs: | `EvoPromptGA` | [Guo et al., 2023](https://arxiv.org/abs/2309.08532) | required | 👍 | 💲💲 | ✅ | ❌ | | `OPRO` | [Yang et al., 2023](https://arxiv.org/abs/2309.03409) | optional | 👎 | 💲💲 | ❌ | ❌ | +## 🏗 Components - -## 🏗 Core Components - -* **Task** – wraps dataset fields, init prompts, evaluation. -* **Predictor** – runs predictions using your LLM backend. -* **LLM** – unified interface for OpenAI, HuggingFace, vLLM, etc. -* **Optimizer** – plug-and-play implementations of CAPO, GA/DE, OPRO, and your own custom ones. - - - -## ⭐ Highlights - -* Modular, OOP design → easy customization -* Experiment-ready architecture -* Parallel LLM requests -* LangChain support -* JSONL logging, callbacks, detailed event traces -* Works from laptop to cluster - - - -## 📜 Changelog - -[https://finitearth.github.io/promptolution/release-notes/](https://finitearth.github.io/promptolution/release-notes/) - - +* **`Task`** – Manages the dataset, evaluation metrics, and subsampling. +* **`Predictor`** – Defines how to extract the answer from the model's response. +* **`LLM`** – A unified interface handling inference, token counting, and concurrency. +* **`Optimizer`** – The core component that implements the algorithms that refine prompts. +* **`ExperimentConfig`** – A configuration abstraction to streamline and parametrize large-scale scientific experiments. ## 🤝 Contributing Open an issue → create a branch → PR → CI → review → merge. 
Branch naming: `feature/...`, `fix/...`, `chore/...`, `refactor/...`. -### Code Style +Please ensure to use pre-commit, which assists with keeping the code quality high: ``` pre-commit install pre-commit run --all-files ``` - -### Tests +We encourage every contributor to also write tests, that automatically check if the implementation works as expected: ``` poetry run python -m coverage run -m pytest poetry run python -m coverage report ``` + +Developed by **Timo Heiß**, **Moritz Schlager**, and **Tom Zehle** (LMU Munich, MCML, ELLIS, TUM, Uni Freiburg). From 1ba9a2c2e05adc686f67adcd8ddf257c65ab082e Mon Sep 17 00:00:00 2001 From: Moritz Schlager <87517800+mo374z@users.noreply.github.com> Date: Sat, 29 Nov 2025 23:12:22 +0100 Subject: [PATCH 31/43] Small changes --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 74533af..e1045cc 100644 --- a/README.md +++ b/README.md @@ -21,13 +21,12 @@ ## 🚀 What is Promptolution? -**Promptolution** is a unified, modular framework for prompt optimization — built for researchers and advanced practitioners who want full control over their experimental setup. Unlike end-to-end application frameworks with high abstraction, promptolution focuses exclusively on the optimization stage, providing a clean, transparent, and extensible API. +**Promptolution** is a unified, modular framework for prompt optimization — built for researchers and advanced practitioners who want full control over their experimental setup. Unlike end-to-end application frameworks with high abstraction, promptolution focuses exclusively on the optimization stage, providing a clean, transparent, and extensible API. It allows for simple prompt optimization for one task up to large-scale reproducible benchmark experiments. promptolution_framework -Key features include: +### Key Features -* Allowing for single-prompt optimization and large-scale, reproducible benchmark experiments. * Implementation of many current prompt optimizers out of the box. * Unified LLM backend supporting API-based models, Local LLMs, and vLLM clusters. * Built-in response caching to save costs and parallelized inference for speed. From 3ee0ffe5711285be6c432e9158b63f2bd90c574e Mon Sep 17 00:00:00 2001 From: Moritz Schlager <87517800+mo374z@users.noreply.github.com> Date: Sun, 30 Nov 2025 14:53:02 +0100 Subject: [PATCH 32/43] Update documentation page to match modules --- docs/index.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index c562b8c..5496305 100644 --- a/docs/index.md +++ b/docs/index.md @@ -29,5 +29,6 @@ Or clone our GitHub repository: - [Optimizers](api/optimizers.md) - [Predictors](api/predictors.md) - [Tasks](api/tasks.md) -- [Callbacks](api/callbacks.md) -- [Config](api/config.md) +- [Helpers](api/helpers.md) +- [Utils](api/utils.md) +- [Exemplar Selectors](api/examplar_selectors.md) From 50c358f5491f3fed067ed987e3c5e33adc6a15e2 Mon Sep 17 00:00:00 2001 From: Moritz Schlager <87517800+mo374z@users.noreply.github.com> Date: Sun, 30 Nov 2025 14:57:28 +0100 Subject: [PATCH 33/43] move LMU logo Removed duplicate logo image from footer. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e1045cc..59c8151 100644 --- a/README.md +++ b/README.md @@ -11,10 +11,10 @@

+ -

From d72b5fba9cc337cca416885b32969acf2f862ea2 Mon Sep 17 00:00:00 2001 From: Moritz Schlager <87517800+mo374z@users.noreply.github.com> Date: Sun, 30 Nov 2025 15:17:51 +0100 Subject: [PATCH 34/43] Update Logos --- README.md | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 59c8151..02a3ca8 100644 --- a/README.md +++ b/README.md @@ -8,17 +8,14 @@ ![promptolution](https://github.com/user-attachments/assets/84c050bd-61a1-4f2e-bc4e-874d9b4a69af) - -

- - - - - +lmu_logo +mcml +ellis_logo +uni_freiburg_color +tum_logo

- ## 🚀 What is Promptolution? **Promptolution** is a unified, modular framework for prompt optimization — built for researchers and advanced practitioners who want full control over their experimental setup. Unlike end-to-end application frameworks with high abstraction, promptolution focuses exclusively on the optimization stage, providing a clean, transparent, and extensible API. It allows for simple prompt optimization for one task up to large-scale reproducible benchmark experiments. From 0e53e0da8a864910b8b452d53c8308ddd06d7c9a Mon Sep 17 00:00:00 2001 From: mo374z Date: Sun, 30 Nov 2025 15:51:24 +0100 Subject: [PATCH 35/43] add release notes --- docs/release-notes/v2.2.0.md | 13 +++++++++++++ mkdocs.yml | 1 + 2 files changed, 14 insertions(+) create mode 100644 docs/release-notes/v2.2.0.md diff --git a/docs/release-notes/v2.2.0.md b/docs/release-notes/v2.2.0.md new file mode 100644 index 0000000..8724a41 --- /dev/null +++ b/docs/release-notes/v2.2.0.md @@ -0,0 +1,13 @@ +## Release v2.2.0 +### What's changed + +#### Added features: +* Extended interface of APILLM allowing to pass kwargs to the API +* Improve asynchronous parallelization of LLM calls shortening inference times +* Introduced a `Prompt` class to encapsulate instructions and few-shot examples + +#### Further changes: +* Improved error handling +* Improved task-description infusion mechanism for meta-prompts + +**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/2.1.0...v2.2.0) diff --git a/mkdocs.yml b/mkdocs.yml index 57cde7a..ac377fb 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -47,6 +47,7 @@ nav: - Home: index.md - Release Notes: - Overview: release-notes.md + - v2.2.0: release-notes/v2.2.0.md - v2.1.0: release-notes/v2.1.0.md - v2.0.1: release-notes/v2.0.1.md - v2.0.0: release-notes/v2.0.0.md From 4f7a6c832a0c02b69fa71e5ba90b993a4b94399b Mon Sep 17 00:00:00 2001 From: mo374z Date: Sun, 30 Nov 2025 15:58:01 +0100 Subject: [PATCH 36/43] running pre-commit --- promptolution/tasks/judge_tasks.py | 3 +-- tests/mocks/mock_predictor.py | 2 +- tests/optimizers/test_evoprompt_de.py | 3 --- tests/predictors/test_base_predictor.py | 2 -- tutorials/evoprompt_demo.py | 1 - 5 files changed, 2 insertions(+), 9 deletions(-) diff --git a/promptolution/tasks/judge_tasks.py b/promptolution/tasks/judge_tasks.py index 8f9fbd7..4a685b4 100644 --- a/promptolution/tasks/judge_tasks.py +++ b/promptolution/tasks/judge_tasks.py @@ -1,9 +1,8 @@ """Module for judge tasks.""" -import numpy as np import pandas as pd -from typing import TYPE_CHECKING, List, Literal, Optional, Union +from typing import TYPE_CHECKING, List, Optional from promptolution.llms.base_llm import BaseLLM from promptolution.tasks.base_task import BaseTask diff --git a/tests/mocks/mock_predictor.py b/tests/mocks/mock_predictor.py index 445794a..36f11d0 100644 --- a/tests/mocks/mock_predictor.py +++ b/tests/mocks/mock_predictor.py @@ -2,7 +2,7 @@ import numpy as np -from typing import List, Optional, Tuple +from typing import List, Optional from promptolution.llms.base_llm import BaseLLM from promptolution.predictors.base_predictor import BasePredictor diff --git a/tests/optimizers/test_evoprompt_de.py b/tests/optimizers/test_evoprompt_de.py index 41e4bfd..cdd2a31 100644 --- a/tests/optimizers/test_evoprompt_de.py +++ b/tests/optimizers/test_evoprompt_de.py @@ -1,8 +1,5 @@ from unittest.mock import patch -import numpy as np -import pytest - from promptolution.optimizers import EvoPromptDE from promptolution.utils.prompt import Prompt diff --git 
a/tests/predictors/test_base_predictor.py b/tests/predictors/test_base_predictor.py index 5ba537b..4bfeacd 100644 --- a/tests/predictors/test_base_predictor.py +++ b/tests/predictors/test_base_predictor.py @@ -1,7 +1,5 @@ import numpy as np -from tests.mocks.mock_predictor import MockPredictor - def test_predictor_predict_flow(mock_predictor): """Test the basic prediction flow from prompt to final prediction.""" diff --git a/tutorials/evoprompt_demo.py b/tutorials/evoprompt_demo.py index 1d00369..d15d5b0 100644 --- a/tutorials/evoprompt_demo.py +++ b/tutorials/evoprompt_demo.py @@ -2,7 +2,6 @@ import argparse -import random from logging import Logger from datasets import load_dataset From 50e28992bea0bee4801f52a8168f6be3a3b229dc Mon Sep 17 00:00:00 2001 From: finitearth Date: Sun, 30 Nov 2025 16:18:09 +0100 Subject: [PATCH 37/43] only import base llm if type checking --- promptolution/tasks/judge_tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/promptolution/tasks/judge_tasks.py b/promptolution/tasks/judge_tasks.py index 4a685b4..2570458 100644 --- a/promptolution/tasks/judge_tasks.py +++ b/promptolution/tasks/judge_tasks.py @@ -4,12 +4,12 @@ from typing import TYPE_CHECKING, List, Optional -from promptolution.llms.base_llm import BaseLLM from promptolution.tasks.base_task import BaseTask from promptolution.utils.formatting import extract_from_tag from promptolution.utils.logging import get_logger if TYPE_CHECKING: # pragma: no cover + from promptolution.llms.base_llm import BaseLLM from promptolution.tasks.base_task import EvalStrategy from promptolution.utils.config import ExperimentConfig From 484844afd1e28b20f1e210e5252b34686150c4ca Mon Sep 17 00:00:00 2001 From: finitearth Date: Sun, 30 Nov 2025 17:15:51 +0100 Subject: [PATCH 38/43] fix tests --- promptolution/helpers.py | 4 +++- promptolution/utils/prompt_creation.py | 29 +++++++++++++++++++++----- promptolution/utils/templates.py | 28 +++++++++++++++++++++++++ tests/helpers/test_helpers.py | 3 +++ tests/optimizers/test_capo.py | 2 +- 5 files changed, 59 insertions(+), 7 deletions(-) diff --git a/promptolution/helpers.py b/promptolution/helpers.py index ddcb904..a25c008 100644 --- a/promptolution/helpers.py +++ b/promptolution/helpers.py @@ -122,8 +122,10 @@ def run_evaluation( if isinstance(prompts[0], str): str_prompts = cast(List[str], prompts) prompts = [Prompt(p) for p in str_prompts] + else: + str_prompts = [p.construct_prompt() for p in cast(List[Prompt], prompts)] scores = task.evaluate(prompts, predictor, eval_strategy="full") - df = pd.DataFrame(dict(prompt=prompts, score=scores)) + df = pd.DataFrame(dict(prompt=str_prompts, score=scores)) df = df.sort_values("score", ascending=False, ignore_index=True) return df diff --git a/promptolution/utils/prompt_creation.py b/promptolution/utils/prompt_creation.py index 67879de..fd0087d 100644 --- a/promptolution/utils/prompt_creation.py +++ b/promptolution/utils/prompt_creation.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, List, Optional, Union from promptolution.utils.formatting import extract_from_tag +from promptolution.utils.logging import get_logger if TYPE_CHECKING: # pragma: no cover from promptolution.llms.base_llm import BaseLLM @@ -18,8 +19,11 @@ PROMPT_CREATION_TEMPLATE_FROM_TASK_DESCRIPTION, PROMPT_CREATION_TEMPLATE_TD, PROMPT_VARIATION_TEMPLATE, + default_prompts, ) +logger = get_logger(__name__) + def create_prompt_variation( prompt: Union[List[str], str], llm: "BaseLLM", meta_prompt: Optional[str] = None @@ -128,6 +132,7 @@ def 
create_prompts_from_task_description( llm: "BaseLLM", meta_prompt: Optional[str] = None, n_prompts: int = 10, + n_retries: int = 3, ) -> List[str]: """Generate a set of prompts from a given task description. @@ -137,13 +142,27 @@ def create_prompts_from_task_description( meta_prompt (str): The meta prompt to use for generating the prompts. If None, a default meta prompt is used. n_prompts (int): The number of prompts to generate. + n_retries (int): The number of retries to attempt if prompt generation fails. """ if meta_prompt is None: meta_prompt = PROMPT_CREATION_TEMPLATE_FROM_TASK_DESCRIPTION meta_prompt = meta_prompt.replace("", task_description).replace("", str(n_prompts)) - - prompts_str = llm.get_response(meta_prompt)[0] - prompts = json.loads(prompts_str) - - return prompts + final_prompts = None + for _ in range(n_retries): + prompts_str = llm.get_response(meta_prompt)[0] + try: + prompts = json.loads(prompts_str) + assert isinstance(prompts, list) and all(isinstance(p, str) for p in prompts) and len(prompts) == n_prompts + final_prompts = prompts + break + except (json.JSONDecodeError, AssertionError): + logger.warning("Failed to parse prompts JSON, retrying...") + + if final_prompts is None: + logger.error( + f"Failed to generate prompts from task description after {n_retries} retries, returning default prompts." + ) + final_prompts = default_prompts[:n_prompts] + + return final_prompts diff --git a/promptolution/utils/templates.py b/promptolution/utils/templates.py index 9579540..2aeb45e 100644 --- a/promptolution/utils/templates.py +++ b/promptolution/utils/templates.py @@ -174,3 +174,31 @@ Return the new prompt in the following format: new prompt""" + + +default_prompts = [ + "Give me your response within tags.", + "Please provide a thoughtful answer to my question and wrap your response in tags so I can easily identify it.", + "I need your expertise on this matter. Kindly structure your response within tags for better readability.", + "Analyze the following and present your findings enclosed in tags.", + "Consider this inquiry carefully. Your comprehensive response should be formatted within tags to facilitate extraction.", + "Respond succinctly. Ensure all content appears between and markers.", + "Would you mind addressing this request? Please place your entire response inside formatting.", + "I'm seeking your insights on a particular topic. Kindly ensure that your complete analysis is contained within tags for my convenience.", + "Examine this query thoroughly and deliver your conclusions. All output must be encapsulated in notation for processing purposes.", + "Help me understand this subject better. Your explanation should begin with and conclude with to maintain proper structure.", + "I require information on the following. Please format your response with tags at the beginning and end for clarity.", + "Contemplate this scenario and offer your perspective. Remember to enclose all content within tags as per requirements.", + "Elaborate on this concept, making sure to wrap the entirety of your explanation in markers for systematic review.", + "Describe your approach to this situation. Be thorough yet concise, and place your complete response between and tags.", + "Share your knowledge on this matter. Your entire response should be presented within tags to facilitate proper integration into my workflow.", + "Let's think step by step. 
Your answer should be enclosed within tags.", + "Provide a detailed response to the following question, ensuring that all information is contained within tags for easy extraction.", + "Kindly address the following topic, formatting your entire response between and markers for clarity and organization.", + "Offer your insights on this issue, making sure to encapsulate your full response within tags for seamless processing.", + "Delve into this subject and present your findings, ensuring that all content is wrapped in notation for systematic analysis.", + "Illuminate this topic with your expertise, formatting your complete explanation within tags for straightforward comprehension.", + "Provide your perspective on this matter, ensuring that your entire response is contained within tags for efficient review.", + "Analyze the following scenario and deliver your conclusions, making sure to enclose all output in markers for clarity.", + "Help me grasp this concept better by structuring your explanation between and tags for proper formatting.", +] diff --git a/tests/helpers/test_helpers.py b/tests/helpers/test_helpers.py index 59328b1..d39ec38 100644 --- a/tests/helpers/test_helpers.py +++ b/tests/helpers/test_helpers.py @@ -197,6 +197,8 @@ def test_run_evaluation(mock_get_task, mock_get_predictor, mock_get_llm, sample_ "Is this text positive, negative, or neutral?", ] + prompts = [Prompt(p) for p in prompts] + # Now this will work because mock_task is a MagicMock mock_task.evaluate.return_value = np.array([0.8, 0.7, 0.9]) @@ -298,6 +300,7 @@ def test_helpers_integration(sample_df, experiment_config): # Verify results assert isinstance(result, pd.DataFrame) assert len(result) == 2 + print([p in result["prompt"].values for p in optimized_prompts_str]) assert all(p in result["prompt"].values for p in optimized_prompts_str) # Verify optimization was called diff --git a/tests/optimizers/test_capo.py b/tests/optimizers/test_capo.py index 05fb6c9..305f290 100644 --- a/tests/optimizers/test_capo.py +++ b/tests/optimizers/test_capo.py @@ -209,7 +209,7 @@ def test_capo_crossover_prompt(mock_meta_llm, mock_predictor, initial_prompts, m .replace("", full_task_desc) ) - assert mock_meta_llm.call_history[0]["prompts"][0] == expected_meta_prompt + assert str(mock_meta_llm.call_history[0]["prompts"][0]) == expected_meta_prompt def test_capo_mutate_prompt(mock_meta_llm, mock_predictor, initial_prompts, mock_task, mock_df): From 1b6ca5b2f4a96b1d6a8f7d547925acc7afa1d2d4 Mon Sep 17 00:00:00 2001 From: Tom Zehle Date: Sun, 30 Nov 2025 17:40:41 +0100 Subject: [PATCH 39/43] Remove comment --- tutorials/api_llm_demo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tutorials/api_llm_demo.py b/tutorials/api_llm_demo.py index 728e288..8f7dcbd 100644 --- a/tutorials/api_llm_demo.py +++ b/tutorials/api_llm_demo.py @@ -14,7 +14,6 @@ logger = Logger(__name__) -"""Run a test run for any of the implemented optimizers.""" parser = argparse.ArgumentParser() parser.add_argument("--base-url", default="https://api.openai.com/v1") parser.add_argument("--model", default="gpt-4o-2024-08-06") From 4eff0db658e31b8237dcafe1f0adbf92cdc0c33c Mon Sep 17 00:00:00 2001 From: finitearth Date: Sun, 30 Nov 2025 17:45:20 +0100 Subject: [PATCH 40/43] add create_prompts_from_task_description to imports --- promptolution/utils/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/promptolution/utils/__init__.py b/promptolution/utils/__init__.py index ff81f03..1163f63 100644 --- a/promptolution/utils/__init__.py +++ 
b/promptolution/utils/__init__.py @@ -10,7 +10,11 @@ from promptolution.utils.config import ExperimentConfig from promptolution.utils.logging import get_logger, setup_logging from promptolution.utils.prompt import Prompt, sort_prompts_by_scores -from promptolution.utils.prompt_creation import create_prompt_variation, create_prompts_from_samples +from promptolution.utils.prompt_creation import ( + create_prompt_variation, + create_prompts_from_samples, + create_prompts_from_task_description, +) from promptolution.utils.templates import ( CAPO_CROSSOVER_TEMPLATE, CAPO_FEWSHOT_TEMPLATE, @@ -43,6 +47,7 @@ "sort_prompts_by_scores", "create_prompt_variation", "create_prompts_from_samples", + "create_prompts_from_task_description", "CAPO_CROSSOVER_TEMPLATE", "CAPO_FEWSHOT_TEMPLATE", "CAPO_MUTATION_TEMPLATE", From 7c5f4ff97fd2f25780f7a95d030a2da15497bfc2 Mon Sep 17 00:00:00 2001 From: Moritz Schlager <87517800+mo374z@users.noreply.github.com> Date: Sun, 30 Nov 2025 17:56:25 +0100 Subject: [PATCH 41/43] fix formatting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 02a3ca8..8f33e84 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ ## 🚀 What is Promptolution? -**Promptolution** is a unified, modular framework for prompt optimization — built for researchers and advanced practitioners who want full control over their experimental setup. Unlike end-to-end application frameworks with high abstraction, promptolution focuses exclusively on the optimization stage, providing a clean, transparent, and extensible API. It allows for simple prompt optimization for one task up to large-scale reproducible benchmark experiments. +**Promptolution** is a unified, modular framework for prompt optimization built for researchers and advanced practitioners who want full control over their experimental setup. Unlike end-to-end application frameworks with high abstraction, promptolution focuses exclusively on the optimization stage, providing a clean, transparent, and extensible API. It allows for simple prompt optimization for one task up to large-scale reproducible benchmark experiments. promptolution_framework From efa27f36adbe6eb6001a0330a05f07c993e32470 Mon Sep 17 00:00:00 2001 From: finitearth Date: Sun, 30 Nov 2025 18:17:59 +0100 Subject: [PATCH 42/43] allow capo to overwrite eval strategy --- promptolution/optimizers/capo.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/promptolution/optimizers/capo.py b/promptolution/optimizers/capo.py index 4cdd8fd..3c5955a 100644 --- a/promptolution/optimizers/capo.py +++ b/promptolution/optimizers/capo.py @@ -106,6 +106,11 @@ def __init__( f" Setting max_n_blocks_eval to {self.task.n_blocks}." ) self.max_n_blocks_eval = self.task.n_blocks + if "block" not in self.task.eval_strategy: + logger.warning( + f"ℹ️ CAPO requires 'block' in the eval_strategy, but got {self.task.eval_strategy}. Setting eval_strategy to 'sequential_block'." 
+            )
+            self.task.eval_strategy = "sequential_block"
 
         self.population_size = len(self.prompts)
         if hasattr(self.predictor, "begin_marker") and hasattr(self.predictor, "end_marker"):

From 6dcf75d011da1643b10a4fca Mon Sep 17 00:00:00 2001
From: finitearth <tomzehle@gmail.com>
Date: Sun, 30 Nov 2025 18:27:52 +0100
Subject: [PATCH 43/43] clarify prompt format

---
 promptolution/utils/templates.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/promptolution/utils/templates.py b/promptolution/utils/templates.py
index 2aeb45e..60d55d4 100644
--- a/promptolution/utils/templates.py
+++ b/promptolution/utils/templates.py
@@ -144,7 +144,7 @@
 
 Explicitly state the expected format above by repeating its exact character sequence verbatim in every prompt if applicable.
 
-Create overall prompts within json format, meaning strings inside quotes as an array. Do not response with anything else. Start the array with [ and end with ]. Separate each prompt by a comma, and do not use quotation marks inside the prompts."""
+Return the prompts in JSON format, i.e. as an array of strings enclosed in quotation marks (""). Do not respond with anything else. Start the array with [ and end with ], separate the prompts with commas, and do not use quotation marks inside the prompts."""
 
 DOWNSTREAM_TEMPLATE = ""
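For the meta-prompt wording above, a well-formed meta-LLM response is a single JSON array of prompt strings. A minimal sketch (with an invented response text) of such a response and of the validation that the retry loop in `create_prompts_from_task_description` (PATCH 38) applies before accepting it:

```python
import json

n_prompts = 2

# Invented example of a response that follows the revised template: a single JSON
# array of prompt strings, starting with [ and ending with ], with no quotation
# marks inside the individual prompts.
raw_response = (
    '["Summarize the article and place your answer between the required tags.", '
    '"Read the article carefully and return only the requested label."]'
)

# Same checks as the retry loop: valid JSON, a list of strings, expected length.
prompts = json.loads(raw_response)
assert isinstance(prompts, list)
assert all(isinstance(p, str) for p in prompts)
assert len(prompts) == n_prompts
print(prompts)
```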
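The `Prompt` class from the v2.2.0 release notes is what `run_optimization` and the updated `run_evaluation` in `helpers.py` operate on. A minimal usage sketch based only on what the diffs above show (construction from an instruction string, `construct_prompt()` for the rendered text); the instruction strings are invented:

```python
from promptolution.utils import Prompt

# Wrap plain instruction strings, as helpers.run_optimization/run_evaluation do.
instructions = [
    "Classify the sentiment of the text.",
    "Read the input carefully and assign the correct label.",
]
prompts = [Prompt(p) for p in instructions]

# run_evaluation reports the fully constructed prompt text (the instruction plus
# any attached few-shot examples), not the Prompt object itself.
for prompt in prompts:
    print(prompt.construct_prompt())
```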
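For the relaxed APILLM settings in PATCH 28, `_ainvoke_with_retries` backs off exponentially with `delay = retry_base_delay_s * 2**attempt`. A small standalone sketch of the schedule produced by the new defaults (`max_retries=5`, `retry_base_delay_s=1`); the loop only prints the delays and is not library code:

```python
max_retries = 5
retry_base_delay_s = 1.0

# Exponential backoff between failed attempts: base * 2**attempt.
for attempt in range(max_retries):
    delay = retry_base_delay_s * (2**attempt)
    print(f"attempt {attempt + 1} failed -> retrying in {delay:.1f}s")
# -> 1.0s, 2.0s, 4.0s, 8.0s, 16.0s before the call finally gives up
```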