Skip to content

Commit 1d26236

Browse files
committed
Merge branch 'main' into feat-mars
2 parents 0ed0b40 + 6d09721 commit 1d26236

File tree

10 files changed

+678
-162
lines changed

10 files changed

+678
-162
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,12 +490,16 @@ optillm supports various command-line arguments for configuration. When using Do
490490
| `--cepo_planning_m` | Number of attempts to generate n plans in planning stage | 6 |
491491
| `--cepo_planning_temperature_step1` | Temperature for generator in step 1 of planning stage | 0.55 |
492492
| `--cepo_planning_temperature_step2` | Temperature for generator in step 2 of planning stage | 0.25 |
493+
| `--cepo_planning_temperature_direct_resp` | Temperature for the generator after step 2 if planning fails and the model answers directly | 0.1 |
493494
| `--cepo_planning_temperature_step3` | Temperature for generator in step 3 of planning stage | 0.1 |
494495
| `--cepo_planning_temperature_step4` | Temperature for generator in step 4 of planning stage | 0 |
495496
| `--cepo_planning_max_tokens_step1` | Maximum number of tokens in step 1 of planning stage | 4096 |
496497
| `--cepo_planning_max_tokens_step2` | Maximum number of tokens in step 2 of planning stage | 4096 |
498+
| `--cepo_planning_max_tokens_direct_resp` | Maximum number of tokens after step 2 if planning fails and the model answers directly | 4096 |
497499
| `--cepo_planning_max_tokens_step3` | Maximum number of tokens in step 3 of planning stage | 4096 |
498500
| `--cepo_planning_max_tokens_step4` | Maximum number of tokens in step 4 of planning stage | 4096 |
501+
| `--cepo_use_reasoning_fallback` | Whether to fall back to lower levels of reasoning when a higher level fails | False |
502+
| `--cepo_num_of_retries` | Number of retries if the LLM call fails; 0 for no retries | 0 |
499503
| `--cepo_print_output` | Whether to print the output of each stage | `False` |
500504
| `--cepo_config_file` | Path to CePO configuration file | `None` |
501505
| `--cepo_use_plan_diversity` | Use additional plan diversity step | `False` |
@@ -584,6 +588,19 @@ Authorization: Bearer your_secret_api_key
584588

585589
¹ Numbers in parentheses for LongCePO indicate accuracy of majority voting from 5 runs.
586590

591+
### CePO on math and code benchmarks (Sep 2025)
592+
593+
| Method | AIME 2024 | AIME 2025 | GPQA | LiveCodeBench |
594+
| ----------------------: | :-------: | :-------: | :----: | :-----------: |
595+
| Qwen3 8B | 74.0 | 68.3 | 59.3 | 55.7 |
596+
| CePO (using Qwen3 8B) | 86.7 | 80.0 | 62.5 | 60.5 |
597+
| Qwen3 32B | 81.4 | 72.9 | 66.8 | 65.7 |
598+
| CePO (using Qwen3 32B) | **90.7** | **83.3** | 70.0 | **71.9** |
599+
| Qwen3 235B | 85.7 | 81.5 | 71.1 | 70.7 |
600+
| DeepSeek R1 | 79.8 | 70.0 | 71.5 | 64.3 |
601+
| OpenAI o3-mini | 79.6 | 74.8 | 76.8 | 66.3 |
602+
| Grok3 Think | 83.9 | 77.3 |**80.2**| 70.6 |
603+
587604
### CePO on math and code benchmarks (Mar 2025)
588605

589606
| Method | Math-L5 | MMLU-Pro (Math) | CRUX | LiveCodeBench (pass@1) | Simple QA |

optillm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Version information
2-
__version__ = "0.2.8"
2+
__version__ = "0.2.9"
33

44
# Import from server module
55
from .server import (

optillm/cepo/README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,19 @@ The model reviews all generated solution proposals and their associated plans, i
2323
**Step 4**: Final Solution
2424
The model uses the refined plan from Step 3 to produce the final answer.
2525

26+
## Example Usage
27+
28+
Here’s an example of running optillm using the CePO method for Qwen3 deployed with vLLM on port 8001:
29+
30+
```bash
31+
OPENAI_API_KEY=serving-on-vllm \
32+
python optillm.py \
33+
--base-url http://localhost:8001/v1 \
34+
--approach cepo \
35+
--port 8000 \
36+
--cepo_config_file ./optillm/cepo/cepo_configs/cepo_qwen3.yaml
37+
```
38+
2639
## CePO Current Status
2740

2841
This project is a work in progress, and the provided code is in an early experimental stage. While the proposed approach works well across the benchmarks we tested, further improvements can be achieved by task-specific customizations to prompts.

optillm/cepo/cepo.py

Lines changed: 591 additions & 155 deletions
Large diffs are not rendered by default.
Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,21 @@
11
bestofn_n: 3
22
bestofn_temperature: 0.1
33
bestofn_max_tokens: 4096
4-
bestofn_rating_type: "absolute" # or "pairwise"
4+
bestofn_rating_type: "absolute" # or "pairwise", "majority"
55
planning_n: 3
66
planning_m: 6
77
planning_temperature_step1: 0.55
88
planning_temperature_step2: 0.25
9+
planning_temperature_direct_resp: 0.1
910
planning_temperature_step3: 0.1
1011
planning_temperature_step4: 0
1112
planning_max_tokens_step1: 4096
1213
planning_max_tokens_step2: 4096
14+
planning_max_tokens_direct_resp: 4096
1315
planning_max_tokens_step3: 4096
1416
planning_max_tokens_step4: 4096
1517
use_plan_diversity: False
1618
rating_model: null
19+
use_reasoning_fallback: False
20+
num_of_retries: 0
1721
print_output: False
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
bestofn_n: 3
2+
bestofn_temperature: 0.6
3+
bestofn_max_tokens: 40960
4+
bestofn_rating_type: "absolute"
5+
planning_n: 2
6+
planning_m: 4
7+
planning_temperature_step1: 1.0
8+
planning_temperature_step2: 1.0
9+
planning_temperature_direct_resp: 0.6
10+
planning_temperature_step3: 1.0
11+
planning_temperature_step4: 0.5
12+
planning_max_tokens_step1: 40960
13+
planning_max_tokens_step2: 40960
14+
planning_max_tokens_direct_resp: 32768
15+
planning_max_tokens_step3: 40960
16+
planning_max_tokens_step4: 40960
17+
use_plan_diversity: False
18+
rating_model: null
19+
use_reasoning_fallback: True
20+
num_of_retries: 2
21+
print_output: true
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
bestofn_n: 3
2+
bestofn_temperature: 0.6
3+
bestofn_max_tokens: 20480
4+
bestofn_rating_type: "majority"
5+
planning_n: 2
6+
planning_m: 4
7+
planning_temperature_step1: 0.8
8+
planning_temperature_step2: 0.8
9+
planning_temperature_direct_resp: 0.6
10+
planning_temperature_step3: 0.8
11+
planning_temperature_step4: 0.8
12+
planning_max_tokens_step1: 28672
13+
planning_max_tokens_step2: 24576
14+
planning_max_tokens_direct_resp: 32768
15+
planning_max_tokens_step3: 20481
16+
planning_max_tokens_step4: 20482
17+
use_plan_diversity: False
18+
rating_model: null
19+
use_reasoning_fallback: False
20+
num_of_retries: 0
21+
print_output: False

optillm/inference.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1029,7 +1029,7 @@ def _load_model():
10291029
logger.info(f"Using device: {device}")
10301030

10311031
# Load tokenizer
1032-
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
1032+
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, token=os.getenv("HF_TOKEN"))
10331033

10341034
# Base kwargs for model loading
10351035
model_kwargs = {
@@ -1076,6 +1076,7 @@ def _load_model():
10761076
try:
10771077
model = AutoModelForCausalLM.from_pretrained(
10781078
model_id,
1079+
token=os.getenv("HF_TOKEN"),
10791080
**model_kwargs
10801081
)
10811082
except Exception as e:
@@ -1085,6 +1086,7 @@ def _load_model():
10851086
model_kwargs.pop("attn_implementation")
10861087
model = AutoModelForCausalLM.from_pretrained(
10871088
model_id,
1089+
token=os.getenv("HF_TOKEN"),
10881090
**model_kwargs
10891091
)
10901092
elif model_kwargs["torch_dtype"] == torch.float16:
@@ -1094,6 +1096,7 @@ def _load_model():
10941096
model_kwargs["torch_dtype"] = torch.float32
10951097
model = AutoModelForCausalLM.from_pretrained(
10961098
model_id,
1099+
token=os.getenv("HF_TOKEN"),
10971100
**model_kwargs
10981101
)
10991102

@@ -1134,7 +1137,7 @@ def validate_adapter(self, adapter_id: str) -> bool:
11341137
config = PeftConfig.from_pretrained(
11351138
adapter_id,
11361139
trust_remote_code=True,
1137-
use_auth_token=os.getenv("HF_TOKEN")
1140+
token=os.getenv("HF_TOKEN")
11381141
)
11391142
return True
11401143
except Exception as e:
@@ -1159,7 +1162,7 @@ def _load_adapter():
11591162
config = PeftConfig.from_pretrained(
11601163
adapter_id,
11611164
trust_remote_code=True,
1162-
use_auth_token=os.getenv("HF_TOKEN")
1165+
token=os.getenv("HF_TOKEN")
11631166
)
11641167

11651168
model = base_model

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "optillm"
7-
version = "0.2.8"
7+
version = "0.2.9"
88
description = "An optimizing inference proxy for LLMs."
99
readme = "README.md"
1010
license = "Apache-2.0"

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,5 @@ adaptive-classifier
3333
datasets
3434
mcp
3535
# MLX support for Apple Silicon optimization
36-
mlx-lm>=0.24.0; platform_machine=="arm64" and sys_platform=="darwin"
36+
mlx-lm>=0.24.0; platform_machine=="arm64" and sys_platform=="darwin"
37+
math_verify

0 commit comments

Comments
 (0)