Skip to content

Commit 1d26236

Browse files
committed
Merge branch 'main' into feat-mars
2 parents 0ed0b40 + 6d09721 commit 1d26236

File tree

10 files changed

+678
-162
lines changed

10 files changed

+678
-162
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,12 +490,16 @@ optillm supports various command-line arguments for configuration. When using Do
490490
| `--cepo_planning_m` | Number of attempts to generate n plans in planning stage | 6 |
491491
| `--cepo_planning_temperature_step1` | Temperature for generator in step 1 of planning stage | 0.55 |
492492
| `--cepo_planning_temperature_step2` | Temperature for generator in step 2 of planning stage | 0.25 |
493+
| `--cepo_planning_temperature_direct_resp` | Temperature for the generator after step 2 if planning fails and the model answers directly | 0.1 |
493494
| `--cepo_planning_temperature_step3` | Temperature for generator in step 3 of planning stage | 0.1 |
494495
| `--cepo_planning_temperature_step4` | Temperature for generator in step 4 of planning stage | 0 |
495496
| `--cepo_planning_max_tokens_step1` | Maximum number of tokens in step 1 of planning stage | 4096 |
496497
| `--cepo_planning_max_tokens_step2` | Maximum number of tokens in step 2 of planning stage | 4096 |
498+
| `--cepo_planning_max_tokens_direct_resp` | Maximum number of tokens after step 2 if planning fails and the model answers directly | 4096 |
497499
| `--cepo_planning_max_tokens_step3` | Maximum number of tokens in step 3 of planning stage | 4096 |
498500
| `--cepo_planning_max_tokens_step4` | Maximum number of tokens in step 4 of planning stage | 4096 |
501+
| `--cepo_use_reasoning_fallback` | Whether to fall back to lower levels of reasoning when a higher level fails | False |
502+
| `--cepo_num_of_retries` | Number of retries if the LLM call fails; 0 for no retries | 0 |
499503
| `--cepo_print_output` | Whether to print the output of each stage | `False` |
500504
| `--cepo_config_file` | Path to CePO configuration file | `None` |
501505
| `--cepo_use_plan_diversity` | Use additional plan diversity step | `False` |
@@ -584,6 +588,19 @@ Authorization: Bearer your_secret_api_key
584588

585589
¹ Numbers in parentheses for LongCePO indicate accuracy of majority voting from 5 runs.
586590

591+
### CePO on math and code benchmarks (Sep 2025)
592+
593+
| Method | AIME 2024 | AIME 2025 | GPQA | LiveCodeBench |
594+
| ----------------------: | :-------: | :-------: | :----: | :-----------: |
595+
| Qwen3 8B | 74.0 | 68.3 | 59.3 | 55.7 |
596+
| CePO (using Qwen3 8B) | 86.7 | 80.0 | 62.5 | 60.5 |
597+
| Qwen3 32B | 81.4 | 72.9 | 66.8 | 65.7 |
598+
| CePO (using Qwen3 32B) | **90.7** | **83.3** | 70.0 | **71.9** |
599+
| Qwen3 235B | 85.7 | 81.5 | 71.1 | 70.7 |
600+
| DeepSeek R1 | 79.8 | 70.0 | 71.5 | 64.3 |
601+
| OpenAI o3-mini | 79.6 | 74.8 | 76.8 | 66.3 |
602+
| Grok3 Think | 83.9 | 77.3 |**80.2**| 70.6 |
603+
587604
### CePO on math and code benchmarks (Mar 2025)
588605

589606
| Method | Math-L5 | MMLU-Pro (Math) | CRUX | LiveCodeBench (pass@1) | Simple QA |

optillm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Version information
2-
__version__ = "0.2.8"
2+
__version__ = "0.2.9"
33

44
# Import from server module
55
from .server import (

optillm/cepo/README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,19 @@ The model reviews all generated solution proposals and their associated plans, i
2323
**Step 4**: Final Solution
2424
The model uses the refined plan from Step 3 to produce the final answer.
2525

26+
## Example Usage
27+
28+
Here’s an example of running optillm using the CePO method for Qwen3 deployed with vLLM on port 8001:
29+
30+
```bash
31+
OPENAI_API_KEY=serving-on-vllm \
32+
python optillm.py \
33+
--base-url http://localhost:8001/v1 \
34+
--approach cepo \
35+
--port 8000 \
36+
--cepo_config_file ./optillm/cepo/cepo_configs/cepo_qwen3.yaml
37+
```
38+
2639
## CePO Current Status
2740

2841
This project is a work in progress, and the provided code is in an early experimental stage. While the proposed approach works well across the benchmarks we tested, further improvements can be achieved by task-specific customizations to prompts.

optillm/cepo/cepo.py

Lines changed: 591 additions & 155 deletions
Large diffs are not rendered by default.
Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,21 @@
11
bestofn_n: 3
22
bestofn_temperature: 0.1
33
bestofn_max_tokens: 4096
4-
bestofn_rating_type: "absolute" # or "pairwise"
4+
bestofn_rating_type: "absolute" # or "pairwise", "majority"
55
planning_n: 3
66
planning_m: 6
77
planning_temperature_step1: 0.55
88
planning_temperature_step2: 0.25
9+
planning_temperature_direct_resp: 0.1
910
planning_temperature_step3: 0.1
1011
planning_temperature_step4: 0
1112
planning_max_tokens_step1: 4096
1213
planning_max_tokens_step2: 4096
14+
planning_max_tokens_direct_resp: 4096
1315
planning_max_tokens_step3: 4096
1416
planning_max_tokens_step4: 4096
1517
use_plan_diversity: False
1618
rating_model: null
19+
use_reasoning_fallback: False
20+
num_of_retries: 0
1721
print_output: False
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
bestofn_n: 3
2+
bestofn_temperature: 0.6
3+
bestofn_max_tokens: 40960
4+
bestofn_rating_type: "absolute"
5+
planning_n: 2
6+
planning_m: 4
7+
planning_temperature_step1: 1.0
8+
planning_temperature_step2: 1.0
9+
planning_temperature_direct_resp: 0.6
10+
planning_temperature_step3: 1.0
11+
planning_temperature_step4: 0.5
12+
planning_max_tokens_step1: 40960
13+
planning_max_tokens_step2: 40960
14+
planning_max_tokens_direct_resp: 32768
15+
planning_max_tokens_step3: 40960
16+
planning_max_tokens_step4: 40960
17+
use_plan_diversity: False
18+
rating_model: null
19+
use_reasoning_fallback: True
20+
num_of_retries: 2
21+
print_output: true
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
bestofn_n: 3
2+
bestofn_temperature: 0.6
3+
bestofn_max_tokens: 20480
4+
bestofn_rating_type: "majority"
5+
planning_n: 2
6+
planning_m: 4
7+
planning_temperature_step1: 0.8
8+
planning_temperature_step2: 0.8
9+
planning_temperature_direct_resp: 0.6
10+
planning_temperature_step3: 0.8
11+
planning_temperature_step4: 0.8
12+
planning_max_tokens_step1: 28672
13+
planning_max_tokens_step2: 24576
14+
planning_max_tokens_direct_resp: 32768
15+
planning_max_tokens_step3: 20481
16+
planning_max_tokens_step4: 20482
17+
use_plan_diversity: False
18+
rating_model: null
19+
use_reasoning_fallback: False
20+
num_of_retries: 0
21+
print_output: False

optillm/inference.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1029,7 +1029,7 @@ def _load_model():
10291029
logger.info(f"Using device: {device}")
10301030

10311031
# Load tokenizer
1032-
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
1032+
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, token=os.getenv("HF_TOKEN"))
10331033

10341034
# Base kwargs for model loading
10351035
model_kwargs = {
@@ -1076,6 +1076,7 @@ def _load_model():
10761076
try:
10771077
model = AutoModelForCausalLM.from_pretrained(
10781078
model_id,
1079+
token=os.getenv("HF_TOKEN"),
10791080
**model_kwargs
10801081
)
10811082
except Exception as e:
@@ -1085,6 +1086,7 @@ def _load_model():
10851086
model_kwargs.pop("attn_implementation")
10861087
model = AutoModelForCausalLM.from_pretrained(
10871088
model_id,
1089+
token=os.getenv("HF_TOKEN"),
10881090
**model_kwargs
10891091
)
10901092
elif model_kwargs["torch_dtype"] == torch.float16:
@@ -1094,6 +1096,7 @@ def _load_model():
10941096
model_kwargs["torch_dtype"] = torch.float32
10951097
model = AutoModelForCausalLM.from_pretrained(
10961098
model_id,
1099+
token=os.getenv("HF_TOKEN"),
10971100
**model_kwargs
10981101
)
10991102

@@ -1134,7 +1137,7 @@ def validate_adapter(self, adapter_id: str) -> bool:
11341137
config = PeftConfig.from_pretrained(
11351138
adapter_id,
11361139
trust_remote_code=True,
1137-
use_auth_token=os.getenv("HF_TOKEN")
1140+
token=os.getenv("HF_TOKEN")
11381141
)
11391142
return True
11401143
except Exception as e:
@@ -1159,7 +1162,7 @@ def _load_adapter():
11591162
config = PeftConfig.from_pretrained(
11601163
adapter_id,
11611164
trust_remote_code=True,
1162-
use_auth_token=os.getenv("HF_TOKEN")
1165+
token=os.getenv("HF_TOKEN")
11631166
)
11641167

11651168
model = base_model

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "optillm"
7-
version = "0.2.8"
7+
version = "0.2.9"
88
description = "An optimizing inference proxy for LLMs."
99
readme = "README.md"
1010
license = "Apache-2.0"

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,5 @@ adaptive-classifier
3333
datasets
3434
mcp
3535
# MLX support for Apple Silicon optimization
36-
mlx-lm>=0.24.0; platform_machine=="arm64" and sys_platform=="darwin"
36+
mlx-lm>=0.24.0; platform_machine=="arm64" and sys_platform=="darwin"
37+
math_verify

0 commit comments

Comments
 (0)