Closed

Changes from all commits (23 commits):
4aa6ff4  Add revived e2e tests and updates (yossiovadia, Sep 17, 2025)
e5d3220  fix: increase e2e test timeouts and update config health check (yossiovadia, Sep 18, 2025)
c9d714d  fix: update remaining e2e test timeouts from 10s to 30s (yossiovadia, Sep 18, 2025)
83cffca  feat: harden jailbreak tests - remove 503 acceptance and add actual j… (yossiovadia, Sep 18, 2025)
fefd96b  feat: add auto routing intelligence test to validate actual model sel… (yossiovadia, Sep 18, 2025)
304a84b  feat: harden all remaining e2e tests - remove 503 acceptance and add … (yossiovadia, Sep 18, 2025)
ab2017d  cleanup: simplify status report and remove redundant 503 comments (yossiovadia, Sep 18, 2025)
ae18b6a  fix: remove DAN jargon from status report (yossiovadia, Sep 18, 2025)
f406747  Delete CLAUDE.md (yossiovadia, Sep 18, 2025)
d36d323  feat: add DCO signoffs to all test files and status report (yossiovadia, Sep 19, 2025)
08a8041  feat: add DCO signoffs to remaining test files (yossiovadia, Sep 19, 2025)
e3ac473  metrics: Add request-level token histograms (#157) (tao12345666333, Sep 18, 2025)
ea14306  docs: add repo URL in docker/README.md (#163) (cryo-zd, Sep 18, 2025)
9a43f53  remove discarded fields from documents (#165) (lengrongfu, Sep 18, 2025)
a329bb5  Correct tools directory copy command in Dockerfile (#171) (yuluo-yx, Sep 18, 2025)
fd7c267  feat: add basic cache eviction policy: LRU/LFU/FIFO (#166) (aeft, Sep 18, 2025)
5a0a957  chore: add just max token for different models in router bench (rootfs, Sep 15, 2025)
1d37497  docs: Model Performance Evaluation Guide (#136) (JaredforReal, Sep 19, 2025)
9815862  api: add semantic route support (#147) (Xunzhuo, Sep 19, 2025)
a6affe4  infra: update Dockerfile.extproc (#169) (yuluo-yx, Sep 19, 2025)
6569c5a  feat: add more content for contribution docs (#175) (yuluo-yx, Sep 19, 2025)
afd9cb8  fix: avoid double counting cache hits (#177) (cryo-zd, Sep 19, 2025)
2ab2523  refactor: remove failing tests for separate PRs (yossiovadia, Sep 22, 2025)
2 changes: 1 addition & 1 deletion .github/workflows/docker-publish.yml
@@ -48,4 +48,4 @@ jobs:
push: ${{ github.event_name != 'pull_request' }} # Only push on merge to main, not on PRs
tags: |
${{ inputs.is_nightly == true && format('ghcr.io/{0}/semantic-router/extproc:nightly-{1}', github.repository_owner, steps.date.outputs.date_tag) || format('ghcr.io/{0}/semantic-router/extproc:{1}', github.repository_owner, github.sha) }}
${{ inputs.is_nightly != true && format('ghcr.io/{0}/semantic-router/extproc:latest', github.repository_owner) || '' }}
${{ inputs.is_nightly != true && format('ghcr.io/{0}/semantic-router/extproc:latest', github.repository_owner) || '' }}
26 changes: 4 additions & 22 deletions .github/workflows/pre-commit.yml
@@ -45,7 +45,7 @@ jobs:
sudo apt-get install -y \
make \
build-essential \
pkg-config
pkg-config
npm install -g markdownlint-cli
pip install --user yamllint codespell

@@ -81,31 +81,13 @@ jobs:
key: ${{ runner.os }}-pre-commit-${{ hashFiles('.pre-commit-config.yaml') }}

- name: Install pre-commit
run: pip install pre-commit
run: make precommit-install

- name: Run Code Spell Check
run: make codespell

- name: Run pre-commit on Go, Rust, JavaScript, Markdown, Yaml and Python files
run: |
# Find all Go, Rust, JavaScripts, Markdown and Python files (excluding vendored/generated code)
FILES=$(find . -type f \( -name "*.go" -o -name "*.rs" -o -name "*.py" -o -name "*.js" -o -name "*.md" -o -name "*.yaml" -o -name "*.yml" \) \
! -path "./target/*" \
! -path "./candle-binding/target/*" \
! -path "./.git/*" \
! -path "./node_modules/*" \
! -path "./vendor/*" \
! -path "./__pycache__/*" \
! -path "./site/*" \
! -name "*.pb.go" \
| tr '\n' ' ')

if [ -n "$FILES" ]; then
echo "Running pre-commit on files: $FILES"
pre-commit run --files $FILES
else
echo "No Go, Rust, JavaScript, Markdown, Yaml, or Python files found to check"
fi
- name: Run pre-commit check
run: make precommit-check

- name: Show pre-commit results
if: failure()
40 changes: 40 additions & 0 deletions .github/workflows/precommit-publish.yml
@@ -0,0 +1,40 @@
name: Create and publish Precommit Image

on:
push:
branches: [ "main" ]
pull_request:
paths:
- 'Dockerfile.precommit'

jobs:
build_and_push:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write

steps:
- name: Check out the repo
uses: actions/checkout@v4

- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Generate date tag for nightly builds
id: date
if: inputs.is_nightly == true
run: echo "date_tag=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT

- name: Build and push Docker image
uses: docker/build-push-action@v5
with:
context: .
file: ./Dockerfile.precommit
push: ${{ github.event_name != 'pull_request' }} # Only push on merge to main, not on PRs
tags: |
${{ inputs.is_nightly != true && format('ghcr.io/{0}/semantic-router/precommit:latest', github.repository_owner) || '' }}
1 change: 0 additions & 1 deletion Dockerfile.extproc
@@ -15,7 +15,6 @@ COPY tools/make/ tools/make/
COPY Makefile ./
COPY candle-binding/Cargo.toml candle-binding/
COPY candle-binding/src/ candle-binding/src/
Copy tools ./tools

# Use Makefile to build the Rust library
RUN make rust
29 changes: 29 additions & 0 deletions Dockerfile.precommit
@@ -0,0 +1,29 @@
FROM golang:1.24

# Install Base env
RUN apt-get update && apt-get install -y \
make \
build-essential \
pkg-config \
python3 \
python3-pip

# Install Node.js and npm
RUN curl -fsSL https://deb.nodesource.com/setup_lts.x | bash - && \
apt-get install -y nodejs

# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
. $HOME/.cargo/env

# Markdown
RUN npm install -g markdownlint-cli

# Install pre-commit and tools
RUN pip install --break-system-packages pre-commit

# Yamllint
RUN pip install --break-system-packages yamllint

# CodeSpell
RUN pip install --break-system-packages codespell
1 change: 1 addition & 0 deletions Makefile
@@ -13,6 +13,7 @@ _run:
-f tools/make/linter.mk \
-f tools/make/milvus.mk \
-f tools/make/models.mk \
-f tools/make/pre-commit.mk \
$(MAKECMDGOALS)

.PHONY: _run
@@ -178,35 +178,62 @@ def parse_args():
return parser.parse_args()


def get_dataset_optimal_tokens(dataset_info):
def get_dataset_optimal_tokens(dataset_info, model_name=None):
"""
Determine optimal token limit based on dataset complexity and reasoning requirements.
Determine optimal token limit based on dataset complexity, reasoning requirements, and model capabilities.

Token limits are optimized for structured response generation while maintaining
efficiency across different reasoning complexity levels.
efficiency across different reasoning complexity levels and model architectures.

Args:
dataset_info: Dataset information object
model_name: Model identifier (e.g., "openai/gpt-oss-20b", "Qwen/Qwen3-30B-A3B")
"""
dataset_name = dataset_info.name.lower()
difficulty = dataset_info.difficulty_level.lower()

# Optimized token limits per dataset (increased for reasoning mode support)
dataset_tokens = {
"gpqa": 1500, # Graduate-level scientific reasoning
# Determine model type and capabilities
model_multiplier = 1.0
if model_name:
model_lower = model_name.lower()
if "qwen" in model_lower:
# Qwen models are more efficient and can handle longer contexts
model_multiplier = 1.5
elif "deepseek" in model_lower:
# DeepSeek models (e.g., V3.1) are capable and can handle longer contexts
model_multiplier = 1.5
elif "gpt-oss" in model_lower:
# GPT-OSS models use baseline token limits
model_multiplier = 1.0
# Default to baseline for unknown models

# Base token limits per dataset (optimized for gpt-oss20b baseline)
base_dataset_tokens = {
"gpqa": 3000, # Graduate-level scientific reasoning (increased for complex multi-step reasoning)
"truthfulqa": 800, # Misconception analysis
"hellaswag": 800, # Natural continuation reasoning
"arc": 800, # Elementary/middle school science
"commonsenseqa": 1000, # Common sense reasoning
"mmlu": 600 if difficulty == "undergraduate" else 800, # Academic knowledge
"mmlu": 3000, # Academic knowledge (increased for complex technical domains like engineering/chemistry)
}

# Find matching dataset
for dataset_key, tokens in dataset_tokens.items():
# Find matching dataset and apply model multiplier
base_tokens = None
for dataset_key, tokens in base_dataset_tokens.items():
if dataset_key in dataset_name:
return tokens
base_tokens = tokens
break

# Fallback to difficulty-based tokens if dataset not found
if base_tokens is None:
difficulty_tokens = {"graduate": 300, "hard": 300, "moderate": 200, "easy": 150}
base_tokens = difficulty_tokens.get(difficulty, 200)

# Default based on difficulty level
difficulty_tokens = {"graduate": 300, "hard": 300, "moderate": 200, "easy": 150}
# Apply model-specific multiplier and round to nearest 50
final_tokens = int(base_tokens * model_multiplier)
final_tokens = ((final_tokens + 25) // 50) * 50 # Round to nearest 50

return difficulty_tokens.get(difficulty, 200)
return final_tokens


def get_available_models(endpoint: str, api_key: str = "") -> List[str]:
@@ -507,6 +534,20 @@ def evaluate_model_vllm_multimode(
q.cot_content is not None and q.cot_content.strip() for q in questions[:10]
)

# Debug: Show CoT content status for first few questions
print(f" CoT Debug - Checking first 10 questions:")
for i, q in enumerate(questions[:10]):
cot_status = (
"None"
if q.cot_content is None
else (
f"'{q.cot_content[:50]}...'"
if len(q.cot_content) > 50
else f"'{q.cot_content}'"
)
)
print(f" Q{i+1}: CoT = {cot_status}")

if has_cot_content:
print(f" Dataset has CoT content - using 3 modes: NR, XC, NR_REASONING")
else:
@@ -827,28 +868,31 @@ def main():
print(f"Router models: {router_models}")
print(f"vLLM models: {vllm_models}")

# Determine optimal token limit for this dataset
if args.max_tokens:
optimal_tokens = args.max_tokens
print(f"Using user-specified max_tokens: {optimal_tokens}")
else:
optimal_tokens = get_dataset_optimal_tokens(dataset_info)
print(
f"Using dataset-optimal max_tokens: {optimal_tokens} (for {dataset_info.name})"
)
# Function to get optimal tokens for a specific model
# For fair comparison, use consistent token limits regardless of model name
def get_model_optimal_tokens(model_name):
if args.max_tokens:
return args.max_tokens
else:
# Use base dataset tokens without model-specific multipliers for fair comparison
return get_dataset_optimal_tokens(dataset_info, model_name=None)

# Router evaluation (NR-only)
if args.run_router and router_endpoint and router_models:
for model in router_models:
model_tokens = get_model_optimal_tokens(model)
print(f"\nEvaluating router model: {model}")
print(
f"Using max_tokens: {model_tokens} (dataset-optimized for fair comparison)"
)
rt_df = evaluate_model_router_transparent(
questions=questions,
dataset=dataset,
model=model,
endpoint=router_endpoint,
api_key=router_api_key,
concurrent_requests=args.concurrent_requests,
max_tokens=optimal_tokens,
max_tokens=model_tokens,
temperature=args.temperature,
)
analysis = analyze_results(rt_df)
@@ -863,15 +907,19 @@
# Direct vLLM evaluation (NR/XC with reasoning ON/OFF)
if args.run_vllm and vllm_endpoint and vllm_models:
for model in vllm_models:
model_tokens = get_model_optimal_tokens(model)
print(f"\nEvaluating vLLM model: {model}")
print(
f"Using max_tokens: {model_tokens} (dataset-optimized for fair comparison)"
)
vdf = evaluate_model_vllm_multimode(
questions=questions,
dataset=dataset,
model=model,
endpoint=vllm_endpoint,
api_key=vllm_api_key,
concurrent_requests=args.concurrent_requests,
max_tokens=optimal_tokens,
max_tokens=model_tokens,
temperature=args.temperature,
exec_modes=args.vllm_exec_modes,
)
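The token-budget changes above are easier to see with the arithmetic written out. Below is a condensed, standalone sketch of the diffed get_dataset_optimal_tokens logic; the DatasetInfo stand-in is a simplification of the benchmark's dataset object, not the real class. Note that the new get_model_optimal_tokens helper in main() deliberately passes model_name=None, so router and direct vLLM runs get the same dataset-based budget for a fair comparison.

from dataclasses import dataclass
from typing import Optional


@dataclass
class DatasetInfo:          # minimal stand-in for the benchmark's dataset object
    name: str
    difficulty_level: str


def optimal_tokens(info: DatasetInfo, model_name: Optional[str] = None) -> int:
    multiplier = 1.0
    if model_name:
        m = model_name.lower()
        if "qwen" in m or "deepseek" in m:
            multiplier = 1.5        # longer-context models get a 1.5x budget
        elif "gpt-oss" in m:
            multiplier = 1.0        # baseline

    base = {"gpqa": 3000, "truthfulqa": 800, "hellaswag": 800,
            "arc": 800, "commonsenseqa": 1000, "mmlu": 3000}
    tokens = next((t for key, t in base.items() if key in info.name.lower()), None)
    if tokens is None:              # unknown dataset: fall back to difficulty level
        tokens = {"graduate": 300, "hard": 300,
                  "moderate": 200, "easy": 150}.get(info.difficulty_level.lower(), 200)

    return ((int(tokens * multiplier) + 25) // 50) * 50   # round to nearest 50


print(optimal_tokens(DatasetInfo("gpqa", "graduate")))                         # 3000
print(optimal_tokens(DatasetInfo("gpqa", "graduate"), "Qwen/Qwen3-30B-A3B"))   # 4500
print(optimal_tokens(DatasetInfo("unknown", "moderate")))                      # 200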
5 changes: 3 additions & 2 deletions config/config.yaml
@@ -8,6 +8,7 @@ semantic_cache:
similarity_threshold: 0.8
max_entries: 1000 # Only applies to memory backend
ttl_seconds: 3600
eviction_policy: "fifo" # "fifo", "lru", "lfu", currently only supports memory backend

# For production environments, use Milvus for scalable caching:
# backend_type: "milvus"
@@ -46,14 +47,14 @@ vllm_endpoints:
- "phi4"
- "gemma3:27b"
weight: 1 # Load balancing weight
health_check_path: "/health" # Optional health check endpoint
health_check_path: "/api/version" # Optional health check endpoint
- name: "endpoint2"
address: "127.0.0.1"
port: 11434
models:
- "mistral-small3.1"
weight: 1
health_check_path: "/health"
health_check_path: "/api/version"
- name: "endpoint3"
address: "127.0.0.1"
port: 11434
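For readers unfamiliar with the eviction_policy values added to config.yaml, the sketch below illustrates how "fifo", "lru", and "lfu" differ in which entry gets evicted when max_entries is reached. It is an illustration of the general policies only, not the router's actual cache implementation, which this diff does not show.

from collections import Counter, OrderedDict


class TinyCache:
    """Toy in-memory cache illustrating fifo / lru / lfu eviction."""

    def __init__(self, max_entries: int, policy: str = "fifo"):
        self.max_entries = max_entries
        self.policy = policy
        self.data = OrderedDict()   # insertion order doubles as the fifo/lru queue
        self.hits = Counter()       # access counts, used only by lfu

    def get(self, key: str):
        if key not in self.data:
            return None
        self.hits[key] += 1
        if self.policy == "lru":
            self.data.move_to_end(key)  # most recently used moves to the back
        return self.data[key]

    def put(self, key: str, value: str) -> None:
        if key not in self.data and len(self.data) >= self.max_entries:
            if self.policy == "lfu":
                victim = min(self.data, key=lambda k: self.hits[k])  # fewest hits
            else:
                victim = next(iter(self.data))  # fifo: oldest insert; lru: least recently touched
            self.data.pop(victim)
            self.hits.pop(victim, None)
        self.data[key] = value


cache = TinyCache(max_entries=2, policy="lru")
cache.put("a", "1"); cache.put("b", "2"); cache.get("a")
cache.put("c", "3")   # evicts "b" under lru; under fifo it would evict "a"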