Closed

Changes from all commits (23 commits):
4aa6ff4  Add revived e2e tests and updates (yossiovadia, Sep 17, 2025)
e5d3220  fix: increase e2e test timeouts and update config health check (yossiovadia, Sep 18, 2025)
c9d714d  fix: update remaining e2e test timeouts from 10s to 30s (yossiovadia, Sep 18, 2025)
83cffca  feat: harden jailbreak tests - remove 503 acceptance and add actual j… (yossiovadia, Sep 18, 2025)
fefd96b  feat: add auto routing intelligence test to validate actual model sel… (yossiovadia, Sep 18, 2025)
304a84b  feat: harden all remaining e2e tests - remove 503 acceptance and add … (yossiovadia, Sep 18, 2025)
ab2017d  cleanup: simplify status report and remove redundant 503 comments (yossiovadia, Sep 18, 2025)
ae18b6a  fix: remove DAN jargon from status report (yossiovadia, Sep 18, 2025)
f406747  Delete CLAUDE.md (yossiovadia, Sep 18, 2025)
d36d323  feat: add DCO signoffs to all test files and status report (yossiovadia, Sep 19, 2025)
08a8041  feat: add DCO signoffs to remaining test files (yossiovadia, Sep 19, 2025)
e3ac473  metrics: Add request-level token histograms (#157) (tao12345666333, Sep 18, 2025)
ea14306  docs: add repo URL in docker/README.md (#163) (cryo-zd, Sep 18, 2025)
9a43f53  remove discarded fields from documents (#165) (lengrongfu, Sep 18, 2025)
a329bb5  Correct tools directory copy command in Dockerfile (#171) (yuluo-yx, Sep 18, 2025)
fd7c267  feat: add basic cache eviction policy: LRU/LFU/FIFO (#166) (aeft, Sep 18, 2025)
5a0a957  chore: add just max token for different models in router bench (rootfs, Sep 15, 2025)
1d37497  docs: Model Performance Evaluation Guide (#136) (JaredforReal, Sep 19, 2025)
9815862  api: add semantic route support (#147) (Xunzhuo, Sep 19, 2025)
a6affe4  infra: update Dockerfile.extproc (#169) (yuluo-yx, Sep 19, 2025)
6569c5a  feat: add more content for contribution docs (#175) (yuluo-yx, Sep 19, 2025)
afd9cb8  fix: avoid double counting cache hits (#177) (cryo-zd, Sep 19, 2025)
2ab2523  refactor: remove failing tests for separate PRs (yossiovadia, Sep 22, 2025)
2 changes: 1 addition & 1 deletion .github/workflows/docker-publish.yml
@@ -48,4 +48,4 @@ jobs:
push: ${{ github.event_name != 'pull_request' }} # Only push on merge to main, not on PRs
tags: |
${{ inputs.is_nightly == true && format('ghcr.io/{0}/semantic-router/extproc:nightly-{1}', github.repository_owner, steps.date.outputs.date_tag) || format('ghcr.io/{0}/semantic-router/extproc:{1}', github.repository_owner, github.sha) }}
${{ inputs.is_nightly != true && format('ghcr.io/{0}/semantic-router/extproc:latest', github.repository_owner) || '' }}
${{ inputs.is_nightly != true && format('ghcr.io/{0}/semantic-router/extproc:latest', github.repository_owner) || '' }}
26 changes: 4 additions & 22 deletions .github/workflows/pre-commit.yml
@@ -45,7 +45,7 @@ jobs:
sudo apt-get install -y \
make \
build-essential \
pkg-config
pkg-config
npm install -g markdownlint-cli
pip install --user yamllint codespell

@@ -81,31 +81,13 @@ jobs:
key: ${{ runner.os }}-pre-commit-${{ hashFiles('.pre-commit-config.yaml') }}

- name: Install pre-commit
run: pip install pre-commit
run: make precommit-install

- name: Run Code Spell Check
run: make codespell

- name: Run pre-commit on Go, Rust, JavaScript, Markdown, Yaml and Python files
run: |
# Find all Go, Rust, JavaScripts, Markdown and Python files (excluding vendored/generated code)
FILES=$(find . -type f \( -name "*.go" -o -name "*.rs" -o -name "*.py" -o -name "*.js" -o -name "*.md" -o -name "*.yaml" -o -name "*.yml" \) \
! -path "./target/*" \
! -path "./candle-binding/target/*" \
! -path "./.git/*" \
! -path "./node_modules/*" \
! -path "./vendor/*" \
! -path "./__pycache__/*" \
! -path "./site/*" \
! -name "*.pb.go" \
| tr '\n' ' ')

if [ -n "$FILES" ]; then
echo "Running pre-commit on files: $FILES"
pre-commit run --files $FILES
else
echo "No Go, Rust, JavaScript, Markdown, Yaml, or Python files found to check"
fi
- name: Run pre-commit check
run: make precommit-check

- name: Show pre-commit results
if: failure()
40 changes: 40 additions & 0 deletions .github/workflows/precommit-publish.yml
@@ -0,0 +1,40 @@
name: Create and publish Precommit Image

on:
push:
branches: [ "main" ]
pull_request:
paths:
- 'Dockerfile.precommit'

jobs:
build_and_push:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write

steps:
- name: Check out the repo
uses: actions/checkout@v4

- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Generate date tag for nightly builds
id: date
if: inputs.is_nightly == true
run: echo "date_tag=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT

- name: Build and push Docker image
uses: docker/build-push-action@v5
with:
context: .
file: ./Dockerfile.precommit
push: ${{ github.event_name != 'pull_request' }} # Only push on merge to main, not on PRs
tags: |
${{ inputs.is_nightly != true && format('ghcr.io/{0}/semantic-router/precommit:latest', github.repository_owner) || '' }}
1 change: 0 additions & 1 deletion Dockerfile.extproc
@@ -15,7 +15,6 @@ COPY tools/make/ tools/make/
COPY Makefile ./
COPY candle-binding/Cargo.toml candle-binding/
COPY candle-binding/src/ candle-binding/src/
Copy tools ./tools

# Use Makefile to build the Rust library
RUN make rust
29 changes: 29 additions & 0 deletions Dockerfile.precommit
@@ -0,0 +1,29 @@
FROM golang:1.24

# Install Base env
RUN apt-get update && apt-get install -y \
make \
build-essential \
pkg-config \
python3 \
python3-pip

# Install Node.js and npm
RUN curl -fsSL https://deb.nodesource.com/setup_lts.x | bash - && \
apt-get install -y nodejs

# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
. $HOME/.cargo/env

# Markdown
RUN npm install -g markdownlint-cli

# Install pre-commit and tools
RUN pip install --break-system-packages pre-commit

# Yamllint
RUN pip install --break-system-packages yamllint

# CodeSpell
RUN pip install --break-system-packages codespell
1 change: 1 addition & 0 deletions Makefile
@@ -13,6 +13,7 @@ _run:
-f tools/make/linter.mk \
-f tools/make/milvus.mk \
-f tools/make/models.mk \
-f tools/make/pre-commit.mk \
$(MAKECMDGOALS)

.PHONY: _run
@@ -178,35 +178,62 @@ def parse_args():
return parser.parse_args()


def get_dataset_optimal_tokens(dataset_info):
def get_dataset_optimal_tokens(dataset_info, model_name=None):
"""
Determine optimal token limit based on dataset complexity and reasoning requirements.
Determine optimal token limit based on dataset complexity, reasoning requirements, and model capabilities.

Token limits are optimized for structured response generation while maintaining
efficiency across different reasoning complexity levels.
efficiency across different reasoning complexity levels and model architectures.

Args:
dataset_info: Dataset information object
model_name: Model identifier (e.g., "openai/gpt-oss-20b", "Qwen/Qwen3-30B-A3B")
"""
dataset_name = dataset_info.name.lower()
difficulty = dataset_info.difficulty_level.lower()

# Optimized token limits per dataset (increased for reasoning mode support)
dataset_tokens = {
"gpqa": 1500, # Graduate-level scientific reasoning
# Determine model type and capabilities
model_multiplier = 1.0
if model_name:
model_lower = model_name.lower()
if "qwen" in model_lower:
# Qwen models are more efficient and can handle longer contexts
model_multiplier = 1.5
elif "deepseek" in model_lower:
# DeepSeek models (e.g., V3.1) are capable and can handle longer contexts
model_multiplier = 1.5
elif "gpt-oss" in model_lower:
# GPT-OSS models use baseline token limits
model_multiplier = 1.0
# Default to baseline for unknown models

# Base token limits per dataset (optimized for gpt-oss20b baseline)
base_dataset_tokens = {
"gpqa": 3000, # Graduate-level scientific reasoning (increased for complex multi-step reasoning)
"truthfulqa": 800, # Misconception analysis
"hellaswag": 800, # Natural continuation reasoning
"arc": 800, # Elementary/middle school science
"commonsenseqa": 1000, # Common sense reasoning
"mmlu": 600 if difficulty == "undergraduate" else 800, # Academic knowledge
"mmlu": 3000, # Academic knowledge (increased for complex technical domains like engineering/chemistry)
}

# Find matching dataset
for dataset_key, tokens in dataset_tokens.items():
# Find matching dataset and apply model multiplier
base_tokens = None
for dataset_key, tokens in base_dataset_tokens.items():
if dataset_key in dataset_name:
return tokens
base_tokens = tokens
break

# Fallback to difficulty-based tokens if dataset not found
if base_tokens is None:
difficulty_tokens = {"graduate": 300, "hard": 300, "moderate": 200, "easy": 150}
base_tokens = difficulty_tokens.get(difficulty, 200)

# Default based on difficulty level
difficulty_tokens = {"graduate": 300, "hard": 300, "moderate": 200, "easy": 150}
# Apply model-specific multiplier and round to nearest 50
final_tokens = int(base_tokens * model_multiplier)
final_tokens = ((final_tokens + 25) // 50) * 50 # Round to nearest 50

return difficulty_tokens.get(difficulty, 200)
return final_tokens


def get_available_models(endpoint: str, api_key: str = "") -> List[str]:
@@ -507,6 +534,20 @@ def evaluate_model_vllm_multimode(
q.cot_content is not None and q.cot_content.strip() for q in questions[:10]
)

# Debug: Show CoT content status for first few questions
print(f" CoT Debug - Checking first 10 questions:")
for i, q in enumerate(questions[:10]):
cot_status = (
"None"
if q.cot_content is None
else (
f"'{q.cot_content[:50]}...'"
if len(q.cot_content) > 50
else f"'{q.cot_content}'"
)
)
print(f" Q{i+1}: CoT = {cot_status}")

if has_cot_content:
print(f" Dataset has CoT content - using 3 modes: NR, XC, NR_REASONING")
else:
@@ -827,28 +868,31 @@ def main():
print(f"Router models: {router_models}")
print(f"vLLM models: {vllm_models}")

# Determine optimal token limit for this dataset
if args.max_tokens:
optimal_tokens = args.max_tokens
print(f"Using user-specified max_tokens: {optimal_tokens}")
else:
optimal_tokens = get_dataset_optimal_tokens(dataset_info)
print(
f"Using dataset-optimal max_tokens: {optimal_tokens} (for {dataset_info.name})"
)
# Function to get optimal tokens for a specific model
# For fair comparison, use consistent token limits regardless of model name
def get_model_optimal_tokens(model_name):
if args.max_tokens:
return args.max_tokens
else:
# Use base dataset tokens without model-specific multipliers for fair comparison
return get_dataset_optimal_tokens(dataset_info, model_name=None)

# Router evaluation (NR-only)
if args.run_router and router_endpoint and router_models:
for model in router_models:
model_tokens = get_model_optimal_tokens(model)
print(f"\nEvaluating router model: {model}")
print(
f"Using max_tokens: {model_tokens} (dataset-optimized for fair comparison)"
)
rt_df = evaluate_model_router_transparent(
questions=questions,
dataset=dataset,
model=model,
endpoint=router_endpoint,
api_key=router_api_key,
concurrent_requests=args.concurrent_requests,
max_tokens=optimal_tokens,
max_tokens=model_tokens,
temperature=args.temperature,
)
analysis = analyze_results(rt_df)
@@ -863,15 +907,19 @@
# Direct vLLM evaluation (NR/XC with reasoning ON/OFF)
if args.run_vllm and vllm_endpoint and vllm_models:
for model in vllm_models:
model_tokens = get_model_optimal_tokens(model)
print(f"\nEvaluating vLLM model: {model}")
print(
f"Using max_tokens: {model_tokens} (dataset-optimized for fair comparison)"
)
vdf = evaluate_model_vllm_multimode(
questions=questions,
dataset=dataset,
model=model,
endpoint=vllm_endpoint,
api_key=vllm_api_key,
concurrent_requests=args.concurrent_requests,
max_tokens=optimal_tokens,
max_tokens=model_tokens,
temperature=args.temperature,
exec_modes=args.vllm_exec_modes,
)
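The token-budget changes above are easier to see with the arithmetic written out. Below is a condensed, standalone sketch of the diffed get_dataset_optimal_tokens logic; the DatasetInfo stand-in is a simplification of the benchmark's dataset object, not the real class. Note that the new get_model_optimal_tokens helper in main() deliberately passes model_name=None, so router and direct vLLM runs get the same dataset-based budget for a fair comparison.

from dataclasses import dataclass
from typing import Optional


@dataclass
class DatasetInfo:          # minimal stand-in for the benchmark's dataset object
    name: str
    difficulty_level: str


def optimal_tokens(info: DatasetInfo, model_name: Optional[str] = None) -> int:
    multiplier = 1.0
    if model_name:
        m = model_name.lower()
        if "qwen" in m or "deepseek" in m:
            multiplier = 1.5        # longer-context models get a 1.5x budget
        elif "gpt-oss" in m:
            multiplier = 1.0        # baseline

    base = {"gpqa": 3000, "truthfulqa": 800, "hellaswag": 800,
            "arc": 800, "commonsenseqa": 1000, "mmlu": 3000}
    tokens = next((t for key, t in base.items() if key in info.name.lower()), None)
    if tokens is None:              # unknown dataset: fall back to difficulty level
        tokens = {"graduate": 300, "hard": 300,
                  "moderate": 200, "easy": 150}.get(info.difficulty_level.lower(), 200)

    return ((int(tokens * multiplier) + 25) // 50) * 50   # round to nearest 50


print(optimal_tokens(DatasetInfo("gpqa", "graduate")))                         # 3000
print(optimal_tokens(DatasetInfo("gpqa", "graduate"), "Qwen/Qwen3-30B-A3B"))   # 4500
print(optimal_tokens(DatasetInfo("unknown", "moderate")))                      # 200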
5 changes: 3 additions & 2 deletions config/config.yaml
@@ -8,6 +8,7 @@ semantic_cache:
similarity_threshold: 0.8
max_entries: 1000 # Only applies to memory backend
ttl_seconds: 3600
eviction_policy: "fifo" # "fifo", "lru", "lfu", currently only supports memory backend

# For production environments, use Milvus for scalable caching:
# backend_type: "milvus"
@@ -46,14 +47,14 @@ vllm_endpoints:
- "phi4"
- "gemma3:27b"
weight: 1 # Load balancing weight
health_check_path: "/health" # Optional health check endpoint
health_check_path: "/api/version" # Optional health check endpoint
- name: "endpoint2"
address: "127.0.0.1"
port: 11434
models:
- "mistral-small3.1"
weight: 1
health_check_path: "/health"
health_check_path: "/api/version"
- name: "endpoint3"
address: "127.0.0.1"
port: 11434
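For readers unfamiliar with the eviction_policy values added to config.yaml, the sketch below illustrates how "fifo", "lru", and "lfu" differ in which entry gets evicted when max_entries is reached. It is an illustration of the general policies only, not the router's actual cache implementation, which this diff does not show.

from collections import Counter, OrderedDict


class TinyCache:
    """Toy in-memory cache illustrating fifo / lru / lfu eviction."""

    def __init__(self, max_entries: int, policy: str = "fifo"):
        self.max_entries = max_entries
        self.policy = policy
        self.data = OrderedDict()   # insertion order doubles as the fifo/lru queue
        self.hits = Counter()       # access counts, used only by lfu

    def get(self, key: str):
        if key not in self.data:
            return None
        self.hits[key] += 1
        if self.policy == "lru":
            self.data.move_to_end(key)  # most recently used moves to the back
        return self.data[key]

    def put(self, key: str, value: str) -> None:
        if key not in self.data and len(self.data) >= self.max_entries:
            if self.policy == "lfu":
                victim = min(self.data, key=lambda k: self.hits[k])  # fewest hits
            else:
                victim = next(iter(self.data))  # fifo: oldest insert; lru: least recently touched
            self.data.pop(victim)
            self.hits.pop(victim, None)
        self.data[key] = value


cache = TinyCache(max_entries=2, policy="lru")
cache.put("a", "1"); cache.put("b", "2"); cache.get("a")
cache.put("c", "3")   # evicts "b" under lru; under fifo it would evict "a"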