6 changes: 5 additions & 1 deletion Dockerfile.extproc
@@ -55,4 +55,8 @@ ENV LD_LIBRARY_PATH=/app/lib

EXPOSE 50051

CMD ["/app/extproc-server", "--config", "/app/config/config.yaml"]
# Copy entrypoint to allow switching config via env var CONFIG_FILE
COPY scripts/entrypoint.sh /app/entrypoint.sh
RUN chmod +x /app/entrypoint.sh

ENTRYPOINT ["/app/entrypoint.sh"]
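
With the entrypoint in place, the config can be switched per container by setting CONFIG_FILE at run time. A minimal sketch, assuming the image is built from Dockerfile.extproc and tagged extproc-server (the tag is illustrative; the mounts mirror docker-compose.yml):

    docker build -f Dockerfile.extproc -t extproc-server .
    docker run --rm -p 50051:50051 \
      -e CONFIG_FILE=/app/config/config.testing.yaml \
      -v "$(pwd)/config:/app/config:ro" \
      -v "$(pwd)/models:/app/models:ro" \
      extproc-server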
84 changes: 84 additions & 0 deletions config/config.testing.yaml
@@ -0,0 +1,84 @@
bert_model:
  model_id: sentence-transformers/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

semantic_cache:
  enabled: true
  backend_type: "memory"
  similarity_threshold: 0.8
  max_entries: 1000
  ttl_seconds: 3600
  eviction_policy: "fifo"

tools:
  enabled: true
  top_k: 3
  similarity_threshold: 0.2
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true

prompt_guard:
  enabled: true
  use_modernbert: true
  model_id: "models/jailbreak_classifier_modernbert-base_model"
  threshold: 0.7
  use_cpu: true
  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

vllm_endpoints:
  - name: "mock"
    address: "mock-vllm"
    port: 8000
    models:
      - "openai/gpt-oss-20b"
    weight: 1
    health_check_path: "/health"

model_config:
  "openai/gpt-oss-20b":
    reasoning_family: "gpt-oss"
    preferred_endpoints: ["mock"]
    pii_policy:
      allow_by_default: true

categories:
  - name: other
    model_scores:
      - model: openai/gpt-oss-20b
        score: 0.7
        use_reasoning: false

default_model: openai/gpt-oss-20b

reasoning_families:
  deepseek:
    type: "chat_template_kwargs"
    parameter: "thinking"

  qwen3:
    type: "chat_template_kwargs"
    parameter: "enable_thinking"

  gpt-oss:
    type: "reasoning_effort"
    parameter: "reasoning_effort"
  gpt:
    type: "reasoning_effort"
    parameter: "reasoning_effort"

default_reasoning_effort: high

api:
  batch_classification:
    max_batch_size: 100
    concurrency_threshold: 5
    max_concurrency: 8
    metrics:
      enabled: true
      detailed_goroutine_tracking: true
      high_resolution_timing: false
      sample_rate: 1.0
      duration_buckets:
        [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
      size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
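
To run the router against this testing config, the CONFIG_FILE variable added in docker-compose.yml can be set when bringing the stack up. A sketch, assuming the file is reachable inside the container at /app/config/config.testing.yaml through the existing config mount:

    CONFIG_FILE=/app/config/config.testing.yaml docker compose --profile testing up --build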
19 changes: 19 additions & 0 deletions docker-compose.yml
@@ -13,6 +13,7 @@ services:
      - ./models:/app/models:ro
    environment:
      - LD_LIBRARY_PATH=/app/lib
      - CONFIG_FILE=${CONFIG_FILE:-/app/config/config.yaml}
    networks:
      - semantic-network
    healthcheck:
@@ -44,6 +45,24 @@ services:
      retries: 5
      start_period: 10s

  # Mock vLLM service for testing profile
  mock-vllm:
    build:
      context: ./tools/mock-vllm
      dockerfile: Dockerfile
    container_name: mock-vllm
    profiles: ["testing"]
    ports:
      - "8000:8000"
    networks:
      - semantic-network
    healthcheck:
      test: ["CMD", "curl", "-fsS", "http://localhost:8000/health"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 5s

networks:
  semantic-network:
    driver: bridge
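
One way to smoke-test the testing profile is to start the mock on its own and hit its healthcheck from the host (assuming the 8000:8000 port mapping above):

    docker compose --profile testing up -d mock-vllm
    docker compose ps mock-vllm             # status should eventually show (healthy)
    curl -fsS http://localhost:8000/health  # -> {"status":"ok"}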
12 changes: 12 additions & 0 deletions scripts/entrypoint.sh
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
set -euo pipefail

CONFIG_FILE_PATH=${CONFIG_FILE:-/app/config/config.yaml}

if [[ ! -f "$CONFIG_FILE_PATH" ]]; then
  echo "[entrypoint] Config file not found at $CONFIG_FILE_PATH" >&2
  exit 1
fi

echo "[entrypoint] Starting semantic-router with config: $CONFIG_FILE_PATH"
exec /app/extproc-server --config "$CONFIG_FILE_PATH"
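
The script can be sanity-checked outside the image; a quick sketch, assuming bash and (optionally) shellcheck are installed locally:

    bash -n scripts/entrypoint.sh       # parse-only syntax check
    shellcheck scripts/entrypoint.sh    # optional lint, if shellcheck is available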
16 changes: 16 additions & 0 deletions tools/mock-vllm/Dockerfile
@@ -0,0 +1,16 @@
FROM python:3.11-slim

WORKDIR /app

RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .

EXPOSE 8000

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
9 changes: 9 additions & 0 deletions tools/mock-vllm/README.md
@@ -0,0 +1,9 @@
# Mock vLLM (OpenAI-compatible) service

A tiny FastAPI server that emulates minimal endpoints used by the router:

- GET /health
- GET /v1/models
- POST /v1/chat/completions

Intended for local testing with Docker Compose profile `testing`.
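
A quick smoke test of the first two endpoints from the host (assuming the compose port mapping 8000:8000):

    curl -fsS http://localhost:8000/health     # -> {"status":"ok"}
    curl -fsS http://localhost:8000/v1/models  # -> lists openai/gpt-oss-20b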
81 changes: 81 additions & 0 deletions tools/mock-vllm/app.py
@@ -0,0 +1,81 @@
import math
import time
from typing import List, Optional

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class ChatMessage(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.2


@app.get("/health")
async def health():
    return {"status": "ok"}


@app.get("/v1/models")
async def models():
    return {"data": [{"id": "openai/gpt-oss-20b", "object": "model"}]}


@app.post("/v1/chat/completions")
async def chat_completions(req: ChatRequest):
    # Very simple echo-like behavior
    last_user = next(
        (m.content for m in reversed(req.messages) if m.role == "user"), ""
    )
    content = f"[mock-{req.model}] You said: {last_user}"

    # Rough token estimation: ~1 token per 4 characters (ceil)
    def estimate_tokens(text: str) -> int:
        if not text:
            return 0
        return max(1, math.ceil(len(text) / 4))

    prompt_text = "\n".join(
        m.content for m in req.messages if isinstance(m.content, str)
    )
    prompt_tokens = estimate_tokens(prompt_text)
    completion_tokens = estimate_tokens(content)
    total_tokens = prompt_tokens + completion_tokens

    created_ts = int(time.time())

    usage = {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": total_tokens,
        # Optional details fields some clients read when using caching/reasoning
        "prompt_tokens_details": {"cached_tokens": 0},
        "completion_tokens_details": {"reasoning_tokens": 0},
    }

    return {
        "id": "cmpl-mock-123",
        "object": "chat.completion",
        "created": created_ts,
        "model": req.model,
        "system_fingerprint": "mock-vllm",
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": content},
                "finish_reason": "stop",
                "logprobs": None,
            }
        ],
        "usage": usage,
        # Some SDKs look for token_usage; keep it as an alias for convenience.
        "token_usage": usage,
    }
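
Given the 4-characters-per-token heuristic, the usage block is deterministic for a given request. A sketch of a request and the values the handler above would produce (port assumed from the compose mapping; output trimmed to the fields of interest):

    curl -s http://localhost:8000/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model": "openai/gpt-oss-20b", "messages": [{"role": "user", "content": "hello"}]}'
    # -> "content": "[mock-openai/gpt-oss-20b] You said: hello"
    # -> "usage": {"prompt_tokens": 2, "completion_tokens": 11, "total_tokens": 13, ...}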
3 changes: 3 additions & 0 deletions tools/mock-vllm/requirements.txt
@@ -0,0 +1,3 @@
fastapi==0.115.0
uvicorn==0.30.6
pydantic==2.9.2
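
The pinned dependencies also make it easy to run the mock without Docker; a sketch, assuming Python 3.11 and a POSIX shell at the repo root:

    python -m venv .venv && . .venv/bin/activate
    pip install -r tools/mock-vllm/requirements.txt
    uvicorn app:app --app-dir tools/mock-vllm --host 0.0.0.0 --port 8000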