diff --git a/Dockerfile.extproc b/Dockerfile.extproc
index 1ba8b45e..72ead6e4 100644
--- a/Dockerfile.extproc
+++ b/Dockerfile.extproc
@@ -55,4 +55,8 @@ ENV LD_LIBRARY_PATH=/app/lib
 
 EXPOSE 50051
 
-CMD ["/app/extproc-server", "--config", "/app/config/config.yaml"]
+# Copy entrypoint to allow switching config via env var CONFIG_FILE
+COPY scripts/entrypoint.sh /app/entrypoint.sh
+RUN chmod +x /app/entrypoint.sh
+
+ENTRYPOINT ["/app/entrypoint.sh"]
diff --git a/config/config.testing.yaml b/config/config.testing.yaml
new file mode 100644
index 00000000..0b84e0ff
--- /dev/null
+++ b/config/config.testing.yaml
@@ -0,0 +1,84 @@
+bert_model:
+  model_id: sentence-transformers/all-MiniLM-L12-v2
+  threshold: 0.6
+  use_cpu: true
+
+semantic_cache:
+  enabled: true
+  backend_type: "memory"
+  similarity_threshold: 0.8
+  max_entries: 1000
+  ttl_seconds: 3600
+  eviction_policy: "fifo"
+
+tools:
+  enabled: true
+  top_k: 3
+  similarity_threshold: 0.2
+  tools_db_path: "config/tools_db.json"
+  fallback_to_empty: true
+
+prompt_guard:
+  enabled: true
+  use_modernbert: true
+  model_id: "models/jailbreak_classifier_modernbert-base_model"
+  threshold: 0.7
+  use_cpu: true
+  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
+
+vllm_endpoints:
+  - name: "mock"
+    address: "mock-vllm"
+    port: 8000
+    models:
+      - "openai/gpt-oss-20b"
+    weight: 1
+    health_check_path: "/health"
+
+model_config:
+  "openai/gpt-oss-20b":
+    reasoning_family: "gpt-oss"
+    preferred_endpoints: ["mock"]
+    pii_policy:
+      allow_by_default: true
+
+categories:
+  - name: other
+    model_scores:
+      - model: openai/gpt-oss-20b
+        score: 0.7
+        use_reasoning: false
+
+default_model: openai/gpt-oss-20b
+
+reasoning_families:
+  deepseek:
+    type: "chat_template_kwargs"
+    parameter: "thinking"
+
+  qwen3:
+    type: "chat_template_kwargs"
+    parameter: "enable_thinking"
+
+  gpt-oss:
+    type: "reasoning_effort"
+    parameter: "reasoning_effort"
+  gpt:
+    type: "reasoning_effort"
+    parameter: "reasoning_effort"
+
+default_reasoning_effort: high
+
+api:
+  batch_classification:
+    max_batch_size: 100
+    concurrency_threshold: 5
+    max_concurrency: 8
+    metrics:
+      enabled: true
+      detailed_goroutine_tracking: true
+      high_resolution_timing: false
+      sample_rate: 1.0
+      duration_buckets:
+        [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
+      size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
diff --git a/docker-compose.yml b/docker-compose.yml
index 09f7b9ad..afc7e7e1 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -13,6 +13,7 @@ services:
       - ./models:/app/models:ro
     environment:
       - LD_LIBRARY_PATH=/app/lib
+      - CONFIG_FILE=${CONFIG_FILE:-/app/config/config.yaml}
     networks:
       - semantic-network
    healthcheck:
@@ -44,6 +45,24 @@
       retries: 5
       start_period: 10s
 
+  # Mock vLLM service for testing profile
+  mock-vllm:
+    build:
+      context: ./tools/mock-vllm
+      dockerfile: Dockerfile
+    container_name: mock-vllm
+    profiles: ["testing"]
+    ports:
+      - "8000:8000"
+    networks:
+      - semantic-network
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://localhost:8000/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 5s
+
 networks:
   semantic-network:
     driver: bridge
diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh
new file mode 100644
index 00000000..c0b4093a
--- /dev/null
+++ b/scripts/entrypoint.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+CONFIG_FILE_PATH=${CONFIG_FILE:-/app/config/config.yaml}
+
+if [[ ! -f "$CONFIG_FILE_PATH" ]]; then
+  echo "[entrypoint] Config file not found at $CONFIG_FILE_PATH" >&2
+  exit 1
+fi
+
+echo "[entrypoint] Starting semantic-router with config: $CONFIG_FILE_PATH"
+exec /app/extproc-server --config "$CONFIG_FILE_PATH"
diff --git a/tools/mock-vllm/Dockerfile b/tools/mock-vllm/Dockerfile
new file mode 100644
index 00000000..ea955b2b
--- /dev/null
+++ b/tools/mock-vllm/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY app.py ./
+
+EXPOSE 8000
+
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/tools/mock-vllm/README.md b/tools/mock-vllm/README.md
new file mode 100644
index 00000000..1ac7a9b8
--- /dev/null
+++ b/tools/mock-vllm/README.md
@@ -0,0 +1,9 @@
+# Mock vLLM (OpenAI-compatible) service
+
+A tiny FastAPI server that emulates the minimal endpoints used by the router:
+
+- GET /health
+- GET /v1/models
+- POST /v1/chat/completions
+
+Intended for local testing with Docker Compose profile `testing`.
diff --git a/tools/mock-vllm/app.py b/tools/mock-vllm/app.py
new file mode 100644
index 00000000..e4d02d15
--- /dev/null
+++ b/tools/mock-vllm/app.py
@@ -0,0 +1,81 @@
+import math
+import time
+from typing import List, Optional
+
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+app = FastAPI()
+
+
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+
+
+class ChatRequest(BaseModel):
+    model: str
+    messages: List[ChatMessage]
+    temperature: Optional[float] = 0.2
+
+
+@app.get("/health")
+async def health():
+    return {"status": "ok"}
+
+
+@app.get("/v1/models")
+async def models():
+    return {"data": [{"id": "openai/gpt-oss-20b", "object": "model"}]}
+
+
+@app.post("/v1/chat/completions")
+async def chat_completions(req: ChatRequest):
+    # Very simple echo-like behavior
+    last_user = next(
+        (m.content for m in reversed(req.messages) if m.role == "user"), ""
+    )
+    content = f"[mock-{req.model}] You said: {last_user}"
+
+    # Rough token estimation: ~1 token per 4 characters (ceil)
+    def estimate_tokens(text: str) -> int:
+        if not text:
+            return 0
+        return max(1, math.ceil(len(text) / 4))
+
+    prompt_text = "\n".join(
+        m.content for m in req.messages if isinstance(m.content, str)
+    )
+    prompt_tokens = estimate_tokens(prompt_text)
+    completion_tokens = estimate_tokens(content)
+    total_tokens = prompt_tokens + completion_tokens
+
+    created_ts = int(time.time())
+
+    usage = {
+        "prompt_tokens": prompt_tokens,
+        "completion_tokens": completion_tokens,
+        "total_tokens": total_tokens,
+        # Optional details fields some clients read when using caching/reasoning
+        "prompt_tokens_details": {"cached_tokens": 0},
+        "completion_tokens_details": {"reasoning_tokens": 0},
+    }
+
+    return {
+        "id": "cmpl-mock-123",
+        "object": "chat.completion",
+        "created": created_ts,
+        "model": req.model,
+        "system_fingerprint": "mock-vllm",
+        "choices": [
+            {
+                "index": 0,
+                "message": {"role": "assistant", "content": content},
+                "finish_reason": "stop",
+                "logprobs": None,
+            }
+        ],
+        "usage": usage,
+        # Some SDKs look for token_usage; keep it as an alias for convenience.
+        "token_usage": usage,
+    }
diff --git a/tools/mock-vllm/requirements.txt b/tools/mock-vllm/requirements.txt
new file mode 100644
index 00000000..3971515d
--- /dev/null
+++ b/tools/mock-vllm/requirements.txt
@@ -0,0 +1,3 @@
+fastapi==0.115.0
+uvicorn==0.30.6
+pydantic==2.9.2
diff --git a/website/docs/getting-started/docker-quickstart.md b/website/docs/getting-started/docker-quickstart.md
index e06bed44..6a517ff2 100644
--- a/website/docs/getting-started/docker-quickstart.md
+++ b/website/docs/getting-started/docker-quickstart.md
@@ -4,42 +4,36 @@ Run Semantic Router + Envoy locally using Docker Compose v2.
 
 ## Prerequisites
 
-- Docker Engine and Docker Compose v2 (use the `docker compose` command, not the legacy `docker-compose`)
+- Docker Engine; see [Docker Engine Installation](https://docs.docker.com/engine/install/)
+- Docker Compose v2 (use the `docker compose` command, not the legacy `docker-compose`)
 
-  ```bash
-  # Verify
-  docker compose version
-  ```
+  If the Compose plugin is missing, see [Docker Compose Plugin Installation](https://docs.docker.com/compose/install/linux/#install-using-the-repository)
 
-  Install Docker Compose v2 for Ubuntu(if missing), see more in [Docker Compose Plugin Installation](https://docs.docker.com/compose/install/linux/#install-using-the-repository)
+  ```bash
+  # For Ubuntu and Debian, run:
+  sudo apt-get update
+  sudo apt-get install -y docker-compose-plugin
 
-  ```bash
-  # Remove legacy v1 if present (optional)
-  sudo apt-get remove -y docker-compose || true
+  # For RPM-based distributions, run:
+  sudo yum update
+  sudo yum install docker-compose-plugin
 
-  sudo apt-get update
-  sudo apt-get install -y ca-certificates curl gnupg
-  sudo install -m 0755 -d /etc/apt/keyrings
-  curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --yes --dearmor -o /etc/apt/keyrings/docker.gpg
-  echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo $VERSION_CODENAME) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
-  sudo apt-get update
-  sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
-
-  docker compose version
-  ```
+  # Verify
+  docker compose version
+  ```
 
 - Ensure ports 8801, 50051, 19000 are free
 
 ## Install and Run with Docker Compose v2
 
-1) Clone the repo and move into it (from your workspace root):
+**1. Clone the repo and move into it (from your workspace root)**
 
 ```bash
 git clone https://github.com/vllm-project/semantic-router.git
 cd semantic-router
 ```
 
-2) Download required models (classification models):
+**2. Download required models (classification models)**
 
 ```bash
 make download-models
@@ -53,7 +47,7 @@ This downloads the classification models used by the router:
 
 Note: The BERT similarity model defaults to a remote Hugging Face model. See Troubleshooting for offline/local usage.
 
-3) Start the services with Docker Compose v2:
+**3. Start the services with Docker Compose v2**
 
 ```bash
 # Start core services (semantic-router + envoy)
 docker compose up --build
 
 # Or run in background (recommended)
 docker compose up --build -d
 
-# With testing profile (includes mock vLLM)
-docker compose --profile testing up --build
+# With testing profile (includes mock vLLM). Use the testing config to point the router at the mock endpoint:
+# (CONFIG_FILE is read by the router entrypoint; the file is mounted from ./config)
+CONFIG_FILE=/app/config/config.testing.yaml docker compose --profile testing up --build
 ```
 
-4) Verify
+**4. Verify**
 
 - Semantic Router (gRPC): localhost:50051
 - Envoy Proxy: http://localhost:8801
@@ -90,7 +85,7 @@ docker compose down
 
 ## Troubleshooting
 
-### 1) Router exits immediately with a Hugging Face DNS/download error
+**1. Router exits immediately with a Hugging Face DNS/download error**
 
 Symptoms (from `docker compose logs -f semantic-router`):
 
@@ -103,32 +98,62 @@ Why: `bert_model.model_id` in `config/config.yaml` points to a remote model (`se
 
 Fix options:
 
 - Allow network access in the container (online):
+
   - Ensure your host can resolve DNS, or add DNS servers to the `semantic-router` service in `docker-compose.yml`:
 
-  ```yaml
-  services:
-    semantic-router:
-      # ...
-      dns:
-        - 1.1.1.1
-        - 8.8.8.8
-  ```
+    ```yaml
+    services:
+      semantic-router:
+        # ...
+        dns:
+          - 1.1.1.1
+          - 8.8.8.8
+    ```
+
   - If behind a proxy, set `http_proxy/https_proxy/no_proxy` env vars for the service.
 
 - Use a local copy of the model (offline):
-  1. Download `sentence-transformers/all-MiniLM-L12-v2` to `./models/sentence-transformers/all-MiniLM-L12-v2/` on the host.
-  2. Update `config/config.yaml` to use the local path (mounted into the container at `/app/models`):
-  ```yaml
-  bert_model:
-    model_id: "models/sentence-transformers/all-MiniLM-L12-v2"
-    threshold: 0.6
-    use_cpu: true
-  ```
+  1. Download `sentence-transformers/all-MiniLM-L12-v2` to `./models/sentence-transformers/all-MiniLM-L12-v2/` on the host.
+  2. Update `config/config.yaml` to use the local path (mounted into the container at `/app/models`):
+
+     ```yaml
+     bert_model:
+       model_id: "models/sentence-transformers/all-MiniLM-L12-v2"
+       threshold: 0.6
+       use_cpu: true
+     ```
+
+  3. Recreate services: `docker compose up -d --build`
 
-  3. Recreate services: `docker compose up -d --build`
+Extra tip: If you use the testing profile, also pass the testing config so the router targets the mock service:
+
+```bash
+CONFIG_FILE=/app/config/config.testing.yaml docker compose --profile testing up --build
+```
+
+**2. Envoy/Router up but requests fail**
+
+- Ensure `mock-vllm` is healthy (testing profile only):
+  - `docker compose ps` should show mock-vllm healthy; logs show 200 on `/health`.
+- Verify the router config in use:
+  - Router logs print `Starting vLLM Semantic Router ExtProc with config: ...`. If it shows `/app/config/config.yaml` while testing, you forgot `CONFIG_FILE`.
+- Basic smoke test via Envoy (OpenAI-compatible):
+  - Send a POST to `http://localhost:8801/v1/chat/completions` with `{"model":"auto", "messages":[{"role":"user","content":"hi"}]}` and check that the mock responds with `[mock-openai/gpt-oss-20b]` content when the testing profile is active.
+
+**3. DNS problems inside containers**
+
+If DNS is flaky in your Docker environment, add DNS servers to the `semantic-router` service in `docker-compose.yml`:
+
+```yaml
+services:
+  semantic-router:
+    # ...
+    dns:
+      - 1.1.1.1
+      - 8.8.8.8
+```
 
-### 2) Port already in use
+For corporate proxies, set `http_proxy`, `https_proxy`, and `no_proxy` in the service `environment`.
 
 Make sure 8801, 50051, 19000 are not bound by other processes. Adjust ports in `docker-compose.yml` if needed.
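
The smoke test from the troubleshooting notes can be scripted as below. This is a minimal sketch, assuming the testing profile is running, Envoy is listening on localhost:8801, and the router resolves `"auto"` to `openai/gpt-oss-20b` as configured in `config/config.testing.yaml`:

```bash
# Minimal smoke test against the mock backend via Envoy (testing profile).
# The mock echoes the prompt, so the reply content should look like:
#   "[mock-openai/gpt-oss-20b] You said: hi"
curl -s http://localhost:8801/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "auto", "messages": [{"role": "user", "content": "hi"}]}'
```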