6 changes: 5 additions & 1 deletion Dockerfile.extproc
@@ -55,4 +55,8 @@ ENV LD_LIBRARY_PATH=/app/lib

EXPOSE 50051

CMD ["/app/extproc-server", "--config", "/app/config/config.yaml"]
# Copy entrypoint to allow switching config via env var CONFIG_FILE
COPY scripts/entrypoint.sh /app/entrypoint.sh
RUN chmod +x /app/entrypoint.sh

ENTRYPOINT ["/app/entrypoint.sh"]
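
With the entrypoint in place, the config can be switched per container by setting CONFIG_FILE at run time. A minimal sketch, assuming the image is built from Dockerfile.extproc and tagged extproc-server (the tag is illustrative; the mounts mirror docker-compose.yml):

    docker build -f Dockerfile.extproc -t extproc-server .
    docker run --rm -p 50051:50051 \
      -e CONFIG_FILE=/app/config/config.testing.yaml \
      -v "$(pwd)/config:/app/config:ro" \
      -v "$(pwd)/models:/app/models:ro" \
      extproc-server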
84 changes: 84 additions & 0 deletions config/config.testing.yaml
@@ -0,0 +1,84 @@
bert_model:
  model_id: sentence-transformers/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

semantic_cache:
  enabled: true
  backend_type: "memory"
  similarity_threshold: 0.8
  max_entries: 1000
  ttl_seconds: 3600
  eviction_policy: "fifo"

tools:
  enabled: true
  top_k: 3
  similarity_threshold: 0.2
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true

prompt_guard:
  enabled: true
  use_modernbert: true
  model_id: "models/jailbreak_classifier_modernbert-base_model"
  threshold: 0.7
  use_cpu: true
  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

vllm_endpoints:
  - name: "mock"
    address: "mock-vllm"
    port: 8000
    models:
      - "openai/gpt-oss-20b"
    weight: 1
    health_check_path: "/health"

model_config:
  "openai/gpt-oss-20b":
    reasoning_family: "gpt-oss"
    preferred_endpoints: ["mock"]
    pii_policy:
      allow_by_default: true

categories:
  - name: other
    model_scores:
      - model: openai/gpt-oss-20b
        score: 0.7
        use_reasoning: false

default_model: openai/gpt-oss-20b

reasoning_families:
  deepseek:
    type: "chat_template_kwargs"
    parameter: "thinking"

  qwen3:
    type: "chat_template_kwargs"
    parameter: "enable_thinking"

  gpt-oss:
    type: "reasoning_effort"
    parameter: "reasoning_effort"
  gpt:
    type: "reasoning_effort"
    parameter: "reasoning_effort"

default_reasoning_effort: high

api:
  batch_classification:
    max_batch_size: 100
    concurrency_threshold: 5
    max_concurrency: 8
    metrics:
      enabled: true
      detailed_goroutine_tracking: true
      high_resolution_timing: false
      sample_rate: 1.0
      duration_buckets:
        [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
      size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
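
To run the router against this testing config, the CONFIG_FILE variable added in docker-compose.yml can be set when bringing the stack up. A sketch, assuming the file is reachable inside the container at /app/config/config.testing.yaml through the existing config mount:

    CONFIG_FILE=/app/config/config.testing.yaml docker compose --profile testing up --build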
19 changes: 19 additions & 0 deletions docker-compose.yml
@@ -13,6 +13,7 @@ services:
      - ./models:/app/models:ro
    environment:
      - LD_LIBRARY_PATH=/app/lib
      - CONFIG_FILE=${CONFIG_FILE:-/app/config/config.yaml}
    networks:
      - semantic-network
    healthcheck:
@@ -44,6 +45,24 @@ services:
      retries: 5
      start_period: 10s

  # Mock vLLM service for testing profile
  mock-vllm:
    build:
      context: ./tools/mock-vllm
      dockerfile: Dockerfile
    container_name: mock-vllm
    profiles: ["testing"]
    ports:
      - "8000:8000"
    networks:
      - semantic-network
    healthcheck:
      test: ["CMD", "curl", "-fsS", "http://localhost:8000/health"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 5s

networks:
  semantic-network:
    driver: bridge
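
One way to smoke-test the testing profile is to start the mock on its own and hit its healthcheck from the host (assuming the 8000:8000 port mapping above):

    docker compose --profile testing up -d mock-vllm
    docker compose ps mock-vllm             # status should eventually show (healthy)
    curl -fsS http://localhost:8000/health  # -> {"status":"ok"}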
12 changes: 12 additions & 0 deletions scripts/entrypoint.sh
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
set -euo pipefail

CONFIG_FILE_PATH=${CONFIG_FILE:-/app/config/config.yaml}

if [[ ! -f "$CONFIG_FILE_PATH" ]]; then
  echo "[entrypoint] Config file not found at $CONFIG_FILE_PATH" >&2
  exit 1
fi

echo "[entrypoint] Starting semantic-router with config: $CONFIG_FILE_PATH"
exec /app/extproc-server --config "$CONFIG_FILE_PATH"
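
The script can be sanity-checked outside the image; a quick sketch, assuming bash and (optionally) shellcheck are installed locally:

    bash -n scripts/entrypoint.sh       # parse-only syntax check
    shellcheck scripts/entrypoint.sh    # optional lint, if shellcheck is available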
16 changes: 16 additions & 0 deletions tools/mock-vllm/Dockerfile
@@ -0,0 +1,16 @@
FROM python:3.11-slim

WORKDIR /app

RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .

EXPOSE 8000

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
9 changes: 9 additions & 0 deletions tools/mock-vllm/README.md
@@ -0,0 +1,9 @@
# Mock vLLM (OpenAI-compatible) service

A tiny FastAPI server that emulates minimal endpoints used by the router:

- GET /health
- GET /v1/models
- POST /v1/chat/completions

Intended for local testing with Docker Compose profile `testing`.
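
A quick smoke test of the first two endpoints from the host (assuming the compose port mapping 8000:8000):

    curl -fsS http://localhost:8000/health     # -> {"status":"ok"}
    curl -fsS http://localhost:8000/v1/models  # -> lists openai/gpt-oss-20b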
81 changes: 81 additions & 0 deletions tools/mock-vllm/app.py
@@ -0,0 +1,81 @@
import math
import time
from typing import List, Optional

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class ChatMessage(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.2


@app.get("/health")
async def health():
    return {"status": "ok"}


@app.get("/v1/models")
async def models():
    return {"data": [{"id": "openai/gpt-oss-20b", "object": "model"}]}


@app.post("/v1/chat/completions")
async def chat_completions(req: ChatRequest):
    # Very simple echo-like behavior
    last_user = next(
        (m.content for m in reversed(req.messages) if m.role == "user"), ""
    )
    content = f"[mock-{req.model}] You said: {last_user}"

    # Rough token estimation: ~1 token per 4 characters (ceil)
    def estimate_tokens(text: str) -> int:
        if not text:
            return 0
        return max(1, math.ceil(len(text) / 4))

    prompt_text = "\n".join(
        m.content for m in req.messages if isinstance(m.content, str)
    )
    prompt_tokens = estimate_tokens(prompt_text)
    completion_tokens = estimate_tokens(content)
    total_tokens = prompt_tokens + completion_tokens

    created_ts = int(time.time())

    usage = {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": total_tokens,
        # Optional details fields some clients read when using caching/reasoning
        "prompt_tokens_details": {"cached_tokens": 0},
        "completion_tokens_details": {"reasoning_tokens": 0},
    }

    return {
        "id": "cmpl-mock-123",
        "object": "chat.completion",
        "created": created_ts,
        "model": req.model,
        "system_fingerprint": "mock-vllm",
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": content},
                "finish_reason": "stop",
                "logprobs": None,
            }
        ],
        "usage": usage,
        # Some SDKs look for token_usage; keep it as an alias for convenience.
        "token_usage": usage,
    }
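
Given the 4-characters-per-token heuristic, the usage block is deterministic for a given request. A sketch of a request and the values the handler above would produce (port assumed from the compose mapping; output trimmed to the fields of interest):

    curl -s http://localhost:8000/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model": "openai/gpt-oss-20b", "messages": [{"role": "user", "content": "hello"}]}'
    # -> "content": "[mock-openai/gpt-oss-20b] You said: hello"
    # -> "usage": {"prompt_tokens": 2, "completion_tokens": 11, "total_tokens": 13, ...}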
3 changes: 3 additions & 0 deletions tools/mock-vllm/requirements.txt
@@ -0,0 +1,3 @@
fastapi==0.115.0
uvicorn==0.30.6
pydantic==2.9.2
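
The pinned dependencies also make it easy to run the mock without Docker; a sketch, assuming Python 3.11 and a POSIX shell at the repo root:

    python -m venv .venv && . .venv/bin/activate
    pip install -r tools/mock-vllm/requirements.txt
    uvicorn app:app --app-dir tools/mock-vllm --host 0.0.0.0 --port 8000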