
Commit 34bb8fa

JaredforReal and rootfs authored
feat: Implement testing profile with mock vllm in docker compose (#190)
* Fix healthcheck curl missing & implement testing profile
* Fix pre-commit error
* Add usage fields and metadata to chat_completions
* Remove curl install & add mirrors for CN users
* Update docker quick start doc & comment config for CN users
* Clean docker-compose.yml
* Modify docker-quickstart
* Installation for more distributions
* Get rid of optimization for CN network

Signed-off-by: JaredforReal <w13431838023@gmail.com>
Co-authored-by: Huamin Chen <rootfs@users.noreply.github.com>
1 parent d0404fd commit 34bb8fa

File tree

9 files changed: +299 additions, -46 deletions

Dockerfile.extproc

Lines changed: 5 additions & 1 deletion
```diff
@@ -55,4 +55,8 @@ ENV LD_LIBRARY_PATH=/app/lib
 
 EXPOSE 50051
 
-CMD ["/app/extproc-server", "--config", "/app/config/config.yaml"]
+# Copy entrypoint to allow switching config via env var CONFIG_FILE
+COPY scripts/entrypoint.sh /app/entrypoint.sh
+RUN chmod +x /app/entrypoint.sh
+
+ENTRYPOINT ["/app/entrypoint.sh"]
```
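With this change, the config the router loads is chosen at container start instead of being baked into the `CMD`. A minimal sketch of the override, assuming the image is tagged `semantic-router` (the tag is illustrative, not part of this commit):

```bash
# Build the extproc image (tag name is an assumption for illustration)
docker build -f Dockerfile.extproc -t semantic-router .

# Default behavior: entrypoint falls back to /app/config/config.yaml
docker run --rm semantic-router

# Override: point CONFIG_FILE at the testing config added by this commit
docker run --rm -e CONFIG_FILE=/app/config/config.testing.yaml semantic-router
```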

config/config.testing.yaml

Lines changed: 84 additions & 0 deletions
```yaml
bert_model:
  model_id: sentence-transformers/all-MiniLM-L12-v2
  threshold: 0.6
  use_cpu: true

semantic_cache:
  enabled: true
  backend_type: "memory"
  similarity_threshold: 0.8
  max_entries: 1000
  ttl_seconds: 3600
  eviction_policy: "fifo"

tools:
  enabled: true
  top_k: 3
  similarity_threshold: 0.2
  tools_db_path: "config/tools_db.json"
  fallback_to_empty: true

prompt_guard:
  enabled: true
  use_modernbert: true
  model_id: "models/jailbreak_classifier_modernbert-base_model"
  threshold: 0.7
  use_cpu: true
  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

vllm_endpoints:
  - name: "mock"
    address: "mock-vllm"
    port: 8000
    models:
      - "openai/gpt-oss-20b"
    weight: 1
    health_check_path: "/health"

model_config:
  "openai/gpt-oss-20b":
    reasoning_family: "gpt-oss"
    preferred_endpoints: ["mock"]
    pii_policy:
      allow_by_default: true

categories:
  - name: other
    model_scores:
      - model: openai/gpt-oss-20b
        score: 0.7
        use_reasoning: false

default_model: openai/gpt-oss-20b

reasoning_families:
  deepseek:
    type: "chat_template_kwargs"
    parameter: "thinking"

  qwen3:
    type: "chat_template_kwargs"
    parameter: "enable_thinking"

  gpt-oss:
    type: "reasoning_effort"
    parameter: "reasoning_effort"
  gpt:
    type: "reasoning_effort"
    parameter: "reasoning_effort"

default_reasoning_effort: high

api:
  batch_classification:
    max_batch_size: 100
    concurrency_threshold: 5
    max_concurrency: 8
    metrics:
      enabled: true
      detailed_goroutine_tracking: true
      high_resolution_timing: false
      sample_rate: 1.0
      duration_buckets:
        [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
      size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
```
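Since docker-compose.yml (below) passes `CONFIG_FILE` through to the container, this testing config can be selected from the host environment. A hedged sketch of a typical invocation from the repo root:

```bash
# Start the router against the mock backend defined under the "testing" profile
CONFIG_FILE=/app/config/config.testing.yaml \
  docker compose --profile testing up --build
```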

docker-compose.yml

Lines changed: 19 additions & 0 deletions
```diff
@@ -13,6 +13,7 @@ services:
       - ./models:/app/models:ro
     environment:
       - LD_LIBRARY_PATH=/app/lib
+      - CONFIG_FILE=${CONFIG_FILE:-/app/config/config.yaml}
     networks:
       - semantic-network
     healthcheck:
@@ -44,6 +45,24 @@ services:
       retries: 5
       start_period: 10s
 
+  # Mock vLLM service for testing profile
+  mock-vllm:
+    build:
+      context: ./tools/mock-vllm
+      dockerfile: Dockerfile
+    container_name: mock-vllm
+    profiles: ["testing"]
+    ports:
+      - "8000:8000"
+    networks:
+      - semantic-network
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://localhost:8000/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 5s
+
 networks:
   semantic-network:
     driver: bridge
```
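Because `mock-vllm` declares `profiles: ["testing"]`, a plain `docker compose up` skips it; the service only starts when the profile is requested. A quick way to verify it, assuming the default port mapping above:

```bash
# Start only the mock backend from the testing profile
docker compose --profile testing up -d mock-vllm

# Same probe the compose healthcheck runs, issued from the host
curl -fsS http://localhost:8000/health
# -> {"status":"ok"}

docker compose ps mock-vllm  # STATUS should eventually read "healthy"
```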

scripts/entrypoint.sh

Lines changed: 12 additions & 0 deletions
```bash
#!/usr/bin/env bash
set -euo pipefail

CONFIG_FILE_PATH=${CONFIG_FILE:-/app/config/config.yaml}

if [[ ! -f "$CONFIG_FILE_PATH" ]]; then
  echo "[entrypoint] Config file not found at $CONFIG_FILE_PATH" >&2
  exit 1
fi

echo "[entrypoint] Starting semantic-router with config: $CONFIG_FILE_PATH"
exec /app/extproc-server --config "$CONFIG_FILE_PATH"
```
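The final `exec` replaces the shell with the server process so container signals (e.g. SIGTERM from `docker stop`) reach it directly, and the missing-file guard fails fast before that point. The guard can be exercised outside a container, since it exits before `exec` ever runs:

```bash
# /app/extproc-server does not exist on the host, but the guard trips first:
CONFIG_FILE=/does/not/exist.yaml ./scripts/entrypoint.sh
echo $?  # -> 1, after "[entrypoint] Config file not found at /does/not/exist.yaml"
```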

tools/mock-vllm/Dockerfile

Lines changed: 16 additions & 0 deletions
```dockerfile
FROM python:3.11-slim

WORKDIR /app

RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py .

EXPOSE 8000

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
```
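The image can also be built and run standalone, outside Compose; a sketch with an illustrative tag name:

```bash
docker build -t mock-vllm tools/mock-vllm
docker run --rm -p 8000:8000 mock-vllm
```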

tools/mock-vllm/README.md

Lines changed: 9 additions & 0 deletions
```markdown
# Mock vLLM (OpenAI-compatible) service

A tiny FastAPI server that emulates the minimal set of endpoints used by the router:

- GET /health
- GET /v1/models
- POST /v1/chat/completions

Intended for local testing with the Docker Compose profile `testing`.
```
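A quick smoke test of the two GET endpoints once the service is up (default port assumed):

```bash
curl -s http://localhost:8000/health
# {"status":"ok"}

curl -s http://localhost:8000/v1/models
# {"data":[{"id":"openai/gpt-oss-20b","object":"model"}]}
```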

tools/mock-vllm/app.py

Lines changed: 81 additions & 0 deletions
```python
import math
import time
from typing import List, Optional

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class ChatMessage(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.2


@app.get("/health")
async def health():
    return {"status": "ok"}


@app.get("/v1/models")
async def models():
    return {"data": [{"id": "openai/gpt-oss-20b", "object": "model"}]}


@app.post("/v1/chat/completions")
async def chat_completions(req: ChatRequest):
    # Very simple echo-like behavior
    last_user = next(
        (m.content for m in reversed(req.messages) if m.role == "user"), ""
    )
    content = f"[mock-{req.model}] You said: {last_user}"

    # Rough token estimation: ~1 token per 4 characters (ceil)
    def estimate_tokens(text: str) -> int:
        if not text:
            return 0
        return max(1, math.ceil(len(text) / 4))

    prompt_text = "\n".join(
        m.content for m in req.messages if isinstance(m.content, str)
    )
    prompt_tokens = estimate_tokens(prompt_text)
    completion_tokens = estimate_tokens(content)
    total_tokens = prompt_tokens + completion_tokens

    created_ts = int(time.time())

    usage = {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": total_tokens,
        # Optional details fields some clients read when using caching/reasoning
        "prompt_tokens_details": {"cached_tokens": 0},
        "completion_tokens_details": {"reasoning_tokens": 0},
    }

    return {
        "id": "cmpl-mock-123",
        "object": "chat.completion",
        "created": created_ts,
        "model": req.model,
        "system_fingerprint": "mock-vllm",
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": content},
                "finish_reason": "stop",
                "logprobs": None,
            }
        ],
        "usage": usage,
        # Some SDKs look for token_usage; keep it as an alias for convenience.
        "token_usage": usage,
    }
```
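A sample request shows the echo behavior and the usage fields this commit adds (response abbreviated; token counts follow the ~4-characters-per-token estimate above):

```bash
curl -s http://localhost:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "openai/gpt-oss-20b",
       "messages": [{"role": "user", "content": "hello"}]}'
# {"id":"cmpl-mock-123","object":"chat.completion", ...
#  "choices":[{"message":{"role":"assistant",
#              "content":"[mock-openai/gpt-oss-20b] You said: hello"}, ...}],
#  "usage":{"prompt_tokens":2,"completion_tokens":11,"total_tokens":13, ...}}
```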

tools/mock-vllm/requirements.txt

Lines changed: 3 additions & 0 deletions
```text
fastapi==0.115.0
uvicorn==0.30.6
pydantic==2.9.2
```
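For iterating on the mock without Docker, the pinned dependencies can go into a virtualenv and the app can be served directly with uvicorn:

```bash
python -m venv .venv && source .venv/bin/activate
pip install -r tools/mock-vllm/requirements.txt
uvicorn app:app --app-dir tools/mock-vllm --host 0.0.0.0 --port 8000
```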
