Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,51 @@ COLLECTION_NAME=codebase

# Embeddings
EMBEDDING_MODEL=BAAI/bge-base-en-v1.5
# NOTE(review): duplicate EMBEDDING_PROVIDER removed here — it is now set in the
# "Embedding Service Configuration" section below (previous value: fastembed)
# Optional repo tag attached to each payload
REPO_NAME=workspace

# ---------------------------------------------------------------------------
# Embedding Service Configuration (Shared ONNX for scale)
# ---------------------------------------------------------------------------
# EMBEDDING_PROVIDER: local | remote
#   local  = in-process ONNX (default; higher memory per worker)
#   remote = shared embedding service (recommended for scale)
#   NOTE(review): the Helm values and K8s ConfigMap use "fastembed" for local
#   mode — confirm which token(s) the application actually accepts.
EMBEDDING_PROVIDER=local

# When EMBEDDING_PROVIDER=remote, calls this service
EMBEDDING_SERVICE_URL=http://embedding:8100
EMBEDDING_SERVICE_TIMEOUT=60

# Max concurrent ONNX inferences (local mode or in embedding service)
# Prevents memory explosion with parallel workers
EMBED_MAX_CONCURRENT=2

# Max batch size per embed request
EMBED_MAX_BATCH=256

# ---------------------------------------------------------------------------
# ONNX CPU Optimizations (for embedding service)
# ---------------------------------------------------------------------------
# ONNX_THREADS: Number of threads for intra-op parallelism
# 0 = auto (1 per physical core), or set explicit count (e.g., 4-6)
ONNX_THREADS=0

# ONNX_DISABLE_SPINNING: Disable thread spin-wait (saves CPU cycles)
# 0 = spinning enabled (faster, burns CPU), 1 = disabled (power efficient)
ONNX_DISABLE_SPINNING=0

# EMBED_OPTIMAL_BATCH: Internal batch size for chunking large requests
# Sweet spot for CPU is 32-64. Too small = overhead, too large = memory pressure
EMBED_OPTIMAL_BATCH=32

# ---------------------------------------------------------------------------
# Embedding Model Options
# ---------------------------------------------------------------------------
# Model options (changing model requires re-indexing!):
# BAAI/bge-base-en-v1.5 - Default, solid quality (768 dim, 0.21 GB)
# nomic-ai/nomic-embed-text-v1.5 - Faster, outperforms BGE on MTEB (768 dim, 0.13 GB)
# BAAI/bge-large-en-v1.5 - Higher quality, slower (1024 dim, 0.67 GB)
#
# Qwen3-Embedding Feature Flag (optional, experimental)
# Enable to use Qwen3-Embedding-0.6B instead of BGE-base (requires reindex)
# QWEN3_EMBEDDING_ENABLED=0
Expand Down
2 changes: 2 additions & 0 deletions deploy/helm/context-engine/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ data:
QDRANT_URL: {{ include "context-engine.qdrantUrl" . | quote }}
EMBEDDING_MODEL: {{ .Values.config.embeddingModel | quote }}
EMBEDDING_PROVIDER: {{ .Values.config.embeddingProvider | quote }}
EMBEDDING_SERVICE_URL: {{ .Values.config.embeddingServiceUrl | quote }}
EMBEDDING_WARMUP: "0"
INDEX_WORKERS: {{ .Values.config.indexWorkers | default "4" | quote }}

FASTMCP_HOST: {{ .Values.config.fastmcp.host | quote }}
FASTMCP_PORT: {{ .Values.config.fastmcp.port | quote }}
Expand Down
91 changes: 91 additions & 0 deletions deploy/helm/context-engine/templates/embedding-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
{{- if eq .Values.config.embeddingProvider "remote" }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "context-engine.fullname" . }}-embedding
  labels:
    {{- include "context-engine.labels" . | nindent 4 }}
    app.kubernetes.io/component: embedding
spec:
  replicas: {{ .Values.embedding.replicas | default 2 }}
  selector:
    matchLabels:
      {{- include "context-engine.selectorLabels" . | nindent 6 }}
      app.kubernetes.io/component: embedding
  template:
    metadata:
      labels:
        {{- include "context-engine.selectorLabels" . | nindent 8 }}
        app.kubernetes.io/component: embedding
    spec:
      containers:
        - name: embedding
          image: "{{ .Values.embedding.image.repository | default "context-engine-embedding" }}:{{ .Values.embedding.image.tag | default "latest" }}"
          ports:
            - containerPort: 8100
          env:
            - name: EMBEDDING_MODEL
              value: {{ .Values.config.embeddingModel | quote }}
            - name: EMBED_MAX_CONCURRENT
              value: {{ .Values.embedding.maxConcurrent | default "2" | quote }}
            # Added for consistency with deploy/kubernetes/embedding-service.yaml
            # and .env.example, which both cap the per-request batch size.
            - name: EMBED_MAX_BATCH
              value: {{ .Values.embedding.maxBatch | default "256" | quote }}
            - name: EMBED_OPTIMAL_BATCH
              value: {{ .Values.embedding.optimalBatch | default "32" | quote }}
            - name: ONNX_THREADS
              value: {{ .Values.embedding.onnxThreads | default "4" | quote }}
            - name: ONNX_DISABLE_SPINNING
              value: "1"
            # Keep BLAS/OpenMP thread pools aligned with ONNX_THREADS so the
            # container does not oversubscribe its CPU limit.
            - name: OMP_NUM_THREADS
              value: {{ .Values.embedding.onnxThreads | default "4" | quote }}
            - name: MKL_NUM_THREADS
              value: {{ .Values.embedding.onnxThreads | default "4" | quote }}
          resources:
            {{- toYaml .Values.embedding.resources | nindent 12 }}
          readinessProbe:
            httpGet:
              path: /health
              port: 8100
            initialDelaySeconds: 30
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /health
              port: 8100
            initialDelaySeconds: 60
            periodSeconds: 30
---
apiVersion: v1
kind: Service
metadata:
  # NOTE: the plain name "embedding" is what EMBEDDING_SERVICE_URL
  # (http://embedding:8100) resolves to — renaming it would break callers.
  name: embedding
  labels:
    {{- include "context-engine.labels" . | nindent 4 }}
    app.kubernetes.io/component: embedding
spec:
  selector:
    {{- include "context-engine.selectorLabels" . | nindent 4 }}
    app.kubernetes.io/component: embedding
  ports:
    - port: 8100
      targetPort: 8100
{{- if .Values.embedding.autoscaling.enabled }}
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "context-engine.fullname" . }}-embedding
  labels:
    {{- include "context-engine.labels" . | nindent 4 }}
    app.kubernetes.io/component: embedding
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "context-engine.fullname" . }}-embedding
  minReplicas: {{ .Values.embedding.autoscaling.minReplicas | default 2 }}
  maxReplicas: {{ .Values.embedding.autoscaling.maxReplicas | default 10 }}
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: {{ .Values.embedding.autoscaling.targetCPU | default 70 }}
{{- end }}
{{- end }}

43 changes: 39 additions & 4 deletions deploy/helm/context-engine/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,37 @@ qdrant:
initialDelaySeconds: 5
periodSeconds: 5

# -----------------------------------------------------------------------------
# Embedding Service Configuration (shared ONNX model)
# -----------------------------------------------------------------------------
embedding:
# -- Number of replicas
replicas: 2
# -- Image configuration
image:
repository: context-engine-embedding
tag: latest
# -- Max concurrent embeddings per replica
maxConcurrent: 2
# -- Optimal batch size for CPU cache
optimalBatch: 32
# -- ONNX threads per replica
onnxThreads: 4
# -- Resource requests and limits
resources:
requests:
cpu: "2"
memory: 4Gi
limits:
cpu: "4"
memory: 6Gi
# -- Autoscaling configuration
autoscaling:
enabled: true
minReplicas: 2
maxReplicas: 10
targetCPU: 70

# -----------------------------------------------------------------------------
# MCP Indexer HTTP Configuration
# -----------------------------------------------------------------------------
Expand Down Expand Up @@ -463,10 +494,14 @@ config:
# -- Qdrant URL (auto-generated if not set)
qdrantUrl: ""
# -- Embedding model
embeddingModel: BAAI/bge-base-en-v1.5
# -- Embedding provider
embeddingProvider: fastembed

embeddingModel: nomic-ai/nomic-embed-text-v1.5
# -- Embedding provider (remote = shared service, fastembed = local)
embeddingProvider: remote
# -- Embedding service URL (when provider=remote)
embeddingServiceUrl: http://embedding:8100
# -- Index workers (parallel file processing)
indexWorkers: 4

# -- FastMCP settings
fastmcp:
host: "0.0.0.0"
Expand Down
6 changes: 4 additions & 2 deletions deploy/kubernetes/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ data:
CTX_SUMMARY_CHARS: '0'
CURRENT_REPO: ''
DECODER_MAX_TOKENS: '4000'
EMBEDDING_MODEL: BAAI/bge-base-en-v1.5
EMBEDDING_PROVIDER: fastembed
EMBEDDING_MODEL: nomic-ai/nomic-embed-text-v1.5
EMBEDDING_PROVIDER: remote
EMBEDDING_SERVICE_URL: http://embedding:8100
INDEX_WORKERS: "4"
EMBEDDING_WARMUP: '0'
FASTMCP_HOST: 0.0.0.0
FASTMCP_HTTP_HEALTH_PORT: '18002'
Expand Down
106 changes: 106 additions & 0 deletions deploy/kubernetes/embedding-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: embedding-service
  namespace: context-engine
  labels:
    app: embedding-service
spec:
  replicas: 2
  selector:
    matchLabels:
      app: embedding-service
  template:
    metadata:
      labels:
        app: embedding-service
    spec:
      containers:
        - name: embedding
          image: context-engine-embedding:latest
          ports:
            - containerPort: 8100
          env:
            # Model name comes from the shared ConfigMap so indexer and
            # embedding service always agree on the model in use.
            - name: EMBEDDING_MODEL
              valueFrom:
                configMapKeyRef:
                  name: context-engine-config
                  key: EMBEDDING_MODEL
            - name: EMBED_MAX_CONCURRENT
              value: "2"
            - name: EMBED_MAX_BATCH
              value: "256"
            - name: EMBED_OPTIMAL_BATCH
              value: "32"
            - name: ONNX_THREADS
              value: "4"
            - name: ONNX_DISABLE_SPINNING
              value: "1"
            # Keep BLAS/OpenMP pools aligned with ONNX_THREADS to avoid CPU
            # oversubscription inside the 4-CPU limit.
            - name: OMP_NUM_THREADS
              value: "4"
            - name: MKL_NUM_THREADS
              value: "4"
            - name: HF_HOME
              value: /tmp/huggingface
            - name: FASTEMBED_CACHE_PATH
              value: /tmp/fastembed
          resources:
            requests:
              memory: "4Gi"
              cpu: "2"
            limits:
              memory: "6Gi"
              cpu: "4"
          readinessProbe:
            httpGet:
              path: /health
              port: 8100
            initialDelaySeconds: 30
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /health
              port: 8100
            initialDelaySeconds: 60
            periodSeconds: 30
          volumeMounts:
            # FIX: both cache paths must live on the managed volume. The
            # original mounted only /tmp/huggingface, so FASTEMBED_CACHE_PATH
            # (/tmp/fastembed) wrote to the container's writable layer,
            # bypassing the emptyDir sizeLimit.
            - name: embedding-cache
              mountPath: /tmp/huggingface
              subPath: huggingface
            - name: embedding-cache
              mountPath: /tmp/fastembed
              subPath: fastembed
      volumes:
        - name: embedding-cache
          emptyDir:
            sizeLimit: 2Gi
---
apiVersion: v1
kind: Service
metadata:
  # NOTE: the plain name "embedding" is what EMBEDDING_SERVICE_URL
  # (http://embedding:8100) resolves to — renaming it would break callers.
  name: embedding
  namespace: context-engine
  labels:
    app: embedding-service
spec:
  selector:
    app: embedding-service
  ports:
    - port: 8100
      targetPort: 8100
  type: ClusterIP
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: embedding-service-hpa
  namespace: context-engine
  labels:
    app: embedding-service
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: embedding-service
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70

30 changes: 30 additions & 0 deletions deploy/kubernetes/indexer-services.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,21 @@ spec:
configMapKeyRef:
name: context-engine-config
key: EMBEDDING_MODEL
- name: EMBEDDING_PROVIDER
valueFrom:
configMapKeyRef:
name: context-engine-config
key: EMBEDDING_PROVIDER
- name: EMBEDDING_SERVICE_URL
valueFrom:
configMapKeyRef:
name: context-engine-config
key: EMBEDDING_SERVICE_URL
- name: INDEX_WORKERS
valueFrom:
configMapKeyRef:
name: context-engine-config
key: INDEX_WORKERS
- name: HF_HOME
value: /work/models/hf-cache
- name: XDG_CACHE_HOME
Expand Down Expand Up @@ -209,6 +224,21 @@ spec:
configMapKeyRef:
name: context-engine-config
key: EMBEDDING_MODEL
- name: EMBEDDING_PROVIDER
valueFrom:
configMapKeyRef:
name: context-engine-config
key: EMBEDDING_PROVIDER
- name: EMBEDDING_SERVICE_URL
valueFrom:
configMapKeyRef:
name: context-engine-config
key: EMBEDDING_SERVICE_URL
- name: INDEX_WORKERS
valueFrom:
configMapKeyRef:
name: context-engine-config
key: INDEX_WORKERS
- name: HF_HOME
value: /work/models/hf-cache
- name: XDG_CACHE_HOME
Expand Down
Loading