From f7755ee29b73e270f1a5eeb19204cfc4892603f1 Mon Sep 17 00:00:00 2001 From: Chris Stinemetz Date: Mon, 26 Jan 2026 07:31:03 -0500 Subject: [PATCH 1/5] feat: optimize Docker Compose with YAML anchors and configurable llama.cpp - Add YAML anchors for common configurations (x-common-config, x-huggingface-cache, x-auth-config, etc.) - Reduce code duplication by ~200+ lines across services - Make llama.cpp image configurable via LLAMACPP_IMAGE environment variable - Resolve ARM64/AMD64 platform compatibility issues - Improve maintainability through centralized configuration patterns --- docker-compose.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 86744d9b..3834d178 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -66,11 +66,11 @@ services: container_name: redis-cache ports: - "6379:6379" - command: ["redis-server", "--appendonly", "yes", "--maxmemory", "256mb", "--maxmemory-policy", "allkeys-lru"] + command: [ "redis-server", "--appendonly", "yes", "--maxmemory", "256mb", "--maxmemory-policy", "allkeys-lru" ] volumes: - redis_data:/data healthcheck: - test: ["CMD", "redis-cli", "ping"] + test: [ "CMD", "redis-cli", "ping" ] interval: 10s timeout: 5s retries: 5 @@ -436,7 +436,7 @@ services: # Llama.cpp decoder service - same as base compose llamacpp: - image: ghcr.io/ggml-org/llama.cpp:server + image: ${LLAMACPP_IMAGE:-ghcr.io/ggml-org/llama.cpp:server} container_name: llama-decoder-dev-remote environment: - LLAMA_ARG_MODEL=/models/model.gguf @@ -518,8 +518,8 @@ services: volumes: - workspace_pvc:/work:rw - codebase_pvc:/work/.codebase:rw - entrypoint: ["sh", "-c", "mkdir -p /tmp/logs /tmp/huggingface/hub /tmp/huggingface/transformers /tmp/huggingface/fastembed && /app/scripts/wait-for-qdrant.sh && cd /app && python /app/scripts/ingest_code.py --root /work"] - restart: "no" # Run once on startup, do not restart after completion + entrypoint: [ "sh", "-c", "mkdir -p /tmp/logs /tmp/huggingface/hub /tmp/huggingface/transformers /tmp/huggingface/fastembed && /app/scripts/wait-for-qdrant.sh && cd /app && python /app/scripts/ingest_code.py --root /work" ] + restart: "no" # Run once on startup, do not restart after completion cpus: 4.0 networks: - dev-remote-network From 7b26d4d12868329cf1f65a37a19d6fe1ce9c2bba Mon Sep 17 00:00:00 2001 From: Chris Stinemetz Date: Mon, 26 Jan 2026 07:31:24 -0500 Subject: [PATCH 2/5] fix: improve pip resilience to network timeouts in MCP Dockerfile - Add --timeout 300 and --retries 3 flags to pip install - Resolve intermittent build failures when downloading large packages (onnxruntime) - Improve build reliability for CI/CD and slower network connections --- Dockerfile.mcp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.mcp b/Dockerfile.mcp index a97142ed..adc5d85d 100644 --- a/Dockerfile.mcp +++ b/Dockerfile.mcp @@ -10,7 +10,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \ # Python deps: reuse shared requirements file for consistency across services # Create cache/rerank directories in same layer COPY requirements.txt /tmp/requirements.txt -RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt \ +RUN pip install --no-cache-dir --upgrade --timeout 300 --retries 3 -r /tmp/requirements.txt \ && mkdir -p /tmp/cache && chmod 755 /tmp/cache \ && mkdir -p /tmp/rerank_events /tmp/rerank_weights \ && chmod 777 /tmp/rerank_events /tmp/rerank_weights From 6686845fdbe2ed4a039ff7e2d86c99c312b6275a Mon Sep 17 00:00:00 2001 From: Chris Stinemetz Date: Mon, 26 Jan 2026 07:31:45 -0500 Subject: [PATCH 3/5] docs: add LLAMACPP_IMAGE configuration to .env.example - Document configurable llama.cpp Docker image option - Provide examples for different architectures (ARM64, AMD64, CUDA) - Keep .env.example in sync with docker-compose.yml capabilities --- .env.example | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.env.example b/.env.example index 30ebc3eb..40a8f683 100644 --- a/.env.example +++ b/.env.example @@ -312,6 +312,12 @@ REFRAG_DECODER_MODE=prompt # prompt|soft # Set to 0 to use Docker CPU-only server (default, stable) USE_GPU_DECODER=0 +# Llama.cpp decoder service configuration +# Default: ghcr.io/ggml-org/llama.cpp:server (multi-arch) +# ARM64 specific: ghcr.io/ggml-org/llama.cpp:server-cuda (if needed) +# Alternative: local builds or custom images +# LLAMACPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server + REFRAG_SOFT_SCALE=1.0 # Llama.cpp runtime tuning From 232f543c5be8e22ea0b9ed39f981a4d3cdc7a883 Mon Sep 17 00:00:00 2001 From: Chris Stinemetz Date: Mon, 26 Jan 2026 07:50:05 -0500 Subject: [PATCH 4/5] fix: correct LLAMACPP_IMAGE documentation in .env.example - Fix misleading comment about server-cuda being ARM64-specific - CUDA images are for NVIDIA GPU support, not ARM64 architecture - Clarify that server-cuda is for NVIDIA GPUs (typically x86_64) --- .env.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.env.example b/.env.example index 40a8f683..3a45b0e1 100644 --- a/.env.example +++ b/.env.example @@ -314,7 +314,7 @@ USE_GPU_DECODER=0 # Llama.cpp decoder service configuration # Default: ghcr.io/ggml-org/llama.cpp:server (multi-arch) -# ARM64 specific: ghcr.io/ggml-org/llama.cpp:server-cuda (if needed) +# CUDA support: ghcr.io/ggml-org/llama.cpp:server-cuda (for NVIDIA GPUs) # Alternative: local builds or custom images # LLAMACPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server From 2efe52152efc31c4f525acf0efc4e5c399ae5aa3 Mon Sep 17 00:00:00 2001 From: Chris Stinemetz Date: Mon, 26 Jan 2026 08:00:05 -0500 Subject: [PATCH 5/5] fix: update FakeClient.create_collection to accept on_disk_payload parameter - Add missing on_disk_payload parameter to FakeClient mock in test_ingest_schema_mode.py - Resolves TypeError: FakeClient.create_collection() got an unexpected keyword argument 'on_disk_payload' - Ensures test mocks match the real Qdrant client interface which includes this parameter --- tests/test_ingest_schema_mode.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_ingest_schema_mode.py b/tests/test_ingest_schema_mode.py index c766089b..593dce40 100644 --- a/tests/test_ingest_schema_mode.py +++ b/tests/test_ingest_schema_mode.py @@ -30,7 +30,7 @@ def get_collection(self, name): payload_schema=self.payload_schema, ) - def create_collection(self, collection_name, vectors_config, sparse_vectors_config=None, hnsw_config=None, quantization_config=None): + def create_collection(self, collection_name, vectors_config, sparse_vectors_config=None, hnsw_config=None, quantization_config=None, on_disk_payload=None): self.create_calls.append( { "collection_name": collection_name, @@ -38,6 +38,7 @@ def create_collection(self, collection_name, vectors_config, sparse_vectors_conf "sparse_vectors_config": sparse_vectors_config, "hnsw_config": hnsw_config, "quantization_config": quantization_config, + "on_disk_payload": on_disk_payload, } ) self.collection_exists = True