diff --git a/.env.example b/.env.example index 30ebc3eb..3a45b0e1 100644 --- a/.env.example +++ b/.env.example @@ -312,6 +312,12 @@ REFRAG_DECODER_MODE=prompt # prompt|soft # Set to 0 to use Docker CPU-only server (default, stable) USE_GPU_DECODER=0 +# Llama.cpp decoder service configuration +# Default: ghcr.io/ggml-org/llama.cpp:server (multi-arch) +# CUDA support: ghcr.io/ggml-org/llama.cpp:server-cuda (for NVIDIA GPUs) +# Alternative: local builds or custom images +# LLAMACPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server + REFRAG_SOFT_SCALE=1.0 # Llama.cpp runtime tuning diff --git a/Dockerfile.mcp b/Dockerfile.mcp index a97142ed..adc5d85d 100644 --- a/Dockerfile.mcp +++ b/Dockerfile.mcp @@ -10,7 +10,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \ # Python deps: reuse shared requirements file for consistency across services # Create cache/rerank directories in same layer COPY requirements.txt /tmp/requirements.txt -RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt \ +RUN pip install --no-cache-dir --upgrade --timeout 300 --retries 3 -r /tmp/requirements.txt \ && mkdir -p /tmp/cache && chmod 755 /tmp/cache \ && mkdir -p /tmp/rerank_events /tmp/rerank_weights \ && chmod 777 /tmp/rerank_events /tmp/rerank_weights diff --git a/docker-compose.yml b/docker-compose.yml index 86744d9b..3834d178 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -66,11 +66,11 @@ services: container_name: redis-cache ports: - "6379:6379" - command: ["redis-server", "--appendonly", "yes", "--maxmemory", "256mb", "--maxmemory-policy", "allkeys-lru"] + command: [ "redis-server", "--appendonly", "yes", "--maxmemory", "256mb", "--maxmemory-policy", "allkeys-lru" ] volumes: - redis_data:/data healthcheck: - test: ["CMD", "redis-cli", "ping"] + test: [ "CMD", "redis-cli", "ping" ] interval: 10s timeout: 5s retries: 5 @@ -436,7 +436,7 @@ services: # Llama.cpp decoder service - same as base compose llamacpp: - image: ghcr.io/ggml-org/llama.cpp:server + image: ${LLAMACPP_IMAGE:-ghcr.io/ggml-org/llama.cpp:server} container_name: llama-decoder-dev-remote environment: - LLAMA_ARG_MODEL=/models/model.gguf @@ -518,8 +518,8 @@ services: volumes: - workspace_pvc:/work:rw - codebase_pvc:/work/.codebase:rw - entrypoint: ["sh", "-c", "mkdir -p /tmp/logs /tmp/huggingface/hub /tmp/huggingface/transformers /tmp/huggingface/fastembed && /app/scripts/wait-for-qdrant.sh && cd /app && python /app/scripts/ingest_code.py --root /work"] - restart: "no" # Run once on startup, do not restart after completion + entrypoint: [ "sh", "-c", "mkdir -p /tmp/logs /tmp/huggingface/hub /tmp/huggingface/transformers /tmp/huggingface/fastembed && /app/scripts/wait-for-qdrant.sh && cd /app && python /app/scripts/ingest_code.py --root /work" ] + restart: "no" # Run once on startup, do not restart after completion cpus: 4.0 networks: - dev-remote-network diff --git a/tests/test_ingest_schema_mode.py b/tests/test_ingest_schema_mode.py index c766089b..593dce40 100644 --- a/tests/test_ingest_schema_mode.py +++ b/tests/test_ingest_schema_mode.py @@ -30,7 +30,7 @@ def get_collection(self, name): payload_schema=self.payload_schema, ) - def create_collection(self, collection_name, vectors_config, sparse_vectors_config=None, hnsw_config=None, quantization_config=None): + def create_collection(self, collection_name, vectors_config, sparse_vectors_config=None, hnsw_config=None, quantization_config=None, on_disk_payload=None): self.create_calls.append( { "collection_name": collection_name, "vectors_config": vectors_config, "sparse_vectors_config": sparse_vectors_config, "hnsw_config": hnsw_config, "quantization_config": quantization_config, + "on_disk_payload": on_disk_payload, } ) self.collection_exists = True