Merged
6 changes: 5 additions & 1 deletion docs/vscode-extension.md
Original file line number Diff line number Diff line change
@@ -46,6 +46,9 @@ All settings live under `Context Engine Uploader` in the VS Code settings UI or
| `contextEngineUploader.runOnStartup` | Runs the force sync automatically after VS Code starts, then starts watch mode. Leave enabled to mirror the old manual workflow. |
| `contextEngineUploader.pythonPath` | Python executable to use (`python3` by default). |
| `contextEngineUploader.scriptWorkingDirectory` | Optional override for the folder that contains `standalone_upload_client.py`. Leave blank to use the extension’s own copy. |
| `contextEngineUploader.decoderUrl` | Override `DECODER_URL` passed into `scripts/ctx.py` when running Prompt+. Defaults to local llama.cpp (`http://localhost:8081`, auto-appends `/completion`). Use `http://localhost:11434/api/chat` for Ollama. |
| `contextEngineUploader.useGlmDecoder` | Sets `REFRAG_RUNTIME=glm` for Prompt+ so it hits GLM instead of Ollama/llama.cpp. |
| `contextEngineUploader.useGpuDecoder` | Sets `USE_GPU_DECODER=1` so `ctx.py` prefers the GPU llama.cpp sidecar. |
| `contextEngineUploader.targetPath` | Absolute path that should be passed to `--path` (for example `/Users/mikah/Nadi/dumon/dumon-ai-engine-revised`). |
| `contextEngineUploader.endpoint` | Remote endpoint passed to `--endpoint`, defaulting to `http://mcp.speramus.id:8004`. |
| `contextEngineUploader.intervalSeconds` | Poll interval for watch mode. Set to `5` to match the previous command file. |
@@ -57,6 +60,7 @@ All settings live under `Context Engine Uploader` in the VS Code settings UI or
- `Context Engine Uploader: Start` — executes the initial `--force` followed by `--watch` using the configured settings.
- `Context Engine Uploader: Stop` — terminates any running upload client processes.
- `Context Engine Uploader: Restart` — stops current processes and re-runs the startup sequence.
- `Context Engine Uploader: Show Upload Service Logs` — opens a terminal and tails `docker compose logs -f upload_service`.
- `Context Engine Uploader: Prompt+ (Unicorn Mode)` — runs `scripts/ctx.py --unicorn` on your current selection and replaces it with the enhanced prompt (status bar button).

The extension logs all subprocess output to the **Context Engine Upload** output channel so you can confirm uploads without leaving VS Code. The watch process shuts down automatically when VS Code exits or when you run the Stop command.
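The three decoder settings above translate directly into environment variables read by `scripts/ctx.py`. A minimal sketch of that mapping (the setting keys here are illustrative camelCase forms of the IDs in the table, not the extension's actual internals):

```python
def ctx_env(settings: dict) -> dict:
    """Map extension settings to the env vars ctx.py reads (sketch only)."""
    env = {}
    if settings.get("decoderUrl"):
        env["DECODER_URL"] = settings["decoderUrl"]
    if settings.get("useGlmDecoder"):
        env["REFRAG_RUNTIME"] = "glm"
    if settings.get("useGpuDecoder"):
        env["USE_GPU_DECODER"] = "1"
    return env

# A GPU llama.cpp setup exports USE_GPU_DECODER=1 and nothing else:
# ctx_env({"useGpuDecoder": True}) → {"USE_GPU_DECODER": "1"}
```

The extension would merge this dict into the subprocess environment before invoking `scripts/ctx.py --unicorn`.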

219 changes: 167 additions & 52 deletions scripts/ctx.py
@@ -53,6 +53,7 @@
from urllib import request
from urllib.parse import urlparse
from urllib.error import HTTPError, URLError
import socket
from typing import Dict, Any, List, Optional, Tuple
from pathlib import Path

@@ -97,18 +98,48 @@ def _load_env_file():

# Local decoder configuration (llama.cpp server)
def resolve_decoder_url() -> str:
    """Resolve decoder endpoint, honoring overrides and Ollama/GLM options.

    Rules:
    - DECODER_URL wins
    - Otherwise, if OLLAMA_HOST is set, default to its /api/chat endpoint
    - Otherwise, fall back to llama.cpp URL (GPU override if requested)
    - Only append /completion for llama.cpp-style endpoints; leave Ollama/OpenAI paths untouched
    """
    override = os.environ.get("DECODER_URL", "").strip()
    if override:
        base = override
    else:
        ollama_host = os.environ.get("OLLAMA_HOST", "").strip()
        if ollama_host:
            base = ollama_host.rstrip("/")
            if "/api/" not in base:
                base = base + "/api/chat"
        else:
            use_gpu = str(os.environ.get("USE_GPU_DECODER", "0")).strip().lower()
            if use_gpu in {"1", "true", "yes", "on"}:
                host = "host.docker.internal" if os.path.exists("/.dockerenv") else "localhost"
                base = f"http://{host}:8081"
            else:
                base = os.environ.get("LLAMACPP_URL", "http://localhost:8080").strip()

    base = base or "http://localhost:11434/api/chat"
    parsed_base = urlparse(base)
    if parsed_base.hostname == "host.docker.internal" and not os.path.exists("/.dockerenv"):
        try:
            socket.gethostbyname(parsed_base.hostname)
        except socket.gaierror:
            base = base.replace("host.docker.internal", "localhost")
            sys.stderr.write("[DEBUG] decoder host.docker.internal not reachable; falling back to localhost\n")
            sys.stderr.flush()
    lowered = base.lower()
    if (
        "ollama" in lowered
        or "/api/chat" in lowered
        or "/api/generate" in lowered
        or "/v1/chat/completions" in lowered
    ):
        return base
    if base.endswith("/completion"):
        return base
    return base.rstrip("/") + "/completion"
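The branching above is easier to see with the environment inlined. A condensed, side-effect-free sketch (no Docker detection or DNS probing, env passed as a plain dict) behaves like this:

```python
def sketch_resolve(env: dict) -> str:
    """Condensed sketch of the resolution rules above."""
    base = env.get("DECODER_URL", "").strip()
    if not base:
        ollama = env.get("OLLAMA_HOST", "").strip()
        if ollama:
            # Ollama host wins over llama.cpp; default to its chat endpoint
            base = ollama.rstrip("/")
            if "/api/" not in base:
                base += "/api/chat"
        elif env.get("USE_GPU_DECODER", "0").lower() in {"1", "true", "yes", "on"}:
            base = "http://localhost:8081"  # GPU llama.cpp sidecar
        else:
            base = env.get("LLAMACPP_URL", "http://localhost:8080").strip()
    lowered = base.lower()
    # Ollama/OpenAI-style paths are left untouched; llama.cpp gets /completion
    if any(m in lowered for m in ("ollama", "/api/chat", "/api/generate", "/v1/chat/completions")):
        return base
    return base if base.endswith("/completion") else base.rstrip("/") + "/completion"

print(sketch_resolve({}))                                         # http://localhost:8080/completion
print(sketch_resolve({"OLLAMA_HOST": "http://localhost:11434"}))  # http://localhost:11434/api/chat
print(sketch_resolve({"USE_GPU_DECODER": "1"}))                   # http://localhost:8081/completion
```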
@@ -1107,7 +1138,7 @@ def rewrite_prompt(original_prompt: str, context: str, note: str, max_tokens: Op
enhanced = response.choices[0].message.content

else:
# Use local decoder (llama.cpp by default; Ollama supported when DECODER_URL points to /api/chat)
meta_prompt = (
"<|start_of_role|>system<|end_of_role|>" + system_msg + "<|end_of_text|>\n"
"<|start_of_role|>user<|end_of_role|>" + user_msg + "<|end_of_text|>\n"
@@ -1119,62 +1150,146 @@
parsed = urlparse(decoder_url)
if parsed.hostname not in {"localhost", "127.0.0.1", "host.docker.internal"}:
raise ValueError(f"Unsafe decoder host: {parsed.hostname}")
lowered_url = decoder_url.lower()
is_ollama = (
    "ollama" in lowered_url
    or "/api/chat" in lowered_url
    or "/api/generate" in lowered_url
    or "/v1/chat/completions" in lowered_url
)

enhanced = ""
try:
    if is_ollama:
        model = (
            os.environ.get("DECODER_MODEL", "").strip()
            or os.environ.get("OLLAMA_MODEL", "").strip()
            or "llama3"
        )
        payload = {
            "model": model,
            "stream": stream,
            "options": {"temperature": 0.45},
        }
        if max_tokens:
            payload["options"]["num_predict"] = int(max_tokens)
        if "/api/chat" in lowered_url or "/v1/chat/completions" in lowered_url:
            payload["messages"] = [
                {"role": "system", "content": system_msg},
                {"role": "user", "content": user_msg},
            ]
        else:
            payload["prompt"] = f"{system_msg}\n\n{user_msg}"

        req = request.Request(
            decoder_url,
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )

        if stream:
            with request.urlopen(req, timeout=DECODER_TIMEOUT) as resp:
                for line in resp:
                    line_str = line.decode("utf-8", errors="ignore").strip()
                    if not line_str or line_str.startswith(":"):
                        continue
                    if line_str.startswith("data: "):
                        line_str = line_str[6:]
                    try:
                        chunk = json.loads(line_str)
                    except json.JSONDecodeError:
                        continue
                    token = ""
                    if isinstance(chunk, dict):
                        token = (
                            (chunk.get("message") or {}).get("content", "")
                            or chunk.get("response", "")
                        )
                    if token:
                        sys.stdout.write(token)
                        sys.stdout.flush()
                        enhanced += token
                    if chunk.get("done") or chunk.get("stop"):
                        break
            sys.stdout.write("\n")
            sys.stdout.flush()
        else:
            with request.urlopen(req, timeout=DECODER_TIMEOUT) as resp:
                raw = resp.read().decode("utf-8", errors="ignore")
                data = json.loads(raw or "{}")
                if isinstance(data, dict):
                    enhanced = (
                        (data.get("message") or {}).get("content")
                        or data.get("response")
                        or ((data.get("choices") or [{}])[0].get("message") or {}).get("content")
                    )
                else:
                    enhanced = None
    else:
        payload = {
            "prompt": meta_prompt,
            "n_predict": int(max_tokens or DEFAULT_REWRITE_TOKENS),
            "temperature": 0.45,
            "stream": stream,
        }

        req = request.Request(
            decoder_url,
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )

        if stream:
            # Streaming mode: print tokens as they arrive for instant feedback
            with request.urlopen(req, timeout=DECODER_TIMEOUT) as resp:
                for line in resp:
                    line_str = line.decode("utf-8", errors="ignore").strip()
                    if not line_str or line_str.startswith(":"):
                        continue
                    if line_str.startswith("data: "):
                        line_str = line_str[6:]
                    try:
                        chunk = json.loads(line_str)
                        token = chunk.get("content", "")
                        if token:
                            sys.stdout.write(token)
                            sys.stdout.flush()
                            enhanced += token
                        if chunk.get("stop", False):
                            break
                    except json.JSONDecodeError as e:
                        # Warn once per malformed line but keep streaming the final output only
                        sys.stderr.write(f"[WARN] decoder stream JSON decode failed: {str(e)}\n")
                        sys.stderr.flush()
                        continue
            sys.stdout.write("\n")
            sys.stdout.flush()
        else:
            # Non-streaming mode: wait for full response
            with request.urlopen(req, timeout=DECODER_TIMEOUT) as resp:
                raw = resp.read().decode("utf-8", errors="ignore")
                data = json.loads(raw)

                # Extract content from llama.cpp response
                enhanced = (
                    (data.get("content") if isinstance(data, dict) else None)
                    or ((data.get("choices") or [{}])[0].get("content") if isinstance(data, dict) else None)
                    or ((data.get("choices") or [{}])[0].get("text") if isinstance(data, dict) else None)
                    or (data.get("generated_text") if isinstance(data, dict) else None)
                    or (data.get("text") if isinstance(data, dict) else None)
                )
except Exception as e:
    body_detail = ""
    if isinstance(e, HTTPError):
        try:
            body_detail = e.read().decode("utf-8", errors="ignore").strip()
        except Exception:
            body_detail = ""
    msg = f"[ERROR] Decoder call to {decoder_url} failed: {type(e).__name__}: {e}"
    if body_detail:
        msg += f" | body: {body_detail}"
    sys.stderr.write(msg + "\n")
    sys.stderr.flush()
    raise
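The two payload shapes this branch produces can be sketched as a pure function; the `llama3` default model name mirrors the fallback in the diff, and the argument names here are illustrative:

```python
def build_decoder_payload(decoder_url: str, system_msg: str, user_msg: str,
                          meta_prompt: str, max_tokens: int, stream: bool) -> dict:
    """Sketch of payload selection: Ollama/OpenAI chat vs. llama.cpp /completion."""
    lowered = decoder_url.lower()
    if any(m in lowered for m in ("ollama", "/api/chat", "/api/generate", "/v1/chat/completions")):
        payload = {
            "model": "llama3",  # assumed default; real code checks DECODER_MODEL/OLLAMA_MODEL
            "stream": stream,
            "options": {"temperature": 0.45, "num_predict": max_tokens},
        }
        if "/api/chat" in lowered or "/v1/chat/completions" in lowered:
            # Chat-style endpoints take role-tagged messages
            payload["messages"] = [
                {"role": "system", "content": system_msg},
                {"role": "user", "content": user_msg},
            ]
        else:
            # /api/generate takes a single flattened prompt
            payload["prompt"] = f"{system_msg}\n\n{user_msg}"
        return payload
    # llama.cpp /completion takes the pre-templated meta prompt and n_predict
    return {"prompt": meta_prompt, "n_predict": max_tokens, "temperature": 0.45, "stream": stream}
```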

6 changes: 4 additions & 2 deletions scripts/remote_upload_client.py
@@ -582,11 +582,13 @@ def get_server_status(self) -> Dict[str, Any]:
"""Get server status with simplified error handling."""
try:
container_workspace_path = self._translate_to_container_path(self.workspace_path)

connect_timeout = min(self.timeout, 10)
# Allow slower responses (e.g., cold starts/large collections) before bailing
read_timeout = max(self.timeout, 30)
response = self.session.get(
f"{self.upload_endpoint}/api/v1/delta/status",
params={'workspace_path': container_workspace_path},
timeout=(connect_timeout, read_timeout)
)

if response.status_code == 200:
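`requests` accepts a `(connect, read)` tuple for `timeout`, which is what this change relies on: the connect phase stays capped while the read phase gets more headroom. The split can be sketched as:

```python
def status_timeouts(configured: float) -> tuple:
    # Cap the connect phase at 10s, but give the read phase at least 30s
    # so cold starts or large collections don't trip the old single timeout.
    return (min(configured, 10), max(configured, 30))

# Passed straight through, e.g.:
# session.get(url, params=params, timeout=status_timeouts(self.timeout))
```

A configured timeout of 15s becomes `(10, 30)`: connect within 10s, then allow up to 30s for the response body.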
6 changes: 4 additions & 2 deletions scripts/standalone_upload_client.py
@@ -739,11 +739,13 @@ def get_server_status(self) -> Dict[str, Any]:
"""Get server status with simplified error handling."""
try:
container_workspace_path = self._translate_to_container_path(self.workspace_path)

connect_timeout = min(self.timeout, 10)
# Allow slower responses (e.g., cold starts/large collections) before bailing
read_timeout = max(self.timeout, 30)
response = self.session.get(
f"{self.upload_endpoint}/api/v1/delta/status",
params={'workspace_path': container_workspace_path},
timeout=(connect_timeout, read_timeout)
)

if response.status_code == 200:
2 changes: 2 additions & 0 deletions vscode-extension/context-engine-uploader/README.md
@@ -18,11 +18,13 @@ Configuration
- `Target Path` is auto-filled from the workspace but can be overridden if you need to upload a different folder.
- **Python dependencies:** the extension runs the standalone upload client via your configured `pythonPath`. Ensure the interpreter has `requests`, `urllib3`, and `charset_normalizer` installed. Run `python3 -m pip install requests urllib3 charset_normalizer` (or replace `python3` with your configured path) before starting the uploader.
- **Path mapping:** `Host Root` + `Container Root` control how local paths are rewritten before reaching the remote service. By default the host root mirrors your `Target Path` and the container root is `/work`, which keeps Windows paths working without extra config.
- **Prompt+ decoder:** set `Context Engine Uploader: Decoder Url` (default `http://localhost:8081`, auto-appends `/completion`) to point at your local llama.cpp decoder. For Ollama, set it to `http://localhost:11434/api/chat`. Enable `Context Engine Uploader: Use Glm Decoder` to set `REFRAG_RUNTIME=glm` for GLM backends. Turn on `Use Gpu Decoder` to set `USE_GPU_DECODER=1` so ctx.py prefers the GPU llama.cpp sidecar.

Commands
--------
- Command Palette → “Context Engine Uploader” to access Start/Stop/Restart/Index Codebase.
- Status-bar button (`Index Codebase`) mirrors the same behavior and displays progress.
- Status-bar button (`Prompt+`) runs the bundled `scripts/ctx.py --unicorn` on your current selection and replaces it with the enhanced prompt.

Logs
----