From d653390a9fc996355e603038d3da725565b6829e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Chindri=C8=99=20Mihai=20Alexandru?= <12643176+chindris-mihai-alexandru@users.noreply.github.com>
Date: Fri, 28 Nov 2025 16:36:31 +0200
Subject: [PATCH 1/5] feat: add llama.cpp local inference support

Add support for running DeepResearch locally using llama.cpp with Metal
(Apple Silicon) or CUDA acceleration. Zero API costs, full privacy.

New files:
- inference/interactive_llamacpp.py: ReAct agent CLI for llama.cpp
- scripts/start_llama_server.sh: Server startup script with optimized settings
- requirements-local.txt: Minimal dependencies for local inference

Features:
- Free web search via DuckDuckGo (no API key required)
- Optional Jina Reader for better page content extraction
- Loop detection to prevent infinite tool call cycles
- 32K context window for long research sessions
- Exponential backoff retry for rate limits
- URL validation before visiting pages

Works with bartowski's GGUF quantizations (~18GB for Q4_K_M).
---
 .env.example                      |  15 +-
 README.md                         |  49 +++
 inference/interactive_llamacpp.py | 566 ++++++++++++++++++++++++++++++
 requirements-local.txt            |  12 +
 scripts/start_llama_server.sh     | 219 ++++++++++++
 5 files changed, 860 insertions(+), 1 deletion(-)
 create mode 100644 inference/interactive_llamacpp.py
 create mode 100644 requirements-local.txt
 create mode 100755 scripts/start_llama_server.sh

diff --git a/.env.example b/.env.example
index 8558e9c4..8154c65c 100644
--- a/.env.example
+++ b/.env.example
@@ -95,4 +95,17 @@ IDP_KEY_SECRET=your_idp_key_secret
 # These are typically set by distributed training frameworks
 # WORLD_SIZE=1
-# RANK=0
\ No newline at end of file
+# RANK=0
+
+# =============================================================================
+# llama.cpp Local Inference (Alternative for Mac/Local Users)
+# =============================================================================
+# If using the llama.cpp local inference option instead of vLLM:
+
+# The llama.cpp server URL (default works if using start_llama_server.sh)
+LLAMA_SERVER_URL=http://127.0.0.1:8080
+
+# For llama.cpp mode:
+# - Web search uses DuckDuckGo by default (FREE, no API key needed)
+# - JINA_API_KEYS is optional but recommended for better page reading
+# - See: python inference/interactive_llamacpp.py --help
\ No newline at end of file
diff --git a/README.md b/README.md
index 6a147f47..554bc0a8 100644
--- a/README.md
+++ b/README.md
@@ -179,6 +179,55 @@ You need to modify the following in the file [inference/react_agent.py](https:/
 - Change the model name to alibaba/tongyi-deepresearch-30b-a3b.
 - Adjust the content concatenation way as described in the comments on lines **88–90.**
 
+
+---
+
+### 7. Local Inference with llama.cpp (Optional)
+
+> **For Mac users or anyone who wants 100% local inference without vLLM/CUDA dependencies.**
+
+This repo includes support for running DeepResearch locally using [llama.cpp](https://github.com/ggerganov/llama.cpp) with Metal (Apple Silicon) or CUDA acceleration. Zero API costs, full privacy.
+
+#### Requirements
+
+- llama.cpp built with Metal or CUDA support
+- GGUF model: [bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF](https://huggingface.co/bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF)
+- 32GB+ RAM (for Q4_K_M quantization)
+
+#### Quick Start
+
+```bash
+# Install minimal dependencies
+pip install -r requirements-local.txt
+
+# Build llama.cpp (Mac with Metal)
+cd llama.cpp
+cmake -B build -DLLAMA_METAL=ON -DCMAKE_BUILD_TYPE=Release
+cmake --build build --config Release
+cd ..
+
+# Download model (~18GB)
+mkdir -p models/gguf
+curl -L -o models/gguf/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf \
+  'https://huggingface.co/bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF/resolve/main/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf'
+
+# Terminal 1: Start the server
+./scripts/start_llama_server.sh
+
+# Terminal 2: Run research queries
+python inference/interactive_llamacpp.py
+```
+
+The llama.cpp server provides both an API and a web UI at http://localhost:8080.
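+
+As a quick sanity check that the server is up, you can hit the OpenAI-compatible endpoint directly (a minimal sketch; llama.cpp serves whichever model it loaded, so the `model` field is just a placeholder):
+
+```python
+import requests
+
+resp = requests.post(
+    "http://127.0.0.1:8080/v1/chat/completions",
+    json={
+        "model": "deepresearch",  # placeholder; the server uses the single loaded model
+        "messages": [{"role": "user", "content": "Say hello in five words."}],
+        "max_tokens": 64,
+    },
+    timeout=120,
+)
+print(resp.json()["choices"][0]["message"]["content"])
+```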
+
+#### Features
+
+- **Free web search**: Uses DuckDuckGo (no API key required)
+- **Page visiting**: Uses Jina Reader (optional API key for better results)
+- **Loop detection**: Prevents infinite tool call cycles
+- **32K context**: Long research sessions supported
+
+---
 ## Benchmark Evaluation
 We provide benchmark evaluation scripts for various datasets. Please refer to the [evaluation scripts](./evaluation/) directory for more details.
diff --git a/inference/interactive_llamacpp.py b/inference/interactive_llamacpp.py
new file mode 100644
index 00000000..1cf6dffc
--- /dev/null
+++ b/inference/interactive_llamacpp.py
@@ -0,0 +1,566 @@
+#!/usr/bin/env python3
+"""
+DeepResearch Interactive CLI - llama.cpp Server Edition
+=========================================================
+
+A powerful local research assistant that runs on YOUR machine.
+Zero API costs. Full privacy. Complete control.
+
+This script connects to a local llama.cpp server running the
+Tongyi-DeepResearch model and provides a full ReAct agent loop
+with web search and page visiting capabilities.
+
+Architecture:
+    ┌─────────────────┐      HTTP       ┌──────────────────┐
+    │  This Script    │ ────────────>   │   llama.cpp      │
+    │  (Agent Logic)  │                 │   Server         │
+    │  - Tool calls   │ <────────────   │  (Model loaded)  │
+    │  - Web search   │     JSON        │  - Metal GPU     │
+    │  - Page visits  │                 │  - 32K context   │
+    └─────────────────┘                 └──────────────────┘
+
+Usage:
+    # Terminal 1: Start the server (one-time, stays running)
+    ./scripts/start_llama_server.sh
+
+    # Terminal 2: Run research queries
+    python inference/interactive_llamacpp.py
+
+Requirements:
+    pip install requests duckduckgo-search python-dotenv
+
+The server must be running before starting this script.
+"""
+
+import argparse
+import json
+import os
+import re
+import sys
+import time
+from datetime import datetime
+from typing import Dict, List, Optional, Any
+
+import requests
+
+from urllib.parse import urlparse
+
+# Load environment variables
+try:
+    from dotenv import load_dotenv
+    load_dotenv(os.path.join(os.path.dirname(__file__), '..', '.env'))
+except ImportError:
+    pass
+
+# =============================================================================
+# Configuration
+# =============================================================================
+
+LLAMA_SERVER_URL = os.environ.get("LLAMA_SERVER_URL", "http://127.0.0.1:8080")
+JINA_API_KEY = os.environ.get("JINA_API_KEYS", "") or os.environ.get("JINA_API_KEY", "")
+
+MAX_ROUNDS = 30
+MAX_TOKENS = 4096
+TEMPERATURE = 0.7
+TOP_P = 0.95
+REQUEST_TIMEOUT = 300  # 5 minutes for long generations
+
+# Stop sequences for the ReAct loop
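+# The agent loop below injects real tool output itself, so generation is cut
+# off as soon as the model starts to write a <tool_response> block of its own.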
+""" + +import argparse +import json +import os +import re +import sys +import time +from datetime import datetime +from typing import Dict, List, Optional, Any + +import requests + +from urllib.parse import urlparse + +# Load environment variables +try: + from dotenv import load_dotenv + load_dotenv(os.path.join(os.path.dirname(__file__), '..', '.env')) +except ImportError: + pass + +# ============================================================================= +# Configuration +# ============================================================================= + +LLAMA_SERVER_URL = os.environ.get("LLAMA_SERVER_URL", "http://127.0.0.1:8080") +JINA_API_KEY = os.environ.get("JINA_API_KEYS", "") or os.environ.get("JINA_API_KEY", "") + +MAX_ROUNDS = 30 +MAX_TOKENS = 4096 +TEMPERATURE = 0.7 +TOP_P = 0.95 +REQUEST_TIMEOUT = 300 # 5 minutes for long generations + +# Stop sequences for the ReAct loop +STOP_SEQUENCES = [ + "", + "\n", +] + +# ============================================================================= +# System Prompt - Optimized for DeepResearch ReAct Agent +# ============================================================================= + +def get_system_prompt() -> str: + return f"""You are a deep research assistant. Your task is to answer questions by searching the web and synthesizing information from credible sources. + +# CRITICAL RULES + +1. **Think deeply**: Use tags to reason about what you know and what you need to find +2. **Search strategically**: Use multiple targeted searches to gather comprehensive information +3. **Verify information**: Cross-reference facts across multiple sources +4. **Synthesize thoroughly**: Combine information from multiple sources into a coherent answer +5. **NEVER visit the same URL twice**: Each URL can only be visited once +6. **Always conclude**: After gathering sufficient info (typically 5-15 sources), provide your answer in tags +7. **Be efficient**: Aim to answer in 10-20 rounds + +# Response Format + +When you need to search, respond with: +What I need to find and why + +{{"name": "search", "arguments": {{"query": ["your search query"]}}}} + + +When you need to visit a page for details: +Why I need to visit this specific page + +{{"name": "visit", "arguments": {{"url": "https://example.com", "goal": "what specific info you need"}}}} + + +When you have enough information, respond with: +Summary of what I found and my analysis +Your comprehensive, well-researched answer with citations where appropriate + +# Tools + + +{{"type": "function", "function": {{"name": "search", "description": "Web search. Returns titles, URLs, and snippets.", "parameters": {{"type": "object", "properties": {{"query": {{"type": "array", "items": {{"type": "string"}}, "description": "1-3 search queries"}}}}, "required": ["query"]}}}}}} +{{"type": "function", "function": {{"name": "visit", "description": "Visit a URL to get full page content. 
Each URL can only be visited ONCE.", "parameters": {{"type": "object", "properties": {{"url": {{"type": "string", "description": "URL to visit"}}, "goal": {{"type": "string", "description": "What info you need"}}}}, "required": ["url", "goal"]}}}}}} + + +# Important Notes +- The visit tool returns the COMPLETE page content in one response +- After 8-10 successful source visits, you likely have enough information to answer +- Prefer quality over quantity - don't just collect sources, synthesize them + +Current date: {datetime.now().strftime("%Y-%m-%d")}""" + + +# ============================================================================= +# Tools - DuckDuckGo Search (FREE, no API key needed!) +# ============================================================================= + +def duckduckgo_search(queries: list, num_results: int = 10) -> str: + """Search using DuckDuckGo - completely free, no API key needed.""" + try: + from duckduckgo_search import DDGS + from duckduckgo_search.exceptions import RatelimitException + except ImportError: + return "[Search Error] duckduckgo-search not installed. Run: pip install duckduckgo-search" + + results = [] + for query in queries[:3]: + retries = 3 + for attempt in range(retries): + try: + with DDGS() as ddgs: + search_results = list(ddgs.text(query, max_results=num_results)) + + output = [f"\n## Search: '{query}'\n"] + for idx, r in enumerate(search_results, 1): + title = r.get("title", "No title") + url = r.get("href", r.get("link", "")) + snippet = r.get("body", "")[:300] + output.append(f"{idx}. [{title}]({url})") + if snippet: + output.append(f" {snippet}...") + + results.append("\n".join(output)) + break # Success, exit retry loop + except RatelimitException: + if attempt < retries - 1: + time.sleep(2 ** attempt) # Exponential backoff: 1s, 2s, 4s + continue + results.append(f"[Search Error for '{query}']: Rate limited. Try again in a few seconds.") + except Exception as e: + results.append(f"[Search Error for '{query}']: {e}") + break + + return "\n".join(results) if results else "No results found" + + +def is_valid_url(url: str) -> bool: + """Check if URL is valid and uses http/https scheme.""" + try: + result = urlparse(url) + return all([result.scheme in ('http', 'https'), result.netloc]) + except Exception: + return False + + +def visit_page(url: str, goal: str) -> str: + """Fetch webpage content using Jina Reader (free tier) or direct fetch.""" + if isinstance(url, list): + url = url[0] if url else "" + + if not url: + return "[Visit Error] No URL provided" + + if not is_valid_url(url): + return f"[Visit Error] Invalid URL: {url}. Must be a valid http/https URL." 
+ + # Try Jina Reader first (free tier available) + try: + headers = {"Accept": "text/plain"} + if JINA_API_KEY: + headers["Authorization"] = f"Bearer {JINA_API_KEY}" + + jina_url = f"https://r.jina.ai/{url}" + response = requests.get(jina_url, headers=headers, timeout=30, allow_redirects=True) + + if response.status_code == 200 and len(response.text) > 100: + content = response.text[:12000] # Increased limit for more context + return f"**Content from {url}** (goal: {goal}):\n\n{content}" + except Exception: + pass + + # Fallback to direct fetch + try: + headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"} + response = requests.get(url, headers=headers, timeout=15) + response.raise_for_status() + + text = response.text + text = re.sub(r']*>.*?', '', text, flags=re.DOTALL | re.IGNORECASE) + text = re.sub(r']*>.*?', '', text, flags=re.DOTALL | re.IGNORECASE) + text = re.sub(r'<[^>]+>', ' ', text) + text = re.sub(r'\s+', ' ', text).strip() + + if len(text) > 100: + return f"**Content from {url}** (goal: {goal}):\n\n{text[:12000]}" + return f"[Visit Error] Page content too short or blocked: {url}" + except requests.Timeout: + return f"[Visit Error] Timeout fetching {url}" + except Exception as e: + return f"[Visit Error] Could not fetch {url}: {type(e).__name__}: {e}" + + +# ============================================================================= +# llama.cpp Server Client +# ============================================================================= + +class LlamaCppClient: + """Client for the llama.cpp OpenAI-compatible API.""" + + def __init__(self, base_url: str = LLAMA_SERVER_URL): + self.base_url = base_url.rstrip('/') + self.api_url = f"{self.base_url}/v1/chat/completions" + self.session = requests.Session() + + def check_server(self) -> bool: + """Check if the server is running and responsive.""" + try: + response = self.session.get(f"{self.base_url}/health", timeout=5) + return response.status_code == 200 + except Exception: + return False + + def generate(self, messages: List[Dict[str, str]], + max_tokens: int = MAX_TOKENS, + temperature: float = TEMPERATURE, + top_p: float = TOP_P, + stop: Optional[List[str]] = None) -> str: + """Generate a response from the llama.cpp server.""" + + payload = { + "messages": messages, + "max_tokens": max_tokens, + "temperature": temperature, + "top_p": top_p, + "stop": stop or STOP_SEQUENCES, + "stream": False, + } + + try: + response = self.session.post( + self.api_url, + json=payload, + headers={"Content-Type": "application/json"}, + timeout=300, # 5 minute timeout for long generations + ) + + if response.status_code != 200: + error_text = response.text[:500] + return f"[Server Error] Status {response.status_code}: {error_text}" + + data = response.json() + content = data["choices"][0]["message"]["content"] + return content.strip() + + except requests.Timeout: + return "[Error] Request timed out. The model may be processing a complex query." + except requests.ConnectionError: + return "[Error] Cannot connect to llama.cpp server. Is it running?" 
+ except Exception as e: + return f"[Error] API call failed: {e}" + + +# ============================================================================= +# Research Agent +# ============================================================================= + +def research(client: LlamaCppClient, question: str, verbose: bool = True, + max_rounds: int = MAX_ROUNDS, temperature: float = TEMPERATURE) -> dict: + """Run the research agent loop.""" + if verbose: + print(f"\n🔍 Researching: {question}\n") + print("-" * 60) + + system_prompt = get_system_prompt() + messages: List[Dict[str, str]] = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": question} + ] + + sources: List[Dict[str, Any]] = [] + thinking: List[str] = [] + visited_urls: set = set() + consecutive_errors = 0 + max_consecutive_errors = 3 + start_time = time.time() + + for round_num in range(max_rounds): + if verbose: + print(f"\n📝 Round {round_num + 1}/{max_rounds}") + + gen_start = time.time() + content = client.generate(messages, temperature=temperature) + gen_time = time.time() - gen_start + + if content.startswith("[Error]") or content.startswith("[Server Error]"): + if verbose: + print(f" ❌ {content}") + break + + if verbose: + print(f" ⏱️ Generated in {gen_time:.1f}s") + + messages.append({"role": "assistant", "content": content}) + + # Extract and display thinking + if "" in content and "" in content: + think_content = content.split("")[1].split("")[0].strip() + thinking.append(think_content) + if verbose: + preview = think_content[:200] + "..." if len(think_content) > 200 else think_content + print(f" 💭 {preview}") + + # Check for final answer + if "" in content: + if "" in content: + answer = content.split("")[1].split("")[0] + else: + answer = content.split("")[1] + + elapsed = time.time() - start_time + + if verbose: + print("\n" + "=" * 60) + print("✅ ANSWER:") + print("=" * 60) + print(answer.strip()) + print("=" * 60) + print(f"\n📊 Stats: {round_num + 1} rounds, {len(sources)} sources, {elapsed:.1f}s") + + return { + "answer": answer.strip(), + "sources": sources, + "rounds": round_num + 1, + "thinking": thinking, + "elapsed_seconds": elapsed, + } + + # Handle tool calls + if "" in content and "" in content: + try: + tool_json = content.split("")[1].split("")[0] + tool = json.loads(tool_json.strip()) + name = tool.get("name", "") + args = tool.get("arguments", {}) + + if verbose: + print(f" 🔧 Tool: {name}") + + tool_error = False + + if name == "search": + queries = args.get("query", [question]) + if isinstance(queries, str): + queries = [queries] + if verbose: + print(f" Searching: {queries}") + tool_result = duckduckgo_search(queries) + if "[Search Error" in tool_result: + tool_error = True + else: + sources.append({"type": "search", "queries": queries}) + + elif name == "visit": + url = args.get("url", "") + if isinstance(url, list): + url = url[0] if url else "" + goal = args.get("goal", "extract information") + + if url in visited_urls: + if verbose: + print(f" ⚠️ Already visited: {url}") + tool_result = f"[Already Visited] You already visited {url}. Use the information from your previous visit or try a different source." + tool_error = True + else: + visited_urls.add(url) + if verbose: + print(f" Visiting: {url[:60]}...") + tool_result = visit_page(url, goal) + if "[Visit Error]" in tool_result: + tool_error = True + else: + sources.append({"type": "visit", "url": url}) + + else: + tool_result = f"Unknown tool: {name}. 
Available tools: search, visit" + tool_error = True + + # Track consecutive errors for loop detection + if tool_error: + consecutive_errors += 1 + if consecutive_errors >= max_consecutive_errors: + if verbose: + print(f"\n⚠️ {max_consecutive_errors} consecutive tool errors detected.") + messages.append({ + "role": "user", + "content": f"\n{tool_result}\n\n[System Notice] You have encountered {consecutive_errors} consecutive errors. Please provide your best answer now based on information gathered so far, or try a completely different approach.\n" + }) + continue + else: + consecutive_errors = 0 # Reset on success + + # Inject tool response + messages.append({ + "role": "user", + "content": f"\n{tool_result}\n" + }) + + except json.JSONDecodeError as e: + consecutive_errors += 1 + messages.append({ + "role": "user", + "content": f"\nError: Invalid JSON in tool call: {e}\n" + }) + except Exception as e: + consecutive_errors += 1 + messages.append({ + "role": "user", + "content": f"\nTool error: {e}\n" + }) + + # Force final answer after max rounds + if verbose: + print("\n⚠️ Max rounds reached, requesting final answer...") + + messages.append({ + "role": "user", + "content": "You have reached the maximum number of research rounds. Please provide your final answer now based on all the information gathered. Use tags." + }) + + content = client.generate(messages, max_tokens=2048, temperature=temperature) + + if "" in content: + if "" in content: + answer = content.split("")[1].split("")[0] + else: + answer = content.split("")[1] + else: + answer = content + + elapsed = time.time() - start_time + + if verbose: + print("\n" + "=" * 60) + print("✅ ANSWER:") + print("=" * 60) + print(answer.strip()) + print("=" * 60) + print(f"\n📊 Stats: {max_rounds} rounds (max), {len(sources)} sources, {elapsed:.1f}s") + + return { + "answer": answer.strip(), + "sources": sources, + "rounds": max_rounds, + "thinking": thinking, + "elapsed_seconds": elapsed, + } + + +# ============================================================================= +# Main +# ============================================================================= + +def main(): + parser = argparse.ArgumentParser( + description="DeepResearch Interactive CLI - llama.cpp Server Edition", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Start the server first (in another terminal): + ./scripts/start_llama_server.sh + + # Run interactive mode: + python inference/interactive_llamacpp.py + + # Single query mode: + python inference/interactive_llamacpp.py --query "What is quantum entanglement?" 
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="DeepResearch Interactive CLI - llama.cpp Server Edition",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Start the server first (in another terminal):
+    ./scripts/start_llama_server.sh
+
+    # Run interactive mode:
+    python inference/interactive_llamacpp.py
+
+    # Single query mode:
+    python inference/interactive_llamacpp.py --query "What is quantum entanglement?"
+
+    # Connect to a different server:
+    python inference/interactive_llamacpp.py --server http://192.168.1.100:8080
+"""
+    )
+    parser.add_argument("--server", type=str, default=LLAMA_SERVER_URL,
+                        help="llama.cpp server URL (default: http://127.0.0.1:8080)")
+    parser.add_argument("--query", "-q", type=str, default=None,
+                        help="Single query mode - run one research query and exit")
+    parser.add_argument("--max-rounds", type=int, default=MAX_ROUNDS,
+                        help=f"Maximum research rounds (default: {MAX_ROUNDS})")
+    parser.add_argument("--temperature", type=float, default=TEMPERATURE,
+                        help=f"Sampling temperature (default: {TEMPERATURE})")
+    args = parser.parse_args()
+
+    print("\n" + "=" * 60)
+    print("🔬 DeepResearch - Interactive CLI")
+    print("   llama.cpp Server Edition (100% Local)")
+    print("=" * 60)
+    print(f"Server: {args.server}")
+    print(f"Search: DuckDuckGo (free, no API key)")
+    print(f"Reader: Jina.ai {'✓' if JINA_API_KEY else '(free tier)'}")
+    print("=" * 60)
+
+    # Initialize client
+    client = LlamaCppClient(base_url=args.server)
+
+    # Check server connection
+    print("\nConnecting to llama.cpp server...", end=" ")
+    if not client.check_server():
+        print("❌ FAILED")
+        print(f"\nError: Cannot connect to llama.cpp server at {args.server}")
+        print("\nPlease start the server first:")
+        print("  ./scripts/start_llama_server.sh")
+        print("\nOr specify a different server URL:")
+        print("  python inference/interactive_llamacpp.py --server http://your-server:8080")
+        sys.exit(1)
+    print("✅ Connected!")
+
+    # Single query mode
+    if args.query:
+        research(client, args.query, max_rounds=args.max_rounds, temperature=args.temperature)
+        return
+
+    # Interactive mode
+    print("\nType your research question (or 'quit' to exit):\n")
+
+    while True:
+        try:
+            question = input("❓ Question: ").strip()
+
+            if not question:
+                continue
+
+            if question.lower() in ('quit', 'exit', 'q'):
+                print("\n👋 Goodbye!")
+                break
+
+            research(client, question, max_rounds=args.max_rounds, temperature=args.temperature)
+            print("\n" + "-" * 60 + "\n")
+
+        except KeyboardInterrupt:
+            print("\n\n👋 Goodbye!")
+            break
+        except Exception as e:
+            print(f"\n❌ Error: {e}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements-local.txt b/requirements-local.txt
new file mode 100644
index 00000000..7800ee0e
--- /dev/null
+++ b/requirements-local.txt
@@ -0,0 +1,12 @@
+# DeepResearch - Minimal requirements for llama.cpp local inference
+# Install with: pip install -r requirements-local.txt
+
+# Core dependencies
+requests>=2.31.0
+python-dotenv>=1.0.0
+
+# Web search (FREE, no API key needed)
+duckduckgo-search>=6.0.0
+
+# Optional but recommended
+tqdm>=4.66.0
diff --git a/scripts/start_llama_server.sh b/scripts/start_llama_server.sh
new file mode 100755
index 00000000..1ec44511
--- /dev/null
+++ b/scripts/start_llama_server.sh
@@ -0,0 +1,219 @@
+#!/bin/bash
+# =============================================================================
+# DeepResearch Local Server - llama.cpp with Metal Acceleration
+# =============================================================================
+#
+# This script starts the llama.cpp server optimized for the DeepResearch
+# ReAct agent workflow on Apple Silicon.
+#
+# The server provides:
+#   - OpenAI-compatible API at http://localhost:8080/v1/chat/completions
+#   - Built-in Web UI at http://localhost:8080 (chat interface!)
+#   - Metal (GPU) acceleration for fast inference
+#   - Model loaded once and kept resident in memory
+#
+# Usage:
+#   ./scripts/start_llama_server.sh              # Start with defaults
+#   ./scripts/start_llama_server.sh --ctx 16384  # Custom context size
+#   ./scripts/start_llama_server.sh --no-webui   # API only, no web UI
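+#   The defaults below can also be overridden via environment variables, e.g.:
+#     PORT=8081 CTX_SIZE=16384 ./scripts/start_llama_server.sh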
+#
+# Access:
+#   - Web UI: http://localhost:8080
+#   - API:    http://localhost:8080/v1/chat/completions
+#   - CLI:    python inference/interactive_llamacpp.py
+#
+# =============================================================================
+
+set -e
+
+# Configuration
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
+LLAMA_SERVER="$PROJECT_DIR/llama.cpp/build/bin/llama-server"
+MODEL_PATH="$PROJECT_DIR/models/gguf/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf"
+
+# Default settings (optimized for Apple Silicon with 32GB+ RAM)
+PORT=${PORT:-8080}
+HOST=${HOST:-127.0.0.1}
+CTX_SIZE=${CTX_SIZE:-32768}    # 32K context for long research sessions
+GPU_LAYERS=${GPU_LAYERS:-99}   # Offload all layers to Metal
+THREADS=${THREADS:-8}          # CPU threads for non-GPU ops
+PARALLEL=${PARALLEL:-1}        # Parallel request slots
+BATCH_SIZE=${BATCH_SIZE:-512}  # Batch size for prompt processing
+WEBUI=${WEBUI:-true}           # Enable web UI by default
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+echo -e "${BLUE}"
+echo "============================================================"
+echo "  DeepResearch Local Server (llama.cpp + Metal)"
+echo "============================================================"
+echo -e "${NC}"
+
+# Check if llama-server exists
+if [ ! -f "$LLAMA_SERVER" ]; then
+    echo -e "${RED}Error: llama-server not found at $LLAMA_SERVER${NC}"
+    echo ""
+    echo "Please build llama.cpp first:"
+    echo "  cd $PROJECT_DIR/llama.cpp"
+    echo "  cmake -B build -DLLAMA_METAL=ON -DCMAKE_BUILD_TYPE=Release"
+    echo "  cmake --build build --config Release"
+    exit 1
+fi
+
+# Check if model exists
+if [ ! -f "$MODEL_PATH" ]; then
+    echo -e "${RED}Error: Model not found at $MODEL_PATH${NC}"
+    echo ""
+    echo "Please download the model first:"
+    echo "  cd $PROJECT_DIR/models/gguf"
+    echo "  curl -L -C - -o Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf \\"
+    echo "    'https://huggingface.co/bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF/resolve/main/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf'"
+    exit 1
+fi
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --port)
+            PORT="$2"
+            shift 2
+            ;;
+        --ctx)
+            CTX_SIZE="$2"
+            shift 2
+            ;;
+        --threads)
+            THREADS="$2"
+            shift 2
+            ;;
+        --parallel)
+            PARALLEL="$2"
+            shift 2
+            ;;
+        --no-webui)
+            WEBUI=false
+            shift
+            ;;
+        --webui)
+            WEBUI=true
+            shift
+            ;;
+        -h|--help)
+            echo "Usage: $0 [options]"
+            echo ""
+            echo "Options:"
+            echo "  --port N      Port number (default: 8080)"
+            echo "  --ctx N       Context size (default: 32768)"
+            echo "  --threads N   CPU threads (default: 8)"
+            echo "  --parallel N  Parallel requests (default: 1)"
+            echo "  --webui       Enable web UI (default)"
+            echo "  --no-webui    Disable web UI, API only"
+            echo "  -h, --help    Show this help"
+            echo ""
+            echo "Access points:"
+            echo "  Web UI: http://127.0.0.1:PORT"
+            echo "  API:    http://127.0.0.1:PORT/v1/chat/completions"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+done
+
+# Display configuration
+echo -e "${GREEN}Configuration:${NC}"
+echo "  Model:    $(basename "$MODEL_PATH")"
+echo "  Size:     $(du -h "$MODEL_PATH" | cut -f1)"
+echo "  Context:  $CTX_SIZE tokens"
+echo "  GPU:      Metal (all $GPU_LAYERS layers)"
+echo "  Threads:  $THREADS"
+echo "  Parallel: $PARALLEL slots"
+echo "  Web UI:   $WEBUI"
+echo "  Endpoint: http://$HOST:$PORT"
+echo ""
+
+# Check for existing server on port
+if lsof -i :$PORT > /dev/null 2>&1; then
+    echo -e "${YELLOW}Warning: Port $PORT is already in use.${NC}"
+    echo "Existing process:"
+    lsof -i :$PORT | head -2
+    echo ""
+    read -p "Kill existing process and continue? (y/N) " -n 1 -r
+    echo
+    if [[ $REPLY =~ ^[Yy]$ ]]; then
+        lsof -t -i :$PORT | xargs kill -9 2>/dev/null || true
+        sleep 1
+    else
+        echo "Aborting."
+        exit 1
+    fi
+fi
+
+echo -e "${YELLOW}Starting server...${NC}"
+echo "(Model loading takes ~30-60 seconds)"
+echo ""
+
+# Build command arguments
+SERVER_ARGS=(
+    --model "$MODEL_PATH"
+    --host "$HOST"
+    --port "$PORT"
+    --ctx-size "$CTX_SIZE"
+    --n-gpu-layers "$GPU_LAYERS"
+    --threads "$THREADS"
+    --parallel "$PARALLEL"
+    --batch-size "$BATCH_SIZE"
+    --flash-attn auto
+    --mlock
+    --metrics
+    --log-disable
+)
+# Note: --jinja is enabled by default in recent llama.cpp versions
+
+# Add no-webui flag if requested
+if [ "$WEBUI" = "false" ]; then
+    SERVER_ARGS+=(--no-webui)
+fi
+
+# Start the server with optimized settings for DeepResearch
+exec "$LLAMA_SERVER" "${SERVER_ARGS[@]}" 2>&1 | while read -r line; do
+    # Colorize output
+    if [[ $line == *"error"* ]] || [[ $line == *"Error"* ]]; then
+        echo -e "${RED}$line${NC}"
+    elif [[ $line == *"listening"* ]] || [[ $line == *"ready"* ]]; then
+        echo -e "${GREEN}$line${NC}"
+        echo ""
+        echo -e "${GREEN}============================================================${NC}"
+        echo -e "${GREEN}  Server ready!${NC}"
+        echo -e "${GREEN}============================================================${NC}"
+        if [ "$WEBUI" = "true" ]; then
+            echo ""
+            echo -e "${GREEN}  Web UI:  http://$HOST:$PORT${NC}"
+            echo "  Open in your browser for a chat interface!"
+        fi
+        echo ""
+        echo -e "${GREEN}  API:     http://$HOST:$PORT/v1/chat/completions${NC}"
+        echo ""
+        echo "Run DeepResearch CLI:"
+        echo "  python inference/interactive_llamacpp.py"
+        echo ""
+        echo "Test API:"
+        echo "  curl http://$HOST:$PORT/v1/chat/completions \\"
+        echo "    -H 'Content-Type: application/json' \\"
+        echo "    -d '{\"model\": \"deepresearch\", \"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}]}'"
+        echo ""
+        echo "Press Ctrl+C to stop the server."
+    elif [[ $line == *"warning"* ]] || [[ $line == *"Warning"* ]]; then
+        echo -e "${YELLOW}$line${NC}"
+    else
+        echo "$line"
+    fi
+done

From 448000f681d2e4995816edc29d407021259cb2f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Chindri=C8=99=20Mihai=20Alexandru?= <12643176+chindris-mihai-alexandru@users.noreply.github.com>
Date: Fri, 28 Nov 2025 16:52:44 +0200
Subject: [PATCH 2/5] Add multi-provider search support to llama.cpp CLI

- Add Exa, Tavily, Serper, DuckDuckGo providers with automatic fallback
- Remove emojis for cleaner professional output
- Update documentation with search provider information
- Show available search providers on startup
---
 .env.example                      |  16 +-
 inference/interactive_llamacpp.py | 432 +++++++++++++++++++++--------
 inference/tool_search.py          | 434 ++++++++++++++++++++++++------
 3 files changed, 687 insertions(+), 195 deletions(-)

diff --git a/.env.example b/.env.example
index 8154c65c..c53f27cb 100644
--- a/.env.example
+++ b/.env.example
@@ -46,10 +46,24 @@ MAX_WORKERS=30
 # API Keys and External Services
 # =============================================================================
 
-# Serper API for web search and Google Scholar
+# Web Search Providers (in order of quality/preference)
+# The system will try each provider in order until one succeeds.
+# You only need ONE provider configured, but having multiple provides fallback.
+
+# Exa.ai - Best semantic/neural search ($10 free credits)
+# Get your key from: https://exa.ai/
+EXA_API_KEY=your_key
+
+# Tavily - Purpose-built for RAG/LLMs (1,000 free requests/month)
+# Get your key from: https://tavily.com/
+TAVILY_API_KEY=your_key
+
+# Serper API for Google search results (2,500 free queries)
 # Get your key from: https://serper.dev/
 SERPER_KEY_ID=your_key
 
+# DuckDuckGo is always available as final fallback (FREE, no API key needed)
+
 # Jina API for web page reading
 # Get your key from: https://jina.ai/
 JINA_API_KEYS=your_key
diff --git a/inference/interactive_llamacpp.py b/inference/interactive_llamacpp.py
index 1cf6dffc..34471aba 100644
--- a/inference/interactive_llamacpp.py
+++ b/inference/interactive_llamacpp.py
@@ -11,13 +11,19 @@
 with web search and page visiting capabilities.
 
 Architecture:
-    ┌─────────────────┐      HTTP       ┌──────────────────┐
-    │  This Script    │ ────────────>   │   llama.cpp      │
-    │  (Agent Logic)  │                 │   Server         │
-    │  - Tool calls   │ <────────────   │  (Model loaded)  │
-    │  - Web search   │     JSON        │  - Metal GPU     │
-    │  - Page visits  │                 │  - 32K context   │
-    └─────────────────┘                 └──────────────────┘
+    +------------------+      HTTP       +-------------------+
+    |  This Script     | ------------>   |   llama.cpp       |
+    |  (Agent Logic)   |                 |   Server          |
+    |  - Tool calls    | <------------   |  (Model loaded)   |
+    |  - Web search    |     JSON        |  - Metal GPU      |
+    |  - Page visits   |                 |  - 32K context    |
+    +------------------+                 +-------------------+
+
+Search Providers (in order of quality):
+    1. Exa.ai     - Best semantic/neural search
+    2. Tavily     - Purpose-built for RAG/LLMs
+    3. Serper     - Google SERP results
+    4. DuckDuckGo - Free fallback (no API key needed)
 
 Usage:
     # Terminal 1: Start the server (one-time, stays running)
@@ -28,27 +34,25 @@
 Requirements:
     pip install requests duckduckgo-search python-dotenv
 
-
-The server must be running before starting this script.
 """
 
 import argparse
+import http.client
 import json
 import os
 import re
 import sys
 import time
 from datetime import datetime
-from typing import Dict, List, Optional, Any
+from typing import Any, Dict, List, Optional
+from urllib.parse import urlparse
 
 import requests
 
-from urllib.parse import urlparse
-
 # Load environment variables
 try:
     from dotenv import load_dotenv
-    load_dotenv(os.path.join(os.path.dirname(__file__), '..', '.env'))
+    load_dotenv(os.path.join(os.path.dirname(__file__), "..", ".env"))
 except ImportError:
     pass
@@ -59,6 +63,11 @@
 LLAMA_SERVER_URL = os.environ.get("LLAMA_SERVER_URL", "http://127.0.0.1:8080")
 JINA_API_KEY = os.environ.get("JINA_API_KEYS", "") or os.environ.get("JINA_API_KEY", "")
 
+# Search API keys
+EXA_API_KEY = os.environ.get("EXA_API_KEY", "")
+TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "")
+SERPER_KEY = os.environ.get("SERPER_KEY_ID", "")
+
 MAX_ROUNDS = 30
 MAX_TOKENS = 4096
 TEMPERATURE = 0.7
@@ -71,6 +80,7 @@
     "<tool_response>\n",
 ]
 
+
 # =============================================================================
 # System Prompt - Optimized for DeepResearch ReAct Agent
 # =============================================================================
@@ -122,59 +132,227 @@ def get_system_prompt() -> str:
 
 # =============================================================================
-# Tools - DuckDuckGo Search (FREE, no API key needed!)
+# Search Providers
 # =============================================================================
 
-def duckduckgo_search(queries: list, num_results: int = 10) -> str:
-    """Search using DuckDuckGo - completely free, no API key needed."""
+def contains_chinese(text: str) -> bool:
+    """Check if text contains Chinese characters."""
+    return any("\u4E00" <= char <= "\u9FFF" for char in text)
+
+
+def search_exa(query: str, num_results: int = 10) -> Optional[str]:
+    """Exa.ai - Neural/semantic search engine."""
+    if not EXA_API_KEY:
+        return None
+
+    try:
+        response = requests.post(
+            "https://api.exa.ai/search",
+            headers={
+                "x-api-key": EXA_API_KEY,
+                "Content-Type": "application/json",
+            },
+            json={
+                "query": query,
+                "numResults": num_results,
+                "useAutoprompt": True,
+                "type": "neural",
+            },
+            timeout=30,
+        )
+
+        if response.status_code in (401, 429) or response.status_code != 200:
+            return None
+
+        data = response.json()
+        results = data.get("results", [])
+        if not results:
+            return None
+
+        output = [f"\n## Search: '{query}'\n"]
+        for idx, r in enumerate(results, 1):
+            title = r.get("title", "No title")
+            url = r.get("url", "")
+            text = r.get("text", "")[:300] if r.get("text") else ""
+            output.append(f"{idx}. [{title}]({url})")
+            if text:
+                output.append(f"   {text}...")
+
+        return "\n".join(output)
+    except Exception:
+        return None
+
+
+def search_tavily(query: str, num_results: int = 10) -> Optional[str]:
+    """Tavily - Search API for RAG/LLM applications."""
+    if not TAVILY_API_KEY:
+        return None
+
+    try:
+        response = requests.post(
+            "https://api.tavily.com/search",
+            headers={"Content-Type": "application/json"},
+            json={
+                "api_key": TAVILY_API_KEY,
+                "query": query,
+                "max_results": num_results,
+                "search_depth": "advanced",
+            },
+            timeout=30,
+        )
+
+        if response.status_code in (401, 429) or response.status_code != 200:
+            return None
+
+        data = response.json()
+        results = data.get("results", [])
+        if not results:
+            return None
+
+        output = [f"\n## Search: '{query}'\n"]
+        for idx, r in enumerate(results, 1):
+            title = r.get("title", "No title")
+            url = r.get("url", "")
+            content = r.get("content", "")[:300]
+            output.append(f"{idx}. [{title}]({url})")
+            if content:
+                output.append(f"   {content}...")
+
+        return "\n".join(output)
+    except Exception:
+        return None
+
+
+def search_serper(query: str, num_results: int = 10) -> Optional[str]:
+    """Serper - Google Search API."""
+    if not SERPER_KEY:
+        return None
+
+    try:
+        conn = http.client.HTTPSConnection("google.serper.dev")
+
+        if contains_chinese(query):
+            payload = json.dumps({
+                "q": query, "location": "China", "gl": "cn", "hl": "zh-cn", "num": num_results
+            })
+        else:
+            payload = json.dumps({
+                "q": query, "location": "United States", "gl": "us", "hl": "en", "num": num_results
+            })
+
+        headers = {"X-API-KEY": SERPER_KEY, "Content-Type": "application/json"}
+
+        res = None
+        for attempt in range(3):
+            try:
+                conn.request("POST", "/search", payload, headers)
+                res = conn.getresponse()
+                break
+            except Exception:
+                if attempt == 2:
+                    return None
+                time.sleep(1)
+
+        if res is None:
+            return None
+
+        data = json.loads(res.read().decode("utf-8"))
+        if "organic" not in data:
+            return None
+
+        output = [f"\n## Search: '{query}'\n"]
+        for idx, page in enumerate(data["organic"], 1):
+            title = page.get("title", "No title")
+            url = page.get("link", "")
+            snippet = page.get("snippet", "")[:300]
+            output.append(f"{idx}. [{title}]({url})")
+            if snippet:
+                output.append(f"   {snippet}...")
+
+        return "\n".join(output)
+    except Exception:
+        return None
+
+
+def search_duckduckgo(query: str, num_results: int = 10) -> Optional[str]:
+    """DuckDuckGo - Free search, no API key required."""
     try:
         from duckduckgo_search import DDGS
         from duckduckgo_search.exceptions import RatelimitException
     except ImportError:
-        return "[Search Error] duckduckgo-search not installed. Run: pip install duckduckgo-search"
+        return None
+
+    retries = 3
+    for attempt in range(retries):
+        try:
+            with DDGS() as ddgs:
+                results = list(ddgs.text(query, max_results=num_results))
+
+            if not results:
+                return None
+
+            output = [f"\n## Search: '{query}'\n"]
+            for idx, r in enumerate(results, 1):
+                title = r.get("title", "No title")
+                url = r.get("href", r.get("link", ""))
+                body = r.get("body", "")[:300]
+                output.append(f"{idx}. [{title}]({url})")
+                if body:
+                    output.append(f"   {body}...")
+
+            return "\n".join(output)
+        except RatelimitException:
+            if attempt < retries - 1:
+                time.sleep(2 ** attempt)
+                continue
+            return None
+        except Exception:
+            return None
+
+    return None
+
+
+def multi_provider_search(queries: list, num_results: int = 10) -> str:
+    """Search using multiple providers with automatic fallback."""
+    providers = [
+        ("Exa", search_exa),
+        ("Tavily", search_tavily),
+        ("Serper", search_serper),
+        ("DuckDuckGo", search_duckduckgo),
+    ]
+
+    all_results = []
 
-    results = []
     for query in queries[:3]:
-        retries = 3
-        for attempt in range(retries):
-            try:
-                with DDGS() as ddgs:
-                    search_results = list(ddgs.text(query, max_results=num_results))
-
-                output = [f"\n## Search: '{query}'\n"]
-                for idx, r in enumerate(search_results, 1):
-                    title = r.get("title", "No title")
-                    url = r.get("href", r.get("link", ""))
-                    snippet = r.get("body", "")[:300]
-                    output.append(f"{idx}. [{title}]({url})")
-                    if snippet:
-                        output.append(f"   {snippet}...")
-
-                results.append("\n".join(output))
-                break  # Success, exit retry loop
-            except RatelimitException:
-                if attempt < retries - 1:
-                    time.sleep(2 ** attempt)  # Exponential backoff: 1s, then 2s
-                    continue
-                results.append(f"[Search Error for '{query}']: Rate limited. Try again in a few seconds.")
-            except Exception as e:
-                results.append(f"[Search Error for '{query}']: {e}")
+        result = None
+        for name, search_fn in providers:
+            result = search_fn(query, num_results)
+            if result:
                 break
+
+        if result:
+            all_results.append(result)
+        else:
+            all_results.append(f"\n## Search: '{query}'\n[No results found]")
 
-    return "\n".join(results) if results else "No results found"
+    return "\n".join(all_results) if all_results else "No results found"
 
+
+# =============================================================================
+# Page Visitor
+# =============================================================================
+
 def is_valid_url(url: str) -> bool:
     """Check if URL is valid and uses http/https scheme."""
     try:
         result = urlparse(url)
-        return all([result.scheme in ('http', 'https'), result.netloc])
+        return all([result.scheme in ("http", "https"), result.netloc])
     except Exception:
         return False
 
 
 def visit_page(url: str, goal: str) -> str:
-    """Fetch webpage content using Jina Reader (free tier) or direct fetch."""
+    """Fetch webpage content using Jina Reader or direct fetch."""
     if isinstance(url, list):
         url = url[0] if url else ""
 
@@ -184,7 +362,7 @@ def visit_page(url: str, goal: str) -> str:
     if not is_valid_url(url):
         return f"[Visit Error] Invalid URL: {url}. Must be a valid http/https URL."
 
-    # Try Jina Reader first (free tier available)
+    # Try Jina Reader first
     try:
         headers = {"Accept": "text/plain"}
         if JINA_API_KEY:
@@ -194,7 +372,7 @@ def visit_page(url: str, goal: str) -> str:
         response = requests.get(jina_url, headers=headers, timeout=30, allow_redirects=True)
 
         if response.status_code == 200 and len(response.text) > 100:
-            content = response.text[:12000]  # Increased limit for more context
+            content = response.text[:12000]
             return f"**Content from {url}** (goal: {goal}):\n\n{content}"
     except Exception:
         pass
@@ -206,10 +384,10 @@ def visit_page(url: str, goal: str) -> str:
         response.raise_for_status()
 
         text = response.text
-        text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
-        text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
-        text = re.sub(r'<[^>]+>', ' ', text)
-        text = re.sub(r'\s+', ' ', text).strip()
+        text = re.sub(r"<script[^>]*>.*?</script>", "", text, flags=re.DOTALL | re.IGNORECASE)
+        text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
+        text = re.sub(r"<[^>]+>", " ", text)
+        text = re.sub(r"\s+", " ", text).strip()
 
         if len(text) > 100:
             return f"**Content from {url}** (goal: {goal}):\n\n{text[:12000]}"
@@ -228,7 +406,7 @@ class LlamaCppClient:
     """Client for the llama.cpp OpenAI-compatible API."""
 
     def __init__(self, base_url: str = LLAMA_SERVER_URL):
-        self.base_url = base_url.rstrip('/')
+        self.base_url = base_url.rstrip("/")
         self.api_url = f"{self.base_url}/v1/chat/completions"
         self.session = requests.Session()
 
@@ -240,13 +418,15 @@ def check_server(self) -> bool:
         except Exception:
             return False
 
-    def generate(self, messages: List[Dict[str, str]],
-                 max_tokens: int = MAX_TOKENS,
-                 temperature: float = TEMPERATURE,
-                 top_p: float = TOP_P,
-                 stop: Optional[List[str]] = None) -> str:
+    def generate(
+        self,
+        messages: List[Dict[str, str]],
+        max_tokens: int = MAX_TOKENS,
+        temperature: float = TEMPERATURE,
+        top_p: float = TOP_P,
+        stop: Optional[List[str]] = None,
+    ) -> str:
         """Generate a response from the llama.cpp server."""
-
         payload = {
             "messages": messages,
             "max_tokens": max_tokens,
@@ -261,7 +441,7 @@ def generate(self, messages: List[Dict[str, str]],
                 self.api_url,
                 json=payload,
                 headers={"Content-Type": "application/json"},
-                timeout=300,  # 5 minute timeout for long generations
+                timeout=300,
             )
 
             if response.status_code != 200:
@@ -284,17 +464,22 @@ def generate(self, messages: List[Dict[str, str]],
 # Research Agent
 # =============================================================================
 
-def research(client: LlamaCppClient, question: str, verbose: bool = True,
-             max_rounds: int = MAX_ROUNDS, temperature: float = TEMPERATURE) -> dict:
+def research(
+    client: LlamaCppClient,
+    question: str,
+    verbose: bool = True,
+    max_rounds: int = MAX_ROUNDS,
+    temperature: float = TEMPERATURE,
+) -> dict:
     """Run the research agent loop."""
     if verbose:
-        print(f"\n🔍 Researching: {question}\n")
+        print(f"\n[*] Researching: {question}\n")
         print("-" * 60)
 
     system_prompt = get_system_prompt()
     messages: List[Dict[str, str]] = [
         {"role": "system", "content": system_prompt},
-        {"role": "user", "content": question}
+        {"role": "user", "content": question},
     ]
 
     sources: List[Dict[str, Any]] = []
@@ -306,7 +491,7 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
 
     for round_num in range(max_rounds):
         if verbose:
-            print(f"\n📝 Round {round_num + 1}/{max_rounds}")
+            print(f"\n[Round {round_num + 1}/{max_rounds}]")
 
         gen_start = time.time()
         content = client.generate(messages, temperature=temperature)
@@ -314,11 +499,11 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
 
         if content.startswith("[Error]") or content.startswith("[Server Error]"):
             if verbose:
-                print(f"  ❌ {content}")
+                print(f"  Error: {content}")
             break
 
         if verbose:
-            print(f"  ⏱️  Generated in {gen_time:.1f}s")
+            print(f"  Generated in {gen_time:.1f}s")
 
         messages.append({"role": "assistant", "content": content})
 
@@ -328,7 +513,7 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
             thinking.append(think_content)
             if verbose:
                 preview = think_content[:200] + "..." if len(think_content) > 200 else think_content
-                print(f"  💭 {preview}")
+                print(f"  Thinking: {preview}")
 
         # Check for final answer
         if "<answer>" in content:
@@ -341,11 +526,11 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
 
             if verbose:
                 print("\n" + "=" * 60)
-                print("✅ ANSWER:")
+                print("ANSWER:")
                 print("=" * 60)
                 print(answer.strip())
                 print("=" * 60)
-                print(f"\n📊 Stats: {round_num + 1} rounds, {len(sources)} sources, {elapsed:.1f}s")
+                print(f"\nStats: {round_num + 1} rounds, {len(sources)} sources, {elapsed:.1f}s")
 
             return {
                 "answer": answer.strip(),
@@ -364,7 +549,7 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
                 args = tool.get("arguments", {})
 
                 if verbose:
-                    print(f"  🔧 Tool: {name}")
+                    print(f"  Tool: {name}")
 
                 tool_error = False
 
@@ -373,9 +558,9 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
                     if isinstance(queries, str):
                         queries = [queries]
                     if verbose:
-                        print(f"     Searching: {queries}")
-                    tool_result = duckduckgo_search(queries)
-                    if "[Search Error" in tool_result:
+                        print(f"  Queries: {queries}")
+                    tool_result = multi_provider_search(queries)
+                    if "[No results" in tool_result or "error" in tool_result.lower():
                         tool_error = True
                     else:
                         sources.append({"type": "search", "queries": queries})
@@ -388,13 +573,13 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
 
                     if url in visited_urls:
                         if verbose:
-                            print(f"  ⚠️  Already visited: {url}")
+                            print(f"  [!] Already visited: {url}")
                         tool_result = f"[Already Visited] You already visited {url}. Use the information from your previous visit or try a different source."
                         tool_error = True
                     else:
                         visited_urls.add(url)
                         if verbose:
-                            print(f"     Visiting: {url[:60]}...")
+                            print(f"  Visiting: {url[:60]}...")
                         tool_result = visit_page(url, goal)
                         if "[Visit Error]" in tool_result:
                             tool_error = True
@@ -410,41 +595,41 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
                     consecutive_errors += 1
                     if consecutive_errors >= max_consecutive_errors:
                         if verbose:
-                            print(f"\n⚠️  {max_consecutive_errors} consecutive tool errors detected.")
+                            print(f"\n[!] {max_consecutive_errors} consecutive tool errors detected.")
                         messages.append({
                             "role": "user",
-                            "content": f"<tool_response>\n{tool_result}\n\n[System Notice] You have encountered {consecutive_errors} consecutive errors. Please provide your best answer now based on information gathered so far, or try a completely different approach.\n</tool_response>"
+                            "content": f"<tool_response>\n{tool_result}\n\n[System Notice] You have encountered {consecutive_errors} consecutive errors. Please provide your best answer now based on information gathered so far, or try a completely different approach.\n</tool_response>",
                         })
                         continue
                 else:
-                    consecutive_errors = 0  # Reset on success
+                    consecutive_errors = 0
 
                 # Inject tool response
                 messages.append({
-                    "role": "user",
-                    "content": f"<tool_response>\n{tool_result}\n</tool_response>"
+                    "role": "user",
+                    "content": f"<tool_response>\n{tool_result}\n</tool_response>",
                 })
 
             except json.JSONDecodeError as e:
                 consecutive_errors += 1
                 messages.append({
-                    "role": "user",
-                    "content": f"<tool_response>\nError: Invalid JSON in tool call: {e}\n</tool_response>"
+                    "role": "user",
+                    "content": f"<tool_response>\nError: Invalid JSON in tool call: {e}\n</tool_response>",
                 })
             except Exception as e:
                 consecutive_errors += 1
                 messages.append({
-                    "role": "user",
-                    "content": f"<tool_response>\nTool error: {e}\n</tool_response>"
+                    "role": "user",
+                    "content": f"<tool_response>\nTool error: {e}\n</tool_response>",
                 })
 
     # Force final answer after max rounds
     if verbose:
-        print("\n⚠️  Max rounds reached, requesting final answer...")
+        print("\n[!] Max rounds reached, requesting final answer...")
 
     messages.append({
         "role": "user",
-        "content": "You have reached the maximum number of research rounds. Please provide your final answer now based on all the information gathered. Use <answer></answer> tags."
+        "content": "You have reached the maximum number of research rounds. Please provide your final answer now based on all the information gathered. Use <answer></answer> tags.",
    })
 
     content = client.generate(messages, max_tokens=2048, temperature=temperature)
@@ -461,11 +646,11 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
 
     if verbose:
         print("\n" + "=" * 60)
-        print("✅ ANSWER:")
+        print("ANSWER:")
         print("=" * 60)
         print(answer.strip())
         print("=" * 60)
-        print(f"\n📊 Stats: {max_rounds} rounds (max), {len(sources)} sources, {elapsed:.1f}s")
+        print(f"\nStats: {max_rounds} rounds (max), {len(sources)} sources, {elapsed:.1f}s")
 
     return {
         "answer": answer.strip(),
@@ -480,6 +665,19 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
 # Main
 # =============================================================================
 
+def get_available_providers() -> List[str]:
+    """Return list of available search providers."""
+    providers = []
+    if EXA_API_KEY:
+        providers.append("Exa")
+    if TAVILY_API_KEY:
+        providers.append("Tavily")
+    if SERPER_KEY:
+        providers.append("Serper")
+    providers.append("DuckDuckGo")
+    return providers
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="DeepResearch Interactive CLI - llama.cpp Server Edition",
@@ -497,25 +695,43 @@ def main():
 
     # Connect to a different server:
     python inference/interactive_llamacpp.py --server http://192.168.1.100:8080
-"""
+""",
+    )
+    parser.add_argument(
+        "--server",
+        type=str,
+        default=LLAMA_SERVER_URL,
+        help="llama.cpp server URL (default: http://127.0.0.1:8080)",
+    )
+    parser.add_argument(
+        "--query", "-q",
+        type=str,
+        default=None,
+        help="Single query mode - run one research query and exit",
+    )
+    parser.add_argument(
+        "--max-rounds",
+        type=int,
+        default=MAX_ROUNDS,
+        help=f"Maximum research rounds (default: {MAX_ROUNDS})",
+    )
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=TEMPERATURE,
+        help=f"Sampling temperature (default: {TEMPERATURE})",
     )
-    parser.add_argument("--server", type=str, default=LLAMA_SERVER_URL,
-                        help="llama.cpp server URL (default: http://127.0.0.1:8080)")
-    parser.add_argument("--query", "-q", type=str, default=None,
-                        help="Single query mode - run one research query and exit")
-    parser.add_argument("--max-rounds", type=int, default=MAX_ROUNDS,
-                        help=f"Maximum research rounds (default: {MAX_ROUNDS})")
-    parser.add_argument("--temperature", type=float, default=TEMPERATURE,
-                        help=f"Sampling temperature (default: {TEMPERATURE})")
     args = parser.parse_args()
 
+    providers = get_available_providers()
+
     print("\n" + "=" * 60)
-    print("🔬 DeepResearch - Interactive CLI")
-    print("   llama.cpp Server Edition (100% Local)")
+    print("DeepResearch - Interactive CLI")
+    print("llama.cpp Server Edition (100% Local)")
     print("=" * 60)
     print(f"Server: {args.server}")
-    print(f"Search: DuckDuckGo (free, no API key)")
-    print(f"Reader: Jina.ai {'✓' if JINA_API_KEY else '(free tier)'}")
+    print(f"Search: {', '.join(providers)}")
+    print(f"Reader: Jina.ai {'[configured]' if JINA_API_KEY else '[free tier]'}")
     print("=" * 60)
 
     # Initialize client
@@ -524,14 +740,14 @@ def main():
     # Check server connection
     print("\nConnecting to llama.cpp server...", end=" ")
     if not client.check_server():
-        print("❌ FAILED")
+        print("FAILED")
         print(f"\nError: Cannot connect to llama.cpp server at {args.server}")
         print("\nPlease start the server first:")
         print("  ./scripts/start_llama_server.sh")
         print("\nOr specify a different server URL:")
         print("  python inference/interactive_llamacpp.py --server http://your-server:8080")
         sys.exit(1)
-    print("✅ Connected!")
+    print("OK")
 
     # Single query mode
     if args.query:
@@ -543,23 +759,23 @@ def main():
 
     while True:
        try:
-            question = input("❓ Question: ").strip()
+            question = input("Question: ").strip()
 
             if not question:
                 continue
 
-            if question.lower() in ('quit', 'exit', 'q'):
-                print("\n👋 Goodbye!")
+            if question.lower() in ("quit", "exit", "q"):
+                print("\nGoodbye!")
                 break
 
             research(client, question, max_rounds=args.max_rounds, temperature=args.temperature)
             print("\n" + "-" * 60 + "\n")
 
         except KeyboardInterrupt:
-            print("\n\n👋 Goodbye!")
+            print("\n\nGoodbye!")
             break
         except Exception as e:
-            print(f"\n❌ Error: {e}\n")
+            print(f"\nError: {e}\n")
 
 
 if __name__ == "__main__":
diff --git a/inference/tool_search.py b/inference/tool_search.py
index 1a3f7b53..03229595 100644
--- a/inference/tool_search.py
+++ b/inference/tool_search.py
@@ -1,131 +1,393 @@
+"""
+Multi-Provider Web Search Tool
+==============================
+
+Implements a robust search fallback chain optimized for AI research:
+    1. Exa.ai     - Best semantic/neural search, $10 free credits
+    2. Tavily     - Purpose-built for RAG/LLMs, 1,000 free requests/month
+    3. Serper     - Google SERP results, 2,500 free queries
+    4. DuckDuckGo - Free forever, final fallback (no API key needed)
+
+Each provider is tried in order. If one fails (rate limit, error, no key),
+the next provider is attempted automatically.
+
+Environment Variables:
+    EXA_API_KEY    - Exa.ai API key (https://exa.ai/)
+    TAVILY_API_KEY - Tavily API key (https://tavily.com/)
+    SERPER_KEY_ID  - Serper API key (https://serper.dev/)
+
+If no API keys are set, DuckDuckGo is used as the default (free, no key needed).
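+
+Example (a sketch; requires qwen_agent installed and ideally one provider key):
+    from inference.tool_search import multi_provider_search
+    print(multi_provider_search("tongyi deepresearch benchmarks", num_results=5))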
+""" + +import http.client import json -from concurrent.futures import ThreadPoolExecutor -from typing import List, Union +import os +import time +from typing import Dict, List, Optional, Union + import requests from qwen_agent.tools.base import BaseTool, register_tool -import asyncio -from typing import Dict, List, Optional, Union -import uuid -import http.client -import json -import os +# API Keys from environment +EXA_API_KEY = os.environ.get("EXA_API_KEY", "") +TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "") +SERPER_KEY = os.environ.get("SERPER_KEY_ID", "") -SERPER_KEY=os.environ.get('SERPER_KEY_ID') +def contains_chinese(text: str) -> bool: + """Check if text contains Chinese characters.""" + return any("\u4E00" <= char <= "\u9FFF" for char in text) -@register_tool("search", allow_overwrite=True) -class Search(BaseTool): - name = "search" - description = "Performs batched web searches: supply an array 'query'; the tool retrieves the top 10 results for each query in one call." - parameters = { - "type": "object", - "properties": { - "query": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Array of query strings. Include multiple complementary search queries in a single call." + +# ============================================================================= +# Search Providers +# ============================================================================= + +def search_exa(query: str, num_results: int = 10) -> Optional[str]: + """ + Exa.ai - Neural/semantic search engine. + Best for finding conceptually relevant results, not just keyword matches. + """ + if not EXA_API_KEY: + return None + + try: + response = requests.post( + "https://api.exa.ai/search", + headers={ + "x-api-key": EXA_API_KEY, + "Content-Type": "application/json", }, - }, - "required": ["query"], - } + json={ + "query": query, + "numResults": num_results, + "useAutoprompt": True, + "type": "neural", + }, + timeout=30, + ) + + if response.status_code == 401: + print("[Exa] Invalid API key") + return None + if response.status_code == 429: + print("[Exa] Rate limited") + return None + if response.status_code != 200: + print(f"[Exa] Error {response.status_code}: {response.text[:200]}") + return None + + data = response.json() + results = data.get("results", []) + + if not results: + return None + + snippets = [] + for idx, r in enumerate(results, 1): + title = r.get("title", "No title") + url = r.get("url", "") + text = r.get("text", "")[:300] if r.get("text") else "" + published = r.get("publishedDate", "") + + snippet = f"{idx}. [{title}]({url})" + if published: + snippet += f"\nDate published: {published[:10]}" + if text: + snippet += f"\n{text}" + snippets.append(snippet) + + return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) + + except requests.Timeout: + print("[Exa] Request timeout") + return None + except Exception as e: + print(f"[Exa] Error: {e}") + return None - def __init__(self, cfg: Optional[dict] = None): - super().__init__(cfg) - def google_search_with_serp(self, query: str): - def contains_chinese_basic(text: str) -> bool: - return any('\u4E00' <= char <= '\u9FFF' for char in text) + +def search_tavily(query: str, num_results: int = 10) -> Optional[str]: + """ + Tavily - Search API designed specifically for RAG and LLM applications. + Returns AI-optimized snippets and supports advanced filtering. 
+ """ + if not TAVILY_API_KEY: + return None + + try: + response = requests.post( + "https://api.tavily.com/search", + headers={"Content-Type": "application/json"}, + json={ + "api_key": TAVILY_API_KEY, + "query": query, + "max_results": num_results, + "search_depth": "advanced", + "include_answer": False, + "include_raw_content": False, + }, + timeout=30, + ) + + if response.status_code == 401: + print("[Tavily] Invalid API key") + return None + if response.status_code == 429: + print("[Tavily] Rate limited") + return None + if response.status_code != 200: + print(f"[Tavily] Error {response.status_code}: {response.text[:200]}") + return None + + data = response.json() + results = data.get("results", []) + + if not results: + return None + + snippets = [] + for idx, r in enumerate(results, 1): + title = r.get("title", "No title") + url = r.get("url", "") + content = r.get("content", "")[:300] + score = r.get("score", 0) + + snippet = f"{idx}. [{title}]({url})" + if content: + snippet += f"\n{content}" + snippets.append(snippet) + + return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) + + except requests.Timeout: + print("[Tavily] Request timeout") + return None + except Exception as e: + print(f"[Tavily] Error: {e}") + return None + + +def search_serper(query: str, num_results: int = 10) -> Optional[str]: + """ + Serper - Google Search API (SERP results). + Fast and reliable Google search results. + """ + if not SERPER_KEY: + return None + + try: conn = http.client.HTTPSConnection("google.serper.dev") - if contains_chinese_basic(query): + + if contains_chinese(query): payload = json.dumps({ "q": query, "location": "China", "gl": "cn", - "hl": "zh-cn" + "hl": "zh-cn", + "num": num_results, }) - else: payload = json.dumps({ "q": query, "location": "United States", "gl": "us", - "hl": "en" + "hl": "en", + "num": num_results, }) - headers = { - 'X-API-KEY': SERPER_KEY, - 'Content-Type': 'application/json' - } + headers = { + "X-API-KEY": SERPER_KEY, + "Content-Type": "application/json", + } - for i in range(5): + res = None + for attempt in range(3): try: conn.request("POST", "/search", payload, headers) res = conn.getresponse() break except Exception as e: - print(e) - if i == 4: - return f"Google search Timeout, return None, Please try again later." + if attempt == 2: + print(f"[Serper] Connection error: {e}") + return None + time.sleep(1) continue + + if res is None: + return None + + data = json.loads(res.read().decode("utf-8")) + + if "error" in data: + print(f"[Serper] API error: {data['error']}") + return None + + if "organic" not in data: + return None + + snippets = [] + for idx, page in enumerate(data["organic"], 1): + title = page.get("title", "No title") + url = page.get("link", "") + snippet_text = page.get("snippet", "") + date = page.get("date", "") + source = page.get("source", "") + + result = f"{idx}. [{title}]({url})" + if date: + result += f"\nDate published: {date}" + if source: + result += f"\nSource: {source}" + if snippet_text: + result += f"\n{snippet_text}" + + result = result.replace("Your browser can't play this video.", "") + snippets.append(result) + + return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) - data = res.read() - results = json.loads(data.decode("utf-8")) + except Exception as e: + print(f"[Serper] Error: {e}") + return None - try: - if "organic" not in results: - raise Exception(f"No results found for query: '{query}'. 
Use a less specific query.") - web_snippets = list() - idx = 0 - if "organic" in results: - for page in results["organic"]: - idx += 1 - date_published = "" - if "date" in page: - date_published = "\nDate published: " + page["date"] +def search_duckduckgo(query: str, num_results: int = 10) -> Optional[str]: + """ + DuckDuckGo - Free search with no API key required. + Rate limited but reliable as a final fallback. + """ + try: + from duckduckgo_search import DDGS + from duckduckgo_search.exceptions import RatelimitException + except ImportError: + print("[DuckDuckGo] duckduckgo-search package not installed") + return None + + retries = 3 + for attempt in range(retries): + try: + with DDGS() as ddgs: + results = list(ddgs.text(query, max_results=num_results)) + + if not results: + return None + + snippets = [] + for idx, r in enumerate(results, 1): + title = r.get("title", "No title") + url = r.get("href", r.get("link", "")) + body = r.get("body", "")[:300] + + snippet = f"{idx}. [{title}]({url})" + if body: + snippet += f"\n{body}" + snippets.append(snippet) + + return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) + + except RatelimitException: + if attempt < retries - 1: + time.sleep(2 ** attempt) + continue + print("[DuckDuckGo] Rate limited after retries") + return None + except Exception as e: + print(f"[DuckDuckGo] Error: {e}") + return None + + return None - source = "" - if "source" in page: - source = "\nSource: " + page["source"] - snippet = "" - if "snippet" in page: - snippet = "\n" + page["snippet"] +# ============================================================================= +# Multi-Provider Search with Fallback +# ============================================================================= - redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{snippet}" - redacted_version = redacted_version.replace("Your browser can't play this video.", "") - web_snippets.append(redacted_version) +def multi_provider_search(query: str, num_results: int = 10) -> str: + """ + Search using multiple providers with automatic fallback. + + Provider priority (by quality): + 1. Exa.ai - Best semantic search + 2. Tavily - Purpose-built for LLMs + 3. Serper - Google SERP results + 4. DuckDuckGo - Free fallback + + Returns the first successful result or an error message. + """ + providers = [ + ("Exa", search_exa), + ("Tavily", search_tavily), + ("Serper", search_serper), + ("DuckDuckGo", search_duckduckgo), + ] + + errors = [] + + for name, search_fn in providers: + result = search_fn(query, num_results) + if result: + return result + errors.append(name) + + return f"No results found for '{query}'. All providers failed: {', '.join(errors)}. Try a different query." - content = f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n" + "\n\n".join(web_snippets) - return content - except: - return f"No results found for '{query}'. Try with a more general query." 
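+
+# Example (illustrative): the fallback chain can also be driven directly,
+# e.g. from a Python shell at the repo root:
+#
+#     from inference.tool_search import multi_provider_search
+#     print(multi_provider_search("llama.cpp Metal build flags", num_results=5))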
+# ============================================================================= +# Qwen Agent Tool Registration +# ============================================================================= +@register_tool("search", allow_overwrite=True) +class Search(BaseTool): + """Web search tool with multi-provider fallback.""" - def search_with_serp(self, query: str): - result = self.google_search_with_serp(query) - return result + name = "search" + description = "Performs batched web searches: supply an array 'query'; the tool retrieves the top 10 results for each query in one call." + parameters = { + "type": "object", + "properties": { + "query": { + "type": "array", + "items": {"type": "string"}, + "description": "Array of query strings. Include multiple complementary search queries in a single call.", + }, + }, + "required": ["query"], + } + + def __init__(self, cfg: Optional[dict] = None): + super().__init__(cfg) + + # Log which providers are available + available = [] + if EXA_API_KEY: + available.append("Exa") + if TAVILY_API_KEY: + available.append("Tavily") + if SERPER_KEY: + available.append("Serper") + available.append("DuckDuckGo") + + print(f"[Search] Available providers: {', '.join(available)}") def call(self, params: Union[str, dict], **kwargs) -> str: - try: - query = params["query"] - except: + if isinstance(params, str): + return "[Search] Invalid request format: Input must be a JSON object containing 'query' field" + + params_dict: dict = params + query = params_dict.get("query") + if query is None: return "[Search] Invalid request format: Input must be a JSON object containing 'query' field" if isinstance(query, str): - # 单个查询 - response = self.search_with_serp(query) - else: - # 多个查询 - assert isinstance(query, List) - responses = [] - for q in query: - responses.append(self.search_with_serp(q)) - response = "\n=======\n".join(responses) - - return response - + return multi_provider_search(query) + + if not isinstance(query, list): + return "[Search] Invalid query format: 'query' must be a string or array of strings" + + responses = [] + for q in query: + responses.append(multi_provider_search(q)) + + return "\n=======\n".join(responses) From ca5e84af4bfe44f29ca2aaab63c711f4ed41816d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Chindri=C8=99=20Mihai=20Alexandru?= <12643176+chindris-mihai-alexandru@users.noreply.github.com> Date: Fri, 28 Nov 2025 16:59:55 +0200 Subject: [PATCH 3/5] Improve search providers with Tavily support and robust error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Tavily as 2nd provider (Exa → Tavily → Serper → DuckDuckGo) - Use Bearer token auth for Tavily (per API docs) - Add quota error handling (Exa 402, Tavily 432/433) - Add sanitize_query() for input validation - Switch Serper from http.client to requests - Handle ConnectionError, JSONDecodeError, DuckDuckGoSearchException - Use Exa type: auto for better search quality --- inference/interactive_llamacpp.py | 187 ++++++++++---- inference/tool_search.py | 417 ++++++++++++++++++++---------- 2 files changed, 419 insertions(+), 185 deletions(-) diff --git a/inference/interactive_llamacpp.py b/inference/interactive_llamacpp.py index 34471aba..0bae1aed 100644 --- a/inference/interactive_llamacpp.py +++ b/inference/interactive_llamacpp.py @@ -37,7 +37,6 @@ """ import argparse -import http.client import json import os import re @@ -137,14 +136,30 @@ def get_system_prompt() -> str: def contains_chinese(text: str) -> bool: """Check if text contains Chinese 
characters.""" + if not text: + return False return any("\u4E00" <= char <= "\u9FFF" for char in text) +def sanitize_query(query: str) -> str: + """Sanitize and validate a search query.""" + if not query: + return "" + return query.strip()[:500] + + def search_exa(query: str, num_results: int = 10) -> Optional[str]: - """Exa.ai - Neural/semantic search engine.""" + """ + Exa.ai - Neural/semantic search engine. + API Docs: https://docs.exa.ai/reference/search + """ if not EXA_API_KEY: return None + query = sanitize_query(query) + if not query: + return None + try: response = requests.post( "https://api.exa.ai/search", @@ -154,14 +169,22 @@ def search_exa(query: str, num_results: int = 10) -> Optional[str]: }, json={ "query": query, - "numResults": num_results, - "useAutoprompt": True, - "type": "neural", + "numResults": min(num_results, 100), + "type": "auto", # Let Exa choose best search type }, timeout=30, ) - if response.status_code in (401, 429) or response.status_code != 200: + if response.status_code == 401: + print("[Exa] Invalid or expired API key") + return None + if response.status_code == 429: + print("[Exa] Rate limited") + return None + if response.status_code == 402: + print("[Exa] Payment required - credits exhausted") + return None + if response.status_code != 200: return None data = response.json() @@ -171,7 +194,7 @@ def search_exa(query: str, num_results: int = 10) -> Optional[str]: output = [f"\n## Search: '{query}'\n"] for idx, r in enumerate(results, 1): - title = r.get("title", "No title") + title = r.get("title") or "No title" url = r.get("url", "") text = r.get("text", "")[:300] if r.get("text") else "" output.append(f"{idx}. [{title}]({url})") @@ -179,29 +202,60 @@ def search_exa(query: str, num_results: int = 10) -> Optional[str]: output.append(f" {text}...") return "\n".join(output) - except Exception: + except requests.Timeout: + print("[Exa] Request timeout") + return None + except requests.ConnectionError: + print("[Exa] Connection error") + return None + except Exception as e: + print(f"[Exa] Error: {e}") return None def search_tavily(query: str, num_results: int = 10) -> Optional[str]: - """Tavily - Search API for RAG/LLM applications.""" + """ + Tavily - Search API for RAG/LLM applications. 
+ API Docs: https://docs.tavily.com/documentation/api-reference/endpoint/search + """ if not TAVILY_API_KEY: return None + query = sanitize_query(query) + if not query: + return None + try: + # Use Bearer token auth (preferred over api_key in body) response = requests.post( "https://api.tavily.com/search", - headers={"Content-Type": "application/json"}, + headers={ + "Authorization": f"Bearer {TAVILY_API_KEY}", + "Content-Type": "application/json", + }, json={ - "api_key": TAVILY_API_KEY, "query": query, - "max_results": num_results, - "search_depth": "advanced", + "max_results": min(num_results, 20), + "search_depth": "basic", # Use basic (1 credit) vs advanced (2 credits) + "include_answer": False, + "include_raw_content": False, }, timeout=30, ) - if response.status_code in (401, 429) or response.status_code != 200: + if response.status_code == 401: + print("[Tavily] Invalid or expired API key") + return None + if response.status_code == 429: + print("[Tavily] Rate limited") + return None + if response.status_code == 432: + print("[Tavily] Plan limit exceeded") + return None + if response.status_code == 433: + print("[Tavily] Pay-as-you-go limit exceeded") + return None + if response.status_code != 200: return None data = response.json() @@ -211,7 +265,7 @@ def search_tavily(query: str, num_results: int = 10) -> Optional[str]: output = [f"\n## Search: '{query}'\n"] for idx, r in enumerate(results, 1): - title = r.get("title", "No title") + title = r.get("title") or "No title" url = r.get("url", "") content = r.get("content", "")[:300] output.append(f"{idx}. [{title}]({url})") @@ -219,58 +273,81 @@ def search_tavily(query: str, num_results: int = 10) -> Optional[str]: output.append(f" {content}...") return "\n".join(output) - except Exception: + except requests.Timeout: + print("[Tavily] Request timeout") + return None + except requests.ConnectionError: + print("[Tavily] Connection error") + return None + except Exception as e: + print(f"[Tavily] Error: {e}") return None def search_serper(query: str, num_results: int = 10) -> Optional[str]: - """Serper - Google Search API.""" + """ + Serper - Google Search API. 
+ API Docs: https://serper.dev/ + """ if not SERPER_KEY: return None + query = sanitize_query(query) + if not query: + return None + try: - conn = http.client.HTTPSConnection("google.serper.dev") - if contains_chinese(query): - payload = json.dumps({ - "q": query, "location": "China", "gl": "cn", "hl": "zh-cn", "num": num_results - }) + payload = {"q": query, "gl": "cn", "hl": "zh-cn", "num": min(num_results, 100)} else: - payload = json.dumps({ - "q": query, "location": "United States", "gl": "us", "hl": "en", "num": num_results - }) + payload = {"q": query, "gl": "us", "hl": "en", "num": min(num_results, 100)} - headers = {"X-API-KEY": SERPER_KEY, "Content-Type": "application/json"} + response = requests.post( + "https://google.serper.dev/search", + headers={ + "X-API-KEY": SERPER_KEY, + "Content-Type": "application/json", + }, + json=payload, + timeout=30, + ) - res = None - for attempt in range(3): - try: - conn.request("POST", "/search", payload, headers) - res = conn.getresponse() - break - except Exception: - if attempt == 2: - return None - time.sleep(1) + if response.status_code == 401: + print("[Serper] Invalid API key") + return None + if response.status_code == 429: + print("[Serper] Rate limited") + return None + if response.status_code != 200: + return None - if res is None: + data = response.json() + if "error" in data: + print(f"[Serper] API error: {data['error']}") return None - data = json.loads(res.read().decode("utf-8")) - if "organic" not in data: + organic = data.get("organic", []) + if not organic: return None output = [f"\n## Search: '{query}'\n"] - for idx, page in enumerate(data["organic"], 1): - title = page.get("title", "No title") + for idx, page in enumerate(organic, 1): + title = page.get("title") or "No title" url = page.get("link", "") - snippet = page.get("snippet", "")[:300] + snippet = page.get("snippet", "")[:300].replace("Your browser can't play this video.", "").strip() output.append(f"{idx}. [{title}]({url})") if snippet: output.append(f" {snippet}...") return "\n".join(output) - except Exception: + except requests.Timeout: + print("[Serper] Request timeout") + return None + except requests.ConnectionError: + print("[Serper] Connection error") + return None + except Exception as e: + print(f"[Serper] Error: {e}") return None @@ -278,23 +355,28 @@ def search_duckduckgo(query: str, num_results: int = 10) -> Optional[str]: """DuckDuckGo - Free search, no API key required.""" try: from duckduckgo_search import DDGS - from duckduckgo_search.exceptions import RatelimitException + from duckduckgo_search.exceptions import RatelimitException, DuckDuckGoSearchException except ImportError: + print("[DuckDuckGo] duckduckgo-search package not installed") + return None + + query = sanitize_query(query) + if not query: return None retries = 3 for attempt in range(retries): try: with DDGS() as ddgs: - results = list(ddgs.text(query, max_results=num_results)) + results = list(ddgs.text(query, max_results=min(num_results, 25))) if not results: return None output = [f"\n## Search: '{query}'\n"] for idx, r in enumerate(results, 1): - title = r.get("title", "No title") - url = r.get("href", r.get("link", "")) + title = r.get("title") or "No title" + url = r.get("href") or r.get("link", "") body = r.get("body", "")[:300] output.append(f"{idx}. 
[{title}]({url})") if body: @@ -303,10 +385,17 @@ def search_duckduckgo(query: str, num_results: int = 10) -> Optional[str]: return "\n".join(output) except RatelimitException: if attempt < retries - 1: - time.sleep(2 ** attempt) + wait = 2 ** attempt + print(f"[DuckDuckGo] Rate limited, waiting {wait}s...") + time.sleep(wait) continue + print("[DuckDuckGo] Rate limited after all retries") return None - except Exception: + except DuckDuckGoSearchException as e: + print(f"[DuckDuckGo] Search error: {e}") + return None + except Exception as e: + print(f"[DuckDuckGo] Error: {e}") return None return None diff --git a/inference/tool_search.py b/inference/tool_search.py index 03229595..15f21063 100644 --- a/inference/tool_search.py +++ b/inference/tool_search.py @@ -3,10 +3,10 @@ ============================== Implements a robust search fallback chain optimized for AI research: - 1. Exa.ai - Best semantic/neural search, $10 free credits - 2. Tavily - Purpose-built for RAG/LLMs, 1,000 free requests/month - 3. Serper - Google SERP results, 2,500 free queries - 4. DuckDuckGo - Free forever, final fallback (no API key needed) + 1. Exa.ai - Best semantic/neural search ($10 free credits) + 2. Tavily - Purpose-built for RAG/LLMs (1,000 free requests/month) + 3. Serper - Google SERP results (2,500 free queries) + 4. DuckDuckGo - Free forever, final fallback (no API key needed) Each provider is tried in order. If one fails (rate limit, error, no key), the next provider is attempted automatically. @@ -23,23 +23,61 @@ import json import os import time -from typing import Dict, List, Optional, Union +from typing import List, Optional, Union import requests from qwen_agent.tools.base import BaseTool, register_tool # API Keys from environment -EXA_API_KEY = os.environ.get("EXA_API_KEY", "") -TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "") -SERPER_KEY = os.environ.get("SERPER_KEY_ID", "") +EXA_API_KEY = os.environ.get("EXA_API_KEY", "").strip() +TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "").strip() +SERPER_KEY = os.environ.get("SERPER_KEY_ID", "").strip() + +# Request timeouts (seconds) +REQUEST_TIMEOUT = 30 def contains_chinese(text: str) -> bool: """Check if text contains Chinese characters.""" + if not text: + return False return any("\u4E00" <= char <= "\u9FFF" for char in text) +def sanitize_query(query: str) -> str: + """Sanitize and validate a search query.""" + if not query: + return "" + # Strip whitespace and limit length + query = query.strip()[:500] + return query + + +def format_results(query: str, results: List[dict], provider: str) -> str: + """Format search results into a consistent markdown format.""" + if not results: + return "" + + snippets = [] + for idx, r in enumerate(results, 1): + title = r.get("title", "No title") + url = r.get("url", "") + snippet = r.get("snippet", "") + date = r.get("date", "") + + # Build result entry + entry = f"{idx}. [{title}]({url})" + if date: + entry += f"\nDate: {date}" + if snippet: + entry += f"\n{snippet}" + snippets.append(entry) + + header = f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + return header + "\n\n".join(snippets) + + # ============================================================================= # Search Providers # ============================================================================= @@ -48,10 +86,16 @@ def search_exa(query: str, num_results: int = 10) -> Optional[str]: """ Exa.ai - Neural/semantic search engine. 
Best for finding conceptually relevant results, not just keyword matches. + + API Docs: https://docs.exa.ai/reference/search """ if not EXA_API_KEY: return None + query = sanitize_query(query) + if not query: + return None + try: response = requests.post( "https://api.exa.ai/search", @@ -61,50 +105,65 @@ def search_exa(query: str, num_results: int = 10) -> Optional[str]: }, json={ "query": query, - "numResults": num_results, - "useAutoprompt": True, - "type": "neural", + "numResults": min(num_results, 100), # API max is 100 + "type": "auto", # Let Exa choose best search type }, - timeout=30, + timeout=REQUEST_TIMEOUT, ) + # Handle error responses if response.status_code == 401: - print("[Exa] Invalid API key") + print("[Exa] Invalid or expired API key") return None if response.status_code == 429: - print("[Exa] Rate limited") + print("[Exa] Rate limited - too many requests") + return None + if response.status_code == 402: + print("[Exa] Payment required - credits exhausted") return None if response.status_code != 200: - print(f"[Exa] Error {response.status_code}: {response.text[:200]}") + error_msg = response.text[:200] if response.text else "Unknown error" + print(f"[Exa] Error {response.status_code}: {error_msg}") return None data = response.json() - results = data.get("results", []) + api_results = data.get("results", []) - if not results: + if not api_results: return None - snippets = [] - for idx, r in enumerate(results, 1): - title = r.get("title", "No title") + # Normalize results + results = [] + for r in api_results: + title = r.get("title") or "No title" url = r.get("url", "") - text = r.get("text", "")[:300] if r.get("text") else "" + text = r.get("text", "") published = r.get("publishedDate", "") - snippet = f"{idx}. [{title}]({url})" - if published: - snippet += f"\nDate published: {published[:10]}" - if text: - snippet += f"\n{text}" - snippets.append(snippet) + # Truncate text for snippet + snippet = text[:300] + "..." if len(text) > 300 else text + date = published[:10] if published else "" + + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": date, + }) - return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) + return format_results(query, results, "Exa") except requests.Timeout: print("[Exa] Request timeout") return None + except requests.ConnectionError: + print("[Exa] Connection error - network issue") + return None + except json.JSONDecodeError: + print("[Exa] Invalid JSON response") + return None except Exception as e: - print(f"[Exa] Error: {e}") + print(f"[Exa] Unexpected error: {type(e).__name__}: {e}") return None @@ -112,60 +171,89 @@ def search_tavily(query: str, num_results: int = 10) -> Optional[str]: """ Tavily - Search API designed specifically for RAG and LLM applications. Returns AI-optimized snippets and supports advanced filtering. 
+ + API Docs: https://docs.tavily.com/documentation/api-reference/endpoint/search """ if not TAVILY_API_KEY: return None + query = sanitize_query(query) + if not query: + return None + try: + # Tavily supports both Bearer token and api_key in body + # Using Bearer token as it's more standard response = requests.post( "https://api.tavily.com/search", - headers={"Content-Type": "application/json"}, + headers={ + "Authorization": f"Bearer {TAVILY_API_KEY}", + "Content-Type": "application/json", + }, json={ - "api_key": TAVILY_API_KEY, "query": query, - "max_results": num_results, - "search_depth": "advanced", + "max_results": min(num_results, 20), # API max is 20 + "search_depth": "basic", # Use basic (1 credit) vs advanced (2 credits) "include_answer": False, "include_raw_content": False, }, - timeout=30, + timeout=REQUEST_TIMEOUT, ) + # Handle error responses if response.status_code == 401: - print("[Tavily] Invalid API key") + print("[Tavily] Invalid or expired API key") return None if response.status_code == 429: - print("[Tavily] Rate limited") + print("[Tavily] Rate limited - too many requests") + return None + if response.status_code == 432: + print("[Tavily] Plan limit exceeded - upgrade required") + return None + if response.status_code == 433: + print("[Tavily] Pay-as-you-go limit exceeded") return None if response.status_code != 200: - print(f"[Tavily] Error {response.status_code}: {response.text[:200]}") + error_msg = response.text[:200] if response.text else "Unknown error" + print(f"[Tavily] Error {response.status_code}: {error_msg}") return None data = response.json() - results = data.get("results", []) + api_results = data.get("results", []) - if not results: + if not api_results: return None - snippets = [] - for idx, r in enumerate(results, 1): - title = r.get("title", "No title") + # Normalize results + results = [] + for r in api_results: + title = r.get("title") or "No title" url = r.get("url", "") - content = r.get("content", "")[:300] - score = r.get("score", 0) + content = r.get("content", "") - snippet = f"{idx}. [{title}]({url})" - if content: - snippet += f"\n{content}" - snippets.append(snippet) + # Truncate content for snippet + snippet = content[:300] + "..." if len(content) > 300 else content + + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": "", + }) - return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) + return format_results(query, results, "Tavily") except requests.Timeout: print("[Tavily] Request timeout") return None + except requests.ConnectionError: + print("[Tavily] Connection error - network issue") + return None + except json.JSONDecodeError: + print("[Tavily] Invalid JSON response") + return None except Exception as e: - print(f"[Tavily] Error: {e}") + print(f"[Tavily] Unexpected error: {type(e).__name__}: {e}") return None @@ -173,83 +261,98 @@ def search_serper(query: str, num_results: int = 10) -> Optional[str]: """ Serper - Google Search API (SERP results). Fast and reliable Google search results. 
+ + API Docs: https://serper.dev/ """ if not SERPER_KEY: return None + query = sanitize_query(query) + if not query: + return None + try: - conn = http.client.HTTPSConnection("google.serper.dev") - + # Determine locale based on query content if contains_chinese(query): - payload = json.dumps({ + payload = { "q": query, - "location": "China", "gl": "cn", "hl": "zh-cn", - "num": num_results, - }) + "num": min(num_results, 100), + } else: - payload = json.dumps({ + payload = { "q": query, - "location": "United States", "gl": "us", "hl": "en", - "num": num_results, - }) + "num": min(num_results, 100), + } - headers = { - "X-API-KEY": SERPER_KEY, - "Content-Type": "application/json", - } - - res = None - for attempt in range(3): - try: - conn.request("POST", "/search", payload, headers) - res = conn.getresponse() - break - except Exception as e: - if attempt == 2: - print(f"[Serper] Connection error: {e}") - return None - time.sleep(1) - continue + # Use requests instead of http.client for consistency and better error handling + response = requests.post( + "https://google.serper.dev/search", + headers={ + "X-API-KEY": SERPER_KEY, + "Content-Type": "application/json", + }, + json=payload, + timeout=REQUEST_TIMEOUT, + ) - if res is None: + # Handle error responses + if response.status_code == 401: + print("[Serper] Invalid API key") + return None + if response.status_code == 429: + print("[Serper] Rate limited") + return None + if response.status_code != 200: + error_msg = response.text[:200] if response.text else "Unknown error" + print(f"[Serper] Error {response.status_code}: {error_msg}") return None - data = json.loads(res.read().decode("utf-8")) + data = response.json() + # Check for API-level errors if "error" in data: print(f"[Serper] API error: {data['error']}") return None - if "organic" not in data: + organic = data.get("organic", []) + if not organic: return None - snippets = [] - for idx, page in enumerate(data["organic"], 1): - title = page.get("title", "No title") + # Normalize results + results = [] + for page in organic: + title = page.get("title") or "No title" url = page.get("link", "") snippet_text = page.get("snippet", "") date = page.get("date", "") - source = page.get("source", "") - result = f"{idx}. 
[{title}]({url})" - if date: - result += f"\nDate published: {date}" - if source: - result += f"\nSource: {source}" - if snippet_text: - result += f"\n{snippet_text}" + # Clean up snippet + snippet = snippet_text.replace("Your browser can't play this video.", "").strip() - result = result.replace("Your browser can't play this video.", "") - snippets.append(result) + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": date, + }) - return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) + return format_results(query, results, "Serper") + except requests.Timeout: + print("[Serper] Request timeout") + return None + except requests.ConnectionError: + print("[Serper] Connection error - network issue") + return None + except json.JSONDecodeError: + print("[Serper] Invalid JSON response") + return None except Exception as e: - print(f"[Serper] Error: {e}") + print(f"[Serper] Unexpected error: {type(e).__name__}: {e}") return None @@ -260,41 +363,56 @@ def search_duckduckgo(query: str, num_results: int = 10) -> Optional[str]: """ try: from duckduckgo_search import DDGS - from duckduckgo_search.exceptions import RatelimitException + from duckduckgo_search.exceptions import RatelimitException, DuckDuckGoSearchException except ImportError: - print("[DuckDuckGo] duckduckgo-search package not installed") + print("[DuckDuckGo] duckduckgo-search package not installed. Run: pip install duckduckgo-search") return None - retries = 3 - for attempt in range(retries): + query = sanitize_query(query) + if not query: + return None + + max_retries = 3 + for attempt in range(max_retries): try: with DDGS() as ddgs: - results = list(ddgs.text(query, max_results=num_results)) + api_results = list(ddgs.text(query, max_results=min(num_results, 25))) - if not results: + if not api_results: return None - snippets = [] - for idx, r in enumerate(results, 1): - title = r.get("title", "No title") - url = r.get("href", r.get("link", "")) - body = r.get("body", "")[:300] + # Normalize results + results = [] + for r in api_results: + title = r.get("title") or "No title" + url = r.get("href") or r.get("link", "") + body = r.get("body", "") - snippet = f"{idx}. [{title}]({url})" - if body: - snippet += f"\n{body}" - snippets.append(snippet) + # Truncate body for snippet + snippet = body[:300] + "..." if len(body) > 300 else body + + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": "", + }) - return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) + return format_results(query, results, "DuckDuckGo") except RatelimitException: - if attempt < retries - 1: - time.sleep(2 ** attempt) + if attempt < max_retries - 1: + wait_time = 2 ** attempt # Exponential backoff: 1s, 2s, 4s + print(f"[DuckDuckGo] Rate limited, waiting {wait_time}s...") + time.sleep(wait_time) continue - print("[DuckDuckGo] Rate limited after retries") + print("[DuckDuckGo] Rate limited after all retries") + return None + except DuckDuckGoSearchException as e: + print(f"[DuckDuckGo] Search error: {e}") return None except Exception as e: - print(f"[DuckDuckGo] Error: {e}") + print(f"[DuckDuckGo] Unexpected error: {type(e).__name__}: {e}") return None return None @@ -309,13 +427,18 @@ def multi_provider_search(query: str, num_results: int = 10) -> str: Search using multiple providers with automatic fallback. Provider priority (by quality): - 1. Exa.ai - Best semantic search - 2. 
Tavily - Purpose-built for LLMs - 3. Serper - Google SERP results - 4. DuckDuckGo - Free fallback + 1. Exa.ai - Best semantic search + 2. Tavily - Purpose-built for LLMs + 3. Serper - Google SERP results + 4. DuckDuckGo - Free fallback Returns the first successful result or an error message. """ + # Validate query + query = sanitize_query(query) + if not query: + return "[Search] Empty query provided. Please provide a search term." + providers = [ ("Exa", search_exa), ("Tavily", search_tavily), @@ -323,15 +446,16 @@ def multi_provider_search(query: str, num_results: int = 10) -> str: ("DuckDuckGo", search_duckduckgo), ] - errors = [] + failed_providers = [] for name, search_fn in providers: result = search_fn(query, num_results) if result: return result - errors.append(name) + failed_providers.append(name) - return f"No results found for '{query}'. All providers failed: {', '.join(errors)}. Try a different query." + # All providers failed + return f"No results found for '{query}'. Providers attempted: {', '.join(failed_providers)}. Try a different or simpler query." # ============================================================================= @@ -359,7 +483,7 @@ class Search(BaseTool): def __init__(self, cfg: Optional[dict] = None): super().__init__(cfg) - # Log which providers are available + # Log which providers are available at initialization available = [] if EXA_API_KEY: available.append("Exa") @@ -367,27 +491,48 @@ def __init__(self, cfg: Optional[dict] = None): available.append("Tavily") if SERPER_KEY: available.append("Serper") - available.append("DuckDuckGo") + available.append("DuckDuckGo") # Always available - print(f"[Search] Available providers: {', '.join(available)}") + print(f"[Search] Initialized with providers: {', '.join(available)}") def call(self, params: Union[str, dict], **kwargs) -> str: + # Handle string input (invalid) if isinstance(params, str): - return "[Search] Invalid request format: Input must be a JSON object containing 'query' field" + return "[Search] Invalid request: Input must be a JSON object with 'query' field, not a string." + + # Handle None or non-dict + if not isinstance(params, dict): + return "[Search] Invalid request: Input must be a JSON object with 'query' field." - params_dict: dict = params - query = params_dict.get("query") + query = params.get("query") + + # Handle missing query if query is None: - return "[Search] Invalid request format: Input must be a JSON object containing 'query' field" + return "[Search] Missing 'query' field in request." + # Handle single string query if isinstance(query, str): + query = query.strip() + if not query: + return "[Search] Empty query string provided." return multi_provider_search(query) - if not isinstance(query, list): - return "[Search] Invalid query format: 'query' must be a string or array of strings" - - responses = [] - for q in query: - responses.append(multi_provider_search(q)) + # Handle list of queries + if isinstance(query, list): + if not query: + return "[Search] Empty query list provided." + + # Filter out empty strings + valid_queries = [q.strip() for q in query if isinstance(q, str) and q.strip()] + + if not valid_queries: + return "[Search] No valid queries in list (all empty or non-string)." + + responses = [] + for q in valid_queries: + responses.append(multi_provider_search(q)) + + return "\n=======\n".join(responses) - return "\n=======\n".join(responses) + # Invalid query type + return f"[Search] Invalid 'query' type: expected string or array, got {type(query).__name__}." 
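For reference, the rewritten `call()` accepts a JSON object whose `query` field is a string or an array of strings; anything else yields a descriptive error string rather than an exception. A minimal sketch of exercising the registered tool directly (assumes `qwen_agent` plus at least the `duckduckgo-search` fallback are installed; queries are illustrative):

```python
from inference.tool_search import Search

tool = Search()  # logs the available providers on init

# Batched queries: each one runs through Exa -> Tavily -> Serper -> DuckDuckGo.
print(tool.call({"query": ["tongyi deepresearch 30b gguf", "llama.cpp flash attention"]}))

# Malformed inputs come back as error strings, never exceptions:
print(tool.call("plain string"))   # "[Search] Invalid request: ..."
print(tool.call({"query": []}))    # "[Search] Empty query list provided."
```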
From 722acdb6a8d644f164215d49d39b878d9f6dab6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Chindri=C8=99=20Mihai=20Alexandru?= <12643176+chindris-mihai-alexandru@users.noreply.github.com> Date: Fri, 28 Nov 2025 17:22:51 +0200 Subject: [PATCH 4/5] Fix memory issues: disable mlock by default, reduce context to 16K - Change default context size from 32K to 16K (saves ~8GB RAM) - Disable --mlock by default (prevents locking 18GB model in wired memory) - Add --mlock flag to opt-in if needed for performance - Add --low-memory flag for constrained systems (8K context) - Display mlock status in configuration output This prevents the system from running out of memory when running llama-server alongside other apps like Firefox. --- scripts/start_llama_server.sh | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/scripts/start_llama_server.sh b/scripts/start_llama_server.sh index 1ec44511..ae7fed68 100755 --- a/scripts/start_llama_server.sh +++ b/scripts/start_llama_server.sh @@ -32,15 +32,16 @@ PROJECT_DIR="$(dirname "$SCRIPT_DIR")" LLAMA_SERVER="$PROJECT_DIR/llama.cpp/build/bin/llama-server" MODEL_PATH="$PROJECT_DIR/models/gguf/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf" -# Default settings (optimized for Apple Silicon with 32GB+ RAM) +# Default settings (optimized for Apple Silicon with 32GB RAM) PORT=${PORT:-8080} HOST=${HOST:-127.0.0.1} -CTX_SIZE=${CTX_SIZE:-32768} # 32K context for long research sessions +CTX_SIZE=${CTX_SIZE:-16384} # 16K context (use --ctx 32768 for longer sessions) GPU_LAYERS=${GPU_LAYERS:-99} # Offload all layers to Metal THREADS=${THREADS:-8} # CPU threads for non-GPU ops PARALLEL=${PARALLEL:-1} # Parallel request slots BATCH_SIZE=${BATCH_SIZE:-512} # Batch size for prompt processing WEBUI=${WEBUI:-true} # Enable web UI by default +MLOCK=${MLOCK:-false} # Don't lock model in RAM (saves memory for other apps) # Colors RED='\033[0;31m' @@ -104,16 +105,28 @@ while [[ $# -gt 0 ]]; do WEBUI=true shift ;; + --mlock) + MLOCK=true + shift + ;; + --low-memory) + # Low memory mode: smaller context, no mlock + CTX_SIZE=8192 + MLOCK=false + shift + ;; -h|--help) echo "Usage: $0 [options]" echo "" echo "Options:" echo " --port N Port number (default: 8080)" - echo " --ctx N Context size (default: 32768)" + echo " --ctx N Context size (default: 16384)" echo " --threads N CPU threads (default: 8)" echo " --parallel N Parallel requests (default: 1)" echo " --webui Enable web UI (default)" echo " --no-webui Disable web UI, API only" + echo " --mlock Lock model in RAM (uses more memory but faster)" + echo " --low-memory Low memory mode: 8K context, no mlock" echo " -h, --help Show this help" echo "" echo "Access points:" @@ -128,7 +141,6 @@ while [[ $# -gt 0 ]]; do esac done -# Display configuration echo -e "${GREEN}Configuration:${NC}" echo " Model: $(basename "$MODEL_PATH")" echo " Size: $(du -h "$MODEL_PATH" | cut -f1)" @@ -136,6 +148,7 @@ echo " Context: $CTX_SIZE tokens" echo " GPU: Metal (all $GPU_LAYERS layers)" echo " Threads: $THREADS" echo " Parallel: $PARALLEL slots" +echo " Mlock: $MLOCK" echo " Web UI: $WEBUI" echo " Endpoint: http://$HOST:$PORT" echo "" @@ -172,12 +185,16 @@ SERVER_ARGS=( --parallel "$PARALLEL" --batch-size "$BATCH_SIZE" --flash-attn auto - --mlock --metrics --log-disable ) # Note: --jinja is enabled by default in recent llama.cpp versions +# Add mlock if requested (uses more memory but may be faster) +if [ "$MLOCK" = "true" ]; then + SERVER_ARGS+=(--mlock) +fi + # Add no-webui flag if requested if [ "$WEBUI" = 
"false" ]; then SERVER_ARGS+=(--no-webui) From b8422300608dc80f034f12a8ce4d85fccec260ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Chindri=C8=99=20Mihai=20Alexandru?= <12643176+chindris-mihai-alexandru@users.noreply.github.com> Date: Fri, 28 Nov 2025 21:14:41 +0200 Subject: [PATCH 5/5] Improve llama.cpp usability: reduce default rounds and add model alias - Reduce MAX_ROUNDS from 30 to 10 for faster, more practical research queries - Add --alias flag to llama-server for cleaner model naming (fixes Open WebUI compatibility) - Better UX when using with web UIs like Open WebUI that expect short model names --- inference/interactive_llamacpp.py | 2 +- scripts/start_llama_server.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/inference/interactive_llamacpp.py b/inference/interactive_llamacpp.py index 0bae1aed..b12fcb53 100644 --- a/inference/interactive_llamacpp.py +++ b/inference/interactive_llamacpp.py @@ -67,7 +67,7 @@ TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "") SERPER_KEY = os.environ.get("SERPER_KEY_ID", "") -MAX_ROUNDS = 30 +MAX_ROUNDS = 10 MAX_TOKENS = 4096 TEMPERATURE = 0.7 TOP_P = 0.95 diff --git a/scripts/start_llama_server.sh b/scripts/start_llama_server.sh index ae7fed68..4b42ce9b 100755 --- a/scripts/start_llama_server.sh +++ b/scripts/start_llama_server.sh @@ -187,6 +187,7 @@ SERVER_ARGS=( --flash-attn auto --metrics --log-disable + --alias deepresearch ) # Note: --jinja is enabled by default in recent llama.cpp versions