From d653390a9fc996355e603038d3da725565b6829e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Chindri=C8=99=20Mihai=20Alexandru?= <12643176+chindris-mihai-alexandru@users.noreply.github.com>
Date: Fri, 28 Nov 2025 16:36:31 +0200
Subject: [PATCH 1/5] feat: add llama.cpp local inference support

Add support for running DeepResearch locally using llama.cpp with Metal
(Apple Silicon) or CUDA acceleration. Zero API costs, full privacy.

New files:
- inference/interactive_llamacpp.py: ReAct agent CLI for llama.cpp
- scripts/start_llama_server.sh: Server startup script with optimized settings
- requirements-local.txt: Minimal dependencies for local inference

Features:
- Free web search via DuckDuckGo (no API key required)
- Optional Jina Reader for better page content extraction
- Loop detection to prevent infinite tool call cycles
- 32K context window for long research sessions
- Exponential backoff retry for rate limits
- URL validation before visiting pages

Works with bartowski's GGUF quantizations (~18GB for Q4_K_M).
---
 .env.example                      |  15 +-
 README.md                         |  49 +++
 inference/interactive_llamacpp.py | 566 ++++++++++++++++++++++++++++++
 requirements-local.txt            |  12 +
 scripts/start_llama_server.sh     | 219 ++++++++++++
 5 files changed, 860 insertions(+), 1 deletion(-)
 create mode 100644 inference/interactive_llamacpp.py
 create mode 100644 requirements-local.txt
 create mode 100755 scripts/start_llama_server.sh

diff --git a/.env.example b/.env.example
index 8558e9c4..8154c65c 100644
--- a/.env.example
+++ b/.env.example
@@ -95,4 +95,17 @@ IDP_KEY_SECRET=your_idp_key_secret
 # These are typically set by distributed training frameworks
 # WORLD_SIZE=1
-# RANK=0
\ No newline at end of file
+# RANK=0
+
+# =============================================================================
+# llama.cpp Local Inference (Alternative for Mac/Local Users)
+# =============================================================================
+# If using the llama.cpp local inference option instead of vLLM:
+
+# The llama.cpp server URL (default works if using start_llama_server.sh)
+LLAMA_SERVER_URL=http://127.0.0.1:8080
+
+# For llama.cpp mode:
+# - Web search uses DuckDuckGo by default (FREE, no API key needed)
+# - JINA_API_KEYS is optional but recommended for better page reading
+# - See: python inference/interactive_llamacpp.py --help
\ No newline at end of file
diff --git a/README.md b/README.md
index 6a147f47..554bc0a8 100644
--- a/README.md
+++ b/README.md
@@ -179,6 +179,55 @@ You need to modify the following in the file [inference/react_agent.py](https:/
 - Change the model name to alibaba/tongyi-deepresearch-30b-a3b.
 - Adjust the content concatenation way as described in the comments on lines **88–90.**
 
+
+---
+
+### 7. Local Inference with llama.cpp (Optional)
+
+> **For Mac users or anyone who wants 100% local inference without vLLM/CUDA dependencies.**
+
+This repo includes support for running DeepResearch locally using [llama.cpp](https://github.com/ggerganov/llama.cpp) with Metal (Apple Silicon) or CUDA acceleration. Zero API costs, full privacy.
+
+#### Requirements
+
+- llama.cpp built with Metal or CUDA support
+- GGUF model: [bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF](https://huggingface.co/bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF)
+- 32GB+ RAM (for Q4_K_M quantization)
+
+#### Quick Start
+
+```bash
+# Install minimal dependencies
+pip install -r requirements-local.txt
+
+# Build llama.cpp (Mac with Metal)
+cd llama.cpp
+cmake -B build -DLLAMA_METAL=ON -DCMAKE_BUILD_TYPE=Release
+cmake --build build --config Release
+cd ..
+
+# Download model (~18GB)
+mkdir -p models/gguf
+curl -L -o models/gguf/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf \
+  'https://huggingface.co/bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF/resolve/main/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf'
+
+# Terminal 1: Start the server
+./scripts/start_llama_server.sh
+
+# Terminal 2: Run research queries
+python inference/interactive_llamacpp.py
+```
+
+The llama.cpp server provides both an API and a web UI at http://localhost:8080.
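+
+As a quick sanity check that the server is up, you can hit the OpenAI-compatible endpoint directly (a minimal sketch; llama.cpp serves whichever model it loaded, so the `model` field is just a placeholder):
+
+```python
+import requests
+
+resp = requests.post(
+    "http://127.0.0.1:8080/v1/chat/completions",
+    json={
+        "model": "deepresearch",  # placeholder; the server uses the single loaded model
+        "messages": [{"role": "user", "content": "Say hello in five words."}],
+        "max_tokens": 64,
+    },
+    timeout=120,
+)
+print(resp.json()["choices"][0]["message"]["content"])
+```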
+
+#### Features
+
+- **Free web search**: Uses DuckDuckGo (no API key required)
+- **Page visiting**: Uses Jina Reader (optional API key for better results)
+- **Loop detection**: Prevents infinite tool call cycles
+- **32K context**: Long research sessions supported
+
+---
 ## Benchmark Evaluation
 We provide benchmark evaluation scripts for various datasets. Please refer to the [evaluation scripts](./evaluation/) directory for more details.
diff --git a/inference/interactive_llamacpp.py b/inference/interactive_llamacpp.py
new file mode 100644
index 00000000..1cf6dffc
--- /dev/null
+++ b/inference/interactive_llamacpp.py
@@ -0,0 +1,566 @@
+#!/usr/bin/env python3
+"""
+DeepResearch Interactive CLI - llama.cpp Server Edition
+=========================================================
+
+A powerful local research assistant that runs on YOUR machine.
+Zero API costs. Full privacy. Complete control.
+
+This script connects to a local llama.cpp server running the
+Tongyi-DeepResearch model and provides a full ReAct agent loop
+with web search and page visiting capabilities.
+
+Architecture:
+    ┌─────────────────┐      HTTP       ┌──────────────────┐
+    │  This Script    │ ────────────>   │   llama.cpp      │
+    │  (Agent Logic)  │                 │   Server         │
+    │  - Tool calls   │ <────────────   │  (Model loaded)  │
+    │  - Web search   │     JSON        │  - Metal GPU     │
+    │  - Page visits  │                 │  - 32K context   │
+    └─────────────────┘                 └──────────────────┘
+
+Usage:
+    # Terminal 1: Start the server (one-time, stays running)
+    ./scripts/start_llama_server.sh
+
+    # Terminal 2: Run research queries
+    python inference/interactive_llamacpp.py
+
+Requirements:
+    pip install requests duckduckgo-search python-dotenv
+
+The server must be running before starting this script.
+"""
+
+import argparse
+import json
+import os
+import re
+import sys
+import time
+from datetime import datetime
+from typing import Dict, List, Optional, Any
+
+import requests
+
+from urllib.parse import urlparse
+
+# Load environment variables
+try:
+    from dotenv import load_dotenv
+    load_dotenv(os.path.join(os.path.dirname(__file__), '..', '.env'))
+except ImportError:
+    pass
+
+# =============================================================================
+# Configuration
+# =============================================================================
+
+LLAMA_SERVER_URL = os.environ.get("LLAMA_SERVER_URL", "http://127.0.0.1:8080")
+JINA_API_KEY = os.environ.get("JINA_API_KEYS", "") or os.environ.get("JINA_API_KEY", "")
+
+MAX_ROUNDS = 30
+MAX_TOKENS = 4096
+TEMPERATURE = 0.7
+TOP_P = 0.95
+REQUEST_TIMEOUT = 300  # 5 minutes for long generations
+
+# Stop sequences for the ReAct loop
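+# The agent loop below injects real tool output itself, so generation is cut
+# off as soon as the model starts to write a <tool_response> block of its own.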
+""" + +import argparse +import json +import os +import re +import sys +import time +from datetime import datetime +from typing import Dict, List, Optional, Any + +import requests + +from urllib.parse import urlparse + +# Load environment variables +try: + from dotenv import load_dotenv + load_dotenv(os.path.join(os.path.dirname(__file__), '..', '.env')) +except ImportError: + pass + +# ============================================================================= +# Configuration +# ============================================================================= + +LLAMA_SERVER_URL = os.environ.get("LLAMA_SERVER_URL", "http://127.0.0.1:8080") +JINA_API_KEY = os.environ.get("JINA_API_KEYS", "") or os.environ.get("JINA_API_KEY", "") + +MAX_ROUNDS = 30 +MAX_TOKENS = 4096 +TEMPERATURE = 0.7 +TOP_P = 0.95 +REQUEST_TIMEOUT = 300 # 5 minutes for long generations + +# Stop sequences for the ReAct loop +STOP_SEQUENCES = [ + "", + "\n", +] + +# ============================================================================= +# System Prompt - Optimized for DeepResearch ReAct Agent +# ============================================================================= + +def get_system_prompt() -> str: + return f"""You are a deep research assistant. Your task is to answer questions by searching the web and synthesizing information from credible sources. + +# CRITICAL RULES + +1. **Think deeply**: Use tags to reason about what you know and what you need to find +2. **Search strategically**: Use multiple targeted searches to gather comprehensive information +3. **Verify information**: Cross-reference facts across multiple sources +4. **Synthesize thoroughly**: Combine information from multiple sources into a coherent answer +5. **NEVER visit the same URL twice**: Each URL can only be visited once +6. **Always conclude**: After gathering sufficient info (typically 5-15 sources), provide your answer in tags +7. **Be efficient**: Aim to answer in 10-20 rounds + +# Response Format + +When you need to search, respond with: +What I need to find and why + +{{"name": "search", "arguments": {{"query": ["your search query"]}}}} + + +When you need to visit a page for details: +Why I need to visit this specific page + +{{"name": "visit", "arguments": {{"url": "https://example.com", "goal": "what specific info you need"}}}} + + +When you have enough information, respond with: +Summary of what I found and my analysis +Your comprehensive, well-researched answer with citations where appropriate + +# Tools + + +{{"type": "function", "function": {{"name": "search", "description": "Web search. Returns titles, URLs, and snippets.", "parameters": {{"type": "object", "properties": {{"query": {{"type": "array", "items": {{"type": "string"}}, "description": "1-3 search queries"}}}}, "required": ["query"]}}}}}} +{{"type": "function", "function": {{"name": "visit", "description": "Visit a URL to get full page content. 
Each URL can only be visited ONCE.", "parameters": {{"type": "object", "properties": {{"url": {{"type": "string", "description": "URL to visit"}}, "goal": {{"type": "string", "description": "What info you need"}}}}, "required": ["url", "goal"]}}}}}} + + +# Important Notes +- The visit tool returns the COMPLETE page content in one response +- After 8-10 successful source visits, you likely have enough information to answer +- Prefer quality over quantity - don't just collect sources, synthesize them + +Current date: {datetime.now().strftime("%Y-%m-%d")}""" + + +# ============================================================================= +# Tools - DuckDuckGo Search (FREE, no API key needed!) +# ============================================================================= + +def duckduckgo_search(queries: list, num_results: int = 10) -> str: + """Search using DuckDuckGo - completely free, no API key needed.""" + try: + from duckduckgo_search import DDGS + from duckduckgo_search.exceptions import RatelimitException + except ImportError: + return "[Search Error] duckduckgo-search not installed. Run: pip install duckduckgo-search" + + results = [] + for query in queries[:3]: + retries = 3 + for attempt in range(retries): + try: + with DDGS() as ddgs: + search_results = list(ddgs.text(query, max_results=num_results)) + + output = [f"\n## Search: '{query}'\n"] + for idx, r in enumerate(search_results, 1): + title = r.get("title", "No title") + url = r.get("href", r.get("link", "")) + snippet = r.get("body", "")[:300] + output.append(f"{idx}. [{title}]({url})") + if snippet: + output.append(f" {snippet}...") + + results.append("\n".join(output)) + break # Success, exit retry loop + except RatelimitException: + if attempt < retries - 1: + time.sleep(2 ** attempt) # Exponential backoff: 1s, 2s, 4s + continue + results.append(f"[Search Error for '{query}']: Rate limited. Try again in a few seconds.") + except Exception as e: + results.append(f"[Search Error for '{query}']: {e}") + break + + return "\n".join(results) if results else "No results found" + + +def is_valid_url(url: str) -> bool: + """Check if URL is valid and uses http/https scheme.""" + try: + result = urlparse(url) + return all([result.scheme in ('http', 'https'), result.netloc]) + except Exception: + return False + + +def visit_page(url: str, goal: str) -> str: + """Fetch webpage content using Jina Reader (free tier) or direct fetch.""" + if isinstance(url, list): + url = url[0] if url else "" + + if not url: + return "[Visit Error] No URL provided" + + if not is_valid_url(url): + return f"[Visit Error] Invalid URL: {url}. Must be a valid http/https URL." 
+ + # Try Jina Reader first (free tier available) + try: + headers = {"Accept": "text/plain"} + if JINA_API_KEY: + headers["Authorization"] = f"Bearer {JINA_API_KEY}" + + jina_url = f"https://r.jina.ai/{url}" + response = requests.get(jina_url, headers=headers, timeout=30, allow_redirects=True) + + if response.status_code == 200 and len(response.text) > 100: + content = response.text[:12000] # Increased limit for more context + return f"**Content from {url}** (goal: {goal}):\n\n{content}" + except Exception: + pass + + # Fallback to direct fetch + try: + headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"} + response = requests.get(url, headers=headers, timeout=15) + response.raise_for_status() + + text = response.text + text = re.sub(r']*>.*?', '', text, flags=re.DOTALL | re.IGNORECASE) + text = re.sub(r']*>.*?', '', text, flags=re.DOTALL | re.IGNORECASE) + text = re.sub(r'<[^>]+>', ' ', text) + text = re.sub(r'\s+', ' ', text).strip() + + if len(text) > 100: + return f"**Content from {url}** (goal: {goal}):\n\n{text[:12000]}" + return f"[Visit Error] Page content too short or blocked: {url}" + except requests.Timeout: + return f"[Visit Error] Timeout fetching {url}" + except Exception as e: + return f"[Visit Error] Could not fetch {url}: {type(e).__name__}: {e}" + + +# ============================================================================= +# llama.cpp Server Client +# ============================================================================= + +class LlamaCppClient: + """Client for the llama.cpp OpenAI-compatible API.""" + + def __init__(self, base_url: str = LLAMA_SERVER_URL): + self.base_url = base_url.rstrip('/') + self.api_url = f"{self.base_url}/v1/chat/completions" + self.session = requests.Session() + + def check_server(self) -> bool: + """Check if the server is running and responsive.""" + try: + response = self.session.get(f"{self.base_url}/health", timeout=5) + return response.status_code == 200 + except Exception: + return False + + def generate(self, messages: List[Dict[str, str]], + max_tokens: int = MAX_TOKENS, + temperature: float = TEMPERATURE, + top_p: float = TOP_P, + stop: Optional[List[str]] = None) -> str: + """Generate a response from the llama.cpp server.""" + + payload = { + "messages": messages, + "max_tokens": max_tokens, + "temperature": temperature, + "top_p": top_p, + "stop": stop or STOP_SEQUENCES, + "stream": False, + } + + try: + response = self.session.post( + self.api_url, + json=payload, + headers={"Content-Type": "application/json"}, + timeout=300, # 5 minute timeout for long generations + ) + + if response.status_code != 200: + error_text = response.text[:500] + return f"[Server Error] Status {response.status_code}: {error_text}" + + data = response.json() + content = data["choices"][0]["message"]["content"] + return content.strip() + + except requests.Timeout: + return "[Error] Request timed out. The model may be processing a complex query." + except requests.ConnectionError: + return "[Error] Cannot connect to llama.cpp server. Is it running?" 
+ except Exception as e: + return f"[Error] API call failed: {e}" + + +# ============================================================================= +# Research Agent +# ============================================================================= + +def research(client: LlamaCppClient, question: str, verbose: bool = True, + max_rounds: int = MAX_ROUNDS, temperature: float = TEMPERATURE) -> dict: + """Run the research agent loop.""" + if verbose: + print(f"\n🔍 Researching: {question}\n") + print("-" * 60) + + system_prompt = get_system_prompt() + messages: List[Dict[str, str]] = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": question} + ] + + sources: List[Dict[str, Any]] = [] + thinking: List[str] = [] + visited_urls: set = set() + consecutive_errors = 0 + max_consecutive_errors = 3 + start_time = time.time() + + for round_num in range(max_rounds): + if verbose: + print(f"\n📝 Round {round_num + 1}/{max_rounds}") + + gen_start = time.time() + content = client.generate(messages, temperature=temperature) + gen_time = time.time() - gen_start + + if content.startswith("[Error]") or content.startswith("[Server Error]"): + if verbose: + print(f" ❌ {content}") + break + + if verbose: + print(f" ⏱️ Generated in {gen_time:.1f}s") + + messages.append({"role": "assistant", "content": content}) + + # Extract and display thinking + if "" in content and "" in content: + think_content = content.split("")[1].split("")[0].strip() + thinking.append(think_content) + if verbose: + preview = think_content[:200] + "..." if len(think_content) > 200 else think_content + print(f" 💭 {preview}") + + # Check for final answer + if "" in content: + if "" in content: + answer = content.split("")[1].split("")[0] + else: + answer = content.split("")[1] + + elapsed = time.time() - start_time + + if verbose: + print("\n" + "=" * 60) + print("✅ ANSWER:") + print("=" * 60) + print(answer.strip()) + print("=" * 60) + print(f"\n📊 Stats: {round_num + 1} rounds, {len(sources)} sources, {elapsed:.1f}s") + + return { + "answer": answer.strip(), + "sources": sources, + "rounds": round_num + 1, + "thinking": thinking, + "elapsed_seconds": elapsed, + } + + # Handle tool calls + if "" in content and "" in content: + try: + tool_json = content.split("")[1].split("")[0] + tool = json.loads(tool_json.strip()) + name = tool.get("name", "") + args = tool.get("arguments", {}) + + if verbose: + print(f" 🔧 Tool: {name}") + + tool_error = False + + if name == "search": + queries = args.get("query", [question]) + if isinstance(queries, str): + queries = [queries] + if verbose: + print(f" Searching: {queries}") + tool_result = duckduckgo_search(queries) + if "[Search Error" in tool_result: + tool_error = True + else: + sources.append({"type": "search", "queries": queries}) + + elif name == "visit": + url = args.get("url", "") + if isinstance(url, list): + url = url[0] if url else "" + goal = args.get("goal", "extract information") + + if url in visited_urls: + if verbose: + print(f" ⚠️ Already visited: {url}") + tool_result = f"[Already Visited] You already visited {url}. Use the information from your previous visit or try a different source." + tool_error = True + else: + visited_urls.add(url) + if verbose: + print(f" Visiting: {url[:60]}...") + tool_result = visit_page(url, goal) + if "[Visit Error]" in tool_result: + tool_error = True + else: + sources.append({"type": "visit", "url": url}) + + else: + tool_result = f"Unknown tool: {name}. 
Available tools: search, visit" + tool_error = True + + # Track consecutive errors for loop detection + if tool_error: + consecutive_errors += 1 + if consecutive_errors >= max_consecutive_errors: + if verbose: + print(f"\n⚠️ {max_consecutive_errors} consecutive tool errors detected.") + messages.append({ + "role": "user", + "content": f"\n{tool_result}\n\n[System Notice] You have encountered {consecutive_errors} consecutive errors. Please provide your best answer now based on information gathered so far, or try a completely different approach.\n" + }) + continue + else: + consecutive_errors = 0 # Reset on success + + # Inject tool response + messages.append({ + "role": "user", + "content": f"\n{tool_result}\n" + }) + + except json.JSONDecodeError as e: + consecutive_errors += 1 + messages.append({ + "role": "user", + "content": f"\nError: Invalid JSON in tool call: {e}\n" + }) + except Exception as e: + consecutive_errors += 1 + messages.append({ + "role": "user", + "content": f"\nTool error: {e}\n" + }) + + # Force final answer after max rounds + if verbose: + print("\n⚠️ Max rounds reached, requesting final answer...") + + messages.append({ + "role": "user", + "content": "You have reached the maximum number of research rounds. Please provide your final answer now based on all the information gathered. Use tags." + }) + + content = client.generate(messages, max_tokens=2048, temperature=temperature) + + if "" in content: + if "" in content: + answer = content.split("")[1].split("")[0] + else: + answer = content.split("")[1] + else: + answer = content + + elapsed = time.time() - start_time + + if verbose: + print("\n" + "=" * 60) + print("✅ ANSWER:") + print("=" * 60) + print(answer.strip()) + print("=" * 60) + print(f"\n📊 Stats: {max_rounds} rounds (max), {len(sources)} sources, {elapsed:.1f}s") + + return { + "answer": answer.strip(), + "sources": sources, + "rounds": max_rounds, + "thinking": thinking, + "elapsed_seconds": elapsed, + } + + +# ============================================================================= +# Main +# ============================================================================= + +def main(): + parser = argparse.ArgumentParser( + description="DeepResearch Interactive CLI - llama.cpp Server Edition", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Start the server first (in another terminal): + ./scripts/start_llama_server.sh + + # Run interactive mode: + python inference/interactive_llamacpp.py + + # Single query mode: + python inference/interactive_llamacpp.py --query "What is quantum entanglement?" 
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="DeepResearch Interactive CLI - llama.cpp Server Edition",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Start the server first (in another terminal):
+    ./scripts/start_llama_server.sh
+
+    # Run interactive mode:
+    python inference/interactive_llamacpp.py
+
+    # Single query mode:
+    python inference/interactive_llamacpp.py --query "What is quantum entanglement?"
+
+    # Connect to a different server:
+    python inference/interactive_llamacpp.py --server http://192.168.1.100:8080
+"""
+    )
+    parser.add_argument("--server", type=str, default=LLAMA_SERVER_URL,
+                        help="llama.cpp server URL (default: http://127.0.0.1:8080)")
+    parser.add_argument("--query", "-q", type=str, default=None,
+                        help="Single query mode - run one research query and exit")
+    parser.add_argument("--max-rounds", type=int, default=MAX_ROUNDS,
+                        help=f"Maximum research rounds (default: {MAX_ROUNDS})")
+    parser.add_argument("--temperature", type=float, default=TEMPERATURE,
+                        help=f"Sampling temperature (default: {TEMPERATURE})")
+    args = parser.parse_args()
+
+    print("\n" + "=" * 60)
+    print("🔬 DeepResearch - Interactive CLI")
+    print("   llama.cpp Server Edition (100% Local)")
+    print("=" * 60)
+    print(f"Server: {args.server}")
+    print(f"Search: DuckDuckGo (free, no API key)")
+    print(f"Reader: Jina.ai {'✓' if JINA_API_KEY else '(free tier)'}")
+    print("=" * 60)
+
+    # Initialize client
+    client = LlamaCppClient(base_url=args.server)
+
+    # Check server connection
+    print("\nConnecting to llama.cpp server...", end=" ")
+    if not client.check_server():
+        print("❌ FAILED")
+        print(f"\nError: Cannot connect to llama.cpp server at {args.server}")
+        print("\nPlease start the server first:")
+        print("  ./scripts/start_llama_server.sh")
+        print("\nOr specify a different server URL:")
+        print("  python inference/interactive_llamacpp.py --server http://your-server:8080")
+        sys.exit(1)
+    print("✅ Connected!")
+
+    # Single query mode
+    if args.query:
+        research(client, args.query, max_rounds=args.max_rounds, temperature=args.temperature)
+        return
+
+    # Interactive mode
+    print("\nType your research question (or 'quit' to exit):\n")
+
+    while True:
+        try:
+            question = input("❓ Question: ").strip()
+
+            if not question:
+                continue
+
+            if question.lower() in ('quit', 'exit', 'q'):
+                print("\n👋 Goodbye!")
+                break
+
+            research(client, question, max_rounds=args.max_rounds, temperature=args.temperature)
+            print("\n" + "-" * 60 + "\n")
+
+        except KeyboardInterrupt:
+            print("\n\n👋 Goodbye!")
+            break
+        except Exception as e:
+            print(f"\n❌ Error: {e}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements-local.txt b/requirements-local.txt
new file mode 100644
index 00000000..7800ee0e
--- /dev/null
+++ b/requirements-local.txt
@@ -0,0 +1,12 @@
+# DeepResearch - Minimal requirements for llama.cpp local inference
+# Install with: pip install -r requirements-local.txt
+
+# Core dependencies
+requests>=2.31.0
+python-dotenv>=1.0.0
+
+# Web search (FREE, no API key needed)
+duckduckgo-search>=6.0.0
+
+# Optional but recommended
+tqdm>=4.66.0
diff --git a/scripts/start_llama_server.sh b/scripts/start_llama_server.sh
new file mode 100755
index 00000000..1ec44511
--- /dev/null
+++ b/scripts/start_llama_server.sh
@@ -0,0 +1,219 @@
+#!/bin/bash
+# =============================================================================
+# DeepResearch Local Server - llama.cpp with Metal Acceleration
+# =============================================================================
+#
+# This script starts the llama.cpp server optimized for the DeepResearch
+# ReAct agent workflow on Apple Silicon.
+#
+# The server provides:
+#   - OpenAI-compatible API at http://localhost:8080/v1/chat/completions
+#   - Built-in Web UI at http://localhost:8080 (chat interface!)
+#   - Metal (GPU) acceleration for fast inference
+#   - Model loaded once and kept resident in memory
+#
+# Usage:
+#   ./scripts/start_llama_server.sh              # Start with defaults
+#   ./scripts/start_llama_server.sh --ctx 16384  # Custom context size
+#   ./scripts/start_llama_server.sh --no-webui   # API only, no web UI
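+#   The defaults below can also be overridden via environment variables, e.g.:
+#     PORT=8081 CTX_SIZE=16384 ./scripts/start_llama_server.sh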
+#
+# Access:
+#   - Web UI: http://localhost:8080
+#   - API:    http://localhost:8080/v1/chat/completions
+#   - CLI:    python inference/interactive_llamacpp.py
+#
+# =============================================================================
+
+set -e
+
+# Configuration
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
+LLAMA_SERVER="$PROJECT_DIR/llama.cpp/build/bin/llama-server"
+MODEL_PATH="$PROJECT_DIR/models/gguf/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf"
+
+# Default settings (optimized for Apple Silicon with 32GB+ RAM)
+PORT=${PORT:-8080}
+HOST=${HOST:-127.0.0.1}
+CTX_SIZE=${CTX_SIZE:-32768}    # 32K context for long research sessions
+GPU_LAYERS=${GPU_LAYERS:-99}   # Offload all layers to Metal
+THREADS=${THREADS:-8}          # CPU threads for non-GPU ops
+PARALLEL=${PARALLEL:-1}        # Parallel request slots
+BATCH_SIZE=${BATCH_SIZE:-512}  # Batch size for prompt processing
+WEBUI=${WEBUI:-true}           # Enable web UI by default
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+echo -e "${BLUE}"
+echo "============================================================"
+echo "  DeepResearch Local Server (llama.cpp + Metal)"
+echo "============================================================"
+echo -e "${NC}"
+
+# Check if llama-server exists
+if [ ! -f "$LLAMA_SERVER" ]; then
+    echo -e "${RED}Error: llama-server not found at $LLAMA_SERVER${NC}"
+    echo ""
+    echo "Please build llama.cpp first:"
+    echo "  cd $PROJECT_DIR/llama.cpp"
+    echo "  cmake -B build -DLLAMA_METAL=ON -DCMAKE_BUILD_TYPE=Release"
+    echo "  cmake --build build --config Release"
+    exit 1
+fi
+
+# Check if model exists
+if [ ! -f "$MODEL_PATH" ]; then
+    echo -e "${RED}Error: Model not found at $MODEL_PATH${NC}"
+    echo ""
+    echo "Please download the model first:"
+    echo "  cd $PROJECT_DIR/models/gguf"
+    echo "  curl -L -C - -o Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf \\"
+    echo "    'https://huggingface.co/bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF/resolve/main/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf'"
+    exit 1
+fi
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --port)
+            PORT="$2"
+            shift 2
+            ;;
+        --ctx)
+            CTX_SIZE="$2"
+            shift 2
+            ;;
+        --threads)
+            THREADS="$2"
+            shift 2
+            ;;
+        --parallel)
+            PARALLEL="$2"
+            shift 2
+            ;;
+        --no-webui)
+            WEBUI=false
+            shift
+            ;;
+        --webui)
+            WEBUI=true
+            shift
+            ;;
+        -h|--help)
+            echo "Usage: $0 [options]"
+            echo ""
+            echo "Options:"
+            echo "  --port N      Port number (default: 8080)"
+            echo "  --ctx N       Context size (default: 32768)"
+            echo "  --threads N   CPU threads (default: 8)"
+            echo "  --parallel N  Parallel requests (default: 1)"
+            echo "  --webui       Enable web UI (default)"
+            echo "  --no-webui    Disable web UI, API only"
+            echo "  -h, --help    Show this help"
+            echo ""
+            echo "Access points:"
+            echo "  Web UI: http://127.0.0.1:PORT"
+            echo "  API:    http://127.0.0.1:PORT/v1/chat/completions"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+done
+
+# Display configuration
+echo -e "${GREEN}Configuration:${NC}"
+echo "  Model:    $(basename "$MODEL_PATH")"
+echo "  Size:     $(du -h "$MODEL_PATH" | cut -f1)"
+echo "  Context:  $CTX_SIZE tokens"
+echo "  GPU:      Metal (all $GPU_LAYERS layers)"
+echo "  Threads:  $THREADS"
+echo "  Parallel: $PARALLEL slots"
+echo "  Web UI:   $WEBUI"
+echo "  Endpoint: http://$HOST:$PORT"
+echo ""
+
+# Check for existing server on port
+if lsof -i :$PORT > /dev/null 2>&1; then
+    echo -e "${YELLOW}Warning: Port $PORT is already in use.${NC}"
+    echo "Existing process:"
+    lsof -i :$PORT | head -2
+    echo ""
+    read -p "Kill existing process and continue? (y/N) " -n 1 -r
+    echo
+    if [[ $REPLY =~ ^[Yy]$ ]]; then
+        lsof -t -i :$PORT | xargs kill -9 2>/dev/null || true
+        sleep 1
+    else
+        echo "Aborting."
+        exit 1
+    fi
+fi
+
+echo -e "${YELLOW}Starting server...${NC}"
+echo "(Model loading takes ~30-60 seconds)"
+echo ""
+
+# Build command arguments
+SERVER_ARGS=(
+    --model "$MODEL_PATH"
+    --host "$HOST"
+    --port "$PORT"
+    --ctx-size "$CTX_SIZE"
+    --n-gpu-layers "$GPU_LAYERS"
+    --threads "$THREADS"
+    --parallel "$PARALLEL"
+    --batch-size "$BATCH_SIZE"
+    --flash-attn auto
+    --mlock
+    --metrics
+    --log-disable
+)
+# Note: --jinja is enabled by default in recent llama.cpp versions
+
+# Add no-webui flag if requested
+if [ "$WEBUI" = "false" ]; then
+    SERVER_ARGS+=(--no-webui)
+fi
+
+# Start the server with optimized settings for DeepResearch
+exec "$LLAMA_SERVER" "${SERVER_ARGS[@]}" 2>&1 | while read -r line; do
+    # Colorize output
+    if [[ $line == *"error"* ]] || [[ $line == *"Error"* ]]; then
+        echo -e "${RED}$line${NC}"
+    elif [[ $line == *"listening"* ]] || [[ $line == *"ready"* ]]; then
+        echo -e "${GREEN}$line${NC}"
+        echo ""
+        echo -e "${GREEN}============================================================${NC}"
+        echo -e "${GREEN}  Server ready!${NC}"
+        echo -e "${GREEN}============================================================${NC}"
+        if [ "$WEBUI" = "true" ]; then
+            echo ""
+            echo -e "${GREEN}  Web UI:  http://$HOST:$PORT${NC}"
+            echo "  Open in your browser for a chat interface!"
+        fi
+        echo ""
+        echo -e "${GREEN}  API:     http://$HOST:$PORT/v1/chat/completions${NC}"
+        echo ""
+        echo "Run DeepResearch CLI:"
+        echo "  python inference/interactive_llamacpp.py"
+        echo ""
+        echo "Test API:"
+        echo "  curl http://$HOST:$PORT/v1/chat/completions \\"
+        echo "    -H 'Content-Type: application/json' \\"
+        echo "    -d '{\"model\": \"deepresearch\", \"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}]}'"
+        echo ""
+        echo "Press Ctrl+C to stop the server."
+    elif [[ $line == *"warning"* ]] || [[ $line == *"Warning"* ]]; then
+        echo -e "${YELLOW}$line${NC}"
+    else
+        echo "$line"
+    fi
+done

From 448000f681d2e4995816edc29d407021259cb2f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Chindri=C8=99=20Mihai=20Alexandru?= <12643176+chindris-mihai-alexandru@users.noreply.github.com>
Date: Fri, 28 Nov 2025 16:52:44 +0200
Subject: [PATCH 2/5] Add multi-provider search support to llama.cpp CLI

- Add Exa, Tavily, Serper, DuckDuckGo providers with automatic fallback
- Remove emojis for cleaner professional output
- Update documentation with search provider information
- Show available search providers on startup
---
 .env.example                      |  16 +-
 inference/interactive_llamacpp.py | 432 +++++++++++++++++++++--------
 inference/tool_search.py          | 434 ++++++++++++++++++++++++------
 3 files changed, 687 insertions(+), 195 deletions(-)

diff --git a/.env.example b/.env.example
index 8154c65c..c53f27cb 100644
--- a/.env.example
+++ b/.env.example
@@ -46,10 +46,24 @@ MAX_WORKERS=30
 # API Keys and External Services
 # =============================================================================
 
-# Serper API for web search and Google Scholar
+# Web Search Providers (in order of quality/preference)
+# The system will try each provider in order until one succeeds.
+# You only need ONE provider configured, but having multiple provides fallback.
+
+# Exa.ai - Best semantic/neural search ($10 free credits)
+# Get your key from: https://exa.ai/
+EXA_API_KEY=your_key
+
+# Tavily - Purpose-built for RAG/LLMs (1,000 free requests/month)
+# Get your key from: https://tavily.com/
+TAVILY_API_KEY=your_key
+
+# Serper API for Google search results (2,500 free queries)
 # Get your key from: https://serper.dev/
 SERPER_KEY_ID=your_key
 
+# DuckDuckGo is always available as final fallback (FREE, no API key needed)
+
 # Jina API for web page reading
 # Get your key from: https://jina.ai/
 JINA_API_KEYS=your_key
diff --git a/inference/interactive_llamacpp.py b/inference/interactive_llamacpp.py
index 1cf6dffc..34471aba 100644
--- a/inference/interactive_llamacpp.py
+++ b/inference/interactive_llamacpp.py
@@ -11,13 +11,19 @@
 with web search and page visiting capabilities.
 
 Architecture:
-    ┌─────────────────┐      HTTP       ┌──────────────────┐
-    │  This Script    │ ────────────>   │   llama.cpp      │
-    │  (Agent Logic)  │                 │   Server         │
-    │  - Tool calls   │ <────────────   │  (Model loaded)  │
-    │  - Web search   │     JSON        │  - Metal GPU     │
-    │  - Page visits  │                 │  - 32K context   │
-    └─────────────────┘                 └──────────────────┘
+    +------------------+      HTTP       +-------------------+
+    |  This Script     | ------------>   |   llama.cpp       |
+    |  (Agent Logic)   |                 |   Server          |
+    |  - Tool calls    | <------------   |  (Model loaded)   |
+    |  - Web search    |     JSON        |  - Metal GPU      |
+    |  - Page visits   |                 |  - 32K context    |
+    +------------------+                 +-------------------+
+
+Search Providers (in order of quality):
+    1. Exa.ai     - Best semantic/neural search
+    2. Tavily     - Purpose-built for RAG/LLMs
+    3. Serper     - Google SERP results
+    4. DuckDuckGo - Free fallback (no API key needed)
 
 Usage:
     # Terminal 1: Start the server (one-time, stays running)
@@ -28,27 +34,25 @@
 Requirements:
     pip install requests duckduckgo-search python-dotenv
 
-
-The server must be running before starting this script.
 """
 
 import argparse
+import http.client
 import json
 import os
 import re
 import sys
 import time
 from datetime import datetime
-from typing import Dict, List, Optional, Any
+from typing import Any, Dict, List, Optional
+from urllib.parse import urlparse
 
 import requests
 
-from urllib.parse import urlparse
-
 # Load environment variables
 try:
     from dotenv import load_dotenv
-    load_dotenv(os.path.join(os.path.dirname(__file__), '..', '.env'))
+    load_dotenv(os.path.join(os.path.dirname(__file__), "..", ".env"))
 except ImportError:
     pass
@@ -59,6 +63,11 @@
 LLAMA_SERVER_URL = os.environ.get("LLAMA_SERVER_URL", "http://127.0.0.1:8080")
 JINA_API_KEY = os.environ.get("JINA_API_KEYS", "") or os.environ.get("JINA_API_KEY", "")
 
+# Search API keys
+EXA_API_KEY = os.environ.get("EXA_API_KEY", "")
+TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "")
+SERPER_KEY = os.environ.get("SERPER_KEY_ID", "")
+
 MAX_ROUNDS = 30
 MAX_TOKENS = 4096
 TEMPERATURE = 0.7
@@ -71,6 +80,7 @@
     "<tool_response>\n",
 ]
 
+
 # =============================================================================
 # System Prompt - Optimized for DeepResearch ReAct Agent
 # =============================================================================
@@ -122,59 +132,227 @@ def get_system_prompt() -> str:
 
 # =============================================================================
-# Tools - DuckDuckGo Search (FREE, no API key needed!)
+# Search Providers
 # =============================================================================
 
-def duckduckgo_search(queries: list, num_results: int = 10) -> str:
-    """Search using DuckDuckGo - completely free, no API key needed."""
+def contains_chinese(text: str) -> bool:
+    """Check if text contains Chinese characters."""
+    return any("\u4E00" <= char <= "\u9FFF" for char in text)
+
+
+def search_exa(query: str, num_results: int = 10) -> Optional[str]:
+    """Exa.ai - Neural/semantic search engine."""
+    if not EXA_API_KEY:
+        return None
+
+    try:
+        response = requests.post(
+            "https://api.exa.ai/search",
+            headers={
+                "x-api-key": EXA_API_KEY,
+                "Content-Type": "application/json",
+            },
+            json={
+                "query": query,
+                "numResults": num_results,
+                "useAutoprompt": True,
+                "type": "neural",
+            },
+            timeout=30,
+        )
+
+        if response.status_code in (401, 429) or response.status_code != 200:
+            return None
+
+        data = response.json()
+        results = data.get("results", [])
+        if not results:
+            return None
+
+        output = [f"\n## Search: '{query}'\n"]
+        for idx, r in enumerate(results, 1):
+            title = r.get("title", "No title")
+            url = r.get("url", "")
+            text = r.get("text", "")[:300] if r.get("text") else ""
+            output.append(f"{idx}. [{title}]({url})")
+            if text:
+                output.append(f"   {text}...")
+
+        return "\n".join(output)
+    except Exception:
+        return None
+
+
+def search_tavily(query: str, num_results: int = 10) -> Optional[str]:
+    """Tavily - Search API for RAG/LLM applications."""
+    if not TAVILY_API_KEY:
+        return None
+
+    try:
+        response = requests.post(
+            "https://api.tavily.com/search",
+            headers={"Content-Type": "application/json"},
+            json={
+                "api_key": TAVILY_API_KEY,
+                "query": query,
+                "max_results": num_results,
+                "search_depth": "advanced",
+            },
+            timeout=30,
+        )
+
+        if response.status_code in (401, 429) or response.status_code != 200:
+            return None
+
+        data = response.json()
+        results = data.get("results", [])
+        if not results:
+            return None
+
+        output = [f"\n## Search: '{query}'\n"]
+        for idx, r in enumerate(results, 1):
+            title = r.get("title", "No title")
+            url = r.get("url", "")
+            content = r.get("content", "")[:300]
+            output.append(f"{idx}. [{title}]({url})")
+            if content:
+                output.append(f"   {content}...")
+
+        return "\n".join(output)
+    except Exception:
+        return None
+
+
+def search_serper(query: str, num_results: int = 10) -> Optional[str]:
+    """Serper - Google Search API."""
+    if not SERPER_KEY:
+        return None
+
+    try:
+        conn = http.client.HTTPSConnection("google.serper.dev")
+
+        if contains_chinese(query):
+            payload = json.dumps({
+                "q": query, "location": "China", "gl": "cn", "hl": "zh-cn", "num": num_results
+            })
+        else:
+            payload = json.dumps({
+                "q": query, "location": "United States", "gl": "us", "hl": "en", "num": num_results
+            })
+
+        headers = {"X-API-KEY": SERPER_KEY, "Content-Type": "application/json"}
+
+        res = None
+        for attempt in range(3):
+            try:
+                conn.request("POST", "/search", payload, headers)
+                res = conn.getresponse()
+                break
+            except Exception:
+                if attempt == 2:
+                    return None
+                time.sleep(1)
+
+        if res is None:
+            return None
+
+        data = json.loads(res.read().decode("utf-8"))
+        if "organic" not in data:
+            return None
+
+        output = [f"\n## Search: '{query}'\n"]
+        for idx, page in enumerate(data["organic"], 1):
+            title = page.get("title", "No title")
+            url = page.get("link", "")
+            snippet = page.get("snippet", "")[:300]
+            output.append(f"{idx}. [{title}]({url})")
+            if snippet:
+                output.append(f"   {snippet}...")
+
+        return "\n".join(output)
+    except Exception:
+        return None
+
+
+def search_duckduckgo(query: str, num_results: int = 10) -> Optional[str]:
+    """DuckDuckGo - Free search, no API key required."""
     try:
         from duckduckgo_search import DDGS
         from duckduckgo_search.exceptions import RatelimitException
     except ImportError:
-        return "[Search Error] duckduckgo-search not installed. Run: pip install duckduckgo-search"
+        return None
+
+    retries = 3
+    for attempt in range(retries):
+        try:
+            with DDGS() as ddgs:
+                results = list(ddgs.text(query, max_results=num_results))
+
+            if not results:
+                return None
+
+            output = [f"\n## Search: '{query}'\n"]
+            for idx, r in enumerate(results, 1):
+                title = r.get("title", "No title")
+                url = r.get("href", r.get("link", ""))
+                body = r.get("body", "")[:300]
+                output.append(f"{idx}. [{title}]({url})")
+                if body:
+                    output.append(f"   {body}...")
+
+            return "\n".join(output)
+        except RatelimitException:
+            if attempt < retries - 1:
+                time.sleep(2 ** attempt)
+                continue
+            return None
+        except Exception:
+            return None
+
+    return None
+
+
+def multi_provider_search(queries: list, num_results: int = 10) -> str:
+    """Search using multiple providers with automatic fallback."""
+    providers = [
+        ("Exa", search_exa),
+        ("Tavily", search_tavily),
+        ("Serper", search_serper),
+        ("DuckDuckGo", search_duckduckgo),
+    ]
+
+    all_results = []
 
-    results = []
     for query in queries[:3]:
-        retries = 3
-        for attempt in range(retries):
-            try:
-                with DDGS() as ddgs:
-                    search_results = list(ddgs.text(query, max_results=num_results))
-
-                output = [f"\n## Search: '{query}'\n"]
-                for idx, r in enumerate(search_results, 1):
-                    title = r.get("title", "No title")
-                    url = r.get("href", r.get("link", ""))
-                    snippet = r.get("body", "")[:300]
-                    output.append(f"{idx}. [{title}]({url})")
-                    if snippet:
-                        output.append(f"   {snippet}...")
-
-                results.append("\n".join(output))
-                break  # Success, exit retry loop
-            except RatelimitException:
-                if attempt < retries - 1:
-                    time.sleep(2 ** attempt)  # Exponential backoff: 1s, then 2s
-                    continue
-                results.append(f"[Search Error for '{query}']: Rate limited. Try again in a few seconds.")
-            except Exception as e:
-                results.append(f"[Search Error for '{query}']: {e}")
+        result = None
+        for name, search_fn in providers:
+            result = search_fn(query, num_results)
+            if result:
                 break
+
+        if result:
+            all_results.append(result)
+        else:
+            all_results.append(f"\n## Search: '{query}'\n[No results found]")
 
-    return "\n".join(results) if results else "No results found"
+    return "\n".join(all_results) if all_results else "No results found"
 
+
+# =============================================================================
+# Page Visitor
+# =============================================================================
+
 def is_valid_url(url: str) -> bool:
     """Check if URL is valid and uses http/https scheme."""
     try:
         result = urlparse(url)
-        return all([result.scheme in ('http', 'https'), result.netloc])
+        return all([result.scheme in ("http", "https"), result.netloc])
     except Exception:
         return False
 
 
 def visit_page(url: str, goal: str) -> str:
-    """Fetch webpage content using Jina Reader (free tier) or direct fetch."""
+    """Fetch webpage content using Jina Reader or direct fetch."""
     if isinstance(url, list):
         url = url[0] if url else ""
 
@@ -184,7 +362,7 @@ def visit_page(url: str, goal: str) -> str:
     if not is_valid_url(url):
         return f"[Visit Error] Invalid URL: {url}. Must be a valid http/https URL."
 
-    # Try Jina Reader first (free tier available)
+    # Try Jina Reader first
     try:
         headers = {"Accept": "text/plain"}
         if JINA_API_KEY:
@@ -194,7 +372,7 @@ def visit_page(url: str, goal: str) -> str:
         response = requests.get(jina_url, headers=headers, timeout=30, allow_redirects=True)
 
         if response.status_code == 200 and len(response.text) > 100:
-            content = response.text[:12000]  # Increased limit for more context
+            content = response.text[:12000]
             return f"**Content from {url}** (goal: {goal}):\n\n{content}"
     except Exception:
         pass
@@ -206,10 +384,10 @@ def visit_page(url: str, goal: str) -> str:
         response.raise_for_status()
 
         text = response.text
-        text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
-        text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
-        text = re.sub(r'<[^>]+>', ' ', text)
-        text = re.sub(r'\s+', ' ', text).strip()
+        text = re.sub(r"<script[^>]*>.*?</script>", "", text, flags=re.DOTALL | re.IGNORECASE)
+        text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
+        text = re.sub(r"<[^>]+>", " ", text)
+        text = re.sub(r"\s+", " ", text).strip()
 
         if len(text) > 100:
             return f"**Content from {url}** (goal: {goal}):\n\n{text[:12000]}"
@@ -228,7 +406,7 @@ class LlamaCppClient:
     """Client for the llama.cpp OpenAI-compatible API."""
 
     def __init__(self, base_url: str = LLAMA_SERVER_URL):
-        self.base_url = base_url.rstrip('/')
+        self.base_url = base_url.rstrip("/")
         self.api_url = f"{self.base_url}/v1/chat/completions"
         self.session = requests.Session()
 
@@ -240,13 +418,15 @@ def check_server(self) -> bool:
         except Exception:
             return False
 
-    def generate(self, messages: List[Dict[str, str]],
-                 max_tokens: int = MAX_TOKENS,
-                 temperature: float = TEMPERATURE,
-                 top_p: float = TOP_P,
-                 stop: Optional[List[str]] = None) -> str:
+    def generate(
+        self,
+        messages: List[Dict[str, str]],
+        max_tokens: int = MAX_TOKENS,
+        temperature: float = TEMPERATURE,
+        top_p: float = TOP_P,
+        stop: Optional[List[str]] = None,
+    ) -> str:
         """Generate a response from the llama.cpp server."""
-
         payload = {
             "messages": messages,
             "max_tokens": max_tokens,
@@ -261,7 +441,7 @@ def generate(self, messages: List[Dict[str, str]],
                 self.api_url,
                 json=payload,
                 headers={"Content-Type": "application/json"},
-                timeout=300,  # 5 minute timeout for long generations
+                timeout=300,
             )
 
             if response.status_code != 200:
@@ -284,17 +464,22 @@ def generate(self, messages: List[Dict[str, str]],
 # Research Agent
 # =============================================================================
 
-def research(client: LlamaCppClient, question: str, verbose: bool = True,
-             max_rounds: int = MAX_ROUNDS, temperature: float = TEMPERATURE) -> dict:
+def research(
+    client: LlamaCppClient,
+    question: str,
+    verbose: bool = True,
+    max_rounds: int = MAX_ROUNDS,
+    temperature: float = TEMPERATURE,
+) -> dict:
     """Run the research agent loop."""
     if verbose:
-        print(f"\n🔍 Researching: {question}\n")
+        print(f"\n[*] Researching: {question}\n")
         print("-" * 60)
 
     system_prompt = get_system_prompt()
     messages: List[Dict[str, str]] = [
         {"role": "system", "content": system_prompt},
-        {"role": "user", "content": question}
+        {"role": "user", "content": question},
     ]
 
     sources: List[Dict[str, Any]] = []
@@ -306,7 +491,7 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
 
     for round_num in range(max_rounds):
         if verbose:
-            print(f"\n📝 Round {round_num + 1}/{max_rounds}")
+            print(f"\n[Round {round_num + 1}/{max_rounds}]")
 
         gen_start = time.time()
         content = client.generate(messages, temperature=temperature)
@@ -314,11 +499,11 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
 
         if content.startswith("[Error]") or content.startswith("[Server Error]"):
             if verbose:
-                print(f"  ❌ {content}")
+                print(f"  Error: {content}")
             break
 
         if verbose:
-            print(f"  ⏱️  Generated in {gen_time:.1f}s")
+            print(f"  Generated in {gen_time:.1f}s")
 
         messages.append({"role": "assistant", "content": content})
 
@@ -328,7 +513,7 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
             thinking.append(think_content)
             if verbose:
                 preview = think_content[:200] + "..." if len(think_content) > 200 else think_content
-                print(f"  💭 {preview}")
+                print(f"  Thinking: {preview}")
 
         # Check for final answer
         if "<answer>" in content:
@@ -341,11 +526,11 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
 
             if verbose:
                 print("\n" + "=" * 60)
-                print("✅ ANSWER:")
+                print("ANSWER:")
                 print("=" * 60)
                 print(answer.strip())
                 print("=" * 60)
-                print(f"\n📊 Stats: {round_num + 1} rounds, {len(sources)} sources, {elapsed:.1f}s")
+                print(f"\nStats: {round_num + 1} rounds, {len(sources)} sources, {elapsed:.1f}s")
 
             return {
                 "answer": answer.strip(),
@@ -364,7 +549,7 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
                 args = tool.get("arguments", {})
 
                 if verbose:
-                    print(f"  🔧 Tool: {name}")
+                    print(f"  Tool: {name}")
 
                 tool_error = False
 
@@ -373,9 +558,9 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
                     if isinstance(queries, str):
                         queries = [queries]
                     if verbose:
-                        print(f"     Searching: {queries}")
-                    tool_result = duckduckgo_search(queries)
-                    if "[Search Error" in tool_result:
+                        print(f"  Queries: {queries}")
+                    tool_result = multi_provider_search(queries)
+                    if "[No results" in tool_result or "error" in tool_result.lower():
                         tool_error = True
                     else:
                         sources.append({"type": "search", "queries": queries})
@@ -388,13 +573,13 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
 
                     if url in visited_urls:
                         if verbose:
-                            print(f"  ⚠️  Already visited: {url}")
+                            print(f"  [!] Already visited: {url}")
                         tool_result = f"[Already Visited] You already visited {url}. Use the information from your previous visit or try a different source."
                         tool_error = True
                     else:
                         visited_urls.add(url)
                         if verbose:
-                            print(f"     Visiting: {url[:60]}...")
+                            print(f"  Visiting: {url[:60]}...")
                         tool_result = visit_page(url, goal)
                         if "[Visit Error]" in tool_result:
                             tool_error = True
@@ -410,41 +595,41 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
                     consecutive_errors += 1
                     if consecutive_errors >= max_consecutive_errors:
                         if verbose:
-                            print(f"\n⚠️  {max_consecutive_errors} consecutive tool errors detected.")
+                            print(f"\n[!] {max_consecutive_errors} consecutive tool errors detected.")
                         messages.append({
                             "role": "user",
-                            "content": f"<tool_response>\n{tool_result}\n\n[System Notice] You have encountered {consecutive_errors} consecutive errors. Please provide your best answer now based on information gathered so far, or try a completely different approach.\n</tool_response>"
+                            "content": f"<tool_response>\n{tool_result}\n\n[System Notice] You have encountered {consecutive_errors} consecutive errors. Please provide your best answer now based on information gathered so far, or try a completely different approach.\n</tool_response>",
                         })
                         continue
                 else:
-                    consecutive_errors = 0  # Reset on success
+                    consecutive_errors = 0
 
                 # Inject tool response
                 messages.append({
-                    "role": "user",
-                    "content": f"<tool_response>\n{tool_result}\n</tool_response>"
+                    "role": "user",
+                    "content": f"<tool_response>\n{tool_result}\n</tool_response>",
                 })
 
             except json.JSONDecodeError as e:
                 consecutive_errors += 1
                 messages.append({
-                    "role": "user",
-                    "content": f"<tool_response>\nError: Invalid JSON in tool call: {e}\n</tool_response>"
+                    "role": "user",
+                    "content": f"<tool_response>\nError: Invalid JSON in tool call: {e}\n</tool_response>",
                 })
             except Exception as e:
                 consecutive_errors += 1
                 messages.append({
-                    "role": "user",
-                    "content": f"<tool_response>\nTool error: {e}\n</tool_response>"
+                    "role": "user",
+                    "content": f"<tool_response>\nTool error: {e}\n</tool_response>",
                 })
 
     # Force final answer after max rounds
     if verbose:
-        print("\n⚠️  Max rounds reached, requesting final answer...")
+        print("\n[!] Max rounds reached, requesting final answer...")
 
     messages.append({
         "role": "user",
-        "content": "You have reached the maximum number of research rounds. Please provide your final answer now based on all the information gathered. Use <answer></answer> tags."
+        "content": "You have reached the maximum number of research rounds. Please provide your final answer now based on all the information gathered. Use <answer></answer> tags.",
    })
 
     content = client.generate(messages, max_tokens=2048, temperature=temperature)
@@ -461,11 +646,11 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
 
     if verbose:
         print("\n" + "=" * 60)
-        print("✅ ANSWER:")
+        print("ANSWER:")
         print("=" * 60)
         print(answer.strip())
         print("=" * 60)
-        print(f"\n📊 Stats: {max_rounds} rounds (max), {len(sources)} sources, {elapsed:.1f}s")
+        print(f"\nStats: {max_rounds} rounds (max), {len(sources)} sources, {elapsed:.1f}s")
 
     return {
         "answer": answer.strip(),
@@ -480,6 +665,19 @@ def research(client: LlamaCppClient, question: str, verbose: bool = True,
 # Main
 # =============================================================================
 
+def get_available_providers() -> List[str]:
+    """Return list of available search providers."""
+    providers = []
+    if EXA_API_KEY:
+        providers.append("Exa")
+    if TAVILY_API_KEY:
+        providers.append("Tavily")
+    if SERPER_KEY:
+        providers.append("Serper")
+    providers.append("DuckDuckGo")
+    return providers
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="DeepResearch Interactive CLI - llama.cpp Server Edition",
@@ -497,25 +695,43 @@ def main():
 
     # Connect to a different server:
     python inference/interactive_llamacpp.py --server http://192.168.1.100:8080
-"""
+""",
+    )
+    parser.add_argument(
+        "--server",
+        type=str,
+        default=LLAMA_SERVER_URL,
+        help="llama.cpp server URL (default: http://127.0.0.1:8080)",
+    )
+    parser.add_argument(
+        "--query", "-q",
+        type=str,
+        default=None,
+        help="Single query mode - run one research query and exit",
+    )
+    parser.add_argument(
+        "--max-rounds",
+        type=int,
+        default=MAX_ROUNDS,
+        help=f"Maximum research rounds (default: {MAX_ROUNDS})",
+    )
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=TEMPERATURE,
+        help=f"Sampling temperature (default: {TEMPERATURE})",
     )
-    parser.add_argument("--server", type=str, default=LLAMA_SERVER_URL,
-                        help="llama.cpp server URL (default: http://127.0.0.1:8080)")
-    parser.add_argument("--query", "-q", type=str, default=None,
-                        help="Single query mode - run one research query and exit")
-    parser.add_argument("--max-rounds", type=int, default=MAX_ROUNDS,
-                        help=f"Maximum research rounds (default: {MAX_ROUNDS})")
-    parser.add_argument("--temperature", type=float, default=TEMPERATURE,
-                        help=f"Sampling temperature (default: {TEMPERATURE})")
     args = parser.parse_args()
 
+    providers = get_available_providers()
+
     print("\n" + "=" * 60)
-    print("🔬 DeepResearch - Interactive CLI")
-    print("   llama.cpp Server Edition (100% Local)")
+    print("DeepResearch - Interactive CLI")
+    print("llama.cpp Server Edition (100% Local)")
     print("=" * 60)
     print(f"Server: {args.server}")
-    print(f"Search: DuckDuckGo (free, no API key)")
-    print(f"Reader: Jina.ai {'✓' if JINA_API_KEY else '(free tier)'}")
+    print(f"Search: {', '.join(providers)}")
+    print(f"Reader: Jina.ai {'[configured]' if JINA_API_KEY else '[free tier]'}")
     print("=" * 60)
 
     # Initialize client
@@ -524,14 +740,14 @@ def main():
     # Check server connection
     print("\nConnecting to llama.cpp server...", end=" ")
     if not client.check_server():
-        print("❌ FAILED")
+        print("FAILED")
         print(f"\nError: Cannot connect to llama.cpp server at {args.server}")
         print("\nPlease start the server first:")
         print("  ./scripts/start_llama_server.sh")
         print("\nOr specify a different server URL:")
         print("  python inference/interactive_llamacpp.py --server http://your-server:8080")
         sys.exit(1)
-    print("✅ Connected!")
+    print("OK")
 
     # Single query mode
     if args.query:
@@ -543,23 +759,23 @@ def main():
 
     while True:
        try:
-            question = input("❓ Question: ").strip()
+            question = input("Question: ").strip()
 
             if not question:
                 continue
 
-            if question.lower() in ('quit', 'exit', 'q'):
-                print("\n👋 Goodbye!")
+            if question.lower() in ("quit", "exit", "q"):
+                print("\nGoodbye!")
                 break
 
             research(client, question, max_rounds=args.max_rounds, temperature=args.temperature)
             print("\n" + "-" * 60 + "\n")
 
         except KeyboardInterrupt:
-            print("\n\n👋 Goodbye!")
+            print("\n\nGoodbye!")
             break
         except Exception as e:
-            print(f"\n❌ Error: {e}\n")
+            print(f"\nError: {e}\n")
 
 
 if __name__ == "__main__":
diff --git a/inference/tool_search.py b/inference/tool_search.py
index 1a3f7b53..03229595 100644
--- a/inference/tool_search.py
+++ b/inference/tool_search.py
@@ -1,131 +1,393 @@
+"""
+Multi-Provider Web Search Tool
+==============================
+
+Implements a robust search fallback chain optimized for AI research:
+    1. Exa.ai     - Best semantic/neural search, $10 free credits
+    2. Tavily     - Purpose-built for RAG/LLMs, 1,000 free requests/month
+    3. Serper     - Google SERP results, 2,500 free queries
+    4. DuckDuckGo - Free forever, final fallback (no API key needed)
+
+Each provider is tried in order. If one fails (rate limit, error, no key),
+the next provider is attempted automatically.
+
+Environment Variables:
+    EXA_API_KEY    - Exa.ai API key (https://exa.ai/)
+    TAVILY_API_KEY - Tavily API key (https://tavily.com/)
+    SERPER_KEY_ID  - Serper API key (https://serper.dev/)
+
+If no API keys are set, DuckDuckGo is used as the default (free, no key needed).
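+
+Example (a sketch; requires qwen_agent installed and ideally one provider key):
+    from inference.tool_search import multi_provider_search
+    print(multi_provider_search("tongyi deepresearch benchmarks", num_results=5))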
+""" + +import http.client import json -from concurrent.futures import ThreadPoolExecutor -from typing import List, Union +import os +import time +from typing import Dict, List, Optional, Union + import requests from qwen_agent.tools.base import BaseTool, register_tool -import asyncio -from typing import Dict, List, Optional, Union -import uuid -import http.client -import json -import os +# API Keys from environment +EXA_API_KEY = os.environ.get("EXA_API_KEY", "") +TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "") +SERPER_KEY = os.environ.get("SERPER_KEY_ID", "") -SERPER_KEY=os.environ.get('SERPER_KEY_ID') +def contains_chinese(text: str) -> bool: + """Check if text contains Chinese characters.""" + return any("\u4E00" <= char <= "\u9FFF" for char in text) -@register_tool("search", allow_overwrite=True) -class Search(BaseTool): - name = "search" - description = "Performs batched web searches: supply an array 'query'; the tool retrieves the top 10 results for each query in one call." - parameters = { - "type": "object", - "properties": { - "query": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Array of query strings. Include multiple complementary search queries in a single call." + +# ============================================================================= +# Search Providers +# ============================================================================= + +def search_exa(query: str, num_results: int = 10) -> Optional[str]: + """ + Exa.ai - Neural/semantic search engine. + Best for finding conceptually relevant results, not just keyword matches. + """ + if not EXA_API_KEY: + return None + + try: + response = requests.post( + "https://api.exa.ai/search", + headers={ + "x-api-key": EXA_API_KEY, + "Content-Type": "application/json", }, - }, - "required": ["query"], - } + json={ + "query": query, + "numResults": num_results, + "useAutoprompt": True, + "type": "neural", + }, + timeout=30, + ) + + if response.status_code == 401: + print("[Exa] Invalid API key") + return None + if response.status_code == 429: + print("[Exa] Rate limited") + return None + if response.status_code != 200: + print(f"[Exa] Error {response.status_code}: {response.text[:200]}") + return None + + data = response.json() + results = data.get("results", []) + + if not results: + return None + + snippets = [] + for idx, r in enumerate(results, 1): + title = r.get("title", "No title") + url = r.get("url", "") + text = r.get("text", "")[:300] if r.get("text") else "" + published = r.get("publishedDate", "") + + snippet = f"{idx}. [{title}]({url})" + if published: + snippet += f"\nDate published: {published[:10]}" + if text: + snippet += f"\n{text}" + snippets.append(snippet) + + return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) + + except requests.Timeout: + print("[Exa] Request timeout") + return None + except Exception as e: + print(f"[Exa] Error: {e}") + return None - def __init__(self, cfg: Optional[dict] = None): - super().__init__(cfg) - def google_search_with_serp(self, query: str): - def contains_chinese_basic(text: str) -> bool: - return any('\u4E00' <= char <= '\u9FFF' for char in text) + +def search_tavily(query: str, num_results: int = 10) -> Optional[str]: + """ + Tavily - Search API designed specifically for RAG and LLM applications. + Returns AI-optimized snippets and supports advanced filtering. 
+ """ + if not TAVILY_API_KEY: + return None + + try: + response = requests.post( + "https://api.tavily.com/search", + headers={"Content-Type": "application/json"}, + json={ + "api_key": TAVILY_API_KEY, + "query": query, + "max_results": num_results, + "search_depth": "advanced", + "include_answer": False, + "include_raw_content": False, + }, + timeout=30, + ) + + if response.status_code == 401: + print("[Tavily] Invalid API key") + return None + if response.status_code == 429: + print("[Tavily] Rate limited") + return None + if response.status_code != 200: + print(f"[Tavily] Error {response.status_code}: {response.text[:200]}") + return None + + data = response.json() + results = data.get("results", []) + + if not results: + return None + + snippets = [] + for idx, r in enumerate(results, 1): + title = r.get("title", "No title") + url = r.get("url", "") + content = r.get("content", "")[:300] + score = r.get("score", 0) + + snippet = f"{idx}. [{title}]({url})" + if content: + snippet += f"\n{content}" + snippets.append(snippet) + + return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) + + except requests.Timeout: + print("[Tavily] Request timeout") + return None + except Exception as e: + print(f"[Tavily] Error: {e}") + return None + + +def search_serper(query: str, num_results: int = 10) -> Optional[str]: + """ + Serper - Google Search API (SERP results). + Fast and reliable Google search results. + """ + if not SERPER_KEY: + return None + + try: conn = http.client.HTTPSConnection("google.serper.dev") - if contains_chinese_basic(query): + + if contains_chinese(query): payload = json.dumps({ "q": query, "location": "China", "gl": "cn", - "hl": "zh-cn" + "hl": "zh-cn", + "num": num_results, }) - else: payload = json.dumps({ "q": query, "location": "United States", "gl": "us", - "hl": "en" + "hl": "en", + "num": num_results, }) - headers = { - 'X-API-KEY': SERPER_KEY, - 'Content-Type': 'application/json' - } + headers = { + "X-API-KEY": SERPER_KEY, + "Content-Type": "application/json", + } - for i in range(5): + res = None + for attempt in range(3): try: conn.request("POST", "/search", payload, headers) res = conn.getresponse() break except Exception as e: - print(e) - if i == 4: - return f"Google search Timeout, return None, Please try again later." + if attempt == 2: + print(f"[Serper] Connection error: {e}") + return None + time.sleep(1) continue + + if res is None: + return None + + data = json.loads(res.read().decode("utf-8")) + + if "error" in data: + print(f"[Serper] API error: {data['error']}") + return None + + if "organic" not in data: + return None + + snippets = [] + for idx, page in enumerate(data["organic"], 1): + title = page.get("title", "No title") + url = page.get("link", "") + snippet_text = page.get("snippet", "") + date = page.get("date", "") + source = page.get("source", "") + + result = f"{idx}. [{title}]({url})" + if date: + result += f"\nDate published: {date}" + if source: + result += f"\nSource: {source}" + if snippet_text: + result += f"\n{snippet_text}" + + result = result.replace("Your browser can't play this video.", "") + snippets.append(result) + + return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) - data = res.read() - results = json.loads(data.decode("utf-8")) + except Exception as e: + print(f"[Serper] Error: {e}") + return None - try: - if "organic" not in results: - raise Exception(f"No results found for query: '{query}'. 
Use a less specific query.") - web_snippets = list() - idx = 0 - if "organic" in results: - for page in results["organic"]: - idx += 1 - date_published = "" - if "date" in page: - date_published = "\nDate published: " + page["date"] +def search_duckduckgo(query: str, num_results: int = 10) -> Optional[str]: + """ + DuckDuckGo - Free search with no API key required. + Rate limited but reliable as a final fallback. + """ + try: + from duckduckgo_search import DDGS + from duckduckgo_search.exceptions import RatelimitException + except ImportError: + print("[DuckDuckGo] duckduckgo-search package not installed") + return None + + retries = 3 + for attempt in range(retries): + try: + with DDGS() as ddgs: + results = list(ddgs.text(query, max_results=num_results)) + + if not results: + return None + + snippets = [] + for idx, r in enumerate(results, 1): + title = r.get("title", "No title") + url = r.get("href", r.get("link", "")) + body = r.get("body", "")[:300] + + snippet = f"{idx}. [{title}]({url})" + if body: + snippet += f"\n{body}" + snippets.append(snippet) + + return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) + + except RatelimitException: + if attempt < retries - 1: + time.sleep(2 ** attempt) + continue + print("[DuckDuckGo] Rate limited after retries") + return None + except Exception as e: + print(f"[DuckDuckGo] Error: {e}") + return None + + return None - source = "" - if "source" in page: - source = "\nSource: " + page["source"] - snippet = "" - if "snippet" in page: - snippet = "\n" + page["snippet"] +# ============================================================================= +# Multi-Provider Search with Fallback +# ============================================================================= - redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{snippet}" - redacted_version = redacted_version.replace("Your browser can't play this video.", "") - web_snippets.append(redacted_version) +def multi_provider_search(query: str, num_results: int = 10) -> str: + """ + Search using multiple providers with automatic fallback. + + Provider priority (by quality): + 1. Exa.ai - Best semantic search + 2. Tavily - Purpose-built for LLMs + 3. Serper - Google SERP results + 4. DuckDuckGo - Free fallback + + Returns the first successful result or an error message. + """ + providers = [ + ("Exa", search_exa), + ("Tavily", search_tavily), + ("Serper", search_serper), + ("DuckDuckGo", search_duckduckgo), + ] + + errors = [] + + for name, search_fn in providers: + result = search_fn(query, num_results) + if result: + return result + errors.append(name) + + return f"No results found for '{query}'. All providers failed: {', '.join(errors)}. Try a different query." - content = f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n" + "\n\n".join(web_snippets) - return content - except: - return f"No results found for '{query}'. Try with a more general query." 
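+
+# Example (illustrative): the fallback chain can also be driven directly,
+# e.g. from a Python shell at the repo root:
+#
+#     from inference.tool_search import multi_provider_search
+#     print(multi_provider_search("llama.cpp Metal build flags", num_results=5))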
+# ============================================================================= +# Qwen Agent Tool Registration +# ============================================================================= +@register_tool("search", allow_overwrite=True) +class Search(BaseTool): + """Web search tool with multi-provider fallback.""" - def search_with_serp(self, query: str): - result = self.google_search_with_serp(query) - return result + name = "search" + description = "Performs batched web searches: supply an array 'query'; the tool retrieves the top 10 results for each query in one call." + parameters = { + "type": "object", + "properties": { + "query": { + "type": "array", + "items": {"type": "string"}, + "description": "Array of query strings. Include multiple complementary search queries in a single call.", + }, + }, + "required": ["query"], + } + + def __init__(self, cfg: Optional[dict] = None): + super().__init__(cfg) + + # Log which providers are available + available = [] + if EXA_API_KEY: + available.append("Exa") + if TAVILY_API_KEY: + available.append("Tavily") + if SERPER_KEY: + available.append("Serper") + available.append("DuckDuckGo") + + print(f"[Search] Available providers: {', '.join(available)}") def call(self, params: Union[str, dict], **kwargs) -> str: - try: - query = params["query"] - except: + if isinstance(params, str): + return "[Search] Invalid request format: Input must be a JSON object containing 'query' field" + + params_dict: dict = params + query = params_dict.get("query") + if query is None: return "[Search] Invalid request format: Input must be a JSON object containing 'query' field" if isinstance(query, str): - # 单个查询 - response = self.search_with_serp(query) - else: - # 多个查询 - assert isinstance(query, List) - responses = [] - for q in query: - responses.append(self.search_with_serp(q)) - response = "\n=======\n".join(responses) - - return response - + return multi_provider_search(query) + + if not isinstance(query, list): + return "[Search] Invalid query format: 'query' must be a string or array of strings" + + responses = [] + for q in query: + responses.append(multi_provider_search(q)) + + return "\n=======\n".join(responses) From ca5e84af4bfe44f29ca2aaab63c711f4ed41816d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Chindri=C8=99=20Mihai=20Alexandru?= <12643176+chindris-mihai-alexandru@users.noreply.github.com> Date: Fri, 28 Nov 2025 16:59:55 +0200 Subject: [PATCH 3/5] Improve search providers with Tavily support and robust error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Tavily as 2nd provider (Exa → Tavily → Serper → DuckDuckGo) - Use Bearer token auth for Tavily (per API docs) - Add quota error handling (Exa 402, Tavily 432/433) - Add sanitize_query() for input validation - Switch Serper from http.client to requests - Handle ConnectionError, JSONDecodeError, DuckDuckGoSearchException - Use Exa type: auto for better search quality --- inference/interactive_llamacpp.py | 187 ++++++++++---- inference/tool_search.py | 417 ++++++++++++++++++++---------- 2 files changed, 419 insertions(+), 185 deletions(-) diff --git a/inference/interactive_llamacpp.py b/inference/interactive_llamacpp.py index 34471aba..0bae1aed 100644 --- a/inference/interactive_llamacpp.py +++ b/inference/interactive_llamacpp.py @@ -37,7 +37,6 @@ """ import argparse -import http.client import json import os import re @@ -137,14 +136,30 @@ def get_system_prompt() -> str: def contains_chinese(text: str) -> bool: """Check if text contains Chinese 
characters.""" + if not text: + return False return any("\u4E00" <= char <= "\u9FFF" for char in text) +def sanitize_query(query: str) -> str: + """Sanitize and validate a search query.""" + if not query: + return "" + return query.strip()[:500] + + def search_exa(query: str, num_results: int = 10) -> Optional[str]: - """Exa.ai - Neural/semantic search engine.""" + """ + Exa.ai - Neural/semantic search engine. + API Docs: https://docs.exa.ai/reference/search + """ if not EXA_API_KEY: return None + query = sanitize_query(query) + if not query: + return None + try: response = requests.post( "https://api.exa.ai/search", @@ -154,14 +169,22 @@ def search_exa(query: str, num_results: int = 10) -> Optional[str]: }, json={ "query": query, - "numResults": num_results, - "useAutoprompt": True, - "type": "neural", + "numResults": min(num_results, 100), + "type": "auto", # Let Exa choose best search type }, timeout=30, ) - if response.status_code in (401, 429) or response.status_code != 200: + if response.status_code == 401: + print("[Exa] Invalid or expired API key") + return None + if response.status_code == 429: + print("[Exa] Rate limited") + return None + if response.status_code == 402: + print("[Exa] Payment required - credits exhausted") + return None + if response.status_code != 200: return None data = response.json() @@ -171,7 +194,7 @@ def search_exa(query: str, num_results: int = 10) -> Optional[str]: output = [f"\n## Search: '{query}'\n"] for idx, r in enumerate(results, 1): - title = r.get("title", "No title") + title = r.get("title") or "No title" url = r.get("url", "") text = r.get("text", "")[:300] if r.get("text") else "" output.append(f"{idx}. [{title}]({url})") @@ -179,29 +202,60 @@ def search_exa(query: str, num_results: int = 10) -> Optional[str]: output.append(f" {text}...") return "\n".join(output) - except Exception: + except requests.Timeout: + print("[Exa] Request timeout") + return None + except requests.ConnectionError: + print("[Exa] Connection error") + return None + except Exception as e: + print(f"[Exa] Error: {e}") return None def search_tavily(query: str, num_results: int = 10) -> Optional[str]: - """Tavily - Search API for RAG/LLM applications.""" + """ + Tavily - Search API for RAG/LLM applications. 
+ API Docs: https://docs.tavily.com/documentation/api-reference/endpoint/search + """ if not TAVILY_API_KEY: return None + query = sanitize_query(query) + if not query: + return None + try: + # Use Bearer token auth (preferred over api_key in body) response = requests.post( "https://api.tavily.com/search", - headers={"Content-Type": "application/json"}, + headers={ + "Authorization": f"Bearer {TAVILY_API_KEY}", + "Content-Type": "application/json", + }, json={ - "api_key": TAVILY_API_KEY, "query": query, - "max_results": num_results, - "search_depth": "advanced", + "max_results": min(num_results, 20), + "search_depth": "basic", # Use basic (1 credit) vs advanced (2 credits) + "include_answer": False, + "include_raw_content": False, }, timeout=30, ) - if response.status_code in (401, 429) or response.status_code != 200: + if response.status_code == 401: + print("[Tavily] Invalid or expired API key") + return None + if response.status_code == 429: + print("[Tavily] Rate limited") + return None + if response.status_code == 432: + print("[Tavily] Plan limit exceeded") + return None + if response.status_code == 433: + print("[Tavily] Pay-as-you-go limit exceeded") + return None + if response.status_code != 200: return None data = response.json() @@ -211,7 +265,7 @@ def search_tavily(query: str, num_results: int = 10) -> Optional[str]: output = [f"\n## Search: '{query}'\n"] for idx, r in enumerate(results, 1): - title = r.get("title", "No title") + title = r.get("title") or "No title" url = r.get("url", "") content = r.get("content", "")[:300] output.append(f"{idx}. [{title}]({url})") @@ -219,58 +273,81 @@ def search_tavily(query: str, num_results: int = 10) -> Optional[str]: output.append(f" {content}...") return "\n".join(output) - except Exception: + except requests.Timeout: + print("[Tavily] Request timeout") + return None + except requests.ConnectionError: + print("[Tavily] Connection error") + return None + except Exception as e: + print(f"[Tavily] Error: {e}") return None def search_serper(query: str, num_results: int = 10) -> Optional[str]: - """Serper - Google Search API.""" + """ + Serper - Google Search API. 
+ API Docs: https://serper.dev/ + """ if not SERPER_KEY: return None + query = sanitize_query(query) + if not query: + return None + try: - conn = http.client.HTTPSConnection("google.serper.dev") - if contains_chinese(query): - payload = json.dumps({ - "q": query, "location": "China", "gl": "cn", "hl": "zh-cn", "num": num_results - }) + payload = {"q": query, "gl": "cn", "hl": "zh-cn", "num": min(num_results, 100)} else: - payload = json.dumps({ - "q": query, "location": "United States", "gl": "us", "hl": "en", "num": num_results - }) + payload = {"q": query, "gl": "us", "hl": "en", "num": min(num_results, 100)} - headers = {"X-API-KEY": SERPER_KEY, "Content-Type": "application/json"} + response = requests.post( + "https://google.serper.dev/search", + headers={ + "X-API-KEY": SERPER_KEY, + "Content-Type": "application/json", + }, + json=payload, + timeout=30, + ) - res = None - for attempt in range(3): - try: - conn.request("POST", "/search", payload, headers) - res = conn.getresponse() - break - except Exception: - if attempt == 2: - return None - time.sleep(1) + if response.status_code == 401: + print("[Serper] Invalid API key") + return None + if response.status_code == 429: + print("[Serper] Rate limited") + return None + if response.status_code != 200: + return None - if res is None: + data = response.json() + if "error" in data: + print(f"[Serper] API error: {data['error']}") return None - data = json.loads(res.read().decode("utf-8")) - if "organic" not in data: + organic = data.get("organic", []) + if not organic: return None output = [f"\n## Search: '{query}'\n"] - for idx, page in enumerate(data["organic"], 1): - title = page.get("title", "No title") + for idx, page in enumerate(organic, 1): + title = page.get("title") or "No title" url = page.get("link", "") - snippet = page.get("snippet", "")[:300] + snippet = page.get("snippet", "")[:300].replace("Your browser can't play this video.", "").strip() output.append(f"{idx}. [{title}]({url})") if snippet: output.append(f" {snippet}...") return "\n".join(output) - except Exception: + except requests.Timeout: + print("[Serper] Request timeout") + return None + except requests.ConnectionError: + print("[Serper] Connection error") + return None + except Exception as e: + print(f"[Serper] Error: {e}") return None @@ -278,23 +355,28 @@ def search_duckduckgo(query: str, num_results: int = 10) -> Optional[str]: """DuckDuckGo - Free search, no API key required.""" try: from duckduckgo_search import DDGS - from duckduckgo_search.exceptions import RatelimitException + from duckduckgo_search.exceptions import RatelimitException, DuckDuckGoSearchException except ImportError: + print("[DuckDuckGo] duckduckgo-search package not installed") + return None + + query = sanitize_query(query) + if not query: return None retries = 3 for attempt in range(retries): try: with DDGS() as ddgs: - results = list(ddgs.text(query, max_results=num_results)) + results = list(ddgs.text(query, max_results=min(num_results, 25))) if not results: return None output = [f"\n## Search: '{query}'\n"] for idx, r in enumerate(results, 1): - title = r.get("title", "No title") - url = r.get("href", r.get("link", "")) + title = r.get("title") or "No title" + url = r.get("href") or r.get("link", "") body = r.get("body", "")[:300] output.append(f"{idx}. 
[{title}]({url})") if body: @@ -303,10 +385,17 @@ def search_duckduckgo(query: str, num_results: int = 10) -> Optional[str]: return "\n".join(output) except RatelimitException: if attempt < retries - 1: - time.sleep(2 ** attempt) + wait = 2 ** attempt + print(f"[DuckDuckGo] Rate limited, waiting {wait}s...") + time.sleep(wait) continue + print("[DuckDuckGo] Rate limited after all retries") return None - except Exception: + except DuckDuckGoSearchException as e: + print(f"[DuckDuckGo] Search error: {e}") + return None + except Exception as e: + print(f"[DuckDuckGo] Error: {e}") return None return None diff --git a/inference/tool_search.py b/inference/tool_search.py index 03229595..15f21063 100644 --- a/inference/tool_search.py +++ b/inference/tool_search.py @@ -3,10 +3,10 @@ ============================== Implements a robust search fallback chain optimized for AI research: - 1. Exa.ai - Best semantic/neural search, $10 free credits - 2. Tavily - Purpose-built for RAG/LLMs, 1,000 free requests/month - 3. Serper - Google SERP results, 2,500 free queries - 4. DuckDuckGo - Free forever, final fallback (no API key needed) + 1. Exa.ai - Best semantic/neural search ($10 free credits) + 2. Tavily - Purpose-built for RAG/LLMs (1,000 free requests/month) + 3. Serper - Google SERP results (2,500 free queries) + 4. DuckDuckGo - Free forever, final fallback (no API key needed) Each provider is tried in order. If one fails (rate limit, error, no key), the next provider is attempted automatically. @@ -23,23 +23,61 @@ import json import os import time -from typing import Dict, List, Optional, Union +from typing import List, Optional, Union import requests from qwen_agent.tools.base import BaseTool, register_tool # API Keys from environment -EXA_API_KEY = os.environ.get("EXA_API_KEY", "") -TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "") -SERPER_KEY = os.environ.get("SERPER_KEY_ID", "") +EXA_API_KEY = os.environ.get("EXA_API_KEY", "").strip() +TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "").strip() +SERPER_KEY = os.environ.get("SERPER_KEY_ID", "").strip() + +# Request timeouts (seconds) +REQUEST_TIMEOUT = 30 def contains_chinese(text: str) -> bool: """Check if text contains Chinese characters.""" + if not text: + return False return any("\u4E00" <= char <= "\u9FFF" for char in text) +def sanitize_query(query: str) -> str: + """Sanitize and validate a search query.""" + if not query: + return "" + # Strip whitespace and limit length + query = query.strip()[:500] + return query + + +def format_results(query: str, results: List[dict], provider: str) -> str: + """Format search results into a consistent markdown format.""" + if not results: + return "" + + snippets = [] + for idx, r in enumerate(results, 1): + title = r.get("title", "No title") + url = r.get("url", "") + snippet = r.get("snippet", "") + date = r.get("date", "") + + # Build result entry + entry = f"{idx}. [{title}]({url})" + if date: + entry += f"\nDate: {date}" + if snippet: + entry += f"\n{snippet}" + snippets.append(entry) + + header = f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + return header + "\n\n".join(snippets) + + # ============================================================================= # Search Providers # ============================================================================= @@ -48,10 +86,16 @@ def search_exa(query: str, num_results: int = 10) -> Optional[str]: """ Exa.ai - Neural/semantic search engine. 
Best for finding conceptually relevant results, not just keyword matches. + + API Docs: https://docs.exa.ai/reference/search """ if not EXA_API_KEY: return None + query = sanitize_query(query) + if not query: + return None + try: response = requests.post( "https://api.exa.ai/search", @@ -61,50 +105,65 @@ def search_exa(query: str, num_results: int = 10) -> Optional[str]: }, json={ "query": query, - "numResults": num_results, - "useAutoprompt": True, - "type": "neural", + "numResults": min(num_results, 100), # API max is 100 + "type": "auto", # Let Exa choose best search type }, - timeout=30, + timeout=REQUEST_TIMEOUT, ) + # Handle error responses if response.status_code == 401: - print("[Exa] Invalid API key") + print("[Exa] Invalid or expired API key") return None if response.status_code == 429: - print("[Exa] Rate limited") + print("[Exa] Rate limited - too many requests") + return None + if response.status_code == 402: + print("[Exa] Payment required - credits exhausted") return None if response.status_code != 200: - print(f"[Exa] Error {response.status_code}: {response.text[:200]}") + error_msg = response.text[:200] if response.text else "Unknown error" + print(f"[Exa] Error {response.status_code}: {error_msg}") return None data = response.json() - results = data.get("results", []) + api_results = data.get("results", []) - if not results: + if not api_results: return None - snippets = [] - for idx, r in enumerate(results, 1): - title = r.get("title", "No title") + # Normalize results + results = [] + for r in api_results: + title = r.get("title") or "No title" url = r.get("url", "") - text = r.get("text", "")[:300] if r.get("text") else "" + text = r.get("text", "") published = r.get("publishedDate", "") - snippet = f"{idx}. [{title}]({url})" - if published: - snippet += f"\nDate published: {published[:10]}" - if text: - snippet += f"\n{text}" - snippets.append(snippet) + # Truncate text for snippet + snippet = text[:300] + "..." if len(text) > 300 else text + date = published[:10] if published else "" + + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": date, + }) - return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) + return format_results(query, results, "Exa") except requests.Timeout: print("[Exa] Request timeout") return None + except requests.ConnectionError: + print("[Exa] Connection error - network issue") + return None + except json.JSONDecodeError: + print("[Exa] Invalid JSON response") + return None except Exception as e: - print(f"[Exa] Error: {e}") + print(f"[Exa] Unexpected error: {type(e).__name__}: {e}") return None @@ -112,60 +171,89 @@ def search_tavily(query: str, num_results: int = 10) -> Optional[str]: """ Tavily - Search API designed specifically for RAG and LLM applications. Returns AI-optimized snippets and supports advanced filtering. 
+ + API Docs: https://docs.tavily.com/documentation/api-reference/endpoint/search """ if not TAVILY_API_KEY: return None + query = sanitize_query(query) + if not query: + return None + try: + # Tavily supports both Bearer token and api_key in body + # Using Bearer token as it's more standard response = requests.post( "https://api.tavily.com/search", - headers={"Content-Type": "application/json"}, + headers={ + "Authorization": f"Bearer {TAVILY_API_KEY}", + "Content-Type": "application/json", + }, json={ - "api_key": TAVILY_API_KEY, "query": query, - "max_results": num_results, - "search_depth": "advanced", + "max_results": min(num_results, 20), # API max is 20 + "search_depth": "basic", # Use basic (1 credit) vs advanced (2 credits) "include_answer": False, "include_raw_content": False, }, - timeout=30, + timeout=REQUEST_TIMEOUT, ) + # Handle error responses if response.status_code == 401: - print("[Tavily] Invalid API key") + print("[Tavily] Invalid or expired API key") return None if response.status_code == 429: - print("[Tavily] Rate limited") + print("[Tavily] Rate limited - too many requests") + return None + if response.status_code == 432: + print("[Tavily] Plan limit exceeded - upgrade required") + return None + if response.status_code == 433: + print("[Tavily] Pay-as-you-go limit exceeded") return None if response.status_code != 200: - print(f"[Tavily] Error {response.status_code}: {response.text[:200]}") + error_msg = response.text[:200] if response.text else "Unknown error" + print(f"[Tavily] Error {response.status_code}: {error_msg}") return None data = response.json() - results = data.get("results", []) + api_results = data.get("results", []) - if not results: + if not api_results: return None - snippets = [] - for idx, r in enumerate(results, 1): - title = r.get("title", "No title") + # Normalize results + results = [] + for r in api_results: + title = r.get("title") or "No title" url = r.get("url", "") - content = r.get("content", "")[:300] - score = r.get("score", 0) + content = r.get("content", "") - snippet = f"{idx}. [{title}]({url})" - if content: - snippet += f"\n{content}" - snippets.append(snippet) + # Truncate content for snippet + snippet = content[:300] + "..." if len(content) > 300 else content + + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": "", + }) - return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) + return format_results(query, results, "Tavily") except requests.Timeout: print("[Tavily] Request timeout") return None + except requests.ConnectionError: + print("[Tavily] Connection error - network issue") + return None + except json.JSONDecodeError: + print("[Tavily] Invalid JSON response") + return None except Exception as e: - print(f"[Tavily] Error: {e}") + print(f"[Tavily] Unexpected error: {type(e).__name__}: {e}") return None @@ -173,83 +261,98 @@ def search_serper(query: str, num_results: int = 10) -> Optional[str]: """ Serper - Google Search API (SERP results). Fast and reliable Google search results. 
+ + API Docs: https://serper.dev/ """ if not SERPER_KEY: return None + query = sanitize_query(query) + if not query: + return None + try: - conn = http.client.HTTPSConnection("google.serper.dev") - + # Determine locale based on query content if contains_chinese(query): - payload = json.dumps({ + payload = { "q": query, - "location": "China", "gl": "cn", "hl": "zh-cn", - "num": num_results, - }) + "num": min(num_results, 100), + } else: - payload = json.dumps({ + payload = { "q": query, - "location": "United States", "gl": "us", "hl": "en", - "num": num_results, - }) + "num": min(num_results, 100), + } - headers = { - "X-API-KEY": SERPER_KEY, - "Content-Type": "application/json", - } - - res = None - for attempt in range(3): - try: - conn.request("POST", "/search", payload, headers) - res = conn.getresponse() - break - except Exception as e: - if attempt == 2: - print(f"[Serper] Connection error: {e}") - return None - time.sleep(1) - continue + # Use requests instead of http.client for consistency and better error handling + response = requests.post( + "https://google.serper.dev/search", + headers={ + "X-API-KEY": SERPER_KEY, + "Content-Type": "application/json", + }, + json=payload, + timeout=REQUEST_TIMEOUT, + ) - if res is None: + # Handle error responses + if response.status_code == 401: + print("[Serper] Invalid API key") + return None + if response.status_code == 429: + print("[Serper] Rate limited") + return None + if response.status_code != 200: + error_msg = response.text[:200] if response.text else "Unknown error" + print(f"[Serper] Error {response.status_code}: {error_msg}") return None - data = json.loads(res.read().decode("utf-8")) + data = response.json() + # Check for API-level errors if "error" in data: print(f"[Serper] API error: {data['error']}") return None - if "organic" not in data: + organic = data.get("organic", []) + if not organic: return None - snippets = [] - for idx, page in enumerate(data["organic"], 1): - title = page.get("title", "No title") + # Normalize results + results = [] + for page in organic: + title = page.get("title") or "No title" url = page.get("link", "") snippet_text = page.get("snippet", "") date = page.get("date", "") - source = page.get("source", "") - result = f"{idx}. 
[{title}]({url})" - if date: - result += f"\nDate published: {date}" - if source: - result += f"\nSource: {source}" - if snippet_text: - result += f"\n{snippet_text}" + # Clean up snippet + snippet = snippet_text.replace("Your browser can't play this video.", "").strip() - result = result.replace("Your browser can't play this video.", "") - snippets.append(result) + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": date, + }) - return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) + return format_results(query, results, "Serper") + except requests.Timeout: + print("[Serper] Request timeout") + return None + except requests.ConnectionError: + print("[Serper] Connection error - network issue") + return None + except json.JSONDecodeError: + print("[Serper] Invalid JSON response") + return None except Exception as e: - print(f"[Serper] Error: {e}") + print(f"[Serper] Unexpected error: {type(e).__name__}: {e}") return None @@ -260,41 +363,56 @@ def search_duckduckgo(query: str, num_results: int = 10) -> Optional[str]: """ try: from duckduckgo_search import DDGS - from duckduckgo_search.exceptions import RatelimitException + from duckduckgo_search.exceptions import RatelimitException, DuckDuckGoSearchException except ImportError: - print("[DuckDuckGo] duckduckgo-search package not installed") + print("[DuckDuckGo] duckduckgo-search package not installed. Run: pip install duckduckgo-search") return None - retries = 3 - for attempt in range(retries): + query = sanitize_query(query) + if not query: + return None + + max_retries = 3 + for attempt in range(max_retries): try: with DDGS() as ddgs: - results = list(ddgs.text(query, max_results=num_results)) + api_results = list(ddgs.text(query, max_results=min(num_results, 25))) - if not results: + if not api_results: return None - snippets = [] - for idx, r in enumerate(results, 1): - title = r.get("title", "No title") - url = r.get("href", r.get("link", "")) - body = r.get("body", "")[:300] + # Normalize results + results = [] + for r in api_results: + title = r.get("title") or "No title" + url = r.get("href") or r.get("link", "") + body = r.get("body", "") - snippet = f"{idx}. [{title}]({url})" - if body: - snippet += f"\n{body}" - snippets.append(snippet) + # Truncate body for snippet + snippet = body[:300] + "..." if len(body) > 300 else body + + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": "", + }) - return f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + "\n\n".join(snippets) + return format_results(query, results, "DuckDuckGo") except RatelimitException: - if attempt < retries - 1: - time.sleep(2 ** attempt) + if attempt < max_retries - 1: + wait_time = 2 ** attempt # Exponential backoff: 1s, 2s, 4s + print(f"[DuckDuckGo] Rate limited, waiting {wait_time}s...") + time.sleep(wait_time) continue - print("[DuckDuckGo] Rate limited after retries") + print("[DuckDuckGo] Rate limited after all retries") + return None + except DuckDuckGoSearchException as e: + print(f"[DuckDuckGo] Search error: {e}") return None except Exception as e: - print(f"[DuckDuckGo] Error: {e}") + print(f"[DuckDuckGo] Unexpected error: {type(e).__name__}: {e}") return None return None @@ -309,13 +427,18 @@ def multi_provider_search(query: str, num_results: int = 10) -> str: Search using multiple providers with automatic fallback. Provider priority (by quality): - 1. Exa.ai - Best semantic search - 2. 
Tavily - Purpose-built for LLMs - 3. Serper - Google SERP results - 4. DuckDuckGo - Free fallback + 1. Exa.ai - Best semantic search + 2. Tavily - Purpose-built for LLMs + 3. Serper - Google SERP results + 4. DuckDuckGo - Free fallback Returns the first successful result or an error message. """ + # Validate query + query = sanitize_query(query) + if not query: + return "[Search] Empty query provided. Please provide a search term." + providers = [ ("Exa", search_exa), ("Tavily", search_tavily), @@ -323,15 +446,16 @@ def multi_provider_search(query: str, num_results: int = 10) -> str: ("DuckDuckGo", search_duckduckgo), ] - errors = [] + failed_providers = [] for name, search_fn in providers: result = search_fn(query, num_results) if result: return result - errors.append(name) + failed_providers.append(name) - return f"No results found for '{query}'. All providers failed: {', '.join(errors)}. Try a different query." + # All providers failed + return f"No results found for '{query}'. Providers attempted: {', '.join(failed_providers)}. Try a different or simpler query." # ============================================================================= @@ -359,7 +483,7 @@ class Search(BaseTool): def __init__(self, cfg: Optional[dict] = None): super().__init__(cfg) - # Log which providers are available + # Log which providers are available at initialization available = [] if EXA_API_KEY: available.append("Exa") @@ -367,27 +491,48 @@ def __init__(self, cfg: Optional[dict] = None): available.append("Tavily") if SERPER_KEY: available.append("Serper") - available.append("DuckDuckGo") + available.append("DuckDuckGo") # Always available - print(f"[Search] Available providers: {', '.join(available)}") + print(f"[Search] Initialized with providers: {', '.join(available)}") def call(self, params: Union[str, dict], **kwargs) -> str: + # Handle string input (invalid) if isinstance(params, str): - return "[Search] Invalid request format: Input must be a JSON object containing 'query' field" + return "[Search] Invalid request: Input must be a JSON object with 'query' field, not a string." + + # Handle None or non-dict + if not isinstance(params, dict): + return "[Search] Invalid request: Input must be a JSON object with 'query' field." - params_dict: dict = params - query = params_dict.get("query") + query = params.get("query") + + # Handle missing query if query is None: - return "[Search] Invalid request format: Input must be a JSON object containing 'query' field" + return "[Search] Missing 'query' field in request." + # Handle single string query if isinstance(query, str): + query = query.strip() + if not query: + return "[Search] Empty query string provided." return multi_provider_search(query) - if not isinstance(query, list): - return "[Search] Invalid query format: 'query' must be a string or array of strings" - - responses = [] - for q in query: - responses.append(multi_provider_search(q)) + # Handle list of queries + if isinstance(query, list): + if not query: + return "[Search] Empty query list provided." + + # Filter out empty strings + valid_queries = [q.strip() for q in query if isinstance(q, str) and q.strip()] + + if not valid_queries: + return "[Search] No valid queries in list (all empty or non-string)." + + responses = [] + for q in valid_queries: + responses.append(multi_provider_search(q)) + + return "\n=======\n".join(responses) - return "\n=======\n".join(responses) + # Invalid query type + return f"[Search] Invalid 'query' type: expected string or array, got {type(query).__name__}." 
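For reference, the rewritten `call()` accepts a JSON object whose `query` field is a string or an array of strings; anything else yields a descriptive error string rather than an exception. A minimal sketch of exercising the registered tool directly (assumes `qwen_agent` plus at least the `duckduckgo-search` fallback are installed; queries are illustrative):

```python
from inference.tool_search import Search

tool = Search()  # logs the available providers on init

# Batched queries: each one runs through Exa -> Tavily -> Serper -> DuckDuckGo.
print(tool.call({"query": ["tongyi deepresearch 30b gguf", "llama.cpp flash attention"]}))

# Malformed inputs come back as error strings, never exceptions:
print(tool.call("plain string"))   # "[Search] Invalid request: ..."
print(tool.call({"query": []}))    # "[Search] Empty query list provided."
```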
From 722acdb6a8d644f164215d49d39b878d9f6dab6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Chindri=C8=99=20Mihai=20Alexandru?= <12643176+chindris-mihai-alexandru@users.noreply.github.com> Date: Fri, 28 Nov 2025 17:22:51 +0200 Subject: [PATCH 4/5] Fix memory issues: disable mlock by default, reduce context to 16K - Change default context size from 32K to 16K (saves ~8GB RAM) - Disable --mlock by default (prevents locking 18GB model in wired memory) - Add --mlock flag to opt-in if needed for performance - Add --low-memory flag for constrained systems (8K context) - Display mlock status in configuration output This prevents the system from running out of memory when running llama-server alongside other apps like Firefox. --- scripts/start_llama_server.sh | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/scripts/start_llama_server.sh b/scripts/start_llama_server.sh index 1ec44511..ae7fed68 100755 --- a/scripts/start_llama_server.sh +++ b/scripts/start_llama_server.sh @@ -32,15 +32,16 @@ PROJECT_DIR="$(dirname "$SCRIPT_DIR")" LLAMA_SERVER="$PROJECT_DIR/llama.cpp/build/bin/llama-server" MODEL_PATH="$PROJECT_DIR/models/gguf/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf" -# Default settings (optimized for Apple Silicon with 32GB+ RAM) +# Default settings (optimized for Apple Silicon with 32GB RAM) PORT=${PORT:-8080} HOST=${HOST:-127.0.0.1} -CTX_SIZE=${CTX_SIZE:-32768} # 32K context for long research sessions +CTX_SIZE=${CTX_SIZE:-16384} # 16K context (use --ctx 32768 for longer sessions) GPU_LAYERS=${GPU_LAYERS:-99} # Offload all layers to Metal THREADS=${THREADS:-8} # CPU threads for non-GPU ops PARALLEL=${PARALLEL:-1} # Parallel request slots BATCH_SIZE=${BATCH_SIZE:-512} # Batch size for prompt processing WEBUI=${WEBUI:-true} # Enable web UI by default +MLOCK=${MLOCK:-false} # Don't lock model in RAM (saves memory for other apps) # Colors RED='\033[0;31m' @@ -104,16 +105,28 @@ while [[ $# -gt 0 ]]; do WEBUI=true shift ;; + --mlock) + MLOCK=true + shift + ;; + --low-memory) + # Low memory mode: smaller context, no mlock + CTX_SIZE=8192 + MLOCK=false + shift + ;; -h|--help) echo "Usage: $0 [options]" echo "" echo "Options:" echo " --port N Port number (default: 8080)" - echo " --ctx N Context size (default: 32768)" + echo " --ctx N Context size (default: 16384)" echo " --threads N CPU threads (default: 8)" echo " --parallel N Parallel requests (default: 1)" echo " --webui Enable web UI (default)" echo " --no-webui Disable web UI, API only" + echo " --mlock Lock model in RAM (uses more memory but faster)" + echo " --low-memory Low memory mode: 8K context, no mlock" echo " -h, --help Show this help" echo "" echo "Access points:" @@ -128,7 +141,6 @@ while [[ $# -gt 0 ]]; do esac done -# Display configuration echo -e "${GREEN}Configuration:${NC}" echo " Model: $(basename "$MODEL_PATH")" echo " Size: $(du -h "$MODEL_PATH" | cut -f1)" @@ -136,6 +148,7 @@ echo " Context: $CTX_SIZE tokens" echo " GPU: Metal (all $GPU_LAYERS layers)" echo " Threads: $THREADS" echo " Parallel: $PARALLEL slots" +echo " Mlock: $MLOCK" echo " Web UI: $WEBUI" echo " Endpoint: http://$HOST:$PORT" echo "" @@ -172,12 +185,16 @@ SERVER_ARGS=( --parallel "$PARALLEL" --batch-size "$BATCH_SIZE" --flash-attn auto - --mlock --metrics --log-disable ) # Note: --jinja is enabled by default in recent llama.cpp versions +# Add mlock if requested (uses more memory but may be faster) +if [ "$MLOCK" = "true" ]; then + SERVER_ARGS+=(--mlock) +fi + # Add no-webui flag if requested if [ "$WEBUI" = 
"false" ]; then SERVER_ARGS+=(--no-webui) From b8422300608dc80f034f12a8ce4d85fccec260ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Chindri=C8=99=20Mihai=20Alexandru?= <12643176+chindris-mihai-alexandru@users.noreply.github.com> Date: Fri, 28 Nov 2025 21:14:41 +0200 Subject: [PATCH 5/5] Improve llama.cpp usability: reduce default rounds and add model alias - Reduce MAX_ROUNDS from 30 to 10 for faster, more practical research queries - Add --alias flag to llama-server for cleaner model naming (fixes Open WebUI compatibility) - Better UX when using with web UIs like Open WebUI that expect short model names --- inference/interactive_llamacpp.py | 2 +- scripts/start_llama_server.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/inference/interactive_llamacpp.py b/inference/interactive_llamacpp.py index 0bae1aed..b12fcb53 100644 --- a/inference/interactive_llamacpp.py +++ b/inference/interactive_llamacpp.py @@ -67,7 +67,7 @@ TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "") SERPER_KEY = os.environ.get("SERPER_KEY_ID", "") -MAX_ROUNDS = 30 +MAX_ROUNDS = 10 MAX_TOKENS = 4096 TEMPERATURE = 0.7 TOP_P = 0.95 diff --git a/scripts/start_llama_server.sh b/scripts/start_llama_server.sh index ae7fed68..4b42ce9b 100755 --- a/scripts/start_llama_server.sh +++ b/scripts/start_llama_server.sh @@ -187,6 +187,7 @@ SERVER_ARGS=( --flash-attn auto --metrics --log-disable + --alias deepresearch ) # Note: --jinja is enabled by default in recent llama.cpp versions