diff --git a/.env.example b/.env.example index 8558e9c4..c53f27cb 100644 --- a/.env.example +++ b/.env.example @@ -46,10 +46,24 @@ MAX_WORKERS=30 # API Keys and External Services # ============================================================================= -# Serper API for web search and Google Scholar +# Web Search Providers (in order of quality/preference) +# The system will try each provider in order until one succeeds. +# You only need ONE provider configured, but having multiple provides fallback. + +# Exa.ai - Best semantic/neural search ($10 free credits) +# Get your key from: https://exa.ai/ +EXA_API_KEY=your_key + +# Tavily - Purpose-built for RAG/LLMs (1,000 free requests/month) +# Get your key from: https://tavily.com/ +TAVILY_API_KEY=your_key + +# Serper API for Google search results (2,500 free queries) # Get your key from: https://serper.dev/ SERPER_KEY_ID=your_key +# DuckDuckGo is always available as final fallback (FREE, no API key needed) + # Jina API for web page reading # Get your key from: https://jina.ai/ JINA_API_KEYS=your_key @@ -95,4 +109,17 @@ IDP_KEY_SECRET=your_idp_key_secret # These are typically set by distributed training frameworks # WORLD_SIZE=1 -# RANK=0 \ No newline at end of file +# RANK=0 + +# ============================================================================= +# llama.cpp Local Inference (Alternative for Mac/Local Users) +# ============================================================================= +# If using the llama.cpp local inference option instead of vLLM: + +# The llama.cpp server URL (default works if using start_llama_server.sh) +LLAMA_SERVER_URL=http://127.0.0.1:8080 + +# For llama.cpp mode: +# - Web search uses DuckDuckGo by default (FREE, no API key needed) +# - JINA_API_KEYS is optional but recommended for better page reading +# - See: python inference/interactive_llamacpp.py --help \ No newline at end of file diff --git a/README.md b/README.md index 6a147f47..554bc0a8 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,55 @@ You need to modify the following in the file [inference/react_agent.py](https:// - Change the model name to alibaba/tongyi-deepresearch-30b-a3b. - Adjust the content concatenation way as described in the comments on lines **88–90.** + +--- + +### 7. Local Inference with llama.cpp (Optional) + +> **For Mac users or anyone who wants 100% local inference without vLLM/CUDA dependencies.** + +This repo includes support for running DeepResearch locally using [llama.cpp](https://github.com/ggerganov/llama.cpp) with Metal (Apple Silicon) or CUDA acceleration. Zero API costs, full privacy. + +#### Requirements + +- llama.cpp built with Metal or CUDA support +- GGUF model: [bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF](https://huggingface.co/bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF) +- 32GB+ RAM (for Q4_K_M quantization) + +#### Quick Start + +```bash +# Install minimal dependencies +pip install -r requirements-local.txt + +# Build llama.cpp (Mac with Metal) +cd llama.cpp +cmake -B build -DLLAMA_METAL=ON -DCMAKE_BUILD_TYPE=Release +cmake --build build --config Release +cd .. 
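+
+# Optional sanity check - confirm the build produced the server binary
+./llama.cpp/build/bin/llama-server --version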
+ +# Download model (~18GB) +mkdir -p models/gguf +curl -L -o models/gguf/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf \ + 'https://huggingface.co/bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF/resolve/main/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf' + +# Terminal 1: Start the server +./scripts/start_llama_server.sh + +# Terminal 2: Run research queries +python inference/interactive_llamacpp.py +``` + +The llama.cpp server provides both an API and a web UI at http://localhost:8080. + +#### Features + +- **Free web search**: Uses DuckDuckGo (no API key required) +- **Page visiting**: Uses Jina Reader (optional API key for better results) +- **Loop detection**: Prevents infinite tool call cycles +- **32K context**: Long research sessions supported + +--- ## Benchmark Evaluation We provide benchmark evaluation scripts for various datasets. Please refer to the [evaluation scripts](./evaluation/) directory for more details. diff --git a/inference/interactive_llamacpp.py b/inference/interactive_llamacpp.py new file mode 100644 index 00000000..b12fcb53 --- /dev/null +++ b/inference/interactive_llamacpp.py @@ -0,0 +1,871 @@ +#!/usr/bin/env python3 +""" +DeepResearch Interactive CLI - llama.cpp Server Edition +========================================================= + +A powerful local research assistant that runs on YOUR machine. +Zero API costs. Full privacy. Complete control. + +This script connects to a local llama.cpp server running the +Tongyi-DeepResearch model and provides a full ReAct agent loop +with web search and page visiting capabilities. + +Architecture: + +------------------+ HTTP +-------------------+ + | This Script | ------------> | llama.cpp | + | (Agent Logic) | | Server | + | - Tool calls | <------------ | (Model loaded) | + | - Web search | JSON | - Metal GPU | + | - Page visits | | - 32K context | + +------------------+ +-------------------+ + +Search Providers (in order of quality): + 1. Exa.ai - Best semantic/neural search + 2. Tavily - Purpose-built for RAG/LLMs + 3. Serper - Google SERP results + 4. 
DuckDuckGo - Free fallback (no API key needed)
+
+Usage:
+    # Terminal 1: Start the server (one-time, stays running)
+    ./scripts/start_llama_server.sh
+
+    # Terminal 2: Run research queries
+    python inference/interactive_llamacpp.py
+
+Requirements:
+    pip install requests duckduckgo-search python-dotenv
+"""
+
+import argparse
+import json
+import os
+import re
+import sys
+import time
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+from urllib.parse import urlparse
+
+import requests
+
+# Load environment variables
+try:
+    from dotenv import load_dotenv
+    load_dotenv(os.path.join(os.path.dirname(__file__), "..", ".env"))
+except ImportError:
+    pass
+
+# =============================================================================
+# Configuration
+# =============================================================================
+
+LLAMA_SERVER_URL = os.environ.get("LLAMA_SERVER_URL", "http://127.0.0.1:8080")
+JINA_API_KEY = os.environ.get("JINA_API_KEYS", "") or os.environ.get("JINA_API_KEY", "")
+
+# Search API keys
+EXA_API_KEY = os.environ.get("EXA_API_KEY", "")
+TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "")
+SERPER_KEY = os.environ.get("SERPER_KEY_ID", "")
+
+MAX_ROUNDS = 10
+MAX_TOKENS = 4096
+TEMPERATURE = 0.7
+TOP_P = 0.95
+REQUEST_TIMEOUT = 300  # 5 minutes for long generations
+
+# Stop sequences for the ReAct loop
+STOP_SEQUENCES = [
+    "<tool_response>",
+    "\n<tool_response>",
+]
+
+
+# =============================================================================
+# System Prompt - Optimized for DeepResearch ReAct Agent
+# =============================================================================
+
+def get_system_prompt() -> str:
+    return f"""You are a deep research assistant. Your task is to answer questions by searching the web and synthesizing information from credible sources.
+
+# CRITICAL RULES
+
+1. **Think deeply**: Use <think> tags to reason about what you know and what you need to find
+2. **Search strategically**: Use multiple targeted searches to gather comprehensive information
+3. **Verify information**: Cross-reference facts across multiple sources
+4. **Synthesize thoroughly**: Combine information from multiple sources into a coherent answer
+5. **NEVER visit the same URL twice**: Each URL can only be visited once
+6. **Always conclude**: After gathering sufficient info (typically 5-15 sources), provide your answer in <answer> tags
+7. **Be efficient**: Aim to answer within the 10-round limit
+
+# Response Format
+
+When you need to search, respond with:
+<think>What I need to find and why</think>
+<tool_call>
+{{"name": "search", "arguments": {{"query": ["your search query"]}}}}
+</tool_call>
+
+When you need to visit a page for details:
+<think>Why I need to visit this specific page</think>
+<tool_call>
+{{"name": "visit", "arguments": {{"url": "https://example.com", "goal": "what specific info you need"}}}}
+</tool_call>
+
+When you have enough information, respond with:
+<think>Summary of what I found and my analysis</think>
+<answer>Your comprehensive, well-researched answer with citations where appropriate</answer>
+
+# Tools
+
+<tools>
+{{"type": "function", "function": {{"name": "search", "description": "Web search. Returns titles, URLs, and snippets.", "parameters": {{"type": "object", "properties": {{"query": {{"type": "array", "items": {{"type": "string"}}, "description": "1-3 search queries"}}}}, "required": ["query"]}}}}}}
+{{"type": "function", "function": {{"name": "visit", "description": "Visit a URL to get full page content. Each URL can only be visited ONCE.", "parameters": {{"type": "object", "properties": {{"url": {{"type": "string", "description": "URL to visit"}}, "goal": {{"type": "string", "description": "What info you need"}}}}, "required": ["url", "goal"]}}}}}}
+</tools>
+
+# Important Notes
+- The visit tool returns the COMPLETE page content in one response
+- After 8-10 successful source visits, you likely have enough information to answer
+- Prefer quality over quantity - don't just collect sources, synthesize them
+
+Current date: {datetime.now().strftime("%Y-%m-%d")}"""
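+
+
+# For reference, a single well-formed model turn under the format above looks
+# like this (illustrative content only, not emitted by this module):
+#
+#   <think>I need primary sources on X; start with two broad searches.</think>
+#   <tool_call>
+#   {"name": "search", "arguments": {"query": ["X overview", "X survey 2024"]}}
+#   </tool_call>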
+
+
+# =============================================================================
+# Search Providers
+# =============================================================================
+
+def contains_chinese(text: str) -> bool:
+    """Check if text contains Chinese characters."""
+    if not text:
+        return False
+    return any("\u4E00" <= char <= "\u9FFF" for char in text)
+
+
+def sanitize_query(query: str) -> str:
+    """Sanitize and validate a search query."""
+    if not query:
+        return ""
+    return query.strip()[:500]
+
+
+def search_exa(query: str, num_results: int = 10) -> Optional[str]:
+    """
+    Exa.ai - Neural/semantic search engine.
+    API Docs: https://docs.exa.ai/reference/search
+    """
+    if not EXA_API_KEY:
+        return None
+
+    query = sanitize_query(query)
+    if not query:
+        return None
+
+    try:
+        response = requests.post(
+            "https://api.exa.ai/search",
+            headers={
+                "x-api-key": EXA_API_KEY,
+                "Content-Type": "application/json",
+            },
+            json={
+                "query": query,
+                "numResults": min(num_results, 100),
+                "type": "auto",  # Let Exa choose best search type
+            },
+            timeout=30,
+        )
+
+        if response.status_code == 401:
+            print("[Exa] Invalid or expired API key")
+            return None
+        if response.status_code == 429:
+            print("[Exa] Rate limited")
+            return None
+        if response.status_code == 402:
+            print("[Exa] Payment required - credits exhausted")
+            return None
+        if response.status_code != 200:
+            return None
+
+        data = response.json()
+        results = data.get("results", [])
+        if not results:
+            return None
+
+        output = [f"\n## Search: '{query}'\n"]
+        for idx, r in enumerate(results, 1):
+            title = r.get("title") or "No title"
+            url = r.get("url", "")
+            text = r.get("text", "")[:300] if r.get("text") else ""
+            output.append(f"{idx}. [{title}]({url})")
+            if text:
+                output.append(f"   {text}...")
+
+        return "\n".join(output)
+    except requests.Timeout:
+        print("[Exa] Request timeout")
+        return None
+    except requests.ConnectionError:
+        print("[Exa] Connection error")
+        return None
+    except Exception as e:
+        print(f"[Exa] Error: {e}")
+        return None
+
+
+def search_tavily(query: str, num_results: int = 10) -> Optional[str]:
+    """
+    Tavily - Search API for RAG/LLM applications.
+ API Docs: https://docs.tavily.com/documentation/api-reference/endpoint/search + """ + if not TAVILY_API_KEY: + return None + + query = sanitize_query(query) + if not query: + return None + + try: + # Use Bearer token auth (preferred over api_key in body) + response = requests.post( + "https://api.tavily.com/search", + headers={ + "Authorization": f"Bearer {TAVILY_API_KEY}", + "Content-Type": "application/json", + }, + json={ + "query": query, + "max_results": min(num_results, 20), + "search_depth": "basic", # Use basic (1 credit) vs advanced (2 credits) + "include_answer": False, + "include_raw_content": False, + }, + timeout=30, + ) + + if response.status_code == 401: + print("[Tavily] Invalid or expired API key") + return None + if response.status_code == 429: + print("[Tavily] Rate limited") + return None + if response.status_code == 432: + print("[Tavily] Plan limit exceeded") + return None + if response.status_code == 433: + print("[Tavily] Pay-as-you-go limit exceeded") + return None + if response.status_code != 200: + return None + + data = response.json() + results = data.get("results", []) + if not results: + return None + + output = [f"\n## Search: '{query}'\n"] + for idx, r in enumerate(results, 1): + title = r.get("title") or "No title" + url = r.get("url", "") + content = r.get("content", "")[:300] + output.append(f"{idx}. [{title}]({url})") + if content: + output.append(f" {content}...") + + return "\n".join(output) + except requests.Timeout: + print("[Tavily] Request timeout") + return None + except requests.ConnectionError: + print("[Tavily] Connection error") + return None + except Exception as e: + print(f"[Tavily] Error: {e}") + return None + + +def search_serper(query: str, num_results: int = 10) -> Optional[str]: + """ + Serper - Google Search API. + API Docs: https://serper.dev/ + """ + if not SERPER_KEY: + return None + + query = sanitize_query(query) + if not query: + return None + + try: + if contains_chinese(query): + payload = {"q": query, "gl": "cn", "hl": "zh-cn", "num": min(num_results, 100)} + else: + payload = {"q": query, "gl": "us", "hl": "en", "num": min(num_results, 100)} + + response = requests.post( + "https://google.serper.dev/search", + headers={ + "X-API-KEY": SERPER_KEY, + "Content-Type": "application/json", + }, + json=payload, + timeout=30, + ) + + if response.status_code == 401: + print("[Serper] Invalid API key") + return None + if response.status_code == 429: + print("[Serper] Rate limited") + return None + if response.status_code != 200: + return None + + data = response.json() + if "error" in data: + print(f"[Serper] API error: {data['error']}") + return None + + organic = data.get("organic", []) + if not organic: + return None + + output = [f"\n## Search: '{query}'\n"] + for idx, page in enumerate(organic, 1): + title = page.get("title") or "No title" + url = page.get("link", "") + snippet = page.get("snippet", "")[:300].replace("Your browser can't play this video.", "").strip() + output.append(f"{idx}. 
[{title}]({url})") + if snippet: + output.append(f" {snippet}...") + + return "\n".join(output) + except requests.Timeout: + print("[Serper] Request timeout") + return None + except requests.ConnectionError: + print("[Serper] Connection error") + return None + except Exception as e: + print(f"[Serper] Error: {e}") + return None + + +def search_duckduckgo(query: str, num_results: int = 10) -> Optional[str]: + """DuckDuckGo - Free search, no API key required.""" + try: + from duckduckgo_search import DDGS + from duckduckgo_search.exceptions import RatelimitException, DuckDuckGoSearchException + except ImportError: + print("[DuckDuckGo] duckduckgo-search package not installed") + return None + + query = sanitize_query(query) + if not query: + return None + + retries = 3 + for attempt in range(retries): + try: + with DDGS() as ddgs: + results = list(ddgs.text(query, max_results=min(num_results, 25))) + + if not results: + return None + + output = [f"\n## Search: '{query}'\n"] + for idx, r in enumerate(results, 1): + title = r.get("title") or "No title" + url = r.get("href") or r.get("link", "") + body = r.get("body", "")[:300] + output.append(f"{idx}. [{title}]({url})") + if body: + output.append(f" {body}...") + + return "\n".join(output) + except RatelimitException: + if attempt < retries - 1: + wait = 2 ** attempt + print(f"[DuckDuckGo] Rate limited, waiting {wait}s...") + time.sleep(wait) + continue + print("[DuckDuckGo] Rate limited after all retries") + return None + except DuckDuckGoSearchException as e: + print(f"[DuckDuckGo] Search error: {e}") + return None + except Exception as e: + print(f"[DuckDuckGo] Error: {e}") + return None + + return None + + +def multi_provider_search(queries: list, num_results: int = 10) -> str: + """Search using multiple providers with automatic fallback.""" + providers = [ + ("Exa", search_exa), + ("Tavily", search_tavily), + ("Serper", search_serper), + ("DuckDuckGo", search_duckduckgo), + ] + + all_results = [] + + for query in queries[:3]: + result = None + for name, search_fn in providers: + result = search_fn(query, num_results) + if result: + break + + if result: + all_results.append(result) + else: + all_results.append(f"\n## Search: '{query}'\n[No results found]") + + return "\n".join(all_results) if all_results else "No results found" + + +# ============================================================================= +# Page Visitor +# ============================================================================= + +def is_valid_url(url: str) -> bool: + """Check if URL is valid and uses http/https scheme.""" + try: + result = urlparse(url) + return all([result.scheme in ("http", "https"), result.netloc]) + except Exception: + return False + + +def visit_page(url: str, goal: str) -> str: + """Fetch webpage content using Jina Reader or direct fetch.""" + if isinstance(url, list): + url = url[0] if url else "" + + if not url: + return "[Visit Error] No URL provided" + + if not is_valid_url(url): + return f"[Visit Error] Invalid URL: {url}. Must be a valid http/https URL." 
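+
+    # Fetch strategy: try Jina Reader first (it renders JS-heavy pages and
+    # returns clean text), then fall back to a direct GET with crude HTML
+    # tag stripping if Jina fails or returns too little content.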
+
+    # Try Jina Reader first
+    try:
+        headers = {"Accept": "text/plain"}
+        if JINA_API_KEY:
+            headers["Authorization"] = f"Bearer {JINA_API_KEY}"
+
+        jina_url = f"https://r.jina.ai/{url}"
+        response = requests.get(jina_url, headers=headers, timeout=30, allow_redirects=True)
+
+        if response.status_code == 200 and len(response.text) > 100:
+            content = response.text[:12000]
+            return f"**Content from {url}** (goal: {goal}):\n\n{content}"
+    except Exception:
+        pass
+
+    # Fallback to direct fetch
+    try:
+        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}
+        response = requests.get(url, headers=headers, timeout=15)
+        response.raise_for_status()
+
+        text = response.text
+        text = re.sub(r"<script[^>]*>.*?</script>", "", text, flags=re.DOTALL | re.IGNORECASE)
+        text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
+        text = re.sub(r"<[^>]+>", " ", text)
+        text = re.sub(r"\s+", " ", text).strip()
+
+        if len(text) > 100:
+            return f"**Content from {url}** (goal: {goal}):\n\n{text[:12000]}"
+        return f"[Visit Error] Page content too short or blocked: {url}"
+    except requests.Timeout:
+        return f"[Visit Error] Timeout fetching {url}"
+    except Exception as e:
+        return f"[Visit Error] Could not fetch {url}: {type(e).__name__}: {e}"
+
+
+# =============================================================================
+# llama.cpp Server Client
+# =============================================================================
+
+class LlamaCppClient:
+    """Client for the llama.cpp OpenAI-compatible API."""
+
+    def __init__(self, base_url: str = LLAMA_SERVER_URL):
+        self.base_url = base_url.rstrip("/")
+        self.api_url = f"{self.base_url}/v1/chat/completions"
+        self.session = requests.Session()
+
+    def check_server(self) -> bool:
+        """Check if the server is running and responsive."""
+        try:
+            response = self.session.get(f"{self.base_url}/health", timeout=5)
+            return response.status_code == 200
+        except Exception:
+            return False
+
+    def generate(
+        self,
+        messages: List[Dict[str, str]],
+        max_tokens: int = MAX_TOKENS,
+        temperature: float = TEMPERATURE,
+        top_p: float = TOP_P,
+        stop: Optional[List[str]] = None,
+    ) -> str:
+        """Generate a response from the llama.cpp server."""
+        payload = {
+            "messages": messages,
+            "max_tokens": max_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "stop": stop or STOP_SEQUENCES,
+            "stream": False,
+        }
+
+        try:
+            response = self.session.post(
+                self.api_url,
+                json=payload,
+                headers={"Content-Type": "application/json"},
+                timeout=REQUEST_TIMEOUT,  # 300 s; matches the module-level constant
+            )
+
+            if response.status_code != 200:
+                error_text = response.text[:500]
+                return f"[Server Error] Status {response.status_code}: {error_text}"
+
+            data = response.json()
+            content = data["choices"][0]["message"]["content"]
+            return content.strip()
+
+        except requests.Timeout:
+            return "[Error] Request timed out. The model may be processing a complex query."
+        except requests.ConnectionError:
+            return "[Error] Cannot connect to llama.cpp server. Is it running?"
+        except Exception as e:
+            return f"[Error] API call failed: {e}"
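+
+# Minimal usage sketch (assumes a llama.cpp server is already listening at
+# LLAMA_SERVER_URL, e.g. started via scripts/start_llama_server.sh):
+#
+#     client = LlamaCppClient()
+#     if client.check_server():
+#         print(client.generate([{"role": "user", "content": "Hello"}]))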
+
+
+# =============================================================================
+# Research Agent
+# =============================================================================
+
+def research(
+    client: LlamaCppClient,
+    question: str,
+    verbose: bool = True,
+    max_rounds: int = MAX_ROUNDS,
+    temperature: float = TEMPERATURE,
+) -> dict:
+    """Run the research agent loop."""
+    if verbose:
+        print(f"\n[*] Researching: {question}\n")
+        print("-" * 60)
+
+    system_prompt = get_system_prompt()
+    messages: List[Dict[str, str]] = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": question},
+    ]
+
+    sources: List[Dict[str, Any]] = []
+    thinking: List[str] = []
+    visited_urls: set = set()
+    consecutive_errors = 0
+    max_consecutive_errors = 3
+    start_time = time.time()
+
+    for round_num in range(max_rounds):
+        if verbose:
+            print(f"\n[Round {round_num + 1}/{max_rounds}]")
+
+        gen_start = time.time()
+        content = client.generate(messages, temperature=temperature)
+        gen_time = time.time() - gen_start
+
+        if content.startswith("[Error]") or content.startswith("[Server Error]"):
+            if verbose:
+                print(f"  Error: {content}")
+            break
+
+        if verbose:
+            print(f"  Generated in {gen_time:.1f}s")
+
+        messages.append({"role": "assistant", "content": content})
+
+        # Extract and display thinking
+        if "<think>" in content and "</think>" in content:
+            think_content = content.split("<think>")[1].split("</think>")[0].strip()
+            thinking.append(think_content)
+            if verbose:
+                preview = think_content[:200] + "..." if len(think_content) > 200 else think_content
+                print(f"  Thinking: {preview}")
+
+        # Check for final answer
+        if "<answer>" in content:
+            if "</answer>" in content:
+                answer = content.split("<answer>")[1].split("</answer>")[0]
+            else:
+                answer = content.split("<answer>")[1]
+
+            elapsed = time.time() - start_time
+
+            if verbose:
+                print("\n" + "=" * 60)
+                print("ANSWER:")
+                print("=" * 60)
+                print(answer.strip())
+                print("=" * 60)
+                print(f"\nStats: {round_num + 1} rounds, {len(sources)} sources, {elapsed:.1f}s")
+
+            return {
+                "answer": answer.strip(),
+                "sources": sources,
+                "rounds": round_num + 1,
+                "thinking": thinking,
+                "elapsed_seconds": elapsed,
+            }
+
+        # Handle tool calls
+        if "<tool_call>" in content and "</tool_call>" in content:
+            try:
+                tool_json = content.split("<tool_call>")[1].split("</tool_call>")[0]
+                tool = json.loads(tool_json.strip())
+                name = tool.get("name", "")
+                args = tool.get("arguments", {})
+
+                if verbose:
+                    print(f"  Tool: {name}")
+
+                tool_error = False
+
+                if name == "search":
+                    queries = args.get("query", [question])
+                    if isinstance(queries, str):
+                        queries = [queries]
+                    if verbose:
+                        print(f"  Queries: {queries}")
+                    tool_result = multi_provider_search(queries)
+                    if "[No results" in tool_result or "error" in tool_result.lower():
+                        tool_error = True
+                    else:
+                        sources.append({"type": "search", "queries": queries})
+
+                elif name == "visit":
+                    url = args.get("url", "")
+                    if isinstance(url, list):
+                        url = url[0] if url else ""
+                    goal = args.get("goal", "extract information")
+
+                    if url in visited_urls:
+                        if verbose:
+                            print(f"  [!] Already visited: {url}")
+                        tool_result = f"[Already Visited] You already visited {url}. Use the information from your previous visit or try a different source."
+                        tool_error = True
+                    else:
+                        visited_urls.add(url)
+                        if verbose:
+                            print(f"  Visiting: {url[:60]}...")
+                        tool_result = visit_page(url, goal)
+                        if "[Visit Error]" in tool_result:
+                            tool_error = True
+                        else:
+                            sources.append({"type": "visit", "url": url})
+
+                else:
+                    tool_result = f"Unknown tool: {name}. Available tools: search, visit"
+                    tool_error = True
+
+                # Track consecutive errors for loop detection
+                if tool_error:
+                    consecutive_errors += 1
+                    if consecutive_errors >= max_consecutive_errors:
+                        if verbose:
+                            print(f"\n[!] {max_consecutive_errors} consecutive tool errors detected.")
+                        messages.append({
+                            "role": "user",
+                            "content": f"<tool_response>\n{tool_result}\n\n[System Notice] You have encountered {consecutive_errors} consecutive errors. Please provide your best answer now based on information gathered so far, or try a completely different approach.\n</tool_response>",
+                        })
+                        continue
+                else:
+                    consecutive_errors = 0
+
+                # Inject tool response
+                messages.append({
+                    "role": "user",
+                    "content": f"<tool_response>\n{tool_result}\n</tool_response>",
+                })
+
+            except json.JSONDecodeError as e:
+                consecutive_errors += 1
+                messages.append({
+                    "role": "user",
+                    "content": f"<tool_response>\nError: Invalid JSON in tool call: {e}\n</tool_response>",
+                })
+            except Exception as e:
+                consecutive_errors += 1
+                messages.append({
+                    "role": "user",
+                    "content": f"<tool_response>\nTool error: {e}\n</tool_response>",
+                })
+
+    # Force final answer after max rounds
+    if verbose:
+        print("\n[!] Max rounds reached, requesting final answer...")
+
+    messages.append({
+        "role": "user",
+        "content": "You have reached the maximum number of research rounds. Please provide your final answer now based on all the information gathered. Use <answer> tags.",
+    })
+
+    content = client.generate(messages, max_tokens=2048, temperature=temperature)
+
+    if "<answer>" in content:
+        if "</answer>" in content:
+            answer = content.split("<answer>")[1].split("</answer>")[0]
+        else:
+            answer = content.split("<answer>")[1]
+    else:
+        answer = content
+
+    elapsed = time.time() - start_time
+
+    if verbose:
+        print("\n" + "=" * 60)
+        print("ANSWER:")
+        print("=" * 60)
+        print(answer.strip())
+        print("=" * 60)
+        print(f"\nStats: {max_rounds} rounds (max), {len(sources)} sources, {elapsed:.1f}s")
+
+    return {
+        "answer": answer.strip(),
+        "sources": sources,
+        "rounds": max_rounds,
+        "thinking": thinking,
+        "elapsed_seconds": elapsed,
+    }
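+
+# Example (sketch): a one-shot programmatic research call, assuming a running
+# server; `research` returns a dict with "answer", "sources", "rounds",
+# "thinking", and "elapsed_seconds":
+#
+#     result = research(LlamaCppClient(), "Who maintains llama.cpp?")
+#     print(result["answer"])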
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+def get_available_providers() -> List[str]:
+    """Return list of available search providers."""
+    providers = []
+    if EXA_API_KEY:
+        providers.append("Exa")
+    if TAVILY_API_KEY:
+        providers.append("Tavily")
+    if SERPER_KEY:
+        providers.append("Serper")
+    providers.append("DuckDuckGo")
+    return providers
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="DeepResearch Interactive CLI - llama.cpp Server Edition",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Start the server first (in another terminal):
+  ./scripts/start_llama_server.sh
+
+  # Run interactive mode:
+  python inference/interactive_llamacpp.py
+
+  # Single query mode:
+  python inference/interactive_llamacpp.py --query "What is quantum entanglement?"
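+
+  # Tune rounds and sampling (example values; flags defined below):
+  python inference/interactive_llamacpp.py -q "history of RISC-V" --max-rounds 15 --temperature 0.5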
+ + # Connect to a different server: + python inference/interactive_llamacpp.py --server http://192.168.1.100:8080 +""", + ) + parser.add_argument( + "--server", + type=str, + default=LLAMA_SERVER_URL, + help="llama.cpp server URL (default: http://127.0.0.1:8080)", + ) + parser.add_argument( + "--query", "-q", + type=str, + default=None, + help="Single query mode - run one research query and exit", + ) + parser.add_argument( + "--max-rounds", + type=int, + default=MAX_ROUNDS, + help=f"Maximum research rounds (default: {MAX_ROUNDS})", + ) + parser.add_argument( + "--temperature", + type=float, + default=TEMPERATURE, + help=f"Sampling temperature (default: {TEMPERATURE})", + ) + args = parser.parse_args() + + providers = get_available_providers() + + print("\n" + "=" * 60) + print("DeepResearch - Interactive CLI") + print("llama.cpp Server Edition (100% Local)") + print("=" * 60) + print(f"Server: {args.server}") + print(f"Search: {', '.join(providers)}") + print(f"Reader: Jina.ai {'[configured]' if JINA_API_KEY else '[free tier]'}") + print("=" * 60) + + # Initialize client + client = LlamaCppClient(base_url=args.server) + + # Check server connection + print("\nConnecting to llama.cpp server...", end=" ") + if not client.check_server(): + print("FAILED") + print(f"\nError: Cannot connect to llama.cpp server at {args.server}") + print("\nPlease start the server first:") + print(" ./scripts/start_llama_server.sh") + print("\nOr specify a different server URL:") + print(" python inference/interactive_llamacpp.py --server http://your-server:8080") + sys.exit(1) + print("OK") + + # Single query mode + if args.query: + research(client, args.query, max_rounds=args.max_rounds, temperature=args.temperature) + return + + # Interactive mode + print("\nType your research question (or 'quit' to exit):\n") + + while True: + try: + question = input("Question: ").strip() + + if not question: + continue + + if question.lower() in ("quit", "exit", "q"): + print("\nGoodbye!") + break + + research(client, question, max_rounds=args.max_rounds, temperature=args.temperature) + print("\n" + "-" * 60 + "\n") + + except KeyboardInterrupt: + print("\n\nGoodbye!") + break + except Exception as e: + print(f"\nError: {e}\n") + + +if __name__ == "__main__": + main() diff --git a/inference/tool_search.py b/inference/tool_search.py index 1a3f7b53..15f21063 100644 --- a/inference/tool_search.py +++ b/inference/tool_search.py @@ -1,131 +1,538 @@ +""" +Multi-Provider Web Search Tool +============================== + +Implements a robust search fallback chain optimized for AI research: + 1. Exa.ai - Best semantic/neural search ($10 free credits) + 2. Tavily - Purpose-built for RAG/LLMs (1,000 free requests/month) + 3. Serper - Google SERP results (2,500 free queries) + 4. DuckDuckGo - Free forever, final fallback (no API key needed) + +Each provider is tried in order. If one fails (rate limit, error, no key), +the next provider is attempted automatically. + +Environment Variables: + EXA_API_KEY - Exa.ai API key (https://exa.ai/) + TAVILY_API_KEY - Tavily API key (https://tavily.com/) + SERPER_KEY_ID - Serper API key (https://serper.dev/) + +If no API keys are set, DuckDuckGo is used as the default (free, no key needed). 
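+
+Example (illustrative sketch):
+    from tool_search import Search
+    tool = Search()
+    print(tool.call({"query": ["llama.cpp quantization formats"]}))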
+""" + +import http.client import json -from concurrent.futures import ThreadPoolExecutor -from typing import List, Union +import os +import time +from typing import List, Optional, Union + import requests from qwen_agent.tools.base import BaseTool, register_tool -import asyncio -from typing import Dict, List, Optional, Union -import uuid -import http.client -import json -import os +# API Keys from environment +EXA_API_KEY = os.environ.get("EXA_API_KEY", "").strip() +TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "").strip() +SERPER_KEY = os.environ.get("SERPER_KEY_ID", "").strip() -SERPER_KEY=os.environ.get('SERPER_KEY_ID') +# Request timeouts (seconds) +REQUEST_TIMEOUT = 30 -@register_tool("search", allow_overwrite=True) -class Search(BaseTool): - name = "search" - description = "Performs batched web searches: supply an array 'query'; the tool retrieves the top 10 results for each query in one call." - parameters = { - "type": "object", - "properties": { - "query": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Array of query strings. Include multiple complementary search queries in a single call." +def contains_chinese(text: str) -> bool: + """Check if text contains Chinese characters.""" + if not text: + return False + return any("\u4E00" <= char <= "\u9FFF" for char in text) + + +def sanitize_query(query: str) -> str: + """Sanitize and validate a search query.""" + if not query: + return "" + # Strip whitespace and limit length + query = query.strip()[:500] + return query + + +def format_results(query: str, results: List[dict], provider: str) -> str: + """Format search results into a consistent markdown format.""" + if not results: + return "" + + snippets = [] + for idx, r in enumerate(results, 1): + title = r.get("title", "No title") + url = r.get("url", "") + snippet = r.get("snippet", "") + date = r.get("date", "") + + # Build result entry + entry = f"{idx}. [{title}]({url})" + if date: + entry += f"\nDate: {date}" + if snippet: + entry += f"\n{snippet}" + snippets.append(entry) + + header = f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + return header + "\n\n".join(snippets) + + +# ============================================================================= +# Search Providers +# ============================================================================= + +def search_exa(query: str, num_results: int = 10) -> Optional[str]: + """ + Exa.ai - Neural/semantic search engine. + Best for finding conceptually relevant results, not just keyword matches. 
+ + API Docs: https://docs.exa.ai/reference/search + """ + if not EXA_API_KEY: + return None + + query = sanitize_query(query) + if not query: + return None + + try: + response = requests.post( + "https://api.exa.ai/search", + headers={ + "x-api-key": EXA_API_KEY, + "Content-Type": "application/json", }, - }, - "required": ["query"], - } + json={ + "query": query, + "numResults": min(num_results, 100), # API max is 100 + "type": "auto", # Let Exa choose best search type + }, + timeout=REQUEST_TIMEOUT, + ) + + # Handle error responses + if response.status_code == 401: + print("[Exa] Invalid or expired API key") + return None + if response.status_code == 429: + print("[Exa] Rate limited - too many requests") + return None + if response.status_code == 402: + print("[Exa] Payment required - credits exhausted") + return None + if response.status_code != 200: + error_msg = response.text[:200] if response.text else "Unknown error" + print(f"[Exa] Error {response.status_code}: {error_msg}") + return None + + data = response.json() + api_results = data.get("results", []) + + if not api_results: + return None + + # Normalize results + results = [] + for r in api_results: + title = r.get("title") or "No title" + url = r.get("url", "") + text = r.get("text", "") + published = r.get("publishedDate", "") + + # Truncate text for snippet + snippet = text[:300] + "..." if len(text) > 300 else text + date = published[:10] if published else "" + + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": date, + }) + + return format_results(query, results, "Exa") + + except requests.Timeout: + print("[Exa] Request timeout") + return None + except requests.ConnectionError: + print("[Exa] Connection error - network issue") + return None + except json.JSONDecodeError: + print("[Exa] Invalid JSON response") + return None + except Exception as e: + print(f"[Exa] Unexpected error: {type(e).__name__}: {e}") + return None - def __init__(self, cfg: Optional[dict] = None): - super().__init__(cfg) - def google_search_with_serp(self, query: str): - def contains_chinese_basic(text: str) -> bool: - return any('\u4E00' <= char <= '\u9FFF' for char in text) - conn = http.client.HTTPSConnection("google.serper.dev") - if contains_chinese_basic(query): - payload = json.dumps({ + +def search_tavily(query: str, num_results: int = 10) -> Optional[str]: + """ + Tavily - Search API designed specifically for RAG and LLM applications. + Returns AI-optimized snippets and supports advanced filtering. 
+ + API Docs: https://docs.tavily.com/documentation/api-reference/endpoint/search + """ + if not TAVILY_API_KEY: + return None + + query = sanitize_query(query) + if not query: + return None + + try: + # Tavily supports both Bearer token and api_key in body + # Using Bearer token as it's more standard + response = requests.post( + "https://api.tavily.com/search", + headers={ + "Authorization": f"Bearer {TAVILY_API_KEY}", + "Content-Type": "application/json", + }, + json={ + "query": query, + "max_results": min(num_results, 20), # API max is 20 + "search_depth": "basic", # Use basic (1 credit) vs advanced (2 credits) + "include_answer": False, + "include_raw_content": False, + }, + timeout=REQUEST_TIMEOUT, + ) + + # Handle error responses + if response.status_code == 401: + print("[Tavily] Invalid or expired API key") + return None + if response.status_code == 429: + print("[Tavily] Rate limited - too many requests") + return None + if response.status_code == 432: + print("[Tavily] Plan limit exceeded - upgrade required") + return None + if response.status_code == 433: + print("[Tavily] Pay-as-you-go limit exceeded") + return None + if response.status_code != 200: + error_msg = response.text[:200] if response.text else "Unknown error" + print(f"[Tavily] Error {response.status_code}: {error_msg}") + return None + + data = response.json() + api_results = data.get("results", []) + + if not api_results: + return None + + # Normalize results + results = [] + for r in api_results: + title = r.get("title") or "No title" + url = r.get("url", "") + content = r.get("content", "") + + # Truncate content for snippet + snippet = content[:300] + "..." if len(content) > 300 else content + + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": "", + }) + + return format_results(query, results, "Tavily") + + except requests.Timeout: + print("[Tavily] Request timeout") + return None + except requests.ConnectionError: + print("[Tavily] Connection error - network issue") + return None + except json.JSONDecodeError: + print("[Tavily] Invalid JSON response") + return None + except Exception as e: + print(f"[Tavily] Unexpected error: {type(e).__name__}: {e}") + return None + + +def search_serper(query: str, num_results: int = 10) -> Optional[str]: + """ + Serper - Google Search API (SERP results). + Fast and reliable Google search results. + + API Docs: https://serper.dev/ + """ + if not SERPER_KEY: + return None + + query = sanitize_query(query) + if not query: + return None + + try: + # Determine locale based on query content + if contains_chinese(query): + payload = { "q": query, - "location": "China", "gl": "cn", - "hl": "zh-cn" - }) - + "hl": "zh-cn", + "num": min(num_results, 100), + } else: - payload = json.dumps({ + payload = { "q": query, - "location": "United States", "gl": "us", - "hl": "en" - }) - headers = { - 'X-API-KEY': SERPER_KEY, - 'Content-Type': 'application/json' + "hl": "en", + "num": min(num_results, 100), } + # Use requests instead of http.client for consistency and better error handling + response = requests.post( + "https://google.serper.dev/search", + headers={ + "X-API-KEY": SERPER_KEY, + "Content-Type": "application/json", + }, + json=payload, + timeout=REQUEST_TIMEOUT, + ) - for i in range(5): - try: - conn.request("POST", "/search", payload, headers) - res = conn.getresponse() - break - except Exception as e: - print(e) - if i == 4: - return f"Google search Timeout, return None, Please try again later." 
- continue + # Handle error responses + if response.status_code == 401: + print("[Serper] Invalid API key") + return None + if response.status_code == 429: + print("[Serper] Rate limited") + return None + if response.status_code != 200: + error_msg = response.text[:200] if response.text else "Unknown error" + print(f"[Serper] Error {response.status_code}: {error_msg}") + return None + + data = response.json() + + # Check for API-level errors + if "error" in data: + print(f"[Serper] API error: {data['error']}") + return None + + organic = data.get("organic", []) + if not organic: + return None + + # Normalize results + results = [] + for page in organic: + title = page.get("title") or "No title" + url = page.get("link", "") + snippet_text = page.get("snippet", "") + date = page.get("date", "") + + # Clean up snippet + snippet = snippet_text.replace("Your browser can't play this video.", "").strip() + + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": date, + }) + + return format_results(query, results, "Serper") - data = res.read() - results = json.loads(data.decode("utf-8")) + except requests.Timeout: + print("[Serper] Request timeout") + return None + except requests.ConnectionError: + print("[Serper] Connection error - network issue") + return None + except json.JSONDecodeError: + print("[Serper] Invalid JSON response") + return None + except Exception as e: + print(f"[Serper] Unexpected error: {type(e).__name__}: {e}") + return None - try: - if "organic" not in results: - raise Exception(f"No results found for query: '{query}'. Use a less specific query.") - web_snippets = list() - idx = 0 - if "organic" in results: - for page in results["organic"]: - idx += 1 - date_published = "" - if "date" in page: - date_published = "\nDate published: " + page["date"] +def search_duckduckgo(query: str, num_results: int = 10) -> Optional[str]: + """ + DuckDuckGo - Free search with no API key required. + Rate limited but reliable as a final fallback. + """ + try: + from duckduckgo_search import DDGS + from duckduckgo_search.exceptions import RatelimitException, DuckDuckGoSearchException + except ImportError: + print("[DuckDuckGo] duckduckgo-search package not installed. Run: pip install duckduckgo-search") + return None + + query = sanitize_query(query) + if not query: + return None + + max_retries = 3 + for attempt in range(max_retries): + try: + with DDGS() as ddgs: + api_results = list(ddgs.text(query, max_results=min(num_results, 25))) + + if not api_results: + return None + + # Normalize results + results = [] + for r in api_results: + title = r.get("title") or "No title" + url = r.get("href") or r.get("link", "") + body = r.get("body", "") + + # Truncate body for snippet + snippet = body[:300] + "..." 
if len(body) > 300 else body + + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": "", + }) + + return format_results(query, results, "DuckDuckGo") + + except RatelimitException: + if attempt < max_retries - 1: + wait_time = 2 ** attempt # Exponential backoff: 1s, 2s, 4s + print(f"[DuckDuckGo] Rate limited, waiting {wait_time}s...") + time.sleep(wait_time) + continue + print("[DuckDuckGo] Rate limited after all retries") + return None + except DuckDuckGoSearchException as e: + print(f"[DuckDuckGo] Search error: {e}") + return None + except Exception as e: + print(f"[DuckDuckGo] Unexpected error: {type(e).__name__}: {e}") + return None + + return None - source = "" - if "source" in page: - source = "\nSource: " + page["source"] - snippet = "" - if "snippet" in page: - snippet = "\n" + page["snippet"] +# ============================================================================= +# Multi-Provider Search with Fallback +# ============================================================================= - redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{snippet}" - redacted_version = redacted_version.replace("Your browser can't play this video.", "") - web_snippets.append(redacted_version) +def multi_provider_search(query: str, num_results: int = 10) -> str: + """ + Search using multiple providers with automatic fallback. + + Provider priority (by quality): + 1. Exa.ai - Best semantic search + 2. Tavily - Purpose-built for LLMs + 3. Serper - Google SERP results + 4. DuckDuckGo - Free fallback + + Returns the first successful result or an error message. + """ + # Validate query + query = sanitize_query(query) + if not query: + return "[Search] Empty query provided. Please provide a search term." + + providers = [ + ("Exa", search_exa), + ("Tavily", search_tavily), + ("Serper", search_serper), + ("DuckDuckGo", search_duckduckgo), + ] + + failed_providers = [] + + for name, search_fn in providers: + result = search_fn(query, num_results) + if result: + return result + failed_providers.append(name) + + # All providers failed + return f"No results found for '{query}'. Providers attempted: {', '.join(failed_providers)}. Try a different or simpler query." - content = f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n" + "\n\n".join(web_snippets) - return content - except: - return f"No results found for '{query}'. Try with a more general query." +# ============================================================================= +# Qwen Agent Tool Registration +# ============================================================================= +@register_tool("search", allow_overwrite=True) +class Search(BaseTool): + """Web search tool with multi-provider fallback.""" - def search_with_serp(self, query: str): - result = self.google_search_with_serp(query) - return result + name = "search" + description = "Performs batched web searches: supply an array 'query'; the tool retrieves the top 10 results for each query in one call." + parameters = { + "type": "object", + "properties": { + "query": { + "type": "array", + "items": {"type": "string"}, + "description": "Array of query strings. 
Include multiple complementary search queries in a single call.", + }, + }, + "required": ["query"], + } + + def __init__(self, cfg: Optional[dict] = None): + super().__init__(cfg) + + # Log which providers are available at initialization + available = [] + if EXA_API_KEY: + available.append("Exa") + if TAVILY_API_KEY: + available.append("Tavily") + if SERPER_KEY: + available.append("Serper") + available.append("DuckDuckGo") # Always available + + print(f"[Search] Initialized with providers: {', '.join(available)}") def call(self, params: Union[str, dict], **kwargs) -> str: - try: - query = params["query"] - except: - return "[Search] Invalid request format: Input must be a JSON object containing 'query' field" + # Handle string input (invalid) + if isinstance(params, str): + return "[Search] Invalid request: Input must be a JSON object with 'query' field, not a string." + + # Handle None or non-dict + if not isinstance(params, dict): + return "[Search] Invalid request: Input must be a JSON object with 'query' field." + + query = params.get("query") + + # Handle missing query + if query is None: + return "[Search] Missing 'query' field in request." + # Handle single string query if isinstance(query, str): - # 单个查询 - response = self.search_with_serp(query) - else: - # 多个查询 - assert isinstance(query, List) + query = query.strip() + if not query: + return "[Search] Empty query string provided." + return multi_provider_search(query) + + # Handle list of queries + if isinstance(query, list): + if not query: + return "[Search] Empty query list provided." + + # Filter out empty strings + valid_queries = [q.strip() for q in query if isinstance(q, str) and q.strip()] + + if not valid_queries: + return "[Search] No valid queries in list (all empty or non-string)." + responses = [] - for q in query: - responses.append(self.search_with_serp(q)) - response = "\n=======\n".join(responses) + for q in valid_queries: + responses.append(multi_provider_search(q)) - return response - + return "\n=======\n".join(responses) + + # Invalid query type + return f"[Search] Invalid 'query' type: expected string or array, got {type(query).__name__}." diff --git a/requirements-local.txt b/requirements-local.txt new file mode 100644 index 00000000..7800ee0e --- /dev/null +++ b/requirements-local.txt @@ -0,0 +1,12 @@ +# DeepResearch - Minimal requirements for llama.cpp local inference +# Install with: pip install -r requirements-local.txt + +# Core dependencies +requests>=2.31.0 +python-dotenv>=1.0.0 + +# Web search (FREE, no API key needed) +duckduckgo-search>=6.0.0 + +# Optional but recommended +tqdm>=4.66.0 diff --git a/scripts/start_llama_server.sh b/scripts/start_llama_server.sh new file mode 100755 index 00000000..4b42ce9b --- /dev/null +++ b/scripts/start_llama_server.sh @@ -0,0 +1,237 @@ +#!/bin/bash +# ============================================================================= +# DeepResearch Local Server - llama.cpp with Metal Acceleration +# ============================================================================= +# +# This script starts the llama.cpp server optimized for the DeepResearch +# ReAct agent workflow on Apple Silicon. +# +# The server provides: +# - OpenAI-compatible API at http://localhost:8080/v1/chat/completions +# - Built-in Web UI at http://localhost:8080 (chat interface!) 
+# - Metal (GPU) acceleration for fast inference +# - Model loaded once and kept resident in memory +# +# Usage: +# ./scripts/start_llama_server.sh # Start with defaults +# ./scripts/start_llama_server.sh --ctx 16384 # Custom context size +# ./scripts/start_llama_server.sh --no-webui # API only, no web UI +# +# Access: +# - Web UI: http://localhost:8080 +# - API: http://localhost:8080/v1/chat/completions +# - CLI: python inference/interactive_llamacpp.py +# +# ============================================================================= + +set -e + +# Configuration +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +LLAMA_SERVER="$PROJECT_DIR/llama.cpp/build/bin/llama-server" +MODEL_PATH="$PROJECT_DIR/models/gguf/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf" + +# Default settings (optimized for Apple Silicon with 32GB RAM) +PORT=${PORT:-8080} +HOST=${HOST:-127.0.0.1} +CTX_SIZE=${CTX_SIZE:-16384} # 16K context (use --ctx 32768 for longer sessions) +GPU_LAYERS=${GPU_LAYERS:-99} # Offload all layers to Metal +THREADS=${THREADS:-8} # CPU threads for non-GPU ops +PARALLEL=${PARALLEL:-1} # Parallel request slots +BATCH_SIZE=${BATCH_SIZE:-512} # Batch size for prompt processing +WEBUI=${WEBUI:-true} # Enable web UI by default +MLOCK=${MLOCK:-false} # Don't lock model in RAM (saves memory for other apps) + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +echo -e "${BLUE}" +echo "============================================================" +echo " DeepResearch Local Server (llama.cpp + Metal)" +echo "============================================================" +echo -e "${NC}" + +# Check if llama-server exists +if [ ! -f "$LLAMA_SERVER" ]; then + echo -e "${RED}Error: llama-server not found at $LLAMA_SERVER${NC}" + echo "" + echo "Please build llama.cpp first:" + echo " cd $PROJECT_DIR/llama.cpp" + echo " cmake -B build -DLLAMA_METAL=ON -DCMAKE_BUILD_TYPE=Release" + echo " cmake --build build --config Release" + exit 1 +fi + +# Check if model exists +if [ ! 
-f "$MODEL_PATH" ]; then + echo -e "${RED}Error: Model not found at $MODEL_PATH${NC}" + echo "" + echo "Please download the model first:" + echo " cd $PROJECT_DIR/models/gguf" + echo " curl -L -C - -o Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf \\" + echo " 'https://huggingface.co/bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF/resolve/main/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf'" + exit 1 +fi + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --port) + PORT="$2" + shift 2 + ;; + --ctx) + CTX_SIZE="$2" + shift 2 + ;; + --threads) + THREADS="$2" + shift 2 + ;; + --parallel) + PARALLEL="$2" + shift 2 + ;; + --no-webui) + WEBUI=false + shift + ;; + --webui) + WEBUI=true + shift + ;; + --mlock) + MLOCK=true + shift + ;; + --low-memory) + # Low memory mode: smaller context, no mlock + CTX_SIZE=8192 + MLOCK=false + shift + ;; + -h|--help) + echo "Usage: $0 [options]" + echo "" + echo "Options:" + echo " --port N Port number (default: 8080)" + echo " --ctx N Context size (default: 16384)" + echo " --threads N CPU threads (default: 8)" + echo " --parallel N Parallel requests (default: 1)" + echo " --webui Enable web UI (default)" + echo " --no-webui Disable web UI, API only" + echo " --mlock Lock model in RAM (uses more memory but faster)" + echo " --low-memory Low memory mode: 8K context, no mlock" + echo " -h, --help Show this help" + echo "" + echo "Access points:" + echo " Web UI: http://127.0.0.1:PORT" + echo " API: http://127.0.0.1:PORT/v1/chat/completions" + exit 0 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +echo -e "${GREEN}Configuration:${NC}" +echo " Model: $(basename "$MODEL_PATH")" +echo " Size: $(du -h "$MODEL_PATH" | cut -f1)" +echo " Context: $CTX_SIZE tokens" +echo " GPU: Metal (all $GPU_LAYERS layers)" +echo " Threads: $THREADS" +echo " Parallel: $PARALLEL slots" +echo " Mlock: $MLOCK" +echo " Web UI: $WEBUI" +echo " Endpoint: http://$HOST:$PORT" +echo "" + +# Check for existing server on port +if lsof -i :$PORT > /dev/null 2>&1; then + echo -e "${YELLOW}Warning: Port $PORT is already in use.${NC}" + echo "Existing process:" + lsof -i :$PORT | head -2 + echo "" + read -p "Kill existing process and continue? (y/N) " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + lsof -t -i :$PORT | xargs kill -9 2>/dev/null || true + sleep 1 + else + echo "Aborting." 
+ exit 1 + fi +fi + +echo -e "${YELLOW}Starting server...${NC}" +echo "(Model loading takes ~30-60 seconds)" +echo "" + +# Build command arguments +SERVER_ARGS=( + --model "$MODEL_PATH" + --host "$HOST" + --port "$PORT" + --ctx-size "$CTX_SIZE" + --n-gpu-layers "$GPU_LAYERS" + --threads "$THREADS" + --parallel "$PARALLEL" + --batch-size "$BATCH_SIZE" + --flash-attn auto + --metrics + --log-disable + --alias deepresearch +) +# Note: --jinja is enabled by default in recent llama.cpp versions + +# Add mlock if requested (uses more memory but may be faster) +if [ "$MLOCK" = "true" ]; then + SERVER_ARGS+=(--mlock) +fi + +# Add no-webui flag if requested +if [ "$WEBUI" = "false" ]; then + SERVER_ARGS+=(--no-webui) +fi + +# Start the server with optimized settings for DeepResearch +exec "$LLAMA_SERVER" "${SERVER_ARGS[@]}" 2>&1 | while read -r line; do + # Colorize output + if [[ $line == *"error"* ]] || [[ $line == *"Error"* ]]; then + echo -e "${RED}$line${NC}" + elif [[ $line == *"listening"* ]] || [[ $line == *"ready"* ]]; then + echo -e "${GREEN}$line${NC}" + echo "" + echo -e "${GREEN}============================================================${NC}" + echo -e "${GREEN} Server ready!${NC}" + echo -e "${GREEN}============================================================${NC}" + if [ "$WEBUI" = "true" ]; then + echo "" + echo -e "${GREEN} Web UI: http://$HOST:$PORT${NC}" + echo " Open in your browser for a chat interface!" + fi + echo "" + echo -e "${GREEN} API: http://$HOST:$PORT/v1/chat/completions${NC}" + echo "" + echo "Run DeepResearch CLI:" + echo " python inference/interactive_llamacpp.py" + echo "" + echo "Test API:" + echo " curl http://$HOST:$PORT/v1/chat/completions \\" + echo " -H 'Content-Type: application/json' \\" + echo " -d '{\"model\": \"deepresearch\", \"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}]}'" + echo "" + echo "Press Ctrl+C to stop the server." + elif [[ $line == *"warning"* ]] || [[ $line == *"Warning"* ]]; then + echo -e "${YELLOW}$line${NC}" + else + echo "$line" + fi +done