diff --git a/.env.example b/.env.example index 8558e9c4..c53f27cb 100644 --- a/.env.example +++ b/.env.example @@ -46,10 +46,24 @@ MAX_WORKERS=30 # API Keys and External Services # ============================================================================= -# Serper API for web search and Google Scholar +# Web Search Providers (in order of quality/preference) +# The system will try each provider in order until one succeeds. +# You only need ONE provider configured, but having multiple provides fallback. + +# Exa.ai - Best semantic/neural search ($10 free credits) +# Get your key from: https://exa.ai/ +EXA_API_KEY=your_key + +# Tavily - Purpose-built for RAG/LLMs (1,000 free requests/month) +# Get your key from: https://tavily.com/ +TAVILY_API_KEY=your_key + +# Serper API for Google search results (2,500 free queries) # Get your key from: https://serper.dev/ SERPER_KEY_ID=your_key +# DuckDuckGo is always available as final fallback (FREE, no API key needed) + # Jina API for web page reading # Get your key from: https://jina.ai/ JINA_API_KEYS=your_key @@ -95,4 +109,17 @@ IDP_KEY_SECRET=your_idp_key_secret # These are typically set by distributed training frameworks # WORLD_SIZE=1 -# RANK=0 \ No newline at end of file +# RANK=0 + +# ============================================================================= +# llama.cpp Local Inference (Alternative for Mac/Local Users) +# ============================================================================= +# If using the llama.cpp local inference option instead of vLLM: + +# The llama.cpp server URL (default works if using start_llama_server.sh) +LLAMA_SERVER_URL=http://127.0.0.1:8080 + +# For llama.cpp mode: +# - Web search uses DuckDuckGo by default (FREE, no API key needed) +# - JINA_API_KEYS is optional but recommended for better page reading +# - See: python inference/interactive_llamacpp.py --help \ No newline at end of file diff --git a/README.md b/README.md index 6a147f47..554bc0a8 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,55 @@ You need to modify the following in the file [inference/react_agent.py](https:// - Change the model name to alibaba/tongyi-deepresearch-30b-a3b. - Adjust the content concatenation way as described in the comments on lines **88–90.** + +--- + +### 7. Local Inference with llama.cpp (Optional) + +> **For Mac users or anyone who wants 100% local inference without vLLM/CUDA dependencies.** + +This repo includes support for running DeepResearch locally using [llama.cpp](https://github.com/ggerganov/llama.cpp) with Metal (Apple Silicon) or CUDA acceleration. Zero API costs, full privacy. + +#### Requirements + +- llama.cpp built with Metal or CUDA support +- GGUF model: [bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF](https://huggingface.co/bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF) +- 32GB+ RAM (for Q4_K_M quantization) + +#### Quick Start + +```bash +# Install minimal dependencies +pip install -r requirements-local.txt + +# Build llama.cpp (Mac with Metal) +cd llama.cpp +cmake -B build -DLLAMA_METAL=ON -DCMAKE_BUILD_TYPE=Release +cmake --build build --config Release +cd .. 
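+
+# Optional sanity check - confirm the build produced the server binary
+./llama.cpp/build/bin/llama-server --version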
+ +# Download model (~18GB) +mkdir -p models/gguf +curl -L -o models/gguf/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf \ + 'https://huggingface.co/bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF/resolve/main/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf' + +# Terminal 1: Start the server +./scripts/start_llama_server.sh + +# Terminal 2: Run research queries +python inference/interactive_llamacpp.py +``` + +The llama.cpp server provides both an API and a web UI at http://localhost:8080. + +#### Features + +- **Free web search**: Uses DuckDuckGo (no API key required) +- **Page visiting**: Uses Jina Reader (optional API key for better results) +- **Loop detection**: Prevents infinite tool call cycles +- **32K context**: Long research sessions supported + +--- ## Benchmark Evaluation We provide benchmark evaluation scripts for various datasets. Please refer to the [evaluation scripts](./evaluation/) directory for more details. diff --git a/inference/interactive_llamacpp.py b/inference/interactive_llamacpp.py new file mode 100644 index 00000000..b12fcb53 --- /dev/null +++ b/inference/interactive_llamacpp.py @@ -0,0 +1,871 @@ +#!/usr/bin/env python3 +""" +DeepResearch Interactive CLI - llama.cpp Server Edition +========================================================= + +A powerful local research assistant that runs on YOUR machine. +Zero API costs. Full privacy. Complete control. + +This script connects to a local llama.cpp server running the +Tongyi-DeepResearch model and provides a full ReAct agent loop +with web search and page visiting capabilities. + +Architecture: + +------------------+ HTTP +-------------------+ + | This Script | ------------> | llama.cpp | + | (Agent Logic) | | Server | + | - Tool calls | <------------ | (Model loaded) | + | - Web search | JSON | - Metal GPU | + | - Page visits | | - 32K context | + +------------------+ +-------------------+ + +Search Providers (in order of quality): + 1. Exa.ai - Best semantic/neural search + 2. Tavily - Purpose-built for RAG/LLMs + 3. Serper - Google SERP results + 4. 
DuckDuckGo - Free fallback (no API key needed)
+
+Usage:
+    # Terminal 1: Start the server (one-time, stays running)
+    ./scripts/start_llama_server.sh
+
+    # Terminal 2: Run research queries
+    python inference/interactive_llamacpp.py
+
+Requirements:
+    pip install requests duckduckgo-search python-dotenv
+"""
+
+import argparse
+import json
+import os
+import re
+import sys
+import time
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+from urllib.parse import urlparse
+
+import requests
+
+# Load environment variables
+try:
+    from dotenv import load_dotenv
+    load_dotenv(os.path.join(os.path.dirname(__file__), "..", ".env"))
+except ImportError:
+    pass
+
+# =============================================================================
+# Configuration
+# =============================================================================
+
+LLAMA_SERVER_URL = os.environ.get("LLAMA_SERVER_URL", "http://127.0.0.1:8080")
+JINA_API_KEY = os.environ.get("JINA_API_KEYS", "") or os.environ.get("JINA_API_KEY", "")
+
+# Search API keys
+EXA_API_KEY = os.environ.get("EXA_API_KEY", "")
+TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "")
+SERPER_KEY = os.environ.get("SERPER_KEY_ID", "")
+
+MAX_ROUNDS = 10
+MAX_TOKENS = 4096
+TEMPERATURE = 0.7
+TOP_P = 0.95
+REQUEST_TIMEOUT = 300  # 5 minutes for long generations
+
+# Stop sequences for the ReAct loop
+STOP_SEQUENCES = [
+    "<tool_response>",
+    "\n<tool_response>",
+]
+
+
+# =============================================================================
+# System Prompt - Optimized for DeepResearch ReAct Agent
+# =============================================================================
+
+def get_system_prompt() -> str:
+    return f"""You are a deep research assistant. Your task is to answer questions by searching the web and synthesizing information from credible sources.
+
+# CRITICAL RULES
+
+1. **Think deeply**: Use <think> tags to reason about what you know and what you need to find
+2. **Search strategically**: Use multiple targeted searches to gather comprehensive information
+3. **Verify information**: Cross-reference facts across multiple sources
+4. **Synthesize thoroughly**: Combine information from multiple sources into a coherent answer
+5. **NEVER visit the same URL twice**: Each URL can only be visited once
+6. **Always conclude**: After gathering sufficient info (typically 5-15 sources), provide your answer in <answer> tags
+7. **Be efficient**: Aim to answer within the 10-round limit
+
+# Response Format
+
+When you need to search, respond with:
+<think>What I need to find and why</think>
+<tool_call>
+{{"name": "search", "arguments": {{"query": ["your search query"]}}}}
+</tool_call>
+
+When you need to visit a page for details:
+<think>Why I need to visit this specific page</think>
+<tool_call>
+{{"name": "visit", "arguments": {{"url": "https://example.com", "goal": "what specific info you need"}}}}
+</tool_call>
+
+When you have enough information, respond with:
+<think>Summary of what I found and my analysis</think>
+<answer>Your comprehensive, well-researched answer with citations where appropriate</answer>
+
+# Tools
+
+<tools>
+{{"type": "function", "function": {{"name": "search", "description": "Web search. Returns titles, URLs, and snippets.", "parameters": {{"type": "object", "properties": {{"query": {{"type": "array", "items": {{"type": "string"}}, "description": "1-3 search queries"}}}}, "required": ["query"]}}}}}}
+{{"type": "function", "function": {{"name": "visit", "description": "Visit a URL to get full page content. Each URL can only be visited ONCE.", "parameters": {{"type": "object", "properties": {{"url": {{"type": "string", "description": "URL to visit"}}, "goal": {{"type": "string", "description": "What info you need"}}}}, "required": ["url", "goal"]}}}}}}
+</tools>
+
+# Important Notes
+- The visit tool returns the COMPLETE page content in one response
+- After 8-10 successful source visits, you likely have enough information to answer
+- Prefer quality over quantity - don't just collect sources, synthesize them
+
+Current date: {datetime.now().strftime("%Y-%m-%d")}"""
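+
+
+# For reference, a single well-formed model turn under the format above looks
+# like this (illustrative content only, not emitted by this module):
+#
+#   <think>I need primary sources on X; start with two broad searches.</think>
+#   <tool_call>
+#   {"name": "search", "arguments": {"query": ["X overview", "X survey 2024"]}}
+#   </tool_call>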
+
+
+# =============================================================================
+# Search Providers
+# =============================================================================
+
+def contains_chinese(text: str) -> bool:
+    """Check if text contains Chinese characters."""
+    if not text:
+        return False
+    return any("\u4E00" <= char <= "\u9FFF" for char in text)
+
+
+def sanitize_query(query: str) -> str:
+    """Sanitize and validate a search query."""
+    if not query:
+        return ""
+    return query.strip()[:500]
+
+
+def search_exa(query: str, num_results: int = 10) -> Optional[str]:
+    """
+    Exa.ai - Neural/semantic search engine.
+    API Docs: https://docs.exa.ai/reference/search
+    """
+    if not EXA_API_KEY:
+        return None
+
+    query = sanitize_query(query)
+    if not query:
+        return None
+
+    try:
+        response = requests.post(
+            "https://api.exa.ai/search",
+            headers={
+                "x-api-key": EXA_API_KEY,
+                "Content-Type": "application/json",
+            },
+            json={
+                "query": query,
+                "numResults": min(num_results, 100),
+                "type": "auto",  # Let Exa choose best search type
+            },
+            timeout=30,
+        )
+
+        if response.status_code == 401:
+            print("[Exa] Invalid or expired API key")
+            return None
+        if response.status_code == 429:
+            print("[Exa] Rate limited")
+            return None
+        if response.status_code == 402:
+            print("[Exa] Payment required - credits exhausted")
+            return None
+        if response.status_code != 200:
+            return None
+
+        data = response.json()
+        results = data.get("results", [])
+        if not results:
+            return None
+
+        output = [f"\n## Search: '{query}'\n"]
+        for idx, r in enumerate(results, 1):
+            title = r.get("title") or "No title"
+            url = r.get("url", "")
+            text = r.get("text", "")[:300] if r.get("text") else ""
+            output.append(f"{idx}. [{title}]({url})")
+            if text:
+                output.append(f"   {text}...")
+
+        return "\n".join(output)
+    except requests.Timeout:
+        print("[Exa] Request timeout")
+        return None
+    except requests.ConnectionError:
+        print("[Exa] Connection error")
+        return None
+    except Exception as e:
+        print(f"[Exa] Error: {e}")
+        return None
+
+
+def search_tavily(query: str, num_results: int = 10) -> Optional[str]:
+    """
+    Tavily - Search API for RAG/LLM applications.
+ API Docs: https://docs.tavily.com/documentation/api-reference/endpoint/search + """ + if not TAVILY_API_KEY: + return None + + query = sanitize_query(query) + if not query: + return None + + try: + # Use Bearer token auth (preferred over api_key in body) + response = requests.post( + "https://api.tavily.com/search", + headers={ + "Authorization": f"Bearer {TAVILY_API_KEY}", + "Content-Type": "application/json", + }, + json={ + "query": query, + "max_results": min(num_results, 20), + "search_depth": "basic", # Use basic (1 credit) vs advanced (2 credits) + "include_answer": False, + "include_raw_content": False, + }, + timeout=30, + ) + + if response.status_code == 401: + print("[Tavily] Invalid or expired API key") + return None + if response.status_code == 429: + print("[Tavily] Rate limited") + return None + if response.status_code == 432: + print("[Tavily] Plan limit exceeded") + return None + if response.status_code == 433: + print("[Tavily] Pay-as-you-go limit exceeded") + return None + if response.status_code != 200: + return None + + data = response.json() + results = data.get("results", []) + if not results: + return None + + output = [f"\n## Search: '{query}'\n"] + for idx, r in enumerate(results, 1): + title = r.get("title") or "No title" + url = r.get("url", "") + content = r.get("content", "")[:300] + output.append(f"{idx}. [{title}]({url})") + if content: + output.append(f" {content}...") + + return "\n".join(output) + except requests.Timeout: + print("[Tavily] Request timeout") + return None + except requests.ConnectionError: + print("[Tavily] Connection error") + return None + except Exception as e: + print(f"[Tavily] Error: {e}") + return None + + +def search_serper(query: str, num_results: int = 10) -> Optional[str]: + """ + Serper - Google Search API. + API Docs: https://serper.dev/ + """ + if not SERPER_KEY: + return None + + query = sanitize_query(query) + if not query: + return None + + try: + if contains_chinese(query): + payload = {"q": query, "gl": "cn", "hl": "zh-cn", "num": min(num_results, 100)} + else: + payload = {"q": query, "gl": "us", "hl": "en", "num": min(num_results, 100)} + + response = requests.post( + "https://google.serper.dev/search", + headers={ + "X-API-KEY": SERPER_KEY, + "Content-Type": "application/json", + }, + json=payload, + timeout=30, + ) + + if response.status_code == 401: + print("[Serper] Invalid API key") + return None + if response.status_code == 429: + print("[Serper] Rate limited") + return None + if response.status_code != 200: + return None + + data = response.json() + if "error" in data: + print(f"[Serper] API error: {data['error']}") + return None + + organic = data.get("organic", []) + if not organic: + return None + + output = [f"\n## Search: '{query}'\n"] + for idx, page in enumerate(organic, 1): + title = page.get("title") or "No title" + url = page.get("link", "") + snippet = page.get("snippet", "")[:300].replace("Your browser can't play this video.", "").strip() + output.append(f"{idx}. 
[{title}]({url})") + if snippet: + output.append(f" {snippet}...") + + return "\n".join(output) + except requests.Timeout: + print("[Serper] Request timeout") + return None + except requests.ConnectionError: + print("[Serper] Connection error") + return None + except Exception as e: + print(f"[Serper] Error: {e}") + return None + + +def search_duckduckgo(query: str, num_results: int = 10) -> Optional[str]: + """DuckDuckGo - Free search, no API key required.""" + try: + from duckduckgo_search import DDGS + from duckduckgo_search.exceptions import RatelimitException, DuckDuckGoSearchException + except ImportError: + print("[DuckDuckGo] duckduckgo-search package not installed") + return None + + query = sanitize_query(query) + if not query: + return None + + retries = 3 + for attempt in range(retries): + try: + with DDGS() as ddgs: + results = list(ddgs.text(query, max_results=min(num_results, 25))) + + if not results: + return None + + output = [f"\n## Search: '{query}'\n"] + for idx, r in enumerate(results, 1): + title = r.get("title") or "No title" + url = r.get("href") or r.get("link", "") + body = r.get("body", "")[:300] + output.append(f"{idx}. [{title}]({url})") + if body: + output.append(f" {body}...") + + return "\n".join(output) + except RatelimitException: + if attempt < retries - 1: + wait = 2 ** attempt + print(f"[DuckDuckGo] Rate limited, waiting {wait}s...") + time.sleep(wait) + continue + print("[DuckDuckGo] Rate limited after all retries") + return None + except DuckDuckGoSearchException as e: + print(f"[DuckDuckGo] Search error: {e}") + return None + except Exception as e: + print(f"[DuckDuckGo] Error: {e}") + return None + + return None + + +def multi_provider_search(queries: list, num_results: int = 10) -> str: + """Search using multiple providers with automatic fallback.""" + providers = [ + ("Exa", search_exa), + ("Tavily", search_tavily), + ("Serper", search_serper), + ("DuckDuckGo", search_duckduckgo), + ] + + all_results = [] + + for query in queries[:3]: + result = None + for name, search_fn in providers: + result = search_fn(query, num_results) + if result: + break + + if result: + all_results.append(result) + else: + all_results.append(f"\n## Search: '{query}'\n[No results found]") + + return "\n".join(all_results) if all_results else "No results found" + + +# ============================================================================= +# Page Visitor +# ============================================================================= + +def is_valid_url(url: str) -> bool: + """Check if URL is valid and uses http/https scheme.""" + try: + result = urlparse(url) + return all([result.scheme in ("http", "https"), result.netloc]) + except Exception: + return False + + +def visit_page(url: str, goal: str) -> str: + """Fetch webpage content using Jina Reader or direct fetch.""" + if isinstance(url, list): + url = url[0] if url else "" + + if not url: + return "[Visit Error] No URL provided" + + if not is_valid_url(url): + return f"[Visit Error] Invalid URL: {url}. Must be a valid http/https URL." 
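+
+    # Fetch strategy: try Jina Reader first (it renders JS-heavy pages and
+    # returns clean text), then fall back to a direct GET with crude HTML
+    # tag stripping if Jina fails or returns too little content.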
+
+    # Try Jina Reader first
+    try:
+        headers = {"Accept": "text/plain"}
+        if JINA_API_KEY:
+            headers["Authorization"] = f"Bearer {JINA_API_KEY}"
+
+        jina_url = f"https://r.jina.ai/{url}"
+        response = requests.get(jina_url, headers=headers, timeout=30, allow_redirects=True)
+
+        if response.status_code == 200 and len(response.text) > 100:
+            content = response.text[:12000]
+            return f"**Content from {url}** (goal: {goal}):\n\n{content}"
+    except Exception:
+        pass
+
+    # Fallback to direct fetch
+    try:
+        headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"}
+        response = requests.get(url, headers=headers, timeout=15)
+        response.raise_for_status()
+
+        text = response.text
+        text = re.sub(r"<script[^>]*>.*?</script>", "", text, flags=re.DOTALL | re.IGNORECASE)
+        text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
+        text = re.sub(r"<[^>]+>", " ", text)
+        text = re.sub(r"\s+", " ", text).strip()
+
+        if len(text) > 100:
+            return f"**Content from {url}** (goal: {goal}):\n\n{text[:12000]}"
+        return f"[Visit Error] Page content too short or blocked: {url}"
+    except requests.Timeout:
+        return f"[Visit Error] Timeout fetching {url}"
+    except Exception as e:
+        return f"[Visit Error] Could not fetch {url}: {type(e).__name__}: {e}"
+
+
+# =============================================================================
+# llama.cpp Server Client
+# =============================================================================
+
+class LlamaCppClient:
+    """Client for the llama.cpp OpenAI-compatible API."""
+
+    def __init__(self, base_url: str = LLAMA_SERVER_URL):
+        self.base_url = base_url.rstrip("/")
+        self.api_url = f"{self.base_url}/v1/chat/completions"
+        self.session = requests.Session()
+
+    def check_server(self) -> bool:
+        """Check if the server is running and responsive."""
+        try:
+            response = self.session.get(f"{self.base_url}/health", timeout=5)
+            return response.status_code == 200
+        except Exception:
+            return False
+
+    def generate(
+        self,
+        messages: List[Dict[str, str]],
+        max_tokens: int = MAX_TOKENS,
+        temperature: float = TEMPERATURE,
+        top_p: float = TOP_P,
+        stop: Optional[List[str]] = None,
+    ) -> str:
+        """Generate a response from the llama.cpp server."""
+        payload = {
+            "messages": messages,
+            "max_tokens": max_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "stop": stop or STOP_SEQUENCES,
+            "stream": False,
+        }
+
+        try:
+            response = self.session.post(
+                self.api_url,
+                json=payload,
+                headers={"Content-Type": "application/json"},
+                timeout=REQUEST_TIMEOUT,  # 300 s; matches the module-level constant
+            )
+
+            if response.status_code != 200:
+                error_text = response.text[:500]
+                return f"[Server Error] Status {response.status_code}: {error_text}"
+
+            data = response.json()
+            content = data["choices"][0]["message"]["content"]
+            return content.strip()
+
+        except requests.Timeout:
+            return "[Error] Request timed out. The model may be processing a complex query."
+        except requests.ConnectionError:
+            return "[Error] Cannot connect to llama.cpp server. Is it running?"
+        except Exception as e:
+            return f"[Error] API call failed: {e}"
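+
+# Minimal usage sketch (assumes a llama.cpp server is already listening at
+# LLAMA_SERVER_URL, e.g. started via scripts/start_llama_server.sh):
+#
+#     client = LlamaCppClient()
+#     if client.check_server():
+#         print(client.generate([{"role": "user", "content": "Hello"}]))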
+
+
+# =============================================================================
+# Research Agent
+# =============================================================================
+
+def research(
+    client: LlamaCppClient,
+    question: str,
+    verbose: bool = True,
+    max_rounds: int = MAX_ROUNDS,
+    temperature: float = TEMPERATURE,
+) -> dict:
+    """Run the research agent loop."""
+    if verbose:
+        print(f"\n[*] Researching: {question}\n")
+        print("-" * 60)
+
+    system_prompt = get_system_prompt()
+    messages: List[Dict[str, str]] = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": question},
+    ]
+
+    sources: List[Dict[str, Any]] = []
+    thinking: List[str] = []
+    visited_urls: set = set()
+    consecutive_errors = 0
+    max_consecutive_errors = 3
+    start_time = time.time()
+
+    for round_num in range(max_rounds):
+        if verbose:
+            print(f"\n[Round {round_num + 1}/{max_rounds}]")
+
+        gen_start = time.time()
+        content = client.generate(messages, temperature=temperature)
+        gen_time = time.time() - gen_start
+
+        if content.startswith("[Error]") or content.startswith("[Server Error]"):
+            if verbose:
+                print(f"  Error: {content}")
+            break
+
+        if verbose:
+            print(f"  Generated in {gen_time:.1f}s")
+
+        messages.append({"role": "assistant", "content": content})
+
+        # Extract and display thinking
+        if "<think>" in content and "</think>" in content:
+            think_content = content.split("<think>")[1].split("</think>")[0].strip()
+            thinking.append(think_content)
+            if verbose:
+                preview = think_content[:200] + "..." if len(think_content) > 200 else think_content
+                print(f"  Thinking: {preview}")
+
+        # Check for final answer
+        if "<answer>" in content:
+            if "</answer>" in content:
+                answer = content.split("<answer>")[1].split("</answer>")[0]
+            else:
+                answer = content.split("<answer>")[1]
+
+            elapsed = time.time() - start_time
+
+            if verbose:
+                print("\n" + "=" * 60)
+                print("ANSWER:")
+                print("=" * 60)
+                print(answer.strip())
+                print("=" * 60)
+                print(f"\nStats: {round_num + 1} rounds, {len(sources)} sources, {elapsed:.1f}s")
+
+            return {
+                "answer": answer.strip(),
+                "sources": sources,
+                "rounds": round_num + 1,
+                "thinking": thinking,
+                "elapsed_seconds": elapsed,
+            }
+
+        # Handle tool calls
+        if "<tool_call>" in content and "</tool_call>" in content:
+            try:
+                tool_json = content.split("<tool_call>")[1].split("</tool_call>")[0]
+                tool = json.loads(tool_json.strip())
+                name = tool.get("name", "")
+                args = tool.get("arguments", {})
+
+                if verbose:
+                    print(f"  Tool: {name}")
+
+                tool_error = False
+
+                if name == "search":
+                    queries = args.get("query", [question])
+                    if isinstance(queries, str):
+                        queries = [queries]
+                    if verbose:
+                        print(f"  Queries: {queries}")
+                    tool_result = multi_provider_search(queries)
+                    if "[No results" in tool_result or "error" in tool_result.lower():
+                        tool_error = True
+                    else:
+                        sources.append({"type": "search", "queries": queries})
+
+                elif name == "visit":
+                    url = args.get("url", "")
+                    if isinstance(url, list):
+                        url = url[0] if url else ""
+                    goal = args.get("goal", "extract information")
+
+                    if url in visited_urls:
+                        if verbose:
+                            print(f"  [!] Already visited: {url}")
+                        tool_result = f"[Already Visited] You already visited {url}. Use the information from your previous visit or try a different source."
+                        tool_error = True
+                    else:
+                        visited_urls.add(url)
+                        if verbose:
+                            print(f"  Visiting: {url[:60]}...")
+                        tool_result = visit_page(url, goal)
+                        if "[Visit Error]" in tool_result:
+                            tool_error = True
+                        else:
+                            sources.append({"type": "visit", "url": url})
+
+                else:
+                    tool_result = f"Unknown tool: {name}. Available tools: search, visit"
+                    tool_error = True
+
+                # Track consecutive errors for loop detection
+                if tool_error:
+                    consecutive_errors += 1
+                    if consecutive_errors >= max_consecutive_errors:
+                        if verbose:
+                            print(f"\n[!] {max_consecutive_errors} consecutive tool errors detected.")
+                        messages.append({
+                            "role": "user",
+                            "content": f"<tool_response>\n{tool_result}\n\n[System Notice] You have encountered {consecutive_errors} consecutive errors. Please provide your best answer now based on information gathered so far, or try a completely different approach.\n</tool_response>",
+                        })
+                        continue
+                else:
+                    consecutive_errors = 0
+
+                # Inject tool response
+                messages.append({
+                    "role": "user",
+                    "content": f"<tool_response>\n{tool_result}\n</tool_response>",
+                })
+
+            except json.JSONDecodeError as e:
+                consecutive_errors += 1
+                messages.append({
+                    "role": "user",
+                    "content": f"<tool_response>\nError: Invalid JSON in tool call: {e}\n</tool_response>",
+                })
+            except Exception as e:
+                consecutive_errors += 1
+                messages.append({
+                    "role": "user",
+                    "content": f"<tool_response>\nTool error: {e}\n</tool_response>",
+                })
+
+    # Force final answer after max rounds
+    if verbose:
+        print("\n[!] Max rounds reached, requesting final answer...")
+
+    messages.append({
+        "role": "user",
+        "content": "You have reached the maximum number of research rounds. Please provide your final answer now based on all the information gathered. Use <answer> tags.",
+    })
+
+    content = client.generate(messages, max_tokens=2048, temperature=temperature)
+
+    if "<answer>" in content:
+        if "</answer>" in content:
+            answer = content.split("<answer>")[1].split("</answer>")[0]
+        else:
+            answer = content.split("<answer>")[1]
+    else:
+        answer = content
+
+    elapsed = time.time() - start_time
+
+    if verbose:
+        print("\n" + "=" * 60)
+        print("ANSWER:")
+        print("=" * 60)
+        print(answer.strip())
+        print("=" * 60)
+        print(f"\nStats: {max_rounds} rounds (max), {len(sources)} sources, {elapsed:.1f}s")
+
+    return {
+        "answer": answer.strip(),
+        "sources": sources,
+        "rounds": max_rounds,
+        "thinking": thinking,
+        "elapsed_seconds": elapsed,
+    }
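+
+# Example (sketch): a one-shot programmatic research call, assuming a running
+# server; `research` returns a dict with "answer", "sources", "rounds",
+# "thinking", and "elapsed_seconds":
+#
+#     result = research(LlamaCppClient(), "Who maintains llama.cpp?")
+#     print(result["answer"])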
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+def get_available_providers() -> List[str]:
+    """Return list of available search providers."""
+    providers = []
+    if EXA_API_KEY:
+        providers.append("Exa")
+    if TAVILY_API_KEY:
+        providers.append("Tavily")
+    if SERPER_KEY:
+        providers.append("Serper")
+    providers.append("DuckDuckGo")
+    return providers
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="DeepResearch Interactive CLI - llama.cpp Server Edition",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Start the server first (in another terminal):
+  ./scripts/start_llama_server.sh
+
+  # Run interactive mode:
+  python inference/interactive_llamacpp.py
+
+  # Single query mode:
+  python inference/interactive_llamacpp.py --query "What is quantum entanglement?"
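+
+  # Tune rounds and sampling (example values; flags defined below):
+  python inference/interactive_llamacpp.py -q "history of RISC-V" --max-rounds 15 --temperature 0.5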
+ + # Connect to a different server: + python inference/interactive_llamacpp.py --server http://192.168.1.100:8080 +""", + ) + parser.add_argument( + "--server", + type=str, + default=LLAMA_SERVER_URL, + help="llama.cpp server URL (default: http://127.0.0.1:8080)", + ) + parser.add_argument( + "--query", "-q", + type=str, + default=None, + help="Single query mode - run one research query and exit", + ) + parser.add_argument( + "--max-rounds", + type=int, + default=MAX_ROUNDS, + help=f"Maximum research rounds (default: {MAX_ROUNDS})", + ) + parser.add_argument( + "--temperature", + type=float, + default=TEMPERATURE, + help=f"Sampling temperature (default: {TEMPERATURE})", + ) + args = parser.parse_args() + + providers = get_available_providers() + + print("\n" + "=" * 60) + print("DeepResearch - Interactive CLI") + print("llama.cpp Server Edition (100% Local)") + print("=" * 60) + print(f"Server: {args.server}") + print(f"Search: {', '.join(providers)}") + print(f"Reader: Jina.ai {'[configured]' if JINA_API_KEY else '[free tier]'}") + print("=" * 60) + + # Initialize client + client = LlamaCppClient(base_url=args.server) + + # Check server connection + print("\nConnecting to llama.cpp server...", end=" ") + if not client.check_server(): + print("FAILED") + print(f"\nError: Cannot connect to llama.cpp server at {args.server}") + print("\nPlease start the server first:") + print(" ./scripts/start_llama_server.sh") + print("\nOr specify a different server URL:") + print(" python inference/interactive_llamacpp.py --server http://your-server:8080") + sys.exit(1) + print("OK") + + # Single query mode + if args.query: + research(client, args.query, max_rounds=args.max_rounds, temperature=args.temperature) + return + + # Interactive mode + print("\nType your research question (or 'quit' to exit):\n") + + while True: + try: + question = input("Question: ").strip() + + if not question: + continue + + if question.lower() in ("quit", "exit", "q"): + print("\nGoodbye!") + break + + research(client, question, max_rounds=args.max_rounds, temperature=args.temperature) + print("\n" + "-" * 60 + "\n") + + except KeyboardInterrupt: + print("\n\nGoodbye!") + break + except Exception as e: + print(f"\nError: {e}\n") + + +if __name__ == "__main__": + main() diff --git a/inference/tool_search.py b/inference/tool_search.py index 1a3f7b53..15f21063 100644 --- a/inference/tool_search.py +++ b/inference/tool_search.py @@ -1,131 +1,538 @@ +""" +Multi-Provider Web Search Tool +============================== + +Implements a robust search fallback chain optimized for AI research: + 1. Exa.ai - Best semantic/neural search ($10 free credits) + 2. Tavily - Purpose-built for RAG/LLMs (1,000 free requests/month) + 3. Serper - Google SERP results (2,500 free queries) + 4. DuckDuckGo - Free forever, final fallback (no API key needed) + +Each provider is tried in order. If one fails (rate limit, error, no key), +the next provider is attempted automatically. + +Environment Variables: + EXA_API_KEY - Exa.ai API key (https://exa.ai/) + TAVILY_API_KEY - Tavily API key (https://tavily.com/) + SERPER_KEY_ID - Serper API key (https://serper.dev/) + +If no API keys are set, DuckDuckGo is used as the default (free, no key needed). 
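+
+Example (illustrative sketch):
+    from tool_search import Search
+    tool = Search()
+    print(tool.call({"query": ["llama.cpp quantization formats"]}))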
+""" + +import http.client import json -from concurrent.futures import ThreadPoolExecutor -from typing import List, Union +import os +import time +from typing import List, Optional, Union + import requests from qwen_agent.tools.base import BaseTool, register_tool -import asyncio -from typing import Dict, List, Optional, Union -import uuid -import http.client -import json -import os +# API Keys from environment +EXA_API_KEY = os.environ.get("EXA_API_KEY", "").strip() +TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY", "").strip() +SERPER_KEY = os.environ.get("SERPER_KEY_ID", "").strip() -SERPER_KEY=os.environ.get('SERPER_KEY_ID') +# Request timeouts (seconds) +REQUEST_TIMEOUT = 30 -@register_tool("search", allow_overwrite=True) -class Search(BaseTool): - name = "search" - description = "Performs batched web searches: supply an array 'query'; the tool retrieves the top 10 results for each query in one call." - parameters = { - "type": "object", - "properties": { - "query": { - "type": "array", - "items": { - "type": "string" - }, - "description": "Array of query strings. Include multiple complementary search queries in a single call." +def contains_chinese(text: str) -> bool: + """Check if text contains Chinese characters.""" + if not text: + return False + return any("\u4E00" <= char <= "\u9FFF" for char in text) + + +def sanitize_query(query: str) -> str: + """Sanitize and validate a search query.""" + if not query: + return "" + # Strip whitespace and limit length + query = query.strip()[:500] + return query + + +def format_results(query: str, results: List[dict], provider: str) -> str: + """Format search results into a consistent markdown format.""" + if not results: + return "" + + snippets = [] + for idx, r in enumerate(results, 1): + title = r.get("title", "No title") + url = r.get("url", "") + snippet = r.get("snippet", "") + date = r.get("date", "") + + # Build result entry + entry = f"{idx}. [{title}]({url})" + if date: + entry += f"\nDate: {date}" + if snippet: + entry += f"\n{snippet}" + snippets.append(entry) + + header = f"A search for '{query}' found {len(snippets)} results:\n\n## Web Results\n\n" + return header + "\n\n".join(snippets) + + +# ============================================================================= +# Search Providers +# ============================================================================= + +def search_exa(query: str, num_results: int = 10) -> Optional[str]: + """ + Exa.ai - Neural/semantic search engine. + Best for finding conceptually relevant results, not just keyword matches. 
+ + API Docs: https://docs.exa.ai/reference/search + """ + if not EXA_API_KEY: + return None + + query = sanitize_query(query) + if not query: + return None + + try: + response = requests.post( + "https://api.exa.ai/search", + headers={ + "x-api-key": EXA_API_KEY, + "Content-Type": "application/json", }, - }, - "required": ["query"], - } + json={ + "query": query, + "numResults": min(num_results, 100), # API max is 100 + "type": "auto", # Let Exa choose best search type + }, + timeout=REQUEST_TIMEOUT, + ) + + # Handle error responses + if response.status_code == 401: + print("[Exa] Invalid or expired API key") + return None + if response.status_code == 429: + print("[Exa] Rate limited - too many requests") + return None + if response.status_code == 402: + print("[Exa] Payment required - credits exhausted") + return None + if response.status_code != 200: + error_msg = response.text[:200] if response.text else "Unknown error" + print(f"[Exa] Error {response.status_code}: {error_msg}") + return None + + data = response.json() + api_results = data.get("results", []) + + if not api_results: + return None + + # Normalize results + results = [] + for r in api_results: + title = r.get("title") or "No title" + url = r.get("url", "") + text = r.get("text", "") + published = r.get("publishedDate", "") + + # Truncate text for snippet + snippet = text[:300] + "..." if len(text) > 300 else text + date = published[:10] if published else "" + + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": date, + }) + + return format_results(query, results, "Exa") + + except requests.Timeout: + print("[Exa] Request timeout") + return None + except requests.ConnectionError: + print("[Exa] Connection error - network issue") + return None + except json.JSONDecodeError: + print("[Exa] Invalid JSON response") + return None + except Exception as e: + print(f"[Exa] Unexpected error: {type(e).__name__}: {e}") + return None - def __init__(self, cfg: Optional[dict] = None): - super().__init__(cfg) - def google_search_with_serp(self, query: str): - def contains_chinese_basic(text: str) -> bool: - return any('\u4E00' <= char <= '\u9FFF' for char in text) - conn = http.client.HTTPSConnection("google.serper.dev") - if contains_chinese_basic(query): - payload = json.dumps({ + +def search_tavily(query: str, num_results: int = 10) -> Optional[str]: + """ + Tavily - Search API designed specifically for RAG and LLM applications. + Returns AI-optimized snippets and supports advanced filtering. 
+ + API Docs: https://docs.tavily.com/documentation/api-reference/endpoint/search + """ + if not TAVILY_API_KEY: + return None + + query = sanitize_query(query) + if not query: + return None + + try: + # Tavily supports both Bearer token and api_key in body + # Using Bearer token as it's more standard + response = requests.post( + "https://api.tavily.com/search", + headers={ + "Authorization": f"Bearer {TAVILY_API_KEY}", + "Content-Type": "application/json", + }, + json={ + "query": query, + "max_results": min(num_results, 20), # API max is 20 + "search_depth": "basic", # Use basic (1 credit) vs advanced (2 credits) + "include_answer": False, + "include_raw_content": False, + }, + timeout=REQUEST_TIMEOUT, + ) + + # Handle error responses + if response.status_code == 401: + print("[Tavily] Invalid or expired API key") + return None + if response.status_code == 429: + print("[Tavily] Rate limited - too many requests") + return None + if response.status_code == 432: + print("[Tavily] Plan limit exceeded - upgrade required") + return None + if response.status_code == 433: + print("[Tavily] Pay-as-you-go limit exceeded") + return None + if response.status_code != 200: + error_msg = response.text[:200] if response.text else "Unknown error" + print(f"[Tavily] Error {response.status_code}: {error_msg}") + return None + + data = response.json() + api_results = data.get("results", []) + + if not api_results: + return None + + # Normalize results + results = [] + for r in api_results: + title = r.get("title") or "No title" + url = r.get("url", "") + content = r.get("content", "") + + # Truncate content for snippet + snippet = content[:300] + "..." if len(content) > 300 else content + + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": "", + }) + + return format_results(query, results, "Tavily") + + except requests.Timeout: + print("[Tavily] Request timeout") + return None + except requests.ConnectionError: + print("[Tavily] Connection error - network issue") + return None + except json.JSONDecodeError: + print("[Tavily] Invalid JSON response") + return None + except Exception as e: + print(f"[Tavily] Unexpected error: {type(e).__name__}: {e}") + return None + + +def search_serper(query: str, num_results: int = 10) -> Optional[str]: + """ + Serper - Google Search API (SERP results). + Fast and reliable Google search results. + + API Docs: https://serper.dev/ + """ + if not SERPER_KEY: + return None + + query = sanitize_query(query) + if not query: + return None + + try: + # Determine locale based on query content + if contains_chinese(query): + payload = { "q": query, - "location": "China", "gl": "cn", - "hl": "zh-cn" - }) - + "hl": "zh-cn", + "num": min(num_results, 100), + } else: - payload = json.dumps({ + payload = { "q": query, - "location": "United States", "gl": "us", - "hl": "en" - }) - headers = { - 'X-API-KEY': SERPER_KEY, - 'Content-Type': 'application/json' + "hl": "en", + "num": min(num_results, 100), } + # Use requests instead of http.client for consistency and better error handling + response = requests.post( + "https://google.serper.dev/search", + headers={ + "X-API-KEY": SERPER_KEY, + "Content-Type": "application/json", + }, + json=payload, + timeout=REQUEST_TIMEOUT, + ) - for i in range(5): - try: - conn.request("POST", "/search", payload, headers) - res = conn.getresponse() - break - except Exception as e: - print(e) - if i == 4: - return f"Google search Timeout, return None, Please try again later." 
- continue + # Handle error responses + if response.status_code == 401: + print("[Serper] Invalid API key") + return None + if response.status_code == 429: + print("[Serper] Rate limited") + return None + if response.status_code != 200: + error_msg = response.text[:200] if response.text else "Unknown error" + print(f"[Serper] Error {response.status_code}: {error_msg}") + return None + + data = response.json() + + # Check for API-level errors + if "error" in data: + print(f"[Serper] API error: {data['error']}") + return None + + organic = data.get("organic", []) + if not organic: + return None + + # Normalize results + results = [] + for page in organic: + title = page.get("title") or "No title" + url = page.get("link", "") + snippet_text = page.get("snippet", "") + date = page.get("date", "") + + # Clean up snippet + snippet = snippet_text.replace("Your browser can't play this video.", "").strip() + + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": date, + }) + + return format_results(query, results, "Serper") - data = res.read() - results = json.loads(data.decode("utf-8")) + except requests.Timeout: + print("[Serper] Request timeout") + return None + except requests.ConnectionError: + print("[Serper] Connection error - network issue") + return None + except json.JSONDecodeError: + print("[Serper] Invalid JSON response") + return None + except Exception as e: + print(f"[Serper] Unexpected error: {type(e).__name__}: {e}") + return None - try: - if "organic" not in results: - raise Exception(f"No results found for query: '{query}'. Use a less specific query.") - web_snippets = list() - idx = 0 - if "organic" in results: - for page in results["organic"]: - idx += 1 - date_published = "" - if "date" in page: - date_published = "\nDate published: " + page["date"] +def search_duckduckgo(query: str, num_results: int = 10) -> Optional[str]: + """ + DuckDuckGo - Free search with no API key required. + Rate limited but reliable as a final fallback. + """ + try: + from duckduckgo_search import DDGS + from duckduckgo_search.exceptions import RatelimitException, DuckDuckGoSearchException + except ImportError: + print("[DuckDuckGo] duckduckgo-search package not installed. Run: pip install duckduckgo-search") + return None + + query = sanitize_query(query) + if not query: + return None + + max_retries = 3 + for attempt in range(max_retries): + try: + with DDGS() as ddgs: + api_results = list(ddgs.text(query, max_results=min(num_results, 25))) + + if not api_results: + return None + + # Normalize results + results = [] + for r in api_results: + title = r.get("title") or "No title" + url = r.get("href") or r.get("link", "") + body = r.get("body", "") + + # Truncate body for snippet + snippet = body[:300] + "..." 
if len(body) > 300 else body + + results.append({ + "title": title, + "url": url, + "snippet": snippet, + "date": "", + }) + + return format_results(query, results, "DuckDuckGo") + + except RatelimitException: + if attempt < max_retries - 1: + wait_time = 2 ** attempt # Exponential backoff: 1s, 2s, 4s + print(f"[DuckDuckGo] Rate limited, waiting {wait_time}s...") + time.sleep(wait_time) + continue + print("[DuckDuckGo] Rate limited after all retries") + return None + except DuckDuckGoSearchException as e: + print(f"[DuckDuckGo] Search error: {e}") + return None + except Exception as e: + print(f"[DuckDuckGo] Unexpected error: {type(e).__name__}: {e}") + return None + + return None - source = "" - if "source" in page: - source = "\nSource: " + page["source"] - snippet = "" - if "snippet" in page: - snippet = "\n" + page["snippet"] +# ============================================================================= +# Multi-Provider Search with Fallback +# ============================================================================= - redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{snippet}" - redacted_version = redacted_version.replace("Your browser can't play this video.", "") - web_snippets.append(redacted_version) +def multi_provider_search(query: str, num_results: int = 10) -> str: + """ + Search using multiple providers with automatic fallback. + + Provider priority (by quality): + 1. Exa.ai - Best semantic search + 2. Tavily - Purpose-built for LLMs + 3. Serper - Google SERP results + 4. DuckDuckGo - Free fallback + + Returns the first successful result or an error message. + """ + # Validate query + query = sanitize_query(query) + if not query: + return "[Search] Empty query provided. Please provide a search term." + + providers = [ + ("Exa", search_exa), + ("Tavily", search_tavily), + ("Serper", search_serper), + ("DuckDuckGo", search_duckduckgo), + ] + + failed_providers = [] + + for name, search_fn in providers: + result = search_fn(query, num_results) + if result: + return result + failed_providers.append(name) + + # All providers failed + return f"No results found for '{query}'. Providers attempted: {', '.join(failed_providers)}. Try a different or simpler query." - content = f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n" + "\n\n".join(web_snippets) - return content - except: - return f"No results found for '{query}'. Try with a more general query." +# ============================================================================= +# Qwen Agent Tool Registration +# ============================================================================= +@register_tool("search", allow_overwrite=True) +class Search(BaseTool): + """Web search tool with multi-provider fallback.""" - def search_with_serp(self, query: str): - result = self.google_search_with_serp(query) - return result + name = "search" + description = "Performs batched web searches: supply an array 'query'; the tool retrieves the top 10 results for each query in one call." + parameters = { + "type": "object", + "properties": { + "query": { + "type": "array", + "items": {"type": "string"}, + "description": "Array of query strings. 
Include multiple complementary search queries in a single call.", + }, + }, + "required": ["query"], + } + + def __init__(self, cfg: Optional[dict] = None): + super().__init__(cfg) + + # Log which providers are available at initialization + available = [] + if EXA_API_KEY: + available.append("Exa") + if TAVILY_API_KEY: + available.append("Tavily") + if SERPER_KEY: + available.append("Serper") + available.append("DuckDuckGo") # Always available + + print(f"[Search] Initialized with providers: {', '.join(available)}") def call(self, params: Union[str, dict], **kwargs) -> str: - try: - query = params["query"] - except: - return "[Search] Invalid request format: Input must be a JSON object containing 'query' field" + # Handle string input (invalid) + if isinstance(params, str): + return "[Search] Invalid request: Input must be a JSON object with 'query' field, not a string." + + # Handle None or non-dict + if not isinstance(params, dict): + return "[Search] Invalid request: Input must be a JSON object with 'query' field." + + query = params.get("query") + + # Handle missing query + if query is None: + return "[Search] Missing 'query' field in request." + # Handle single string query if isinstance(query, str): - # 单个查询 - response = self.search_with_serp(query) - else: - # 多个查询 - assert isinstance(query, List) + query = query.strip() + if not query: + return "[Search] Empty query string provided." + return multi_provider_search(query) + + # Handle list of queries + if isinstance(query, list): + if not query: + return "[Search] Empty query list provided." + + # Filter out empty strings + valid_queries = [q.strip() for q in query if isinstance(q, str) and q.strip()] + + if not valid_queries: + return "[Search] No valid queries in list (all empty or non-string)." + responses = [] - for q in query: - responses.append(self.search_with_serp(q)) - response = "\n=======\n".join(responses) + for q in valid_queries: + responses.append(multi_provider_search(q)) - return response - + return "\n=======\n".join(responses) + + # Invalid query type + return f"[Search] Invalid 'query' type: expected string or array, got {type(query).__name__}." diff --git a/requirements-local.txt b/requirements-local.txt new file mode 100644 index 00000000..7800ee0e --- /dev/null +++ b/requirements-local.txt @@ -0,0 +1,12 @@ +# DeepResearch - Minimal requirements for llama.cpp local inference +# Install with: pip install -r requirements-local.txt + +# Core dependencies +requests>=2.31.0 +python-dotenv>=1.0.0 + +# Web search (FREE, no API key needed) +duckduckgo-search>=6.0.0 + +# Optional but recommended +tqdm>=4.66.0 diff --git a/scripts/start_llama_server.sh b/scripts/start_llama_server.sh new file mode 100755 index 00000000..4b42ce9b --- /dev/null +++ b/scripts/start_llama_server.sh @@ -0,0 +1,237 @@ +#!/bin/bash +# ============================================================================= +# DeepResearch Local Server - llama.cpp with Metal Acceleration +# ============================================================================= +# +# This script starts the llama.cpp server optimized for the DeepResearch +# ReAct agent workflow on Apple Silicon. +# +# The server provides: +# - OpenAI-compatible API at http://localhost:8080/v1/chat/completions +# - Built-in Web UI at http://localhost:8080 (chat interface!) 
+# - Metal (GPU) acceleration for fast inference +# - Model loaded once and kept resident in memory +# +# Usage: +# ./scripts/start_llama_server.sh # Start with defaults +# ./scripts/start_llama_server.sh --ctx 16384 # Custom context size +# ./scripts/start_llama_server.sh --no-webui # API only, no web UI +# +# Access: +# - Web UI: http://localhost:8080 +# - API: http://localhost:8080/v1/chat/completions +# - CLI: python inference/interactive_llamacpp.py +# +# ============================================================================= + +set -e + +# Configuration +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +LLAMA_SERVER="$PROJECT_DIR/llama.cpp/build/bin/llama-server" +MODEL_PATH="$PROJECT_DIR/models/gguf/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf" + +# Default settings (optimized for Apple Silicon with 32GB RAM) +PORT=${PORT:-8080} +HOST=${HOST:-127.0.0.1} +CTX_SIZE=${CTX_SIZE:-16384} # 16K context (use --ctx 32768 for longer sessions) +GPU_LAYERS=${GPU_LAYERS:-99} # Offload all layers to Metal +THREADS=${THREADS:-8} # CPU threads for non-GPU ops +PARALLEL=${PARALLEL:-1} # Parallel request slots +BATCH_SIZE=${BATCH_SIZE:-512} # Batch size for prompt processing +WEBUI=${WEBUI:-true} # Enable web UI by default +MLOCK=${MLOCK:-false} # Don't lock model in RAM (saves memory for other apps) + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +echo -e "${BLUE}" +echo "============================================================" +echo " DeepResearch Local Server (llama.cpp + Metal)" +echo "============================================================" +echo -e "${NC}" + +# Check if llama-server exists +if [ ! -f "$LLAMA_SERVER" ]; then + echo -e "${RED}Error: llama-server not found at $LLAMA_SERVER${NC}" + echo "" + echo "Please build llama.cpp first:" + echo " cd $PROJECT_DIR/llama.cpp" + echo " cmake -B build -DLLAMA_METAL=ON -DCMAKE_BUILD_TYPE=Release" + echo " cmake --build build --config Release" + exit 1 +fi + +# Check if model exists +if [ ! 
-f "$MODEL_PATH" ]; then + echo -e "${RED}Error: Model not found at $MODEL_PATH${NC}" + echo "" + echo "Please download the model first:" + echo " cd $PROJECT_DIR/models/gguf" + echo " curl -L -C - -o Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf \\" + echo " 'https://huggingface.co/bartowski/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-GGUF/resolve/main/Alibaba-NLP_Tongyi-DeepResearch-30B-A3B-Q4_K_M.gguf'" + exit 1 +fi + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --port) + PORT="$2" + shift 2 + ;; + --ctx) + CTX_SIZE="$2" + shift 2 + ;; + --threads) + THREADS="$2" + shift 2 + ;; + --parallel) + PARALLEL="$2" + shift 2 + ;; + --no-webui) + WEBUI=false + shift + ;; + --webui) + WEBUI=true + shift + ;; + --mlock) + MLOCK=true + shift + ;; + --low-memory) + # Low memory mode: smaller context, no mlock + CTX_SIZE=8192 + MLOCK=false + shift + ;; + -h|--help) + echo "Usage: $0 [options]" + echo "" + echo "Options:" + echo " --port N Port number (default: 8080)" + echo " --ctx N Context size (default: 16384)" + echo " --threads N CPU threads (default: 8)" + echo " --parallel N Parallel requests (default: 1)" + echo " --webui Enable web UI (default)" + echo " --no-webui Disable web UI, API only" + echo " --mlock Lock model in RAM (uses more memory but faster)" + echo " --low-memory Low memory mode: 8K context, no mlock" + echo " -h, --help Show this help" + echo "" + echo "Access points:" + echo " Web UI: http://127.0.0.1:PORT" + echo " API: http://127.0.0.1:PORT/v1/chat/completions" + exit 0 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +echo -e "${GREEN}Configuration:${NC}" +echo " Model: $(basename "$MODEL_PATH")" +echo " Size: $(du -h "$MODEL_PATH" | cut -f1)" +echo " Context: $CTX_SIZE tokens" +echo " GPU: Metal (all $GPU_LAYERS layers)" +echo " Threads: $THREADS" +echo " Parallel: $PARALLEL slots" +echo " Mlock: $MLOCK" +echo " Web UI: $WEBUI" +echo " Endpoint: http://$HOST:$PORT" +echo "" + +# Check for existing server on port +if lsof -i :$PORT > /dev/null 2>&1; then + echo -e "${YELLOW}Warning: Port $PORT is already in use.${NC}" + echo "Existing process:" + lsof -i :$PORT | head -2 + echo "" + read -p "Kill existing process and continue? (y/N) " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + lsof -t -i :$PORT | xargs kill -9 2>/dev/null || true + sleep 1 + else + echo "Aborting." 
+ exit 1 + fi +fi + +echo -e "${YELLOW}Starting server...${NC}" +echo "(Model loading takes ~30-60 seconds)" +echo "" + +# Build command arguments +SERVER_ARGS=( + --model "$MODEL_PATH" + --host "$HOST" + --port "$PORT" + --ctx-size "$CTX_SIZE" + --n-gpu-layers "$GPU_LAYERS" + --threads "$THREADS" + --parallel "$PARALLEL" + --batch-size "$BATCH_SIZE" + --flash-attn auto + --metrics + --log-disable + --alias deepresearch +) +# Note: --jinja is enabled by default in recent llama.cpp versions + +# Add mlock if requested (uses more memory but may be faster) +if [ "$MLOCK" = "true" ]; then + SERVER_ARGS+=(--mlock) +fi + +# Add no-webui flag if requested +if [ "$WEBUI" = "false" ]; then + SERVER_ARGS+=(--no-webui) +fi + +# Start the server with optimized settings for DeepResearch +exec "$LLAMA_SERVER" "${SERVER_ARGS[@]}" 2>&1 | while read -r line; do + # Colorize output + if [[ $line == *"error"* ]] || [[ $line == *"Error"* ]]; then + echo -e "${RED}$line${NC}" + elif [[ $line == *"listening"* ]] || [[ $line == *"ready"* ]]; then + echo -e "${GREEN}$line${NC}" + echo "" + echo -e "${GREEN}============================================================${NC}" + echo -e "${GREEN} Server ready!${NC}" + echo -e "${GREEN}============================================================${NC}" + if [ "$WEBUI" = "true" ]; then + echo "" + echo -e "${GREEN} Web UI: http://$HOST:$PORT${NC}" + echo " Open in your browser for a chat interface!" + fi + echo "" + echo -e "${GREEN} API: http://$HOST:$PORT/v1/chat/completions${NC}" + echo "" + echo "Run DeepResearch CLI:" + echo " python inference/interactive_llamacpp.py" + echo "" + echo "Test API:" + echo " curl http://$HOST:$PORT/v1/chat/completions \\" + echo " -H 'Content-Type: application/json' \\" + echo " -d '{\"model\": \"deepresearch\", \"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}]}'" + echo "" + echo "Press Ctrl+C to stop the server." + elif [[ $line == *"warning"* ]] || [[ $line == *"Warning"* ]]; then + echo -e "${YELLOW}$line${NC}" + else + echo "$line" + fi +done