From b4668a3b8e075fdd124e9edb9032dc6920a95f94 Mon Sep 17 00:00:00 2001 From: siddhart Date: Mon, 19 May 2025 14:54:12 +0530 Subject: [PATCH 1/2] Add Web Scraper functionality and update scraping configuration - Introduced a new WebScraper class for scraping documentation from websites. - Updated the scraping configuration section in app.py to allow selection between Firecrawl Scraper and Web Scraper. - Enhanced user input options based on the selected scraper type, including maximum pages and delay settings. - Modified the scraping logic to accommodate the new Web Scraper implementation. --- deepseek-rag/src/app.py | 57 ++++++++-- deepseek-rag/src/rag.py | 10 +- deepseek-rag/src/web_scraper.py | 192 ++++++++++++++++++++++++++++++++ 3 files changed, 246 insertions(+), 13 deletions(-) create mode 100644 deepseek-rag/src/web_scraper.py diff --git a/deepseek-rag/src/app.py b/deepseek-rag/src/app.py index ff3b065f..f6051aa6 100644 --- a/deepseek-rag/src/app.py +++ b/deepseek-rag/src/app.py @@ -6,6 +6,7 @@ from dotenv import load_dotenv from rag import DocumentationRAG from scraper import DocumentationScraper +from web_scraper import WebScraper load_dotenv() @@ -34,6 +35,14 @@ def get_doc_page_count(docs_dir: str) -> int: def scraping_config_section(): """Create the documentation scraping configuration section""" st.markdown("### Configure Scraping") + + # Add scraper selection + scraper_type = st.radio( + "Select Scraper", + ["Firecrawl Scraper", "Web Scraper"], + help="Choose which scraper to use for documentation extraction" + ) + base_url = st.text_input( "Documentation URL", placeholder="https://docs.firecrawl.dev", @@ -46,16 +55,35 @@ def scraping_config_section(): help="Name of the directory to store documentation", ) - n_pages = st.number_input( - "Number of Pages", - min_value=0, - value=0, - help="Limit the number of pages to scrape (0 for all pages)", - ) + # Show different options based on selected scraper + if scraper_type == "Web Scraper": + col1, col2 = st.columns(2) + with col1: + max_pages = st.number_input( + "Maximum Pages", + min_value=1, + value=10, + help="Maximum number of pages to scrape", + ) + with col2: + delay = st.number_input( + "Delay (seconds)", + min_value=0.5, + value=1.0, + step=0.5, + help="Delay between requests in seconds", + ) + else: # Firecrawl Scraper + n_pages = st.number_input( + "Number of Pages", + min_value=0, + value=0, + help="Limit the number of pages to scrape (0 for all pages)", + ) st.info( "💡 Add '-docs' suffix to the documentation name. " - "Set pages to 0 to scrape all available pages." + "Set pages to 0 to scrape all available pages (Firecrawl Scraper only)." 
) # Add scrape button @@ -67,9 +95,18 @@ def scraping_config_section(): else: with st.spinner("Scraping documentation..."): try: - scraper = DocumentationScraper() - n_pages = None if n_pages == 0 else n_pages - scraper.pull_docs(base_url, docs_name, n_pages=n_pages) + if scraper_type == "Firecrawl Scraper": + scraper = DocumentationScraper() + n_pages = None if n_pages == 0 else n_pages + scraper.pull_docs(base_url, docs_name, n_pages=n_pages) + else: # Web Scraper + scraper = WebScraper( + base_url=base_url, + output_dir=docs_name, + max_pages=max_pages, + delay=delay + ) + scraper.scrape() st.success("Documentation scraped successfully!") except Exception as e: st.error(f"Error scraping documentation: {str(e)}") diff --git a/deepseek-rag/src/rag.py b/deepseek-rag/src/rag.py index 7a2adc0e..06e72b53 100644 --- a/deepseek-rag/src/rag.py +++ b/deepseek-rag/src/rag.py @@ -10,7 +10,9 @@ def __init__(self): # Initialize embeddings and vector store self.embeddings = OllamaEmbeddings(model="nomic-embed-text") self.vector_store = Chroma( - embedding_function=self.embeddings, persist_directory="./chroma_db" + collection_name="documentation_rag", + embedding_function=self.embeddings, + persist_directory="./chroma_db" ) # Initialize LLM @@ -44,9 +46,11 @@ def load_docs_from_directory(self, docs_dir: str): def process_documents(self, docs_dir: str): """Process documents and add to vector store""" - # Clear existing documents + # Create a new Chroma instance for processing self.vector_store = Chroma( - embedding_function=self.embeddings, persist_directory="./chroma_db" + collection_name="documentation_rag", + embedding_function=self.embeddings, + persist_directory="./chroma_db" ) # Load and process new documents diff --git a/deepseek-rag/src/web_scraper.py b/deepseek-rag/src/web_scraper.py new file mode 100644 index 00000000..4bf96982 --- /dev/null +++ b/deepseek-rag/src/web_scraper.py @@ -0,0 +1,192 @@ +import os +import logging +from typing import List, Set +import requests +from bs4 import BeautifulSoup +from urllib.parse import urljoin, urlparse +import re +from pathlib import Path +import time + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +class WebScraper: + def __init__(self, base_url: str, output_dir: str = "docs", max_pages: int = 10, delay: float = 1.0): + """ + Initialize the web scraper. 
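+        Crawling is restricted to pages on the same domain as base_url;
+        each scraped page is saved as a markdown file in output_dir.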
+ + Args: + base_url (str): The base URL to start scraping from + output_dir (str): Directory to save scraped content + max_pages (int): Maximum number of pages to scrape + delay (float): Delay between requests in seconds + """ + self.base_url = base_url + self.output_dir = output_dir + self.max_pages = max_pages + self.delay = delay + self.visited_urls: Set[str] = set() + self.headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + + def is_valid_url(self, url: str) -> bool: + """Check if URL is valid and belongs to the same domain.""" + try: + result = urlparse(url) + base_domain = urlparse(self.base_url).netloc + return ( + result.netloc == base_domain and + result.scheme in ('http', 'https') and + not url.endswith(('.pdf', '.zip', '.png', '.jpg', '.jpeg', '.gif')) + ) + except Exception: + return False + + def get_page_content(self, url: str) -> str: + """Fetch and parse page content.""" + try: + response = requests.get(url, headers=self.headers, timeout=10) + response.raise_for_status() + return response.text + except Exception as e: + logger.error(f"Error fetching {url}: {str(e)}") + return "" + + def extract_links(self, html_content: str, current_url: str) -> List[str]: + """Extract valid links from HTML content.""" + soup = BeautifulSoup(html_content, 'html.parser') + links = [] + + for a_tag in soup.find_all('a', href=True): + href = a_tag['href'] + full_url = urljoin(current_url, href) + if self.is_valid_url(full_url): + links.append(full_url) + + return list(set(links)) # Remove duplicates + + def extract_main_content(self, html_content: str) -> str: + """Extract main content from HTML and convert to markdown.""" + soup = BeautifulSoup(html_content, 'html.parser') + + # Remove unwanted elements + for element in soup.find_all(['script', 'style', 'nav', 'footer', 'header']): + element.decompose() + + # Try to find main content + main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile(r'content|main|article')) + + if not main_content: + main_content = soup.body + + if not main_content: + return "" + + # Convert to markdown-like format + content = [] + + # Process headings + for heading in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']): + level = int(heading.name[1]) + content.append(f"{'#' * level} {heading.get_text().strip()}\n") + + # Process paragraphs + for p in main_content.find_all('p'): + content.append(f"{p.get_text().strip()}\n") + + # Process code blocks + for code in main_content.find_all('pre'): + content.append(f"```\n{code.get_text().strip()}\n```\n") + + # Process lists + for ul in main_content.find_all(['ul', 'ol']): + for li in ul.find_all('li', recursive=False): + content.append(f"- {li.get_text().strip()}\n") + + return "\n".join(content) + + def save_content(self, url: str, content: str) -> None: + """Save content to a markdown file.""" + if not content.strip(): + return + + # Create filename from URL + parsed_url = urlparse(url) + path_parts = parsed_url.path.strip('/').split('/') + filename = '_'.join(path_parts) or 'index' + filename = f"{filename}.md" + + # Save file + filepath = os.path.join(self.output_dir, filename) + try: + with open(filepath, 'w', encoding='utf-8') as f: + f.write(f"# Content from {url}\n\n") + f.write(content) + logger.info(f"Saved content to {filepath}") + except Exception as e: + 
logger.error(f"Error saving content to {filepath}: {str(e)}") + + def scrape(self) -> None: + """Main scraping method.""" + urls_to_visit = [self.base_url] + pages_scraped = 0 + + while urls_to_visit and pages_scraped < self.max_pages: + current_url = urls_to_visit.pop(0) + + if current_url in self.visited_urls: + continue + + logger.info(f"Scraping {current_url}") + self.visited_urls.add(current_url) + + # Get and process page content + html_content = self.get_page_content(current_url) + if not html_content: + continue + + # Extract and save main content + main_content = self.extract_main_content(html_content) + self.save_content(current_url, main_content) + + # Extract new links + new_links = self.extract_links(html_content, current_url) + urls_to_visit.extend([link for link in new_links if link not in self.visited_urls]) + + pages_scraped += 1 + time.sleep(self.delay) # Be nice to servers + + logger.info(f"Scraping completed. Scraped {pages_scraped} pages.") + +def main(): + """Example usage of the WebScraper.""" + import argparse + + parser = argparse.ArgumentParser(description='Web scraper for documentation pages') + parser.add_argument('url', help='Base URL to start scraping from') + parser.add_argument('--output', '-o', default='docs', help='Output directory for scraped content') + parser.add_argument('--max-pages', '-m', type=int, default=10, help='Maximum number of pages to scrape') + parser.add_argument('--delay', '-d', type=float, default=1.0, help='Delay between requests in seconds') + + args = parser.parse_args() + + scraper = WebScraper( + base_url=args.url, + output_dir=args.output, + max_pages=args.max_pages, + delay=args.delay + ) + + scraper.scrape() + +if __name__ == "__main__": + main() \ No newline at end of file From 05f12cc182a0e0bb6d0f3844913ab51759354371 Mon Sep 17 00:00:00 2001 From: siddhart Date: Mon, 19 May 2025 14:54:35 +0530 Subject: [PATCH 2/2] Revise README for DeepSeek RAG Documentation Assistant --- deepseek-rag/README.md | 203 +++++++++++++++++++++++----------- deepseek-rag/requirements.txt | 10 ++ 2 files changed, 150 insertions(+), 63 deletions(-) diff --git a/deepseek-rag/README.md b/deepseek-rag/README.md index 93504158..16ad0050 100644 --- a/deepseek-rag/README.md +++ b/deepseek-rag/README.md @@ -1,102 +1,179 @@ -# Documentation RAG System - -A system for scraping, processing, and managing documentation for RAG (Retrieval-Augmented Generation) applications using DeepSeek R1, Ollama, Streamlit and Firecrawl. - -## Table of Contents - -- [Features](#features) -- [Prerequisites](#prerequisites) -- [Installation](#installation) -- [Usage](#usage) - -## Features - -- **Documentation Scraping**: Automatically scrapes documentation from specified URLs -- **Flexible Storage**: Stores documentation in organized directory structures -- **Streamlit Interface**: User-friendly web interface for managing documentation -- **Configurable Scraping**: Control the number of pages scraped and documentation naming +# DeepSeek RAG Documentation Assistant + +Transform any documentation into an intelligent chatbot! This Streamlit-based RAG (Retrieval-Augmented Generation) application allows you to convert any documentation website or local documents into an interactive AI assistant. Powered by DeepSeek and Ollama, it provides intelligent, context-aware responses to your documentation queries. 
+ +## Key Features + +- **Documentation to Chatbot**: Convert any documentation into an interactive AI assistant +- **Multiple Scraping Options**: + - **Firecrawl Scraper**: Specialized scraper for structured documentation sites + - **Web Scraper**: Flexible scraper for general websites with configurable crawling +- **Smart Document Processing**: + - Automatic content extraction and structuring + - Intelligent chunking for optimal context retrieval + - Vector-based semantic search +- **Interactive Chat Interface**: + - Natural language querying + - Chain-of-thought reasoning + - Context-aware responses +- **Documentation Management**: + - Support for multiple documentation sets + - Easy switching between different docs + - Automatic content indexing ## Prerequisites -Before setting up the project, ensure you have: - -- Python 3.x installed -- A Firecrawl account (sign up at and get your API key) -- Environment variables properly configured (see `.env.example` file) -- Ollama installed and running ([download instructions](https://ollama.com/download/mac)): +- Python 3.10 or higher +- [Ollama](https://ollama.ai/) installed and running locally +- Required Ollama models: + - `deepseek-r1:14b` (for chat) + - `nomic-embed-text` (for embeddings) -1. Install Ollama +## Installation +1. Clone the repository: ```bash -brew install ollama +git clone +cd deepseek-rag ``` -2. Start the Ollama server - +2. Create and activate a virtual environment (recommended): ```bash -ollama serve +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate ``` -3. Pull the required model used in the repository +3. Install the required dependencies: +```bash +pip install -r requirements.txt +``` +4. Pull required Ollama models: ```bash -# 14 billion parameters -ollama pull deepseek-r1:14b +ollama pull deepseek-r1:14b +ollama pull nomic-embed-text ``` -If you have strong enough hardware, try bigger versions of the DeepSeek R1 model as model size significantly affects performance. +## Dependencies -## Installation +The application uses the following key dependencies: +- `streamlit`: For the web interface +- `langchain-core>=0.3.34`: Core LangChain functionality +- `langchain-ollama>=0.0.3`: Ollama integration +- `langchain-chroma>=0.1.4`: Vector store integration +- `chromadb>=0.4.22`: Vector database +- `langchain-community`: Document loaders and utilities +- `langchain-text-splitters`: Text chunking utilities +- `beautifulsoup4`: For web scraping +- `requests`: For HTTP requests -1. Clone the repository: +## Usage + +### 1. Convert Documentation to Chatbot +#### Using the Web Interface + +1. Start the application: ```bash -git clone https://github.com/BexTuychiev/local-documentation-rag.git -cd local-documentation-rag +streamlit run src/app.py ``` -2. Install required dependencies: +2. Open your browser at http://localhost:8501 + +3. In the sidebar: + - Choose your preferred scraper (Firecrawl or Web Scraper) + - Enter the documentation URL + - Set a name for your documentation (must end with '-docs') + - Configure scraper-specific settings + - Click "Start Scraping" + +4. Once scraping is complete: + - Select your documentation from the dropdown + - Start chatting with your AI assistant! + +#### Using the Command Line + +For advanced users, you can use the web scraper directly: ```bash -pip install -r requirements.txt +python src/web_scraper.py https://docs.example.com --output docs --max-pages 20 --delay 1.5 ``` -3. 
Create a `.env` file in the project root and configure required environment variables. +### 2. Interacting with Your Documentation Bot -## Usage +- **Ask Questions**: Type natural language questions about the documentation +- **View Reasoning**: Click "View reasoning" to see how the AI arrived at its answer +- **Switch Context**: Use the sidebar to switch between different documentation sets +- **Real-time Processing**: The bot processes and indexes new documentation automatically -1. Start the Streamlit application: +## Supported Documentation Types + +- **Web Documentation**: + - Documentation websites + - Technical blogs + - API documentation + - Product manuals +- **Local Documentation**: + - Markdown files + - Text documents + - Documentation directories + +## Project Structure -```bash -streamlit run src/app.py ``` +deepseek-rag/ +├── src/ +│ ├── app.py # Streamlit application +│ ├── rag.py # RAG implementation +│ ├── scraper.py # Firecrawl scraper +│ └── web_scraper.py # Web scraping utility +├── requirements.txt # Project dependencies +└── README.md # This file +``` + +## Troubleshooting + +1. **Model Not Found Errors**: + - Ensure Ollama is running (`ollama serve`) + - Verify required models are pulled (`ollama list`) + - Pull missing models using `ollama pull ` + +2. **ChromaDB Issues**: + - If you encounter database errors, try removing the `./chroma_db` directory and restarting the application + - Ensure you have sufficient disk space for the vector store -2. Through the web interface, you can: - - Configure documentation scraping - - Specify base URLs for documentation - - Control the number of pages to scrape +3. **Performance Issues**: + - The application requires significant RAM for the language models + - Consider using a machine with at least 16GB RAM + - Adjust chunk sizes in `rag.py` if needed -### Documentation Naming Conventions +4. **Web Scraping Issues**: + - If scraping fails, check your internet connection + - Some websites may block automated scraping + - Adjust the delay parameter if you're getting rate-limited + - Ensure you have permission to scrape the target website -- Documentation directories must end with `-docs` suffix -- Names should be descriptive and relate to the documentation being scraped -- Example: `Firecrawl-docs`, `API-docs` +## Best Practices -## Features in Detail +1. **Documentation Preparation**: + - Use clear, well-structured documentation + - Ensure proper HTML formatting for web docs + - Include relevant metadata and titles -### Documentation Scraping +2. **Scraping Configuration**: + - Start with a small number of pages for testing + - Adjust delay settings based on the target website + - Use appropriate scraper for your documentation type -The system provides a robust scraping mechanism that: +3. **Chat Interaction**: + - Ask specific, clear questions + - Use natural language + - Check the reasoning for complex queries -- Handles various documentation formats -- Maintains proper directory structure -- Supports pagination and depth control -- Provides progress feedback +## Contributing -### RAG Integration +Contributions are welcome! Please feel free to submit a Pull Request. 
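+
+## Programmatic Usage
+
+Besides the Streamlit sidebar and the command line, the `WebScraper` class in `src/web_scraper.py` can be driven directly from Python. A minimal sketch (the URL and output directory below are placeholders, and the import assumes the interpreter is started from the `src/` directory):
+
+```python
+from web_scraper import WebScraper
+
+# Crawl up to 20 same-domain pages, writing one markdown file per page
+# into ./example-docs and pausing 1.5 seconds between requests.
+scraper = WebScraper(
+    base_url="https://docs.example.com",  # placeholder URL
+    output_dir="example-docs",            # placeholder output directory
+    max_pages=20,
+    delay=1.5,
+)
+scraper.scrape()
+```
+
+If the output directory follows the `-docs` naming convention, the scraped files should then be selectable from the app's sidebar like any other documentation set.
+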
-The system is designed to work with RAG applications by: +## License -- Converting documentation to appropriate formats -- Maintaining proper data structure for retrieval -- Supporting various documentation sources +[Add your license information here] diff --git a/deepseek-rag/requirements.txt b/deepseek-rag/requirements.txt index 7a35a9ee..89ada6fc 100644 --- a/deepseek-rag/requirements.txt +++ b/deepseek-rag/requirements.txt @@ -170,3 +170,13 @@ wrapt==1.17.2 ; python_version >= "3.10" and python_version < "4.0" yarl==1.18.3 ; python_version >= "3.10" and python_version < "4.0" zipp==3.21.0 ; python_version >= "3.10" and python_version < "4" zstandard==0.23.0 ; python_version >= "3.10" and python_version < "4.0" +streamlit>=1.32.0 +langchain-core>=0.3.34 +langchain-ollama>=0.0.3 +langchain-chroma>=0.1.4 +chromadb>=0.4.22 +langchain-community +langchain-text-splitters +beautifulsoup4>=4.12.0 +requests>=2.31.0 +urllib3>=2.0.0