From b4668a3b8e075fdd124e9edb9032dc6920a95f94 Mon Sep 17 00:00:00 2001 From: siddhart Date: Mon, 19 May 2025 14:54:12 +0530 Subject: [PATCH 1/2] Add Web Scraper functionality and update scraping configuration - Introduced a new WebScraper class for scraping documentation from websites. - Updated the scraping configuration section in app.py to allow selection between Firecrawl Scraper and Web Scraper. - Enhanced user input options based on the selected scraper type, including maximum pages and delay settings. - Modified the scraping logic to accommodate the new Web Scraper implementation. --- deepseek-rag/src/app.py | 57 ++++++++-- deepseek-rag/src/rag.py | 10 +- deepseek-rag/src/web_scraper.py | 192 ++++++++++++++++++++++++++++++++ 3 files changed, 246 insertions(+), 13 deletions(-) create mode 100644 deepseek-rag/src/web_scraper.py diff --git a/deepseek-rag/src/app.py b/deepseek-rag/src/app.py index ff3b065f..f6051aa6 100644 --- a/deepseek-rag/src/app.py +++ b/deepseek-rag/src/app.py @@ -6,6 +6,7 @@ from dotenv import load_dotenv from rag import DocumentationRAG from scraper import DocumentationScraper +from web_scraper import WebScraper load_dotenv() @@ -34,6 +35,14 @@ def get_doc_page_count(docs_dir: str) -> int: def scraping_config_section(): """Create the documentation scraping configuration section""" st.markdown("### Configure Scraping") + + # Add scraper selection + scraper_type = st.radio( + "Select Scraper", + ["Firecrawl Scraper", "Web Scraper"], + help="Choose which scraper to use for documentation extraction" + ) + base_url = st.text_input( "Documentation URL", placeholder="https://docs.firecrawl.dev", @@ -46,16 +55,35 @@ def scraping_config_section(): help="Name of the directory to store documentation", ) - n_pages = st.number_input( - "Number of Pages", - min_value=0, - value=0, - help="Limit the number of pages to scrape (0 for all pages)", - ) + # Show different options based on selected scraper + if scraper_type == "Web Scraper": + col1, col2 = st.columns(2) + with col1: + max_pages = st.number_input( + "Maximum Pages", + min_value=1, + value=10, + help="Maximum number of pages to scrape", + ) + with col2: + delay = st.number_input( + "Delay (seconds)", + min_value=0.5, + value=1.0, + step=0.5, + help="Delay between requests in seconds", + ) + else: # Firecrawl Scraper + n_pages = st.number_input( + "Number of Pages", + min_value=0, + value=0, + help="Limit the number of pages to scrape (0 for all pages)", + ) st.info( "💡 Add '-docs' suffix to the documentation name. " - "Set pages to 0 to scrape all available pages." + "Set pages to 0 to scrape all available pages (Firecrawl Scraper only)." 
) # Add scrape button @@ -67,9 +95,18 @@ def scraping_config_section(): else: with st.spinner("Scraping documentation..."): try: - scraper = DocumentationScraper() - n_pages = None if n_pages == 0 else n_pages - scraper.pull_docs(base_url, docs_name, n_pages=n_pages) + if scraper_type == "Firecrawl Scraper": + scraper = DocumentationScraper() + n_pages = None if n_pages == 0 else n_pages + scraper.pull_docs(base_url, docs_name, n_pages=n_pages) + else: # Web Scraper + scraper = WebScraper( + base_url=base_url, + output_dir=docs_name, + max_pages=max_pages, + delay=delay + ) + scraper.scrape() st.success("Documentation scraped successfully!") except Exception as e: st.error(f"Error scraping documentation: {str(e)}") diff --git a/deepseek-rag/src/rag.py b/deepseek-rag/src/rag.py index 7a2adc0e..06e72b53 100644 --- a/deepseek-rag/src/rag.py +++ b/deepseek-rag/src/rag.py @@ -10,7 +10,9 @@ def __init__(self): # Initialize embeddings and vector store self.embeddings = OllamaEmbeddings(model="nomic-embed-text") self.vector_store = Chroma( - embedding_function=self.embeddings, persist_directory="./chroma_db" + collection_name="documentation_rag", + embedding_function=self.embeddings, + persist_directory="./chroma_db" ) # Initialize LLM @@ -44,9 +46,11 @@ def load_docs_from_directory(self, docs_dir: str): def process_documents(self, docs_dir: str): """Process documents and add to vector store""" - # Clear existing documents + # Create a new Chroma instance for processing self.vector_store = Chroma( - embedding_function=self.embeddings, persist_directory="./chroma_db" + collection_name="documentation_rag", + embedding_function=self.embeddings, + persist_directory="./chroma_db" ) # Load and process new documents diff --git a/deepseek-rag/src/web_scraper.py b/deepseek-rag/src/web_scraper.py new file mode 100644 index 00000000..4bf96982 --- /dev/null +++ b/deepseek-rag/src/web_scraper.py @@ -0,0 +1,192 @@ +import os +import logging +from typing import List, Set +import requests +from bs4 import BeautifulSoup +from urllib.parse import urljoin, urlparse +import re +from pathlib import Path +import time + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +class WebScraper: + def __init__(self, base_url: str, output_dir: str = "docs", max_pages: int = 10, delay: float = 1.0): + """ + Initialize the web scraper. 
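+        Crawling is restricted to pages on the same domain as base_url;
+        each scraped page is saved as a markdown file in output_dir.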
+ + Args: + base_url (str): The base URL to start scraping from + output_dir (str): Directory to save scraped content + max_pages (int): Maximum number of pages to scrape + delay (float): Delay between requests in seconds + """ + self.base_url = base_url + self.output_dir = output_dir + self.max_pages = max_pages + self.delay = delay + self.visited_urls: Set[str] = set() + self.headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + + def is_valid_url(self, url: str) -> bool: + """Check if URL is valid and belongs to the same domain.""" + try: + result = urlparse(url) + base_domain = urlparse(self.base_url).netloc + return ( + result.netloc == base_domain and + result.scheme in ('http', 'https') and + not url.endswith(('.pdf', '.zip', '.png', '.jpg', '.jpeg', '.gif')) + ) + except Exception: + return False + + def get_page_content(self, url: str) -> str: + """Fetch and parse page content.""" + try: + response = requests.get(url, headers=self.headers, timeout=10) + response.raise_for_status() + return response.text + except Exception as e: + logger.error(f"Error fetching {url}: {str(e)}") + return "" + + def extract_links(self, html_content: str, current_url: str) -> List[str]: + """Extract valid links from HTML content.""" + soup = BeautifulSoup(html_content, 'html.parser') + links = [] + + for a_tag in soup.find_all('a', href=True): + href = a_tag['href'] + full_url = urljoin(current_url, href) + if self.is_valid_url(full_url): + links.append(full_url) + + return list(set(links)) # Remove duplicates + + def extract_main_content(self, html_content: str) -> str: + """Extract main content from HTML and convert to markdown.""" + soup = BeautifulSoup(html_content, 'html.parser') + + # Remove unwanted elements + for element in soup.find_all(['script', 'style', 'nav', 'footer', 'header']): + element.decompose() + + # Try to find main content + main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile(r'content|main|article')) + + if not main_content: + main_content = soup.body + + if not main_content: + return "" + + # Convert to markdown-like format + content = [] + + # Process headings + for heading in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']): + level = int(heading.name[1]) + content.append(f"{'#' * level} {heading.get_text().strip()}\n") + + # Process paragraphs + for p in main_content.find_all('p'): + content.append(f"{p.get_text().strip()}\n") + + # Process code blocks + for code in main_content.find_all('pre'): + content.append(f"```\n{code.get_text().strip()}\n```\n") + + # Process lists + for ul in main_content.find_all(['ul', 'ol']): + for li in ul.find_all('li', recursive=False): + content.append(f"- {li.get_text().strip()}\n") + + return "\n".join(content) + + def save_content(self, url: str, content: str) -> None: + """Save content to a markdown file.""" + if not content.strip(): + return + + # Create filename from URL + parsed_url = urlparse(url) + path_parts = parsed_url.path.strip('/').split('/') + filename = '_'.join(path_parts) or 'index' + filename = f"{filename}.md" + + # Save file + filepath = os.path.join(self.output_dir, filename) + try: + with open(filepath, 'w', encoding='utf-8') as f: + f.write(f"# Content from {url}\n\n") + f.write(content) + logger.info(f"Saved content to {filepath}") + except Exception as e: + 
logger.error(f"Error saving content to {filepath}: {str(e)}") + + def scrape(self) -> None: + """Main scraping method.""" + urls_to_visit = [self.base_url] + pages_scraped = 0 + + while urls_to_visit and pages_scraped < self.max_pages: + current_url = urls_to_visit.pop(0) + + if current_url in self.visited_urls: + continue + + logger.info(f"Scraping {current_url}") + self.visited_urls.add(current_url) + + # Get and process page content + html_content = self.get_page_content(current_url) + if not html_content: + continue + + # Extract and save main content + main_content = self.extract_main_content(html_content) + self.save_content(current_url, main_content) + + # Extract new links + new_links = self.extract_links(html_content, current_url) + urls_to_visit.extend([link for link in new_links if link not in self.visited_urls]) + + pages_scraped += 1 + time.sleep(self.delay) # Be nice to servers + + logger.info(f"Scraping completed. Scraped {pages_scraped} pages.") + +def main(): + """Example usage of the WebScraper.""" + import argparse + + parser = argparse.ArgumentParser(description='Web scraper for documentation pages') + parser.add_argument('url', help='Base URL to start scraping from') + parser.add_argument('--output', '-o', default='docs', help='Output directory for scraped content') + parser.add_argument('--max-pages', '-m', type=int, default=10, help='Maximum number of pages to scrape') + parser.add_argument('--delay', '-d', type=float, default=1.0, help='Delay between requests in seconds') + + args = parser.parse_args() + + scraper = WebScraper( + base_url=args.url, + output_dir=args.output, + max_pages=args.max_pages, + delay=args.delay + ) + + scraper.scrape() + +if __name__ == "__main__": + main() \ No newline at end of file From 05f12cc182a0e0bb6d0f3844913ab51759354371 Mon Sep 17 00:00:00 2001 From: siddhart Date: Mon, 19 May 2025 14:54:35 +0530 Subject: [PATCH 2/2] Revise README for DeepSeek RAG Documentation Assistant --- deepseek-rag/README.md | 203 +++++++++++++++++++++++----------- deepseek-rag/requirements.txt | 10 ++ 2 files changed, 150 insertions(+), 63 deletions(-) diff --git a/deepseek-rag/README.md b/deepseek-rag/README.md index 93504158..16ad0050 100644 --- a/deepseek-rag/README.md +++ b/deepseek-rag/README.md @@ -1,102 +1,179 @@ -# Documentation RAG System - -A system for scraping, processing, and managing documentation for RAG (Retrieval-Augmented Generation) applications using DeepSeek R1, Ollama, Streamlit and Firecrawl. - -## Table of Contents - -- [Features](#features) -- [Prerequisites](#prerequisites) -- [Installation](#installation) -- [Usage](#usage) - -## Features - -- **Documentation Scraping**: Automatically scrapes documentation from specified URLs -- **Flexible Storage**: Stores documentation in organized directory structures -- **Streamlit Interface**: User-friendly web interface for managing documentation -- **Configurable Scraping**: Control the number of pages scraped and documentation naming +# DeepSeek RAG Documentation Assistant + +Transform any documentation into an intelligent chatbot! This Streamlit-based RAG (Retrieval-Augmented Generation) application allows you to convert any documentation website or local documents into an interactive AI assistant. Powered by DeepSeek and Ollama, it provides intelligent, context-aware responses to your documentation queries. 
+ +## Key Features + +- **Documentation to Chatbot**: Convert any documentation into an interactive AI assistant +- **Multiple Scraping Options**: + - **Firecrawl Scraper**: Specialized scraper for structured documentation sites + - **Web Scraper**: Flexible scraper for general websites with configurable crawling +- **Smart Document Processing**: + - Automatic content extraction and structuring + - Intelligent chunking for optimal context retrieval + - Vector-based semantic search +- **Interactive Chat Interface**: + - Natural language querying + - Chain-of-thought reasoning + - Context-aware responses +- **Documentation Management**: + - Support for multiple documentation sets + - Easy switching between different docs + - Automatic content indexing ## Prerequisites -Before setting up the project, ensure you have: - -- Python 3.x installed -- A Firecrawl account (sign up at and get your API key) -- Environment variables properly configured (see `.env.example` file) -- Ollama installed and running ([download instructions](https://ollama.com/download/mac)): +- Python 3.10 or higher +- [Ollama](https://ollama.ai/) installed and running locally +- Required Ollama models: + - `deepseek-r1:14b` (for chat) + - `nomic-embed-text` (for embeddings) -1. Install Ollama +## Installation +1. Clone the repository: ```bash -brew install ollama +git clone +cd deepseek-rag ``` -2. Start the Ollama server - +2. Create and activate a virtual environment (recommended): ```bash -ollama serve +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate ``` -3. Pull the required model used in the repository +3. Install the required dependencies: +```bash +pip install -r requirements.txt +``` +4. Pull required Ollama models: ```bash -# 14 billion parameters -ollama pull deepseek-r1:14b +ollama pull deepseek-r1:14b +ollama pull nomic-embed-text ``` -If you have strong enough hardware, try bigger versions of the DeepSeek R1 model as model size significantly affects performance. +## Dependencies -## Installation +The application uses the following key dependencies: +- `streamlit`: For the web interface +- `langchain-core>=0.3.34`: Core LangChain functionality +- `langchain-ollama>=0.0.3`: Ollama integration +- `langchain-chroma>=0.1.4`: Vector store integration +- `chromadb>=0.4.22`: Vector database +- `langchain-community`: Document loaders and utilities +- `langchain-text-splitters`: Text chunking utilities +- `beautifulsoup4`: For web scraping +- `requests`: For HTTP requests -1. Clone the repository: +## Usage + +### 1. Convert Documentation to Chatbot +#### Using the Web Interface + +1. Start the application: ```bash -git clone https://github.com/BexTuychiev/local-documentation-rag.git -cd local-documentation-rag +streamlit run src/app.py ``` -2. Install required dependencies: +2. Open your browser at http://localhost:8501 + +3. In the sidebar: + - Choose your preferred scraper (Firecrawl or Web Scraper) + - Enter the documentation URL + - Set a name for your documentation (must end with '-docs') + - Configure scraper-specific settings + - Click "Start Scraping" + +4. Once scraping is complete: + - Select your documentation from the dropdown + - Start chatting with your AI assistant! + +#### Using the Command Line + +For advanced users, you can use the web scraper directly: ```bash -pip install -r requirements.txt +python src/web_scraper.py https://docs.example.com --output docs --max-pages 20 --delay 1.5 ``` -3. 
Create a `.env` file in the project root and configure required environment variables. +### 2. Interacting with Your Documentation Bot -## Usage +- **Ask Questions**: Type natural language questions about the documentation +- **View Reasoning**: Click "View reasoning" to see how the AI arrived at its answer +- **Switch Context**: Use the sidebar to switch between different documentation sets +- **Real-time Processing**: The bot processes and indexes new documentation automatically -1. Start the Streamlit application: +## Supported Documentation Types + +- **Web Documentation**: + - Documentation websites + - Technical blogs + - API documentation + - Product manuals +- **Local Documentation**: + - Markdown files + - Text documents + - Documentation directories + +## Project Structure -```bash -streamlit run src/app.py ``` +deepseek-rag/ +├── src/ +│ ├── app.py # Streamlit application +│ ├── rag.py # RAG implementation +│ ├── scraper.py # Firecrawl scraper +│ └── web_scraper.py # Web scraping utility +├── requirements.txt # Project dependencies +└── README.md # This file +``` + +## Troubleshooting + +1. **Model Not Found Errors**: + - Ensure Ollama is running (`ollama serve`) + - Verify required models are pulled (`ollama list`) + - Pull missing models using `ollama pull ` + +2. **ChromaDB Issues**: + - If you encounter database errors, try removing the `./chroma_db` directory and restarting the application + - Ensure you have sufficient disk space for the vector store -2. Through the web interface, you can: - - Configure documentation scraping - - Specify base URLs for documentation - - Control the number of pages to scrape +3. **Performance Issues**: + - The application requires significant RAM for the language models + - Consider using a machine with at least 16GB RAM + - Adjust chunk sizes in `rag.py` if needed -### Documentation Naming Conventions +4. **Web Scraping Issues**: + - If scraping fails, check your internet connection + - Some websites may block automated scraping + - Adjust the delay parameter if you're getting rate-limited + - Ensure you have permission to scrape the target website -- Documentation directories must end with `-docs` suffix -- Names should be descriptive and relate to the documentation being scraped -- Example: `Firecrawl-docs`, `API-docs` +## Best Practices -## Features in Detail +1. **Documentation Preparation**: + - Use clear, well-structured documentation + - Ensure proper HTML formatting for web docs + - Include relevant metadata and titles -### Documentation Scraping +2. **Scraping Configuration**: + - Start with a small number of pages for testing + - Adjust delay settings based on the target website + - Use appropriate scraper for your documentation type -The system provides a robust scraping mechanism that: +3. **Chat Interaction**: + - Ask specific, clear questions + - Use natural language + - Check the reasoning for complex queries -- Handles various documentation formats -- Maintains proper directory structure -- Supports pagination and depth control -- Provides progress feedback +## Contributing -### RAG Integration +Contributions are welcome! Please feel free to submit a Pull Request. 
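+
+## Programmatic Usage
+
+Besides the Streamlit sidebar and the command line, the `WebScraper` class in `src/web_scraper.py` can be driven directly from Python. A minimal sketch (the URL and output directory below are placeholders, and the import assumes the interpreter is started from the `src/` directory):
+
+```python
+from web_scraper import WebScraper
+
+# Crawl up to 20 same-domain pages, writing one markdown file per page
+# into ./example-docs and pausing 1.5 seconds between requests.
+scraper = WebScraper(
+    base_url="https://docs.example.com",  # placeholder URL
+    output_dir="example-docs",            # placeholder output directory
+    max_pages=20,
+    delay=1.5,
+)
+scraper.scrape()
+```
+
+If the output directory follows the `-docs` naming convention, the scraped files should then be selectable from the app's sidebar like any other documentation set.
+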
-The system is designed to work with RAG applications by: +## License -- Converting documentation to appropriate formats -- Maintaining proper data structure for retrieval -- Supporting various documentation sources +[Add your license information here] diff --git a/deepseek-rag/requirements.txt b/deepseek-rag/requirements.txt index 7a35a9ee..89ada6fc 100644 --- a/deepseek-rag/requirements.txt +++ b/deepseek-rag/requirements.txt @@ -170,3 +170,13 @@ wrapt==1.17.2 ; python_version >= "3.10" and python_version < "4.0" yarl==1.18.3 ; python_version >= "3.10" and python_version < "4.0" zipp==3.21.0 ; python_version >= "3.10" and python_version < "4" zstandard==0.23.0 ; python_version >= "3.10" and python_version < "4.0" +streamlit>=1.32.0 +langchain-core>=0.3.34 +langchain-ollama>=0.0.3 +langchain-chroma>=0.1.4 +chromadb>=0.4.22 +langchain-community +langchain-text-splitters +beautifulsoup4>=4.12.0 +requests>=2.31.0 +urllib3>=2.0.0