From 73714606835287dfda4556002998a5b20c156e51 Mon Sep 17 00:00:00 2001
From: Muhammad Talha Imran <92talhaimran@gmail.com>
Date: Mon, 14 Jul 2025 10:12:55 +0200
Subject: [PATCH] Add Reddit search and integrate PRAW

---
 Agents/Crawler/README.md                   |  33 ++-
 Agents/Crawler/ai_lead_generation_agent.py | 202 ++++++++++-----
 Agents/Crawler/requirements.txt            |  15 +-
 3 files changed, 129 insertions(+), 121 deletions(-)

diff --git a/Agents/Crawler/README.md b/Agents/Crawler/README.md
index 8916d11..55a61eb 100644
--- a/Agents/Crawler/README.md
+++ b/Agents/Crawler/README.md
@@ -1,13 +1,14 @@
-## 🎯 AI Lead Generation Agent - Powered by Firecrawl's Extract Endpoint
+## 🎯 AI Lead Generation Agent
 
-The AI Lead Generation Agent automates the process of finding and qualifying potential leads from Quora. It uses Firecrawl's search and the new Extract endpoint to identify relevant user profiles, extract valuable information, and organize it into a structured format in Google Sheets. This agent helps sales and marketing teams efficiently build targeted lead lists while saving hours of manual research.
+This Streamlit application searches Quora through DuckDuckGo and Reddit through the official Reddit API. It downloads each page with plain HTTP requests, extracts user interactions with a local Mistral model, and saves the results to an Excel file.
 
 ### Features
-- **Targeted Search**: Uses Firecrawl's search endpoint to find relevant Quora URLs based on your search criteria
-- **Intelligent Extraction**: Leverages Firecrawl's new Extract endpoint to pull user information from Quora profiles
-- **Automated Processing**: Formats extracted user information into a clean, structured format
-- **Google Sheets Integration**: Automatically creates and populates Google Sheets with lead information
-- **Customizable Criteria**: Allows you to define specific search parameters to find your ideal leads for your niche
+- Searches Quora links via DuckDuckGo
+- Queries Reddit posts through the Reddit API
+- Fetches pages directly via HTTP requests
+- Uses a local `mistral:7b-instruct` model for extraction
+- Outputs the collected data to an Excel spreadsheet
+- Lets you choose how many links to process
 
 ### Setup
 
@@ -17,4 +18,20 @@ The AI Lead Generation Agent automates the process of finding and qualifying pot
    pip install -r requirements.txt
    ```
 
-2. When running the Streamlit app you'll be prompted for a **HuggingFace Repo ID**. Provide the repo for the chat model you would like to use (e.g. `meta-llama/Llama-3-8B-Instruct`).
+2. Run the Streamlit app:
+
+   ```bash
+   streamlit run ai_lead_generation_agent.py
+   ```
+
+When prompted, provide the Excel filename and describe the leads you are looking for.
+
+The Reddit API requires credentials set in the `REDDIT_CLIENT_ID`, `REDDIT_CLIENT_SECRET`, and `REDDIT_USER_AGENT` environment variables.
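+
+For example, you can export placeholder credentials before launching the app (the values below are illustrative; replace them with your own):
+
+```bash
+export REDDIT_CLIENT_ID="your-client-id"
+export REDDIT_CLIENT_SECRET="your-client-secret"
+export REDDIT_USER_AGENT="lead-gen-app"
+```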
diff --git a/Agents/Crawler/ai_lead_generation_agent.py b/Agents/Crawler/ai_lead_generation_agent.py
index df232e1..c269e2c 100644
--- a/Agents/Crawler/ai_lead_generation_agent.py
+++ b/Agents/Crawler/ai_lead_generation_agent.py
@@ -1,64 +1,90 @@
 import streamlit as st
-from agno.agent import Agent
-from duckduckgo_search import ddg
-from langchain_community.chat_models.huggingface import ChatHuggingFaceHub
-from langchain_community.document_loaders import PlaywrightURLLoader
+from duckduckgo_search import DDGS
+from langchain_community.chat_models import ChatOllama
 from langchain.chains import LLMChain
 from langchain.prompts import ChatPromptTemplate
-from langchain.output_parsers import PydanticOutputParser
-from pydantic import BaseModel, Field
 from typing import List
-from composio_phidata import Action, ComposioToolSet
+import pandas as pd
 import json
+import requests
+from bs4 import BeautifulSoup
+import os
+import praw
 
-class QuoraUserInteractionSchema(BaseModel):
-    username: str = Field(description="The username of the user who posted the question or answer")
-    bio: str = Field(description="The bio or description of the user")
-    post_type: str = Field(description="The type of post, either 'question' or 'answer'")
-    timestamp: str = Field(description="When the question or answer was posted")
-    upvotes: int = Field(default=0, description="Number of upvotes received")
-    links: List[str] = Field(default_factory=list, description="Any links included in the post")
+reddit = praw.Reddit(
+    client_id=os.getenv("REDDIT_CLIENT_ID"),
+    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
+    user_agent=os.getenv("REDDIT_USER_AGENT", "lead-gen-app"),
+)
 
-class QuoraPageSchema(BaseModel):
-    interactions: List[QuoraUserInteractionSchema] = Field(description="List of all user interactions (questions and answers) on the page")
 
-def search_for_urls(company_description: str, num_links: int) -> List[str]:
+def search_ddg_quora(company_description: str, num_links: int) -> List[str]:
+    """Return Quora URLs using DuckDuckGo."""
     query = f"site:quora.com {company_description}"
-    results = ddg(query, max_results=num_links) or []
-    return [r.get("href") for r in results if r.get("href")]
+    with DDGS() as ddgs:
+        results = ddgs.text(query, max_results=num_links) or []
+    return [r.get("href") or r.get("url") for r in results if r.get("href") or r.get("url")]
+
+
+def search_reddit(company_description: str, limit: int) -> List[str]:
+    """Return Reddit post links using the official API."""
+    if limit <= 0:
+        return []
+    urls = []
+    for submission in reddit.subreddit("all").search(company_description, limit=limit):
+        urls.append(f"https://www.reddit.com{submission.permalink}")
+        if len(urls) >= limit:
+            break
+    return urls
+
+
+def search_for_urls(company_description: str, num_links: int) -> List[str]:
+    quora_urls = search_ddg_quora(company_description, num_links // 2)
+    reddit_urls = search_reddit(company_description, num_links - len(quora_urls))
+    combined = quora_urls + reddit_urls
+    return combined[:num_links]
 
-def extract_user_info_from_urls(urls: List[str], hf_api_token: str) -> List[dict]:
+
+def load_page_text(url: str) -> str:
+    """Fetch page HTML and return plain text."""
+    headers = {"User-Agent": "Mozilla/5.0"}
+    resp = requests.get(url, headers=headers, timeout=10)
+    resp.raise_for_status()
+    soup = BeautifulSoup(resp.text, "html.parser")
+    return soup.get_text(separator=" ", strip=True)
+
+def extract_user_info_from_urls(urls: List[str]) -> List[dict]:
     user_info_list = []
-    loader = PlaywrightURLLoader(urls, continue_on_failure=True)
-    docs = loader.load()
 
-    llm = ChatHuggingFaceHub(
-        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
-        model_kwargs={"temperature": 0.0, "max_new_tokens": 512},
-        huggingfacehub_api_token=hf_api_token,
-    )
+    llm = ChatOllama(model="mistral:7b-instruct")
 
-    parser = PydanticOutputParser(pydantic_object=QuoraPageSchema)
     prompt = ChatPromptTemplate.from_messages(
         [
-            ("system", "Extract all user interactions from this Quora page. {format_instructions}"),
+            (
+                "system",
+                (
+                    "Extract all user interactions from this page (Quora or Reddit) as JSON. "
+                    "Return a dictionary with an 'interactions' key containing a list "
+                    "of objects with 'username', 'bio', 'post_type', 'timestamp', "
+                    "'upvotes' and 'links'."
+                ),
+            ),
             ("human", "{page_content}"),
         ]
     )
     chain = LLMChain(llm=llm, prompt=prompt)
 
-    for url, doc in zip(urls, docs):
+    for url in urls:
         try:
-            result = chain.predict(
-                page_content=doc.page_content,
-                format_instructions=parser.get_format_instructions(),
-            )
-            parsed = parser.parse(result)
-            if parsed.interactions:
+            page_content = load_page_text(url)
+            result = chain.predict(page_content=page_content)
+            parsed = json.loads(result)
+            interactions = parsed.get("interactions", [])
+            if interactions:
                 user_info_list.append(
                     {
                         "website_url": url,
-                        "user_info": [i.dict() for i in parsed.interactions],
+                        "user_info": interactions,
                     }
                 )
         except Exception:
@@ -87,51 +113,15 @@ def format_user_info_to_flattened_json(user_info_list: List[dict]) -> List[dict]
     return flattened_data
 
 
-def create_google_sheets_agent(composio_api_key: str, hf_api_token: str) -> Agent:
-    composio_toolset = ComposioToolSet(api_key=composio_api_key)
-    google_sheets_tool = composio_toolset.get_tools(actions=[Action.GOOGLESHEETS_SHEET_FROM_JSON])[0]
-
-    google_sheets_agent = Agent(
-        model=ChatHuggingFaceHub(
-            repo_id="mistralai/Mistral-7B-Instruct-v0.1",
-            model_kwargs={"temperature": 0.0, "max_new_tokens": 512},
-            huggingfacehub_api_token=hf_api_token,
-        ),
-        tools=[google_sheets_tool],
-        show_tool_calls=True,
-        system_prompt="You are an expert at creating and updating Google Sheets. You will be given user information in JSON format, and you need to write it into a new Google Sheet.",
-        markdown=True
-    )
-    return google_sheets_agent
-def write_to_google_sheets(flattened_data: List[dict], composio_api_key: str, hf_api_token: str) -> str:
-    google_sheets_agent = create_google_sheets_agent(composio_api_key, hf_api_token)
-
-    try:
-        message = (
-            "Create a new Google Sheet with this data. "
-            "The sheet should have these columns: Website URL, Username, Bio, Post Type, Timestamp, Upvotes, and Links in the same order as mentioned. "
-            "Here's the data in JSON format:\n\n"
-            f"{json.dumps(flattened_data, indent=2)}"
-        )
-
-        create_sheet_response = google_sheets_agent.run(message)
-
-        if "https://docs.google.com/spreadsheets/d/" in create_sheet_response.content:
-            google_sheets_link = create_sheet_response.content.split("https://docs.google.com/spreadsheets/d/")[1].split(" ")[0]
-            return f"https://docs.google.com/spreadsheets/d/{google_sheets_link}"
-    except Exception:
-        pass
-    return None
-
-def create_prompt_transformation_agent(hf_api_token: str) -> Agent:
-    return Agent(
-        model=ChatHuggingFaceHub(
-            repo_id="mistralai/Mistral-7B-Instruct-v0.1",
-            model_kwargs={"temperature": 0.0, "max_new_tokens": 512},
-            huggingfacehub_api_token=hf_api_token,
-        ),
-        system_prompt="""You are an expert at transforming detailed user queries into concise company descriptions.
+def write_to_excel(flattened_data: List[dict], path: str) -> None:
+    df = pd.DataFrame(flattened_data)
+    df.to_excel(path, index=False)
+
+
+def transform_query(user_query: str) -> str:
+    llm = ChatOllama(model="mistral:7b-instruct")
+    system_prompt = """You are an expert at transforming detailed user queries into concise company descriptions.
 Your task is to extract the core business/product focus in 3-4 words.
 
 Examples:
 
@@ -147,22 +137,23 @@ def create_prompt_transformation_agent(hf_api_token: str) -> Agent:
 Input: "Need to find businesses interested in implementing machine learning solutions for fraud detection"
 Output: "ML fraud detection"
 
-Always focus on the core product/service and keep it concise but clear.""",
-        markdown=True
-    )
+Always focus on the core product/service and keep it concise but clear."""
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", system_prompt),
+        ("human", "{query}"),
+    ])
+    chain = LLMChain(llm=llm, prompt=prompt)
+    return chain.predict(query=user_query).strip()
 
 
 def main():
     st.title("🎯 AI Lead Generation Agent")
     st.info("Generate leads from Quora by searching for relevant posts and extracting user information.")
     with st.sidebar:
-        st.header("API Keys")
-        hf_api_token = st.text_input("HuggingFace API Token", type="password")
-        composio_api_key = st.text_input("Composio API Key", type="password")
-        st.caption(" Get your Composio API key from [Composio's website](https://composio.ai)")
-        
+        st.header("Configuration")
+        output_file = st.text_input("Output Excel filename", value="leads.xlsx")
         num_links = st.number_input("Number of links to search", min_value=1, max_value=10, value=3)
-        
+
         if st.button("Reset"):
             st.session_state.clear()
             st.experimental_rerun()
@@ -174,39 +165,34 @@ def main():
     )
 
     if st.button("Generate Leads"):
-        if not all([hf_api_token, composio_api_key, user_query]):
-            st.error("Please fill in all the API keys and describe what leads you're looking for.")
+        if not all([user_query, output_file]):
+            st.error("Please provide a description and output filename.")
         else:
             with st.spinner("Processing your query..."):
-                transform_agent = create_prompt_transformation_agent(hf_api_token)
-                company_description = transform_agent.run(f"Transform this query into a concise 3-4 word company description: {user_query}")
-                st.write("🎯 Searching for:", company_description.content)
+                company_description = transform_query(user_query)
+                st.write("🎯 Searching for:", company_description)
 
             with st.spinner("Searching for relevant URLs..."):
-                urls = search_for_urls(company_description.content, num_links)
+                urls = search_for_urls(company_description, num_links)
 
             if urls:
-                st.subheader("Quora Links Used:")
+                st.subheader("Links Used:")
                 for url in urls:
                     st.write(url)
 
                 with st.spinner("Extracting user info from URLs..."):
-                    user_info_list = extract_user_info_from_urls(urls, hf_api_token)
+                    user_info_list = extract_user_info_from_urls(urls)
 
                 with st.spinner("Formatting user info..."):
                     flattened_data = format_user_info_to_flattened_json(user_info_list)
 
-                with st.spinner("Writing to Google Sheets..."):
-                    google_sheets_link = write_to_google_sheets(flattened_data, composio_api_key, hf_api_token)
-
-                if google_sheets_link:
-                    st.success("Lead generation and data writing to Google Sheets completed successfully!")
-                    st.subheader("Google Sheets Link:")
-                    st.markdown(f"[View Google Sheet]({google_sheets_link})")
-                else:
-                    st.error("Failed to retrieve the Google Sheets link.")
+                with st.spinner("Writing to Excel..."):
+                    write_to_excel(flattened_data, output_file)
st.success("Lead generation completed successfully!") + st.subheader("Saved File:") + st.write(output_file) else: st.warning("No relevant URLs found.") if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/Agents/Crawler/requirements.txt b/Agents/Crawler/requirements.txt index 1eb65a0..e93fcae 100644 --- a/Agents/Crawler/requirements.txt +++ b/Agents/Crawler/requirements.txt @@ -1,5 +1,9 @@ -agno -composio-phidata -composio==0.1.1 -pydantic==2.10.5 -streamlit \ No newline at end of file +# Requires Reddit API credentials for praw +duckduckgo_search +langchain_community +pandas +openpyxl +requests +beautifulsoup4 +praw +streamlit