Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 17 additions & 8 deletions Agents/Crawler/README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
## 🎯 AI Lead Generation Agent - Powered by Firecrawl's Extract Endpoint
## 🎯 AI Lead Generation Agent

The AI Lead Generation Agent automates the process of finding and qualifying potential leads from Quora. It uses Firecrawl's search and the new Extract endpoint to identify relevant user profiles, extract valuable information, and organize it into a structured format in Google Sheets. This agent helps sales and marketing teams efficiently build targeted lead lists while saving hours of manual research.
This Streamlit application searches Quora via DuckDuckGo and Reddit via the official Reddit API. It downloads each page with plain HTTP requests, extracts user interactions using a local Mistral model, and saves the results to an Excel file.

### Features
- **Targeted Search**: Uses Firecrawl's search endpoint to find relevant Quora URLs based on your search criteria
- **Intelligent Extraction**: Leverages Firecrawl's new Extract endpoint to pull user information from Quora profiles
- **Automated Processing**: Formats extracted user information into a clean, structured format
- **Google Sheets Integration**: Automatically creates and populates Google Sheets with lead information
- **Customizable Criteria**: Allows you to define specific search parameters to find your ideal leads for your niche
- Searches Quora links via DuckDuckGo
- Queries Reddit posts through the Reddit API
- Fetches pages directly via HTTP requests
- Uses a local `mistral:7b-instruct` model for extraction
- Outputs the collected data to an Excel spreadsheet
- Choose how many links to process

### Setup

Expand All @@ -17,4 +18,12 @@ The AI Lead Generation Agent automates the process of finding and qualifying pot
pip install -r requirements.txt
```

2. When running the Streamlit app you'll be prompted for a **HuggingFace Repo ID**. Provide the repo for the chat model you would like to use (e.g. `meta-llama/Llama-3-8B-Instruct`).
2. Run the Streamlit app:

```bash
streamlit run ai_lead_generation_agent.py
```

When prompted, provide the Excel filename and describe the leads you are looking for.

The Reddit API requires credentials set in the `REDDIT_CLIENT_ID`, `REDDIT_CLIENT_SECRET`, and `REDDIT_USER_AGENT` environment variables.
202 changes: 94 additions & 108 deletions Agents/Crawler/ai_lead_generation_agent.py
Original file line number Diff line number Diff line change
@@ -1,64 +1,90 @@
import streamlit as st
from agno.agent import Agent
from duckduckgo_search import ddg
from langchain_community.chat_models.huggingface import ChatHuggingFaceHub
from langchain_community.document_loaders import PlaywrightURLLoader
from duckduckgo_search import DDGS
from langchain_community.chat_models import ChatOllama
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List
from composio_phidata import Action, ComposioToolSet
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
import os
import praw

class QuoraUserInteractionSchema(BaseModel):
username: str = Field(description="The username of the user who posted the question or answer")
bio: str = Field(description="The bio or description of the user")
post_type: str = Field(description="The type of post, either 'question' or 'answer'")
timestamp: str = Field(description="When the question or answer was posted")
upvotes: int = Field(default=0, description="Number of upvotes received")
links: List[str] = Field(default_factory=list, description="Any links included in the post")
# Module-level Reddit API client used by search_reddit().
# Credentials are read from environment variables; REDDIT_USER_AGENT
# falls back to "lead-gen-app" when unset. If REDDIT_CLIENT_ID or
# REDDIT_CLIENT_SECRET is missing, API calls will fail at request time.
reddit = praw.Reddit(
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    user_agent=os.getenv("REDDIT_USER_AGENT", "lead-gen-app"),
)

class QuoraPageSchema(BaseModel):
interactions: List[QuoraUserInteractionSchema] = Field(description="List of all user interactions (questions and answers) on the page")

def search_for_urls(company_description: str, num_links: int) -> List[str]:
def search_ddg_quora(company_description: str, num_links: int) -> List[str]:
"""Return Quora URLs using DuckDuckGo."""
query = f"site:quora.com {company_description}"
results = ddg(query, max_results=num_links) or []
return [r.get("href") for r in results if r.get("href")]
with DDGS() as ddgs:
results = ddgs.text(query, max_results=num_links) or []
return [r.get("href") or r.get("url") for r in results if r.get("href") or r.get("url")]


def search_reddit(company_description: str, limit: int) -> List[str]:
    """Search Reddit for posts matching *company_description*.

    Uses the module-level ``reddit`` (praw) client to search r/all and
    returns at most ``limit`` absolute post URLs. A non-positive limit
    yields an empty list without hitting the API.
    """
    if limit <= 0:
        return []
    submissions = reddit.subreddit("all").search(company_description, limit=limit)
    links = [f"https://www.reddit.com{post.permalink}" for post in submissions]
    return links[:limit]


def search_for_urls(company_description: str, num_links: int) -> List[str]:
    """Collect up to ``num_links`` lead URLs from Quora and Reddit.

    Half of the budget (rounded down) goes to Quora via DuckDuckGo; the
    remainder is filled from Reddit. The combined list is truncated so it
    never exceeds ``num_links`` entries.
    """
    urls = search_ddg_quora(company_description, num_links // 2)
    urls += search_reddit(company_description, num_links - len(urls))
    return urls[:num_links]

def extract_user_info_from_urls(urls: List[str], hf_api_token: str) -> List[dict]:

def load_page_text(url: str) -> str:
    """Download *url* and return its visible text, whitespace-normalized.

    Sends a browser-like User-Agent to reduce bot blocking, raises for
    non-2xx responses, and strips all HTML markup with BeautifulSoup.
    """
    response = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=10,  # avoid hanging on slow or unresponsive hosts
    )
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser").get_text(
        separator=" ", strip=True
    )

def extract_user_info_from_urls(urls: List[str]) -> List[dict]:
user_info_list = []
loader = PlaywrightURLLoader(urls, continue_on_failure=True)
docs = loader.load()

llm = ChatHuggingFaceHub(
repo_id="mistralai/Mistral-7B-Instruct-v0.1",
model_kwargs={"temperature": 0.0, "max_new_tokens": 512},
huggingfacehub_api_token=hf_api_token,
)
llm = ChatOllama(model="mistral:7b-instruct")

parser = PydanticOutputParser(pydantic_object=QuoraPageSchema)
prompt = ChatPromptTemplate.from_messages(
[
("system", "Extract all user interactions from this Quora page. {format_instructions}"),
(
"system",
(
"Extract all user interactions from this page (Quora or Reddit) as JSON. "
"Return a dictionary with an 'interactions' key containing a list "
"of objects with 'username', 'bio', 'post_type', 'timestamp', "
"'upvotes' and 'links'."
),
),
("human", "{page_content}"),
]
)
chain = LLMChain(llm=llm, prompt=prompt)

for url, doc in zip(urls, docs):
for url in urls:
try:
result = chain.predict(
page_content=doc.page_content,
format_instructions=parser.get_format_instructions(),
)
parsed = parser.parse(result)
if parsed.interactions:
page_content = load_page_text(url)
result = chain.predict(page_content=page_content)
parsed = json.loads(result)
interactions = parsed.get("interactions", [])
if interactions:
user_info_list.append(
{
"website_url": url,
"user_info": [i.dict() for i in parsed.interactions],
"user_info": interactions,
}
)
except Exception:
Expand Down Expand Up @@ -87,51 +113,15 @@ def format_user_info_to_flattened_json(user_info_list: List[dict]) -> List[dict]

return flattened_data

def create_google_sheets_agent(composio_api_key: str, hf_api_token: str) -> Agent:
composio_toolset = ComposioToolSet(api_key=composio_api_key)
google_sheets_tool = composio_toolset.get_tools(actions=[Action.GOOGLESHEETS_SHEET_FROM_JSON])[0]

google_sheets_agent = Agent(
model=ChatHuggingFaceHub(
repo_id="mistralai/Mistral-7B-Instruct-v0.1",
model_kwargs={"temperature": 0.0, "max_new_tokens": 512},
huggingfacehub_api_token=hf_api_token,
),
tools=[google_sheets_tool],
show_tool_calls=True,
system_prompt="You are an expert at creating and updating Google Sheets. You will be given user information in JSON format, and you need to write it into a new Google Sheet.",
markdown=True
)
return google_sheets_agent

def write_to_google_sheets(flattened_data: List[dict], composio_api_key: str, hf_api_token: str) -> str:
google_sheets_agent = create_google_sheets_agent(composio_api_key, hf_api_token)

try:
message = (
"Create a new Google Sheet with this data. "
"The sheet should have these columns: Website URL, Username, Bio, Post Type, Timestamp, Upvotes, and Links in the same order as mentioned. "
"Here's the data in JSON format:\n\n"
f"{json.dumps(flattened_data, indent=2)}"
)

create_sheet_response = google_sheets_agent.run(message)

if "https://docs.google.com/spreadsheets/d/" in create_sheet_response.content:
google_sheets_link = create_sheet_response.content.split("https://docs.google.com/spreadsheets/d/")[1].split(" ")[0]
return f"https://docs.google.com/spreadsheets/d/{google_sheets_link}"
except Exception:
pass
return None

def create_prompt_transformation_agent(hf_api_token: str) -> Agent:
return Agent(
model=ChatHuggingFaceHub(
repo_id="mistralai/Mistral-7B-Instruct-v0.1",
model_kwargs={"temperature": 0.0, "max_new_tokens": 512},
huggingfacehub_api_token=hf_api_token,
),
system_prompt="""You are an expert at transforming detailed user queries into concise company descriptions.
def write_to_excel(flattened_data: List[dict], path: str) -> None:
    """Persist the flattened lead records to an Excel workbook at *path*.

    Each dict becomes one row; the row index is omitted from the output.
    """
    pd.DataFrame(flattened_data).to_excel(path, index=False)


def transform_query(user_query: str) -> str:
llm = ChatOllama(model="mistral:7b-instruct")
system_prompt = """You are an expert at transforming detailed user queries into concise company descriptions.
Your task is to extract the core business/product focus in 3-4 words.

Examples:
Expand All @@ -147,22 +137,23 @@ def create_prompt_transformation_agent(hf_api_token: str) -> Agent:
Input: "Need to find businesses interested in implementing machine learning solutions for fraud detection"
Output: "ML fraud detection"

Always focus on the core product/service and keep it concise but clear.""",
markdown=True
)
Always focus on the core product/service and keep it concise but clear."""
prompt = ChatPromptTemplate.from_messages([
("system", system_prompt),
("human", "{query}"),
])
chain = LLMChain(llm=llm, prompt=prompt)
return chain.predict(query=user_query).strip()

def main():
st.title("🎯 AI Lead Generation Agent")
st.info("Generate leads from Quora by searching for relevant posts and extracting user information.")

with st.sidebar:
st.header("API Keys")
hf_api_token = st.text_input("HuggingFace API Token", type="password")
composio_api_key = st.text_input("Composio API Key", type="password")
st.caption(" Get your Composio API key from [Composio's website](https://composio.ai)")

st.header("Configuration")
output_file = st.text_input("Output Excel filename", value="leads.xlsx")
num_links = st.number_input("Number of links to search", min_value=1, max_value=10, value=3)

if st.button("Reset"):
st.session_state.clear()
st.experimental_rerun()
Expand All @@ -174,39 +165,34 @@ def main():
)

if st.button("Generate Leads"):
if not all([hf_api_token, composio_api_key, user_query]):
st.error("Please fill in all the API keys and describe what leads you're looking for.")
if not all([user_query, output_file]):
st.error("Please provide a description and output filename.")
else:
with st.spinner("Processing your query..."):
transform_agent = create_prompt_transformation_agent(hf_api_token)
company_description = transform_agent.run(f"Transform this query into a concise 3-4 word company description: {user_query}")
st.write("🎯 Searching for:", company_description.content)
company_description = transform_query(user_query)
st.write("🎯 Searching for:", company_description)

with st.spinner("Searching for relevant URLs..."):
urls = search_for_urls(company_description.content, num_links)
urls = search_for_urls(company_description, num_links)

if urls:
st.subheader("Quora Links Used:")
st.subheader("Links Used:")
for url in urls:
st.write(url)

with st.spinner("Extracting user info from URLs..."):
user_info_list = extract_user_info_from_urls(urls, hf_api_token)
user_info_list = extract_user_info_from_urls(urls)

with st.spinner("Formatting user info..."):
flattened_data = format_user_info_to_flattened_json(user_info_list)

with st.spinner("Writing to Google Sheets..."):
google_sheets_link = write_to_google_sheets(flattened_data, composio_api_key, hf_api_token)

if google_sheets_link:
st.success("Lead generation and data writing to Google Sheets completed successfully!")
st.subheader("Google Sheets Link:")
st.markdown(f"[View Google Sheet]({google_sheets_link})")
else:
st.error("Failed to retrieve the Google Sheets link.")
with st.spinner("Writing to Excel..."):
write_to_excel(flattened_data, output_file)
st.success("Lead generation completed successfully!")
st.subheader("Saved File:")
st.write(output_file)
else:
st.warning("No relevant URLs found.")

if __name__ == "__main__":
main()
main()
14 changes: 9 additions & 5 deletions Agents/Crawler/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
agno
composio-phidata
composio==0.1.1
pydantic==2.10.5
streamlit
# Requires Reddit API credentials for praw
duckduckgo_search
langchain_community
pandas
openpyxl
requests
beautifulsoup4
praw
streamlit