
Commit c595975

feat: add new scraping and sitemap extraction tools to ScapeGraphClient
Changes:
- Introduced `scrape` method for basic scraping of page content.
- Added `sitemap` method to extract sitemap URLs for a given website.
- Implemented `agentic_scrapper` method for running the Agentic Scraper workflow with flexible input handling.
- Updated timeout settings for the HTTP client to improve request handling.

This enhances the functionality of the ScapeGraphClient, allowing for more versatile web scraping capabilities.
1 parent 8136002 commit c595975
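For orientation, a minimal usage sketch of the new `scrape` and `sitemap` client methods follows. It assumes the class is importable as `ScapeGraphClient` from `scrapegraph_mcp.server` and that the API key is read from a hypothetical `SGAI_API_KEY` environment variable; adjust both to your setup.

# Minimal sketch of the scrape/sitemap client methods added in this commit.
# Assumptions: ScapeGraphClient is importable from scrapegraph_mcp.server and
# the API key lives in an environment variable named SGAI_API_KEY (hypothetical).
import os

from scrapegraph_mcp.server import ScapeGraphClient

client = ScapeGraphClient(api_key=os.environ["SGAI_API_KEY"])

# Basic scrape of a single page; render_heavy_js is optional and is omitted
# from the request payload when left as None.
page = client.scrape("https://example.com", render_heavy_js=False)

# Extract the sitemap URLs/structure for the same site.
site_map = client.sitemap("https://example.com")

print(sorted(page))      # top-level keys of the scrape response
print(sorted(site_map))  # top-level keys of the sitemap response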

File tree

2 files changed, +1670 −2 lines changed


src/scrapegraph_mcp/server.py

Lines changed: 186 additions & 2 deletions
@@ -10,7 +10,8 @@
 """
 
 import os
-from typing import Any, Dict
+import json
+from typing import Any, Dict, Optional, List, Union
 
 import httpx
 from fastmcp import FastMCP
@@ -33,7 +34,7 @@ def __init__(self, api_key: str):
             "SGAI-APIKEY": api_key,
             "Content-Type": "application/json"
         }
-        self.client = httpx.Client(timeout=60.0)
+        self.client = httpx.Client(timeout=httpx.Timeout(120.0))
 
     def markdownify(self, website_url: str) -> Dict[str, Any]:
         """
@@ -126,6 +127,85 @@ def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scr
 
         return response.json()
 
+    def scrape(self, website_url: str, render_heavy_js: Optional[bool] = None) -> Dict[str, Any]:
+        """
+        Basic scrape endpoint to fetch page content.
+
+        Args:
+            website_url: URL to scrape
+            render_heavy_js: Whether to render heavy JS (optional)
+
+        Returns:
+            Dictionary containing the scraped result
+        """
+        url = f"{self.BASE_URL}/scrape"
+        payload: Dict[str, Any] = {"website_url": website_url}
+        if render_heavy_js is not None:
+            payload["render_heavy_js"] = render_heavy_js
+
+        response = self.client.post(url, headers=self.headers, json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    def sitemap(self, website_url: str) -> Dict[str, Any]:
+        """
+        Extract sitemap for a given website.
+
+        Args:
+            website_url: Base website URL
+
+        Returns:
+            Dictionary containing sitemap URLs/structure
+        """
+        url = f"{self.BASE_URL}/sitemap"
+        payload: Dict[str, Any] = {"website_url": website_url}
+
+        response = self.client.post(url, headers=self.headers, json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    def agentic_scrapper(
+        self,
+        url: str,
+        user_prompt: Optional[str] = None,
+        output_schema: Optional[Dict[str, Any]] = None,
+        steps: Optional[List[str]] = None,
+        ai_extraction: Optional[bool] = None,
+        persistent_session: Optional[bool] = None,
+        timeout_seconds: Optional[float] = None,
+    ) -> Dict[str, Any]:
+        """
+        Run the Agentic Scraper workflow (no live session/browser interaction).
+
+        Args:
+            url: Target website URL
+            user_prompt: Instructions for what to do/extract (optional)
+            output_schema: Desired structured output schema (optional)
+            steps: High-level steps/instructions for the agent (optional)
+            ai_extraction: Whether to enable AI extraction mode (optional)
+            persistent_session: Whether to keep session alive between steps (optional)
+            timeout_seconds: Per-request timeout override in seconds (optional)
+        """
+        endpoint = f"{self.BASE_URL}/agentic-scrapper"
+        payload: Dict[str, Any] = {"url": url}
+        if user_prompt is not None:
+            payload["user_prompt"] = user_prompt
+        if output_schema is not None:
+            payload["output_schema"] = output_schema
+        if steps is not None:
+            payload["steps"] = steps
+        if ai_extraction is not None:
+            payload["ai_extraction"] = ai_extraction
+        if persistent_session is not None:
+            payload["persistent_session"] = persistent_session
+
+        if timeout_seconds is not None:
+            response = self.client.post(endpoint, headers=self.headers, json=payload, timeout=timeout_seconds)
+        else:
+            response = self.client.post(endpoint, headers=self.headers, json=payload)
+        response.raise_for_status()
+        return response.json()
+
     def smartcrawler_initiate(
         self,
         url: str,
@@ -371,6 +451,110 @@ def smartcrawler_fetch_results(request_id: str) -> Dict[str, Any]:
         return {"error": str(e)}
 
 
+# Add tool for basic scrape
+@mcp.tool()
+def scrape(website_url: str, render_heavy_js: Optional[bool] = None) -> Dict[str, Any]:
+    """
+    Fetch page content for a URL.
+
+    Args:
+        website_url: URL to scrape
+        render_heavy_js: Whether to render heavy JS (optional)
+    """
+    if scrapegraph_client is None:
+        return {"error": "ScapeGraph client not initialized. Please provide an API key."}
+
+    try:
+        return scrapegraph_client.scrape(website_url=website_url, render_heavy_js=render_heavy_js)
+    except httpx.HTTPError as http_err:
+        return {"error": str(http_err)}
+    except ValueError as val_err:
+        return {"error": str(val_err)}
+
+
+# Add tool for sitemap extraction
+@mcp.tool()
+def sitemap(website_url: str) -> Dict[str, Any]:
+    """
+    Extract sitemap for a website.
+
+    Args:
+        website_url: Base website URL
+    """
+    if scrapegraph_client is None:
+        return {"error": "ScapeGraph client not initialized. Please provide an API key."}
+
+    try:
+        return scrapegraph_client.sitemap(website_url=website_url)
+    except httpx.HTTPError as http_err:
+        return {"error": str(http_err)}
+    except ValueError as val_err:
+        return {"error": str(val_err)}
+
+
+# Add tool for Agentic Scraper (no live session/browser interaction)
+@mcp.tool()
+def agentic_scrapper(
+    url: str,
+    user_prompt: Optional[str] = None,
+    output_schema: Optional[Union[str, Dict[str, Any]]] = None,
+    steps: Optional[Union[str, List[str]]] = None,
+    ai_extraction: Optional[bool] = None,
+    persistent_session: Optional[bool] = None,
+    timeout_seconds: Optional[float] = None,
+) -> Dict[str, Any]:
+    """
+    Run the Agentic Scraper workflow. Accepts flexible input forms for steps and schema.
+    """
+    if scrapegraph_client is None:
+        return {"error": "ScapeGraph client not initialized. Please provide an API key."}
+
+    # Normalize inputs
+    normalized_steps: Optional[List[str]] = None
+    if isinstance(steps, list):
+        normalized_steps = steps
+    elif isinstance(steps, str):
+        parsed_steps: Optional[Any] = None
+        try:
+            parsed_steps = json.loads(steps)
+        except json.JSONDecodeError:
+            parsed_steps = None
+        if isinstance(parsed_steps, list):
+            normalized_steps = parsed_steps
+        else:
+            normalized_steps = [steps]
+
+    normalized_schema: Optional[Dict[str, Any]] = None
+    if isinstance(output_schema, dict):
+        normalized_schema = output_schema
+    elif isinstance(output_schema, str):
+        try:
+            parsed_schema = json.loads(output_schema)
+            if isinstance(parsed_schema, dict):
+                normalized_schema = parsed_schema
+            else:
+                return {"error": "output_schema must be a JSON object"}
+        except json.JSONDecodeError as e:
+            return {"error": f"Invalid JSON for output_schema: {str(e)}"}
+
+    try:
+        return scrapegraph_client.agentic_scrapper(
+            url=url,
+            user_prompt=user_prompt,
+            output_schema=normalized_schema,
+            steps=normalized_steps,
+            ai_extraction=ai_extraction,
+            persistent_session=persistent_session,
+            timeout_seconds=timeout_seconds,
+        )
+    except httpx.TimeoutException as timeout_err:
+        return {"error": f"Request timed out: {str(timeout_err)}"}
+    except httpx.HTTPError as http_err:
+        return {"error": str(http_err)}
+    except ValueError as val_err:
+        return {"error": str(val_err)}
+
+
 def main() -> None:
     """Run the ScapeGraph MCP server."""
     print("Starting ScapeGraph MCP server!")
