 - markdownify: Convert any webpage into clean, formatted markdown
 - smartscraper: Extract structured data from any webpage using AI
 - searchscraper: Perform AI-powered web searches with structured results
-- crawl_requester: Initiate intelligent web crawling requests (step 1)
-- crawl_fetcher: Fetch results from crawling requests (step 2)
+- smartcrawler_initiate: Initiate intelligent multi-page web crawling with AI extraction or markdown conversion
+- smartcrawler_fetch_results: Retrieve results from asynchronous crawling operations
 """
 
 import os
@@ -126,49 +126,58 @@ def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scr
 
         return response.json()
 
-    def crawl_requester(
+    def smartcrawler_initiate(
         self,
         url: str,
         prompt: str = None,
-        cache_website: bool = None,
+        extraction_mode: str = "ai",
         depth: int = None,
         max_pages: int = None,
-        same_domain_only: bool = None,
-        markdown_only: bool = None
+        same_domain_only: bool = None
     ) -> Dict[str, Any]:
         """
-        Initiate a web crawling request and get a request ID.
+        Initiate a SmartCrawler request for multi-page web crawling.
+
+        SmartCrawler supports two modes:
+        - AI Extraction Mode (10 credits per page): Extracts structured data based on your prompt
+        - Markdown Conversion Mode (2 credits per page): Converts pages to clean markdown
+
+        SmartCrawler processes requests asynchronously and returns a request ID
+        rather than immediate results. Pass that ID to smartcrawler_fetch_results
+        and keep polling until the request is complete, which is signalled by a
+        status of "completed".
 
         Args:
             url: Starting URL to crawl
-            prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
-            cache_website: Whether to cache the website content (optional)
-            depth: Maximum crawling depth (optional)
+            prompt: AI prompt for data extraction (required for AI mode)
+            extraction_mode: "ai" for AI extraction or "markdown" for markdown conversion (default: "ai")
+            depth: Maximum link traversal depth (optional)
             max_pages: Maximum number of pages to crawl (optional)
             same_domain_only: Whether to crawl only within the same domain (optional)
-            markdown_only: Whether to return only markdown content without AI processing (optional)
 
         Returns:
-            Dictionary containing the request ID and status
+            Dictionary containing the request ID for async processing
         """
-        endpoint = f"{self.BASE_URL}/crawl/requester"
+        endpoint = f"{self.BASE_URL}/crawl"
         data = {
             "url": url
         }
 
-        # Add optional parameters if provided
-        if prompt is not None:
+        # Handle extraction mode
+        if extraction_mode == "markdown":
+            data["markdown_only"] = True
+        elif extraction_mode == "ai":
+            if prompt is None:
+                raise ValueError("prompt is required when extraction_mode is 'ai'")
             data["prompt"] = prompt
-        if cache_website is not None:
-            data["cache_website"] = cache_website
+        else:
+            raise ValueError(f"Invalid extraction_mode: {extraction_mode}. Must be 'ai' or 'markdown'")
         if depth is not None:
            data["depth"] = depth
         if max_pages is not None:
             data["max_pages"] = max_pages
         if same_domain_only is not None:
             data["same_domain_only"] = same_domain_only
-        if markdown_only is not None:
-            data["markdown_only"] = markdown_only
 
         response = self.client.post(endpoint, headers=self.headers, json=data)
 
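
The mode handling above replaces the old `markdown_only` flag with an explicit `extraction_mode` switch (and drops `cache_website`). A minimal sketch of what that means for callers, assuming an initialized client instance named `client`; the URLs and prompts are placeholders:

```python
# Markdown mode needs no prompt; it maps to {"markdown_only": True}
# in the request body (2 credits per page).
client.smartcrawler_initiate(url="https://example.com", extraction_mode="markdown")

# AI mode (10 credits per page) requires a prompt...
client.smartcrawler_initiate(
    url="https://example.com",
    prompt="Extract product names and prices",
    extraction_mode="ai",
)

# ...and raises ValueError when the prompt is missing or the mode is unknown.
try:
    client.smartcrawler_initiate(url="https://example.com", extraction_mode="ai")
except ValueError as err:
    print(err)  # prompt is required when extraction_mode is 'ai'
```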
@@ -178,22 +187,27 @@ def crawl_requester(
 
         return response.json()
 
-    def crawl_fetcher(self, request_id: str) -> Dict[str, Any]:
+    def smartcrawler_fetch_results(self, request_id: str) -> Dict[str, Any]:
         """
-        Fetch the results of a crawling request using the request ID.
+        Fetch the results of a SmartCrawler operation.
 
         Args:
-            request_id: The request ID returned by crawl_requester
+            request_id: The request ID returned by smartcrawler_initiate
 
         Returns:
-            Dictionary containing the crawl results or status
+            Dictionary containing the crawled data (structured extraction or markdown)
+            and metadata about processed pages
+
+        Note:
+            Processing is asynchronous. While the crawl is still running, this
+            call returns the current status instead of results; keep polling
+            until the status is "completed", at which point the response
+            contains the crawled data.
         """
-        endpoint = f"{self.BASE_URL}/crawl/fetcher"
-        data = {
-            "request_id": request_id
-        }
-
-        response = self.client.post(endpoint, headers=self.headers, json=data)
+        endpoint = f"{self.BASE_URL}/crawl/{request_id}"
+
+        response = self.client.get(endpoint, headers=self.headers)
 
         if response.status_code != 200:
             error_msg = f"Error {response.status_code}: {response.text}"
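
Because fetching is now a plain GET on `/crawl/{request_id}`, callers follow an initiate-then-poll pattern. A minimal sketch under the same assumption of an initialized `client`; the `request_id` and `status` response keys are inferred from the docstrings, not confirmed field names:

```python
import time

resp = client.smartcrawler_initiate(
    url="https://example.com",
    prompt="Extract article titles",
)
request_id = resp["request_id"]  # assumed key name

# Per the docstring, results are ready once status == "completed".
while True:
    result = client.smartcrawler_fetch_results(request_id)
    if result.get("status") == "completed":
        break
    time.sleep(5)  # modest delay between polls
```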
@@ -291,66 +305,68 @@ def searchscraper(
         return {"error": str(e)}
 
 
-# Add tool for crawl requester (smartcrawler step 1)
+# Add tool for SmartCrawler initiation
 @mcp.tool()
-def crawl_requester(
+def smartcrawler_initiate(
     url: str,
     prompt: str = None,
-    cache_website: bool = None,
+    extraction_mode: str = "ai",
     depth: int = None,
     max_pages: int = None,
-    same_domain_only: bool = None,
-    markdown_only: bool = None
+    same_domain_only: bool = None
 ) -> Dict[str, Any]:
     """
-    Initiate a web crawling request and get a request ID.
+    Initiate a SmartCrawler request for intelligent multi-page web crawling.
+
+    SmartCrawler supports two modes:
+    - AI Extraction Mode (10 credits per page): Extracts structured data based on your prompt
+    - Markdown Conversion Mode (2 credits per page): Converts pages to clean markdown
 
     Args:
         url: Starting URL to crawl
-        prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
-        cache_website: Whether to cache the website content (optional)
-        depth: Maximum crawling depth (optional)
+        prompt: AI prompt for data extraction (required for AI mode)
+        extraction_mode: "ai" for AI extraction or "markdown" for markdown conversion (default: "ai")
+        depth: Maximum link traversal depth (optional)
         max_pages: Maximum number of pages to crawl (optional)
         same_domain_only: Whether to crawl only within the same domain (optional)
-        markdown_only: Whether to return only markdown content without AI processing (optional)
 
     Returns:
-        Dictionary containing the request ID and status
+        Dictionary containing the request ID for async processing
     """
     if scrapegraph_client is None:
         return {"error": "ScapeGraph client not initialized. Please provide an API key."}
 
     try:
-        return scrapegraph_client.crawl_requester(
+        return scrapegraph_client.smartcrawler_initiate(
             url=url,
             prompt=prompt,
-            cache_website=cache_website,
+            extraction_mode=extraction_mode,
             depth=depth,
             max_pages=max_pages,
-            same_domain_only=same_domain_only,
-            markdown_only=markdown_only
+            same_domain_only=same_domain_only
         )
     except Exception as e:
         return {"error": str(e)}
 
 
-# Add tool for crawl fetcher (smartcrawler step 2)
+# Add tool for fetching SmartCrawler results
 @mcp.tool()
-def crawl_fetcher(request_id: str) -> Dict[str, Any]:
+def smartcrawler_fetch_results(request_id: str) -> Dict[str, Any]:
     """
-    Fetch the results of a crawling request using the request ID.
+    Fetch the results of a SmartCrawler operation.
 
     Args:
-        request_id: The request ID returned by crawl_requester
+        request_id: The request ID returned by smartcrawler_initiate
 
     Returns:
-        Dictionary containing the crawl results or status
+        Dictionary containing the crawled data (structured extraction or markdown)
+        and metadata about processed pages
     """
     if scrapegraph_client is None:
         return {"error": "ScapeGraph client not initialized. Please provide an API key."}
 
     try:
-        return scrapegraph_client.crawl_fetcher(request_id)
+        return scrapegraph_client.smartcrawler_fetch_results(request_id)
     except Exception as e:
         return {"error": str(e)}
 
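
Unlike the underlying client methods, the MCP tool wrappers never raise; they catch exceptions and return an `{"error": ...}` dictionary instead. A hypothetical agent-side caller should therefore check for that key, roughly:

```python
resp = smartcrawler_initiate(url="https://example.com", extraction_mode="markdown")
if "error" in resp:
    print(resp["error"])  # surface the failure instead of polling
else:
    # then poll as shown earlier; "request_id" is an assumed key name
    results = smartcrawler_fetch_results(resp["request_id"])
```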