1+ #!/usr/bin/env python3
2+ """
3+ Async Step-by-Step Cookies Example
4+
5+ This example demonstrates how to use cookies with SmartScraper API using async/await patterns.
6+ It shows how to set up and execute requests with custom cookies for authentication and session management.
7+ """
8+
9+ import asyncio
10+ import json
11+ import logging
12+ import os
13+ import time
14+
15+ import httpx
16+ from dotenv import load_dotenv
17+
# Root logging: INFO and above, timestamp - level - message, emitted through
# a single StreamHandler.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Load environment variables from a .env file before any step reads them.
load_dotenv()
28+
29+
async def step_1_environment_setup():
    """Step 1: Read the API key from the environment and report on it.

    Looks up ``TEST_API_KEY`` with ``os.getenv``. When the variable is unset
    or empty, setup instructions are printed and ``None`` is returned.
    Otherwise a masked preview of the key is printed and the key itself is
    returned unchanged.

    Returns:
        The API key string, or ``None`` when no key is available.
    """
    print("STEP 1: Environment Setup")
    print("=" * 40)

    # Check if API key is available
    api_key = os.getenv("TEST_API_KEY")
    if not api_key:
        print("❌ Error: TEST_API_KEY environment variable not set")
        print("Please either:")
        print(" 1. Set environment variable: export TEST_API_KEY='your-api-key-here'")
        print(" 2. Create a .env file with: TEST_API_KEY=your-api-key-here")
        return None

    print("✅ API key found in environment")

    # The preview exposes 8 leading and 4 trailing characters. For a key of
    # 12 characters or fewer those two slices would cover the whole secret,
    # so such keys are replaced by asterisks instead of being sliced.
    if len(api_key) > 12:
        preview = f"{api_key[:8]} ...{api_key[-4:]} "
    else:
        preview = "*" * len(api_key)
    print(f"🔑 API Key: {preview}")
    return api_key
47+
48+
async def step_2_server_connectivity_check(api_key):
    """Step 2: Probe the local server's health endpoint.

    Issues a GET against ``/healthz`` on the host that serves the
    SmartScraper endpoint, using an ``httpx.AsyncClient`` with a 5 second
    timeout.

    Args:
        api_key: Not read by this check.

    Returns:
        ``True`` when the health endpoint answers with HTTP 200; ``False``
        for any other status code or when the request raises.
    """
    print("\n STEP 2: Server Connectivity Check")
    print("=" * 40)

    scraper_url = "http://localhost:8001/v1/smartscraper"
    # The health route lives on the same host; swap the API path for it.
    health_url = scraper_url.replace("/v1/smartscraper", "/healthz")

    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            reply = await client.get(health_url)

            if reply.status_code != 200:
                print(f"❌ Server health check failed with status {reply.status_code} ")
                return False

            print("✅ Server is accessible")
            print(f"🔗 Health endpoint: {health_url} ")
            return True
    except Exception as exc:
        # Any failure here is reported as "not reachable" together with a
        # hint on how to start the server.
        print(f"❌ Server connectivity check failed: {exc} ")
        print("Please ensure the server is running:")
        print(" poetry run uvicorn app.main:app --host 0.0.0.0 --port 8001 --reload")
        return False
74+
75+
def step_3_define_cookies():
    """Step 3: Build the sample cookie jar and print a summary of it.

    Values stored under a key containing "token" (case-insensitive) are cut
    to their first 20 characters plus "..." when printed, if they are longer
    than 20 characters. The returned dictionary always holds the full values.

    Returns:
        dict mapping cookie names to their string values.
    """
    print("\n STEP 3: Define Cookies")
    print("=" * 40)

    # Sample cookies for a site that expects an authenticated session.
    cookies = {
        "session_id": "abc123def456ghi789",
        "user_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
        "remember_me": "true",
        "language": "en",
        "theme": "dark",
    }

    print("🍪 Cookies configured:")
    for name, raw in cookies.items():
        shown = raw
        # Only long token-like values are shortened for display.
        if "token" in name.lower() and len(raw) > 20:
            shown = raw[:20] + "..."
        print(f" {name} : {shown} ")

    print(f"\n 📊 Total cookies: {len(cookies)} ")
    return cookies
101+
102+
def step_4_define_request_parameters():
    """Step 4: Choose the target URL and extraction prompt, and print them.

    Returns:
        dict with the keys ``"website_url"`` and ``"user_prompt"``.
    """
    print("\n STEP 4: Define Request Parameters")
    print("=" * 40)

    # Fixed sample configuration for this walkthrough.
    params = {
        "website_url": "https://example.com/dashboard",
        "user_prompt": "Extract user profile information and account details",
    }

    print("🌐 Website URL:")
    print(f" {params['website_url']} ")
    print("\n 📝 User Prompt:")
    print(f" {params['user_prompt']} ")
    print("\n 🎯 Goal: Access authenticated content using cookies")

    return params
122+
123+
def step_5_prepare_headers(api_key):
    """Step 5: Build the HTTP headers for the SmartScraper request.

    The headers are printed for inspection with the API key masked. The
    returned dictionary carries the real, unmasked key.

    Args:
        api_key: API key string placed in the ``SGAI-APIKEY`` header.

    Returns:
        dict of header names to header values.
    """
    print("\n STEP 5: Prepare Request Headers")
    print("=" * 40)

    headers = {
        "SGAI-APIKEY": api_key,
        "Content-Type": "application/json",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
        "Accept": "application/json",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
    }

    print("📋 Headers configured:")
    for key, value in headers.items():
        if key == "SGAI-APIKEY":
            # The preview exposes 10 leading and 10 trailing characters. A
            # key of 20 characters or fewer would be shown in full by those
            # slices, so such keys are replaced by asterisks instead.
            if len(value) > 20:
                shown = f"{value[:10]} ...{value[-10:]} "
            else:
                shown = "*" * len(value)
            print(f" {key} : {shown}")
        else:
            print(f" {key} : {value} ")

    return headers
147+
148+
async def step_6_execute_cookies_request(headers, cookies, config):
    """Step 6: POST the scrape request, cookies included, and time it.

    Args:
        headers: HTTP headers sent with the request.
        cookies: Cookie mapping embedded in the JSON payload under
            ``"cookies"``.
        config: Mapping that provides ``"website_url"`` and
            ``"user_prompt"``.

    Returns:
        A ``(result, duration)`` tuple. ``result`` is the decoded JSON body
        when the server answers with HTTP 200 and ``None`` on any other
        status, timeout or error. ``duration`` is the elapsed wall-clock
        time in seconds.
    """
    print("\n STEP 6: Execute Request with Cookies")
    print("=" * 40)

    endpoint = "http://localhost:8001/v1/smartscraper"

    # JSON body; the cookies travel inside the payload.
    body = {
        "website_url": config["website_url"],
        "user_prompt": config["user_prompt"],
        "output_schema": {},
        "cookies": cookies,
    }

    print("🚀 Starting request with cookies...")
    print("🍪 Using authentication cookies for access...")

    started = time.time()
    try:
        # Generous client timeout: 120 seconds.
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(endpoint, headers=headers, json=body)

            duration = time.time() - started

            print(f"✅ Request completed in {duration:.2f} seconds")
            print(f"📊 Response Status: {response.status_code} ")

            if response.status_code != 200:
                print(f"❌ Request failed with status {response.status_code} ")
                print(f"Response: {response.text} ")
                return None, duration

            return response.json(), duration

    # TimeoutException is handled before the more general RequestError.
    except httpx.TimeoutException:
        duration = time.time() - started
        print(f"❌ Request timed out after {duration:.2f} seconds (>120s timeout)")
        print("This may indicate authentication issues or slow response.")
        return None, duration

    except httpx.RequestError as exc:
        duration = time.time() - started
        print(f"❌ Request error after {duration:.2f} seconds: {exc} ")
        print("Common causes:")
        for cause in (
            " - Server is not running",
            " - Invalid cookies",
            " - Network connectivity issues",
        ):
            print(cause)
        return None, duration

    except Exception as exc:
        duration = time.time() - started
        print(f"❌ Unexpected error after {duration:.2f} seconds: {exc} ")
        return None, duration
208+
209+
def step_7_process_results(result, duration):
    """Step 7: Pretty-print whatever the request returned.

    A dict is dumped as indented JSON, a list is summarised with its first
    three items, and any other value is printed as-is. Nothing is printed
    beyond a notice when ``result`` is ``None``.

    Args:
        result: Decoded response (dict, list or other value), or ``None``.
        duration: Elapsed request time in seconds, printed at the end.

    Returns:
        None.
    """
    print("\n STEP 7: Process Results")
    print("=" * 40)

    if result is None:
        print("❌ No results to process")
        return

    print("📋 Processing authenticated results...")

    if isinstance(result, dict):
        print("\n 🔍 Response Structure:")
        print(json.dumps(result, indent=2, ensure_ascii=False))

        # A "result" key is treated as the success indicator.
        if "result" in result:
            print("\n ✨ Authentication successful! Data extracted with cookies")

    elif isinstance(result, list):
        total = len(result)
        print(f"\n ✅ Authentication successful! Extracted {total} items")

        # Preview at most the first three entries.
        print("\n 📦 Sample Results:")
        for position, item in enumerate(result[:3], start=1):
            print(f" {position} . {item} ")

        if total > 3:
            print(f" ... and {total - 3} more items")

    else:
        print(f"\n 📋 Result: {result} ")

    print(f"\n ⏱️ Total processing time: {duration:.2f} seconds")
245+
246+
def step_8_show_curl_equivalent(api_key, cookies, config):
    """Step 8: Print a curl command equivalent to the request from step 6.

    The JSON body is produced with ``json.dumps`` and every argument is
    passed through ``shlex.quote``, so prompts, URLs or cookie values that
    contain quotes or other shell metacharacters still yield valid JSON and
    a command that can be pasted into a POSIX shell.

    Args:
        api_key: API key placed in the ``SGAI-APIKEY`` header.
        cookies: Mapping of cookie names to values; rendered both as a
            ``--cookie`` string and inside the JSON payload.
        config: Mapping that provides ``"website_url"`` and
            ``"user_prompt"``.

    Returns:
        None. The command is written to stdout.
    """
    import shlex  # local import: only this step needs shell quoting

    print("\n STEP 8: Equivalent curl Command")
    print("=" * 40)

    # Convert cookies dict to curl format
    cookies_str = "; ".join(f"{k}={v}" for k, v in cookies.items())

    # Same payload shape as the real request; serialised by json.dumps so
    # that embedded quotes and backslashes are escaped correctly.
    payload = {
        "website_url": config["website_url"],
        "user_prompt": config["user_prompt"],
        "output_schema": {},
        "cookies": cookies,
    }

    # NOTE(security): the API key is printed in clear text here because the
    # command is meant to be runnable as shown. Avoid sharing this output.
    header_lines = (
        f"SGAI-APIKEY: {api_key}",
        "Content-Type: application/json",
        "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
        "Accept: application/json",
        "Accept-Language: en-US,en;q=0.9",
        "Accept-Encoding: gzip, deflate, br",
        "Connection: keep-alive",
    )

    parts = ["curl --location " + shlex.quote("http://localhost:8001/v1/smartscraper")]
    parts.extend("--header " + shlex.quote(line) for line in header_lines)
    parts.append("--cookie " + shlex.quote(cookies_str))
    parts.append("--data " + shlex.quote(json.dumps(payload, indent=2)))

    # One argument per line, joined with shell line continuations.
    curl_command = "\n" + " \\\n".join(parts) + "\n"

    print("Equivalent curl command:")
    print(curl_command)
275+
276+
def step_9_cookie_management_tips():
    """Step 9: Print a numbered list of cookie-handling recommendations.

    Returns:
        None. The tips are written to stdout.
    """
    print("\n STEP 9: Cookie Management Tips")
    print("=" * 40)

    print("🍪 Best Practices for Cookie Management:")

    # Each entry is printed verbatim, one per line, in this order.
    tips = (
        "1. 🔐 Store sensitive cookies securely (environment variables)",
        "2. ⏰ Set appropriate expiration times",
        "3. 🧹 Clean up expired cookies regularly",
        "4. 🔄 Refresh tokens before they expire",
        "5. 🛡️ Use HTTPS for cookie transmission",
        "6. 📝 Log cookie usage for debugging",
        "7. 🚫 Don't hardcode cookies in source code",
        "8. 🔍 Validate cookie format before sending",
    )
    for tip in tips:
        print(tip)
291+
292+
async def main():
    """Run the nine steps of the cookies walkthrough in order.

    Stops early (returning ``None``) when no API key is available or when
    the server health check fails. Otherwise performs the request, shows the
    results, the curl equivalent and the tips, then prints a closing summary
    with the total elapsed time.
    """
    run_started = time.time()
    logger.info("Starting Async Step-by-Step Cookies Example")

    wide_rule = "=" * 60
    for line in (
        "ScrapeGraph SDK - Async Step-by-Step Cookies Example",
        wide_rule,
        "This example shows the complete async process of setting up and",
        "executing requests with cookies for authentication",
        wide_rule,
    ):
        print(line)

    # Step 1: without an API key there is nothing to do.
    api_key = await step_1_environment_setup()
    if not api_key:
        return

    # Step 2: bail out when the server cannot be reached.
    if not await step_2_server_connectivity_check(api_key):
        return

    # Steps 3-5: assemble the request inputs.
    cookies = step_3_define_cookies()
    config = step_4_define_request_parameters()
    headers = step_5_prepare_headers(api_key)

    # Steps 6-7: send the request and display whatever came back.
    result, duration = await step_6_execute_cookies_request(headers, cookies, config)
    step_7_process_results(result, duration)

    # Steps 8-9: reference material for the reader.
    step_8_show_curl_equivalent(api_key, cookies, config)
    step_9_cookie_management_tips()

    total_duration = time.time() - run_started
    logger.info(f"Example completed! Total execution time: {total_duration:.2f} seconds")

    print("\n " + "=" * 60)
    print("Async step-by-step cookies example completed!")
    print(f"⏱️ Total execution time: {total_duration:.2f} seconds")

    print("\n Key takeaways:")
    for takeaway in (
        "1. Async/await provides better performance for I/O operations",
        "2. Cookies enable access to authenticated content",
        "3. Always validate API key and server connectivity first",
        "4. Secure cookie storage is crucial for production use",
        "5. Handle authentication errors gracefully",
        "6. Use equivalent curl commands for testing",
    ):
        print(takeaway)

    print("\n Next steps:")
    for next_step in (
        "- Implement secure cookie storage",
        "- Add cookie refresh logic",
        "- Handle authentication failures",
        "- Monitor cookie expiration",
        "- Implement retry logic for failed requests",
    ):
        print(next_step)
354+
355+
# Script entry point: run the async walkthrough when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
0 commit comments