1+ /**
2+ * Comprehensive example demonstrating cookies integration for web scraping.
3+ *
4+ * This example shows various real-world scenarios where cookies are essential:
5+ * 1. E-commerce site scraping with authentication
6+ * 2. Social media scraping with session cookies
7+ * 3. Banking/financial site scraping with secure cookies
8+ * 4. News site scraping with user preferences
9+ * 5. API endpoint scraping with authentication tokens
10+ *
11+ * Requirements:
12+ * - Node.js 16+
13+ * - scrapegraph-js
14+ * - A .env file with your SGAI_APIKEY
15+ *
16+ * Example .env file:
17+ * SGAI_APIKEY=your_api_key_here
18+ */
19+
20+ import { smartScraper } from 'scrapegraph-js' ;
21+ import { z } from 'zod' ;
22+ import 'dotenv/config' ;
23+
// Zod schemas describing the structured output requested in each scenario.

// Field builders: a described string field, and its optional counterpart.
const describedString = (description) => z.string().describe(description);
const optionalDescribedString = (description) =>
  z.string().optional().describe(description);

/** Fields requested for a product in the e-commerce example. */
const ProductInfoSchema = z.object({
  name: describedString('Product name'),
  price: describedString('Product price'),
  availability: describedString('Product availability status'),
  rating: optionalDescribedString('Product rating'),
});

/** Fields requested for a post in the social media example. */
const SocialMediaPostSchema = z.object({
  author: describedString('Post author'),
  content: describedString('Post content'),
  likes: optionalDescribedString('Number of likes'),
  comments: optionalDescribedString('Number of comments'),
  timestamp: optionalDescribedString('Post timestamp'),
});

/** Fields requested for an article in the news example. */
const NewsArticleSchema = z.object({
  title: describedString('Article title'),
  summary: describedString('Article summary'),
  author: optionalDescribedString('Article author'),
  publish_date: optionalDescribedString('Publish date'),
});

/** Fields requested for a transaction in the banking example. */
const BankTransactionSchema = z.object({
  date: describedString('Transaction date'),
  description: describedString('Transaction description'),
  amount: describedString('Transaction amount'),
  type: describedString('Transaction type (credit/debit)'),
});
53+
/**
 * Runs the e-commerce example: requests product information from a
 * placeholder shop URL while sending example session/authentication cookies.
 *
 * Prints a banner, calls `smartScraper` with `ProductInfoSchema`, 5 scrolls
 * and no page limit, then logs the response as pretty-printed JSON.
 * Failures are logged to stderr rather than rethrown.
 *
 * @returns {Promise<void>} Settles once the result or the error has been logged.
 */
async function scrapeEcommerceWithAuth() {
  const divider = '='.repeat(60);
  console.log(divider);
  console.log('E-COMMERCE SITE SCRAPING WITH AUTHENTICATION');
  console.log(divider);

  // Example cookies for an e-commerce site (placeholder values).
  const cookies = {
    session_id: 'abc123def456',
    user_id: 'user789',
    cart_id: 'cart101112',
    preferences: 'dark_mode,usd',
    auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...',
  };

  const websiteUrl = 'https://example-ecommerce.com/products';
  const userPrompt = 'Extract product information including name, price, availability, and rating';

  try {
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      websiteUrl,
      userPrompt,
      ProductInfoSchema,
      5, // numberOfScrolls - scroll to load more products
      null, // totalPages
      cookies,
    );

    console.log('✅ E-commerce scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    // The rejection value is not guaranteed to be an Error: fall back to its
    // string form instead of printing "undefined" (or throwing on null).
    const reason = error?.message ?? String(error);
    console.error(`❌ Error in e-commerce scraping: ${reason}`);
  }
}
89+
/**
 * Runs the social media example: requests feed posts from a placeholder URL
 * while sending example session cookies.
 *
 * Prints a banner, calls `smartScraper` with `SocialMediaPostSchema`,
 * 10 scrolls and no page limit, then logs the response as pretty-printed
 * JSON. Failures are logged to stderr rather than rethrown.
 *
 * @returns {Promise<void>} Settles once the result or the error has been logged.
 */
async function scrapeSocialMediaWithSession() {
  const divider = '='.repeat(60);
  console.log(`\n${divider}`);
  console.log('SOCIAL MEDIA SCRAPING WITH SESSION COOKIES');
  console.log(divider);

  // Example cookies for a social media site (placeholder values).
  const cookies = {
    session_token: 'xyz789abc123',
    user_session: 'def456ghi789',
    csrf_token: 'jkl012mno345',
    remember_me: 'true',
    language: 'en_US',
  };

  const websiteUrl = 'https://example-social.com/feed';
  const userPrompt = 'Extract posts from the feed including author, content, likes, and comments';

  try {
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      websiteUrl,
      userPrompt,
      SocialMediaPostSchema,
      10, // numberOfScrolls - scroll to load more posts
      null, // totalPages
      cookies,
    );

    console.log('✅ Social media scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    // The rejection value is not guaranteed to be an Error: fall back to its
    // string form instead of printing "undefined" (or throwing on null).
    const reason = error?.message ?? String(error);
    console.error(`❌ Error in social media scraping: ${reason}`);
  }
}
125+
/**
 * Runs the news example: requests articles from a placeholder URL while
 * sending example user-preference cookies.
 *
 * Prints a banner, calls `smartScraper` with `NewsArticleSchema`, no scroll
 * count and 3 total pages, then logs the response as pretty-printed JSON.
 * Failures are logged to stderr rather than rethrown.
 *
 * @returns {Promise<void>} Settles once the result or the error has been logged.
 */
async function scrapeNewsWithPreferences() {
  const divider = '='.repeat(60);
  console.log(`\n${divider}`);
  console.log('NEWS SITE SCRAPING WITH USER PREFERENCES');
  console.log(divider);

  // Example cookies for a news site (placeholder values).
  const cookies = {
    user_preferences: 'technology,science,ai',
    reading_level: 'advanced',
    region: 'US',
    subscription_tier: 'premium',
    theme: 'dark',
  };

  const websiteUrl = 'https://example-news.com/technology';
  const userPrompt = 'Extract news articles including title, summary, author, and publish date';

  try {
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      websiteUrl,
      userPrompt,
      NewsArticleSchema,
      null, // numberOfScrolls
      3, // totalPages - scrape multiple pages
      cookies,
    );

    console.log('✅ News scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    // The rejection value is not guaranteed to be an Error: fall back to its
    // string form instead of printing "undefined" (or throwing on null).
    const reason = error?.message ?? String(error);
    console.error(`❌ Error in news scraping: ${reason}`);
  }
}
161+
/**
 * Runs the banking example: requests recent transactions from a placeholder
 * URL while sending example secure-session cookies.
 *
 * Prints a banner, calls `smartScraper` with `BankTransactionSchema`, no
 * scroll count and 5 total pages, then logs the response as pretty-printed
 * JSON. Failures are logged to stderr rather than rethrown.
 *
 * @returns {Promise<void>} Settles once the result or the error has been logged.
 */
async function scrapeBankingWithSecureCookies() {
  const divider = '='.repeat(60);
  console.log(`\n${divider}`);
  console.log('BANKING SITE SCRAPING WITH SECURE COOKIES');
  console.log(divider);

  // Example secure cookies for a banking site (placeholder values).
  const cookies = {
    secure_session: 'pqr678stu901',
    auth_token: 'vwx234yz567',
    mfa_verified: 'true',
    device_id: 'device_abc123',
    last_activity: '2024-01-15T10:30:00Z',
  };

  const websiteUrl = 'https://example-bank.com/transactions';
  const userPrompt = 'Extract recent transactions including date, description, amount, and type';

  try {
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      websiteUrl,
      userPrompt,
      BankTransactionSchema,
      null, // numberOfScrolls
      5, // totalPages - scrape multiple pages of transactions
      cookies,
    );

    console.log('✅ Banking scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    // The rejection value is not guaranteed to be an Error: fall back to its
    // string form instead of printing "undefined" (or throwing on null).
    const reason = error?.message ?? String(error);
    console.error(`❌ Error in banking scraping: ${reason}`);
  }
}
197+
/**
 * Runs the API example: requests data from a placeholder API URL while
 * sending example authentication-token cookies.
 *
 * Prints a banner, calls `smartScraper` without a schema, scroll count or
 * page limit (all `null`), then logs the response as pretty-printed JSON.
 * Failures are logged to stderr rather than rethrown.
 *
 * @returns {Promise<void>} Settles once the result or the error has been logged.
 */
async function scrapeApiWithAuthTokens() {
  const divider = '='.repeat(60);
  console.log(`\n${divider}`);
  console.log('API ENDPOINT SCRAPING WITH AUTH TOKENS');
  console.log(divider);

  // Example API authentication cookies (placeholder values).
  const cookies = {
    api_token: 'api_abc123def456',
    client_id: 'client_789',
    access_token: 'access_xyz789',
    refresh_token: 'refresh_abc123',
    scope: 'read:all',
  };

  const websiteUrl = 'https://api.example.com/data';
  const userPrompt = 'Extract data from the API response';

  try {
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      websiteUrl,
      userPrompt,
      null, // no schema for a generic API response
      null, // numberOfScrolls
      null, // totalPages
      cookies,
    );

    console.log('✅ API scraping completed successfully');
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    // The rejection value is not guaranteed to be an Error: fall back to its
    // string form instead of printing "undefined" (or throwing on null).
    const reason = error?.message ?? String(error);
    console.error(`❌ Error in API scraping: ${reason}`);
  }
}
233+
/**
 * Entry point for the examples: checks that `SGAI_APIKEY` is set, then runs
 * each cookie scenario one after another and prints a closing banner.
 *
 * When the key is missing, setup instructions are printed, the process exit
 * code is set to 1, and no example is run.
 *
 * @returns {Promise<void>} Settles after all examples ran or the key check failed.
 */
async function main() {
  const apiKey = process.env.SGAI_APIKEY;

  // Check if the API key is available before making any request.
  if (!apiKey) {
    console.error('Error: SGAI_APIKEY not found in .env file');
    console.log('Please create a .env file with your API key:');
    console.log('SGAI_APIKEY=your_api_key_here');
    // Report the misconfiguration through the exit status so a calling shell
    // or CI job does not treat the run as successful.
    process.exitCode = 1;
    return;
  }

  console.log('🍪 COOKIES INTEGRATION EXAMPLES');
  console.log('This demonstrates various real-world scenarios where cookies are essential for web scraping.');

  // Run all examples sequentially so their output does not interleave.
  await scrapeEcommerceWithAuth();
  await scrapeSocialMediaWithSession();
  await scrapeNewsWithPreferences();
  await scrapeBankingWithSecureCookies();
  await scrapeApiWithAuthTokens();

  const divider = '='.repeat(60);
  console.log(`\n${divider}`);
  console.log('✅ All examples completed!');
  console.log(divider);
}
259+
// Run the example. An unexpected rejection is logged and reflected in the
// exit status so callers can tell that the run failed.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
0 commit comments