import { crawl, getCrawlRequest } from '../index.js';
import 'dotenv/config';

// Example .env file:
// SGAI_APIKEY=your_sgai_api_key

const apiKey = process.env.SGAI_APIKEY;
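// JSON Schema (draft-07) describing the structured output we want. It is
// passed to crawl() below so the extraction result follows this shape.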
const schema = {
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "ScrapeGraphAI Website Content",
  "type": "object",
  "properties": {
    "company": {
      "type": "object",
      "properties": {
        "name": { "type": "string" },
        "description": { "type": "string" },
        "features": { "type": "array", "items": { "type": "string" } },
        "contact_email": { "type": "string", "format": "email" },
        "social_links": {
          "type": "object",
          "properties": {
            "github": { "type": "string", "format": "uri" },
            "linkedin": { "type": "string", "format": "uri" },
            "twitter": { "type": "string", "format": "uri" }
          },
          "additionalProperties": false
        }
      },
      "required": ["name", "description"]
    },
    "services": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "service_name": { "type": "string" },
          "description": { "type": "string" },
          "features": { "type": "array", "items": { "type": "string" } }
        },
        "required": ["service_name", "description"]
      }
    },
    "legal": {
      "type": "object",
      "properties": {
        "privacy_policy": { "type": "string" },
        "terms_of_service": { "type": "string" }
      },
      "required": ["privacy_policy", "terms_of_service"]
    }
  },
  "required": ["company", "services", "legal"]
};

const url = 'https://scrapegraphai.com/';
const prompt = 'What does the company do? Also extract the text content of their privacy policy and terms of service.';
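// The two halves of the prompt roughly map onto the schema: "what does the
// company do?" feeds `company`/`services`, and the policy text lands in `legal`.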

(async () => {
  if (!apiKey) {
    console.error('SGAI_APIKEY not found in environment. Please set it in your .env file.');
    process.exit(1);
  }

  try {
    // Start the crawl job
    console.log(`\nStarting crawl for: ${url}`);
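    // Crawl options (interpreting them by name): depth bounds how many link
    // levels are followed, maxPages caps the total pages fetched,
    // sameDomainOnly keeps the crawl on the start domain, batchSize sets how
    // many pages are processed at a time, and cacheWebsite reuses cached
    // page content when available.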
    const crawlResponse = await crawl(apiKey, url, prompt, schema, {
      cacheWebsite: true,
      depth: 2,
      maxPages: 2,
      sameDomainOnly: true,
      batchSize: 1,
    });
    console.log('\nCrawl job started. Response:');
    console.log(JSON.stringify(crawlResponse, null, 2));

    // If the crawl is asynchronous and returns an ID, fetch the result
    const crawlId = crawlResponse.id || crawlResponse.task_id;
    if (crawlId) {
      console.log('\nPolling for crawl result...');
      for (let i = 0; i < 10; i++) {
        await new Promise((resolve) => setTimeout(resolve, 5000));
        const result = await getCrawlRequest(apiKey, crawlId);
        if (result.status === 'success' && result.result) {
          console.log('\nCrawl completed. Result:');
          console.log(JSON.stringify(result.result.llm_result, null, 2));
          break;
        } else if (result.status === 'failed') {
          console.log('\nCrawl failed. Result:');
          console.log(JSON.stringify(result, null, 2));
          break;
        } else {
          console.log(`Status: ${result.status}, waiting...`);
        }
      }
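      // If the job is still pending after 10 polls (~50 s total), the loop
      // exits without printing a result; a larger crawl may need more
      // attempts or a longer delay between them.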
    } else {
      console.log('No crawl ID found in response. Synchronous result:');
      console.log(JSON.stringify(crawlResponse, null, 2));
    }
  } catch (error) {
    console.error('Error occurred:', error);
  }
})();
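// To run: set SGAI_APIKEY in your .env file, then execute this script with
// Node (e.g. `node crawl_example.js`; the actual filename may differ).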