Skip to content

Commit a02cb1e

Browse files
committed
feat: add examples and md mode
1 parent 60e921a commit a02cb1e

12 files changed

+1843
-24
lines changed

scrapegraph-js/AGENTIC_SCRAPER.md

Lines changed: 198 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ The Agentic Scraper enables AI-powered browser automation for complex interactio
44

55
## 🚀 Quick Start
66

7+
### Basic Usage (No AI Extraction)
78
```javascript
89
import { agenticScraper, getAgenticScraperRequest } from 'scrapegraph-js';
910

@@ -15,26 +16,76 @@ const steps = [
1516
'click on login'
1617
];
1718

18-
// Submit automation request
19+
// Submit automation request (basic scraping)
1920
const response = await agenticScraper(apiKey, url, steps, true);
2021
console.log('Request ID:', response.request_id);
2122

2223
// Check results
2324
const result = await getAgenticScraperRequest(apiKey, response.request_id);
2425
console.log('Status:', result.status);
26+
console.log('Markdown Content:', result.markdown);
27+
```
28+
29+
### AI Extraction Usage
30+
```javascript
31+
import { agenticScraper, getAgenticScraperRequest } from 'scrapegraph-js';
32+
33+
const apiKey = 'your-api-key';
34+
const url = 'https://dashboard.scrapegraphai.com/';
35+
const steps = [
36+
'Type email@gmail.com in email input box',
37+
'Type test-password@123 in password inputbox',
38+
'click on login',
39+
'wait for dashboard to load'
40+
];
41+
42+
// Define extraction schema
43+
const outputSchema = {
44+
user_info: {
45+
type: "object",
46+
properties: {
47+
username: { type: "string" },
48+
email: { type: "string" },
49+
dashboard_sections: { type: "array", items: { type: "string" } }
50+
}
51+
}
52+
};
53+
54+
// Submit automation request with AI extraction
55+
const response = await agenticScraper(
56+
apiKey,
57+
url,
58+
steps,
59+
true, // useSession
60+
"Extract user information and available dashboard sections", // userPrompt
61+
outputSchema, // outputSchema
62+
true // aiExtraction
63+
);
64+
65+
console.log('Request ID:', response.request_id);
66+
67+
// Check results
68+
const result = await getAgenticScraperRequest(apiKey, response.request_id);
69+
if (result.status === 'completed') {
70+
console.log('Extracted Data:', result.result);
71+
console.log('Raw Markdown:', result.markdown);
72+
}
2573
```
2674

2775
## 📚 API Reference
2876

29-
### `agenticScraper(apiKey, url, steps, useSession)`
77+
### `agenticScraper(apiKey, url, steps, useSession, userPrompt, outputSchema, aiExtraction)`
3078

31-
Performs automated browser actions on a webpage.
79+
Performs automated browser actions on a webpage with optional AI extraction.
3280

3381
**Parameters:**
3482
- `apiKey` (string): Your ScrapeGraph AI API key
3583
- `url` (string): The URL of the webpage to interact with
3684
- `steps` (string[]): Array of automation steps to perform
3785
- `useSession` (boolean, optional): Whether to use session management (default: true)
86+
- `userPrompt` (string, optional): Prompt for AI extraction (required when aiExtraction=true)
87+
- `outputSchema` (object, optional): Schema for structured data extraction (used with aiExtraction=true)
88+
- `aiExtraction` (boolean, optional): Whether to use AI for data extraction (default: false)
3889

3990
**Returns:** `Promise<Object>` with `request_id` and initial `status`
4091

@@ -67,6 +118,150 @@ Retrieves the status or result of an agentic scraper request.
67118

68119
## 🎯 Use Cases
69120

121+
### 1. **Basic Automation (No AI)**
122+
Perfect for simple automation tasks where you just need the raw HTML/markdown content:
123+
- **Login automation**: Automate login flows and capture the resulting page
124+
- **Form submission**: Fill out forms and get confirmation pages
125+
- **Navigation**: Navigate through multi-step workflows
126+
- **Content scraping**: Get page content after performing actions
127+
128+
### 2. **AI-Powered Data Extraction**
129+
Ideal when you need structured data from the automated interactions:
130+
- **Dashboard data extraction**: Login and extract user information, metrics, settings
131+
- **E-commerce scraping**: Search products and extract structured product data
132+
- **Form result parsing**: Submit forms and extract confirmation details, reference numbers
133+
- **Content analysis**: Navigate to content and extract key information in structured format
134+
135+
### 3. **Hybrid Approach**
136+
Use both modes depending on your needs:
137+
- **Development/Testing**: Start with basic mode to test automation steps
138+
- **Production**: Add AI extraction for structured data processing
139+
- **Fallback**: Use basic mode when AI extraction isn't needed
140+
141+
## 💡 AI Extraction Examples
142+
143+
### E-commerce Product Search
144+
```javascript
145+
const steps = [
146+
'click on search box',
147+
'type "wireless headphones" in search',
148+
'press enter',
149+
'wait for results to load',
150+
'scroll down 2 times'
151+
];
152+
153+
const schema = {
154+
products: {
155+
type: "array",
156+
items: {
157+
type: "object",
158+
properties: {
159+
name: { type: "string" },
160+
price: { type: "string" },
161+
rating: { type: "number" },
162+
availability: { type: "string" }
163+
}
164+
}
165+
}
166+
};
167+
168+
const response = await agenticScraper(
169+
apiKey,
170+
'https://example-store.com',
171+
steps,
172+
true,
173+
'Extract product names, prices, ratings, and availability from search results',
174+
schema,
175+
true
176+
);
177+
```
178+
179+
### Contact Form with Confirmation
180+
```javascript
181+
const steps = [
182+
'type "John Doe" in name field',
183+
'type "john@example.com" in email field',
184+
'type "Product inquiry" in subject field',
185+
'type "I need more information about pricing" in message field',
186+
'click submit button',
187+
'wait for confirmation'
188+
];
189+
190+
const schema = {
191+
submission: {
192+
type: "object",
193+
properties: {
194+
status: { type: "string" },
195+
message: { type: "string" },
196+
reference_number: { type: "string" },
197+
response_time: { type: "string" }
198+
}
199+
}
200+
};
201+
202+
const response = await agenticScraper(
203+
apiKey,
204+
'https://company.com/contact',
205+
steps,
206+
true,
207+
'Extract form submission status, confirmation message, and any reference numbers',
208+
schema,
209+
true
210+
);
211+
```
212+
213+
### Social Media Data Extraction
214+
```javascript
215+
const steps = [
216+
'type "username" in username field',
217+
'type "password" in password field',
218+
'click login button',
219+
'wait for dashboard',
220+
'click on profile section'
221+
];
222+
223+
const schema = {
224+
profile: {
225+
type: "object",
226+
properties: {
227+
username: { type: "string" },
228+
followers: { type: "number" },
229+
following: { type: "number" },
230+
posts: { type: "number" },
231+
recent_activity: { type: "array", items: { type: "string" } }
232+
}
233+
}
234+
};
235+
236+
const response = await agenticScraper(
237+
apiKey,
238+
'https://social-platform.com/login',
239+
steps,
240+
true,
241+
'Extract profile information including username, follower counts, and recent activity',
242+
schema,
243+
true
244+
);
245+
```
246+
247+
## 🔧 Best Practices
248+
249+
### When to Use AI Extraction
250+
- **Use AI extraction when**: You need structured data, specific information extraction, or data validation
251+
- **Skip AI extraction when**: You just need raw content, are testing automation steps, or are processing content externally
252+
253+
### Schema Design Tips
254+
- **Be specific**: Define exact data types and required fields
255+
- **Use descriptions**: Add description fields to guide AI extraction
256+
- **Nested objects**: Use nested schemas for complex data structures
257+
- **Arrays**: Use arrays for lists of similar items (products, comments, etc.)
258+
259+
### Step Optimization
260+
- **Wait steps**: Add wait steps after actions that trigger loading
261+
- **Specific selectors**: Use specific element descriptions ("click on blue submit button")
262+
- **Sequential actions**: Break complex actions into smaller, specific steps
263+
- **Error handling**: Include steps to handle common UI variations
264+
70265
### 🔐 Login Automation
71266
```javascript
72267
const loginSteps = [

scrapegraph-js/examples/agenticScraper_advanced_example.js

Lines changed: 69 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ async function advancedAgenticScrapingExample() {
1313
// Example configurations for different scenarios
1414
const scenarios = [
1515
{
16-
name: 'Social Media Login',
16+
name: 'Social Media Login (No AI)',
1717
url: 'https://twitter.com/login',
1818
steps: [
1919
'click on email input field',
@@ -23,10 +23,11 @@ async function advancedAgenticScrapingExample() {
2323
'click login button',
2424
'wait for 3 seconds'
2525
],
26-
useSession: true
26+
useSession: true,
27+
aiExtraction: false
2728
},
2829
{
29-
name: 'Form Submission',
30+
name: 'Form Submission with AI Extraction',
3031
url: 'https://example.com/contact',
3132
steps: [
3233
'click on name input',
@@ -35,12 +36,26 @@ async function advancedAgenticScrapingExample() {
3536
'type "john@example.com" in email field',
3637
'click on message textarea',
3738
'type "Hello, this is a test message" in message field',
38-
'click submit button'
39+
'click submit button',
40+
'wait for confirmation message'
3941
],
40-
useSession: false
42+
useSession: false,
43+
aiExtraction: true,
44+
userPrompt: 'Extract the form submission result, confirmation message, and any reference numbers provided',
45+
outputSchema: {
46+
submission: {
47+
type: "object",
48+
properties: {
49+
status: { type: "string" },
50+
message: { type: "string" },
51+
reference_id: { type: "string" }
52+
},
53+
required: ["status", "message"]
54+
}
55+
}
4156
},
4257
{
43-
name: 'E-commerce Search',
58+
name: 'E-commerce Search with Product Extraction',
4459
url: 'https://example-store.com',
4560
steps: [
4661
'wait for page to load',
@@ -50,9 +65,33 @@ async function advancedAgenticScrapingExample() {
5065
'wait for 2 seconds',
5166
'click on filter button',
5267
'select price range $50-$100',
53-
'click apply filters'
68+
'click apply filters',
69+
'scroll down to see more products'
5470
],
55-
useSession: true
71+
useSession: true,
72+
aiExtraction: true,
73+
userPrompt: 'Extract product information including names, prices, ratings, and availability from the search results',
74+
outputSchema: {
75+
search_results: {
76+
type: "object",
77+
properties: {
78+
products: {
79+
type: "array",
80+
items: {
81+
type: "object",
82+
properties: {
83+
name: { type: "string" },
84+
price: { type: "string" },
85+
rating: { type: "number" },
86+
availability: { type: "string" }
87+
}
88+
}
89+
},
90+
total_results: { type: "number" },
91+
current_page: { type: "number" }
92+
}
93+
}
94+
}
5695
}
5796
];
5897

@@ -64,6 +103,11 @@ async function advancedAgenticScrapingExample() {
64103
console.log(`URL: ${scenario.url}`);
65104
console.log(`Steps: ${scenario.steps.length} automation actions`);
66105
console.log(`Use Session: ${scenario.useSession}`);
106+
console.log(`AI Extraction: ${scenario.aiExtraction}`);
107+
if (scenario.aiExtraction) {
108+
console.log(`User Prompt: ${scenario.userPrompt}`);
109+
console.log(`Output Schema: ${scenario.outputSchema ? 'Provided' : 'None'}`);
110+
}
67111

68112
// Validate inputs before making the request
69113
validateInputs(scenario.url, scenario.steps);
@@ -75,7 +119,10 @@ async function advancedAgenticScrapingExample() {
75119
apiKey,
76120
scenario.url,
77121
scenario.steps,
78-
scenario.useSession
122+
scenario.useSession,
123+
scenario.userPrompt || null,
124+
scenario.outputSchema || null,
125+
scenario.aiExtraction || false
79126
);
80127

81128
console.log('✅ Request submitted successfully!');
@@ -86,7 +133,19 @@ async function advancedAgenticScrapingExample() {
86133
const result = await monitorRequest(response.request_id, 120); // 2 minute timeout
87134

88135
console.log('\n🎉 Automation completed!');
89-
console.log('Final Result:', JSON.stringify(result.result, null, 2));
136+
137+
if (scenario.aiExtraction && result.result) {
138+
console.log('🎯 Extracted Structured Data:');
139+
console.log(JSON.stringify(result.result, null, 2));
140+
} else if (result.markdown) {
141+
console.log('📄 Raw Content (markdown):');
142+
const preview = result.markdown.length > 500
143+
? result.markdown.substring(0, 500) + '...'
144+
: result.markdown;
145+
console.log(preview);
146+
} else {
147+
console.log('Final Result:', JSON.stringify(result.result, null, 2));
148+
}
90149

91150
return result;
92151

0 commit comments

Comments
 (0)