diff --git a/scrapegraph-js/AGENTIC_SCRAPER.md b/scrapegraph-js/AGENTIC_SCRAPER.md new file mode 100644 index 0000000..d108cfc --- /dev/null +++ b/scrapegraph-js/AGENTIC_SCRAPER.md @@ -0,0 +1,243 @@ +# πŸ€– Agentic Scraper + +The Agentic Scraper enables AI-powered browser automation for complex interactions like form filling, clicking buttons, and navigating multi-step workflows. + +## πŸš€ Quick Start + +```javascript +import { agenticScraper, getAgenticScraperRequest } from 'scrapegraph-js'; + +const apiKey = 'your-api-key'; +const url = 'https://dashboard.scrapegraphai.com/'; +const steps = [ + 'Type email@gmail.com in email input box', + 'Type test-password@123 in password inputbox', + 'click on login' +]; + +// Submit automation request +const response = await agenticScraper(apiKey, url, steps, true); +console.log('Request ID:', response.request_id); + +// Check results +const result = await getAgenticScraperRequest(apiKey, response.request_id); +console.log('Status:', result.status); +``` + +## πŸ“š API Reference + +### `agenticScraper(apiKey, url, steps, useSession)` + +Performs automated browser actions on a webpage. + +**Parameters:** +- `apiKey` (string): Your ScrapeGraph AI API key +- `url` (string): The URL of the webpage to interact with +- `steps` (string[]): Array of automation steps to perform +- `useSession` (boolean, optional): Whether to use session management (default: true) + +**Returns:** Promise with `request_id` and initial `status` + +**Example Steps:** +```javascript +const steps = [ + 'click on search bar', + 'type "laptop" in search input', + 'press Enter key', + 'wait for 2 seconds', + 'click on first result', + 'scroll down to reviews' +]; +``` + +### `getAgenticScraperRequest(apiKey, requestId)` + +Retrieves the status or result of an agentic scraper request. 
+ +**Parameters:** +- `apiKey` (string): Your ScrapeGraph AI API key +- `requestId` (string): The request ID from a previous agentic scraper call + +**Returns:** Promise with: +- `status`: 'pending', 'completed', or 'failed' +- `result`: Automation results (when completed) +- `error`: Error message (when failed) +- `created_at`: Request creation timestamp +- `completed_at`: Completion timestamp (when completed) + +## 🎯 Use Cases + +### πŸ” Login Automation +```javascript +const loginSteps = [ + 'click on email input', + 'type "user@example.com" in email field', + 'click on password input', + 'type "password123" in password field', + 'click login button', + 'wait for dashboard to load' +]; + +const response = await agenticScraper(apiKey, 'https://app.example.com/login', loginSteps, true); +``` + +### πŸ›’ E-commerce Interaction +```javascript +const shoppingSteps = [ + 'click on search bar', + 'type "wireless headphones" in search', + 'press Enter', + 'wait for results to load', + 'click on first product', + 'click add to cart button', + 'click view cart' +]; + +const response = await agenticScraper(apiKey, 'https://shop.example.com', shoppingSteps, true); +``` + +### πŸ“ Form Submission +```javascript +const formSteps = [ + 'click on name input', + 'type "John Doe" in name field', + 'click on email input', + 'type "john@example.com" in email field', + 'click on message textarea', + 'type "Hello, this is a test message" in message area', + 'click submit button' +]; + +const response = await agenticScraper(apiKey, 'https://example.com/contact', formSteps, false); +``` + +## ⚑ Advanced Usage + +### Polling for Results +```javascript +async function waitForCompletion(requestId, timeoutSeconds = 120) { + const startTime = Date.now(); + const timeout = timeoutSeconds * 1000; + + while (Date.now() - startTime < timeout) { + const status = await getAgenticScraperRequest(apiKey, requestId); + + if (status.status === 'completed') { + return status.result; + } else if 
(status.status === 'failed') { + throw new Error(status.error); + } + + await new Promise(resolve => setTimeout(resolve, 5000)); // Wait 5 seconds + } + + throw new Error('Timeout waiting for completion'); +} +``` + +### Error Handling +```javascript +try { + const response = await agenticScraper(apiKey, url, steps, true); + const result = await waitForCompletion(response.request_id); + console.log('Automation successful:', result); +} catch (error) { + if (error.message.includes('validation')) { + console.log('Input validation failed:', error.message); + } else if (error.message.includes('timeout')) { + console.log('Automation timed out'); + } else { + console.log('Automation failed:', error.message); + } +} +``` + +## πŸ“ Step Syntax + +Steps should be written in natural language describing the action to perform: + +### Clicking Elements +- `"click on login button"` +- `"click on search icon"` +- `"click on first result"` + +### Typing Text +- `"type 'username' in email field"` +- `"type 'password123' in password input"` +- `"type 'search query' in search box"` + +### Keyboard Actions +- `"press Enter key"` +- `"press Tab key"` +- `"press Escape key"` + +### Waiting +- `"wait for 2 seconds"` +- `"wait for page to load"` +- `"wait for results to appear"` + +### Scrolling +- `"scroll down"` +- `"scroll to bottom"` +- `"scroll to top"` + +## πŸ”§ Best Practices + +1. **Use Session Management**: Set `useSession: true` for multi-step workflows +2. **Add Wait Steps**: Include wait times between actions for reliability +3. **Be Specific**: Use descriptive selectors like "login button" vs "button" +4. **Handle Timeouts**: Implement proper timeout handling for long operations +5. 
**Validate Inputs**: Check URLs and steps before making requests + +## 🚨 Common Errors + +### Input Validation Errors +```javascript +// ❌ Invalid URL +await agenticScraper(apiKey, 'not-a-url', steps); + +// ❌ Empty steps +await agenticScraper(apiKey, url, []); + +// ❌ Invalid step +await agenticScraper(apiKey, url, ['click button', '']); // Empty step +``` + +### Runtime Errors +- **Element not found**: Make steps more specific or add wait times +- **Timeout**: Increase polling timeout or break down complex steps +- **Session expired**: Use session management for multi-step flows + +## 🌐 cURL Equivalent + +```bash +curl --location 'https://api.scrapegraphai.com/v1/agentic-scrapper' \ +--header 'SGAI-APIKEY: your-api-key' \ +--header 'Content-Type: application/json' \ +--data-raw '{ + "url": "https://dashboard.scrapegraphai.com/", + "use_session": true, + "steps": [ + "Type email@gmail.com in email input box", + "Type test-password@123 in password inputbox", + "click on login" + ] +}' +``` + +## πŸ“– Examples + +Check out the example files in the `/examples` directory: + +- `agenticScraper_example.js` - Basic usage +- `getAgenticScraperRequest_example.js` - Status checking +- `agenticScraper_complete_example.js` - Complete workflow +- `agenticScraper_advanced_example.js` - Advanced patterns with error handling + +## πŸ’‘ Tips + +- Start with simple steps and gradually add complexity +- Test individual steps before combining them +- Use browser developer tools to identify element selectors +- Consider mobile vs desktop layouts when writing steps +- Monitor request status regularly for long-running automations diff --git a/scrapegraph-js/examples/agenticScraper_advanced_example.js b/scrapegraph-js/examples/agenticScraper_advanced_example.js new file mode 100644 index 0000000..7e6e562 --- /dev/null +++ b/scrapegraph-js/examples/agenticScraper_advanced_example.js @@ -0,0 +1,263 @@ +import { agenticScraper, getAgenticScraperRequest } from 'scrapegraph-js'; +import 
'dotenv/config'; + +const apiKey = process.env.SGAI_APIKEY; + +/** + * Advanced example with input validation and error handling + */ +async function advancedAgenticScrapingExample() { + console.log('πŸš€ Advanced Agentic Scraping Example'); + console.log('='.repeat(45)); + + // Example configurations for different scenarios + const scenarios = [ + { + name: 'Social Media Login', + url: 'https://twitter.com/login', + steps: [ + 'click on email input field', + 'type "user@example.com" in email field', + 'click on password input field', + 'type "password123" in password field', + 'click login button', + 'wait for 3 seconds' + ], + useSession: true + }, + { + name: 'Form Submission', + url: 'https://example.com/contact', + steps: [ + 'click on name input', + 'type "John Doe" in name field', + 'click on email input', + 'type "john@example.com" in email field', + 'click on message textarea', + 'type "Hello, this is a test message" in message field', + 'click submit button' + ], + useSession: false + }, + { + name: 'E-commerce Search', + url: 'https://example-store.com', + steps: [ + 'wait for page to load', + 'click on search bar', + 'type "wireless headphones" in search', + 'press Enter key', + 'wait for 2 seconds', + 'click on filter button', + 'select price range $50-$100', + 'click apply filters' + ], + useSession: true + } + ]; + + // Run a specific scenario (change index to test different ones) + const scenario = scenarios[0]; // Social Media Login + + try { + console.log(`\nπŸ“‹ Running Scenario: ${scenario.name}`); + console.log(`URL: ${scenario.url}`); + console.log(`Steps: ${scenario.steps.length} automation actions`); + console.log(`Use Session: ${scenario.useSession}`); + + // Validate inputs before making the request + validateInputs(scenario.url, scenario.steps); + + console.log('\nβœ… Input validation passed'); + console.log('πŸš€ Submitting agentic scraper request...'); + + const response = await agenticScraper( + apiKey, + scenario.url, + scenario.steps, + 
scenario.useSession + ); + + console.log('βœ… Request submitted successfully!'); + console.log(`Request ID: ${response.request_id}`); + console.log(`Status: ${response.status}`); + + // Monitor the request with timeout + const result = await monitorRequest(response.request_id, 120); // 2 minute timeout + + console.log('\nπŸŽ‰ Automation completed!'); + console.log('Final Result:', JSON.stringify(result.result, null, 2)); + + return result; + + } catch (error) { + console.error(`\n❌ Error in ${scenario.name}:`, error.message); + + // Provide helpful error context + if (error.message.includes('validation')) { + console.log('\nπŸ’‘ Validation Tips:'); + console.log('- Ensure URL starts with http:// or https://'); + console.log('- Make sure all steps are non-empty strings'); + console.log('- Check that the steps array is not empty'); + } else if (error.message.includes('timeout')) { + console.log('\nπŸ’‘ Timeout Tips:'); + console.log('- Complex automations may take longer'); + console.log('- Consider breaking down into smaller steps'); + console.log('- Check if the target website is responsive'); + } + + throw error; + } +} + +/** + * Input validation function + */ +function validateInputs(url, steps) { + // Validate URL + if (!url || typeof url !== 'string') { + throw new Error('validation: URL must be a non-empty string'); + } + + if (!url.startsWith('http://') && !url.startsWith('https://')) { + throw new Error('validation: URL must start with http:// or https://'); + } + + // Validate steps + if (!Array.isArray(steps) || steps.length === 0) { + throw new Error('validation: Steps must be a non-empty array'); + } + + steps.forEach((step, index) => { + if (!step || typeof step !== 'string' || !step.trim()) { + throw new Error(`validation: Step ${index + 1} must be a non-empty string`); + } + }); + + console.log(`βœ… Validated URL and ${steps.length} steps`); +} + +/** + * Monitor request with timeout and progress updates + */ +async function monitorRequest(requestId, 
timeoutSeconds = 120) { + const startTime = Date.now(); + const timeoutMs = timeoutSeconds * 1000; + let attempts = 0; + + console.log(`\nπŸ”„ Monitoring request ${requestId}`); + console.log(`Timeout: ${timeoutSeconds} seconds`); + + while (Date.now() - startTime < timeoutMs) { + attempts++; + const elapsed = Math.round((Date.now() - startTime) / 1000); + + try { + console.log(`\n⏳ Check ${attempts} (${elapsed}s elapsed)`); + + const status = await getAgenticScraperRequest(apiKey, requestId); + console.log(`Status: ${status.status}`); + + if (status.status === 'completed') { + const totalTime = Math.round((Date.now() - startTime) / 1000); + console.log(`βœ… Completed in ${totalTime} seconds`); + return status; + } else if (status.status === 'failed') { + throw new Error(`Automation failed: ${status.error}`); + } + + // Wait before next check (progressive backoff) + const waitTime = Math.min(5000 + (attempts * 1000), 15000); // 5-15 seconds + console.log(`⏸️ Waiting ${waitTime/1000}s before next check...`); + await sleep(waitTime); + + } catch (error) { + if (error.message.includes('Automation failed')) { + throw error; + } + console.log(`⚠️ Check failed: ${error.message}`); + await sleep(5000); + } + } + + throw new Error(`timeout: Request did not complete within ${timeoutSeconds} seconds`); +} + +/** + * Demonstrate error handling scenarios + */ +async function errorHandlingExamples() { + console.log('\nπŸ›‘οΈ Error Handling Examples'); + console.log('='.repeat(30)); + + const errorScenarios = [ + { + name: 'Invalid URL', + url: 'not-a-valid-url', + steps: ['click button'], + expectedError: 'URL must start with' + }, + { + name: 'Empty Steps', + url: 'https://example.com', + steps: [], + expectedError: 'non-empty array' + }, + { + name: 'Invalid Step', + url: 'https://example.com', + steps: ['valid step', '', 'another valid step'], + expectedError: 'non-empty string' + } + ]; + + for (const scenario of errorScenarios) { + try { + console.log(`\nπŸ§ͺ Testing: 
${scenario.name}`); + await agenticScraper(apiKey, scenario.url, scenario.steps); + console.log('❌ Expected error but request succeeded'); + } catch (error) { + if (error.message.includes(scenario.expectedError)) { + console.log(`βœ… Correctly caught error: ${error.message}`); + } else { + console.log(`⚠️ Unexpected error: ${error.message}`); + } + } + } +} + +/** + * Utility function + */ +function sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); +} + +/** + * Main execution + */ +async function main() { + if (!apiKey) { + console.error('❌ Error: SGAI_APIKEY environment variable not set'); + console.log('\nPlease create a .env file with:'); + console.log('SGAI_APIKEY=your-api-key-here'); + process.exit(1); + } + + try { + // Run the advanced example + await advancedAgenticScrapingExample(); + + // Uncomment to test error handling + // await errorHandlingExamples(); + + console.log('\n✨ Advanced example completed successfully!'); + + } catch (error) { + console.error('\nπŸ’₯ Advanced example failed:', error.message); + process.exit(1); + } +} + +// Run the advanced example +main(); diff --git a/scrapegraph-js/examples/agenticScraper_complete_example.js b/scrapegraph-js/examples/agenticScraper_complete_example.js new file mode 100644 index 0000000..e848207 --- /dev/null +++ b/scrapegraph-js/examples/agenticScraper_complete_example.js @@ -0,0 +1,146 @@ +import { agenticScraper, getAgenticScraperRequest } from 'scrapegraph-js'; +import 'dotenv/config'; + +const apiKey = process.env.SGAI_APIKEY; + +/** + * Complete example showing how to use agentic scraper for automated login + * and then retrieve the results + */ +async function completeAgenticScrapingExample() { + console.log('πŸ€– Starting Complete Agentic Scraping Example'); + console.log('='.repeat(50)); + + // Configuration + const url = 'https://dashboard.scrapegraphai.com/'; + const steps = [ + 'Type email@gmail.com in email input box', + 'Type test-password@123 in password inputbox', + 'click 
on login' + ]; + const useSession = true; + + try { + // Step 1: Submit the agentic scraper request + console.log('\nπŸ“€ Step 1: Submitting agentic scraper request...'); + console.log('URL:', url); + console.log('Use Session:', useSession); + console.log('Steps:', steps.length, 'automation steps'); + + const submitResponse = await agenticScraper(apiKey, url, steps, useSession); + + console.log('βœ… Request submitted successfully!'); + console.log('Request ID:', submitResponse.request_id); + console.log('Initial Status:', submitResponse.status); + + const requestId = submitResponse.request_id; + + // Step 2: Poll for results + console.log('\nπŸ”„ Step 2: Polling for results...'); + let attempts = 0; + const maxAttempts = 12; // 2 minutes max (10 seconds * 12) + + while (attempts < maxAttempts) { + attempts++; + console.log(`\n⏳ Attempt ${attempts}/${maxAttempts}: Checking status...`); + + const statusResponse = await getAgenticScraperRequest(apiKey, requestId); + console.log('Status:', statusResponse.status); + + if (statusResponse.status === 'completed') { + console.log('\nπŸŽ‰ Automation completed successfully!'); + console.log('Completed At:', statusResponse.completed_at); + console.log('Processing Time:', calculateProcessingTime(submitResponse.created_at, statusResponse.completed_at)); + console.log('\nπŸ“‹ Results:'); + console.log(JSON.stringify(statusResponse.result, null, 2)); + + return statusResponse; + } else if (statusResponse.status === 'failed') { + console.log('\n❌ Automation failed'); + console.log('Error:', statusResponse.error); + throw new Error(`Automation failed: ${statusResponse.error}`); + } else { + console.log('Still processing... 
waiting 10 seconds'); + await sleep(10000); // Wait 10 seconds + } + } + + throw new Error('Timeout: Automation took too long to complete'); + + } catch (error) { + console.error('\n❌ Error in complete example:', error.message); + throw error; + } +} + +/** + * Example with different automation steps + */ +async function ecommerceAutomationExample() { + console.log('\nπŸ›’ E-commerce Automation Example'); + console.log('='.repeat(40)); + + const url = 'https://example-shop.com'; + const steps = [ + 'click on search input', + 'type "laptop" in search box', + 'click search button', + 'wait for 2 seconds', + 'click on first product', + 'scroll down to reviews section' + ]; + + try { + const response = await agenticScraper(apiKey, url, steps, true); + console.log('E-commerce automation started:', response.request_id); + return response; + } catch (error) { + console.error('E-commerce automation error:', error.message); + } +} + +/** + * Utility functions + */ +function calculateProcessingTime(startTime, endTime) { + const start = new Date(startTime); + const end = new Date(endTime); + const diffMs = end - start; + const diffSeconds = Math.round(diffMs / 1000); + return `${diffSeconds} seconds`; +} + +function sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); +} + +/** + * Main execution + */ +async function main() { + if (!apiKey) { + console.error('❌ Error: SGAI_APIKEY environment variable not set'); + console.log('Please set your API key in the .env file:'); + console.log('SGAI_APIKEY=your-api-key-here'); + process.exit(1); + } + + try { + console.log('πŸš€ Running Agentic Scraper Examples'); + + // Run the complete login automation example + await completeAgenticScrapingExample(); + + // Uncomment to run the e-commerce example + // await ecommerceAutomationExample(); + + console.log('\nβœ… All examples completed successfully!'); + + } catch (error) { + console.error('\nπŸ’₯ Example failed:', error.message); + process.exit(1); + } +} + +// Run the 
examples +main(); diff --git a/scrapegraph-js/examples/agenticScraper_example.js b/scrapegraph-js/examples/agenticScraper_example.js new file mode 100644 index 0000000..d52aa46 --- /dev/null +++ b/scrapegraph-js/examples/agenticScraper_example.js @@ -0,0 +1,20 @@ +import { agenticScraper } from 'scrapegraph-js'; +import 'dotenv/config'; + +const apiKey = process.env.SGAI_APIKEY; +const url = 'https://dashboard.scrapegraphai.com/'; +const steps = [ + 'Type email@gmail.com in email input box', + 'Type test-password@123 in password inputbox', + 'click on login' +]; + +try { + const response = await agenticScraper(apiKey, url, steps, true); + console.log('πŸ€– Agentic Scraper Request Submitted'); + console.log('Request ID:', response.request_id); + console.log('Status:', response.status); + console.log('Full Response:', JSON.stringify(response, null, 2)); +} catch (error) { + console.error('❌ Error:', error.message); +} diff --git a/scrapegraph-js/examples/getAgenticScraperRequest_example.js b/scrapegraph-js/examples/getAgenticScraperRequest_example.js new file mode 100644 index 0000000..1d54af4 --- /dev/null +++ b/scrapegraph-js/examples/getAgenticScraperRequest_example.js @@ -0,0 +1,31 @@ +import { getAgenticScraperRequest } from 'scrapegraph-js'; +import 'dotenv/config'; + +const apiKey = process.env.SGAI_APIKEY; +// Replace this with an actual request ID from a previous agenticScraper call +const requestId = 'your-request-id-here'; + +try { + const response = await getAgenticScraperRequest(apiKey, requestId); + + console.log('πŸ” Agentic Scraper Request Status'); + console.log('Request ID:', requestId); + console.log('Status:', response.status); + console.log('Created At:', response.created_at); + + if (response.status === 'completed') { + console.log('βœ… Automation Completed!'); + console.log('Completed At:', response.completed_at); + console.log('Result:', JSON.stringify(response.result, null, 2)); + } else if (response.status === 'pending') { + console.log('⏳ 
Automation is still in progress...'); + console.log('Please check again in a few moments.'); + } else if (response.status === 'failed') { + console.log('❌ Automation Failed'); + console.log('Error:', response.error); + } + + console.log('\nFull Response:', JSON.stringify(response, null, 2)); +} catch (error) { + console.error('❌ Error:', error.message); +} diff --git a/scrapegraph-js/index.js b/scrapegraph-js/index.js index 84c4d47..47ee6ce 100644 --- a/scrapegraph-js/index.js +++ b/scrapegraph-js/index.js @@ -1,3 +1,4 @@ +export { agenticScraper, getAgenticScraperRequest } from './src/agenticScraper.js'; export { smartScraper, getSmartScraperRequest } from './src/smartScraper.js'; export { markdownify, getMarkdownifyRequest } from './src/markdownify.js'; export { searchScraper, getSearchScraperRequest } from './src/searchScraper.js'; diff --git a/scrapegraph-js/src/agenticScraper.js b/scrapegraph-js/src/agenticScraper.js new file mode 100644 index 0000000..14e8a18 --- /dev/null +++ b/scrapegraph-js/src/agenticScraper.js @@ -0,0 +1,136 @@ +import axios from 'axios'; +import handleError from './utils/handleError.js'; + +/** + * Perform automated browser actions on a webpage using AI-powered agentic scraping. + * + * @param {string} apiKey - Your ScrapeGraph AI API key + * @param {string} url - The URL of the webpage to interact with + * @param {string[]} steps - Array of steps to perform on the webpage (e.g., ["Type email@gmail.com in email input box", "click on login"]) + * @param {boolean} [useSession=true] - Whether to use session for the scraping operations + * @returns {Promise} Response from the API containing request_id and initial status + * @throws {Error} Will throw an error in case of an HTTP failure or invalid parameters. 
+ * + * @example + * // Example usage for automated login: + * const apiKey = 'your-api-key'; + * const url = 'https://dashboard.scrapegraphai.com/'; + * const steps = [ + * 'Type email@gmail.com in email input box', + * 'Type test-password@123 in password inputbox', + * 'click on login' + * ]; + * + * try { + * const result = await agenticScraper(apiKey, url, steps, true); + * console.log('Request ID:', result.request_id); + * console.log('Status:', result.status); + * } catch (error) { + * console.error('Error:', error.message); + * } + */ +export async function agenticScraper(apiKey, url, steps, useSession = true) { + const endpoint = 'https://api.scrapegraphai.com/v1/agentic-scrapper'; + const headers = { + 'accept': 'application/json', + 'SGAI-APIKEY': apiKey, + 'Content-Type': 'application/json', + }; + + // Validate inputs + if (!apiKey || typeof apiKey !== 'string') { + throw new Error('API key must be a non-empty string'); + } + + if (!url || typeof url !== 'string') { + throw new Error('URL must be a non-empty string'); + } + + if (!url.startsWith('http://') && !url.startsWith('https://')) { + throw new Error('URL must start with http:// or https://'); + } + + if (!Array.isArray(steps) || steps.length === 0) { + throw new Error('Steps must be a non-empty array'); + } + + if (steps.some(step => !step || typeof step !== 'string' || !step.trim())) { + throw new Error('All steps must be non-empty strings'); + } + + if (typeof useSession !== 'boolean') { + throw new Error('useSession must be a boolean value'); + } + + const payload = { + url: url, + use_session: useSession, + steps: steps, + }; + + try { + const response = await axios.post(endpoint, payload, { headers }); + return response.data; + } catch (error) { + handleError(error); + } +} + +/** + * Retrieve the status or result of an agentic scraper request. 
+ * + * @param {string} apiKey - Your ScrapeGraph AI API key + * @param {string} requestId - The request ID associated with the agentic scraper request + * @returns {Promise} A promise that resolves to an object containing: + * - status: The current status of the request ('pending', 'completed', 'failed') + * - result: The extracted data or automation result when status is 'completed' + * - error: Error message if the request failed (when status is 'failed') + * - created_at: Timestamp of when the request was created + * - completed_at: Timestamp of when the request was completed (if applicable) + * @throws {Error} Throws an error if the HTTP request fails or if the API key is invalid + * + * @example + * // Example usage: + * const apiKey = 'your-api-key'; + * const requestId = 'previously-obtained-request-id'; + * + * try { + * const result = await getAgenticScraperRequest(apiKey, requestId); + * if (result.status === 'completed') { + * console.log('Automation completed:', result.result); + * } else if (result.status === 'pending') { + * console.log('Automation is still in progress'); + * } else { + * console.log('Automation failed:', result.error); + * } + * } catch (error) { + * console.error('Error fetching request:', error); + * } + * + * @note The agentic scraper performs browser automation steps sequentially, + * allowing for complex interactions like form filling, clicking buttons, + * and navigating through multi-step workflows with session management. 
+ */ +export async function getAgenticScraperRequest(apiKey, requestId) { + const endpoint = 'https://api.scrapegraphai.com/v1/agentic-scrapper/' + requestId; + const headers = { + 'accept': 'application/json', + 'SGAI-APIKEY': apiKey, + }; + + // Validate inputs + if (!apiKey || typeof apiKey !== 'string') { + throw new Error('API key must be a non-empty string'); + } + + if (!requestId || typeof requestId !== 'string') { + throw new Error('Request ID must be a non-empty string'); + } + + try { + const response = await axios.get(endpoint, { headers }); + return response.data; + } catch (error) { + handleError(error); + } +} diff --git a/scrapegraph-js/test/agenticScraper_test.js b/scrapegraph-js/test/agenticScraper_test.js new file mode 100644 index 0000000..b23f658 --- /dev/null +++ b/scrapegraph-js/test/agenticScraper_test.js @@ -0,0 +1,506 @@ +import { agenticScraper, getAgenticScraperRequest } from '../index.js'; +import 'dotenv/config'; + +/** + * Test suite for AgenticScraper functionality + * This file demonstrates usage and validates the agentic scraper parameters + */ + +// Mock API key for testing (replace with real key for actual testing) +const API_KEY = process.env.SGAI_APIKEY || 'test-api-key'; + +/** + * Test input validation for agenticScraper + */ +function testInputValidation() { + console.log('πŸ§ͺ Testing Input Validation'); + console.log('='.repeat(50)); + + const testCases = [ + { + name: 'Valid inputs', + apiKey: 'valid-key', + url: 'https://example.com', + steps: ['click button', 'type text'], + useSession: true, + expected: true, + description: 'All valid parameters' + }, + { + name: 'Invalid URL - no protocol', + apiKey: 'valid-key', + url: 'example.com', + steps: ['click button'], + useSession: true, + expected: false, + description: 'URL without http/https protocol' + }, + { + name: 'Empty API key', + apiKey: '', + url: 'https://example.com', + steps: ['click button'], + useSession: true, + expected: false, + description: 'Empty API key 
string' + }, + { + name: 'Empty steps array', + apiKey: 'valid-key', + url: 'https://example.com', + steps: [], + useSession: true, + expected: false, + description: 'Empty steps array' + }, + { + name: 'Steps with empty string', + apiKey: 'valid-key', + url: 'https://example.com', + steps: ['click button', '', 'type text'], + useSession: true, + expected: false, + description: 'Steps array containing empty string' + }, + { + name: 'Non-boolean useSession', + apiKey: 'valid-key', + url: 'https://example.com', + steps: ['click button'], + useSession: 'true', + expected: false, + description: 'useSession as string instead of boolean' + }, + { + name: 'Default useSession', + apiKey: 'valid-key', + url: 'https://example.com', + steps: ['click button'], + useSession: undefined, + expected: true, + description: 'useSession parameter omitted (should default to true)' + } + ]; + + let passed = 0; + let failed = 0; + + testCases.forEach((testCase, index) => { + console.log(`\n${index + 1}. Testing: ${testCase.name}`); + console.log(` ${testCase.description}`); + + try { + // Simulate the validation logic from agenticScraper + const { apiKey, url, steps, useSession } = testCase; + + // API Key validation + if (!apiKey || typeof apiKey !== 'string') { + throw new Error('API key must be a non-empty string'); + } + + // URL validation + if (!url || typeof url !== 'string') { + throw new Error('URL must be a non-empty string'); + } + if (!url.startsWith('http://') && !url.startsWith('https://')) { + throw new Error('URL must start with http:// or https://'); + } + + // Steps validation + if (!Array.isArray(steps) || steps.length === 0) { + throw new Error('Steps must be a non-empty array'); + } + if (steps.some(step => !step || typeof step !== 'string' || !step.trim())) { + throw new Error('All steps must be non-empty strings'); + } + + // useSession validation (only if provided) + if (useSession !== undefined && typeof useSession !== 'boolean') { + throw new Error('useSession 
must be a boolean value'); + } + + if (testCase.expected) { + console.log(' βœ… PASS - Validation passed as expected'); + passed++; + } else { + console.log(' ❌ FAIL - Expected validation to fail, but it passed'); + failed++; + } + } catch (error) { + if (!testCase.expected) { + console.log(` βœ… PASS - Validation failed as expected: ${error.message}`); + passed++; + } else { + console.log(` ❌ FAIL - Unexpected validation failure: ${error.message}`); + failed++; + } + } + }); + + console.log(`\nπŸ“Š Validation Results: ${passed} passed, ${failed} failed`); + return { passed, failed }; +} + +/** + * Test function signatures and parameter handling + */ +function testFunctionSignatures() { + console.log('\nπŸ§ͺ Testing Function Signatures'); + console.log('='.repeat(50)); + + const testCases = [ + { + name: 'agenticScraper with all parameters', + func: 'agenticScraper', + args: [API_KEY, 'https://example.com', ['click button'], true], + description: 'apiKey, url, steps, useSession' + }, + { + name: 'agenticScraper with default useSession', + func: 'agenticScraper', + args: [API_KEY, 'https://example.com', ['click button']], + description: 'apiKey, url, steps (useSession defaults to true)' + }, + { + name: 'getAgenticScraperRequest', + func: 'getAgenticScraperRequest', + args: [API_KEY, 'test-request-id'], + description: 'apiKey, requestId' + } + ]; + + testCases.forEach((testCase, index) => { + console.log(`\n${index + 1}. 
Testing: ${testCase.name}`); + console.log(` Parameters: ${testCase.description}`); + + try { + // Simulate function signature validation + if (testCase.func === 'agenticScraper') { + const [apiKey, url, steps, useSession] = testCase.args; + if (typeof apiKey !== 'string' || typeof url !== 'string' || !Array.isArray(steps)) { + throw new Error('Invalid parameter types'); + } + } else if (testCase.func === 'getAgenticScraperRequest') { + const [apiKey, requestId] = testCase.args; + if (typeof apiKey !== 'string' || typeof requestId !== 'string') { + throw new Error('Invalid parameter types'); + } + } + + console.log(' βœ… PASS - Function signature valid'); + } catch (error) { + console.log(` ❌ FAIL - Function signature error: ${error.message}`); + } + }); +} + +/** + * Test step parsing and validation + */ +function testStepValidation() { + console.log('\nπŸ§ͺ Testing Step Validation'); + console.log('='.repeat(50)); + + const validSteps = [ + 'click on login button', + 'type "username" in email field', + 'press Enter key', + 'wait for 2 seconds', + 'scroll down', + 'click on first result' + ]; + + const invalidSteps = [ + '', // Empty string + ' ', // Only whitespace + null, // Null value + 123, // Number instead of string + {}, // Object instead of string + ]; + + console.log('\n1. Testing valid steps:'); + validSteps.forEach((step, index) => { + console.log(` ${index + 1}. "${step}" βœ… Valid`); + }); + + console.log('\n2. Testing invalid steps:'); + invalidSteps.forEach((step, index) => { + const stepStr = step === null ? 'null' : + typeof step === 'object' ? 'object' : + `"${step}"`; + console.log(` ${index + 1}. ${stepStr} ❌ Invalid`); + }); + + console.log('\n3. 
/**
 * Verify that request payloads are assembled correctly, including the
 * `use_session` default (true when the caller omits the flag).
 * Comparison is done via JSON serialization, so key insertion order matters
 * and is kept identical to the expected objects.
 */
function testPayloadConstruction() {
  console.log('\nπŸ§ͺ Testing Payload Construction');
  console.log('='.repeat(50));

  // Mirrors the SDK's payload assembly; `useSession` defaults to true.
  const buildPayload = (url, steps, useSession) => ({
    url,
    use_session: useSession !== undefined ? useSession : true,
    steps,
  });

  const testCases = [
    {
      name: 'Basic payload',
      url: 'https://example.com',
      steps: ['click button', 'type text'],
      useSession: true,
      expected: {
        url: 'https://example.com',
        use_session: true,
        steps: ['click button', 'type text']
      }
    },
    {
      name: 'Payload with useSession false',
      url: 'https://test.com',
      steps: ['fill form'],
      useSession: false,
      expected: {
        url: 'https://test.com',
        use_session: false,
        steps: ['fill form']
      }
    },
    {
      name: 'Payload with default useSession',
      url: 'https://default.com',
      steps: ['navigate'],
      useSession: undefined,
      expected: {
        url: 'https://default.com',
        use_session: true, // Should default to true
        steps: ['navigate']
      }
    }
  ];

  for (const [index, testCase] of testCases.entries()) {
    console.log(`\n${index + 1}. Testing: ${testCase.name}`);

    const payload = buildPayload(testCase.url, testCase.steps, testCase.useSession);

    console.log('   πŸ“¦ Constructed payload:');
    console.log('   ', JSON.stringify(payload, null, 2));

    const matches = JSON.stringify(payload) === JSON.stringify(testCase.expected);
    console.log(`   ${matches ? 'βœ… PASS' : '❌ FAIL'} - Payload matches expected`);
  }
}
/**
 * Exercise representative failure modes and check that each one surfaces
 * the expected error message (matched by substring).
 */
function testErrorScenarios() {
  console.log('\nπŸ§ͺ Testing Error Scenarios');
  console.log('='.repeat(50));

  const errorScenarios = [
    {
      name: 'Missing API Key',
      // Simulate missing API key
      test: () => {
        throw new Error('API key must be a non-empty string');
      },
      expectedError: 'API key must be a non-empty string'
    },
    {
      name: 'Invalid URL Format',
      // Simulate invalid URL
      test: () => {
        throw new Error('URL must start with http:// or https://');
      },
      expectedError: 'URL must start with'
    },
    {
      name: 'Empty Steps Array',
      // Simulate empty steps
      test: () => {
        throw new Error('Steps must be a non-empty array');
      },
      expectedError: 'non-empty array'
    }
  ];

  for (const [index, scenario] of errorScenarios.entries()) {
    console.log(`\n${index + 1}. Testing: ${scenario.name}`);

    let caught = null;
    try {
      scenario.test();
    } catch (error) {
      caught = error;
    }

    if (caught === null) {
      console.log('   ❌ FAIL - Expected error but none was thrown');
    } else if (caught.message.includes(scenario.expectedError)) {
      console.log(`   βœ… PASS - Correctly caught expected error: ${caught.message}`);
    } else {
      console.log(`   ⚠️ PARTIAL - Caught error but message differs: ${caught.message}`);
    }
  }
}
session'); + console.log('await agenticScraper(apiKey, url, ["fill form", "submit"], false);'); + console.log(''); + console.log('// Check request status'); + console.log('await getAgenticScraperRequest(apiKey, requestId);'); + + console.log('\nπŸ”§ Next Steps:'); + console.log('1. Set SGAI_APIKEY environment variable for real API testing'); + console.log('2. Run the example files in the examples/ directory'); + console.log('3. Try with different websites and automation steps'); + console.log('4. Test with both useSession: true and false'); + console.log('5. Monitor request status for long-running automations'); + + console.log('\nπŸ“š Available Examples:'); + console.log('- agenticScraper_example.js - Basic usage'); + console.log('- getAgenticScraperRequest_example.js - Status checking'); + console.log('- agenticScraper_complete_example.js - Full workflow'); + console.log('- agenticScraper_advanced_example.js - Error handling'); + + return totalFailed === 0; +} + +// Run the tests +const success = runTests(); +process.exit(success ? 0 : 1); diff --git a/scrapegraph-py/examples/async/async_agenticscraper_example.py b/scrapegraph-py/examples/async/async_agenticscraper_example.py new file mode 100644 index 0000000..285ddb9 --- /dev/null +++ b/scrapegraph-py/examples/async/async_agenticscraper_example.py @@ -0,0 +1,46 @@ +import asyncio +import os + +from dotenv import load_dotenv + +from scrapegraph_py import AsyncClient +from scrapegraph_py.logger import sgai_logger + +# Load environment variables from .env file +load_dotenv() + +sgai_logger.set_logging(level="INFO") + + +async def main(): + # Initialize async client with API key from environment variable + api_key = os.getenv("SGAI_API_KEY") + if not api_key: + print("❌ Error: SGAI_API_KEY environment variable not set") + print("Please either:") + print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") + print(" 2. 
async def agentic_scraper_request():
    """Submit an agentic-scraper job to a local API server, asynchronously.

    Reads the API key from the SGAI_API_KEY environment variable (loaded
    from .env at module import), POSTs a fixed login-automation payload via
    aiohttp, then prints wall-clock timing and the pretty-printed JSON
    result to stdout.

    Raises:
        ValueError: if SGAI_API_KEY is not configured.
    """

    # Get API key from .env file
    api_key = os.getenv("SGAI_API_KEY")
    if not api_key:
        raise ValueError(
            "API key must be provided or set in .env file as SGAI_API_KEY. "
            "Create a .env file with: SGAI_API_KEY=your_api_key_here"
        )

    # Natural-language browser actions, executed sequentially by the service.
    steps = [
        "Type email@gmail.com in email input box",
        "Type test-password@123 in password inputbox",
        "click on login"
    ]
    website_url = "https://dashboard.scrapegraphai.com/"

    headers = {
        "SGAI-APIKEY": api_key,
        "Content-Type": "application/json",
    }

    # Request body; use_session keeps one browser session across all steps.
    body = {
        "url": website_url,
        "use_session": True,
        "steps": steps,
    }

    print("πŸ€– Starting Async Agentic Scraper with Automated Actions...")
    print(f"🌐 Website URL: {website_url}")
    print(f"πŸ”§ Use Session: True")
    print(f"πŸ“‹ Steps: {len(steps)} automated actions")
    print("\n" + "=" * 60)

    # Start timer
    start_time = time.time()
    print(
        f"⏱️ Timer started at: {time.strftime('%H:%M:%S', time.localtime(start_time))}"
    )
    print("πŸ”„ Processing request asynchronously...")

    try:
        # NOTE(review): assumes the API server is running on localhost:8001,
        # and the endpoint path spells "scrapper" -- this matches the route
        # used throughout this repo; confirm before renaming.
        async with aiohttp.ClientSession() as session:
            async with session.post(
                "http://localhost:8001/v1/agentic-scrapper",
                json=body,
                headers=headers,
            ) as response:
                # Calculate execution time (measured once headers arrive).
                end_time = time.time()
                execution_time = end_time - start_time
                execution_minutes = execution_time / 60

                print(
                    f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}"
                )
                print(
                    f"⚑ Total execution time: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)"
                )
                print(
                    f"πŸ“Š Performance: {execution_time:.1f}s ({execution_minutes:.1f}m) for {len(steps)} steps"
                )

                if response.status == 200:
                    result = await response.json()
                    print("βœ… Request completed successfully!")
                    print(f"πŸ“Š Request ID: {result.get('request_id', 'N/A')}")
                    print(f"πŸ”„ Status: {result.get('status', 'N/A')}")

                    # A 200 response may still carry an application-level error.
                    if result.get("error"):
                        print(f"❌ Error: {result['error']}")
                    else:
                        print("\nπŸ“‹ EXTRACTED DATA:")
                        print("=" * 60)

                        # Pretty print the result with proper indentation
                        if "result" in result:
                            print(json.dumps(result["result"], indent=2, ensure_ascii=False))
                        else:
                            print("No result data found")

                else:
                    response_text = await response.text()
                    print(f"❌ Request failed with status code: {response.status}")
                    print(f"Response: {response_text}")

    # Transport-level failures (DNS, refused connection, timeout, ...).
    except aiohttp.ClientError as e:
        end_time = time.time()
        execution_time = end_time - start_time
        execution_minutes = execution_time / 60
        print(
            f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}"
        )
        print(
            f"⚑ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)"
        )
        print(f"🌐 Network error: {str(e)}")
    # Catch-all keeps the example script from crashing with a raw traceback.
    except Exception as e:
        end_time = time.time()
        execution_time = end_time - start_time
        execution_minutes = execution_time / 60
        print(
            f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}"
        )
        print(
            f"⚑ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)"
        )
        print(f"πŸ’₯ Unexpected error: {str(e)}")
+ + print("\n" + "=" * 60) + print("Example completed!") + print("\nKey takeaways:") + print("1. Async agentic scraper enables non-blocking automation") + print("2. Each step is executed sequentially but asynchronously") + print("3. Session management allows for complex workflows") + print("4. Perfect for concurrent automation tasks") + print("\nNext steps:") + print("- Run multiple agentic scrapers concurrently") + print("- Combine with other async operations") + print("- Implement async error handling") + print("- Use async session management for efficiency") + + except Exception as e: + print(f"πŸ’₯ Error occurred: {str(e)}") + print("\nπŸ› οΈ Troubleshooting:") + print("1. Make sure your .env file contains SGAI_API_KEY") + print("2. Ensure the API server is running on localhost:8001") + print("3. Check your internet connection") + print("4. Verify the target website is accessible") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/scrapegraph-py/examples/sync/agenticscraper_example.py b/scrapegraph-py/examples/sync/agenticscraper_example.py new file mode 100644 index 0000000..b524ebc --- /dev/null +++ b/scrapegraph-py/examples/sync/agenticscraper_example.py @@ -0,0 +1,39 @@ +import os + +from dotenv import load_dotenv + +from scrapegraph_py import Client +from scrapegraph_py.logger import sgai_logger + +# Load environment variables from .env file +load_dotenv() + +sgai_logger.set_logging(level="INFO") + +# Initialize the client with API key from environment variable +api_key = os.getenv("SGAI_API_KEY") +if not api_key: + print("❌ Error: SGAI_API_KEY environment variable not set") + print("Please either:") + print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") + print(" 2. 
def agentic_scraper_request():
    """Submit an agentic-scraper job to a local API server (blocking).

    Reads the API key from the SGAI_API_KEY environment variable (loaded
    from .env at module import), POSTs a fixed login-automation payload via
    `requests`, then prints wall-clock timing and the pretty-printed JSON
    result to stdout.

    Raises:
        ValueError: if SGAI_API_KEY is not configured.
    """

    # Get API key from .env file
    api_key = os.getenv("SGAI_API_KEY")
    if not api_key:
        raise ValueError(
            "API key must be provided or set in .env file as SGAI_API_KEY. "
            "Create a .env file with: SGAI_API_KEY=your_api_key_here"
        )

    # Natural-language browser actions, executed sequentially by the service.
    steps = [
        "Type email@gmail.com in email input box",
        "Type test-password@123 in password inputbox",
        "click on login"
    ]
    website_url = "https://dashboard.scrapegraphai.com/"

    headers = {
        "SGAI-APIKEY": api_key,
        "Content-Type": "application/json",
    }

    # Request body; use_session keeps one browser session across all steps.
    body = {
        "url": website_url,
        "use_session": True,
        "steps": steps,
    }

    print("πŸ€– Starting Agentic Scraper with Automated Actions...")
    print(f"🌐 Website URL: {website_url}")
    print("πŸ”§ Use Session: True")
    print(f"πŸ“‹ Steps: {len(steps)} automated actions")
    print("\n" + "=" * 60)

    # Start timer
    start_time = time.time()
    print(
        f"⏱️ Timer started at: {time.strftime('%H:%M:%S', time.localtime(start_time))}"
    )
    print("πŸ”„ Processing request...")

    try:
        # NOTE(review): assumes the API server is running on localhost:8001,
        # and the endpoint path spells "scrapper" -- this matches the route
        # used throughout this repo; confirm before renaming.
        response = requests.post(
            "http://localhost:8001/v1/agentic-scrapper",
            json=body,
            headers=headers,
            # Fix: `requests` has no default timeout, so a stalled server
            # would hang this example forever. Timeouts are reported through
            # the RequestException branch below.
            timeout=300,
        )

        # Calculate execution time
        end_time = time.time()
        execution_time = end_time - start_time
        execution_minutes = execution_time / 60

        print(
            f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}"
        )
        print(
            f"⚑ Total execution time: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)"
        )
        print(
            f"πŸ“Š Performance: {execution_time:.1f}s ({execution_minutes:.1f}m) for {len(steps)} steps"
        )

        if response.status_code == 200:
            result = response.json()
            print("βœ… Request completed successfully!")
            print(f"πŸ“Š Request ID: {result.get('request_id', 'N/A')}")
            print(f"πŸ”„ Status: {result.get('status', 'N/A')}")

            # A 200 response may still carry an application-level error.
            if result.get("error"):
                print(f"❌ Error: {result['error']}")
            else:
                print("\nπŸ“‹ EXTRACTED DATA:")
                print("=" * 60)

                # Pretty print the result with proper indentation
                if "result" in result:
                    print(json.dumps(result["result"], indent=2, ensure_ascii=False))
                else:
                    print("No result data found")

        else:
            print(f"❌ Request failed with status code: {response.status_code}")
            print(f"Response: {response.text}")

    # Transport-level failures (DNS, refused connection, timeout, ...).
    except requests.exceptions.RequestException as e:
        end_time = time.time()
        execution_time = end_time - start_time
        execution_minutes = execution_time / 60
        print(
            f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}"
        )
        print(
            f"⚑ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)"
        )
        print(f"🌐 Network error: {str(e)}")
    # Catch-all keeps the example script from crashing with a raw traceback.
    except Exception as e:
        end_time = time.time()
        execution_time = end_time - start_time
        execution_minutes = execution_time / 60
        print(
            f"⏱️ Timer stopped at: {time.strftime('%H:%M:%S', time.localtime(end_time))}"
        )
        print(
            f"⚑ Execution time before error: {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)"
        )
        print(f"πŸ’₯ Unexpected error: {str(e)}")
Agentic scraper enables automated browser actions") + print("2. Each step is executed sequentially") + print("3. Session management allows for complex workflows") + print("4. Perfect for login flows and form interactions") + print("\nNext steps:") + print("- Customize the steps for your specific use case") + print("- Add more complex automation sequences") + print("- Implement error handling for failed actions") + print("- Use session management for multi-step workflows") + + except Exception as e: + print(f"πŸ’₯ Error occurred: {str(e)}") + print("\nπŸ› οΈ Troubleshooting:") + print("1. Make sure your .env file contains SGAI_API_KEY") + print("2. Ensure the API server is running on localhost:8001") + print("3. Check your internet connection") + print("4. Verify the target website is accessible") + + +if __name__ == "__main__": + main() diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py index 1c4a82f..47ebf82 100644 --- a/scrapegraph-py/scrapegraph_py/async_client.py +++ b/scrapegraph-py/scrapegraph_py/async_client.py @@ -8,6 +8,10 @@ from scrapegraph_py.config import API_BASE_URL, DEFAULT_HEADERS from scrapegraph_py.exceptions import APIError from scrapegraph_py.logger import sgai_logger as logger +from scrapegraph_py.models.agenticscraper import ( + AgenticScraperRequest, + GetAgenticScraperRequest, +) from scrapegraph_py.models.crawl import CrawlRequest, GetCrawlRequest from scrapegraph_py.models.feedback import FeedbackRequest from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest @@ -392,6 +396,48 @@ async def get_crawl(self, crawl_id: str): logger.info(f"✨ Successfully retrieved result for request {crawl_id}") return result + async def agenticscraper( + self, + url: str, + steps: list[str], + use_session: bool = True, + ): + """Send an agentic scraper request to perform automated actions on a webpage + + Args: + url: The URL to scrape + steps: List of steps to perform on 
the webpage + use_session: Whether to use session for the scraping (default: True) + """ + logger.info(f"πŸ€– Starting agentic scraper request for {url}") + logger.debug(f"πŸ”§ Use session: {use_session}") + logger.debug(f"πŸ“‹ Steps: {steps}") + + request = AgenticScraperRequest( + url=url, + steps=steps, + use_session=use_session, + ) + logger.debug("βœ… Request validation passed") + + result = await self._make_request( + "POST", f"{API_BASE_URL}/agentic-scrapper", json=request.model_dump() + ) + logger.info("✨ Agentic scraper request completed successfully") + return result + + async def get_agenticscraper(self, request_id: str): + """Get the result of a previous agentic scraper request""" + logger.info(f"πŸ” Fetching agentic scraper result for request {request_id}") + + # Validate input using Pydantic model + GetAgenticScraperRequest(request_id=request_id) + logger.debug("βœ… Request ID validation passed") + + result = await self._make_request("GET", f"{API_BASE_URL}/agentic-scrapper/{request_id}") + logger.info(f"✨ Successfully retrieved result for request {request_id}") + return result + async def close(self): """Close the session to free up resources""" logger.info("πŸ”’ Closing AsyncClient session") diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py index 5dbf57d..0a836b2 100644 --- a/scrapegraph-py/scrapegraph_py/client.py +++ b/scrapegraph-py/scrapegraph_py/client.py @@ -9,6 +9,10 @@ from scrapegraph_py.config import API_BASE_URL, DEFAULT_HEADERS from scrapegraph_py.exceptions import APIError from scrapegraph_py.logger import sgai_logger as logger +from scrapegraph_py.models.agenticscraper import ( + AgenticScraperRequest, + GetAgenticScraperRequest, +) from scrapegraph_py.models.crawl import CrawlRequest, GetCrawlRequest from scrapegraph_py.models.feedback import FeedbackRequest from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest @@ -392,6 +396,48 @@ def get_crawl(self, 
crawl_id: str): logger.info(f"✨ Successfully retrieved result for request {crawl_id}") return result + def agenticscraper( + self, + url: str, + steps: list[str], + use_session: bool = True, + ): + """Send an agentic scraper request to perform automated actions on a webpage + + Args: + url: The URL to scrape + steps: List of steps to perform on the webpage + use_session: Whether to use session for the scraping (default: True) + """ + logger.info(f"πŸ€– Starting agentic scraper request for {url}") + logger.debug(f"πŸ”§ Use session: {use_session}") + logger.debug(f"πŸ“‹ Steps: {steps}") + + request = AgenticScraperRequest( + url=url, + steps=steps, + use_session=use_session, + ) + logger.debug("βœ… Request validation passed") + + result = self._make_request( + "POST", f"{API_BASE_URL}/agentic-scrapper", json=request.model_dump() + ) + logger.info("✨ Agentic scraper request completed successfully") + return result + + def get_agenticscraper(self, request_id: str): + """Get the result of a previous agentic scraper request""" + logger.info(f"πŸ” Fetching agentic scraper result for request {request_id}") + + # Validate input using Pydantic model + GetAgenticScraperRequest(request_id=request_id) + logger.debug("βœ… Request ID validation passed") + + result = self._make_request("GET", f"{API_BASE_URL}/agentic-scrapper/{request_id}") + logger.info(f"✨ Successfully retrieved result for request {request_id}") + return result + def close(self): """Close the session to free up resources""" logger.info("πŸ”’ Closing Client session") diff --git a/scrapegraph-py/scrapegraph_py/models/__init__.py b/scrapegraph-py/scrapegraph_py/models/__init__.py index e9655b1..cbde5de 100644 --- a/scrapegraph-py/scrapegraph_py/models/__init__.py +++ b/scrapegraph-py/scrapegraph_py/models/__init__.py @@ -1,3 +1,4 @@ +from .agenticscraper import AgenticScraperRequest, GetAgenticScraperRequest from .crawl import CrawlRequest, GetCrawlRequest from .feedback import FeedbackRequest from .markdownify 
class AgenticScraperRequest(BaseModel):
    """Payload for the agentic-scraper endpoint: a target URL plus the
    ordered browser actions to perform on it."""

    url: str = Field(
        ...,
        example="https://dashboard.scrapegraphai.com/",
        description="The URL to scrape"
    )
    use_session: bool = Field(
        default=True,
        description="Whether to use session for the scraping"
    )
    steps: List[str] = Field(
        ...,
        example=[
            "Type email@gmail.com in email input box",
            "Type test-password@123 in password inputbox",
            "click on login"
        ],
        description="List of steps to perform on the webpage"
    )

    @model_validator(mode="after")
    def validate_url(self) -> "AgenticScraperRequest":
        # Reject blank URLs and anything that is not an http(s) scheme.
        if not self.url.strip():
            raise ValueError("URL cannot be empty")
        if not self.url.startswith(("http://", "https://")):
            raise ValueError("Invalid URL - must start with http:// or https://")
        return self

    @model_validator(mode="after")
    def validate_steps(self) -> "AgenticScraperRequest":
        # Every step must be a non-blank instruction string.
        if not self.steps:
            raise ValueError("Steps cannot be empty")
        if not all(step.strip() for step in self.steps):
            raise ValueError("All steps must contain valid instructions")
        return self

    def model_dump(self, *args, **kwargs) -> dict:
        # Set exclude_none=True to exclude None values from serialization
        kwargs.setdefault("exclude_none", True)
        return super().model_dump(*args, **kwargs)
class GetAgenticScraperRequest(BaseModel):
    """Request model for get_agenticscraper endpoint"""

    # Identifier returned by a previous agenticscraper() call; must be a UUID.
    request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000")

    @model_validator(mode="after")
    def validate_request_id(self) -> "GetAgenticScraperRequest":
        """Reject request ids that are not well-formed UUIDs."""
        try:
            # Validate the request_id is a valid UUID
            UUID(self.request_id)
        except ValueError as exc:
            # Fix: chain the original exception (PEP 3134 / B904) so the
            # underlying UUID parse error is preserved in the traceback.
            raise ValueError("request_id must be a valid UUID") from exc
        return self