Commit 5bf08a1

support extraction api
1 parent: 17f5c03

File tree

11 files changed: +659 -40 lines changed

__tests__/client.test.ts

Lines changed: 119 additions & 24 deletions
@@ -1,8 +1,9 @@
  import * as errors from '../src/errors.js';
  import { ScrapflyClient } from '../src/client.js';
  import { ScrapeConfig } from '../src/scrapeconfig.js';
- import { describe, it, expect, beforeEach, jest } from '@jest/globals';
  import { ScreenshotConfig } from '../src/screenshotconfig.js';
+ import { ExtractionConfig } from '../src/extractionconfig.js';
+ import { describe, it, expect, beforeEach, jest } from '@jest/globals';

  function mockedStream() {
      const mockStream = {
@@ -484,28 +485,28 @@ describe('screenshot', () => {
      jest.spyOn(client, 'fetch').mockClear(); // clear all mock meta on each test
  });

- it('succeeds', async () => {
-     const spy = jest.spyOn(client, 'fetch');
-     const url = 'https://web-scraping.dev/';
-     jest.spyOn(client, 'fetch').mockImplementation(async (config: Request): Promise<any> => {
-         const configUrl = config[Object.getOwnPropertySymbols(config)[1]].url;
-         // Ensure the URL matches the pattern
-         expect(configUrl.origin + configUrl.pathname).toEqual(client.HOST + '/screenshot');
-         expect(config.method).toEqual('GET');
-         expect(configUrl.searchParams.get('key')).toMatch(KEY);
-         expect(configUrl.searchParams.get('url')).toMatch(url);
-         expect(Array.from(configUrl.searchParams.keys())).toEqual(['key', 'url']);
-         const body = mockedStream();
-         return responseFactory(body, {
-             status: 200,
-             headers: {
-                 'content-encoding': 'gzip',
-                 'content-type': 'image/png',
-                 'x-scrapfly-upstream-http-code': '200',
-                 'x-scrapfly-upstream-url': url,
-             },
-         });
+ it('succeeds', async () => {
+     const spy = jest.spyOn(client, 'fetch');
+     const url = 'https://web-scraping.dev/';
+     jest.spyOn(client, 'fetch').mockImplementation(async (config: Request): Promise<any> => {
+         const configUrl = config[Object.getOwnPropertySymbols(config)[1]].url;
+         // Ensure the URL matches the pattern
+         expect(configUrl.origin + configUrl.pathname).toEqual(client.HOST + '/screenshot');
+         expect(config.method).toEqual('GET');
+         expect(configUrl.searchParams.get('key')).toMatch(KEY);
+         expect(configUrl.searchParams.get('url')).toMatch(url);
+         expect(Array.from(configUrl.searchParams.keys())).toEqual(['key', 'url']);
+         const body = mockedStream();
+         return responseFactory(body, {
+             status: 200,
+             headers: {
+                 'content-encoding': 'gzip',
+                 'content-type': 'image/png',
+                 'x-scrapfly-upstream-http-code': '200',
+                 'x-scrapfly-upstream-url': url,
+             },
  });
+ });

  const result = await client.screenshot(new ScreenshotConfig({ url: url }));
  expect(result).toBeDefined();
@@ -531,7 +532,7 @@ describe('screenshot', () => {
          },
      });
  });
- await expect(client.screenshot(new ScreenshotConfig({ url }))).rejects.toThrow(errors.UnableToTakeScreenshot);
+ await expect(client.screenshot(new ScreenshotConfig({ url }))).rejects.toThrow(errors.ScreenshotApiError);
  expect(spy).toHaveBeenCalledTimes(1);
  });

@@ -551,7 +552,101 @@ describe('screenshot', () => {
          },
      });
  });
- await expect(client.screenshot(new ScreenshotConfig({ url }))).rejects.toThrow(errors.ScreenshotInvalidContent);
+ await expect(client.screenshot(new ScreenshotConfig({ url }))).rejects.toThrow(errors.ScreenshotApiError);
  expect(spy).toHaveBeenCalledTimes(1);
  });
  });
+
+ describe('extract', () => {
+     const KEY = '__API_KEY__';
+     const client = new ScrapflyClient({ key: KEY });
+
+     beforeEach(() => {
+         jest.spyOn(client, 'fetch').mockClear(); // clear all mock meta on each test
+     });
+
+     it('succeeds', async () => {
+         const spy = jest.spyOn(client, 'fetch');
+         const html = 'very long html file';
+         jest.spyOn(client, 'fetch').mockImplementation(async (config: Request): Promise<any> => {
+             const configUrl = config[Object.getOwnPropertySymbols(config)[1]].url;
+             const configBody = config[Object.getOwnPropertySymbols(config)[1]].body.source;
+             // Ensure the URL matches the pattern
+             expect(configUrl.origin + configUrl.pathname).toEqual(client.HOST + '/extraction');
+             expect(config.method).toEqual('POST');
+             expect(configUrl.searchParams.get('key')).toMatch(KEY);
+             expect(configBody).toEqual(html);
+             const body = { data: 'a document summary', content_type: 'text/html' };
+             return responseFactory(body, {
+                 status: 200,
+             });
+         });
+
+         const result = await client.extract(new ExtractionConfig({ body: html, content_type: 'text/html' }));
+         expect(result).toBeDefined();
+         expect(result.content_type).toBe('text/html');
+         expect(result.data).toBe('a document summary');
+         expect(spy).toHaveBeenCalledTimes(1);
+     });
+
+     it('fails due to invalid config', async () => {
+         const html = 'very long html file';
+         await expect(
+             client.extract(
+                 new ExtractionConfig({
+                     body: html,
+                     content_type: 'text/html',
+                     epehemeral_template: { source: 'html' },
+                     template: 'template',
+                 }),
+             ),
+         ).rejects.toThrow(errors.ExtractionConfigError);
+     });
+
+     it('fails due to invalid API key', async () => {
+         const html = 'very long html file';
+         jest.spyOn(client, 'fetch').mockImplementation(async (): Promise<any> => {
+             const result = {
+                 status: 'error',
+                 http_code: 401,
+                 reason: 'Unauthorized',
+                 error_id: '301e2d9e-b4f5-4289-85ea-e452143338df',
+                 message: 'Invalid API key',
+             };
+             return responseFactory(result, {
+                 status: 401,
+                 headers: {
+                     'Content-Type': 'application/json',
+                 },
+             });
+         });
+         await expect(client.extract(new ExtractionConfig({ body: html, content_type: 'text/html' }))).rejects.toThrow(
+             errors.BadApiKeyError,
+         );
+     });
+
+     it('fails on any extraction-related error', async () => {
+         const html = 'very long html file';
+         jest.spyOn(client, 'fetch').mockImplementation(async (): Promise<any> => {
+             const result = {
+                 code: 'ERR::EXTRACTION::CONTENT_TYPE_NOT_SUPPORTED',
+                 error_id: 'f0e9a6af-846a-49ab-8321-e21bb12bf494',
+                 http_code: 422,
+                 links: {
+                     'Related Error Doc':
+                         'https://scrapfly.io/docs/extraction-api/error/ERR::EXTRACTION::CONTENT_TYPE_NOT_SUPPORTED',
+                 },
+                 message: 'ERR::EXTRACTION::CONTENT_TYPE_NOT_SUPPORTED',
+             };
+             return responseFactory(result, {
+                 status: 422,
+                 headers: {
+                     'Content-Type': 'application/json',
+                 },
+             });
+         });
+         await expect(client.extract(new ExtractionConfig({ body: html, content_type: 'text/html' }))).rejects.toThrow(
+             errors.ExtractionApiError,
+         );
+     });
+ });
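
For orientation, the happy path exercised above reduces to the following minimal call. This is a sketch only — the key and HTML body are placeholders — showing that `extract()` POSTs the document body to `/extraction` and resolves to an object carrying `data` and `content_type`:

import { ScrapflyClient, ExtractionConfig } from 'scrapfly-sdk';

const client = new ScrapflyClient({ key: 'YOUR SCRAPFLY KEY' });
const result = await client.extract(
    new ExtractionConfig({ body: '<html>...</html>', content_type: 'text/html' }),
);
console.log(result.data); // extraction output
console.log(result.content_type); // e.g. 'text/html'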

__tests__/extractionconfig.test.ts

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+ import { ExtractionConfig } from '../src/extractionconfig.js';
+ import { describe, it, expect } from '@jest/globals';
+
+ const input_html = 'very long html file';
+ const input_content_type = 'text/html';
+
+ describe('extractionconfig', () => {
+     it('loads', () => {
+         const config = new ExtractionConfig({ body: input_html, content_type: input_content_type });
+         expect(config.body).toBe(input_html);
+         expect(config.content_type).toBe(input_content_type);
+     });
+ });
+
+ describe('url param generation', () => {
+     it('loads', () => {
+         const config = new ExtractionConfig({ body: input_html, content_type: input_content_type });
+         expect(config.body).toBe(input_html);
+         expect(config.content_type).toBe(input_content_type);
+     });
+
+     it('basic config', () => {
+         const config = new ExtractionConfig({ body: input_html, content_type: input_content_type });
+         const params = config.toApiParams({ key: '1234' });
+         expect(params).toEqual({
+             key: '1234',
+             body: input_html,
+             content_type: input_content_type,
+         });
+     });
+
+     it('sets url', () => {
+         const config = new ExtractionConfig({
+             body: input_html,
+             content_type: input_content_type,
+             url: 'https://web-scraping.dev/products',
+         });
+         const params = config.toApiParams({ key: '1234' });
+         expect(params).toEqual({
+             key: '1234',
+             body: input_html,
+             content_type: input_content_type,
+             url: 'https://web-scraping.dev/products',
+         });
+     });
+
+     it('sets charset', () => {
+         const config = new ExtractionConfig({
+             body: input_html,
+             content_type: input_content_type,
+             charset: 'utf-8',
+         });
+         const params = config.toApiParams({ key: '1234' });
+         expect(params).toEqual({
+             key: '1234',
+             body: input_html,
+             content_type: input_content_type,
+             charset: 'utf-8',
+         });
+     });
+
+     it('sets template', () => {
+         const config = new ExtractionConfig({
+             body: input_html,
+             content_type: input_content_type,
+             template: 'my_template',
+         });
+         const params = config.toApiParams({ key: '1234' });
+         expect(params).toEqual({
+             key: '1234',
+             body: input_html,
+             content_type: input_content_type,
+             template: 'my_template',
+         });
+     });
+
+     it('sets epehemeral_template', () => {
+         const config = new ExtractionConfig({
+             body: input_html,
+             content_type: input_content_type,
+             epehemeral_template: { source: 'html', selectors: [] },
+         });
+         const params = config.toApiParams({ key: '1234' });
+         expect(params).toEqual({
+             key: '1234',
+             body: input_html,
+             content_type: input_content_type,
+             epehemeral_template: 'ephemeral:eyJzb3VyY2UiOiJodG1sIiwic2VsZWN0b3JzIjpbXX0',
+         });
+     });
+
+     it('sets extraction_prompt', () => {
+         const config = new ExtractionConfig({
+             body: input_html,
+             content_type: input_content_type,
+             extraction_prompt: 'summarize the document',
+         });
+         const params = config.toApiParams({ key: '1234' });
+         expect(params).toEqual({
+             key: '1234',
+             body: input_html,
+             content_type: input_content_type,
+             extraction_prompt: 'summarize the document',
+         });
+     });
+
+     it('sets extraction_model', () => {
+         const config = new ExtractionConfig({
+             body: input_html,
+             content_type: input_content_type,
+             extraction_model: 'review_list',
+         });
+         const params = config.toApiParams({ key: '1234' });
+         expect(params).toEqual({
+             key: '1234',
+             body: input_html,
+             content_type: input_content_type,
+             extraction_model: 'review_list',
+         });
+     });
+ });
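
Aside: the expected `ephemeral:` fixture in the epehemeral_template test above matches URL-safe, unpadded base64 of the JSON-serialized template. A sketch of how such a value can be derived, assuming Node's Buffer API — the SDK's actual encoder may differ:

// Hypothetical re-derivation of the test fixture above; not SDK code.
const template = { source: 'html', selectors: [] };
const encoded = 'ephemeral:' + Buffer.from(JSON.stringify(template)).toString('base64url');
// encoded === 'ephemeral:eyJzb3VyY2UiOiJodG1sIiwic2VsZWN0b3JzIjpbXX0'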
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+ /*
+ This example shows how to use Scrapfly's automatic extraction with predefined extraction models
+ */
+ import { ScrapflyClient, ScrapeConfig, ExtractionConfig } from 'scrapfly-sdk';
+
+ const key = 'YOUR SCRAPFLY KEY';
+ const client = new ScrapflyClient({ key });
+
+ // First, scrape the web page to retrieve its HTML
+ const scrapeResult = await client.scrape(
+     new ScrapeConfig({
+         url: 'https://web-scraping.dev/reviews',
+         render_js: true,
+         auto_scroll: true,
+     }),
+ );
+
+ // raw HTML content
+ const html = scrapeResult.result.content;
+
+ // use the AI auto extraction models for common web page types;
+ // for the available models, refer to https://scrapfly.io/docs/extraction-api/automatic-ai#models
+ const extractionResult = await client.extract(
+     new ExtractionConfig({
+         body: html, // pass the scraped HTML content
+         content_type: 'text/html', // content data type
+         charset: 'utf-8', // charset of the passed content; use `auto` if you aren't sure
+         url: 'https://web-scraping.dev/reviews', // when passed, relative URLs in the document are transformed into absolute URLs automatically
+         extraction_model: 'review_list',
+     }),
+ );
+
+ // extraction result
+ console.log(extractionResult.data);
+ `
+ {
+     ....
+     reviews: [
+         {
+             author_name: null,
+             content: "Unique flavor and great energy boost. It's the perfect gamer's drink!",
+             date_published_formatted: '2023-05-18',
+             date_published_raw: '2023-05-18',
+             rating: null,
+             sentiment: 'POSITIVE',
+             sentiment_probability: 0.85,
+             title: null,
+             verified: null
+         },
+         ....
+     ]
+ }
+ `
+
+ // result content type
+ console.log(extractionResult.content_type);
+ `application/json`
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+ /*
+ This example shows how to use Scrapfly's extraction prompts for document RAG processing (question answering)
+ */
+ import { ScrapflyClient, ScrapeConfig, ExtractionConfig } from 'scrapfly-sdk';
+
+ const key = 'YOUR SCRAPFLY KEY';
+ const client = new ScrapflyClient({ key });
+
+ // First, scrape the web page to retrieve its HTML
+ const scrapeResult = await client.scrape(
+     new ScrapeConfig({
+         url: 'https://web-scraping.dev/products',
+         render_js: true,
+     }),
+ );
+
+ // raw HTML content
+ const html = scrapeResult.result.content;
+
+ // Second, pass the HTML and an extraction prompt
+ // In this example, we'll ask a question about the data
+ const extractionResult = await client.extract(
+     new ExtractionConfig({
+         body: html, // pass the HTML content
+         content_type: 'text/html', // data content type
+         charset: 'utf-8', // charset of the passed content; use `auto` if you aren't sure
+         extraction_prompt: 'what is the flavor of the dark energy potion?', // LLM extraction prompt
+     }),
+ );
+
+ // extraction result
+ console.log(extractionResult.data);
+ `The document mentions the flavor of the Dark Red Energy Potion is **bold cherry cola**.`;
+
+ // result content type
+ console.log(extractionResult.content_type);
+ `text/plain`;
