Commit 5bf08a1

support extraction api
1 parent: 17f5c03

File tree

11 files changed: +659 -40 lines changed

__tests__/client.test.ts

Lines changed: 119 additions & 24 deletions
@@ -1,8 +1,9 @@
  import * as errors from '../src/errors.js';
  import { ScrapflyClient } from '../src/client.js';
  import { ScrapeConfig } from '../src/scrapeconfig.js';
- import { describe, it, expect, beforeEach, jest } from '@jest/globals';
  import { ScreenshotConfig } from '../src/screenshotconfig.js';
+ import { ExtractionConfig } from '../src/extractionconfig.js';
+ import { describe, it, expect, beforeEach, jest } from '@jest/globals';

  function mockedStream() {
      const mockStream = {
@@ -484,28 +485,28 @@ describe('screenshot', () => {
      jest.spyOn(client, 'fetch').mockClear(); // clear all mock meta on each test
  });

- it('succeeds', async () => {
-     const spy = jest.spyOn(client, 'fetch');
-     const url = 'https://web-scraping.dev/';
-     jest.spyOn(client, 'fetch').mockImplementation(async (config: Request): Promise<any> => {
-         const configUrl = config[Object.getOwnPropertySymbols(config)[1]].url;
-         // Ensure the URL matches the pattern
-         expect(configUrl.origin + configUrl.pathname).toEqual(client.HOST + '/screenshot');
-         expect(config.method).toEqual('GET');
-         expect(configUrl.searchParams.get('key')).toMatch(KEY);
-         expect(configUrl.searchParams.get('url')).toMatch(url);
-         expect(Array.from(configUrl.searchParams.keys())).toEqual(['key', 'url']);
-         const body = mockedStream();
-         return responseFactory(body, {
-             status: 200,
-             headers: {
-                 'content-encoding': 'gzip',
-                 'content-type': 'image/png',
-                 'x-scrapfly-upstream-http-code': '200',
-                 'x-scrapfly-upstream-url': url,
-             },
-         });
+ it('succeeds', async () => {
+     const spy = jest.spyOn(client, 'fetch');
+     const url = 'https://web-scraping.dev/';
+     jest.spyOn(client, 'fetch').mockImplementation(async (config: Request): Promise<any> => {
+         const configUrl = config[Object.getOwnPropertySymbols(config)[1]].url;
+         // Ensure the URL matches the pattern
+         expect(configUrl.origin + configUrl.pathname).toEqual(client.HOST + '/screenshot');
+         expect(config.method).toEqual('GET');
+         expect(configUrl.searchParams.get('key')).toMatch(KEY);
+         expect(configUrl.searchParams.get('url')).toMatch(url);
+         expect(Array.from(configUrl.searchParams.keys())).toEqual(['key', 'url']);
+         const body = mockedStream();
+         return responseFactory(body, {
+             status: 200,
+             headers: {
+                 'content-encoding': 'gzip',
+                 'content-type': 'image/png',
+                 'x-scrapfly-upstream-http-code': '200',
+                 'x-scrapfly-upstream-url': url,
+             },
  });
+ });

  const result = await client.screenshot(new ScreenshotConfig({ url: url }));
  expect(result).toBeDefined();
@@ -531,7 +532,7 @@ describe('screenshot', () => {
          },
      });
  });
- await expect(client.screenshot(new ScreenshotConfig({ url }))).rejects.toThrow(errors.UnableToTakeScreenshot);
+ await expect(client.screenshot(new ScreenshotConfig({ url }))).rejects.toThrow(errors.ScreenshotApiError);
  expect(spy).toHaveBeenCalledTimes(1);
  });

@@ -551,7 +552,101 @@ describe('screenshot', () => {
          },
      });
  });
- await expect(client.screenshot(new ScreenshotConfig({ url }))).rejects.toThrow(errors.ScreenshotInvalidContent);
+ await expect(client.screenshot(new ScreenshotConfig({ url }))).rejects.toThrow(errors.ScreenshotApiError);
  expect(spy).toHaveBeenCalledTimes(1);
  });
  });
+
+ describe('extract', () => {
+     const KEY = '__API_KEY__';
+     const client = new ScrapflyClient({ key: KEY });
+
+     beforeEach(() => {
+         jest.spyOn(client, 'fetch').mockClear(); // clear all mock meta on each test
+     });
+
+     it('succeeds', async () => {
+         const spy = jest.spyOn(client, 'fetch');
+         const html = 'very long html file';
+         jest.spyOn(client, 'fetch').mockImplementation(async (config: Request): Promise<any> => {
+             const configUrl = config[Object.getOwnPropertySymbols(config)[1]].url;
+             const configBody = config[Object.getOwnPropertySymbols(config)[1]].body.source;
+             // Ensure the URL matches the pattern
+             expect(configUrl.origin + configUrl.pathname).toEqual(client.HOST + '/extraction');
+             expect(config.method).toEqual('POST');
+             expect(configUrl.searchParams.get('key')).toMatch(KEY);
+             expect(configBody).toEqual(html);
+             const body = { data: 'a document summary', content_type: 'text/html' };
+             return responseFactory(body, {
+                 status: 200,
+             });
+         });
+
+         const result = await client.extract(new ExtractionConfig({ body: html, content_type: 'text/html' }));
+         expect(result).toBeDefined();
+         expect(result.content_type).toBe('text/html');
+         expect(result.data).toBe('a document summary');
+         expect(spy).toHaveBeenCalledTimes(1);
+     });
+
+     it('fails due to invalid config', async () => {
+         const html = 'very long html file';
+         await expect(
+             client.extract(
+                 new ExtractionConfig({
+                     body: html,
+                     content_type: 'text/html',
+                     epehemeral_template: { source: 'html' },
+                     template: 'template',
+                 }),
+             ),
+         ).rejects.toThrow(errors.ExtractionConfigError);
+     });
+
+     it('fails due to invalid API key', async () => {
+         const html = 'very long html file';
+         jest.spyOn(client, 'fetch').mockImplementation(async (): Promise<any> => {
+             const result = {
+                 status: 'error',
+                 http_code: 401,
+                 reason: 'Unauthorized',
+                 error_id: '301e2d9e-b4f5-4289-85ea-e452143338df',
+                 message: 'Invalid API key',
+             };
+             return responseFactory(result, {
+                 status: 401,
+                 headers: {
+                     'Content-Type': 'application/json',
+                 },
+             });
+         });
+         await expect(client.extract(new ExtractionConfig({ body: html, content_type: 'text/html' }))).rejects.toThrow(
+             errors.BadApiKeyError,
+         );
+     });
+
+     it('fails on any extraction-related error', async () => {
+         const html = 'very long html file';
+         jest.spyOn(client, 'fetch').mockImplementation(async (): Promise<any> => {
+             const result = {
+                 code: 'ERR::EXTRACTION::CONTENT_TYPE_NOT_SUPPORTED',
+                 error_id: 'f0e9a6af-846a-49ab-8321-e21bb12bf494',
+                 http_code: 422,
+                 links: {
+                     'Related Error Doc':
+                         'https://scrapfly.io/docs/extraction-api/error/ERR::EXTRACTION::CONTENT_TYPE_NOT_SUPPORTED',
+                 },
+                 message: 'ERR::EXTRACTION::CONTENT_TYPE_NOT_SUPPORTED',
+             };
+             return responseFactory(result, {
+                 status: 422,
+                 headers: {
+                     'Content-Type': 'application/json',
+                 },
+             });
+         });
+         await expect(client.extract(new ExtractionConfig({ body: html, content_type: 'text/html' }))).rejects.toThrow(
+             errors.ExtractionApiError,
+         );
+     });
+ });
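
For orientation, the happy path exercised above reduces to the following minimal call. This is a sketch only — the key and HTML body are placeholders — showing that `extract()` POSTs the document body to `/extraction` and resolves to an object carrying `data` and `content_type`:

import { ScrapflyClient, ExtractionConfig } from 'scrapfly-sdk';

const client = new ScrapflyClient({ key: 'YOUR SCRAPFLY KEY' });
const result = await client.extract(
    new ExtractionConfig({ body: '<html>...</html>', content_type: 'text/html' }),
);
console.log(result.data); // extraction output
console.log(result.content_type); // e.g. 'text/html'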

__tests__/extractionconfig.test.ts

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+ import { ExtractionConfig } from '../src/extractionconfig.js';
+ import { describe, it, expect } from '@jest/globals';
+
+ const input_html = 'very long html file';
+ const input_content_type = 'text/html';
+
+ describe('extractionconfig', () => {
+     it('loads', () => {
+         const config = new ExtractionConfig({ body: input_html, content_type: input_content_type });
+         expect(config.body).toBe(input_html);
+         expect(config.content_type).toBe(input_content_type);
+     });
+ });
+
+ describe('url param generation', () => {
+     it('loads', () => {
+         const config = new ExtractionConfig({ body: input_html, content_type: input_content_type });
+         expect(config.body).toBe(input_html);
+         expect(config.content_type).toBe(input_content_type);
+     });
+
+     it('basic config', () => {
+         const config = new ExtractionConfig({ body: input_html, content_type: input_content_type });
+         const params = config.toApiParams({ key: '1234' });
+         expect(params).toEqual({
+             key: '1234',
+             body: input_html,
+             content_type: input_content_type,
+         });
+     });
+
+     it('sets url', () => {
+         const config = new ExtractionConfig({
+             body: input_html,
+             content_type: input_content_type,
+             url: 'https://web-scraping.dev/products',
+         });
+         const params = config.toApiParams({ key: '1234' });
+         expect(params).toEqual({
+             key: '1234',
+             body: input_html,
+             content_type: input_content_type,
+             url: 'https://web-scraping.dev/products',
+         });
+     });
+
+     it('sets charset', () => {
+         const config = new ExtractionConfig({
+             body: input_html,
+             content_type: input_content_type,
+             charset: 'utf-8',
+         });
+         const params = config.toApiParams({ key: '1234' });
+         expect(params).toEqual({
+             key: '1234',
+             body: input_html,
+             content_type: input_content_type,
+             charset: 'utf-8',
+         });
+     });
+
+     it('sets template', () => {
+         const config = new ExtractionConfig({
+             body: input_html,
+             content_type: input_content_type,
+             template: 'my_template',
+         });
+         const params = config.toApiParams({ key: '1234' });
+         expect(params).toEqual({
+             key: '1234',
+             body: input_html,
+             content_type: input_content_type,
+             template: 'my_template',
+         });
+     });
+
+     it('sets epehemeral_template', () => {
+         const config = new ExtractionConfig({
+             body: input_html,
+             content_type: input_content_type,
+             epehemeral_template: { source: 'html', selectors: [] },
+         });
+         const params = config.toApiParams({ key: '1234' });
+         expect(params).toEqual({
+             key: '1234',
+             body: input_html,
+             content_type: input_content_type,
+             epehemeral_template: 'ephemeral:eyJzb3VyY2UiOiJodG1sIiwic2VsZWN0b3JzIjpbXX0',
+         });
+     });
+
+     it('sets extraction_prompt', () => {
+         const config = new ExtractionConfig({
+             body: input_html,
+             content_type: input_content_type,
+             extraction_prompt: 'summarize the document',
+         });
+         const params = config.toApiParams({ key: '1234' });
+         expect(params).toEqual({
+             key: '1234',
+             body: input_html,
+             content_type: input_content_type,
+             extraction_prompt: 'summarize the document',
+         });
+     });
+
+     it('sets extraction_model', () => {
+         const config = new ExtractionConfig({
+             body: input_html,
+             content_type: input_content_type,
+             extraction_model: 'review_list',
+         });
+         const params = config.toApiParams({ key: '1234' });
+         expect(params).toEqual({
+             key: '1234',
+             body: input_html,
+             content_type: input_content_type,
+             extraction_model: 'review_list',
+         });
+     });
+ });
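
Aside: the expected `ephemeral:` fixture in the epehemeral_template test above matches URL-safe, unpadded base64 of the JSON-serialized template. A sketch of how such a value can be derived, assuming Node's Buffer API — the SDK's actual encoder may differ:

// Hypothetical re-derivation of the test fixture above; not SDK code.
const template = { source: 'html', selectors: [] };
const encoded = 'ephemeral:' + Buffer.from(JSON.stringify(template)).toString('base64url');
// encoded === 'ephemeral:eyJzb3VyY2UiOiJodG1sIiwic2VsZWN0b3JzIjpbXX0'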
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+ /*
+ This example shows how to use Scrapfly's automatic extraction with predefined extraction models
+ */
+ import { ScrapflyClient, ScrapeConfig, ExtractionConfig } from 'scrapfly-sdk';
+
+ const key = 'YOUR SCRAPFLY KEY';
+ const client = new ScrapflyClient({ key });
+
+ // First, scrape the web page to retrieve its HTML
+ const scrapeResult = await client.scrape(
+     new ScrapeConfig({
+         url: 'https://web-scraping.dev/reviews',
+         render_js: true,
+         auto_scroll: true,
+     }),
+ );
+
+ // raw HTML content
+ const html = scrapeResult.result.content;
+
+ // use the AI auto extraction models for common web page types;
+ // for the available models, refer to https://scrapfly.io/docs/extraction-api/automatic-ai#models
+ const extractionResult = await client.extract(
+     new ExtractionConfig({
+         body: html, // pass the scraped HTML content
+         content_type: 'text/html', // content data type
+         charset: 'utf-8', // charset of the passed content; use `auto` if you aren't sure
+         url: 'https://web-scraping.dev/reviews', // when passed, relative URLs in the document are transformed into absolute URLs automatically
+         extraction_model: 'review_list',
+     }),
+ );
+
+ // extraction result
+ console.log(extractionResult.data);
+ `
+ {
+     ....
+     reviews: [
+         {
+             author_name: null,
+             content: "Unique flavor and great energy boost. It's the perfect gamer's drink!",
+             date_published_formatted: '2023-05-18',
+             date_published_raw: '2023-05-18',
+             rating: null,
+             sentiment: 'POSITIVE',
+             sentiment_probability: 0.85,
+             title: null,
+             verified: null
+         },
+         ....
+     ]
+ }
+ `
+
+ // result content type
+ console.log(extractionResult.content_type);
+ `application/json`
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+ /*
+ This example shows how to use Scrapfly's extraction prompts for document RAG processing (question answering)
+ */
+ import { ScrapflyClient, ScrapeConfig, ExtractionConfig } from 'scrapfly-sdk';
+
+ const key = 'YOUR SCRAPFLY KEY';
+ const client = new ScrapflyClient({ key });
+
+ // First, scrape the web page to retrieve its HTML
+ const scrapeResult = await client.scrape(
+     new ScrapeConfig({
+         url: 'https://web-scraping.dev/products',
+         render_js: true,
+     }),
+ );
+
+ // raw HTML content
+ const html = scrapeResult.result.content;
+
+ // Second, pass the HTML and an extraction prompt
+ // In this example, we'll ask a question about the data
+ const extractionResult = await client.extract(
+     new ExtractionConfig({
+         body: html, // pass the HTML content
+         content_type: 'text/html', // data content type
+         charset: 'utf-8', // charset of the passed content; use `auto` if you aren't sure
+         extraction_prompt: 'what is the flavor of the dark energy potion?', // LLM extraction prompt
+     }),
+ );
+
+ // extraction result
+ console.log(extractionResult.data);
+ `The document mentions the flavor of the Dark Red Energy Potion is **bold cherry cola**.`;
+
+ // result content type
+ console.log(extractionResult.content_type);
+ `text/plain`;
