Skip to content

Commit 6d05f11

Browse files
committed
support document compression with the extraction api
1 parent 9d123b5 commit 6d05f11

File tree

4 files changed

+161
-43
lines changed

4 files changed

+161
-43
lines changed
Lines changed: 62 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
1-
import { ExtractionConfig } from '../../src/extractionconfig.js';
1+
import { gzip } from 'zlib';
2+
import { ExtractionConfig, CompressionFormat } from '../../src/extractionconfig.js';
23
import { describe, it, expect } from '@jest/globals';
4+
import { promisify } from 'util';
5+
import { ExtractionConfigError } from '../../src/errors.js';
36

7+
const gzipPromise = promisify(gzip);
48
const input_html = 'very long html file';
59
const input_content_type = 'text/html';
610

@@ -13,109 +17,136 @@ describe('extactionconfig', () => {
1317
});
1418

1519
describe('url param generation', () => {
16-
it('loads', () => {
20+
it('basic config', async () => {
1721
const config = new ExtractionConfig({ body: input_html, content_type: input_content_type });
18-
expect(config.body).toBe(input_html);
19-
expect(config.content_type).toBe(input_content_type);
20-
});
21-
22-
it('basic config', () => {
23-
const config = new ExtractionConfig({ body: input_html, content_type: input_content_type });
24-
const params = config.toApiParams({ key: '1234' });
22+
const params = await config.toApiParams({ key: '1234' });
2523
expect(params).toEqual({
2624
key: '1234',
27-
body: input_html,
2825
content_type: input_content_type,
2926
});
3027
});
3128

32-
it('sets url', () => {
29+
it('sets url', async () => {
3330
const config = new ExtractionConfig({
3431
body: input_html,
3532
content_type: input_content_type,
3633
url: 'https://web-scraping.dev/products',
3734
});
38-
const params = config.toApiParams({ key: '1234' });
35+
const params = await config.toApiParams({ key: '1234' });
3936
expect(params).toEqual({
4037
key: '1234',
41-
body: input_html,
4238
content_type: input_content_type,
4339
url: 'https://web-scraping.dev/products',
4440
});
4541
});
4642

47-
it('sets charset', () => {
43+
it('sets charset', async () => {
4844
const config = new ExtractionConfig({
4945
body: input_html,
5046
content_type: input_content_type,
5147
charset: 'utf-8',
5248
});
53-
const params = config.toApiParams({ key: '1234' });
49+
const params = await config.toApiParams({ key: '1234' });
5450
expect(params).toEqual({
5551
key: '1234',
56-
body: input_html,
5752
content_type: input_content_type,
5853
charset: 'utf-8',
5954
});
6055
});
6156

62-
it('sets template', () => {
57+
it('sets template', async () => {
6358
const config = new ExtractionConfig({
6459
body: input_html,
6560
content_type: input_content_type,
6661
template: 'my_template',
6762
});
68-
const params = config.toApiParams({ key: '1234' });
63+
const params = await config.toApiParams({ key: '1234' });
6964
expect(params).toEqual({
7065
key: '1234',
71-
body: input_html,
7266
content_type: input_content_type,
7367
extraction_template: 'my_template',
7468
});
7569
});
7670

77-
it('sets epehemeral_template', () => {
71+
it('sets epehemeral_template', async () => {
7872
const config = new ExtractionConfig({
7973
body: input_html,
8074
content_type: input_content_type,
8175
epehemeral_template: { source: 'html', selectors: [] },
8276
});
83-
const params = config.toApiParams({ key: '1234' });
77+
const params = await config.toApiParams({ key: '1234' });
8478
expect(params).toEqual({
8579
key: '1234',
86-
body: input_html,
8780
content_type: input_content_type,
8881
extraction_template: 'ephemeral:eyJzb3VyY2UiOiJodG1sIiwic2VsZWN0b3JzIjpbXX0',
8982
});
9083
});
9184

92-
it('sets extraction_prompt', () => {
85+
it('sets extraction_prompt', async () => {
9386
const config = new ExtractionConfig({
9487
body: input_html,
9588
content_type: input_content_type,
96-
extraction_prompt: 'summarize the document'
89+
extraction_prompt: 'summarize the document',
9790
});
98-
const params = config.toApiParams({ key: '1234' });
91+
const params = await config.toApiParams({ key: '1234' });
9992
expect(params).toEqual({
10093
key: '1234',
94+
content_type: input_content_type,
95+
extraction_prompt: 'summarize the document',
96+
});
97+
});
98+
99+
it('sets extraction_model', async () => {
100+
const config = new ExtractionConfig({
101101
body: input_html,
102102
content_type: input_content_type,
103-
extraction_prompt: 'summarize the document'
103+
extraction_model: 'review_list',
104+
});
105+
const params = await config.toApiParams({ key: '1234' });
106+
expect(params).toEqual({
107+
key: '1234',
108+
content_type: input_content_type,
109+
extraction_model: 'review_list',
104110
});
105111
});
106112

107-
it('sets extraction_model', () => {
113+
it('compresses body', async () => {
108114
const config = new ExtractionConfig({
109115
body: input_html,
110116
content_type: input_content_type,
111-
extraction_model: 'review_list'
117+
document_compression_format: CompressionFormat.GZIP,
118+
is_document_compressed: false,
112119
});
113-
const params = config.toApiParams({ key: '1234' });
120+
const params = await config.toApiParams({ key: '1234' });
114121
expect(params).toEqual({
115122
key: '1234',
123+
content_type: input_content_type,
124+
});
125+
expect(config.body).toEqual(await gzipPromise(Buffer.from(input_html as string, 'utf-8')));
126+
});
127+
128+
it('fails to missing compression state with delcated compression format', async () => {
129+
const config = new ExtractionConfig({
116130
body: input_html,
117131
content_type: input_content_type,
118-
extraction_model: 'review_list'
132+
document_compression_format: CompressionFormat.GZIP,
119133
});
120-
});
134+
135+
await expect(async () => {
136+
await config.toApiParams({ key: '1234' });
137+
}).rejects.toThrow(ExtractionConfigError);
138+
});
139+
140+
it('fails to unsupported auto compression format', async () => {
141+
const config = new ExtractionConfig({
142+
body: input_html,
143+
content_type: input_content_type,
144+
document_compression_format: CompressionFormat.ZSTD,
145+
is_document_compressed: false
146+
});
147+
148+
await expect(async () => {
149+
await config.toApiParams({ key: '1234' });
150+
}).rejects.toThrow(ExtractionConfigError);
151+
});
121152
});
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
2+
This example shows how to use document compression with Scrapfly's Extraction API
3+
*/
4+
import { ScrapflyClient, ScrapeConfig, ExtractionConfig } from 'scrapfly-sdk';
5+
6+
const key = 'YOUR SCRAPFLY KEY';
7+
const client = new ScrapflyClient({ key });
8+
9+
// First, scrape the web page to retrieve its HTML
10+
const scrapeResult = await client.scrape(
11+
new ScrapeConfig({
12+
url: 'https://web-scraping.dev/reviews',
13+
render_js: true,
14+
auto_scroll: true
15+
}),
16+
);
17+
18+
const html = scrapeResult.result.content;
19+
20+
const extractionResult = await client.extract(
21+
new ExtractionConfig({
22+
body: html, // pass the scraped HTML content
23+
content_type: 'text/html',
24+
charset: 'utf-8',
25+
extraction_model: 'review_list',
26+
is_document_compressed: false, // the document passed in is not compressed yet, so the SDK compresses it before sending
27+
document_compression_format: CompressionFormat.GZIP // specify the compression format
28+
// If both is_document_compressed and document_compression_format are omitted, the raw HTML is sent uncompressed
29+
// If is_document_compressed is set to false and document_compression_format is set to GZIP, the SDK automatically compresses the document with gzip
30+
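// If document_compression_format is set to ZSTD or DEFLATE, the document passed to ExtractionConfig must already be compressed manually and is_document_compressed set to true (automatic compression is only available for GZIP; otherwise an ExtractionConfigError is thrown)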
31+
}),
32+
);
33+
34+
// extraction result
35+
console.log(extractionResult.data);
36+
37+
// result content type
38+
console.log(extractionResult.content_type);

src/client.ts

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -361,17 +361,21 @@ export class ScrapflyClient {
361361
let response;
362362
try {
363363
const url = new URL(this.HOST + '/extraction');
364-
const params = config.toApiParams({ key: this.key });
364+
const params = await config.toApiParams({ key: this.key });
365365
url.search = new URLSearchParams(params).toString();
366+
const headers: Record<string, string> = {
367+
'user-agent': this.ua,
368+
'accept-encoding': 'gzip, deflate, br',
369+
'content-type': config.content_type,
370+
accept: 'application/json',
371+
};
372+
if (config.document_compression_format && config.document_compression_format) {
373+
headers['content-encoding'] = config.document_compression_format;
374+
}
366375
response = await this.fetch(
367376
new Request(url.toString(), {
368377
method: 'POST',
369-
headers: {
370-
'user-agent': this.ua,
371-
'accept-encoding': 'gzip, deflate, br',
372-
'content-type': config.content_type,
373-
accept: 'application/json',
374-
},
378+
headers: headers,
375379
body: config.body,
376380
}),
377381
);

src/extractionconfig.ts

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,37 @@
1+
import { gzip } from 'zlib';
12
import { ExtractionConfigError } from './errors.js';
3+
import { errors } from './main.js';
24
import { urlsafe_b64encode } from './utils.js';
5+
import { promisify } from 'util';
6+
7+
const gzipPromise = promisify(gzip);
8+
9+
export enum CompressionFormat {
10+
/**
11+
Document compression format.
12+
13+
Attributes:
14+
GZIP: gzip format.
15+
ZSTD: zstd format.
16+
DEFLATE: deflate.
17+
Only GZIP can be compressed automatically by the SDK.
18+
*/
19+
GZIP = 'gzip',
20+
ZSTD = 'zstd',
21+
DEFLATE = 'deflate',
22+
}
323

424
export class ExtractionConfig {
5-
body: string;
25+
body: string | Buffer;
626
content_type: string;
727
url?: string = null;
828
charset?: string = null;
929
template?: string; // saved template name
1030
epehemeral_template?: object; // ephemerally declared JSON template
1131
extraction_prompt?: string = null;
1232
extraction_model?: string = null;
33+
is_document_compressed?: boolean = null;
34+
document_compression_format?: CompressionFormat = null;
1335
webhook?: string = null;
1436

1537
constructor(options: {
@@ -21,6 +43,8 @@ export class ExtractionConfig {
2143
epehemeral_template?: object; // ephemerally declared JSON template
2244
extraction_prompt?: string;
2345
extraction_model?: string;
46+
is_document_compressed?: boolean;
47+
document_compression_format?: CompressionFormat;
2448
webhook?: string;
2549
}) {
2650
this.body = options.body;
@@ -31,14 +55,16 @@ export class ExtractionConfig {
3155
this.epehemeral_template = options.epehemeral_template;
3256
this.extraction_prompt = options.extraction_prompt;
3357
this.extraction_model = options.extraction_model;
58+
this.is_document_compressed = options.is_document_compressed;
59+
this.document_compression_format = options.document_compression_format;
3460
this.webhook = options.webhook;
3561
}
3662

37-
toApiParams(options: { key: string }): Record<string, any> {
63+
async toApiParams(options: { key: string }): Promise<Record<string, any>> {
3864
const params: Record<string, any> = {
3965
key: options.key,
4066
};
41-
params.body = this.body;
67+
// the body is intentionally not added to the URL params; it is read from this.body by the caller
4268
params.content_type = this.content_type;
4369

4470
if (this.url) {
@@ -50,7 +76,9 @@ export class ExtractionConfig {
5076
}
5177

5278
if (this.template && this.epehemeral_template) {
53-
throw new ExtractionConfigError('You cannot pass both parameters template and epehemeral_template. You must choose')
79+
throw new ExtractionConfigError(
80+
'You cannot pass both parameters template and epehemeral_template. You must choose',
81+
);
5482
}
5583

5684
if (this.template) {
@@ -69,10 +97,27 @@ export class ExtractionConfig {
6997
params.extraction_model = this.extraction_model;
7098
}
7199

100+
if (this.document_compression_format) {
101+
if (this.is_document_compressed == null) {
102+
throw new errors.ExtractionConfigError(
103+
'When declaring compression format, your must declare the is_document_compressed parameter to compress the document or skip it.',
104+
);
105+
}
106+
if (this.is_document_compressed == false) {
107+
if (this.document_compression_format == CompressionFormat.GZIP) {
108+
this.body = await gzipPromise(Buffer.from(this.body as string, 'utf-8'));
109+
} else {
110+
throw new errors.ExtractionConfigError(
111+
`Auto compression for ${this.document_compression_format} format isn't available. You can manually compress to ${this.document_compression_format} or choose the gzip format for auto compression`,
112+
);
113+
}
114+
}
115+
}
116+
72117
if (this.webhook) {
73118
params.webhook_name = this.webhook;
74119
}
75-
120+
76121
return params;
77122
}
78123
}

0 commit comments

Comments
 (0)