diff --git a/README.md b/README.md
index 665759af..f4373e27 100644
--- a/README.md
+++ b/README.md
@@ -172,6 +172,9 @@ $ linkinator LOCATIONS [ --arguments ]
     --user-agent
         The user agent passed in all HTTP requests. Defaults to 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
 
+    --body-regex
+        Regex string with exactly one capturing group to extract extra URLs from the response body.
+
     --verbosity
         Override the default verbosity for this command. Available options are 'debug', 'info', 'warning', 'error', and 'none'. Defaults to 'warning'.
 
@@ -284,6 +287,7 @@ where the server is started. Defaults to the path passed in `path`.
 - `directoryListing` (boolean) - Automatically serve a static file listing page when serving a directory. Defaults to `false`.
 - `urlRewriteExpressions` (array) - Collection of objects that contain a search pattern, and replacement.
 - `userAgent` (string) - The [user agent](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent) that should be passed with each request. This uses a reasonable default.
+- `bodyRegex` (string) - Regex string with exactly one capturing group to extract extra URLs from the response body.
 
 ### linkinator.LinkChecker()
 
diff --git a/src/cli.ts b/src/cli.ts
index 2ed45e27..7f784c19 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -129,6 +129,11 @@ const parser = yargs(hideBin(process.argv))
       defaultDescription: DEFAULT_OPTIONS.userAgent.toString(),
       describe: 'The user agent passed in all HTTP requests.',
     },
+    bodyRegex: {
+      type: 'string',
+      describe:
+        'Regex string with exactly one capturing group to extract extra URLs from the response body.',
+    },
     verbosity: {
       type: 'string',
       describe:
diff --git a/src/crawler.ts b/src/crawler.ts
index 8f82c501..edfa5f37 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -444,7 +444,7 @@ export class LinkChecker extends EventEmitter {
 
       // If we need to go deeper, scan the next level of depth for links and crawl
       this.emit('pagestart', options.url);
-      const urlResults = await getLinks(response, options.url.href);
+      const urlResults = await getLinks(response, options);
 
       for (const result of urlResults) {
         // If there was some sort of problem parsing the link while
diff --git a/src/links.ts b/src/links.ts
index 0de14599..057bb173 100644
--- a/src/links.ts
+++ b/src/links.ts
@@ -1,6 +1,7 @@
 import { Stream } from 'node:stream';
 import { WritableStream } from 'htmlparser2/WritableStream';
 import { parseSrcset } from 'srcset';
+import type { CrawlOptions } from './crawler.ts';
 import type { ElementMetadata } from './types.ts';
 import { isCSS } from './utils.ts';
 
@@ -50,11 +51,13 @@ export type ParsedUrl = {
 export async function getLinks(
   response: Response,
-  baseUrl: string,
+  options: CrawlOptions,
 ): Promise<ParsedUrl[]> {
   let source: ReadableStream;
+  const baseUrl = options.url.href;
   let realBaseUrl = baseUrl;
   let baseSet = false;
+  const bodyText = await response.clone().text();
 
   if (!response.body) {
     return [];
   }
@@ -192,6 +195,16 @@
     rs.pipe(parser).on('finish', resolve).on('error', reject);
   });
+
+  // Extract additional links from body via user-provided regex
+  const bodyRegex = options.checkOptions.bodyRegex;
+  if (bodyRegex && bodyText) {
+    const regex = new RegExp(bodyRegex, 'g');
+    for (const match of bodyText.matchAll(regex)) {
+      if (match[1]) links.push(parseLink(match[1], realBaseUrl));
+    }
+  }
+
   return links;
 }
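For context on the extraction step added to `src/links.ts` above: a global regex with a single capturing group is run over the cloned response body, and every first capture is pushed through the normal link-parsing path. A minimal standalone sketch of that behavior follows; the sample body and pattern are illustrative only, not taken from the fixtures.

```ts
// Sketch: how matchAll() with one capturing group pulls URLs out of a body.
// The body text and pattern below are made-up examples.
const bodyText =
  '{"link":"https://example.invalid/1"},{"link":"https://example.invalid/2"}';
const bodyRegex = '"link":"([^"]+)"'; // exactly one capturing group, as the option requires

const regex = new RegExp(bodyRegex, 'g'); // 'g' flag so matchAll() iterates every match
const extraUrls: string[] = [];
for (const match of bodyText.matchAll(regex)) {
  // match[1] is the text captured by the single group
  if (match[1]) extraUrls.push(match[1]);
}

console.log(extraUrls);
// => [ 'https://example.invalid/1', 'https://example.invalid/2' ]
```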
diff --git a/src/options.ts b/src/options.ts
index 9176971d..cfcd4506 100644
--- a/src/options.ts
+++ b/src/options.ts
@@ -20,6 +20,7 @@ export type SharedOptions = {
   retryErrorsJitter?: number;
   extraHeaders?: { [key: string]: string };
   userAgent?: string;
+  bodyRegex?: string;
 };
 
 export type UrlRewriteExpression = {
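With `bodyRegex` added to `SharedOptions`, the same behavior is reachable from the Node API as well as the `--body-regex` CLI flag. A rough usage sketch, where the target URL and pattern are hypothetical:

```ts
import { check } from 'linkinator';

// Crawl a page whose body embeds URLs as JSON strings such as
// {"link":"https://example.invalid/page"} instead of <a> tags.
// The capturing group extracts those URLs so they get checked too.
const results = await check({
  path: 'https://example.invalid',
  bodyRegex: '"link":"([^"]+)"',
});

console.log(`Scanned ${results.links.length} links`);
console.log(results.passed ? 'All links passed' : 'Some links failed');
```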
diff --git a/test/fixtures/regex/index.html b/test/fixtures/regex/index.html
new file mode 100644
index 00000000..528377e7
--- /dev/null
+++ b/test/fixtures/regex/index.html
@@ -0,0 +1,9 @@
+
+
+
+
+Normal Link
+
+
+
+
\ No newline at end of file
diff --git a/test/fixtures/regexDuplicate/index.html b/test/fixtures/regexDuplicate/index.html
new file mode 100644
index 00000000..420d77c7
--- /dev/null
+++ b/test/fixtures/regexDuplicate/index.html
@@ -0,0 +1,9 @@
+
+
+
+
+Normal Link
+
+
+
+
\ No newline at end of file
diff --git a/test/fixtures/regexInvalid/index.html b/test/fixtures/regexInvalid/index.html
new file mode 100644
index 00000000..c3857cbb
--- /dev/null
+++ b/test/fixtures/regexInvalid/index.html
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/test/test.index.ts b/test/test.index.ts
index 57061679..b2dcce5c 100644
--- a/test/test.index.ts
+++ b/test/test.index.ts
@@ -649,6 +649,92 @@ describe('linkinator', () => {
     spy.mockRestore();
   });
 
+  describe('bodyRegex', () => {
+    it('should find links via regex when provided', async () => {
+      const scope = nock('http://example.invalid')
+        .head('/')
+        .reply(200)
+        .head('/1')
+        .reply(200)
+        .head('/2')
+        .reply(200)
+        .head('/3')
+        .reply(200);
+      const results = await check({
+        path: 'test/fixtures/regex',
+        bodyRegex: '"link":"([^"]+)"',
+      });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 5);
+      scope.done();
+    });
+
+    it('should handle non-matching regex', async () => {
+      const scope = nock('http://example.invalid').head('/').reply(200);
+      const results = await check({
+        path: 'test/fixtures/regex',
+        bodyRegex: '"abc":"([^"]+)"',
+      });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 2);
+      scope.done();
+    });
+
+    it('should handle duplicate regex matches', async () => {
+      const scope = nock('http://example.invalid')
+        .head('/')
+        .reply(200)
+        .head('/1')
+        .reply(200);
+      const results = await check({
+        path: 'test/fixtures/regexDuplicate',
+        bodyRegex: '"link":"([^"]+)"',
+      });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 3);
+      scope.done();
+    });
+
+    it('should ignore links in body when no regex provided', async () => {
+      const scope = nock('http://example.invalid').head('/').reply(200);
+      const results = await check({ path: 'test/fixtures/regex' });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 2);
+      scope.done();
+    });
+
+    it('should handle invalid URLs matched by regex', async () => {
+      const scope = nock('http://example.invalid').head('/1').reply(200);
+      const results = await check({
+        path: 'test/fixtures/regexInvalid',
+        bodyRegex: '"link":"([^"]+)"',
+      });
+      assert.ok(!results.passed);
+      assert.strictEqual(results.links.length, 3);
+      assert.ok(results.links[2].url.endsWith('/abc'));
+      assert.strictEqual(results.links[2].state, 'BROKEN');
+      scope.done();
+    });
+
+    it('should get specific links by regex', async () => {
+      const scope = nock('http://example.invalid')
+        .head('/')
+        .reply(200)
+        .head('/1')
+        .reply(200)
+        .head('/3')
+        .reply(200);
+      const results = await check({
+        path: 'test/fixtures/regex',
+        bodyRegex:
+          '"link":"(https?://(?:example.invalid/1|example.invalid/3)[^"]*)"',
+      });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 4);
+      scope.done();
+    });
+  });
+
   describe('element metadata', () => {
     it('should provide text in results', async () => {
       const scope = nock('http://example.invalid').head('/').reply(404);
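One detail behind the 'should handle invalid URLs matched by regex' expectations above: captures go through the same `parseLink()` resolution as ordinary hrefs, so a match that is not an absolute URL is resolved against the page's base URL and then checked like any other link, showing up as `BROKEN` if it cannot be fetched. A small sketch of that resolution, using a hypothetical captured value:

```ts
// Sketch: a non-absolute regex capture is resolved against the page's base
// URL; this mirrors the WHATWG URL resolution that parseLink() performs.
const realBaseUrl = 'http://example.invalid/'; // base URL of the page being scanned
const captured = 'abc'; // hypothetical capture that is not an absolute URL

const resolved = new URL(captured, realBaseUrl).href;
console.log(resolved);
// => 'http://example.invalid/abc' (checked like any link, BROKEN if unreachable)
```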
diff --git a/test/test.links.ts b/test/test.links.ts
index 4fbcfa29..c5981bf4 100644
--- a/test/test.links.ts
+++ b/test/test.links.ts
@@ -1,4 +1,5 @@
 import { describe, expect, it } from 'vitest';
+import type { CrawlOptions } from '../src/crawler.ts';
 import { getLinks } from '../src/links.js';
 
 describe('getLinks', () => {
@@ -12,11 +13,17 @@
     const response = {
       body,
       headers: new Headers({ 'content-type': 'text/html' }),
+      clone: () => ({
+        text: () => '',
+      }),
     } as unknown as Response;
 
     // Expect getLinks to reject with our error,
-    await expect(getLinks(response, 'http://example.invalid')).rejects.toThrow(
-      'StreamError',
-    );
+    await expect(
+      getLinks(response, {
+        url: { href: 'http://example.invalid' },
+        checkOptions: {},
+      } as unknown as CrawlOptions),
+    ).rejects.toThrow('StreamError');
   });
 });