4 changes: 4 additions & 0 deletions README.md
@@ -172,6 +172,9 @@ $ linkinator LOCATIONS [ --arguments ]
 --user-agent
 The user agent passed in all HTTP requests. Defaults to 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
 
+--body-regex
+Regex string with exactly one capturing group to extract extra URLs from the response body.
+
 --verbosity
 Override the default verbosity for this command. Available options are
 'debug', 'info', 'warning', 'error', and 'none'. Defaults to 'warning'.
@@ -284,6 +287,7 @@ where the server is started. Defaults to the path passed in `path`.
 - `directoryListing` (boolean) - Automatically serve a static file listing page when serving a directory. Defaults to `false`.
 - `urlRewriteExpressions` (array) - Collection of objects that contain a search pattern and a replacement.
 - `userAgent` (string) - The [user agent](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent) that should be passed with each request. This uses a reasonable default.
+- `bodyRegex` (string) - Regex string with exactly one capturing group to extract extra URLs from the response body.
 
 ### linkinator.LinkChecker()
 
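As an illustration of the documented option (not part of the diff): the same regex used in this PR's tests could be supplied from the CLI as `linkinator http://example.com --body-regex '"link":"([^"]+)"'`, or programmatically via the exported `check()` function. A minimal sketch of the programmatic form, with the URL and pattern chosen for the example:

```ts
import { check } from 'linkinator';

// Check a page, additionally treating whatever the regex's single
// capturing group matches in the response body as URLs to verify
// (e.g. links embedded in a JSON <script> blob that the HTML parser
// would otherwise never see).
const results = await check({
  path: 'http://example.com',
  bodyRegex: '"link":"([^"]+)"',
});
console.log(results.passed, results.links.length);
```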
5 changes: 5 additions & 0 deletions src/cli.ts
@@ -129,6 +129,11 @@ const parser = yargs(hideBin(process.argv))
     defaultDescription: DEFAULT_OPTIONS.userAgent.toString(),
     describe: 'The user agent passed in all HTTP requests.',
   },
+  bodyRegex: {
+    type: 'string',
+    describe:
+      'Regex string with exactly one capturing group to extract extra URLs from the response body.',
+  },
   verbosity: {
     type: 'string',
     describe:
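Note the camelCase key: yargs' default camel-case expansion is what makes this available as the kebab-case `--body-regex` flag documented in the README. A standalone sketch of that mapping, assuming nothing about linkinator's actual CLI wiring beyond the option name:

```ts
import yargs from 'yargs';
import { hideBin } from 'yargs/helpers';

// With camel-case expansion (on by default), `--body-regex '...'`
// and `--bodyRegex '...'` both end up on argv.bodyRegex.
const argv = await yargs(hideBin(process.argv))
  .option('bodyRegex', {
    type: 'string',
    describe:
      'Regex string with exactly one capturing group to extract extra URLs from the response body.',
  })
  .parse();

console.log(argv.bodyRegex);
```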
2 changes: 1 addition & 1 deletion src/crawler.ts
@@ -444,7 +444,7 @@ export class LinkChecker extends EventEmitter {
     // If we need to go deeper, scan the next level of depth for links and crawl
     this.emit('pagestart', options.url);
 
-    const urlResults = await getLinks(response, options.url.href);
+    const urlResults = await getLinks(response, options);
 
     for (const result of urlResults) {
       // If there was some sort of problem parsing the link while
15 changes: 14 additions & 1 deletion src/links.ts
@@ -1,6 +1,7 @@
 import { Stream } from 'node:stream';
 import { WritableStream } from 'htmlparser2/WritableStream';
 import { parseSrcset } from 'srcset';
+import type { CrawlOptions } from './crawler.ts';
 import type { ElementMetadata } from './types.ts';
 import { isCSS } from './utils.ts';
 
@@ -50,11 +51,13 @@ export type ParsedUrl = {
 
 export async function getLinks(
   response: Response,
-  baseUrl: string,
+  options: CrawlOptions,
 ): Promise<ParsedUrl[]> {
   let source: ReadableStream;
+  const baseUrl = options.url.href;
   let realBaseUrl = baseUrl;
   let baseSet = false;
+  const bodyText = await response.clone().text();
 
   if (!response.body) {
     return [];
@@ -192,6 +195,16 @@
 
     rs.pipe(parser).on('finish', resolve).on('error', reject);
   });
+
+  // Extract additional links from body via user-provided regex
+  const bodyRegex = options.checkOptions.bodyRegex;
+  if (bodyRegex && bodyText) {
+    const regex = new RegExp(bodyRegex, 'g');
+    for (const match of bodyText.matchAll(regex)) {
+      if (match[1]) links.push(parseLink(match[1], realBaseUrl));
+    }
+  }
+
   return links;
 }
 
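To make the new extraction step concrete: a self-contained sketch of what the `matchAll` loop does with a fixture-style body (plain TypeScript, no linkinator internals; the sample JSON mirrors the test fixtures below):

```ts
// URLs hidden inside a JSON <script> blob are invisible to the HTML
// parser, but reachable through the regex's single capturing group.
const bodyText =
  '{"links":[{"label":"X","link":"http://example.invalid/1"},' +
  '{"label":"Y","link":"http://example.invalid/2"}]}';

// The 'g' flag is required for String.prototype.matchAll.
const regex = new RegExp('"link":"([^"]+)"', 'g');
const extra: string[] = [];
for (const match of bodyText.matchAll(regex)) {
  // match[0] is the full match; match[1] is the capturing group.
  if (match[1]) extra.push(match[1]);
}
// extra → ['http://example.invalid/1', 'http://example.invalid/2']
```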
1 change: 1 addition & 0 deletions src/options.ts
@@ -20,6 +20,7 @@ export type SharedOptions = {
   retryErrorsJitter?: number;
   extraHeaders?: { [key: string]: string };
   userAgent?: string;
+  bodyRegex?: string;
 };
 
 export type UrlRewriteExpression = {
9 changes: 9 additions & 0 deletions test/fixtures/regex/index.html
@@ -0,0 +1,9 @@
+<html>
+<head></head>
+<body>
+  <div>
+    <a href="http://example.invalid">Normal Link</a>
+  </div>
+  <script type="application/json">{"test": true,"links":[{"label":"X","link":"http://example.invalid/1","xyz":123},{"label":"Y","link":"http://example.invalid/2"},{"label":"Z","link":"http://example.invalid/3"}]}</script>
+</body>
+</html>
9 changes: 9 additions & 0 deletions test/fixtures/regexDuplicate/index.html
@@ -0,0 +1,9 @@
+<html>
+<head></head>
+<body>
+  <div>
+    <a href="http://example.invalid">Normal Link</a>
+  </div>
+  <script type="application/json">{"links":[{"label":"X","link":"http://example.invalid/1"},{"label":"Y","link":"http://example.invalid/1"}]}</script>
+</body>
+</html>
7 changes: 7 additions & 0 deletions test/fixtures/regexInvalid/index.html
@@ -0,0 +1,7 @@
+<html>
+<body>
+  <div>
+  </div>
+  <script type="application/json">{"test": true,"links":[{"label":"X","link":"abc","xyz":123},{"label":"Y","link":"http://example.invalid/1"}]}</script>
+</body>
+</html>
86 changes: 86 additions & 0 deletions test/test.index.ts
@@ -649,6 +649,92 @@ describe('linkinator', () => {
     spy.mockRestore();
   });
 
+  describe('bodyRegex', () => {
+    it('should find links via regex when provided', async () => {
+      const scope = nock('http://example.invalid')
+        .head('/')
+        .reply(200)
+        .head('/1')
+        .reply(200)
+        .head('/2')
+        .reply(200)
+        .head('/3')
+        .reply(200);
+      const results = await check({
+        path: 'test/fixtures/regex',
+        bodyRegex: '"link":"([^"]+)"',
+      });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 5);
+      scope.done();
+    });
+
+    it('should handle non-matching regex', async () => {
+      const scope = nock('http://example.invalid').head('/').reply(200);
+      const results = await check({
+        path: 'test/fixtures/regex',
+        bodyRegex: '"abc":"([^"]+)"',
+      });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 2);
+      scope.done();
+    });
+
+    it('should handle duplicate regex matches', async () => {
+      const scope = nock('http://example.invalid')
+        .head('/')
+        .reply(200)
+        .head('/1')
+        .reply(200);
+      const results = await check({
+        path: 'test/fixtures/regexDuplicate',
+        bodyRegex: '"link":"([^"]+)"',
+      });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 3);
+      scope.done();
+    });
+
+    it('should ignore links in body when no regex provided', async () => {
+      const scope = nock('http://example.invalid').head('/').reply(200);
+      const results = await check({ path: 'test/fixtures/regex' });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 2);
+      scope.done();
+    });
+
+    it('should handle invalid URLs matched by regex', async () => {
+      const scope = nock('http://example.invalid').head('/1').reply(200);
+      const results = await check({
+        path: 'test/fixtures/regexInvalid',
+        bodyRegex: '"link":"([^"]+)"',
+      });
+      assert.ok(!results.passed);
+      assert.strictEqual(results.links.length, 3);
+      assert.ok(results.links[2].url.endsWith('/abc'));
+      assert.strictEqual(results.links[2].state, 'BROKEN');
+      scope.done();
+    });
+
+    it('should get specific links by regex', async () => {
+      const scope = nock('http://example.invalid')
+        .head('/')
+        .reply(200)
+        .head('/1')
+        .reply(200)
+        .head('/3')
+        .reply(200);
+      const results = await check({
+        path: 'test/fixtures/regex',
+        bodyRegex:
+          '"link":"(https?://(?:example.invalid/1|example.invalid/3)[^"]*)"',
+      });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 4);
+      scope.done();
+    });
+  });
+
   describe('element metadata', () => {
     it('should provide <a> text in results', async () => {
       const scope = nock('http://example.invalid').head('/').reply(404);
13 changes: 10 additions & 3 deletions test/test.links.ts
@@ -1,4 +1,5 @@
 import { describe, expect, it } from 'vitest';
+import type { CrawlOptions } from '../src/crawler.ts';
 import { getLinks } from '../src/links.js';
 
 describe('getLinks', () => {
@@ -12,11 +13,17 @@ describe('getLinks', () => {
     const response = {
       body,
       headers: new Headers({ 'content-type': 'text/html' }),
+      clone: () => ({
+        text: () => '',
+      }),
     } as unknown as Response;
 
     // Expect getLinks to reject with our error,
-    await expect(getLinks(response, 'http://example.invalid')).rejects.toThrow(
-      'StreamError',
-    );
+    await expect(
+      getLinks(response, {
+        url: { href: 'http://example.invalid' },
+        checkOptions: {},
+      } as unknown as CrawlOptions),
+    ).rejects.toThrow('StreamError');
   });
 });