4 changes: 4 additions & 0 deletions README.md
@@ -172,6 +172,9 @@ $ linkinator LOCATIONS [ --arguments ]
 --user-agent
 The user agent passed in all HTTP requests. Defaults to 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
 
+--body-regex
+Regex string with exactly one capturing group to extract extra URLs from the response body.
+
 --verbosity
 Override the default verbosity for this command. Available options are
 'debug', 'info', 'warning', 'error', and 'none'. Defaults to 'warning'.
@@ -284,6 +287,7 @@ where the server is started. Defaults to the path passed in `path`.
 - `directoryListing` (boolean) - Automatically serve a static file listing page when serving a directory. Defaults to `false`.
 - `urlRewriteExpressions` (array) - Collection of objects that contain a search pattern and a replacement.
 - `userAgent` (string) - The [user agent](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent) that should be passed with each request. This uses a reasonable default.
+- `bodyRegex` (string) - Regex string with exactly one capturing group to extract extra URLs from the response body.
 
 ### linkinator.LinkChecker()
 
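As an illustration of the documented option (not part of the diff): the same regex used in this PR's tests could be supplied from the CLI as `linkinator http://example.com --body-regex '"link":"([^"]+)"'`, or programmatically via the exported `check()` function. A minimal sketch of the programmatic form, with the URL and pattern chosen for the example:

```ts
import { check } from 'linkinator';

// Check a page, additionally treating whatever the regex's single
// capturing group matches in the response body as URLs to verify
// (e.g. links embedded in a JSON <script> blob that the HTML parser
// would otherwise never see).
const results = await check({
  path: 'http://example.com',
  bodyRegex: '"link":"([^"]+)"',
});
console.log(results.passed, results.links.length);
```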
5 changes: 5 additions & 0 deletions src/cli.ts
@@ -129,6 +129,11 @@ const parser = yargs(hideBin(process.argv))
     defaultDescription: DEFAULT_OPTIONS.userAgent.toString(),
     describe: 'The user agent passed in all HTTP requests.',
   },
+  bodyRegex: {
+    type: 'string',
+    describe:
+      'Regex string with exactly one capturing group to extract extra URLs from the response body.',
+  },
   verbosity: {
     type: 'string',
     describe:
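Note the camelCase key: yargs' default camel-case expansion is what makes this available as the kebab-case `--body-regex` flag documented in the README. A standalone sketch of that mapping, assuming nothing about linkinator's actual CLI wiring beyond the option name:

```ts
import yargs from 'yargs';
import { hideBin } from 'yargs/helpers';

// With camel-case expansion (on by default), `--body-regex '...'`
// and `--bodyRegex '...'` both end up on argv.bodyRegex.
const argv = await yargs(hideBin(process.argv))
  .option('bodyRegex', {
    type: 'string',
    describe:
      'Regex string with exactly one capturing group to extract extra URLs from the response body.',
  })
  .parse();

console.log(argv.bodyRegex);
```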
2 changes: 1 addition & 1 deletion src/crawler.ts
@@ -444,7 +444,7 @@ export class LinkChecker extends EventEmitter {
     // If we need to go deeper, scan the next level of depth for links and crawl
     this.emit('pagestart', options.url);
 
-    const urlResults = await getLinks(response, options.url.href);
+    const urlResults = await getLinks(response, options);
 
     for (const result of urlResults) {
       // If there was some sort of problem parsing the link while
15 changes: 14 additions & 1 deletion src/links.ts
@@ -1,6 +1,7 @@
 import { Stream } from 'node:stream';
 import { WritableStream } from 'htmlparser2/WritableStream';
 import { parseSrcset } from 'srcset';
+import type { CrawlOptions } from './crawler.ts';
 import type { ElementMetadata } from './types.ts';
 import { isCSS } from './utils.ts';
 
@@ -50,11 +51,13 @@ export type ParsedUrl = {
 
 export async function getLinks(
   response: Response,
-  baseUrl: string,
+  options: CrawlOptions,
 ): Promise<ParsedUrl[]> {
   let source: ReadableStream;
+  const baseUrl = options.url.href;
   let realBaseUrl = baseUrl;
   let baseSet = false;
+  const bodyText = await response.clone().text();
 
   if (!response.body) {
     return [];
@@ -192,6 +195,16 @@
 
     rs.pipe(parser).on('finish', resolve).on('error', reject);
   });
+
+  // Extract additional links from body via user-provided regex
+  const bodyRegex = options.checkOptions.bodyRegex;
+  if (bodyRegex && bodyText) {
+    const regex = new RegExp(bodyRegex, 'g');
+    for (const match of bodyText.matchAll(regex)) {
+      if (match[1]) links.push(parseLink(match[1], realBaseUrl));
+    }
+  }
+
   return links;
 }
 
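To make the new extraction step concrete: a self-contained sketch of what the `matchAll` loop does with a fixture-style body (plain TypeScript, no linkinator internals; the sample JSON mirrors the test fixtures below):

```ts
// URLs hidden inside a JSON <script> blob are invisible to the HTML
// parser, but reachable through the regex's single capturing group.
const bodyText =
  '{"links":[{"label":"X","link":"http://example.invalid/1"},' +
  '{"label":"Y","link":"http://example.invalid/2"}]}';

// The 'g' flag is required for String.prototype.matchAll.
const regex = new RegExp('"link":"([^"]+)"', 'g');
const extra: string[] = [];
for (const match of bodyText.matchAll(regex)) {
  // match[0] is the full match; match[1] is the capturing group.
  if (match[1]) extra.push(match[1]);
}
// extra → ['http://example.invalid/1', 'http://example.invalid/2']
```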
1 change: 1 addition & 0 deletions src/options.ts
@@ -20,6 +20,7 @@ export type SharedOptions = {
   retryErrorsJitter?: number;
   extraHeaders?: { [key: string]: string };
   userAgent?: string;
+  bodyRegex?: string;
 };
 
 export type UrlRewriteExpression = {
9 changes: 9 additions & 0 deletions test/fixtures/regex/index.html
@@ -0,0 +1,9 @@
+<html>
+<head></head>
+<body>
+  <div>
+    <a href="http://example.invalid">Normal Link</a>
+  </div>
+  <script type="application/json">{"test": true,"links":[{"label":"X","link":"http://example.invalid/1","xyz":123},{"label":"Y","link":"http://example.invalid/2"},{"label":"Z","link":"http://example.invalid/3"}]}</script>
+</body>
+</html>
9 changes: 9 additions & 0 deletions test/fixtures/regexDuplicate/index.html
@@ -0,0 +1,9 @@
+<html>
+<head></head>
+<body>
+  <div>
+    <a href="http://example.invalid">Normal Link</a>
+  </div>
+  <script type="application/json">{"links":[{"label":"X","link":"http://example.invalid/1"},{"label":"Y","link":"http://example.invalid/1"}]}</script>
+</body>
+</html>
7 changes: 7 additions & 0 deletions test/fixtures/regexInvalid/index.html
@@ -0,0 +1,7 @@
+<html>
+<body>
+  <div>
+  </div>
+  <script type="application/json">{"test": true,"links":[{"label":"X","link":"abc","xyz":123},{"label":"Y","link":"http://example.invalid/1"}]}</script>
+</body>
+</html>
86 changes: 86 additions & 0 deletions test/test.index.ts
@@ -649,6 +649,92 @@ describe('linkinator', () => {
     spy.mockRestore();
   });
 
+  describe('bodyRegex', () => {
+    it('should find links via regex when provided', async () => {
+      const scope = nock('http://example.invalid')
+        .head('/')
+        .reply(200)
+        .head('/1')
+        .reply(200)
+        .head('/2')
+        .reply(200)
+        .head('/3')
+        .reply(200);
+      const results = await check({
+        path: 'test/fixtures/regex',
+        bodyRegex: '"link":"([^"]+)"',
+      });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 5);
+      scope.done();
+    });
+
+    it('should handle non-matching regex', async () => {
+      const scope = nock('http://example.invalid').head('/').reply(200);
+      const results = await check({
+        path: 'test/fixtures/regex',
+        bodyRegex: '"abc":"([^"]+)"',
+      });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 2);
+      scope.done();
+    });
+
+    it('should handle duplicate regex matches', async () => {
+      const scope = nock('http://example.invalid')
+        .head('/')
+        .reply(200)
+        .head('/1')
+        .reply(200);
+      const results = await check({
+        path: 'test/fixtures/regexDuplicate',
+        bodyRegex: '"link":"([^"]+)"',
+      });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 3);
+      scope.done();
+    });
+
+    it('should ignore links in body when no regex provided', async () => {
+      const scope = nock('http://example.invalid').head('/').reply(200);
+      const results = await check({ path: 'test/fixtures/regex' });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 2);
+      scope.done();
+    });
+
+    it('should handle invalid URLs matched by regex', async () => {
+      const scope = nock('http://example.invalid').head('/1').reply(200);
+      const results = await check({
+        path: 'test/fixtures/regexInvalid',
+        bodyRegex: '"link":"([^"]+)"',
+      });
+      assert.ok(!results.passed);
+      assert.strictEqual(results.links.length, 3);
+      assert.ok(results.links[2].url.endsWith('/abc'));
+      assert.strictEqual(results.links[2].state, 'BROKEN');
+      scope.done();
+    });
+
+    it('should get specific links by regex', async () => {
+      const scope = nock('http://example.invalid')
+        .head('/')
+        .reply(200)
+        .head('/1')
+        .reply(200)
+        .head('/3')
+        .reply(200);
+      const results = await check({
+        path: 'test/fixtures/regex',
+        bodyRegex:
+          '"link":"(https?://(?:example.invalid/1|example.invalid/3)[^"]*)"',
+      });
+      assert.ok(results.passed);
+      assert.strictEqual(results.links.length, 4);
+      scope.done();
+    });
+  });
+
   describe('element metadata', () => {
     it('should provide <a> text in results', async () => {
       const scope = nock('http://example.invalid').head('/').reply(404);
13 changes: 10 additions & 3 deletions test/test.links.ts
@@ -1,4 +1,5 @@
 import { describe, expect, it } from 'vitest';
+import type { CrawlOptions } from '../src/crawler.ts';
 import { getLinks } from '../src/links.js';
 
 describe('getLinks', () => {
@@ -12,11 +13,17 @@ describe('getLinks', () => {
     const response = {
       body,
       headers: new Headers({ 'content-type': 'text/html' }),
+      clone: () => ({
+        text: () => '',
+      }),
     } as unknown as Response;
 
     // Expect getLinks to reject with our error,
-    await expect(getLinks(response, 'http://example.invalid')).rejects.toThrow(
-      'StreamError',
-    );
+    await expect(
+      getLinks(response, {
+        url: { href: 'http://example.invalid' },
+        checkOptions: {},
+      } as unknown as CrawlOptions),
+    ).rejects.toThrow('StreamError');
   });
 });