Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 25 additions & 4 deletions src/helpers/get-page-urls.ts
Original file line number Diff line number Diff line change
Expand Up @@ -679,17 +679,38 @@ export async function getUrlsFromSitemap(
return deduplicated.slice(0, maxUrls);
}

/**
 * Tells whether two hostnames differ only by a leading `www.` label,
 * in either direction (e.g. `example.com` and `www.example.com`).
 * Identical hostnames are not considered variants of each other.
 */
function isWwwVariant(hostname1: string, hostname2: string): boolean {
  const withWww = (host: string): string => `www.${host}`;

  if (hostname1 === withWww(hostname2)) {
    return true;
  }
  return hostname2 === withWww(hostname1);
}

/**
 * Get the base URL for path-prefix filtering, accounting for cross-host redirects.
 *
 * When a true cross-host redirect is in play (e.g. example.com/docs → docs.example.com),
 * the original baseUrl path doesn't apply to the redirected host, so we return the
 * effectiveOrigin (a root URL) which makes path filtering a no-op.
 *
 * When the redirect is www-canonicalization (e.g. alchemy.com → www.alchemy.com),
 * the path structure is preserved, so we transfer the baseUrl's path to the
 * effective origin to keep path-prefix filtering active.
 *
 * @param ctx - Check context; reads `origin`, `effectiveOrigin` and `baseUrl`.
 * @returns `ctx.baseUrl` when there is no redirect (no `effectiveOrigin`, or it
 *   equals `origin`); `effectiveOrigin` plus the baseUrl path for a www-only
 *   host change; otherwise `effectiveOrigin` itself.
 */
export function getPathFilterBase(ctx: CheckContext): string {
  // No redirect recorded, or it resolved to the same origin: filter as requested.
  if (!ctx.effectiveOrigin || ctx.effectiveOrigin === ctx.origin) {
    return ctx.baseUrl;
  }

  try {
    const originalHost = new URL(ctx.origin).hostname;
    const effectiveHost = new URL(ctx.effectiveOrigin).hostname;
    if (isWwwVariant(originalHost, effectiveHost)) {
      // Drop one trailing slash so a root path ("/") collapses to an empty
      // string and we return the bare effective origin in that case.
      const basePath = new URL(ctx.baseUrl).pathname.replace(/\/$/, '');
      return basePath ? `${ctx.effectiveOrigin}${basePath}` : ctx.effectiveOrigin;
    }
  } catch {
    // One of the URLs could not be parsed; fall through and treat the
    // redirect as a true cross-host redirect.
  }

  return ctx.effectiveOrigin;
}

/**
Expand Down
38 changes: 38 additions & 0 deletions test/unit/helpers/get-page-urls.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import {
parseSitemapUrls,
parseSitemapDirectives,
filterByPathPrefix,
getPathFilterBase,
filterLocaleSitemaps,
filterLocalizedUrls,
deduplicateVersionedUrls,
Expand Down Expand Up @@ -155,6 +156,43 @@ describe('filterByPathPrefix', () => {
});
});

describe('getPathFilterBase', () => {
  // Builds a context for `baseUrl`, optionally records the origin the site
  // redirected to, and returns the resulting path-filter base.
  const resolveBase = (baseUrl: string, effectiveOrigin?: string): string => {
    const context = createContext(baseUrl, { requestDelay: 0 });
    if (effectiveOrigin !== undefined) {
      context.effectiveOrigin = effectiveOrigin;
    }
    return getPathFilterBase(context);
  };

  it('returns baseUrl when no effectiveOrigin is set', () => {
    const result = resolveBase('https://example.com/docs');
    expect(result).toBe('https://example.com/docs');
  });

  it('returns baseUrl when effectiveOrigin matches origin', () => {
    const result = resolveBase('https://example.com/docs', 'https://example.com');
    expect(result).toBe('https://example.com/docs');
  });

  it('preserves subpath for www-canonicalization redirects', () => {
    const result = resolveBase('https://alchemy.com/docs', 'https://www.alchemy.com');
    expect(result).toBe('https://www.alchemy.com/docs');
  });

  it('preserves subpath when www is on the original origin', () => {
    const result = resolveBase('https://www.example.com/docs', 'https://example.com');
    expect(result).toBe('https://example.com/docs');
  });

  it('returns root effectiveOrigin for true cross-host redirects', () => {
    const result = resolveBase('https://example.com/docs', 'https://docs.example.com');
    expect(result).toBe('https://docs.example.com');
  });

  it('returns root effectiveOrigin for www redirect with root baseUrl', () => {
    const result = resolveBase('https://alchemy.com', 'https://www.alchemy.com');
    expect(result).toBe('https://www.alchemy.com');
  });
});

describe('filterLocaleSitemaps', () => {
it('filters to English sub-sitemaps when locale pattern detected in filenames', () => {
const urls = [
Expand Down