diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts index b5ff193..88f3721 100644 --- a/src/helpers/get-page-urls.ts +++ b/src/helpers/get-page-urls.ts @@ -679,17 +679,38 @@ export async function getUrlsFromSitemap( return deduplicated.slice(0, maxUrls); } +function isWwwVariant(hostname1: string, hostname2: string): boolean { + return hostname1 === `www.${hostname2}` || hostname2 === `www.${hostname1}`; +} + /** * Get the base URL for path-prefix filtering, accounting for cross-host redirects. * - * When a cross-host redirect is in play (e.g. example.com/docs → docs.example.com), + * When a true cross-host redirect is in play (e.g. example.com/docs → docs.example.com), * the original baseUrl path doesn't apply to the redirected host, so we return the * effectiveOrigin (a root URL) which makes path filtering a no-op. + * + * When the redirect is www-canonicalization (e.g. alchemy.com → www.alchemy.com), + * the path structure is preserved, so we transfer the baseUrl's path to the + * effective origin to keep path-prefix filtering active. */ export function getPathFilterBase(ctx: CheckContext): string { - return ctx.effectiveOrigin && ctx.effectiveOrigin !== ctx.origin - ? ctx.effectiveOrigin - : ctx.baseUrl; + if (!ctx.effectiveOrigin || ctx.effectiveOrigin === ctx.origin) { + return ctx.baseUrl; + } + + try { + const originalHost = new URL(ctx.origin).hostname; + const effectiveHost = new URL(ctx.effectiveOrigin).hostname; + if (isWwwVariant(originalHost, effectiveHost)) { + const basePath = new URL(ctx.baseUrl).pathname.replace(/\/$/, ''); + return basePath ? `${ctx.effectiveOrigin}${basePath}` : ctx.effectiveOrigin; + } + } catch { + // fall through + } + + return ctx.effectiveOrigin; } /** diff --git a/test/unit/helpers/get-page-urls.test.ts b/test/unit/helpers/get-page-urls.test.ts index 3726bae..6c5ddea 100644 --- a/test/unit/helpers/get-page-urls.test.ts +++ b/test/unit/helpers/get-page-urls.test.ts @@ -8,6 +8,7 @@ import { parseSitemapUrls, parseSitemapDirectives, filterByPathPrefix, + getPathFilterBase, filterLocaleSitemaps, filterLocalizedUrls, deduplicateVersionedUrls, @@ -155,6 +156,43 @@ describe('filterByPathPrefix', () => { }); }); +describe('getPathFilterBase', () => { + it('returns baseUrl when no effectiveOrigin is set', () => { + const ctx = createContext('https://example.com/docs', { requestDelay: 0 }); + expect(getPathFilterBase(ctx)).toBe('https://example.com/docs'); + }); + + it('returns baseUrl when effectiveOrigin matches origin', () => { + const ctx = createContext('https://example.com/docs', { requestDelay: 0 }); + ctx.effectiveOrigin = 'https://example.com'; + expect(getPathFilterBase(ctx)).toBe('https://example.com/docs'); + }); + + it('preserves subpath for www-canonicalization redirects', () => { + const ctx = createContext('https://alchemy.com/docs', { requestDelay: 0 }); + ctx.effectiveOrigin = 'https://www.alchemy.com'; + expect(getPathFilterBase(ctx)).toBe('https://www.alchemy.com/docs'); + }); + + it('preserves subpath when www is on the original origin', () => { + const ctx = createContext('https://www.example.com/docs', { requestDelay: 0 }); + ctx.effectiveOrigin = 'https://example.com'; + expect(getPathFilterBase(ctx)).toBe('https://example.com/docs'); + }); + + it('returns root effectiveOrigin for true cross-host redirects', () => { + const ctx = createContext('https://example.com/docs', { requestDelay: 0 }); + ctx.effectiveOrigin = 'https://docs.example.com'; + expect(getPathFilterBase(ctx)).toBe('https://docs.example.com'); + }); + + it('returns root effectiveOrigin for www redirect with root baseUrl', () => { + const ctx = createContext('https://alchemy.com', { requestDelay: 0 }); + ctx.effectiveOrigin = 'https://www.alchemy.com'; + expect(getPathFilterBase(ctx)).toBe('https://www.alchemy.com'); + }); +}); + describe('filterLocaleSitemaps', () => { it('filters to English sub-sitemaps when locale pattern detected in filenames', () => { const urls = [