diff --git a/src/checks/observability/llms-txt-freshness.ts b/src/checks/observability/llms-txt-freshness.ts index 3eaf1ce..0f46574 100644 --- a/src/checks/observability/llms-txt-freshness.ts +++ b/src/checks/observability/llms-txt-freshness.ts @@ -5,6 +5,7 @@ import { parseSitemapUrls, } from '../../helpers/get-page-urls.js'; import { isNonPageUrl } from '../../helpers/to-md-urls.js'; +import { isLocaleSegment, hasStructuralDuplication } from '../../helpers/locale-codes.js'; import type { CheckContext, CheckResult } from '../../types.js'; /** @@ -95,7 +96,7 @@ export function detectLocalePosition(urls: string[]): number | null { const segments = new URL(url).pathname.split('/').filter(Boolean); for (let i = 0; i < segments.length; i++) { const seg = segments[i].toLowerCase(); - if (/^[a-z]{2}(-[a-z]{2})?$/.test(seg)) { + if (isLocaleSegment(seg)) { if (!positionCounts.has(i)) positionCounts.set(i, new Map()); const counts = positionCounts.get(i)!; counts.set(seg, (counts.get(seg) ?? 0) + 1); @@ -107,6 +108,7 @@ export function detectLocalePosition(urls: string[]): number | null { } } + // First pass: ≥2 distinct locale codes covering >50% of URLs (strong signal) for (const [pos, counts] of positionCounts) { if (counts.size < 2) continue; const total = positionTotals.get(pos) ?? 0; @@ -115,6 +117,17 @@ export function detectLocalePosition(urls: string[]): number | null { } } + // Second pass: single locale code confirmed by structural duplication. + // With ISO 639-1 validation, a single code is meaningful when stripping it + // produces paths that match unprefixed URLs in the same set. 
/**
 * Report whether the path segment at index `position` of `url` is a locale
 * code, as judged by `isLocaleSegment`.
 *
 * Empty path segments are ignored when counting positions. Any failure while
 * parsing or inspecting the URL yields `false` rather than an exception.
 *
 * @param url - Absolute URL to inspect.
 * @param position - Zero-based index into the URL's non-empty path segments.
 * @returns `true` only when a segment exists at `position` and is a locale code.
 */
export function hasLocaleCodeAt(url: string, position: number): boolean {
  try {
    const pathParts = new URL(url).pathname.split('/').filter((part) => part !== '');
    if (pathParts.length <= position) return false;
    return isLocaleSegment(pathParts[position]);
  } catch {
    // Unparseable input cannot carry a locale prefix.
    return false;
  }
}

/**
 * Keep only the URLs that carry no locale code at `position`.
 *
 * Intended for the case where llms.txt describes the unprefixed default
 * locale, so locale-prefixed sitemap variants must be left out of the
 * coverage comparison.
 *
 * @param urls - Candidate URLs.
 * @param position - Zero-based path-segment index where a locale code would sit.
 * @returns A new array, in the original order, without locale-prefixed URLs.
 */
export function filterToUnprefixedLocale(urls: string[], position: number): string[] {
  const lacksLocalePrefix = (candidate: string): boolean => {
    const prefixed = hasLocaleCodeAt(candidate, position);
    return prefixed === false;
  };
  return urls.filter(lacksLocalePrefix);
}
+ const withLocale = llmsTxtUrls.filter((u) => hasLocaleCodeAt(u, localePosition!)).length; + if (withLocale < llmsTxtUrls.length * 0.5) { + const before = scopedSitemapUrls.length; + scopedSitemapUrls = filterToUnprefixedLocale(scopedSitemapUrls, localePosition); + localeFiltered = scopedSitemapUrls.length < before; + if (localeFiltered) detectedLocale = 'default'; + } } } diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts index 88f3721..09d5737 100644 --- a/src/helpers/get-page-urls.ts +++ b/src/helpers/get-page-urls.ts @@ -2,6 +2,7 @@ import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt import { MAX_SITEMAP_URLS } from '../constants.js'; import { getLlmsTxtFilesForAnalysis, selectCanonicalLlmsTxt } from './llms-txt.js'; import { isNonPageUrl, isMdUrl, toHtmlUrl } from './to-md-urls.js'; +import { isLocaleSegment, hasStructuralDuplication } from './locale-codes.js'; import type { CheckContext, DiscoveredFile } from '../types.js'; /** @@ -286,7 +287,7 @@ export function extractLocaleFromUrl(url: string): string | null { const segments = new URL(url).pathname.split('/').filter(Boolean); // Only check the first 3 segments to avoid matching content paths for (let i = 0; i < Math.min(segments.length, 3); i++) { - if (/^[a-z]{2}(-[a-z]{2})?$/i.test(segments[i])) { + if (isLocaleSegment(segments[i])) { return segments[i].toLowerCase(); } } @@ -325,7 +326,7 @@ export function filterLocaleSitemaps( const pathMatch = pathLocalePattern.exec(url); const match = filenameMatch ?? 
pathMatch; - if (match) { + if (match && isLocaleSegment(match[1])) { const locale = match[1].toLowerCase(); if (!locales.has(locale)) locales.set(locale, []); locales.get(locale)!.push(url); @@ -366,7 +367,7 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n const segments = new URL(url).pathname.split('/').filter(Boolean); for (let i = 0; i < segments.length; i++) { const seg = segments[i].toLowerCase(); - if (/^[a-z]{2}(-[a-z]{2})?$/.test(seg)) { + if (isLocaleSegment(seg)) { if (!positionCounts.has(i)) positionCounts.set(i, new Map()); const counts = positionCounts.get(i)!; counts.set(seg, (counts.get(seg) ?? 0) + 1); @@ -380,6 +381,7 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n // Find the position that looks like a locale segment let localePosition: number | null = null; + // First pass: ≥2 distinct locale codes covering >50% of URLs for (const [pos, counts] of positionCounts) { if (counts.size < 2) continue; const total = positionTotals.get(pos) ?? 0; @@ -388,6 +390,17 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n break; } } + // Second pass: single locale code confirmed by structural duplication + if (localePosition === null) { + for (const [pos, counts] of positionCounts) { + if (counts.size !== 1) continue; + const [code] = counts.keys(); + if (hasStructuralDuplication(urls, pos, code)) { + localePosition = pos; + break; + } + } + } if (localePosition === null) return urls; @@ -403,8 +416,22 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n } }); - // If filtering removed everything (target locale not present), return original - return filtered.length > 0 ? filtered : urls; + if (filtered.length > 0) return filtered; + + // Target locale not found. The default language may use unprefixed paths + // (e.g. /docs/intro instead of /docs/en/intro). 
// ISO 639-1 two-letter language codes.
// Used to validate locale-like path segments instead of a broad
// `[a-z]{2}` pattern, which would also match 2-letter segments that are not
// language codes, such as "go", "ai", "up" or "do".
// Note that some everyday-looking segments ("my", "io", "no", "hr", "am")
// ARE language codes and are therefore still accepted here.
// prettier-ignore
const ISO_639_1: ReadonlySet<string> = new Set<string>([
  'aa', 'ab', 'ae', 'af', 'ak', 'am', 'an', 'ar', 'as', 'av', 'ay', 'az',
  'ba', 'be', 'bg', 'bh', 'bi', 'bm', 'bn', 'bo', 'br', 'bs',
  'ca', 'ce', 'ch', 'co', 'cr', 'cs', 'cu', 'cv', 'cy',
  'da', 'de', 'dv', 'dz',
  'ee', 'el', 'en', 'eo', 'es', 'et', 'eu',
  'fa', 'ff', 'fi', 'fj', 'fo', 'fr', 'fy',
  'ga', 'gd', 'gl', 'gn', 'gu', 'gv',
  'ha', 'he', 'hi', 'ho', 'hr', 'ht', 'hu', 'hy', 'hz',
  'ia', 'id', 'ie', 'ig', 'ii', 'ik', 'io', 'is', 'it', 'iu',
  'ja', 'jv',
  'ka', 'kg', 'ki', 'kj', 'kk', 'kl', 'km', 'kn', 'ko', 'kr', 'ks', 'ku', 'kv', 'kw', 'ky',
  'la', 'lb', 'lg', 'li', 'ln', 'lo', 'lt', 'lu', 'lv',
  'mg', 'mh', 'mi', 'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my',
  'na', 'nb', 'nd', 'ne', 'ng', 'nl', 'nn', 'no', 'nr', 'nv', 'ny',
  'oc', 'oj', 'om', 'or', 'os',
  'pa', 'pi', 'pl', 'ps', 'pt',
  'qu',
  'rm', 'rn', 'ro', 'ru', 'rw',
  'sa', 'sc', 'sd', 'se', 'sg', 'si', 'sk', 'sl', 'sm', 'sn', 'so', 'sq', 'sr', 'ss', 'st', 'su', 'sv', 'sw',
  'ta', 'te', 'tg', 'th', 'ti', 'tk', 'tl', 'tn', 'to', 'tr', 'ts', 'tt', 'tw', 'ty',
  'ug', 'uk', 'ur', 'uz',
  've', 'vi', 'vo',
  'wa', 'wo',
  'xh',
  'yi', 'yo',
  'za', 'zh', 'zu',
]);

/** Two ASCII letters: the alphabetic region form of a language-region tag. */
const ALPHA2_REGION = /^[a-z]{2}$/;

/**
 * Test whether a path segment is a valid locale code.
 *
 * Accepts, case-insensitively:
 * - a bare ISO 639-1 language code ("en", "de");
 * - a language-region tag of the exact shape `ll-rr`, where `ll` is an
 *   ISO 639-1 code and `rr` is two letters ("pt-br", "zh-CN").
 *
 * Segments such as "en-42" or "id-v2" are rejected: a known language prefix
 * alone is not enough, the region half must be alphabetic as well.
 *
 * @param segment - A single URL path segment (no slashes).
 * @returns `true` when the segment is a recognised locale code.
 */
export function isLocaleSegment(segment: string): boolean {
  const lower = segment.toLowerCase();
  if (ISO_639_1.has(lower)) return true;

  // Only the 5-character "ll-rr" shape is considered beyond bare codes.
  if (lower.length !== 5 || lower[2] !== '-') return false;
  return ISO_639_1.has(lower.slice(0, 2)) && ALPHA2_REGION.test(lower.slice(3));
}

/**
 * Check whether a single locale code at `position` represents a real locale
 * prefix by testing for structural duplication: if stripping the code from
 * prefixed URLs produces paths that overlap with unprefixed URLs in the set,
 * the code is a locale variant, not a topic segment.
 *
 * Example: `/docs/de/intro` stripped → `/docs/intro` matches the unprefixed
 * URL `/docs/intro` → structural duplication confirmed.
 *
 * A URL counts as "unprefixed" when it either has a non-locale segment at
 * `position` or is too short to reach `position` at all (so `/docs` pairs
 * with `/docs/de`). URLs carrying a different locale code at `position` are
 * ignored, as are URLs that cannot be parsed.
 *
 * @param urls - The full URL set to inspect.
 * @param position - Zero-based path-segment index of the suspected locale code.
 * @param localeCode - The suspected locale code; compared case-insensitively.
 * @returns `true` when more than half of the distinct stripped paths also
 *   exist as unprefixed paths; `false` when either side is empty.
 */
export function hasStructuralDuplication(
  urls: string[],
  position: number,
  localeCode: string,
): boolean {
  // Path segments are lower-cased before comparison, so the code must be too.
  const target = localeCode.toLowerCase();
  const strippedPaths = new Set<string>();
  const unprefixedPaths = new Set<string>();

  for (const url of urls) {
    let segments: string[];
    try {
      segments = new URL(url).pathname.split('/').filter(Boolean);
    } catch {
      continue; // not a parseable absolute URL
    }

    const candidate: string | undefined = segments[position];
    if (candidate === undefined) {
      // Too short to hold a locale at `position`: unprefixed by definition.
      unprefixedPaths.add(segments.join('/'));
    } else if (candidate.toLowerCase() === target) {
      const stripped = [...segments.slice(0, position), ...segments.slice(position + 1)];
      strippedPaths.add(stripped.join('/'));
    } else if (!isLocaleSegment(candidate)) {
      unprefixedPaths.add(segments.join('/'));
    }
  }

  if (strippedPaths.size === 0 || unprefixedPaths.size === 0) return false;

  let overlap = 0;
  for (const path of strippedPaths) {
    if (unprefixedPaths.has(path)) overlap++;
  }

  // Strict majority so partially translated sites still qualify while a
  // coincidental single match among many paths does not.
  return overlap > strippedPaths.size * 0.5;
}
`http://${host}/docs/getting-started`, + `http://${host}/docs/api-reference`, + `http://${host}/docs/guides`, + ]; + + // Sitemap has 3 unprefixed + 3 German + 3 Japanese = 9 pages + const sitemapPages = [ + ...llmsPages, + `http://${host}/docs/de/getting-started`, + `http://${host}/docs/de/api-reference`, + `http://${host}/docs/de/guides`, + `http://${host}/docs/ja/getting-started`, + `http://${host}/docs/ja/api-reference`, + `http://${host}/docs/ja/guides`, + ]; + + const ctx = makeCtx(host, llmsPages, '/docs'); + + server.use( + http.get( + `http://${host}/robots.txt`, + () => new HttpResponse(`Sitemap: http://${host}/sitemap.xml`, { status: 200 }), + ), + http.get( + `http://${host}/sitemap.xml`, + () => + new HttpResponse(makeSitemap(sitemapPages), { + headers: { 'content-type': 'application/xml' }, + }), + ), + ); + + const result = await check.run(ctx); + expect(result.status).toBe('pass'); + expect(result.details?.sitemapDocPages).toBe(3); + expect(result.details?.localeFiltered).toBe(true); + expect(result.details?.detectedLocale).toBe('default'); + }); + + test('detects single-locale site via structural duplication', async () => { + const host = 'single-locale.local'; + // llms.txt covers the default (unprefixed) language + const llmsPages = [ + `http://${host}/docs/getting-started`, + `http://${host}/docs/api-reference`, + `http://${host}/docs/guides`, + ]; + + // Sitemap has 3 unprefixed + 3 German (one locale only) + const sitemapPages = [ + ...llmsPages, + `http://${host}/docs/de/getting-started`, + `http://${host}/docs/de/api-reference`, + `http://${host}/docs/de/guides`, + ]; + + const ctx = makeCtx(host, llmsPages, '/docs'); + + server.use( + http.get( + `http://${host}/robots.txt`, + () => new HttpResponse(`Sitemap: http://${host}/sitemap.xml`, { status: 200 }), + ), + http.get( + `http://${host}/sitemap.xml`, + () => + new HttpResponse(makeSitemap(sitemapPages), { + headers: { 'content-type': 'application/xml' }, + }), + ), + ); + + const result 
= await check.run(ctx); + expect(result.status).toBe('pass'); + expect(result.details?.sitemapDocPages).toBe(3); + expect(result.details?.localeFiltered).toBe(true); + }); + test('uses effectiveOrigin for sitemap discovery and scoping', async () => { const oldHost = 'old-host.local'; const newHost = 'new-host.local'; @@ -672,3 +755,43 @@ describe('llms-txt-freshness', () => { expect(result.details?.excludedNonDocPages).toBe(3); }); }); + +describe('hasLocaleCodeAt', () => { + test('returns true for 2-letter locale codes at position', () => { + expect(hasLocaleCodeAt('http://x.com/docs/de/intro', 1)).toBe(true); + expect(hasLocaleCodeAt('http://x.com/docs/ja/intro', 1)).toBe(true); + }); + + test('returns true for region subtags', () => { + expect(hasLocaleCodeAt('http://x.com/docs/pt-br/intro', 1)).toBe(true); + expect(hasLocaleCodeAt('http://x.com/docs/zh-cn/intro', 1)).toBe(true); + }); + + test('returns false for non-locale segments', () => { + expect(hasLocaleCodeAt('http://x.com/docs/getting-started', 1)).toBe(false); + expect(hasLocaleCodeAt('http://x.com/docs/api', 1)).toBe(false); + }); + + test('returns false when URL is shorter than position', () => { + expect(hasLocaleCodeAt('http://x.com/docs', 1)).toBe(false); + }); +}); + +describe('filterToUnprefixedLocale', () => { + test('removes URLs with locale codes at the given position', () => { + const urls = [ + 'http://x.com/docs/intro', + 'http://x.com/docs/de/intro', + 'http://x.com/docs/ja/intro', + 'http://x.com/docs/guides', + 'http://x.com/docs/fr/guides', + ]; + const filtered = filterToUnprefixedLocale(urls, 1); + expect(filtered).toEqual(['http://x.com/docs/intro', 'http://x.com/docs/guides']); + }); + + test('keeps all URLs when none have locale codes', () => { + const urls = ['http://x.com/docs/intro', 'http://x.com/docs/guides']; + expect(filterToUnprefixedLocale(urls, 1)).toEqual(urls); + }); +}); diff --git a/test/unit/helpers/get-page-urls.test.ts b/test/unit/helpers/get-page-urls.test.ts 
index 6c5ddea..c04ade9 100644 --- a/test/unit/helpers/get-page-urls.test.ts +++ b/test/unit/helpers/get-page-urls.test.ts @@ -376,6 +376,43 @@ describe('filterLocalizedUrls', () => { expect(result).toEqual(urls); }); + it('filters to unprefixed default locale when target locale not found', () => { + // Default language has no prefix; other languages do + const urls = [ + 'https://example.com/docs/intro', + 'https://example.com/docs/guide', + 'https://example.com/docs/api', + 'https://example.com/docs/de/intro', + 'https://example.com/docs/de/guide', + 'https://example.com/docs/de/api', + 'https://example.com/docs/ja/intro', + 'https://example.com/docs/ja/guide', + 'https://example.com/docs/ja/api', + ]; + // Default preferred locale is 'en', which doesn't exist as a prefix + const result = filterLocalizedUrls(urls); + expect(result).toEqual([ + 'https://example.com/docs/intro', + 'https://example.com/docs/guide', + 'https://example.com/docs/api', + ]); + }); + + it('filters to unprefixed default locale with explicit preferred locale not found', () => { + const urls = [ + 'https://example.com/docs/intro', + 'https://example.com/docs/guide', + 'https://example.com/docs/de/intro', + 'https://example.com/docs/de/guide', + 'https://example.com/docs/fr/intro', + 'https://example.com/docs/fr/guide', + ]; + // Requesting 'es' which doesn't exist, and 'en' doesn't exist either — + // should fall back to unprefixed + const result = filterLocalizedUrls(urls, 'es'); + expect(result).toEqual(['https://example.com/docs/intro', 'https://example.com/docs/guide']); + }); + it('keeps URLs with fewer segments than the locale position', () => { // Locale at position 1 (docs/{locale}/...), so a URL with only 1 segment // doesn't reach the locale position and should be kept, not dropped. 
@@ -392,6 +429,39 @@ describe('filterLocalizedUrls', () => { expect(result).toContain('https://example.com/docs'); // kept, not dropped expect(result).not.toContain('https://example.com/docs/fr/intro'); }); + + it('detects single-locale site via structural duplication and filters to unprefixed', () => { + const urls = [ + 'https://example.com/docs/intro', + 'https://example.com/docs/guide', + 'https://example.com/docs/api', + 'https://example.com/docs/de/intro', + 'https://example.com/docs/de/guide', + 'https://example.com/docs/de/api', + ]; + const result = filterLocalizedUrls(urls); + // 'en' not found as prefix → falls back to unprefixed + expect(result).toEqual([ + 'https://example.com/docs/intro', + 'https://example.com/docs/guide', + 'https://example.com/docs/api', + ]); + }); + + it('does not false-detect topic paths as single-locale', () => { + // "hr" is a valid ISO 639-1 code (Croatian) but used here as a topic + const urls = [ + 'https://example.com/docs/hr/onboarding', + 'https://example.com/docs/hr/policies', + 'https://example.com/docs/hr/benefits', + 'https://example.com/docs/engineering/onboarding', + 'https://example.com/docs/engineering/policies', + 'https://example.com/docs/engineering/benefits', + ]; + const result = filterLocalizedUrls(urls); + // No structural duplication (stripped paths don't match) → no filtering + expect(result).toEqual(urls); + }); }); describe('deduplicateVersionedUrls', () => { diff --git a/test/unit/helpers/locale-codes.test.ts b/test/unit/helpers/locale-codes.test.ts new file mode 100644 index 0000000..cc3c89c --- /dev/null +++ b/test/unit/helpers/locale-codes.test.ts @@ -0,0 +1,103 @@ +import { describe, test, expect } from 'vitest'; +import { isLocaleSegment, hasStructuralDuplication } from '../../../src/helpers/locale-codes.js'; + +describe('isLocaleSegment', () => { + test('accepts common ISO 639-1 language codes', () => { + for (const code of ['en', 'es', 'fr', 'de', 'ja', 'ko', 'zh', 'pt', 'ru', 'ar', 'nl', 
'it']) { + expect(isLocaleSegment(code), code).toBe(true); + } + }); + + test('accepts BCP 47 language-region subtags', () => { + for (const code of ['pt-br', 'en-us', 'zh-cn', 'fr-fr', 'de-de']) { + expect(isLocaleSegment(code), code).toBe(true); + } + }); + + test('is case-insensitive', () => { + expect(isLocaleSegment('EN')).toBe(true); + expect(isLocaleSegment('Pt-BR')).toBe(true); + expect(isLocaleSegment('zh-CN')).toBe(true); + }); + + test('rejects 2-letter path segments that are not ISO 639-1 codes', () => { + for (const seg of ['go', 'ai', 'up', 'do', 'us', 'ds', 'db', 'vm', 'qa', 'v2', 'wp']) { + expect(isLocaleSegment(seg), seg).toBe(false); + } + }); + + test('accepts 2-letter segments that happen to be valid language codes', () => { + // These look like common path segments but are real ISO 639-1 codes + expect(isLocaleSegment('my'), 'my = Burmese').toBe(true); + expect(isLocaleSegment('io'), 'io = Ido').toBe(true); + expect(isLocaleSegment('no'), 'no = Norwegian').toBe(true); + expect(isLocaleSegment('hr'), 'hr = Croatian').toBe(true); + expect(isLocaleSegment('am'), 'am = Amharic').toBe(true); + }); + + test('rejects longer path segments', () => { + expect(isLocaleSegment('docs')).toBe(false); + expect(isLocaleSegment('api')).toBe(false); + expect(isLocaleSegment('intro')).toBe(false); + expect(isLocaleSegment('getting-started')).toBe(false); + }); + + test('rejects single characters', () => { + expect(isLocaleSegment('a')).toBe(false); + expect(isLocaleSegment('v')).toBe(false); + }); + + test('rejects region-only subtags with invalid language part', () => { + expect(isLocaleSegment('xx-us')).toBe(false); + expect(isLocaleSegment('zz-cn')).toBe(false); + }); +}); + +describe('hasStructuralDuplication', () => { + test('confirms locale when stripped paths match unprefixed URLs', () => { + const urls = [ + 'https://x.com/docs/intro', + 'https://x.com/docs/guide', + 'https://x.com/docs/de/intro', + 'https://x.com/docs/de/guide', + ]; + 
expect(hasStructuralDuplication(urls, 1, 'de')).toBe(true); + }); + + test('confirms locale with partial translation overlap', () => { + const urls = [ + 'https://x.com/docs/intro', + 'https://x.com/docs/guide', + 'https://x.com/docs/api', + 'https://x.com/docs/de/intro', // only one translated page + ]; + expect(hasStructuralDuplication(urls, 1, 'de')).toBe(true); + }); + + test('rejects when stripped paths do not match unprefixed URLs', () => { + // "hr" used as topic (Human Resources), not locale + const urls = [ + 'https://x.com/docs/hr/onboarding', + 'https://x.com/docs/hr/policies', + 'https://x.com/docs/engineering/onboarding', + 'https://x.com/docs/engineering/policies', + ]; + expect(hasStructuralDuplication(urls, 1, 'hr')).toBe(false); + }); + + test('rejects when there are no unprefixed URLs', () => { + const urls = ['https://x.com/docs/de/intro', 'https://x.com/docs/de/guide']; + expect(hasStructuralDuplication(urls, 1, 'de')).toBe(false); + }); + + test('rejects when overlap is below 50%', () => { + const urls = [ + 'https://x.com/docs/intro', + 'https://x.com/docs/de/intro', + 'https://x.com/docs/de/guide', + 'https://x.com/docs/de/api', + ]; + // 1 out of 3 stripped paths matches → 33% < 50% + expect(hasStructuralDuplication(urls, 1, 'de')).toBe(false); + }); +});