Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 46 additions & 1 deletion src/checks/observability/llms-txt-freshness.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
parseSitemapUrls,
} from '../../helpers/get-page-urls.js';
import { isNonPageUrl } from '../../helpers/to-md-urls.js';
import { isLocaleSegment, hasStructuralDuplication } from '../../helpers/locale-codes.js';
import type { CheckContext, CheckResult } from '../../types.js';

/**
Expand Down Expand Up @@ -95,7 +96,7 @@ export function detectLocalePosition(urls: string[]): number | null {
const segments = new URL(url).pathname.split('/').filter(Boolean);
for (let i = 0; i < segments.length; i++) {
const seg = segments[i].toLowerCase();
if (/^[a-z]{2}(-[a-z]{2})?$/.test(seg)) {
if (isLocaleSegment(seg)) {
if (!positionCounts.has(i)) positionCounts.set(i, new Map());
const counts = positionCounts.get(i)!;
counts.set(seg, (counts.get(seg) ?? 0) + 1);
Expand All @@ -107,6 +108,7 @@ export function detectLocalePosition(urls: string[]): number | null {
}
}

// First pass: ≥2 distinct locale codes covering >50% of URLs (strong signal)
for (const [pos, counts] of positionCounts) {
if (counts.size < 2) continue;
const total = positionTotals.get(pos) ?? 0;
Expand All @@ -115,6 +117,17 @@ export function detectLocalePosition(urls: string[]): number | null {
}
}

// Second pass: single locale code confirmed by structural duplication.
// With ISO 639-1 validation, a single code is meaningful when stripping it
// produces paths that match unprefixed URLs in the same set.
for (const [pos, counts] of positionCounts) {
if (counts.size !== 1) continue;
const [code] = counts.keys();
if (hasStructuralDuplication(urls, pos, code)) {
return pos;
}
}

return null;
}

Expand Down Expand Up @@ -163,6 +176,27 @@ function filterByLocale(urls: string[], locale: string, position: number): strin
});
}

/**
 * Report whether the path segment at index `position` of `url` is accepted
 * by `isLocaleSegment`.
 *
 * @param url - URL string; it is parsed with the `URL` constructor.
 * @param position - Zero-based index into the URL's non-empty path segments.
 * @returns `true` when a segment exists at that index and `isLocaleSegment`
 *   accepts it; `false` when the path has too few segments or when anything
 *   inside the lookup throws (for example, an unparseable URL).
 */
export function hasLocaleCodeAt(url: string, position: number): boolean {
  try {
    const { pathname } = new URL(url);
    // Drop the empty strings produced by leading, trailing, or doubled slashes.
    const parts = pathname.split('/').filter((part) => part !== '');
    if (!(parts.length > position)) {
      return false;
    }
    return isLocaleSegment(parts[position]);
  } catch {
    return false;
  }
}

/**
 * Keep only the URLs for which `hasLocaleCodeAt(url, position)` is false,
 * i.e. those without a locale code at `position`.
 *
 * Intended for the case where llms.txt covers the unprefixed default locale,
 * so locale-prefixed sitemap variants must be left out of the coverage
 * comparison.
 *
 * @param urls - Candidate URLs; the input array is not modified.
 * @param position - Zero-based path-segment index to inspect.
 * @returns A new array with the unprefixed URLs in their original order.
 */
export function filterToUnprefixedLocale(urls: string[], position: number): string[] {
  return urls.reduce<string[]>((unprefixed, candidate) => {
    if (!hasLocaleCodeAt(candidate, position)) {
      unprefixed.push(candidate);
    }
    return unprefixed;
  }, []);
}

/**
 * Coverage thresholds.
 *
 * NOTE(review): the use sites are not visible in this diff hunk. Presumably
 * these are fractions in the 0..1 range compared against a coverage ratio,
 * with COVERAGE_PASS the bar for a passing result and COVERAGE_WARN the bar
 * for a warning — confirm against the check body.
 */
const COVERAGE_PASS = 0.95;
const COVERAGE_WARN = 0.8;
Expand Down Expand Up @@ -316,6 +350,17 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
const before = scopedSitemapUrls.length;
scopedSitemapUrls = filterByLocale(scopedSitemapUrls, llmsLocale, localePosition);
localeFiltered = scopedSitemapUrls.length < before;
} else {
// llms.txt may cover the unprefixed default locale (no /en/, /de/, etc.).
// If most llms.txt URLs lack locale codes at the detected position,
// filter the sitemap to only unprefixed URLs.
const withLocale = llmsTxtUrls.filter((u) => hasLocaleCodeAt(u, localePosition!)).length;
if (withLocale < llmsTxtUrls.length * 0.5) {
const before = scopedSitemapUrls.length;
scopedSitemapUrls = filterToUnprefixedLocale(scopedSitemapUrls, localePosition);
localeFiltered = scopedSitemapUrls.length < before;
if (localeFiltered) detectedLocale = 'default';
}
}
}

Expand Down
37 changes: 32 additions & 5 deletions src/helpers/get-page-urls.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt
import { MAX_SITEMAP_URLS } from '../constants.js';
import { getLlmsTxtFilesForAnalysis, selectCanonicalLlmsTxt } from './llms-txt.js';
import { isNonPageUrl, isMdUrl, toHtmlUrl } from './to-md-urls.js';
import { isLocaleSegment, hasStructuralDuplication } from './locale-codes.js';
import type { CheckContext, DiscoveredFile } from '../types.js';

/**
Expand Down Expand Up @@ -286,7 +287,7 @@ export function extractLocaleFromUrl(url: string): string | null {
const segments = new URL(url).pathname.split('/').filter(Boolean);
// Only check the first 3 segments to avoid matching content paths
for (let i = 0; i < Math.min(segments.length, 3); i++) {
if (/^[a-z]{2}(-[a-z]{2})?$/i.test(segments[i])) {
if (isLocaleSegment(segments[i])) {
return segments[i].toLowerCase();
}
}
Expand Down Expand Up @@ -325,7 +326,7 @@ export function filterLocaleSitemaps(
const pathMatch = pathLocalePattern.exec(url);
const match = filenameMatch ?? pathMatch;

if (match) {
if (match && isLocaleSegment(match[1])) {
const locale = match[1].toLowerCase();
if (!locales.has(locale)) locales.set(locale, []);
locales.get(locale)!.push(url);
Expand Down Expand Up @@ -366,7 +367,7 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
const segments = new URL(url).pathname.split('/').filter(Boolean);
for (let i = 0; i < segments.length; i++) {
const seg = segments[i].toLowerCase();
if (/^[a-z]{2}(-[a-z]{2})?$/.test(seg)) {
if (isLocaleSegment(seg)) {
if (!positionCounts.has(i)) positionCounts.set(i, new Map());
const counts = positionCounts.get(i)!;
counts.set(seg, (counts.get(seg) ?? 0) + 1);
Expand All @@ -380,6 +381,7 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n

// Find the position that looks like a locale segment
let localePosition: number | null = null;
// First pass: ≥2 distinct locale codes covering >50% of URLs
for (const [pos, counts] of positionCounts) {
if (counts.size < 2) continue;
const total = positionTotals.get(pos) ?? 0;
Expand All @@ -388,6 +390,17 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
break;
}
}
// Second pass: single locale code confirmed by structural duplication
if (localePosition === null) {
for (const [pos, counts] of positionCounts) {
if (counts.size !== 1) continue;
const [code] = counts.keys();
if (hasStructuralDuplication(urls, pos, code)) {
localePosition = pos;
break;
}
}
}

if (localePosition === null) return urls;

Expand All @@ -403,8 +416,22 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
}
});

// If filtering removed everything (target locale not present), return original
return filtered.length > 0 ? filtered : urls;
if (filtered.length > 0) return filtered;

// Target locale not found. The default language may use unprefixed paths
// (e.g. /docs/intro instead of /docs/en/intro). Filter to URLs that don't
// have any locale code at the detected position.
const unprefixed = urls.filter((url) => {
try {
const segments = new URL(url).pathname.split('/').filter(Boolean);
if (segments.length <= localePosition!) return true;
return !isLocaleSegment(segments[localePosition!]);
} catch {
return true;
}
});

return unprefixed.length > 0 ? unprefixed : urls;
}

/**
Expand Down
Loading