diff --git a/src/checks/observability/llms-txt-freshness.ts b/src/checks/observability/llms-txt-freshness.ts
index 3eaf1ce..0f46574 100644
--- a/src/checks/observability/llms-txt-freshness.ts
+++ b/src/checks/observability/llms-txt-freshness.ts
@@ -5,6 +5,7 @@ import {
   parseSitemapUrls,
 } from '../../helpers/get-page-urls.js';
 import { isNonPageUrl } from '../../helpers/to-md-urls.js';
+import { isLocaleSegment, hasStructuralDuplication } from '../../helpers/locale-codes.js';
 import type { CheckContext, CheckResult } from '../../types.js';
 
 /**
@@ -95,7 +96,7 @@ export function detectLocalePosition(urls: string[]): number | null {
       const segments = new URL(url).pathname.split('/').filter(Boolean);
       for (let i = 0; i < segments.length; i++) {
         const seg = segments[i].toLowerCase();
-        if (/^[a-z]{2}(-[a-z]{2})?$/.test(seg)) {
+        if (isLocaleSegment(seg)) {
           if (!positionCounts.has(i)) positionCounts.set(i, new Map());
           const counts = positionCounts.get(i)!;
           counts.set(seg, (counts.get(seg) ?? 0) + 1);
@@ -107,6 +108,7 @@ export function detectLocalePosition(urls: string[]): number | null {
     }
   }
 
+  // First pass: ≥2 distinct locale codes covering >50% of URLs (strong signal)
   for (const [pos, counts] of positionCounts) {
     if (counts.size < 2) continue;
     const total = positionTotals.get(pos) ?? 0;
@@ -115,6 +117,17 @@ export function detectLocalePosition(urls: string[]): number | null {
     }
   }
 
+  // Second pass: single locale code confirmed by structural duplication.
+  // With ISO 639-1 validation, a single code is meaningful when stripping it
+  // produces paths that match unprefixed URLs in the same set.
+  for (const [pos, counts] of positionCounts) {
+    if (counts.size !== 1) continue;
+    const [code] = counts.keys();
+    if (hasStructuralDuplication(urls, pos, code)) {
+      return pos;
+    }
+  }
+
   return null;
 }
 
@@ -163,6 +176,27 @@ function filterByLocale(urls: string[], locale: string, position: number): strin
   });
 }
 
+/**
+ * Report whether the path segment at `position` is a locale code (false if missing or bad URL).
+ */
+export function hasLocaleCodeAt(url: string, position: number): boolean {
+  try {
+    const candidate = new URL(url).pathname.split('/').filter(Boolean)[position];
+    return candidate !== undefined && isLocaleSegment(candidate);
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Drop every URL whose path carries a locale code at `position`.
+ * Applied when llms.txt describes the unprefixed default locale, so that
+ * locale-prefixed sitemap variants are left out of the coverage comparison.
+ */
+export function filterToUnprefixedLocale(urls: string[], position: number): string[] {
+  return urls.filter((candidate) => hasLocaleCodeAt(candidate, position) === false);
+}
+
 /** Coverage thresholds */
 const COVERAGE_PASS = 0.95;
 const COVERAGE_WARN = 0.8;
@@ -316,6 +350,17 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
       const before = scopedSitemapUrls.length;
       scopedSitemapUrls = filterByLocale(scopedSitemapUrls, llmsLocale, localePosition);
       localeFiltered = scopedSitemapUrls.length < before;
+    } else {
+      // llms.txt may cover the unprefixed default locale (no /en/, /de/, etc.).
+      // If most llms.txt URLs lack locale codes at the detected position,
+      // filter the sitemap to only unprefixed URLs.
+      const withLocale = llmsTxtUrls.filter((u) => hasLocaleCodeAt(u, localePosition!)).length;
+      if (withLocale < llmsTxtUrls.length * 0.5) {
+        const before = scopedSitemapUrls.length;
+        scopedSitemapUrls = filterToUnprefixedLocale(scopedSitemapUrls, localePosition);
+        localeFiltered = scopedSitemapUrls.length < before;
+        if (localeFiltered) detectedLocale = 'default';
+      }
     }
   }
 
diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts
index 88f3721..09d5737 100644
--- a/src/helpers/get-page-urls.ts
+++ b/src/helpers/get-page-urls.ts
@@ -2,6 +2,7 @@ import { extractMarkdownLinks } from '../checks/content-discoverability/llms-txt
 import { MAX_SITEMAP_URLS } from '../constants.js';
 import { getLlmsTxtFilesForAnalysis, selectCanonicalLlmsTxt } from './llms-txt.js';
 import { isNonPageUrl, isMdUrl, toHtmlUrl } from './to-md-urls.js';
+import { isLocaleSegment, hasStructuralDuplication } from './locale-codes.js';
 import type { CheckContext, DiscoveredFile } from '../types.js';
 
 /**
@@ -286,7 +287,7 @@ export function extractLocaleFromUrl(url: string): string | null {
     const segments = new URL(url).pathname.split('/').filter(Boolean);
     // Only check the first 3 segments to avoid matching content paths
     for (let i = 0; i < Math.min(segments.length, 3); i++) {
-      if (/^[a-z]{2}(-[a-z]{2})?$/i.test(segments[i])) {
+      if (isLocaleSegment(segments[i])) {
         return segments[i].toLowerCase();
       }
     }
@@ -325,7 +326,7 @@ export function filterLocaleSitemaps(
     const pathMatch = pathLocalePattern.exec(url);
     const match = filenameMatch ?? pathMatch;
 
-    if (match) {
+    if (match && isLocaleSegment(match[1])) {
       const locale = match[1].toLowerCase();
       if (!locales.has(locale)) locales.set(locale, []);
       locales.get(locale)!.push(url);
@@ -366,7 +367,7 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
       const segments = new URL(url).pathname.split('/').filter(Boolean);
       for (let i = 0; i < segments.length; i++) {
         const seg = segments[i].toLowerCase();
-        if (/^[a-z]{2}(-[a-z]{2})?$/.test(seg)) {
+        if (isLocaleSegment(seg)) {
           if (!positionCounts.has(i)) positionCounts.set(i, new Map());
           const counts = positionCounts.get(i)!;
           counts.set(seg, (counts.get(seg) ?? 0) + 1);
@@ -380,6 +381,7 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
 
   // Find the position that looks like a locale segment
   let localePosition: number | null = null;
+  // First pass: ≥2 distinct locale codes covering >50% of URLs
   for (const [pos, counts] of positionCounts) {
     if (counts.size < 2) continue;
     const total = positionTotals.get(pos) ?? 0;
@@ -388,6 +390,17 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
       break;
     }
   }
+  // Second pass: single locale code confirmed by structural duplication
+  if (localePosition === null) {
+    for (const [pos, counts] of positionCounts) {
+      if (counts.size !== 1) continue;
+      const [code] = counts.keys();
+      if (hasStructuralDuplication(urls, pos, code)) {
+        localePosition = pos;
+        break;
+      }
+    }
+  }
 
   if (localePosition === null) return urls;
 
@@ -403,8 +416,22 @@ export function filterLocalizedUrls(urls: string[], preferredLocale?: string | n
     }
   });
 
-  // If filtering removed everything (target locale not present), return original
-  return filtered.length > 0 ? filtered : urls;
+  if (filtered.length > 0) return filtered;
+
+  // Target locale not found. The default language may use unprefixed paths
+  // (e.g. /docs/intro instead of /docs/en/intro). Filter to URLs that don't
+  // have any locale code at the detected position.
+  const unprefixed = urls.filter((url) => {
+    try {
+      const segments = new URL(url).pathname.split('/').filter(Boolean);
+      if (segments.length <= localePosition!) return true;
+      return !isLocaleSegment(segments[localePosition!]);
+    } catch {
+      return true;
+    }
+  });
+
+  return unprefixed.length > 0 ? unprefixed : urls;
 }
 
 /**
diff --git a/src/helpers/locale-codes.ts b/src/helpers/locale-codes.ts
new file mode 100644
index 0000000..450172d
--- /dev/null
+++ b/src/helpers/locale-codes.ts
@@ -0,0 +1,248 @@
+// ISO 639-1 language codes, lowercase (~184 codes, last major revision 2002).
+// Used to validate locale-like path segments instead of a broad two-letter
+// regex, which would also match non-locale segments like "go", "ai", "up", "do".
+// Caveat: "my", "io", "hr" and "no" ARE valid codes, so those segments still match.
+const ISO_639_1 = new Set([
+  'aa',
+  'ab',
+  'ae',
+  'af',
+  'ak',
+  'am',
+  'an',
+  'ar',
+  'as',
+  'av',
+  'ay',
+  'az',
+  'ba',
+  'be',
+  'bg',
+  'bh',
+  'bi',
+  'bm',
+  'bn',
+  'bo',
+  'br',
+  'bs',
+  'ca',
+  'ce',
+  'ch',
+  'co',
+  'cr',
+  'cs',
+  'cu',
+  'cv',
+  'cy',
+  'da',
+  'de',
+  'dv',
+  'dz',
+  'ee',
+  'el',
+  'en',
+  'eo',
+  'es',
+  'et',
+  'eu',
+  'fa',
+  'ff',
+  'fi',
+  'fj',
+  'fo',
+  'fr',
+  'fy',
+  'ga',
+  'gd',
+  'gl',
+  'gn',
+  'gu',
+  'gv',
+  'ha',
+  'he',
+  'hi',
+  'ho',
+  'hr',
+  'ht',
+  'hu',
+  'hy',
+  'hz',
+  'ia',
+  'id',
+  'ie',
+  'ig',
+  'ii',
+  'ik',
+  'io',
+  'is',
+  'it',
+  'iu',
+  'ja',
+  'jv',
+  'ka',
+  'kg',
+  'ki',
+  'kj',
+  'kk',
+  'kl',
+  'km',
+  'kn',
+  'ko',
+  'kr',
+  'ks',
+  'ku',
+  'kv',
+  'kw',
+  'ky',
+  'la',
+  'lb',
+  'lg',
+  'li',
+  'ln',
+  'lo',
+  'lt',
+  'lu',
+  'lv',
+  'mg',
+  'mh',
+  'mi',
+  'mk',
+  'ml',
+  'mn',
+  'mr',
+  'ms',
+  'mt',
+  'my',
+  'na',
+  'nb',
+  'nd',
+  'ne',
+  'ng',
+  'nl',
+  'nn',
+  'no',
+  'nr',
+  'nv',
+  'ny',
+  'oc',
+  'oj',
+  'om',
+  'or',
+  'os',
+  'pa',
+  'pi',
+  'pl',
+  'ps',
+  'pt',
+  'qu',
+  'rm',
+  'rn',
+  'ro',
+  'ru',
+  'rw',
+  'sa',
+  'sc',
+  'sd',
+  'se',
+  'sg',
+  'si',
+  'sk',
+  'sl',
+  'sm',
+  'sn',
+  'so',
+  'sq',
+  'sr',
+  'ss',
+  'st',
+  'su',
+  'sv',
+  'sw',
+  'ta',
+  'te',
+  'tg',
+  'th',
+  'ti',
+  'tk',
+  'tl',
+  'tn',
+  'to',
+  'tr',
+  'ts',
+  'tt',
+  'tw',
+  'ty',
+  'ug',
+  'uk',
+  'ur',
+  'uz',
+  've',
+  'vi',
+  'vo',
+  'wa',
+  'wo',
+  'xh',
+  'yi',
+  'yo',
+  'za',
+  'zh',
+  'zu',
+]);
+
+/**
+ * Test whether a path segment is a locale code (case-insensitive).
+ * Accepts ISO 639-1 language codes ("en", "de") and language-region tags
+ * ("pt-br", "zh-cn") whose language part is a valid ISO 639-1 code and whose
+ * region part is two ASCII letters, so segments like "id-42" are rejected.
+ */
+export function isLocaleSegment(segment: string): boolean {
+  const lower = segment.toLowerCase();
+  if (ISO_639_1.has(lower)) return true;
+  // Validate the shape of BOTH halves; checking only the length and hyphen
+  // position would let digits or punctuation through as a "region".
+  if (!/^[a-z]{2}-[a-z]{2}$/.test(lower)) return false;
+  return ISO_639_1.has(lower.slice(0, 2));
+}
+
+/**
+ * Decide whether `localeCode` at path index `position` is a real locale
+ * prefix by testing for structural duplication: if stripping the code from
+ * prefixed URLs produces paths that also exist among the URLs without any
+ * locale code at that index, the code is a locale variant, not a topic.
+ * `localeCode` is matched case-insensitively; unparseable URLs are skipped.
+ *
+ * Example: `/docs/de/intro` stripped → `/docs/intro` matches the unprefixed
+ * URL `/docs/intro` → structural duplication confirmed.
+ * @returns true when more than half of the distinct stripped paths also
+ *   occur unprefixed; false when either group is empty.
+ */
+export function hasStructuralDuplication(
+  urls: string[],
+  position: number,
+  localeCode: string,
+): boolean {
+  // Segments are lowercased before comparing, so normalize the code too;
+  // otherwise a caller passing "DE" or "pt-BR" could never match.
+  const code = localeCode.toLowerCase();
+  const strippedPaths = new Set<string>();
+  const unprefixedPaths = new Set<string>();
+
+  for (const url of urls) {
+    try {
+      const segments = new URL(url).pathname.split('/').filter(Boolean);
+      const segment = segments[position];
+      if (segment === undefined) continue; // path too short to hold a locale
+      if (segment.toLowerCase() === code) {
+        strippedPaths.add(segments.filter((_, i) => i !== position).join('/'));
+      } else if (!isLocaleSegment(segment)) {
+        unprefixedPaths.add(segments.join('/'));
+      }
+    } catch {
+      continue; // not a parseable absolute URL
+    }
+  }
+
+  if (strippedPaths.size === 0 || unprefixedPaths.size === 0) return false;
+  const overlap = [...strippedPaths].filter((path) => unprefixedPaths.has(path)).length;
+  return overlap > strippedPaths.size * 0.5;
+}
diff --git a/test/unit/checks/llms-txt-freshness.test.ts b/test/unit/checks/llms-txt-freshness.test.ts
index f3351b7..b874b63 100644
--- a/test/unit/checks/llms-txt-freshness.test.ts
+++ b/test/unit/checks/llms-txt-freshness.test.ts
@@ -4,9 +4,10 @@ import { setupServer } from 'msw/node';
 import { getCheck } from '../../../src/checks/registry.js';
 import { createContext } from '../../../src/runner.js';
 import type { DiscoveredFile } from '../../../src/types.js';
-
-// Ensure the check is registered
-import '../../../src/checks/observability/llms-txt-freshness.js';
+import {
+  hasLocaleCodeAt,
+  filterToUnprefixedLocale,
+} from '../../../src/checks/observability/llms-txt-freshness.js';
 
 const server = setupServer();
 
@@ -603,6 +604,88 @@ describe('llms-txt-freshness', () => {
     expect(result.details?.sitemapDocPages).toBe(3);
   });
 
+  test('filters sitemap to unprefixed default locale when llms.txt has no locale prefix', async () => {
+    const host = 'unprefixed-locale.local';
+    // llms.txt lists only the default language, whose URLs carry no locale segment
+    const llmsPages = [
+      `http://${host}/docs/getting-started`,
+      `http://${host}/docs/api-reference`,
+      `http://${host}/docs/guides`,
+    ];
+
+    // Sitemap: the 3 unprefixed pages plus /de/ and /ja/ copies of each = 9 pages
+    const sitemapPages = [
+      ...llmsPages,
+      `http://${host}/docs/de/getting-started`,
+      `http://${host}/docs/de/api-reference`,
+      `http://${host}/docs/de/guides`,
+      `http://${host}/docs/ja/getting-started`,
+      `http://${host}/docs/ja/api-reference`,
+      `http://${host}/docs/ja/guides`,
+    ];
+
+    const ctx = makeCtx(host, llmsPages, '/docs');
+
+    server.use(
+      http.get(
+        `http://${host}/robots.txt`,
+        () => new HttpResponse(`Sitemap: http://${host}/sitemap.xml`, { status: 200 }),
+      ),
+      http.get(
+        `http://${host}/sitemap.xml`,
+        () =>
+          new HttpResponse(makeSitemap(sitemapPages), {
+            headers: { 'content-type': 'application/xml' },
+          }),
+      ),
+    );
+
+    const result = await check.run(ctx);
+    expect(result.status).toBe('pass');
+    expect(result.details?.sitemapDocPages).toBe(3); // the six /de/ and /ja/ URLs are excluded
+    expect(result.details?.localeFiltered).toBe(true);
+    expect(result.details?.detectedLocale).toBe('default'); // label for the unprefixed locale
+  });
+
+  test('detects single-locale site via structural duplication', async () => {
+    const host = 'single-locale.local';
+    // llms.txt lists only the default language, whose URLs carry no locale segment
+    const llmsPages = [
+      `http://${host}/docs/getting-started`,
+      `http://${host}/docs/api-reference`,
+      `http://${host}/docs/guides`,
+    ];
+
+    // Sitemap: 3 unprefixed pages + /de/ copies; "de" is the only locale code present
+    const sitemapPages = [
+      ...llmsPages,
+      `http://${host}/docs/de/getting-started`,
+      `http://${host}/docs/de/api-reference`,
+      `http://${host}/docs/de/guides`,
+    ];
+
+    const ctx = makeCtx(host, llmsPages, '/docs');
+
+    server.use(
+      http.get(
+        `http://${host}/robots.txt`,
+        () => new HttpResponse(`Sitemap: http://${host}/sitemap.xml`, { status: 200 }),
+      ),
+      http.get(
+        `http://${host}/sitemap.xml`,
+        () =>
+          new HttpResponse(makeSitemap(sitemapPages), {
+            headers: { 'content-type': 'application/xml' },
+          }),
+      ),
+    );
+
+    const result = await check.run(ctx);
+    expect(result.status).toBe('pass');
+    expect(result.details?.sitemapDocPages).toBe(3); // the three /de/ URLs are excluded
+    expect(result.details?.localeFiltered).toBe(true);
+  });
+
   test('uses effectiveOrigin for sitemap discovery and scoping', async () => {
     const oldHost = 'old-host.local';
     const newHost = 'new-host.local';
@@ -672,3 +755,43 @@ describe('llms-txt-freshness', () => {
     expect(result.details?.excludedNonDocPages).toBe(3);
   });
 });
+
+describe('hasLocaleCodeAt', () => {
+  test('returns true for 2-letter locale codes at position', () => {
+    expect(hasLocaleCodeAt('http://x.com/docs/de/intro', 1)).toBe(true); // index 1 is "de"
+    expect(hasLocaleCodeAt('http://x.com/docs/ja/intro', 1)).toBe(true); // index 1 is "ja"
+  });
+
+  test('returns true for region subtags', () => {
+    expect(hasLocaleCodeAt('http://x.com/docs/pt-br/intro', 1)).toBe(true);
+    expect(hasLocaleCodeAt('http://x.com/docs/zh-cn/intro', 1)).toBe(true);
+  });
+
+  test('returns false for non-locale segments', () => {
+    expect(hasLocaleCodeAt('http://x.com/docs/getting-started', 1)).toBe(false);
+    expect(hasLocaleCodeAt('http://x.com/docs/api', 1)).toBe(false); // "api" is three letters
+  });
+
+  test('returns false when URL is shorter than position', () => {
+    expect(hasLocaleCodeAt('http://x.com/docs', 1)).toBe(false); // only index 0 exists
+  });
+});
+
+describe('filterToUnprefixedLocale', () => {
+  test('removes URLs with locale codes at the given position', () => {
+    const urls = [
+      'http://x.com/docs/intro',
+      'http://x.com/docs/de/intro',
+      'http://x.com/docs/ja/intro',
+      'http://x.com/docs/guides',
+      'http://x.com/docs/fr/guides',
+    ];
+    const filtered = filterToUnprefixedLocale(urls, 1); // index 1 is the segment after "docs"
+    expect(filtered).toEqual(['http://x.com/docs/intro', 'http://x.com/docs/guides']);
+  });
+
+  test('keeps all URLs when none have locale codes', () => {
+    const urls = ['http://x.com/docs/intro', 'http://x.com/docs/guides'];
+    expect(filterToUnprefixedLocale(urls, 1)).toEqual(urls); // same URLs, same order
+  });
+});
diff --git a/test/unit/helpers/get-page-urls.test.ts b/test/unit/helpers/get-page-urls.test.ts
index 6c5ddea..c04ade9 100644
--- a/test/unit/helpers/get-page-urls.test.ts
+++ b/test/unit/helpers/get-page-urls.test.ts
@@ -376,6 +376,43 @@ describe('filterLocalizedUrls', () => {
     expect(result).toEqual(urls);
   });
 
+  it('filters to unprefixed default locale when target locale not found', () => {
+    // The default language is served without a locale segment; de and ja live under /docs/{locale}/
+    const urls = [
+      'https://example.com/docs/intro',
+      'https://example.com/docs/guide',
+      'https://example.com/docs/api',
+      'https://example.com/docs/de/intro',
+      'https://example.com/docs/de/guide',
+      'https://example.com/docs/de/api',
+      'https://example.com/docs/ja/intro',
+      'https://example.com/docs/ja/guide',
+      'https://example.com/docs/ja/api',
+    ];
+    // No preferredLocale is passed; the default ('en') never appears as a prefix in this set
+    const result = filterLocalizedUrls(urls);
+    expect(result).toEqual([
+      'https://example.com/docs/intro',
+      'https://example.com/docs/guide',
+      'https://example.com/docs/api',
+    ]);
+  });
+
+  it('filters to unprefixed default locale with explicit preferred locale not found', () => {
+    const urls = [
+      'https://example.com/docs/intro',
+      'https://example.com/docs/guide',
+      'https://example.com/docs/de/intro',
+      'https://example.com/docs/de/guide',
+      'https://example.com/docs/fr/intro',
+      'https://example.com/docs/fr/guide',
+    ];
+    // The requested 'es' is not a prefix in this set, and neither is 'en',
+    // so the result should be the URLs with no locale segment at all
+    const result = filterLocalizedUrls(urls, 'es');
+    expect(result).toEqual(['https://example.com/docs/intro', 'https://example.com/docs/guide']);
+  });
+
   it('keeps URLs with fewer segments than the locale position', () => {
     // Locale at position 1 (docs/{locale}/...), so a URL with only 1 segment
     // doesn't reach the locale position and should be kept, not dropped.
@@ -392,6 +429,39 @@ describe('filterLocalizedUrls', () => {
     expect(result).toContain('https://example.com/docs'); // kept, not dropped
     expect(result).not.toContain('https://example.com/docs/fr/intro');
   });
+
+  it('detects single-locale site via structural duplication and filters to unprefixed', () => {
+    const urls = [
+      'https://example.com/docs/intro',
+      'https://example.com/docs/guide',
+      'https://example.com/docs/api',
+      'https://example.com/docs/de/intro',
+      'https://example.com/docs/de/guide',
+      'https://example.com/docs/de/api',
+    ];
+    const result = filterLocalizedUrls(urls); // "de" is the only locale code in the set
+    // 'en' is not present as a prefix, so the unprefixed URLs are returned
+    expect(result).toEqual([
+      'https://example.com/docs/intro',
+      'https://example.com/docs/guide',
+      'https://example.com/docs/api',
+    ]);
+  });
+
+  it('does not false-detect topic paths as single-locale', () => {
+    // "hr" is a valid ISO 639-1 code (Croatian) but here it names a topic folder
+    const urls = [
+      'https://example.com/docs/hr/onboarding',
+      'https://example.com/docs/hr/policies',
+      'https://example.com/docs/hr/benefits',
+      'https://example.com/docs/engineering/onboarding',
+      'https://example.com/docs/engineering/policies',
+      'https://example.com/docs/engineering/benefits',
+    ];
+    const result = filterLocalizedUrls(urls);
+    // Stripping "hr" gives docs/onboarding etc., which never occur unprefixed → no filtering
+    expect(result).toEqual(urls);
+  });
 });
 
 describe('deduplicateVersionedUrls', () => {
diff --git a/test/unit/helpers/locale-codes.test.ts b/test/unit/helpers/locale-codes.test.ts
new file mode 100644
index 0000000..cc3c89c
--- /dev/null
+++ b/test/unit/helpers/locale-codes.test.ts
@@ -0,0 +1,103 @@
+import { describe, test, expect } from 'vitest';
+import { isLocaleSegment, hasStructuralDuplication } from '../../../src/helpers/locale-codes.js';
+
+describe('isLocaleSegment', () => {
+  test('accepts common ISO 639-1 language codes', () => {
+    for (const code of ['en', 'es', 'fr', 'de', 'ja', 'ko', 'zh', 'pt', 'ru', 'ar', 'nl', 'it']) {
+      expect(isLocaleSegment(code), code).toBe(true);
+    }
+  });
+
+  test('accepts BCP 47 language-region subtags', () => {
+    for (const code of ['pt-br', 'en-us', 'zh-cn', 'fr-fr', 'de-de']) {
+      expect(isLocaleSegment(code), code).toBe(true);
+    }
+  });
+
+  test('is case-insensitive', () => {
+    expect(isLocaleSegment('EN')).toBe(true);
+    expect(isLocaleSegment('Pt-BR')).toBe(true);
+    expect(isLocaleSegment('zh-CN')).toBe(true);
+  });
+
+  test('rejects 2-letter path segments that are not ISO 639-1 codes', () => {
+    for (const seg of ['go', 'ai', 'up', 'do', 'us', 'ds', 'db', 'vm', 'qa', 'v2', 'wp']) {
+      expect(isLocaleSegment(seg), seg).toBe(false);
+    }
+  });
+
+  test('accepts 2-letter segments that happen to be valid language codes', () => {
+    // These read like ordinary path segments but are real ISO 639-1 codes, so they must match
+    expect(isLocaleSegment('my'), 'my = Burmese').toBe(true);
+    expect(isLocaleSegment('io'), 'io = Ido').toBe(true);
+    expect(isLocaleSegment('no'), 'no = Norwegian').toBe(true);
+    expect(isLocaleSegment('hr'), 'hr = Croatian').toBe(true);
+    expect(isLocaleSegment('am'), 'am = Amharic').toBe(true);
+  });
+
+  test('rejects longer path segments', () => {
+    expect(isLocaleSegment('docs')).toBe(false);
+    expect(isLocaleSegment('api')).toBe(false);
+    expect(isLocaleSegment('intro')).toBe(false);
+    expect(isLocaleSegment('getting-started')).toBe(false);
+  });
+
+  test('rejects single characters', () => {
+    expect(isLocaleSegment('a')).toBe(false);
+    expect(isLocaleSegment('v')).toBe(false);
+  });
+
+  test('rejects region-only subtags with invalid language part', () => {
+    expect(isLocaleSegment('xx-us')).toBe(false); // "xx" is not an ISO 639-1 code
+    expect(isLocaleSegment('zz-cn')).toBe(false); // "zz" is not an ISO 639-1 code
+  });
+});
+
+describe('hasStructuralDuplication', () => {
+  test('confirms locale when stripped paths match unprefixed URLs', () => {
+    const urls = [
+      'https://x.com/docs/intro',
+      'https://x.com/docs/guide',
+      'https://x.com/docs/de/intro',
+      'https://x.com/docs/de/guide',
+    ];
+    expect(hasStructuralDuplication(urls, 1, 'de')).toBe(true); // 2 of 2 stripped paths match
+  });
+
+  test('confirms locale with partial translation overlap', () => {
+    const urls = [
+      'https://x.com/docs/intro',
+      'https://x.com/docs/guide',
+      'https://x.com/docs/api',
+      'https://x.com/docs/de/intro', // the only translated page; its stripped path matches
+    ];
+    expect(hasStructuralDuplication(urls, 1, 'de')).toBe(true);
+  });
+
+  test('rejects when stripped paths do not match unprefixed URLs', () => {
+    // "hr" is a topic folder here (Human Resources), not a locale prefix
+    const urls = [
+      'https://x.com/docs/hr/onboarding',
+      'https://x.com/docs/hr/policies',
+      'https://x.com/docs/engineering/onboarding',
+      'https://x.com/docs/engineering/policies',
+    ];
+    expect(hasStructuralDuplication(urls, 1, 'hr')).toBe(false);
+  });
+
+  test('rejects when there are no unprefixed URLs', () => {
+    const urls = ['https://x.com/docs/de/intro', 'https://x.com/docs/de/guide'];
+    expect(hasStructuralDuplication(urls, 1, 'de')).toBe(false);
+  });
+
+  test('rejects when overlap is below 50%', () => {
+    const urls = [
+      'https://x.com/docs/intro',
+      'https://x.com/docs/de/intro',
+      'https://x.com/docs/de/guide',
+      'https://x.com/docs/de/api',
+    ];
+    // Only 1 of the 3 stripped paths (docs/intro) exists unprefixed → 33%, not above 50%
+    expect(hasStructuralDuplication(urls, 1, 'de')).toBe(false);
+  });
+});