From 52a0fc82433badb540fff42443360f2d2d437bef Mon Sep 17 00:00:00 2001 From: dacharyc Date: Sat, 25 Apr 2026 12:20:55 -0400 Subject: [PATCH 01/13] Strip 'style' and 'script' tags in HTML --- src/checks/page-size/page-size-html.ts | 9 ++- src/helpers/html-to-markdown.ts | 14 ++-- .../checks/content-start-position.test.ts | 24 +++---- test/unit/helpers/html-to-markdown.test.ts | 66 +++++++++++++++++++ 4 files changed, 90 insertions(+), 23 deletions(-) create mode 100644 test/unit/helpers/html-to-markdown.test.ts diff --git a/src/checks/page-size/page-size-html.ts b/src/checks/page-size/page-size-html.ts index 65ea027..5af39be 100644 --- a/src/checks/page-size/page-size-html.ts +++ b/src/checks/page-size/page-size-html.ts @@ -107,6 +107,9 @@ async function check(ctx: CheckContext): Promise { const convertedSizes = successful.map((r) => r.convertedCharacters).sort((a, b) => a - b); const median = convertedSizes[Math.floor(convertedSizes.length / 2)]; const max = convertedSizes[convertedSizes.length - 1]; + const htmlSizes = successful.map((r) => r.htmlCharacters).sort((a, b) => a - b); + const medianHtml = htmlSizes[Math.floor(htmlSizes.length / 2)]; + const maxHtml = htmlSizes[htmlSizes.length - 1]; const avgRatio = Math.round( successful.reduce((sum, r) => sum + r.conversionRatio, 0) / successful.length, ); @@ -124,11 +127,11 @@ async function check(ctx: CheckContext): Promise { let message: string; if (overallStatus === 'pass') { - message = `All ${successful.length} ${pageLabel} convert under ${formatSize(passThreshold)} chars (median ${formatSize(median)}, ${avgRatio}% boilerplate)${suffix}`; + message = `All ${successful.length} ${pageLabel} under ${formatSize(passThreshold)} chars (median ${formatSize(medianHtml)} HTML → ${formatSize(median)} markdown (${avgRatio}% boilerplate))${suffix}`; } else if (overallStatus === 'warn') { - message = `${warnBucket} of ${successful.length} ${pageLabel} convert to ${formatSize(passThreshold)}–${formatSize(failThreshold)} chars (max ${formatSize(max)}, ${avgRatio}% boilerplate)${suffix}`; + message = `${warnBucket} of ${successful.length} ${pageLabel} convert to ${formatSize(passThreshold)}–${formatSize(failThreshold)} chars (max ${formatSize(maxHtml)} HTML → ${formatSize(max)} markdown (${avgRatio}% boilerplate))${suffix}`; } else { - message = `${failBucket} of ${successful.length} ${pageLabel} convert to over ${formatSize(failThreshold)} chars (max ${formatSize(max)}, ${avgRatio}% boilerplate)${suffix}`; + message = `${failBucket} of ${successful.length} ${pageLabel} convert to over ${formatSize(failThreshold)} chars (max ${formatSize(maxHtml)} HTML → ${formatSize(max)} markdown (${avgRatio}% boilerplate))${suffix}`; } return { diff --git a/src/helpers/html-to-markdown.ts b/src/helpers/html-to-markdown.ts index 53d5ca9..6d8b523 100644 --- a/src/helpers/html-to-markdown.ts +++ b/src/helpers/html-to-markdown.ts @@ -1,15 +1,13 @@ +import { parse } from 'node-html-parser'; import TurndownService from 'turndown'; import { tables } from 'turndown-plugin-gfm'; -/** - * Convert HTML to markdown using Turndown with default configuration. - * Matches real agent behavior per the Agent-Friendly Documentation Spec: - * no explicit

Tiny Content

`; + // Massive nav boilerplate before a tiny content section + const navLinks = Array.from( + { length: 100 }, + (_, i) => `
  • Navigation Link ${i}
  • `, + ).join(''); + const html = `

    Tiny Content

    A short paragraph of documentation.

    `; server.use( http.get( @@ -305,17 +305,17 @@ describe('content-start-position', () => { ), ); - // Page 2: massive CSS boilerplate before content (fail) - const cssRules = Array.from( - { length: 200 }, - (_, i) => `.c${i} { color: red; margin: ${i}px; }`, - ).join('\n'); + // Page 2: massive nav boilerplate before content (fail) + const navLinks = Array.from( + { length: 100 }, + (_, i) => `
  • Navigation Link ${i}
  • `, + ).join(''); server.use( http.get( 'http://test.local/docs/bad', () => new HttpResponse( - `

    Late Content

    `, + `

    Late Content

    A short paragraph.

    `, { status: 200, headers: { 'Content-Type': 'text/html' } }, ), ), diff --git a/test/unit/helpers/html-to-markdown.test.ts b/test/unit/helpers/html-to-markdown.test.ts new file mode 100644 index 0000000..8489603 --- /dev/null +++ b/test/unit/helpers/html-to-markdown.test.ts @@ -0,0 +1,66 @@ +import { describe, it, expect } from 'vitest'; +import { htmlToMarkdown } from '../../../src/helpers/html-to-markdown.js'; + +describe('htmlToMarkdown', () => { + it('converts basic HTML to markdown', () => { + const html = '

    Title

    Hello world.

    '; + const md = htmlToMarkdown(html); + expect(md).toContain('Title'); + expect(md).toContain('Hello world.'); + }); + + it('strips +

    Title

    + +

    Content.

    + `; + const md = htmlToMarkdown(html); + expect(md).toContain('Title'); + expect(md).toContain('Content.'); + expect(md).not.toContain('const x = 42'); + expect(md).not.toContain('console.log'); + expect(md).not.toContain('"key"'); + }); + + it('strips + +

    Title

    + +

    Content.

    + `; + const md = htmlToMarkdown(html); + expect(md).toContain('Title'); + expect(md).toContain('Content.'); + expect(md).not.toContain('color: red'); + expect(md).not.toContain('background: blue'); + expect(md).not.toContain('.nav'); + }); + + it('strips both +

    Documentation

    +

    This is the real content.

    + `; + const md = htmlToMarkdown(html); + expect(md).toContain('Documentation'); + expect(md).toContain('This is the real content.'); + expect(md).not.toContain('color: red'); + expect(md).not.toContain('document.getElementById'); + }); + + it('preserves HTML tables as markdown tables', () => { + const html = ` +
    NameValue
    foobar
    `; + const md = htmlToMarkdown(html); + expect(md).toContain('Name'); + expect(md).toContain('foo'); + expect(md).toContain('|'); + }); +}); From 9ca4499e0ab37aca4d4a0c58fef3aa4fd3f1095d Mon Sep 17 00:00:00 2001 From: dacharyc Date: Sat, 25 Apr 2026 12:40:02 -0400 Subject: [PATCH 02/13] Rename 'llms-txt-freshness' to 'llms-txt-coverage' --- SCORING.md | 6 +-- docs/agent-score-calculation.md | 8 ++-- docs/checks/content-discoverability.md | 2 +- docs/checks/index.md | 2 +- docs/checks/observability.md | 12 +++--- docs/generate_llms_txt | 2 +- docs/improve-your-score.md | 4 +- docs/interaction-diagnostics.md | 2 +- docs/public/llms.txt | 2 +- docs/run-locally.md | 2 +- scoring-reference.md | 8 ++-- src/checks/index.ts | 2 +- ...-txt-freshness.ts => llms-txt-coverage.ts} | 16 +++---- src/helpers/get-page-urls.ts | 2 +- src/scoring/coefficients.ts | 2 +- src/scoring/proportions.ts | 4 +- src/scoring/resolutions.ts | 2 +- src/scoring/tag-scores.ts | 2 +- src/scoring/weights.ts | 2 +- src/types.ts | 2 +- test/integration/check-pipeline.test.ts | 12 +++--- ...ness.test.ts => llms-txt-coverage.test.ts} | 42 +++++++++---------- test/unit/scoring/coefficients.test.ts | 2 +- test/unit/scoring/proportions.test.ts | 8 ++-- test/unit/scoring/resolutions.test.ts | 2 +- test/unit/scoring/score.test.ts | 4 +- 26 files changed, 77 insertions(+), 77 deletions(-) rename src/checks/observability/{llms-txt-freshness.ts => llms-txt-coverage.ts} (97%) rename test/unit/checks/{llms-txt-freshness.test.ts => llms-txt-coverage.test.ts} (96%) diff --git a/SCORING.md b/SCORING.md index 924d59c..9e57247 100644 --- a/SCORING.md +++ b/SCORING.md @@ -88,7 +88,7 @@ Whether agent-facing resources stay accurate over time. | Check | Weight | What it measures | | ---------------------------------------------------------------------------------- | ---------- | ---------------------------------------------------------------------------------------------------------- | -| [llms-txt-freshness](https://agentdocsspec.com/spec/#llms-txt-freshness) | Medium (4) | Whether your llms.txt reflects your current site. A stale index sends agents to outdated or missing pages. | +| [llms-txt-coverage](https://agentdocsspec.com/spec/#llms-txt-coverage) | Medium (4) | Whether your llms.txt reflects your current site. A stale index sends agents to outdated or missing pages. | | [markdown-content-parity](https://agentdocsspec.com/spec/#markdown-content-parity) | Medium (4) | Whether markdown and HTML versions of pages contain the same content. | | [cache-header-hygiene](https://agentdocsspec.com/spec/#cache-header-hygiene) | Low (2) | Whether cache lifetimes allow content updates to reach agents in a reasonable timeframe. | @@ -128,7 +128,7 @@ Not all warnings represent the same degree of degradation. A warning on `llms-tx | Coefficient | Meaning | Checks | | ----------- | ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **0.75** | Content substantively intact | `llms-txt-valid`, `content-negotiation`, `llms-txt-links-resolve`, `llms-txt-freshness`, `markdown-content-parity` | +| **0.75** | Content substantively intact | `llms-txt-valid`, `content-negotiation`, `llms-txt-links-resolve`, `llms-txt-coverage`, `markdown-content-parity` | | **0.60** | Partial coverage or platform-dependent | `llms-txt-directive`, `redirect-behavior` | | **0.50** | Genuine functional degradation | `llms-txt-exists`, `llms-txt-size`, `rendering-strategy`, `markdown-url-support`, `page-size-markdown`, `page-size-html`, `content-start-position`, `tabbed-content-serialization`, `section-header-quality`, `cache-header-hygiene`, `auth-gate-detection`, `auth-alternative-access` | | **0.25** | Actively steering agents to a worse path | `llms-txt-links-markdown` (markdown exists but llms.txt links to HTML; agents don't discover .md variants on their own) | @@ -237,7 +237,7 @@ If pages are SPA shells, measuring HTML quality is meaningless. This coefficient ### Index truncation coefficient -**Affects**: `llms-txt-links-resolve`, `llms-txt-valid`, `llms-txt-freshness`, `llms-txt-links-markdown` +**Affects**: `llms-txt-links-resolve`, `llms-txt-valid`, `llms-txt-coverage`, `llms-txt-links-markdown` If your llms.txt is truncated, agents only see part of the index. Measuring the quality of the invisible portion doesn't reflect agent experience. diff --git a/docs/agent-score-calculation.md b/docs/agent-score-calculation.md index bed91f0..640f4ab 100644 --- a/docs/agent-score-calculation.md +++ b/docs/agent-score-calculation.md @@ -74,7 +74,7 @@ Every check is assigned a weight tier based on its observed impact on agent work | Check | Weight | | ------------------------- | ---------- | -| `llms-txt-freshness` | Medium (4) | +| `llms-txt-coverage` | Medium (4) | | `markdown-content-parity` | Medium (4) | | `cache-header-hygiene` | Low (2) | @@ -123,7 +123,7 @@ These checks test a single site-wide resource and produce one pass, warn, or fai | `llms-txt-size` | Whether the llms.txt fits within agent context limits | | `llms-txt-links-resolve` | Whether links in the llms.txt return 200 | | `llms-txt-links-markdown` | Whether llms.txt links point to markdown content | -| `llms-txt-freshness` | Whether the llms.txt reflects the current site state | +| `llms-txt-coverage` | Whether the llms.txt reflects the current site state | Note that the llms.txt link checks (`llms-txt-links-resolve`, `llms-txt-links-markdown`) do test multiple URLs, but they test the links _within_ the llms.txt file rather than sampling pages from the site. Their result is a single pass/warn/fail based on the overall resolution or markdown rate. @@ -133,7 +133,7 @@ A warning is not a binary "half credit." Different warnings represent different | Coefficient | Meaning | Checks | | ----------- | ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **0.75** | Content substantively intact | `llms-txt-valid`, `content-negotiation`, `llms-txt-links-resolve`, `llms-txt-freshness`, `markdown-content-parity` | +| **0.75** | Content substantively intact | `llms-txt-valid`, `content-negotiation`, `llms-txt-links-resolve`, `llms-txt-coverage`, `markdown-content-parity` | | **0.60** | Partial coverage or platform-dependent | `llms-txt-directive`, `redirect-behavior` | | **0.50** | Genuine functional degradation | `llms-txt-exists`, `llms-txt-size`, `rendering-strategy`, `markdown-url-support`, `page-size-markdown`, `page-size-html`, `content-start-position`, `tabbed-content-serialization`, `section-header-quality`, `cache-header-hygiene`, `auth-gate-detection`, `auth-alternative-access` | | **0.25** | Actively steering agents to a worse path | `llms-txt-links-markdown` (markdown exists but llms.txt links to HTML) | @@ -184,7 +184,7 @@ If pages are SPA shells, measuring HTML quality is meaningless. This coefficient ### Index truncation coefficient -**Affects**: `llms-txt-links-resolve`, `llms-txt-valid`, `llms-txt-freshness`, `llms-txt-links-markdown` +**Affects**: `llms-txt-links-resolve`, `llms-txt-valid`, `llms-txt-coverage`, `llms-txt-links-markdown` If your llms.txt is truncated, agents only see part of the index. Measuring the quality of the invisible portion doesn't reflect real agent experience. diff --git a/docs/checks/content-discoverability.md b/docs/checks/content-discoverability.md index 85534e9..b2d2fc8 100644 --- a/docs/checks/content-discoverability.md +++ b/docs/checks/content-discoverability.md @@ -43,7 +43,7 @@ If your `llms.txt` lives at a location not covered by these candidates, AFDocs w ### Canonical selection -When more than one candidate returns a file (e.g. an apex `llms.txt` for the marketing site _and_ a `/docs/llms.txt` for the docs section), AFDocs picks one as **canonical**. The canonical file is the single source of truth for downstream checks: link sampling, size, validation, freshness, and link-resolution all operate on it alone. Other discovered files still appear in `details.discoveredFiles` for visibility, and `cache-header-hygiene` still verifies headers on every llms.txt found. +When more than one candidate returns a file (e.g. an apex `llms.txt` for the marketing site _and_ a `/docs/llms.txt` for the docs section), AFDocs picks one as **canonical**. The canonical file is the single source of truth for downstream checks: link sampling, size, validation, coverage, and link-resolution all operate on it alone. Other discovered files still appear in `details.discoveredFiles` for visibility, and `cache-header-hygiene` still verifies headers on every llms.txt found. The selection rule is _most-specific-to-the-baseUrl wins_. AFDocs picks the file whose directory is the longest prefix of the URL you passed. For example: diff --git a/docs/checks/index.md b/docs/checks/index.md index dd620e6..7af8b4b 100644 --- a/docs/checks/index.md +++ b/docs/checks/index.md @@ -33,7 +33,7 @@ Some checks depend on others. If a dependency doesn't pass, the dependent check - `page-size-markdown` requires `markdown-url-support` or `content-negotiation` - `section-header-quality` requires `tabbed-content-serialization` - `markdown-code-fence-validity` requires `markdown-url-support` or `content-negotiation` -- `llms-txt-freshness` requires `llms-txt-exists` +- `llms-txt-coverage` requires `llms-txt-exists` - `markdown-content-parity` requires `markdown-url-support` or `content-negotiation` - `auth-alternative-access` requires `auth-gate-detection` (warn or fail) diff --git a/docs/checks/observability.md b/docs/checks/observability.md index cdd513f..5784dde 100644 --- a/docs/checks/observability.md +++ b/docs/checks/observability.md @@ -2,15 +2,15 @@ Whether agent-facing resources stay accurate over time. Getting `llms.txt` and markdown support working is the hard part; keeping them working is a different problem. These checks catch the silent failures: a stale index, drifting content between formats, and cache headers that delay updates. -## llms-txt-freshness +## llms-txt-coverage Whether your `llms.txt` reflects the current state of your documentation site. -| | | -| -------------- | ------------------------------------------------------------------------ | -| **Weight** | Medium (4) | -| **Depends on** | `llms-txt-exists` | -| **Spec** | [llms-txt-freshness](https://agentdocsspec.com/spec/#llms-txt-freshness) | +| | | +| -------------- | ---------------------------------------------------------------------- | +| **Weight** | Medium (4) | +| **Depends on** | `llms-txt-exists` | +| **Spec** | [llms-txt-coverage](https://agentdocsspec.com/spec/#llms-txt-coverage) | ### Why it matters diff --git a/docs/generate_llms_txt b/docs/generate_llms_txt index 7295825..5a19db3 100755 --- a/docs/generate_llms_txt +++ b/docs/generate_llms_txt @@ -44,7 +44,7 @@ echo "- [Markdown Availability](${BASE_URL}/checks/markdown-availability.md): .m echo "- [Page Size](${BASE_URL}/checks/page-size.md): rendering strategy, HTML/markdown size, content start position" >> "$OUTPUT" echo "- [Content Structure](${BASE_URL}/checks/content-structure.md): tabbed content serialization, header quality, code fence validity" >> "$OUTPUT" echo "- [URL Stability](${BASE_URL}/checks/url-stability.md): HTTP status codes, redirect behavior" >> "$OUTPUT" -echo "- [Observability](${BASE_URL}/checks/observability.md): llms.txt freshness, markdown content parity, cache headers" >> "$OUTPUT" +echo "- [Observability](${BASE_URL}/checks/observability.md): llms.txt coverage, markdown content parity, cache headers" >> "$OUTPUT" echo "- [Authentication](${BASE_URL}/checks/authentication.md): auth gate detection, alternative access paths" >> "$OUTPUT" cat >> "$OUTPUT" << 'SECTION' diff --git a/docs/improve-your-score.md b/docs/improve-your-score.md index 85e706b..05d68cd 100644 --- a/docs/improve-your-score.md +++ b/docs/improve-your-score.md @@ -67,7 +67,7 @@ Not all fixes are equal. Here are the highest-impact changes, ordered by the sco If `llms-txt-exists` fails, create an `llms.txt` at your site root listing your documentation pages with markdown links. See the [llms.txt specification](https://llmstxt.org/) for the format. -This also unblocks five dependent checks (`llms-txt-valid`, `llms-txt-size`, `llms-txt-links-resolve`, `llms-txt-links-markdown`, `llms-txt-freshness`) that are currently skipped. +This also unblocks five dependent checks (`llms-txt-valid`, `llms-txt-size`, `llms-txt-links-resolve`, `llms-txt-links-markdown`, `llms-txt-coverage`) that are currently skipped. **Enable server-side rendering** @@ -112,7 +112,7 @@ These are worth addressing but won't move the score as dramatically: - **Tabbed content** (`tabbed-content-serialization`): If tabbed UI components create oversized output, consider restructuring into separate pages or using query params to retrieve only specific tab versions. - **Code fence validity** (`markdown-code-fence-validity`): Fix unclosed code fences in your markdown sources. - **Redirect behavior** (`redirect-behavior`): Replace JavaScript and cross-host redirects with standard HTTP redirects. -- **llms.txt freshness** (`llms-txt-freshness`): Generate llms.txt at build time to keep it in sync with your site. +- **llms.txt coverage** (`llms-txt-coverage`): Generate llms.txt at build time to keep it in sync with your site. - **Content parity** (`markdown-content-parity`): Ensure markdown and HTML versions of pages contain the same content. - **llms.txt validity** (`llms-txt-valid`): Follow the [llmstxt.org](https://llmstxt.org/) structure. diff --git a/docs/interaction-diagnostics.md b/docs/interaction-diagnostics.md index 91ae2bc..10a6227 100644 --- a/docs/interaction-diagnostics.md +++ b/docs/interaction-diagnostics.md @@ -22,7 +22,7 @@ These diagnostics appear in the "Interaction Diagnostics" section of the `--form **What to do**: Split into a root llms.txt that links to section-level llms.txt files, each under 50,000 characters. The [llms-txt-size check](/checks/content-discoverability#llms-txt-size) details the thresholds. -**Score impact**: The index truncation coefficient scales down `llms-txt-links-resolve`, `llms-txt-valid`, `llms-txt-freshness`, and `llms-txt-links-markdown` proportionally. A file that's twice the limit counts those checks at roughly half weight. +**Score impact**: The index truncation coefficient scales down `llms-txt-links-resolve`, `llms-txt-valid`, `llms-txt-coverage`, and `llms-txt-links-markdown` proportionally. A file that's twice the limit counts those checks at roughly half weight. ## SPA shells invalidate HTML path diff --git a/docs/public/llms.txt b/docs/public/llms.txt index 5887f8a..181ad1c 100644 --- a/docs/public/llms.txt +++ b/docs/public/llms.txt @@ -24,7 +24,7 @@ - [Page Size](https://afdocs.dev/checks/page-size.md): rendering strategy, HTML/markdown size, content start position - [Content Structure](https://afdocs.dev/checks/content-structure.md): tabbed content serialization, header quality, code fence validity - [URL Stability](https://afdocs.dev/checks/url-stability.md): HTTP status codes, redirect behavior -- [Observability](https://afdocs.dev/checks/observability.md): llms.txt freshness, markdown content parity, cache headers +- [Observability](https://afdocs.dev/checks/observability.md): llms.txt coverage, markdown content parity, cache headers - [Authentication](https://afdocs.dev/checks/authentication.md): auth gate detection, alternative access paths ## API Reference diff --git a/docs/run-locally.md b/docs/run-locally.md index 61883b6..5b331bf 100644 --- a/docs/run-locally.md +++ b/docs/run-locally.md @@ -85,7 +85,7 @@ Some checks may behave differently against a local server: ## Production URLs in local builds -When you build your site locally, generated files like `llms.txt` and `sitemap.xml` typically contain your production domain. AFDocs sees URLs pointing to `https://docs.example.com` but you're testing `http://localhost:3000`, so origin comparisons fail and checks like `llms-txt-freshness` report 0% coverage. +When you build your site locally, generated files like `llms.txt` and `sitemap.xml` typically contain your production domain. AFDocs sees URLs pointing to `https://docs.example.com` but you're testing `http://localhost:3000`, so origin comparisons fail and checks like `llms-txt-coverage` report 0% coverage. Use `--canonical-origin` to tell AFDocs which production domain to rewrite: diff --git a/scoring-reference.md b/scoring-reference.md index 3e26294..1618daa 100644 --- a/scoring-reference.md +++ b/scoring-reference.md @@ -61,7 +61,7 @@ and the empirical evidence sections in each check definition. | `content-start-position` | Medium | 4 | Boilerplate preamble on HTML path wastes truncation budget. | | `tabbed-content-serialization` | Medium | 4 | Tabbed content can be catastrophic but only affects pages that use it. | | `markdown-code-fence-validity` | Medium | 4 | Unclosed fences corrupt all content after the break point. | -| `llms-txt-freshness` | Medium | 4 | Stale index is a slow failure mode; broken links catch the acute version. | +| `llms-txt-coverage` | Medium | 4 | Stale index is a slow failure mode; broken links catch the acute version. | | `markdown-content-parity` | Medium | 4 | Content drift between markdown and HTML leaves agents with outdated info. | | `auth-alternative-access` | Medium | 4 | Partial mitigation for auth-gated sites. | | `redirect-behavior` | Medium | 4 | Cross-host redirects are a known friction point for some agents. | @@ -108,7 +108,7 @@ Each check has a specific warn coefficient rather than a uniform default. | `llms-txt-valid` | 0.75 | Non-standard structure, but links are parseable. Missing a blockquote doesn't prevent navigation. | | `content-negotiation` | 0.75 | Agent gets the markdown content; wrong Content-Type may prevent optimizations but the content itself is correct. | | `llms-txt-links-resolve` | 0.75 | >90% of links work. A few broken links is a maintenance issue, not a structural one. | -| `llms-txt-freshness` | 0.75 | 80-95% of pages covered. Most of the site is represented in the index. | +| `llms-txt-coverage` | 0.75 | 80-95% of pages covered. Most of the site is represented in the index. | | `markdown-content-parity` | 0.75 | Minor formatting differences, not substantive content drift. | | **0.60: Partial coverage or platform-dependent** | | | | `llms-txt-directive` | 0.60 | Present on some pages but not others. Agents that land on covered pages benefit; others get no guidance. | @@ -172,7 +172,7 @@ Single-resource checks (no proportional scoring needed): | `llms-txt-size` | Per-file average (see note below) | | `llms-txt-links-resolve` | Uses resolve rate directly from details (`resolveRate` field) | | `llms-txt-links-markdown` | Percentage-based status | -| `llms-txt-freshness` | Coverage percentage | +| `llms-txt-coverage` | Coverage percentage | | `auth-alternative-access` | Binary: alternative path exists or doesn't | For `llms-txt-links-resolve`, the `resolveRate` field in details (a 0-1 float) @@ -317,7 +317,7 @@ of the HTML path as a whole). ### Index Truncation Coefficient **Applies to**: `llms-txt-links-resolve`, `llms-txt-valid`, -`llms-txt-freshness`, `llms-txt-links-markdown` +`llms-txt-coverage`, `llms-txt-links-markdown` If `llms-txt-size` fails, agents only see a fraction of the index. The quality of the invisible portion doesn't affect agent experience. diff --git a/src/checks/index.ts b/src/checks/index.ts index 6f9ec55..342b0c3 100644 --- a/src/checks/index.ts +++ b/src/checks/index.ts @@ -28,7 +28,7 @@ import './url-stability/http-status-codes.js'; import './url-stability/redirect-behavior.js'; // Category 6: Observability -import './observability/llms-txt-freshness.js'; +import './observability/llms-txt-coverage.js'; import './observability/markdown-content-parity.js'; import './observability/cache-header-hygiene.js'; diff --git a/src/checks/observability/llms-txt-freshness.ts b/src/checks/observability/llms-txt-coverage.ts similarity index 97% rename from src/checks/observability/llms-txt-freshness.ts rename to src/checks/observability/llms-txt-coverage.ts index 0f46574..30dddb8 100644 --- a/src/checks/observability/llms-txt-freshness.ts +++ b/src/checks/observability/llms-txt-coverage.ts @@ -202,13 +202,13 @@ const COVERAGE_PASS = 0.95; const COVERAGE_WARN = 0.8; /** - * Maximum sitemap URLs to collect for freshness comparison. + * Maximum sitemap URLs to collect for coverage comparison. * Higher than the default MAX_SITEMAP_URLS (500) used for page sampling, - * because freshness needs the full sitemap to produce meaningful coverage + * because coverage needs the full sitemap to produce meaningful coverage * percentages. Enterprise docs sites (Stripe, MongoDB) can have thousands * of pages. */ -const MAX_FRESHNESS_SITEMAP_URLS = 50_000; +const MAX_COVERAGE_SITEMAP_URLS = 50_000; /** * Try to fetch a docs-specific sitemap at {baseUrl}/sitemap.xml. @@ -271,7 +271,7 @@ function scopeUrls(urls: string[], origin: string, baseUrlPath: string): string[ } async function check(ctx: CheckContext): Promise { - const id = 'llms-txt-freshness'; + const id = 'llms-txt-coverage'; const category = 'observability'; // 1. Get llms.txt page URLs (with progressive disclosure walking) @@ -291,7 +291,7 @@ async function check(ctx: CheckContext): Promise { const effectiveOrigin = ctx.effectiveOrigin ?? ctx.origin; const sitemapWarnings: string[] = []; let sitemapUrls = await getUrlsFromSitemap(ctx, sitemapWarnings, { - maxUrls: MAX_FRESHNESS_SITEMAP_URLS, + maxUrls: MAX_COVERAGE_SITEMAP_URLS, originOverride: effectiveOrigin, skipRefinement: true, }); @@ -317,7 +317,7 @@ async function check(ctx: CheckContext): Promise { category, status: 'skip', message: - 'No sitemap found; cannot assess llms.txt freshness without a sitemap as ground truth', + 'No sitemap found; cannot assess llms.txt coverage without a sitemap as ground truth', details: { sitemapWarnings }, }; } @@ -478,9 +478,9 @@ async function check(ctx: CheckContext): Promise { } registerCheck({ - id: 'llms-txt-freshness', + id: 'llms-txt-coverage', category: 'observability', - description: 'Whether llms.txt reflects the current state of the site', + description: 'How much of the site is represented in llms.txt', dependsOn: ['llms-txt-exists'], run: check, }); diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts index 09d5737..ba74302 100644 --- a/src/helpers/get-page-urls.ts +++ b/src/helpers/get-page-urls.ts @@ -625,7 +625,7 @@ export interface SitemapOptions { maxUrls?: number; originOverride?: string; pathFilterBase?: string; - /** Skip URL-level locale/version refinement. Use when the caller needs raw URLs (e.g. freshness coverage). */ + /** Skip URL-level locale/version refinement. Use when the caller needs raw URLs (e.g. coverage check). */ skipRefinement?: boolean; } diff --git a/src/scoring/coefficients.ts b/src/scoring/coefficients.ts index f7c6889..6ed4a9c 100644 --- a/src/scoring/coefficients.ts +++ b/src/scoring/coefficients.ts @@ -88,7 +88,7 @@ function getHtmlPathCoefficient(results: Map): number { const INDEX_TRUNCATION_CHECKS = new Set([ 'llms-txt-links-resolve', 'llms-txt-valid', - 'llms-txt-freshness', + 'llms-txt-coverage', 'llms-txt-links-markdown', ]); diff --git a/src/scoring/proportions.ts b/src/scoring/proportions.ts index 0a69afb..5dd4d17 100644 --- a/src/scoring/proportions.ts +++ b/src/scoring/proportions.ts @@ -91,7 +91,7 @@ const PROPORTION_EXTRACTORS: Record = { 'llms-txt-links-markdown': llmsTxtLinksMarkdownExtractor, // --- Percentage-based single-value checks --- - 'llms-txt-freshness': llmsTxtFreshnessExtractor, + 'llms-txt-coverage': llmsTxtCoverageExtractor, }; // --------------------------------------------------------------------------- @@ -450,7 +450,7 @@ function llmsTxtLinksMarkdownExtractor(result: CheckResult): ProportionResult | }; } -function llmsTxtFreshnessExtractor(result: CheckResult): ProportionResult | undefined { +function llmsTxtCoverageExtractor(result: CheckResult): ProportionResult | undefined { const d = result.details; if (!d) return undefined; diff --git a/src/scoring/resolutions.ts b/src/scoring/resolutions.ts index d1b2a98..254ce75 100644 --- a/src/scoring/resolutions.ts +++ b/src/scoring/resolutions.ts @@ -272,7 +272,7 @@ const RESOLUTION_TEMPLATES: Record = { }, }, - 'llms-txt-freshness': { + 'llms-txt-coverage': { warn: (d) => { const missing = (d.missingCount as number) ?? 0; return ( diff --git a/src/scoring/tag-scores.ts b/src/scoring/tag-scores.ts index 86cd71a..f2601c6 100644 --- a/src/scoring/tag-scores.ts +++ b/src/scoring/tag-scores.ts @@ -119,7 +119,7 @@ const SINGLE_RESOURCE_CHECKS = new Set([ 'llms-txt-size', 'llms-txt-links-resolve', 'llms-txt-links-markdown', - 'llms-txt-freshness', + 'llms-txt-coverage', ]); /** diff --git a/src/scoring/weights.ts b/src/scoring/weights.ts index c139262..4e61bed 100644 --- a/src/scoring/weights.ts +++ b/src/scoring/weights.ts @@ -45,7 +45,7 @@ export const CHECK_WEIGHTS: Record = { 'content-start-position': w('medium', 0.5), 'tabbed-content-serialization': w('medium', 0.5), 'markdown-code-fence-validity': w('medium'), - 'llms-txt-freshness': w('medium', 0.75), + 'llms-txt-coverage': w('medium', 0.75), 'markdown-content-parity': w('medium', 0.75), 'auth-alternative-access': w('medium', 0.5), 'redirect-behavior': w('medium', 0.6), diff --git a/src/types.ts b/src/types.ts index ed03263..1d77fb5 100644 --- a/src/types.ts +++ b/src/types.ts @@ -35,7 +35,7 @@ export interface CheckContext { /** * The actual origin where content lives, when the baseUrl origin redirects * cross-host. Set by llms-txt-exists when it detects a cross-host redirect. - * Checks that need ground-truth data (e.g. sitemap for freshness) should + * Checks that need ground-truth data (e.g. sitemap for coverage) should * use this over `origin`; checks that test agent experience should use `origin`. */ effectiveOrigin?: string; diff --git a/test/integration/check-pipeline.test.ts b/test/integration/check-pipeline.test.ts index 12bd04c..7beb6db 100644 --- a/test/integration/check-pipeline.test.ts +++ b/test/integration/check-pipeline.test.ts @@ -1182,7 +1182,7 @@ describe('check pipeline: canonical llms.txt selection', () => { }); describe('check pipeline: effectiveOrigin propagation', () => { - it('llms-txt-exists sets effectiveOrigin which llms-txt-freshness uses', async () => { + it('llms-txt-exists sets effectiveOrigin which llms-txt-coverage uses', async () => { // llms.txt redirects cross-host; sitemap lives at the redirected host const redirectedHost = 'pipe-effective-docs.local'; const llmsContent = `# Docs\n## Links\n- [Guide](http://${redirectedHost}/docs/guide): Guide\n`; @@ -1224,18 +1224,18 @@ describe('check pipeline: effectiveOrigin propagation', () => { ); const report = await runChecks('http://pipe-effective.local', { - checkIds: ['llms-txt-exists', 'llms-txt-freshness'], + checkIds: ['llms-txt-exists', 'llms-txt-coverage'], requestDelay: 0, }); const existsResult = report.results.find((r) => r.id === 'llms-txt-exists')!; - const freshnessResult = report.results.find((r) => r.id === 'llms-txt-freshness')!; + const coverageResult = report.results.find((r) => r.id === 'llms-txt-coverage')!; // Cross-host redirect produces 'warn' (agents may not follow it) expect(existsResult.status).toBe('warn'); - // Freshness should not skip — it should use the effectiveOrigin to find the sitemap + // Coverage should not skip — it should use the effectiveOrigin to find the sitemap // at the redirected host and match URLs there - expect(freshnessResult.status).not.toBe('skip'); - expect(freshnessResult.message).not.toContain('No sitemap found'); + expect(coverageResult.status).not.toBe('skip'); + expect(coverageResult.message).not.toContain('No sitemap found'); }); }); diff --git a/test/unit/checks/llms-txt-freshness.test.ts b/test/unit/checks/llms-txt-coverage.test.ts similarity index 96% rename from test/unit/checks/llms-txt-freshness.test.ts rename to test/unit/checks/llms-txt-coverage.test.ts index b874b63..e7082d6 100644 --- a/test/unit/checks/llms-txt-freshness.test.ts +++ b/test/unit/checks/llms-txt-coverage.test.ts @@ -7,7 +7,7 @@ import type { DiscoveredFile } from '../../../src/types.js'; import { hasLocaleCodeAt, filterToUnprefixedLocale, -} from '../../../src/checks/observability/llms-txt-freshness.js'; +} from '../../../src/checks/observability/llms-txt-coverage.js'; const server = setupServer(); @@ -16,7 +16,7 @@ beforeAll(() => { return () => server.close(); }); -const check = getCheck('llms-txt-freshness'); +const check = getCheck('llms-txt-coverage'); /** * Build a minimal llms.txt content string from an array of URLs. @@ -61,9 +61,9 @@ function makeCtx(host: string, llmsTxtUrls: string[], basePath = '') { return ctx; } -describe('llms-txt-freshness', () => { +describe('llms-txt-coverage', () => { test('passes when llms.txt fully covers sitemap', async () => { - const host = 'fresh-pass.local'; + const host = 'cov-pass.local'; const pages = [ `http://${host}/docs/getting-started`, `http://${host}/docs/api-reference`, @@ -94,7 +94,7 @@ describe('llms-txt-freshness', () => { }); test('passes when llms.txt uses .md URLs matching sitemap HTML URLs', async () => { - const host = 'fresh-md.local'; + const host = 'cov-md.local'; const llmsUrls = [ `http://${host}/docs/getting-started.md`, `http://${host}/docs/api-reference.md`, @@ -127,7 +127,7 @@ describe('llms-txt-freshness', () => { }); test('passes with trailing slash differences', async () => { - const host = 'fresh-slash.local'; + const host = 'cov-slash.local'; const llmsUrls = [`http://${host}/docs/guide`]; const sitemapUrls = [`http://${host}/docs/guide/`]; @@ -154,7 +154,7 @@ describe('llms-txt-freshness', () => { }); test('warns when coverage is between 80% and 95%', async () => { - const host = 'fresh-warn.local'; + const host = 'cov-warn.local'; // llms.txt has 9 of 10 pages (90% coverage) const allPages = Array.from({ length: 10 }, (_, i) => `http://${host}/docs/page-${i}`); const llmsPages = allPages.slice(0, 9); @@ -183,7 +183,7 @@ describe('llms-txt-freshness', () => { }); test('fails when coverage is below 80%', async () => { - const host = 'fresh-fail.local'; + const host = 'cov-fail.local'; // llms.txt has 5 of 10 pages (50% coverage) const allPages = Array.from({ length: 10 }, (_, i) => `http://${host}/docs/page-${i}`); const llmsPages = allPages.slice(0, 5); @@ -212,7 +212,7 @@ describe('llms-txt-freshness', () => { }); test('reports unmatched llms.txt links not in sitemap', async () => { - const host = 'fresh-unmatched.local'; + const host = 'cov-unmatched.local'; const sitemapPages = Array.from({ length: 10 }, (_, i) => `http://${host}/docs/page-${i}`); // llms.txt has all sitemap pages plus 3 extras not in sitemap const llmsPages = [ @@ -247,7 +247,7 @@ describe('llms-txt-freshness', () => { }); test('unmatched links do not affect overall status', async () => { - const host = 'fresh-unmatched-pass.local'; + const host = 'cov-unmatched-pass.local'; // Coverage is fine (100%) but many unmatched llms.txt links const sitemapPages = Array.from({ length: 5 }, (_, i) => `http://${host}/docs/page-${i}`); const llmsPages = [ @@ -283,7 +283,7 @@ describe('llms-txt-freshness', () => { }); test('skips when no sitemap is available', async () => { - const host = 'fresh-no-sitemap.local'; + const host = 'cov-no-sitemap.local'; const ctx = makeCtx(host, [`http://${host}/docs/page`], '/docs'); server.use( @@ -302,7 +302,7 @@ describe('llms-txt-freshness', () => { }); test('skips when no page URLs in llms.txt', async () => { - const host = 'fresh-no-pages.local'; + const host = 'cov-no-pages.local'; const ctx = createContext(`http://${host}/docs`, { requestDelay: 0 }); ctx.previousResults.set('llms-txt-exists', { id: 'llms-txt-exists', @@ -327,7 +327,7 @@ describe('llms-txt-freshness', () => { }); test('scopes sitemap URLs to baseUrl path prefix', async () => { - const host = 'fresh-scope.local'; + const host = 'cov-scope.local'; const docPages = [`http://${host}/docs/guide`, `http://${host}/docs/api`]; const allSitemapPages = [ ...docPages, @@ -361,7 +361,7 @@ describe('llms-txt-freshness', () => { }); test('excludes blog/changelog/pricing paths from sitemap comparison', async () => { - const host = 'fresh-exclude.local'; + const host = 'cov-exclude.local'; const docPages = [`http://${host}/guide`]; const sitemapPages = [ `http://${host}/guide`, @@ -397,7 +397,7 @@ describe('llms-txt-freshness', () => { }); test('handles index.md normalization', async () => { - const host = 'fresh-index.local'; + const host = 'cov-index.local'; const llmsUrls = [`http://${host}/docs/guide/index.md`]; const sitemapUrls = [`http://${host}/docs/guide/`]; @@ -424,7 +424,7 @@ describe('llms-txt-freshness', () => { }); test('skips when sitemap has no URLs under docs path prefix', async () => { - const host = 'fresh-no-scope.local'; + const host = 'cov-no-scope.local'; const ctx = makeCtx(host, [`http://${host}/docs/page`], '/docs'); const sitemapPages = [`http://${host}/marketing/page1`, `http://${host}/marketing/page2`]; @@ -450,7 +450,7 @@ describe('llms-txt-freshness', () => { }); test('does not count cross-origin llms.txt URLs as unmatched', async () => { - const host = 'fresh-cross.local'; + const host = 'cov-cross.local'; const sitemapPages = [`http://${host}/docs/page`]; // llms.txt links to a page on a different host — should not be flagged const llmsPages = [`http://${host}/docs/page`, `http://other-host.local/docs/external`]; @@ -478,7 +478,7 @@ describe('llms-txt-freshness', () => { }); test('falls back to docs-specific sitemap when main sitemap has no docs URLs', async () => { - const host = 'fresh-docs-sitemap.local'; + const host = 'cov-docs-sitemap.local'; const docPages = [`http://${host}/docs/guide`, `http://${host}/docs/api`]; const marketingPages = [`http://${host}/about`, `http://${host}/pricing`]; @@ -517,7 +517,7 @@ describe('llms-txt-freshness', () => { }); test('follows docs-specific sitemap index one level deep', async () => { - const host = 'fresh-docs-index.local'; + const host = 'cov-docs-index.local'; const docPages = [ `http://${host}/docs/guide`, `http://${host}/docs/api`, @@ -557,7 +557,7 @@ describe('llms-txt-freshness', () => { expect(result.status).toBe('pass'); expect(result.details?.sitemapDocPages).toBe(3); // getUrlsFromSitemap now discovers the docs sitemap via subpath fallback, - // so the freshness check's own fetchDocsSitemap fallback doesn't fire. + // so the coverage check's own fetchDocsSitemap fallback doesn't fire. expect(result.details?.sitemapSource).toBe('robots.txt/sitemap.xml'); }); @@ -599,7 +599,7 @@ describe('llms-txt-freshness', () => { const result = await check.run(ctx); expect(result.status).toBe('pass'); - // Locale filtering now happens inside getUrlsFromSitemap, so the freshness + // Locale filtering now happens inside getUrlsFromSitemap, so the coverage // check receives only English URLs and its own locale detection is a no-op. expect(result.details?.sitemapDocPages).toBe(3); }); diff --git a/test/unit/scoring/coefficients.test.ts b/test/unit/scoring/coefficients.test.ts index 8ab3bb7..2364fc0 100644 --- a/test/unit/scoring/coefficients.test.ts +++ b/test/unit/scoring/coefficients.test.ts @@ -122,7 +122,7 @@ describe('coefficients', () => { const affectedChecks = [ 'llms-txt-links-resolve', 'llms-txt-valid', - 'llms-txt-freshness', + 'llms-txt-coverage', 'llms-txt-links-markdown', ]; diff --git a/test/unit/scoring/proportions.test.ts b/test/unit/scoring/proportions.test.ts index e1ea26a..061a70d 100644 --- a/test/unit/scoring/proportions.test.ts +++ b/test/unit/scoring/proportions.test.ts @@ -353,9 +353,9 @@ describe('proportions', () => { expect(result!.proportion).toBe(0.3); }); - it('llms-txt-freshness: uses coverageRate', () => { + it('llms-txt-coverage: uses coverageRate', () => { const result = getCheckProportion( - makeResult('llms-txt-freshness', 'warn', { + makeResult('llms-txt-coverage', 'warn', { coverageRate: 88, }), makeWeight(4, 0.75), @@ -386,9 +386,9 @@ describe('proportions', () => { expect(result!.proportion).toBe(0.0); }); - it('llms-txt-freshness: falls back when no coverageRate', () => { + it('llms-txt-coverage: falls back when no coverageRate', () => { const result = getCheckProportion( - makeResult('llms-txt-freshness', 'warn', {}), + makeResult('llms-txt-coverage', 'warn', {}), makeWeight(4, 0.75), ); expect(result!.proportion).toBe(0.75); diff --git a/test/unit/scoring/resolutions.test.ts b/test/unit/scoring/resolutions.test.ts index f8177fd..f848ea6 100644 --- a/test/unit/scoring/resolutions.test.ts +++ b/test/unit/scoring/resolutions.test.ts @@ -82,7 +82,7 @@ describe('resolutions', () => { 'markdown-code-fence-validity', 'http-status-codes', 'redirect-behavior', - 'llms-txt-freshness', + 'llms-txt-coverage', 'markdown-content-parity', 'cache-header-hygiene', 'auth-gate-detection', diff --git a/test/unit/scoring/score.test.ts b/test/unit/scoring/score.test.ts index af4ea6a..3f83740 100644 --- a/test/unit/scoring/score.test.ts +++ b/test/unit/scoring/score.test.ts @@ -65,7 +65,7 @@ describe('computeScore', () => { makeResult('markdown-code-fence-validity', 'content-structure', 'pass'), makeResult('http-status-codes', 'url-stability', 'pass'), makeResult('redirect-behavior', 'url-stability', 'pass'), - makeResult('llms-txt-freshness', 'observability', 'pass'), + makeResult('llms-txt-coverage', 'observability', 'pass'), makeResult('markdown-content-parity', 'observability', 'pass'), makeResult('cache-header-hygiene', 'observability', 'pass'), makeResult('auth-gate-detection', 'authentication', 'pass'), @@ -385,7 +385,7 @@ describe('computeScore', () => { makeResult('redirect-behavior', 'url-stability', 'pass'), // Observability - makeResult('llms-txt-freshness', 'observability', 'pass'), + makeResult('llms-txt-coverage', 'observability', 'pass'), makeResult('cache-header-hygiene', 'observability', 'pass'), // No auth issues From ce698fc923f328699235fad02291ea080f375a20 Mon Sep 17 00:00:00 2001 From: dacharyc Date: Sat, 25 Apr 2026 13:54:41 -0400 Subject: [PATCH 03/13] Omit subtrees from coverage check, support curation and exclusion configuration --- docs/checks/observability.md | 80 ++++- docs/reference/cli.md | 16 + docs/reference/config-file.md | 36 +- package-lock.json | 16 +- package.json | 2 + src/checks/observability/llms-txt-coverage.ts | 125 ++++++- src/cli/commands/check.ts | 26 ++ src/constants.ts | 6 + src/helpers/get-page-urls.ts | 35 +- src/scoring/resolutions.ts | 11 +- src/types.ts | 6 + test/unit/checks/llms-txt-coverage.test.ts | 324 +++++++++++++++++- 12 files changed, 621 insertions(+), 62 deletions(-) diff --git a/docs/checks/observability.md b/docs/checks/observability.md index 5784dde..7c0a240 100644 --- a/docs/checks/observability.md +++ b/docs/checks/observability.md @@ -4,7 +4,7 @@ Whether agent-facing resources stay accurate over time. Getting `llms.txt` and m ## llms-txt-coverage -Whether your `llms.txt` reflects the current state of your documentation site. +How much of your site's documentation is represented in `llms.txt`. | | | | -------------- | ---------------------------------------------------------------------- | @@ -14,21 +14,83 @@ Whether your `llms.txt` reflects the current state of your documentation site. ### Why it matters -An `llms.txt` that was accurate at launch but never updated is a silent failure. New pages won't appear in the index, deleted pages send agents to 404s, and renamed pages produce redirect chains. Unlike `llms-txt-links-resolve` (which catches broken links), this check catches missing coverage: pages that exist on your site but aren't listed in `llms.txt`. +Pages missing from `llms.txt` are effectively invisible to agents that rely on it for discovery. Unlike `llms-txt-links-resolve` (which catches broken links to pages that are listed), this check catches the opposite problem: pages that exist on your site but aren't listed at all. Not every gap is a problem; many sites intentionally curate their `llms.txt`. The check makes coverage visible so you can confirm it reflects your intent. ### Results -Based on coverage of your site's documentation pages (excluding non-docs pages like blog posts, pricing, login): +Based on coverage of your site's documentation pages, after excluding non-doc pages (see [built-in exclusions](#built-in-exclusions) below). Thresholds are configurable. -| Result | Condition | -| ------ | --------------------------------------------------------------- | -| Pass | `llms.txt` covers 95% or more of the site's documentation pages | -| Warn | 80-95% coverage (some live pages missing from the index) | -| Fail | Under 80% coverage (missing large documentation sections) | +| Result | Condition | +| ------ | ------------------------------------------------------------------------ | +| Pass | `llms.txt` covers >= pass threshold (default 95%) of documentation pages | +| Warn | Coverage between warn and pass thresholds (default 80-95%) | +| Fail | Coverage below warn threshold (default < 80%) | + +### Configuring coverage + +The check supports three use cases through configurable thresholds and exclusion patterns: + +- **Full parity** (default): The site intends `llms.txt` to mirror the sitemap. Default thresholds (95/80) apply. +- **Curated**: The site intentionally includes only a subset. Set thresholds to 0 (`--coverage-pass-threshold 0 --coverage-warn-threshold 0`) to make the check informational. It still reports coverage percentage and missing pages, but does not warn or fail. +- **Hybrid**: Strict coverage with known exclusions. Use `--coverage-exclusions` to remove intentional gaps from the denominator; the check holds remaining pages to default or custom thresholds. + +**CLI flags:** + +- `--coverage-pass-threshold ` — Pass threshold (0-100, default 95) +- `--coverage-warn-threshold ` — Warn threshold (0-100, default 80) +- `--coverage-exclusions ` — Comma-separated glob patterns to exclude from the sitemap before calculating coverage (e.g. `"/docs/reference/**,/docs/changelog/**"`) + +These can also be set in `agent-docs.config.yml` under `options`: + +```yaml +options: + coveragePassThreshold: 80 + coverageWarnThreshold: 50 + coverageExclusions: + - /docs/reference/** + - /docs/changelog/** + - '**/release-notes/**' # quote patterns starting with * +``` ### How to fix -**If this check warns or fails**, regenerate `llms.txt` from your sitemap or build pipeline. The best long-term fix is generating `llms.txt` at build time, so every deployment automatically includes an up-to-date index. Run with `--verbose` to see which pages are missing. +**If this check warns or fails**, regenerate `llms.txt` from your sitemap or build pipeline. The best long-term fix is generating `llms.txt` at build time, so every deployment automatically includes an up-to-date index. Run with `--verbose` to see which pages are missing. If the missing pages are intentionally excluded, use `--coverage-exclusions` or adjust thresholds. + +### Built-in exclusions + +Before calculating coverage, the check removes sitemap URLs whose paths match common non-documentation patterns. These pages appear in sitemaps but aren't meaningful to include in an `llms.txt` index. The excluded count is reported as `excludedNonDocPages` in the check details. + +The tool provides these built-in exclusions (matched at both root and relative to the base URL path): + +`/blog`, `/pricing`, `/about`, `/career`, `/careers`, `/job`, `/jobs`, `/contact`, `/legal`, `/privacy`, `/terms`, `/login`, `/signup`, `/sign-up`, `/sign-in`, `/register`, `/404`, `/500` + +For example, if your base URL is `https://example.com/docs`, both `/blog/post-1` and `/docs/blog/post-1` would be excluded. + +These are not configurable. If a built-in exclusion is removing pages you want counted, the page is likely at a path that conventionally indicates non-doc content. If you believe a pattern is wrong, please [open an issue](https://github.com/agent-ecosystem/afdocs/issues). + +Paths like `/changelog`, `/releases`, and `/security` are **not** excluded because many documentation sites intentionally include this content in their `llms.txt`. If you want to exclude them, use `--coverage-exclusions`. + +### Omitted subtrees + +When your `llms.txt` uses [progressive disclosure](https://agentdocsspec.com/spec/#progressive-disclosure-for-large-documentation-sets) (nested `llms.txt` files), the walker descends one level into linked `.txt` files. Any `.txt` files found at that depth (which the walker does not descend into) are treated as "omitted subtrees." Sitemap pages under those subtree prefixes are excluded from the coverage denominator rather than counted as missing. + +This means deeply nested `llms.txt` structures aren't penalized. The output distinguishes directly-verified pages from omitted subtrees. + +**Why not walk recursively?** A recursive walk would fetch every nested `.txt` file before any checks run. For a site like Alchemy, that's ~86 aggregate files across three levels. For a multi-product site like Microsoft Learn, it could be hundreds. A safety cap (e.g. 200 files) would silently truncate results, producing incomplete coverage numbers with no indication they're partial. Keeping the walker at depth 1 makes the HTTP footprint predictable, makes the runs more performant, and makes the results reproducible. + +**Run per-product for deeper visibility.** Organizations with large multi-product sites typically run `afdocs` at the per-product level, which gives full coverage visibility into each section without the cost of walking the entire tree: + +```bash +# Instead of walking the entire site's progressive disclosure tree: +afdocs check https://example.com/docs + +# Run per-product for deeper coverage: +afdocs check https://example.com/docs/chains/ethereum +afdocs check https://example.com/docs/chains/solana +afdocs check https://example.com/docs/sdk +``` + +Each per-product run picks up that section's `llms.txt` as canonical. For the sitemap, the tool scopes the root sitemap's URLs to the base path prefix. If no URLs match (common when the root sitemap doesn't cover the section), it falls back to looking for a section-level sitemap at `{basePath}/sitemap.xml`. This keeps runs fast and results meaningful. --- diff --git a/docs/reference/cli.md b/docs/reference/cli.md index c9a9190..987dbe8 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -198,6 +198,22 @@ These thresholds apply to `page-size-html`, `page-size-markdown`, and `tabbed-co The defaults (50K pass, 100K fail) reflect observed agent truncation limits. You generally don't need to change these unless you have specific knowledge of your users' agent platforms. +### Coverage thresholds + +| Flag | Default | Description | +| ---------------------------------- | ------- | ------------------------------------------------------------- | +| `--coverage-pass-threshold ` | `95` | `llms-txt-coverage` pass threshold (percentage, 0-100) | +| `--coverage-warn-threshold ` | `80` | `llms-txt-coverage` warn threshold (percentage, 0-100) | +| `--coverage-exclusions ` | | Comma-separated glob patterns to exclude from the denominator | + +These control the `llms-txt-coverage` check, which compares `llms.txt` page URLs against the sitemap. Set both thresholds to `0` to make the check informational: it still reports coverage percentage and missing pages, but doesn't warn or fail. + +Use exclusion patterns with glob syntax (`**` matches across path segments, `*` matches within one) to remove matching sitemap URLs from the denominator before calculating coverage. Exclude content like API reference pages or changelog archives that you omit intentionally from llms.txt: + +```bash +afdocs check https://example.com --coverage-exclusions "/docs/reference/**,/docs/changelog/**" +``` + ## Exit codes | Code | Meaning | diff --git a/docs/reference/config-file.md b/docs/reference/config-file.md index f6d2919..a8addf8 100644 --- a/docs/reference/config-file.md +++ b/docs/reference/config-file.md @@ -36,6 +36,13 @@ options: thresholds: pass: 50000 fail: 100000 + # Coverage check: thresholds and exclusions + # coveragePassThreshold: 95 + # coverageWarnThreshold: 80 + # coverageExclusions: + # - /docs/reference/** + # - /docs/changelog/** + # - "**/release-notes/**" # quote patterns starting with * # Optional: test specific pages instead of discovering via llms.txt/sitemap # pages: @@ -71,19 +78,22 @@ skipChecks: Override default runner options. All fields are optional: -| Field | Default | Description | -| ------------------ | ----------- | ------------------------------------------------------------------------------------------- | -| `maxLinksToTest` | `50` | Maximum number of pages to sample | -| `samplingStrategy` | `random` | `random`, `deterministic`, `curated`, or `none` | -| `maxConcurrency` | `3` | Maximum concurrent HTTP requests | -| `requestDelay` | `200` | Delay between requests in milliseconds | -| `requestTimeout` | `30000` | Timeout for individual HTTP requests in milliseconds | -| `preferredLocale` | auto-detect | Preferred locale for URL discovery (e.g. `en`, `fr`, `ja`) | -| `preferredVersion` | auto-detect | Preferred version for URL discovery (e.g. `v3`, `2.x`) | -| `canonicalOrigin` | | The production domain your content links to | -| `llmsTxtUrl` | | Explicit llms.txt URL to use as canonical (overrides the discovery heuristic; see CLI docs) | -| `thresholds.pass` | `50000` | Page size pass threshold in characters | -| `thresholds.fail` | `100000` | Page size fail threshold in characters | +| Field | Default | Description | +| ----------------------- | ----------- | -------------------------------------------------------------------------------------------------------- | +| `maxLinksToTest` | `50` | Maximum number of pages to sample | +| `samplingStrategy` | `random` | `random`, `deterministic`, `curated`, or `none` | +| `maxConcurrency` | `3` | Maximum concurrent HTTP requests | +| `requestDelay` | `200` | Delay between requests in milliseconds | +| `requestTimeout` | `30000` | Timeout for individual HTTP requests in milliseconds | +| `preferredLocale` | auto-detect | Preferred locale for URL discovery (e.g. `en`, `fr`, `ja`) | +| `preferredVersion` | auto-detect | Preferred version for URL discovery (e.g. `v3`, `2.x`) | +| `canonicalOrigin` | | The production domain your content links to | +| `llmsTxtUrl` | | Explicit llms.txt URL to use as canonical (overrides the discovery heuristic; see CLI docs) | +| `thresholds.pass` | `50000` | Page size pass threshold in characters | +| `thresholds.fail` | `100000` | Page size fail threshold in characters | +| `coveragePassThreshold` | `95` | `llms-txt-coverage` pass threshold (percentage, 0-100) | +| `coverageWarnThreshold` | `80` | `llms-txt-coverage` warn threshold (percentage, 0-100) | +| `coverageExclusions` | | Glob patterns to exclude from the sitemap before calculating coverage (quote patterns starting with `*`) | ### `pages` (optional) diff --git a/package-lock.json b/package-lock.json index 6d3142c..85f8644 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12,6 +12,7 @@ "chalk": "^5.4.1", "commander": "^13.1.0", "node-html-parser": "^7.1.0", + "picomatch": "^4.0.4", "turndown": "^7.2.2", "turndown-plugin-gfm": "^1.0.2", "yaml": "^2.7.0" @@ -22,6 +23,7 @@ "devDependencies": { "@eslint/js": "^10.0.1", "@types/node": "^22.13.4", + "@types/picomatch": "^4.0.3", "@types/turndown": "^5.0.6", "@vitest/coverage-v8": "^4.0.18", "eslint": "^10.0.1", @@ -1291,6 +1293,13 @@ "undici-types": "~6.21.0" } }, + "node_modules/@types/picomatch": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/@types/picomatch/-/picomatch-4.0.3.tgz", + "integrity": "sha512-iG0T6+nYJ9FAPmx9SsUlnwcq1ZVRuCXcVEvWnntoPlrOpwtSTKNDC9uVAxTsC3PUvJ+99n4RpAcNgBbHX3JSnQ==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/statuses": { "version": "2.0.6", "resolved": "https://registry.npmjs.org/@types/statuses/-/statuses-2.0.6.tgz", @@ -3426,10 +3435,9 @@ "license": "ISC" }, "node_modules/picomatch": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", - "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", - "dev": true, + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", + "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", "license": "MIT", "peer": true, "engines": { diff --git a/package.json b/package.json index 1100388..c6e5ba6 100644 --- a/package.json +++ b/package.json @@ -79,6 +79,7 @@ "chalk": "^5.4.1", "commander": "^13.1.0", "node-html-parser": "^7.1.0", + "picomatch": "^4.0.4", "turndown": "^7.2.2", "turndown-plugin-gfm": "^1.0.2", "yaml": "^2.7.0" @@ -86,6 +87,7 @@ "devDependencies": { "@eslint/js": "^10.0.1", "@types/node": "^22.13.4", + "@types/picomatch": "^4.0.3", "@types/turndown": "^5.0.6", "@vitest/coverage-v8": "^4.0.18", "eslint": "^10.0.1", diff --git a/src/checks/observability/llms-txt-coverage.ts b/src/checks/observability/llms-txt-coverage.ts index 30dddb8..1ddf47b 100644 --- a/src/checks/observability/llms-txt-coverage.ts +++ b/src/checks/observability/llms-txt-coverage.ts @@ -1,11 +1,16 @@ import { registerCheck } from '../registry.js'; import { - getUrlsFromCachedLlmsTxt, + getUrlsFromCachedLlmsTxtWithOmitted, getUrlsFromSitemap, parseSitemapUrls, } from '../../helpers/get-page-urls.js'; import { isNonPageUrl } from '../../helpers/to-md-urls.js'; import { isLocaleSegment, hasStructuralDuplication } from '../../helpers/locale-codes.js'; +import { + DEFAULT_COVERAGE_PASS_THRESHOLD, + DEFAULT_COVERAGE_WARN_THRESHOLD, +} from '../../constants.js'; +import picomatch from 'picomatch'; import type { CheckContext, CheckResult } from '../../types.js'; /** @@ -41,8 +46,6 @@ export function normalizeUrlPath(url: string): string { */ const EXCLUDED_PATH_PATTERNS = [ /^\/blog(\/|$)/i, - /^\/changelog(\/|$)/i, - /^\/releases?(\/|$)/i, /^\/pricing(\/|$)/i, /^\/about(\/|$)/i, /^\/careers?(\/|$)/i, @@ -51,8 +54,6 @@ const EXCLUDED_PATH_PATTERNS = [ /^\/legal(\/|$)/i, /^\/privacy(\/|$)/i, /^\/terms(\/|$)/i, - /^\/security(\/|$)/i, - /^\/status(\/|$)/i, /^\/login(\/|$)/i, /^\/signup(\/|$)/i, /^\/sign-up(\/|$)/i, @@ -76,6 +77,50 @@ export function isExcludedPath(normalizedPath: string, baseUrlPath?: string): bo return false; } +/** + * Compile an array of glob patterns into a single picomatch matcher. + * Returns a function that tests a URL path against all patterns. + */ +export function compileExclusionMatcher(patterns: string[]): (path: string) => boolean { + if (patterns.length === 0) return () => false; + return picomatch(patterns, { nocase: true }); +} + +/** + * Test whether a normalized path matches any of the user-supplied exclusion globs. + * Patterns are tested against both the absolute path and the path relative to baseUrlPath. + */ +export function matchesUserExclusion( + normalizedPath: string, + matcher: (path: string) => boolean, + baseUrlPath?: string, +): boolean { + if (matcher(normalizedPath)) return true; + if (baseUrlPath && baseUrlPath !== '/' && normalizedPath.startsWith(baseUrlPath)) { + const relative = normalizedPath.slice(baseUrlPath.length) || '/'; + if (matcher(relative)) return true; + } + return false; +} + +/** + * Extract path prefixes from omitted .txt URLs. + * e.g. /docs/chains/ethereum/llms.txt → /docs/chains/ethereum + */ +export function extractOmittedPrefixes(omittedTxtUrls: string[]): string[] { + const prefixes: string[] = []; + for (const url of omittedTxtUrls) { + try { + const parsed = new URL(url); + const dir = parsed.pathname.replace(/\/[^/]+$/, ''); + if (dir) prefixes.push(dir.toLowerCase()); + } catch { + continue; + } + } + return prefixes; +} + /** * Detect whether a URL set uses locale-prefixed paths and, if so, return the * path segment position where locales appear. @@ -197,10 +242,6 @@ export function filterToUnprefixedLocale(urls: string[], position: number): stri return urls.filter((url) => !hasLocaleCodeAt(url, position)); } -/** Coverage thresholds */ -const COVERAGE_PASS = 0.95; -const COVERAGE_WARN = 0.8; - /** * Maximum sitemap URLs to collect for coverage comparison. * Higher than the default MAX_SITEMAP_URLS (500) used for page sampling, @@ -274,8 +315,28 @@ async function check(ctx: CheckContext): Promise { const id = 'llms-txt-coverage'; const category = 'observability'; - // 1. Get llms.txt page URLs (with progressive disclosure walking) - const llmsTxtUrls = await getUrlsFromCachedLlmsTxt(ctx); + // Resolve thresholds: CLI/config overrides → defaults, clamped to [0, 100] + const clamp = (v: number) => Math.max(0, Math.min(100, v)); + const rawPass = ctx.options.coveragePassThreshold ?? DEFAULT_COVERAGE_PASS_THRESHOLD; + const rawWarn = ctx.options.coverageWarnThreshold ?? DEFAULT_COVERAGE_WARN_THRESHOLD; + const passThreshold = clamp(rawPass) / 100; + const warnThreshold = clamp(rawWarn) / 100; + const thresholdWarnings: string[] = []; + if (passThreshold < warnThreshold) { + thresholdWarnings.push( + `coveragePassThreshold (${clamp(rawPass)}) is lower than ` + + `coverageWarnThreshold (${clamp(rawWarn)}); warn state is unreachable`, + ); + } + + // Compile user-supplied exclusion patterns + const userExclusionMatcher = compileExclusionMatcher(ctx.options.coverageExclusions ?? []); + + // 1. Get llms.txt page URLs + omitted subtrees (progressive disclosure) + const walkResult = await getUrlsFromCachedLlmsTxtWithOmitted(ctx); + const llmsTxtUrls = walkResult.pageUrls; + const omittedPrefixes = extractOmittedPrefixes(walkResult.omittedTxtUrls); + if (llmsTxtUrls.length === 0) { return { id, @@ -364,17 +425,30 @@ async function check(ctx: CheckContext): Promise { } } - // 3. Normalize both sets for comparison + // 3. Normalize both sets for comparison, applying exclusions: + // - Built-in non-doc path patterns (blog, changelog, etc.) + // - User-supplied exclusion globs (--coverage-exclusions) + // - Omitted subtree prefixes (nested llms.txt indexes not walked) const llmsNormalized = new Set(llmsTxtUrls.map(normalizeUrlPath)); const sitemapNormalized = new Map(); // normalized -> original URL + let omittedSubtreeCount = 0; + let userExcludedCount = 0; for (const url of scopedSitemapUrls) { const norm = normalizeUrlPath(url); - if (!isExcludedPath(norm, baseUrlPath)) { - sitemapNormalized.set(norm, url); + if (isExcludedPath(norm, baseUrlPath)) continue; + if (matchesUserExclusion(norm, userExclusionMatcher, baseUrlPath)) { + userExcludedCount++; + continue; } + if (omittedPrefixes.length > 0 && omittedPrefixes.some((p) => norm.startsWith(p))) { + omittedSubtreeCount++; + continue; + } + sitemapNormalized.set(norm, url); } - const excludedCount = scopedSitemapUrls.length - sitemapNormalized.size; + const excludedCount = + scopedSitemapUrls.length - sitemapNormalized.size - omittedSubtreeCount - userExcludedCount; // 4. Missing coverage: in sitemap but not in llms.txt const missingFromLlmsTxt: string[] = []; @@ -424,12 +498,12 @@ async function check(ctx: CheckContext): Promise { const coveragePct = Math.round(coverageRate * 100); const unmatchedPct = Math.round(unmatchedRate * 100); - // 7. Determine status based on coverage only + // 7. Determine status based on coverage and configurable thresholds // Unmatched links are informational (see note in step 5) let overallStatus: 'pass' | 'warn' | 'fail'; - if (coverageRate >= COVERAGE_PASS) { + if (coverageRate >= passThreshold) { overallStatus = 'pass'; - } else if (coverageRate >= COVERAGE_WARN) { + } else if (coverageRate >= warnThreshold) { overallStatus = 'warn'; } else { overallStatus = 'fail'; @@ -444,6 +518,11 @@ async function check(ctx: CheckContext): Promise { `llms.txt covers ${coveredCount}/${sitemapDocPages} sitemap doc pages (${coveragePct}%); ${missingFromLlmsTxt.length} missing`, ); } + if (omittedSubtreeCount > 0) { + parts.push( + `${walkResult.omittedTxtUrls.length} nested indexes omitted (${omittedSubtreeCount} sitemap pages excluded)`, + ); + } if (unmatchedLlmsTxtUrls.length > 0) { parts.push( `${unmatchedLlmsTxtUrls.length} llms.txt links not in sitemap (may indicate stale links or incomplete sitemap)`, @@ -464,15 +543,25 @@ async function check(ctx: CheckContext): Promise { sitemapDocPages, sitemapSource, excludedNonDocPages: excludedCount, + ...(userExcludedCount > 0 ? { userExcludedPages: userExcludedCount } : {}), + ...(omittedSubtreeCount > 0 + ? { + omittedSubtrees: walkResult.omittedTxtUrls.length, + omittedSubtreePages: omittedSubtreeCount, + } + : {}), ...(localeFiltered ? { localeFiltered: true, detectedLocale } : {}), baseUrlPath: baseUrlPath || '/', coverageRate: coveragePct, + coveragePassThreshold: Math.round(passThreshold * 100), + coverageWarnThreshold: Math.round(warnThreshold * 100), missingFromLlmsTxt: missingFromLlmsTxt.slice(0, 50), missingCount: missingFromLlmsTxt.length, unmatchedLlmsTxtUrls: unmatchedLlmsTxtUrls.slice(0, 50), unmatchedCount: unmatchedLlmsTxtUrls.length, unmatchedPct, sitemapWarnings, + ...(thresholdWarnings.length > 0 ? { thresholdWarnings } : {}), }, }; } diff --git a/src/cli/commands/check.ts b/src/cli/commands/check.ts index 36cd7b2..f3f6992 100644 --- a/src/cli/commands/check.ts +++ b/src/cli/commands/check.ts @@ -38,6 +38,12 @@ export function registerCheckCommand(program: Command): void { .option('-v, --verbose', 'Show per-page details for checks with issues') .option('--fixes', 'Show fix suggestions for warn/fail checks') .option('--score', 'Include scoring data in JSON output') + .option('--coverage-pass-threshold ', 'llms-txt-coverage pass threshold (0-100, default 95)') + .option('--coverage-warn-threshold ', 'llms-txt-coverage warn threshold (0-100, default 80)') + .option( + '--coverage-exclusions ', + 'Comma-separated glob patterns to exclude from coverage denominator', + ) .option( '--canonical-origin ', 'The production domain your content links to (for preview/staging testing)', @@ -221,6 +227,23 @@ export function registerCheckCommand(program: Command): void { } } + const coveragePassThreshold = + opts.coveragePassThreshold != null + ? parseInt(String(opts.coveragePassThreshold), 10) + : (config?.options?.coveragePassThreshold ?? undefined); + const coverageWarnThreshold = + opts.coverageWarnThreshold != null + ? parseInt(String(opts.coverageWarnThreshold), 10) + : (config?.options?.coverageWarnThreshold ?? undefined); + + const coverageExclusions = + opts.coverageExclusions != null + ? (opts.coverageExclusions as string) + .split(',') + .map((s) => s.trim()) + .filter(Boolean) + : (config?.options?.coverageExclusions ?? undefined); + const report = await runChecks(url, { checkIds, skipCheckIds, @@ -237,6 +260,9 @@ export function registerCheckCommand(program: Command): void { ...(preferredVersion && { preferredVersion }), ...(canonicalOrigin && { canonicalOrigin }), ...(llmsTxtUrl && { llmsTxtUrl }), + ...(coveragePassThreshold != null && { coveragePassThreshold }), + ...(coverageWarnThreshold != null && { coverageWarnThreshold }), + ...(coverageExclusions && { coverageExclusions }), }); let output: string; diff --git a/src/constants.ts b/src/constants.ts index 1fa63d1..670e1a9 100644 --- a/src/constants.ts +++ b/src/constants.ts @@ -34,5 +34,11 @@ export const LINK_RESOLVE_THRESHOLD = 0.9; /** Maximum number of URLs to collect from sitemaps before stopping. */ export const MAX_SITEMAP_URLS = 500; +/** Default llms-txt-coverage pass threshold (percentage). */ +export const DEFAULT_COVERAGE_PASS_THRESHOLD = 95; + +/** Default llms-txt-coverage warn threshold (percentage). */ +export const DEFAULT_COVERAGE_WARN_THRESHOLD = 80; + /** Base URL for the Agent-Friendly Documentation Spec. */ export const SPEC_BASE_URL = 'https://agentdocsspec.com/spec/'; diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts index ba74302..1b6d42d 100644 --- a/src/helpers/get-page-urls.ts +++ b/src/helpers/get-page-urls.ts @@ -39,6 +39,13 @@ export function parseSitemapUrls(xml: string): { urls: string[]; sitemapIndexUrl } export async function getUrlsFromCachedLlmsTxt(ctx: CheckContext): Promise { + const result = await getUrlsFromCachedLlmsTxtWithOmitted(ctx); + return result.pageUrls; +} + +export async function getUrlsFromCachedLlmsTxtWithOmitted( + ctx: CheckContext, +): Promise { const existsResult = ctx.previousResults.get('llms-txt-exists'); const discovered = getLlmsTxtFilesForAnalysis(existsResult); @@ -85,9 +92,16 @@ function extractLinksFromLlmsTxtFiles(files: DiscoveredFile[]): string[] { * origin as the site being tested. This covers both sub-product llms.txt * files (Cloudflare) and aggregate content files (Supabase). */ -async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise { +export interface AggregateWalkResult { + pageUrls: string[]; + /** Same-origin .txt URLs found at depth 1 that the walker did not descend into. */ + omittedTxtUrls: string[]; +} + +async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise { const pageUrls: string[] = []; const aggregateUrls: string[] = []; + const omittedTxtUrls: string[] = []; const siteOrigin = ctx.effectiveOrigin ?? ctx.origin; @@ -110,7 +124,7 @@ async function walkAggregateLinks(ctx: CheckContext, urls: string[]): Promise { const canonical = selectCanonicalLlmsTxt(discovered, ctx.baseUrl); const filesForAnalysis = canonical ? [canonical] : []; const urls = extractLinksFromLlmsTxtFiles(filesForAnalysis); - return walkAggregateLinks(ctx, urls); + const result = await walkAggregateLinks(ctx, urls); + return result.pageUrls; } /** diff --git a/src/scoring/resolutions.ts b/src/scoring/resolutions.ts index 254ce75..b42eeca 100644 --- a/src/scoring/resolutions.ts +++ b/src/scoring/resolutions.ts @@ -275,15 +275,22 @@ const RESOLUTION_TEMPLATES: Record = { 'llms-txt-coverage': { warn: (d) => { const missing = (d.missingCount as number) ?? 0; + const coverage = (d.coverageRate as number) ?? 0; + const warnThreshold = (d.coverageWarnThreshold as number) ?? 80; + const passThreshold = (d.coveragePassThreshold as number) ?? 95; return ( - `Your llms.txt covers 80-95% of your site's pages. ${missing} live ` + + `Your llms.txt covers ${coverage}% of your site's pages ` + + `(${warnThreshold}-${passThreshold}% is warn). ${missing} live ` + 'pages are not represented in the index.' ); }, fail: (d) => { const missing = (d.missingCount as number) ?? 0; + const coverage = (d.coverageRate as number) ?? 0; + const warnThreshold = (d.coverageWarnThreshold as number) ?? 80; return ( - `Your llms.txt covers less than 80% of your site's pages. ` + + `Your llms.txt covers ${coverage}% of your site's pages ` + + `(below ${warnThreshold}% threshold). ` + `${missing} live pages are missing from the index. Regenerate ` + 'llms.txt from your sitemap or build pipeline.' ); diff --git a/src/types.ts b/src/types.ts index 1d77fb5..792517f 100644 --- a/src/types.ts +++ b/src/types.ts @@ -84,6 +84,12 @@ export interface CheckOptions { preferredVersion?: string; /** Canonical origin to rewrite in fetched content (for preview/staging testing). */ canonicalOrigin?: string; + /** Pass threshold for llms-txt-coverage (0–100). Default 95. */ + coveragePassThreshold?: number; + /** Warn threshold for llms-txt-coverage (0–100). Default 80. */ + coverageWarnThreshold?: number; + /** Glob patterns to exclude from the sitemap before calculating coverage. */ + coverageExclusions?: string[]; /** * Explicit URL to use as the canonical llms.txt for downstream sampling and * analysis. When set, the standard candidate-discovery heuristic is bypassed diff --git a/test/unit/checks/llms-txt-coverage.test.ts b/test/unit/checks/llms-txt-coverage.test.ts index e7082d6..72412fb 100644 --- a/test/unit/checks/llms-txt-coverage.test.ts +++ b/test/unit/checks/llms-txt-coverage.test.ts @@ -7,6 +7,8 @@ import type { DiscoveredFile } from '../../../src/types.js'; import { hasLocaleCodeAt, filterToUnprefixedLocale, + compileExclusionMatcher, + extractOmittedPrefixes, } from '../../../src/checks/observability/llms-txt-coverage.js'; const server = setupServer(); @@ -360,13 +362,12 @@ describe('llms-txt-coverage', () => { expect(result.details?.coverageRate).toBe(100); }); - test('excludes blog/changelog/pricing paths from sitemap comparison', async () => { + test('excludes blog/pricing/careers paths from sitemap comparison', async () => { const host = 'cov-exclude.local'; const docPages = [`http://${host}/guide`]; const sitemapPages = [ `http://${host}/guide`, `http://${host}/blog/post-1`, - `http://${host}/changelog/v2`, `http://${host}/pricing`, `http://${host}/careers/engineer`, ]; @@ -392,7 +393,7 @@ describe('llms-txt-coverage', () => { const result = await check.run(ctx); // Only /guide should be in the doc pages set (others excluded) expect(result.details?.sitemapDocPages).toBe(1); - expect(result.details?.excludedNonDocPages).toBe(4); + expect(result.details?.excludedNonDocPages).toBe(3); expect(result.status).toBe('pass'); }); @@ -724,12 +725,12 @@ describe('llms-txt-coverage', () => { const host = 'basepath-exclude.local'; const pages = [`http://${host}/docs/getting-started`, `http://${host}/docs/api-reference`]; - // Sitemap includes /docs/changelog pages that should be excluded + // Sitemap includes /docs/blog and /docs/pricing pages that should be excluded const sitemapPages = [ ...pages, - `http://${host}/docs/changelog/2024-01-01`, - `http://${host}/docs/changelog/2024-02-01`, `http://${host}/docs/blog/post-1`, + `http://${host}/docs/blog/post-2`, + `http://${host}/docs/pricing`, ]; const ctx = makeCtx(host, pages, '/docs'); @@ -750,7 +751,7 @@ describe('llms-txt-coverage', () => { const result = await check.run(ctx); expect(result.status).toBe('pass'); - // Only 2 doc pages remain after excluding /docs/changelog and /docs/blog + // Only 2 doc pages remain after excluding /docs/blog and /docs/pricing expect(result.details?.sitemapDocPages).toBe(2); expect(result.details?.excludedNonDocPages).toBe(3); }); @@ -795,3 +796,312 @@ describe('filterToUnprefixedLocale', () => { expect(filterToUnprefixedLocale(urls, 1)).toEqual(urls); }); }); + +describe('configurable thresholds', () => { + test('uses custom pass threshold', async () => { + const host = 'cov-custom-pass.local'; + // 9 of 10 pages = 90% coverage. Default would warn, but pass=80 makes it pass. + const allPages = Array.from({ length: 10 }, (_, i) => `http://${host}/docs/page-${i}`); + const llmsPages = allPages.slice(0, 9); + + const ctx = makeCtx(host, llmsPages, '/docs'); + ctx.options.coveragePassThreshold = 80; + ctx.options.coverageWarnThreshold = 50; + + server.use( + http.get( + `http://${host}/robots.txt`, + () => new HttpResponse(`Sitemap: http://${host}/sitemap.xml`, { status: 200 }), + ), + http.get( + `http://${host}/sitemap.xml`, + () => + new HttpResponse(makeSitemap(allPages), { + status: 200, + headers: { 'content-type': 'application/xml' }, + }), + ), + ); + + const result = await check.run(ctx); + expect(result.status).toBe('pass'); + expect(result.details?.coverageRate).toBe(90); + expect(result.details?.coveragePassThreshold).toBe(80); + expect(result.details?.coverageWarnThreshold).toBe(50); + }); + + test('threshold of 0 makes check informational (always passes)', async () => { + const host = 'cov-informational.local'; + // Only 2 of 10 pages = 20% coverage. With thresholds at 0, this passes. + const allPages = Array.from({ length: 10 }, (_, i) => `http://${host}/docs/page-${i}`); + const llmsPages = allPages.slice(0, 2); + + const ctx = makeCtx(host, llmsPages, '/docs'); + ctx.options.coveragePassThreshold = 0; + ctx.options.coverageWarnThreshold = 0; + + server.use( + http.get( + `http://${host}/robots.txt`, + () => new HttpResponse(`Sitemap: http://${host}/sitemap.xml`, { status: 200 }), + ), + http.get( + `http://${host}/sitemap.xml`, + () => + new HttpResponse(makeSitemap(allPages), { + status: 200, + headers: { 'content-type': 'application/xml' }, + }), + ), + ); + + const result = await check.run(ctx); + expect(result.status).toBe('pass'); + expect(result.details?.coverageRate).toBe(20); + }); +}); + +describe('coverage exclusions', () => { + test('user exclusion patterns remove matching sitemap URLs from denominator', async () => { + const host = 'cov-exclusions.local'; + const docPages = [`http://${host}/docs/guide`, `http://${host}/docs/api`]; + const sitemapPages = [ + ...docPages, + `http://${host}/docs/reference/v1/endpoint-a`, + `http://${host}/docs/reference/v1/endpoint-b`, + `http://${host}/docs/reference/v2/endpoint-a`, + ]; + + const ctx = makeCtx(host, docPages, '/docs'); + ctx.options.coverageExclusions = ['/docs/reference/**']; + + server.use( + http.get( + `http://${host}/robots.txt`, + () => new HttpResponse(`Sitemap: http://${host}/sitemap.xml`, { status: 200 }), + ), + http.get( + `http://${host}/sitemap.xml`, + () => + new HttpResponse(makeSitemap(sitemapPages), { + status: 200, + headers: { 'content-type': 'application/xml' }, + }), + ), + ); + + const result = await check.run(ctx); + expect(result.status).toBe('pass'); + expect(result.details?.sitemapDocPages).toBe(2); + expect(result.details?.userExcludedPages).toBe(3); + expect(result.details?.coverageRate).toBe(100); + }); + + test('exclusion patterns work relative to base path', async () => { + const host = 'cov-exclusions-rel.local'; + const docPages = [`http://${host}/docs/guide`]; + const sitemapPages = [...docPages, `http://${host}/docs/archive/old-page`]; + + const ctx = makeCtx(host, docPages, '/docs'); + ctx.options.coverageExclusions = ['/archive/**']; + + server.use( + http.get( + `http://${host}/robots.txt`, + () => new HttpResponse(`Sitemap: http://${host}/sitemap.xml`, { status: 200 }), + ), + http.get( + `http://${host}/sitemap.xml`, + () => + new HttpResponse(makeSitemap(sitemapPages), { + status: 200, + headers: { 'content-type': 'application/xml' }, + }), + ), + ); + + const result = await check.run(ctx); + expect(result.status).toBe('pass'); + expect(result.details?.sitemapDocPages).toBe(1); + expect(result.details?.userExcludedPages).toBe(1); + }); +}); + +describe('omitted subtrees', () => { + test('excludes sitemap pages under omitted subtree prefixes', async () => { + const host = 'cov-omitted.local'; + // Root llms.txt links to section indexes (depth 0) + const rootLlmsTxt = [ + '# Docs\n', + `- [Chains](http://${host}/docs/chains/llms.txt)`, + `- [Intro](http://${host}/docs/intro)`, + ].join('\n'); + + // chains/llms.txt links to sub-section indexes (depth 1, omitted) + pages + const chainsLlmsTxt = [ + '# Chains\n', + `- [Ethereum](http://${host}/docs/chains/ethereum/llms.txt)`, + `- [Solana](http://${host}/docs/chains/solana/llms.txt)`, + `- [Overview](http://${host}/docs/chains/overview)`, + ].join('\n'); + + // Sitemap has pages under the omitted subtrees + const sitemapPages = [ + `http://${host}/docs/intro`, + `http://${host}/docs/chains/overview`, + `http://${host}/docs/chains/ethereum/method-a`, + `http://${host}/docs/chains/ethereum/method-b`, + `http://${host}/docs/chains/solana/method-a`, + ]; + + const baseUrl = `http://${host}/docs`; + const ctx = createContext(baseUrl, { requestDelay: 0 }); + const discovered: DiscoveredFile[] = [ + { url: `http://${host}/llms.txt`, content: rootLlmsTxt, status: 200, redirected: false }, + ]; + ctx.previousResults.set('llms-txt-exists', { + id: 'llms-txt-exists', + category: 'content-discoverability', + status: 'pass', + message: 'Found', + details: { discoveredFiles: discovered }, + }); + + server.use( + // Depth-0 aggregate fetch: chains/llms.txt + http.get( + `http://${host}/docs/chains/llms.txt`, + () => + new HttpResponse(chainsLlmsTxt, { + status: 200, + headers: { 'content-type': 'text/plain' }, + }), + ), + http.get( + `http://${host}/robots.txt`, + () => new HttpResponse(`Sitemap: http://${host}/sitemap.xml`, { status: 200 }), + ), + http.get( + `http://${host}/sitemap.xml`, + () => + new HttpResponse(makeSitemap(sitemapPages), { + status: 200, + headers: { 'content-type': 'application/xml' }, + }), + ), + ); + + const result = await check.run(ctx); + // Pages directly verified: /docs/intro, /docs/chains/overview = 2 + // Omitted subtrees: /docs/chains/ethereum (2 pages), /docs/chains/solana (1 page) = 3 excluded + // Coverage: 2/2 = 100% + expect(result.status).toBe('pass'); + expect(result.details?.sitemapDocPages).toBe(2); + expect(result.details?.omittedSubtrees).toBe(2); + expect(result.details?.omittedSubtreePages).toBe(3); + expect(result.details?.coverageRate).toBe(100); + expect(result.message).toContain('nested indexes omitted'); + }); + + test('omitted subtrees without matching sitemap pages do not affect results', async () => { + const host = 'cov-omitted-empty.local'; + const rootLlmsTxt = [ + '# Docs\n', + `- [Section](http://${host}/docs/section/llms.txt)`, + `- [Guide](http://${host}/docs/guide)`, + ].join('\n'); + + const sectionLlmsTxt = [ + '# Section\n', + `- [SubSection](http://${host}/docs/section/sub/llms.txt)`, + `- [Page](http://${host}/docs/section/page)`, + ].join('\n'); + + const sitemapPages = [`http://${host}/docs/guide`, `http://${host}/docs/section/page`]; + + const baseUrl = `http://${host}/docs`; + const ctx = createContext(baseUrl, { requestDelay: 0 }); + ctx.previousResults.set('llms-txt-exists', { + id: 'llms-txt-exists', + category: 'content-discoverability', + status: 'pass', + message: 'Found', + details: { + discoveredFiles: [ + { url: `http://${host}/llms.txt`, content: rootLlmsTxt, status: 200, redirected: false }, + ], + }, + }); + + server.use( + http.get( + `http://${host}/docs/section/llms.txt`, + () => + new HttpResponse(sectionLlmsTxt, { + status: 200, + headers: { 'content-type': 'text/plain' }, + }), + ), + http.get( + `http://${host}/robots.txt`, + () => new HttpResponse(`Sitemap: http://${host}/sitemap.xml`, { status: 200 }), + ), + http.get( + `http://${host}/sitemap.xml`, + () => + new HttpResponse(makeSitemap(sitemapPages), { + status: 200, + headers: { 'content-type': 'application/xml' }, + }), + ), + ); + + const result = await check.run(ctx); + expect(result.status).toBe('pass'); + expect(result.details?.coverageRate).toBe(100); + // Omitted subtree /docs/section/sub has no matching sitemap pages + expect(result.details?.omittedSubtreePages ?? 0).toBe(0); + }); +}); + +describe('compileExclusionMatcher', () => { + test('matches ** across segments', () => { + const matcher = compileExclusionMatcher(['/docs/reference/**']); + expect(matcher('/docs/reference/v1/endpoint')).toBe(true); + expect(matcher('/docs/reference')).toBe(true); + expect(matcher('/docs/guide')).toBe(false); + }); + + test('matches * within a segment', () => { + const matcher = compileExclusionMatcher(['/docs/v*/api']); + expect(matcher('/docs/v1/api')).toBe(true); + expect(matcher('/docs/v2/api')).toBe(true); + expect(matcher('/docs/v1/guide')).toBe(false); + }); + + test('multiple patterns', () => { + const matcher = compileExclusionMatcher(['/docs/changelog/**', '/docs/blog/**']); + expect(matcher('/docs/changelog/v1')).toBe(true); + expect(matcher('/docs/blog/post-1')).toBe(true); + expect(matcher('/docs/guide')).toBe(false); + }); + + test('empty patterns never match', () => { + const matcher = compileExclusionMatcher([]); + expect(matcher('/docs/anything')).toBe(false); + }); +}); + +describe('extractOmittedPrefixes', () => { + test('extracts directory from .txt URLs', () => { + const prefixes = extractOmittedPrefixes([ + 'http://example.com/docs/chains/ethereum/llms.txt', + 'http://example.com/docs/chains/solana/llms.txt', + ]); + expect(prefixes).toEqual(['/docs/chains/ethereum', '/docs/chains/solana']); + }); + + test('returns empty for empty input', () => { + expect(extractOmittedPrefixes([])).toEqual([]); + }); +}); From 77b7e7215582a0c5789a69305d40d0788e3283e8 Mon Sep 17 00:00:00 2001 From: dacharyc Date: Sat, 25 Apr 2026 15:51:31 -0400 Subject: [PATCH 04/13] Split 'llms-txt-directive' into 'html' and 'md' variants --- README.md | 8 +- SCORING.md | 21 +- docs/about.md | 2 +- docs/agent-score-calculation.md | 56 ++-- docs/checks/content-discoverability.md | 64 ++-- docs/checks/index.md | 2 +- docs/ci-integration.md | 2 +- docs/improve-your-score.md | 10 +- docs/index.md | 2 +- docs/interaction-diagnostics.md | 14 +- docs/quick-start.md | 6 +- docs/reference/config-file.md | 2 +- docs/reference/scoring-api.md | 2 +- docs/what-is-agent-score.md | 6 +- scoring-reference.md | 78 +++-- .../llms-txt-directive-html.ts | 193 ++++++++++++ .../llms-txt-directive-md.ts | 202 ++++++++++++ .../llms-txt-directive.ts | 233 -------------- src/checks/index.ts | 3 +- src/cli/formatters/text.ts | 16 +- src/scoring/coefficients.ts | 5 +- src/scoring/diagnostics.ts | 60 +++- src/scoring/proportions.ts | 3 +- src/scoring/resolutions.ts | 28 +- src/scoring/tag-scores.ts | 9 +- src/scoring/weights.ts | 3 +- test/integration/cli.test.ts | 2 +- ...est.ts => llms-txt-directive-html.test.ts} | 122 ++++---- .../unit/checks/llms-txt-directive-md.test.ts | 287 ++++++++++++++++++ test/unit/cli/formatters.test.ts | 4 +- test/unit/scoring/coefficients.test.ts | 24 +- test/unit/scoring/diagnostics.test.ts | 95 +++++- test/unit/scoring/proportions.test.ts | 6 +- test/unit/scoring/resolutions.test.ts | 3 +- test/unit/scoring/score.test.ts | 14 +- test/unit/scoring/tag-scores.test.ts | 16 +- test/unit/scoring/weights.test.ts | 14 +- 37 files changed, 1152 insertions(+), 465 deletions(-) create mode 100644 src/checks/content-discoverability/llms-txt-directive-html.ts create mode 100644 src/checks/content-discoverability/llms-txt-directive-md.ts delete mode 100644 src/checks/content-discoverability/llms-txt-directive.ts rename test/unit/checks/{llms-txt-directive.test.ts => llms-txt-directive-html.test.ts} (81%) create mode 100644 test/unit/checks/llms-txt-directive-md.test.ts diff --git a/README.md b/README.md index 6fbe7ce..03f59f8 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ npm

    -Test your documentation site against the [Agent-Friendly Documentation Spec](https://agentdocsspec.com). AFDocs runs 22 checks across 7 categories to measure how well AI coding agents can discover, navigate, and consume your docs. +Test your documentation site against the [Agent-Friendly Documentation Spec](https://agentdocsspec.com). AFDocs runs 23 checks across 7 categories to measure how well AI coding agents can discover, navigate, and consume your docs. Powering [Agent Score](https://buildwithfern.com/agent-score) by Fern. @@ -45,8 +45,8 @@ Agent-Friendly Docs Scorecard PASS llms-txt-exists llms.txt found at /llms.txt WARN llms-txt-size llms.txt is 65,000 characters Fix: If it grows further, split into nested llms.txt files ... - FAIL llms-txt-directive No directive detected on any tested page - Fix: Add a blockquote near the top of each page ... + FAIL llms-txt-directive-html No directive detected in HTML of any tested page + Fix: Add a visually-hidden element near the top of each page ... ``` ## Install @@ -69,7 +69,7 @@ Full documentation is available at **[afdocs.dev](https://afdocs.dev)**: - [Understand Your Score](https://afdocs.dev/what-is-agent-score) — what the score means and how it's calculated - [Improve Your Score](https://afdocs.dev/improve-your-score) — prioritized fix guide -- [Checks Reference](https://afdocs.dev/checks/) — all 22 checks with fix suggestions +- [Checks Reference](https://afdocs.dev/checks/) — all 23 checks with fix suggestions - [CLI Reference](https://afdocs.dev/reference/cli) — flags, output formats, sampling strategies - [CI Integration](https://afdocs.dev/ci-integration) — vitest helpers for your pipeline - [Programmatic API](https://afdocs.dev/reference/programmatic-api) — TypeScript API for custom tooling diff --git a/SCORING.md b/SCORING.md index 9e57247..85879fa 100644 --- a/SCORING.md +++ b/SCORING.md @@ -4,7 +4,7 @@ Scoring Version: 0.1.0 · [Agent-Friendly Docs Spec v0.3.0](https://agentdocsspe ## What is this score? -The Agent-Friendly Docs Scorecard measures how effectively AI coding agents can discover, navigate, and consume a documentation site. It runs 22 automated checks against your site and produces a 0–100 score with a letter grade. +The Agent-Friendly Docs Scorecard measures how effectively AI coding agents can discover, navigate, and consume a documentation site. It runs 23 automated checks against your site and produces a 0–100 score with a letter grade. Each check corresponds to a section of the [Agent-Friendly Docs Spec](https://agentdocsspec.com), which documents what the check measures, why it matters for real agent workflows, and the observed behaviors that motivated it. This document covers how checks are **scored**, not what they **measure**. If you want to understand a specific check in depth, follow the spec links in the table below. @@ -23,7 +23,7 @@ The score reflects how well agents can _actually use_ your documentation, not ju ## What we check -The 22 checks are grouped into seven categories. Each check is assigned a **weight tier** based on its observed impact on agent workflows: +The 23 checks are grouped into seven categories. Each check is assigned a **weight tier** based on its observed impact on agent workflows: - **Critical (10 pts)**: Agents cannot function without this. Failure means zero content, zero navigation, or zero access. - **High (7 pts)**: Directly limits agent effectiveness. Failure means truncation, dead ends, or agents stuck on a worse path. @@ -41,7 +41,8 @@ How agents find and navigate your documentation. | [llms-txt-size](https://agentdocsspec.com/spec/#llms-txt-size) | High (7) | Whether your llms.txt fits within agent context windows. Truncated indexes defeat their purpose. | | [llms-txt-links-resolve](https://agentdocsspec.com/spec/#llms-txt-links-resolve) | High (7) | Whether links in your llms.txt actually work. Broken links send agents down dead ends with high confidence. | | [llms-txt-links-markdown](https://agentdocsspec.com/spec/#llms-txt-links-markdown) | High (7) | Whether llms.txt links point to markdown rather than HTML. Agents work significantly less effectively with HTML content. | -| [llms-txt-directive](https://agentdocsspec.com/spec/#llms-txt-directive) | High (7) | Whether your docs pages tell agents where to find llms.txt. Without this, agents won't know it exists. | +| [llms-txt-directive-html](https://agentdocsspec.com/spec/#llms-txt-directive-html) | High (7) | Whether your HTML pages tell agents where to find llms.txt. Without this, agents won't know it exists. | +| [llms-txt-directive-md](https://agentdocsspec.com/spec/#llms-txt-directive-md) | Medium (4) | Whether your markdown pages tell agents where to find llms.txt. | ### Markdown Availability @@ -129,7 +130,7 @@ Not all warnings represent the same degree of degradation. A warning on `llms-tx | Coefficient | Meaning | Checks | | ----------- | ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | **0.75** | Content substantively intact | `llms-txt-valid`, `content-negotiation`, `llms-txt-links-resolve`, `llms-txt-coverage`, `markdown-content-parity` | -| **0.60** | Partial coverage or platform-dependent | `llms-txt-directive`, `redirect-behavior` | +| **0.60** | Partial coverage or platform-dependent | `llms-txt-directive-html`, `llms-txt-directive-md`, `redirect-behavior` | | **0.50** | Genuine functional degradation | `llms-txt-exists`, `llms-txt-size`, `rendering-strategy`, `markdown-url-support`, `page-size-markdown`, `page-size-html`, `content-start-position`, `tabbed-content-serialization`, `section-header-quality`, `cache-header-hygiene`, `auth-gate-detection`, `auth-alternative-access` | | **0.25** | Actively steering agents to a worse path | `llms-txt-links-markdown` (markdown exists but llms.txt links to HTML; agents don't discover .md variants on their own) | @@ -160,11 +161,19 @@ Some problems only become visible when you look at multiple checks together. The ### Markdown support is undiscoverable -**Triggers when** your site serves markdown at .md URLs, but none of the discovery mechanisms (content negotiation, llms.txt directive, .md links in llms.txt) are in place. +**Triggers when** your site serves markdown at .md URLs, but there is no agent-facing directive on HTML pages pointing to llms.txt and the server does not support content negotiation. **What it means**: You've done the work to support markdown, but agents have no way to find out. They'll default to the HTML path. In observed agent behavior, agents do not independently discover .md URL variants; they need to be told. -**What to do**: Add a directive on your docs pages pointing to llms.txt, or implement content negotiation for `Accept: text/markdown`. Either change makes your existing markdown support visible to agents. +**What to do**: Add a directive on your docs pages pointing to llms.txt, and implement content negotiation for `Accept: text/markdown`. The directive is the primary discovery mechanism because it reaches all agents; content negotiation provides a fast path for agents that request markdown by default. Both are recommended. + +### Markdown support is only partially discoverable + +**Triggers when** your site serves markdown at .md URLs and supports content negotiation, but there is no agent-facing directive on HTML pages pointing to llms.txt. + +**What it means**: Agents that send `Accept: text/markdown` (Claude Code, Cursor, OpenCode) get markdown automatically, but the majority of agents fetch HTML by default and have no signal that a markdown path exists. + +**What to do**: Add a directive near the top of each HTML page pointing to your llms.txt. If your site serves markdown, mention that in the directive too. The directive reaches all agents, not just the ones that request markdown by default. ### Truncated index diff --git a/docs/about.md b/docs/about.md index 97b1e68..a1d8ff8 100644 --- a/docs/about.md +++ b/docs/about.md @@ -3,7 +3,7 @@
    -AFDocs is an open-source tool that tests documentation sites against the [Agent-Friendly Documentation Spec](https://agentdocsspec.com). The spec defines what makes documentation accessible to AI coding agents, based on observed behavior across real agent platforms. AFDocs automates those observations into 22 checks that produce a score and actionable fix suggestions. +AFDocs is an open-source tool that tests documentation sites against the [Agent-Friendly Documentation Spec](https://agentdocsspec.com). The spec defines what makes documentation accessible to AI coding agents, based on observed behavior across real agent platforms. AFDocs automates those observations into 23 checks that produce a score and actionable fix suggestions.