From f4c6098c98e1ba638c076d06766cd33ab814184b Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 09:59:51 -0400 Subject: [PATCH 01/16] =?UTF-8?q?docs(adr):=20add=20ADR=200003=20=E2=80=94?= =?UTF-8?q?=20reporting=20pipeline=20layering?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Establishes the architectural principle that the data layer (analyzers and the Finding struct) holds raw bytes from network captures, while each reporter (terminal, JSON, future CSV/SQLite/HTML/SIEM) is responsible for all formatting that depends on its own output medium — escaping, styling, truncation, localization. The motivating consequence is terminal-safe escaping: an audit during PR #49 follow-up found 7 unprotected interpolations in the HTTP analyzer where attacker-controlled bytes could reach the operator's terminal and be interpreted as ANSI escape sequences (CWE-117). PR #49 patched TLS by escaping at the construction site, which solved the immediate vulnerability but destroyed forensic data (Cyrillic SNIs became hex blobs in JSON output) and required every new analyzer to remember the rule — the HTTP audit shows the rule never propagated. The decision: escape at display time in the reporter, using stdlib str::escape_default in the terminal reporter. Analyzers store raw bytes; JSON output stays safe via serde_json's automatic RFC 8259 escaping; future reporters apply their own format's rules. Validated against OWASP output-encoding guidance (CWE-117) and Rust stdlib behavior via Perplexity 2026-04-08. Other formatting concerns (truncation, cipher-name fallback, verdict localization) follow the same principle but are explicitly deferred per YAGNI. Implementation follows in subsequent commits. --- docs/adr/0003-reporting-pipeline-layering.md | 203 +++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100644 docs/adr/0003-reporting-pipeline-layering.md diff --git a/docs/adr/0003-reporting-pipeline-layering.md b/docs/adr/0003-reporting-pipeline-layering.md new file mode 100644 index 0000000..e8977dd --- /dev/null +++ b/docs/adr/0003-reporting-pipeline-layering.md @@ -0,0 +1,203 @@ +# ADR 0003: Reporting Pipeline Layering — Data Layer is Raw, Display Layer Formats + +**Status:** Accepted +**Date:** 2026-04-09 +**Context:** PR #49 (issue #28) discovered a terminal injection vulnerability when untrusted bytes from network captures were interpolated into `Finding` strings rendered to the operator's terminal. Investigating the right fix surfaced a deeper architectural question about where formatting belongs in the reporting pipeline. PR #49 placed sanitization at the analyzer construction site, which solved the immediate problem but destroyed forensic data and required every analyzer to remember the rule. An audit (2026-04-08) found 7 unprotected interpolations in the HTTP analyzer with the same vulnerability class, demonstrating that construction-site rules don't propagate. This ADR establishes the architectural principle behind the right fix. + +## Problem + +Wirerust's reporting pipeline currently has a blurred boundary between *what* data the analyzers extract and *how* that data is presented. Several distinct concerns are entangled at the analyzer construction site: + +- **Sanitization.** Untrusted bytes (TLS SNI hostnames, HTTP URIs, HTTP headers, etc.) flow through `String::from_utf8_lossy`, which preserves ASCII control bytes including ESC (`0x1b`). When the terminal reporter writes those bytes via `format!("{}", finding.summary)`, the analyst's terminal interprets the embedded ANSI escape sequences as commands. CWE-117 ("Improper Output Neutralization for Logs") covers this class. +- **Formatting.** Analyzers also pre-format human-readable text — `format!("Path traversal in URI: {uri}")` in HTTP, `cipher_name()` hex fallbacks in TLS, `truncate_uri()` length decisions, and so on. The same data is committed to one specific human-readable form before any reporter sees it. +- **Styling.** Color and bold/dim attributes are correctly applied by the terminal reporter, but the line text itself is built ad-hoc instead of being a clean transform of the raw `Finding`. + +These share a single underlying problem: **formatting decisions are made at the data construction site, not at the rendering site.** The visible symptoms are: + +1. **Terminal injection (the immediate vulnerability).** Control bytes from packet payloads reach the analyst's terminal because no display-layer escaping exists. Empirically demonstrated in PR #49 with an SNI of `b"\x1b[31m..."`. + +2. **Forensic data loss.** PR #49's construction-site fix used `{:?}` (Debug formatter), which permanently replaces raw bytes in `Finding.summary` with their escaped form. Downstream consumers (JSON output, future CSV/SQLite/SIEM reporters) then see the escaped form forever. A Cyrillic SNI like `пример.рф` becomes `\u{43f}\u{440}\u{438}\u{43c}\u{435}\u{440}.\u{440}\u{444}` in the JSON export — unreadable to a Russian-speaking analyst, and lossy for any tool that needs the original bytes. + +3. **Tribal-knowledge enforcement.** Every new analyzer must remember the escape rule. The HTTP analyzer's 7 unprotected findings (added before PR #49) prove the rule never propagated. Future analyzers (Modbus, DNP3, SSH, SMB — issues #7, #8) would have to relearn it. + +4. **Format coupling.** A future reporter that needs different formatting (HTML, SIEM JSON, CSV with its own escaping, localized alert text) would either have to undo and redo the construction-site formatting, or inherit the wrong context. + +A single representation cannot serve both raw forensic data and human-readable formatted output. The pipeline needs a clear boundary. + +## Decision + +**The reporting pipeline is layered. The data layer (analyzers and the `Finding` struct) holds raw bytes. The display layer (each reporter) is responsible for all formatting that depends on the output medium — escaping, styling, truncation, localization.** + +### The pipeline + +``` +Packet → Decoder → Dispatcher → Analyzer → Finding → Reporter → Output + ─┬─── ─┬─── + │ └─ Display layer + │ (per-medium formatting) + └─ Data layer + (raw bytes, post-from_utf8_lossy) +``` + +The data layer is raw and forensic; the display layer formats for its medium and knows nothing about other layers. + +### The layering rule + +| Layer | Responsibility | Bytes are… | +|---|---|---| +| Analyzer | Extract data, build findings with raw strings | Raw (post-`from_utf8_lossy`) | +| `Finding` struct | Hold immutable forensic data | Raw | +| JSON reporter (`serde_json`) | Serialize for machine consumption | Escaped per RFC 8259 (automatic via serde) | +| Terminal reporter | Render for human display | Escaped per terminal-safety rules + styled | +| Future CSV / SQLite / HTML / SIEM reporters | Render for their format | Escape and format per their format's rules | + +### Immediate scope: terminal-safe escaping + +The first concrete consequence of the layering rule — and the motivating problem — is terminal-safe escaping. The terminal reporter calls `str::escape_default` (Rust stdlib) on every `Finding.summary` and every entry in `Finding.evidence` immediately before writing them to the output buffer. + +`str::escape_default` escapes: +- C0 control bytes (`0x00`–`0x1f`, including ESC `0x1b`, BEL `0x07`, LF `0x0a`, CR `0x0d`) +- DEL (`0x7f`) +- backslash (`\\`) + +It preserves: +- All printable ASCII +- All valid non-ASCII Unicode (Cyrillic, CJK, emoji, etc.) + +The output is always valid UTF-8 and contains no bytes that a modern terminal will interpret as control sequences. + +C1 control bytes (`0x80`–`0x9f`) are not a coverage gap. `Finding.summary` and `Finding.evidence` are `String` (guaranteed valid UTF-8) populated via `String::from_utf8_lossy` (which replaces invalid bytes with `U+FFFD`). A standalone byte in the C1 range cannot appear in a valid UTF-8 string, so C0 + DEL coverage is complete. + +### Other formatting concerns (same principle, deferred scope) + +Other things that currently happen at the analyzer construction site also belong in the display layer under this principle, but are intentionally NOT moved by the PR that introduces this ADR: + +- **Truncation.** HTTP analyzer's `truncate_uri()` decides display length at construction. A future change could move length decisions to reporters, letting JSON consumers see full URIs while terminal consumers see truncated ones. +- **Cipher name hex fallback.** TLS analyzer's `cipher_name()` formats unknown cipher IDs as hex strings at construction. Could move to display. +- **Verdict / category / confidence text.** The `Display` impls on `Verdict`, `Confidence`, and `ThreatCategory` produce English strings directly. A future localization concern would move these to reporter-owned mappings. + +These are noted to record the principle, not to commit to fixing them now. Each can be revisited if a concrete need appears — e.g., when adding a CSV reporter that wants full URIs, or an HTML reporter that wants different styling. YAGNI applies. This ADR establishes the boundary; subsequent work can push more responsibilities across it as needed. + +### Where escaping does NOT happen + +- **At the analyzer / construction site.** Analyzers store raw bytes in `Finding.summary` and `Finding.evidence`. They do not call `escape_default`, do not use `{:?}`, do not pre-escape anything. +- **In `Finding`'s `Display` impl.** It produces raw text and is documented as such. The terminal reporter does not use it (it builds output directly from struct fields and applies the helper). +- **In the JSON reporter.** `serde_json` already escapes per RFC 8259, so JSON output is safe with no extra work. + +## Alternatives Considered + +### Construction-site escaping (the PR #49 approach) + +Each analyzer escapes untrusted bytes before they enter the `Finding` struct, e.g. via the Debug formatter `{hostname:?}`. + +- **Pro:** Visible at the danger point. +- **Con (forensics):** Permanently replaces raw bytes with escaped form. JSON consumers and future reporters lose the original data. Cyrillic SNIs become hex blobs across all output paths. +- **Con (correctness):** Easy to forget. Every new analyzer must remember the rule. The HTTP analyzer's 7 unprotected findings prove this — the rule was tribal knowledge from one PR and didn't propagate. +- **Con (encoding once for many sinks):** OWASP guidance is explicit — encode at display time, not at storage time, because a single piece of data may render to multiple sinks (terminal, JSON, CSV, log file) each needing different escaping. Encoding at construction either breaks one sink or fails to protect another. +- **Con (architectural):** Conflates the data layer with the display layer. Once a project does this, every new reporter inherits the wrong shape. +- **Rejected.** Violates the layering principle and destroys forensic data. + +### A wrapper type (`Untrusted`) + +Define a newtype like `pub struct Untrusted<'a>(&'a str)` with an `impl Display` that escapes, and require analyzers to wrap untrusted values: `format!("URI: {}", Untrusted(&parsed.uri))`. + +- **Pro:** Type-system enforcement; impossible to forget. +- **Con:** Still construction-site escaping in disguise. The wrapped value either gets stored escaped (forensics loss) or the wrapper threads through the entire reporting pipeline (invasive). +- **Con:** Adds API surface without fixing the layering problem. +- **Rejected.** The complexity isn't justified once the display-time approach is adopted. + +### `bstr::ByteSlice::escape_default` from the `bstr` crate + +A third-party crate offering byte-slice escaping. + +- **Pro:** Handles raw `&[u8]` directly. +- **Con:** Treats input as raw bytes and hex-escapes anything `> 0x7f`, including UTF-8 continuation bytes. A Cyrillic hostname becomes `\x{d0}\x{9f}\x{d1}\x{80}\x{d0}\x{b8}...` — unreadable. +- **Con:** Adds a dependency for a problem stdlib solves. +- **Rejected.** Same UX problem as the Debug formatter, plus a dependency. + +### `char::escape_debug` (per-char Debug) + +Iterate characters and apply `escape_debug` to each. + +- **Pro:** Preserves printable ASCII. +- **Con:** Escapes all non-ASCII Unicode as `\u{...}` — same UX problem as `{:?}`. +- **Rejected.** + +### Stripping vs. escaping + +Drop dangerous bytes entirely instead of escaping them. + +- **Pro:** Slightly shorter output. +- **Con:** Loses information. An attacker who embeds `\x1b[31mHACK\x1b[0m` in a hostname has done something noteworthy; the analyst should see *what* the attacker did, not just that "something" was stripped. Escaping preserves the evidence. +- **Rejected.** + +### A separate sanitized view on `Finding` + +Add a `Finding::display_summary()` method that returns the escaped form, and have the terminal reporter call it. Keep the raw `summary` field for JSON consumers. + +- **Pro:** API discoverable on the type. +- **Con:** Couples the type to one specific display medium (terminal). The CSV reporter would need its own method, the HTML reporter another, etc. Method count grows with reporter count. +- **Con:** Encourages new analyzer authors to look at `display_summary()` and infer that the raw `summary` field is "the one that needs care" — same tribal-knowledge problem as construction-site escaping. +- **Rejected.** Reporters owning their own escaping is cleaner. + +## Rationale + +- **Matches OWASP guidance.** Output encoding belongs in the rendering layer, not the storage layer (CWE-117, OWASP XSS / output encoding cheat sheet). A single piece of alert data may render to multiple sinks; each needs its own escaping rules. +- **Matches the layering of `serde`.** `Finding` already implements `#[derive(Serialize)]`. The JSON reporter delegates escaping to serde — that's display-layer escaping for the JSON medium. Doing the same thing for the terminal medium is symmetric and unsurprising. +- **Preserves forensic data.** The raw bytes are kept in the `Finding` struct, available for JSON export, future reporters, and downstream tooling. An analyst exporting to JSON sees the actual SNI bytes (with serde's standard JSON escaping); only the terminal reporter applies terminal-safe escaping. +- **Single point of enforcement per medium.** Future analyzers don't need to remember any rule. Adding a new analyzer requires zero terminal-safety awareness. A new reporter (CSV, HTML, etc.) gets one place to apply its own escaping. +- **Extensible.** When a future need appears — localization, HTML rendering, different truncation per medium — the pipeline already has the boundary in place. The work is in the display layer, not in every analyzer. +- **`str::escape_default` is the right primitive for the immediate fix.** Stdlib (no dependency), preserves valid Unicode, and escapes exactly the bytes that constitute the threat (C0 + DEL). Cross-checked against `char::escape_debug` (mangles Unicode), `bstr::escape_default` (mangles Unicode), and `{:?}` Debug formatter (mangles Unicode). +- **Validated.** OWASP encoding guidance, RFC 8259 + serde_json behavior, and `str::escape_default` semantics all confirmed via Perplexity 2026-04-08. + +## Consequences + +### File-level changes required by the introducing PR + +| File | Change | +|------|--------| +| `src/reporter/terminal.rs` | Add a private `escape_for_terminal(s: &str) -> String` helper at file scope that returns `s.escape_default().collect()`. Apply it to `f.summary` (line ~65, where `f.summary` is interpolated into the line `format!`) and to each `ev` in `f.evidence` (line ~81) before writing to the output buffer. The helper is private to the terminal reporter — other reporters that need it (e.g., a future CSV reporter) implement at their own boundary, since each output medium has different escaping rules. | +| `src/analyzer/tls.rs` | Replace `{hostname:?}` (line ~349) and `{lossy:?}` (line ~369) with `{hostname}` / `{lossy}`. Update the inline doc comments that explain *why* the Debug formatter was used; replace them with a pointer to this ADR. | +| `src/findings.rs` | Add a `///` doc comment on `impl Display for Finding` noting that it produces RAW text and is NOT safe for terminal display; consumers wanting safe display should go through the terminal reporter. | +| `src/analyzer/http.rs` | **No changes required.** Existing raw interpolations are now correct under the new policy. | +| `src/analyzer/dns.rs` | **No changes required.** DNS analyzer's `analyze()` returns `Vec::new()` — emits no findings. | +| `docs/adr/0002-modular-protocol-analyzers.md` | Add a cross-reference in the "Finding Generation Guidelines" section pointing to this ADR, so readers of the analyzer pattern doc also see the layering principle. | + +### Tests required + +- Unit test on the helper covering: ESC (`0x1b`), BEL (`0x07`), DEL (`0x7f`), backslash, Cyrillic preservation, emoji preservation, mixed content. +- End-to-end regression test: build a `Finding` whose `summary` contains a literal `\x1b[31mRED\x1b[0m` byte sequence. Assert that: + - the terminal reporter's output contains no raw `0x1b` byte and contains the escaped form, + - the JSON reporter's output contains `\u001b` (serde's escaping), + - the `Finding.summary` field on the struct still contains the literal `0x1b` byte (forensic preservation). + +### Behavioral changes visible to users + +- TLS findings for non-ASCII / non-UTF-8 SNI hostnames will display readably in the terminal: a Cyrillic SNI like `пример.рф` will appear as `пример.рф` (not `\u{43f}\u{440}...`). Embedded ESC bytes still get escaped to `\x1b`. +- TLS findings in JSON output will contain the raw hostname (with serde's standard JSON string escaping for control bytes only). Downstream tooling that previously saw `\u{...}` literals will now see the actual UTF-8 hostname. +- HTTP findings (path traversal, web shell, admin panel, unusual method, missing host, long URI, empty UA) gain terminal-safety. Previously, an attacker could embed terminal control sequences in a URI and have them rendered live in the analyst's report. They can no longer. + +### Non-changes (intentional) + +- `Finding` struct shape is unchanged. No new fields, no wrapper types, no per-medium accessors. +- The JSON reporter is unchanged. +- The DNS and HTTP analyzers are unchanged. +- Truncation, cipher-name formatting, and verdict-text English formatting stay at construction site for now (deferred per "Other formatting concerns" above). +- No new dependencies. + +### Binding rules for future contributors + +> **Rule 1 (analyzer authors):** Analyzers MUST store untrusted bytes raw in `Finding.summary` and `Finding.evidence`. They MUST NOT escape, debug-format, or otherwise pre-encode untrusted bytes before assigning to those fields. +> +> **Rule 2 (reporter authors):** Each reporter MUST apply medium-appropriate escaping at its render boundary. The terminal reporter escapes for terminal-safety; the JSON reporter relies on `serde_json`'s automatic RFC 8259 escaping; future reporters apply their own format's rules. +> +> **Rule 3 (display-layer formatting in general):** New formatting concerns that depend on the output medium (truncation, styling, localization, etc.) belong in the reporter, not in the analyzer. When in doubt, push it across the boundary. + +## Validation + +This decision was validated through targeted Perplexity queries on 2026-04-08: + +- **Output encoding placement:** OWASP guidance (XSS prevention cheat sheet, CWE-117 log injection) recommends encoding at display time, not at storage time. Encoding at construction creates premature commitment to one output context and breaks others. Confirmed. +- **`serde_json` control byte handling:** `serde_json` automatically escapes control bytes (including ESC `0x1b`) as `\u001b`, per RFC 8259 §7. JSON output is safe with no analyzer code. Confirmed. +- **`str::escape_default` semantics:** Confirmed against Rust stdlib behavior — uses `char::is_ascii_control` internally, escapes C0 + DEL + backslash, preserves all valid non-ASCII Unicode as raw UTF-8 bytes. Compared against `char::escape_debug`, `bstr::ByteSlice::escape_default`, and Debug formatter `{:?}`; `str::escape_default` is the only stdlib option that preserves multi-byte UTF-8. +- **C1 control byte risk:** Standalone C1 bytes (`0x80`–`0x9f`) cannot appear in a valid UTF-8 `String`. Since `Finding.summary` is `String` and analyzers populate it via `String::from_utf8_lossy`, C0 + DEL coverage is complete. From 45cf6494add8f1e090c68a38bc4c782a59cb710b Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 10:33:03 -0400 Subject: [PATCH 02/16] =?UTF-8?q?docs(adr):=20correct=20ADR=200003=20escap?= =?UTF-8?q?e=20primitive=20=E2=80=94=20custom=20helper,=20not=20str::escap?= =?UTF-8?q?e=5Fdefault?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An initial Perplexity answer (relied on during the first draft) claimed str::escape_default preserves multi-byte UTF-8 while escaping only C0 control bytes. Empirical verification during plan self-review — a small rustc-compiled program that fed ESC, BEL, DEL, Cyrillic, emoji, and mixed content through str::escape_default — falsified this: the method routes every character through char::escape_default, which escapes all non-ASCII as \u{...}. A Cyrillic SNI like "пример.рф" would become "\u{43f}\u{440}..." — the same UX problem as PR #49's Debug formatter. The Perplexity answer had conflated str::escape_default (method on str, operates on chars, escapes all non-ASCII) with std::ascii::escape_default (free function, operates on bytes, uses \x{hh}). These are not equivalent. Corrections in this commit: - Replaced "The mechanism: str::escape_default" with an 8-line custom helper that iterates chars() and gates char::escape_default() on is_ascii_control() || '\\'. Empirically verified against the same test inputs. - Replaced the "char::escape_debug (per-char Debug)" alternative entry with "Stdlib str::escape_default", since they're functionally equivalent for our input space and the new entry correctly documents the rejection. - Updated Rationale bullet to describe the custom helper. - Updated Validation section to record the empirical check and the corrected Perplexity claim. - Fixed an incorrect "\x1b" reference in the behavioral-changes section (char::escape_default uses \u{1b}, not \x1b). No change to the layering principle, the decision to escape at display time, the rollback of TLS PR #49, the HTTP analyzer impact, or any of the file-level consequences. Only the escape primitive description changed. --- docs/adr/0003-reporting-pipeline-layering.md | 48 ++++++++++++++------ 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/docs/adr/0003-reporting-pipeline-layering.md b/docs/adr/0003-reporting-pipeline-layering.md index e8977dd..d0c287c 100644 --- a/docs/adr/0003-reporting-pipeline-layering.md +++ b/docs/adr/0003-reporting-pipeline-layering.md @@ -53,16 +53,34 @@ The data layer is raw and forensic; the display layer formats for its medium and ### Immediate scope: terminal-safe escaping -The first concrete consequence of the layering rule — and the motivating problem — is terminal-safe escaping. The terminal reporter calls `str::escape_default` (Rust stdlib) on every `Finding.summary` and every entry in `Finding.evidence` immediately before writing them to the output buffer. +The first concrete consequence of the layering rule — and the motivating problem — is terminal-safe escaping. The terminal reporter defines a private `escape_for_terminal` helper that iterates the input `str`'s characters and applies `char::escape_default()` only to characters matching `char::is_ascii_control()` or the backslash. All other characters are passed through unchanged: + +```rust +fn escape_for_terminal(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + for c in s.chars() { + if c.is_ascii_control() || c == '\\' { + for e in c.escape_default() { + out.push(e); + } + } else { + out.push(c); + } + } + out +} +``` -`str::escape_default` escapes: -- C0 control bytes (`0x00`–`0x1f`, including ESC `0x1b`, BEL `0x07`, LF `0x0a`, CR `0x0d`) -- DEL (`0x7f`) +The helper escapes: +- C0 control bytes (`0x00`–`0x1f`, including ESC `0x1b`, BEL `0x07`, LF `0x0a`, CR `0x0d`) — rendered as `\u{1b}`, `\n`, `\t`, etc. via `char::escape_default` +- DEL (`0x7f`) — rendered as `\u{7f}` - backslash (`\\`) It preserves: - All printable ASCII -- All valid non-ASCII Unicode (Cyrillic, CJK, emoji, etc.) +- All valid non-ASCII Unicode (Cyrillic, CJK, emoji, etc.) — passed through as raw UTF-8 bytes + +**Why not stdlib `str::escape_default`?** Empirical verification (2026-04-09) showed that `str::escape_default` internally escapes *every* non-ASCII character via `char::escape_default`, so a Cyrillic hostname like `пример.рф` would become `\u{43f}\u{440}\u{438}\u{43c}\u{435}\u{440}.\u{440}\u{444}` — the same UX problem as the Debug formatter in PR #49. This contradicted an earlier Perplexity answer (which conflated `str::escape_default` with the byte-oriented `std::ascii::escape_default`) and was caught during plan self-review. The custom helper is ~8 lines, stdlib-only, and does exactly what we need. The output is always valid UTF-8 and contains no bytes that a modern terminal will interpret as control sequences. @@ -115,13 +133,13 @@ A third-party crate offering byte-slice escaping. - **Con:** Adds a dependency for a problem stdlib solves. - **Rejected.** Same UX problem as the Debug formatter, plus a dependency. -### `char::escape_debug` (per-char Debug) +### Stdlib `str::escape_default` (or `char::escape_debug`, which is equivalent for this purpose) -Iterate characters and apply `escape_debug` to each. +Apply the stdlib method unconditionally to every character. -- **Pro:** Preserves printable ASCII. -- **Con:** Escapes all non-ASCII Unicode as `\u{...}` — same UX problem as `{:?}`. -- **Rejected.** +- **Pro:** Stdlib, zero dependency. +- **Con:** Escapes *all* non-ASCII characters, not just control bytes. A Cyrillic hostname like `пример.рф` becomes `\u{43f}\u{440}\u{438}\u{43c}\u{435}\u{440}.\u{440}\u{444}` — same UX problem as the Debug formatter. Verified empirically 2026-04-09. This was the original choice in an earlier draft of this ADR, reversed during plan self-review when the assumption was checked against actual stdlib behavior. +- **Rejected.** The custom helper adds ~8 lines of code to gate `escape_default` on `is_ascii_control()` and avoid mangling valid Unicode. ### Stripping vs. escaping @@ -147,8 +165,8 @@ Add a `Finding::display_summary()` method that returns the escaped form, and hav - **Preserves forensic data.** The raw bytes are kept in the `Finding` struct, available for JSON export, future reporters, and downstream tooling. An analyst exporting to JSON sees the actual SNI bytes (with serde's standard JSON escaping); only the terminal reporter applies terminal-safe escaping. - **Single point of enforcement per medium.** Future analyzers don't need to remember any rule. Adding a new analyzer requires zero terminal-safety awareness. A new reporter (CSV, HTML, etc.) gets one place to apply its own escaping. - **Extensible.** When a future need appears — localization, HTML rendering, different truncation per medium — the pipeline already has the boundary in place. The work is in the display layer, not in every analyzer. -- **`str::escape_default` is the right primitive for the immediate fix.** Stdlib (no dependency), preserves valid Unicode, and escapes exactly the bytes that constitute the threat (C0 + DEL). Cross-checked against `char::escape_debug` (mangles Unicode), `bstr::escape_default` (mangles Unicode), and `{:?}` Debug formatter (mangles Unicode). -- **Validated.** OWASP encoding guidance, RFC 8259 + serde_json behavior, and `str::escape_default` semantics all confirmed via Perplexity 2026-04-08. +- **A small custom helper is the right primitive.** Built on stdlib `char::escape_default` + `char::is_ascii_control`, ~8 lines, no dependency. Gates the escape on control-ness so valid Unicode (Cyrillic, CJK, emoji) passes through unchanged. Escapes exactly the bytes that constitute the threat (C0 + DEL + backslash). The stdlib `str::escape_default` method was considered and rejected (it mangles all non-ASCII). +- **Validated.** OWASP encoding guidance and RFC 8259 + serde_json behavior confirmed via Perplexity 2026-04-08. The escape primitive was re-verified empirically (`rustc`-compiled program, 2026-04-09) after an initial Perplexity answer about `str::escape_default` turned out to be wrong — see Validation. ## Consequences @@ -156,7 +174,7 @@ Add a `Finding::display_summary()` method that returns the escaped form, and hav | File | Change | |------|--------| -| `src/reporter/terminal.rs` | Add a private `escape_for_terminal(s: &str) -> String` helper at file scope that returns `s.escape_default().collect()`. Apply it to `f.summary` (line ~65, where `f.summary` is interpolated into the line `format!`) and to each `ev` in `f.evidence` (line ~81) before writing to the output buffer. The helper is private to the terminal reporter — other reporters that need it (e.g., a future CSV reporter) implement at their own boundary, since each output medium has different escaping rules. | +| `src/reporter/terminal.rs` | Add a private `escape_for_terminal(s: &str) -> String` helper at file scope that iterates `s.chars()`, applies `char::escape_default()` for chars matching `char::is_ascii_control()` or backslash, and passes all other chars through. Apply it to `f.summary` (line ~65, where `f.summary` is interpolated into the line `format!`) and to each `ev` in `f.evidence` (line ~81) before writing to the output buffer. The helper is private to the terminal reporter — other reporters that need it (e.g., a future CSV reporter) implement at their own boundary, since each output medium has different escaping rules. | | `src/analyzer/tls.rs` | Replace `{hostname:?}` (line ~349) and `{lossy:?}` (line ~369) with `{hostname}` / `{lossy}`. Update the inline doc comments that explain *why* the Debug formatter was used; replace them with a pointer to this ADR. | | `src/findings.rs` | Add a `///` doc comment on `impl Display for Finding` noting that it produces RAW text and is NOT safe for terminal display; consumers wanting safe display should go through the terminal reporter. | | `src/analyzer/http.rs` | **No changes required.** Existing raw interpolations are now correct under the new policy. | @@ -173,7 +191,7 @@ Add a `Finding::display_summary()` method that returns the escaped form, and hav ### Behavioral changes visible to users -- TLS findings for non-ASCII / non-UTF-8 SNI hostnames will display readably in the terminal: a Cyrillic SNI like `пример.рф` will appear as `пример.рф` (not `\u{43f}\u{440}...`). Embedded ESC bytes still get escaped to `\x1b`. +- TLS findings for non-ASCII / non-UTF-8 SNI hostnames will display readably in the terminal: a Cyrillic SNI like `пример.рф` will appear as `пример.рф` (not `\u{43f}\u{440}...`). Embedded control bytes still get escaped — an ESC `0x1b` renders as `\u{1b}` via `char::escape_default`. - TLS findings in JSON output will contain the raw hostname (with serde's standard JSON string escaping for control bytes only). Downstream tooling that previously saw `\u{...}` literals will now see the actual UTF-8 hostname. - HTTP findings (path traversal, web shell, admin panel, unusual method, missing host, long URI, empty UA) gain terminal-safety. Previously, an attacker could embed terminal control sequences in a URI and have them rendered live in the analyst's report. They can no longer. @@ -199,5 +217,5 @@ This decision was validated through targeted Perplexity queries on 2026-04-08: - **Output encoding placement:** OWASP guidance (XSS prevention cheat sheet, CWE-117 log injection) recommends encoding at display time, not at storage time. Encoding at construction creates premature commitment to one output context and breaks others. Confirmed. - **`serde_json` control byte handling:** `serde_json` automatically escapes control bytes (including ESC `0x1b`) as `\u001b`, per RFC 8259 §7. JSON output is safe with no analyzer code. Confirmed. -- **`str::escape_default` semantics:** Confirmed against Rust stdlib behavior — uses `char::is_ascii_control` internally, escapes C0 + DEL + backslash, preserves all valid non-ASCII Unicode as raw UTF-8 bytes. Compared against `char::escape_debug`, `bstr::ByteSlice::escape_default`, and Debug formatter `{:?}`; `str::escape_default` is the only stdlib option that preserves multi-byte UTF-8. +- **Escape primitive selection:** An initial Perplexity answer claimed `str::escape_default` preserves multi-byte UTF-8. A follow-up empirical check (`rustc`-compiled program, 2026-04-09) falsified this: `str::escape_default` internally routes every character through `char::escape_default`, which escapes *all* non-ASCII Unicode as `\u{...}`. A custom helper iterating `chars()` and gating `escape_default` on `is_ascii_control() || '\\'` was then verified empirically against the same test inputs (ESC, BEL, DEL, backslash, Cyrillic, emoji, mixed content, tabs/newlines/CR) and produced the correct output: control bytes escaped as `\u{}`, backslash doubled, Cyrillic and emoji preserved byte-for-byte. - **C1 control byte risk:** Standalone C1 bytes (`0x80`–`0x9f`) cannot appear in a valid UTF-8 `String`. Since `Finding.summary` is `String` and analyzers populate it via `String::from_utf8_lossy`, C0 + DEL coverage is complete. From 1143d0bfcfba7ac13d238c5226282e87efde9e8d Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 10:38:53 -0400 Subject: [PATCH 03/16] docs(plan): add implementation plan for ADR 0003 Seven bite-sized TDD tasks implementing the reporting pipeline layering decision from ADR 0003 (commits f4c6098 + 45cf649): Task 1: Cross-reference ADR 0003 from ADR 0002 Task 2: Finding::Display doc warning Task 3: escape_for_terminal helper + 9 unit tests Task 4: Apply helper in terminal reporter render loop Task 5: Roll back TLS construction-site {:?} escaping Task 6: End-to-end regression test (data layer + both reporters) Task 7: cargo test + clippy + fmt Each task has complete code blocks, exact commands, and expected outputs. Test assertions were pre-validated empirically with a small rustc-compiled program (see commit 45cf649 for the primitive correction). No placeholders, no speculative steps. Total impact: ~100 lines of production-code change, ~120 lines of test code, 6-7 commits beyond the ADR. --- .../2026-04-09-reporting-pipeline-layering.md | 803 ++++++++++++++++++ 1 file changed, 803 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-09-reporting-pipeline-layering.md diff --git a/docs/superpowers/plans/2026-04-09-reporting-pipeline-layering.md b/docs/superpowers/plans/2026-04-09-reporting-pipeline-layering.md new file mode 100644 index 0000000..8e76d45 --- /dev/null +++ b/docs/superpowers/plans/2026-04-09-reporting-pipeline-layering.md @@ -0,0 +1,803 @@ +# Reporting Pipeline Layering Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Move finding output sanitization from the analyzer construction site to the terminal reporter, per ADR 0003 (`docs/adr/0003-reporting-pipeline-layering.md`), fixing terminal injection in HTTP findings and restoring forensic data preservation in the `Finding` struct. + +**Architecture:** The `Finding` struct stores raw post-`from_utf8_lossy` bytes; the terminal reporter escapes via a custom helper (~8 lines, built on stdlib `char::escape_default` gated by `char::is_ascii_control`) immediately before writing. The JSON reporter is already safe via `serde_json`'s automatic RFC 8259 control-byte escaping. No new dependencies, no new types. + +**Important mechanism note:** `str::escape_default` was the initial choice but was rejected during plan self-review after empirical verification showed it escapes *all* non-ASCII characters (Cyrillic, emoji) — same UX problem as the PR #49 Debug formatter. See ADR 0003's updated Mechanism section and commit `45cf649`. The custom helper iterates `s.chars()` and only escapes when `char::is_ascii_control() || c == '\\'`. + +**Tech Stack:** Rust 2024 edition, stdlib only for the escape primitive, `serde_json` for JSON output (already in tree), `owo_colors` for terminal styling (already in tree). + +--- + +## File Structure + +| File | Role | Change | +|------|------|--------| +| `docs/adr/0003-reporting-pipeline-layering.md` | The ADR | Already committed in `f4c6098`. No change in this plan. | +| `docs/adr/0002-modular-protocol-analyzers.md` | Analyzer pattern ADR | Add one cross-reference line pointing to ADR 0003 | +| `src/reporter/terminal.rs` | Terminal reporter | Add private `escape_for_terminal` helper + unit tests; apply to `f.summary` and each `ev` in the rendering loop | +| `src/findings.rs` | `Finding` struct | Add doc comment on `impl Display for Finding` warning that it produces raw text | +| `src/analyzer/tls.rs` | TLS analyzer | Replace `{hostname:?}` and `{lossy:?}` with `{hostname}` / `{lossy}`; update inline comments to reference ADR 0003 | +| `tests/tls_analyzer_tests.rs` | TLS tests | Update `test_non_utf8_sni_escapes_control_bytes_in_summary` to assert the RAW ESC byte is now preserved in `f.summary` (contract inversion) | +| `tests/reporter_tests.rs` | Reporter tests | Add end-to-end regression tests: terminal reporter escapes ESC bytes; JSON reporter preserves via serde's `\u001b`; `Finding.summary` keeps the raw byte | + +No files are created. All changes are edits or test additions inside existing files. + +--- + +## Task 1: Cross-reference ADR 0003 from ADR 0002 + +**Files:** +- Modify: `docs/adr/0002-modular-protocol-analyzers.md` + +- [ ] **Step 1: Locate the "Finding Generation Guidelines" section** + +Read `docs/adr/0002-modular-protocol-analyzers.md` and find the "Finding Generation Guidelines" section (around line 103-109 in the current file — it's the bulleted list starting with "Generate findings only for unambiguous security concerns"). + +- [ ] **Step 2: Add cross-reference bullet** + +Add one new bullet at the end of the guidelines list: + +```markdown +- **Output sanitization is a reporter responsibility, not an analyzer responsibility.** Store raw bytes (post-`from_utf8_lossy`) in `Finding.summary` and `Finding.evidence`. Do not escape, debug-format, or otherwise pre-encode untrusted bytes at the analyzer. See ADR 0003 (`docs/adr/0003-reporting-pipeline-layering.md`) for the full layering principle. +``` + +- [ ] **Step 3: Commit** + +```bash +git add docs/adr/0002-modular-protocol-analyzers.md +git commit -m "docs(adr): cross-reference ADR 0003 from ADR 0002 + +Point analyzer authors at the layering principle so they know output +sanitization is the reporter's responsibility, not theirs." +``` + +--- + +## Task 2: Add Finding::Display doc warning + +**Files:** +- Modify: `src/findings.rs:72` + +- [ ] **Step 1: Locate `impl fmt::Display for Finding`** + +Open `src/findings.rs`. The impl block starts at line 72. + +- [ ] **Step 2: Add doc comment above the impl block** + +Insert these lines directly above `impl fmt::Display for Finding {`: + +```rust +/// Produces the raw text representation of a finding for logging, debugging, +/// or machine-readable output. **Not safe for direct terminal display** — the +/// `summary` field may contain attacker-controlled bytes from packet payloads +/// (including ASCII control codes like ESC `0x1b`) that a terminal would +/// interpret as control sequences. For safe terminal rendering, use the +/// terminal reporter (`src/reporter/terminal.rs`), which applies +/// `str::escape_default` to every `summary` and `evidence` entry before +/// writing to the output buffer. See ADR 0003. +``` + +- [ ] **Step 3: Run existing `test_finding_display`** + +```bash +cargo test --test findings_tests test_finding_display +``` + +Expected: PASS (no behavioral change; this is a doc-only edit). + +- [ ] **Step 4: Commit** + +```bash +git add src/findings.rs +git commit -m "docs(findings): warn that Finding::Display is not terminal-safe + +Doc comment on the impl Display block explaining that the raw text +representation is for logging/debugging only and may contain +attacker-controlled control bytes. Terminal rendering should go +through the terminal reporter, which escapes per ADR 0003." +``` + +--- + +## Task 3: Add `escape_for_terminal` helper with unit tests + +**Files:** +- Modify: `src/reporter/terminal.rs` (add helper and `#[cfg(test)] mod tests`) + +- [ ] **Step 1: Write failing unit tests** + +At the bottom of `src/reporter/terminal.rs`, add a `#[cfg(test)] mod tests` block. **Note on expected output format:** `char::escape_default` uses `\u{}` (minimal hex, no leading zeros) for control bytes without a short escape — so ESC 0x1b renders as `\u{1b}`, BEL 0x07 renders as `\u{7}`, DEL 0x7f renders as `\u{7f}`. It uses the short forms `\n`, `\r`, `\t` for newline/CR/tab, and `\\` for backslash. These expectations were verified empirically (see commit `45cf649`). + +```rust +#[cfg(test)] +mod tests { + use super::escape_for_terminal; + + #[test] + fn escapes_esc_byte() { + assert_eq!( + escape_for_terminal("\x1b[31mRED\x1b[0m"), + "\\u{1b}[31mRED\\u{1b}[0m" + ); + } + + #[test] + fn escapes_bel_and_del() { + assert_eq!( + escape_for_terminal("ring\x07bye\x7f"), + "ring\\u{7}bye\\u{7f}" + ); + } + + #[test] + fn escapes_tab_newline_cr_as_short_forms() { + // char::escape_default uses short escapes for these three. + assert_eq!( + escape_for_terminal("tab\there\nnewline\rreturn"), + "tab\\there\\nnewline\\rreturn" + ); + } + + #[test] + fn escapes_backslash() { + assert_eq!(escape_for_terminal("a\\b"), "a\\\\b"); + } + + #[test] + fn preserves_printable_ascii() { + assert_eq!( + escape_for_terminal("hello world 123 !@#"), + "hello world 123 !@#" + ); + } + + #[test] + fn preserves_cyrillic() { + assert_eq!(escape_for_terminal("пример.рф"), "пример.рф"); + } + + #[test] + fn preserves_emoji() { + assert_eq!(escape_for_terminal("crab 🦀 rust"), "crab 🦀 rust"); + } + + #[test] + fn mixed_content_escapes_only_dangerous_bytes() { + // Cyrillic + ESC injection + emoji — Cyrillic and emoji must survive, + // only the ESC sequence should be escaped. + assert_eq!( + escape_for_terminal("пример\x1b[31m🦀"), + "пример\\u{1b}[31m🦀" + ); + } + + #[test] + fn empty_string_is_empty() { + assert_eq!(escape_for_terminal(""), ""); + } +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +```bash +cargo test --lib reporter::terminal::tests 2>&1 | tail -20 +``` + +Expected: compile error — `escape_for_terminal` not in scope. This confirms the tests are wired up and the function doesn't exist yet. (Private `#[cfg(test)] mod tests` blocks inside `src/reporter/terminal.rs` are lib tests, not integration tests — use `--lib` alone without `--test`.) + +- [ ] **Step 3: Implement `escape_for_terminal`** + +At the top of `src/reporter/terminal.rs`, immediately after the `use` statements and before `pub struct TerminalReporter`, add: + +```rust +/// Escape control bytes (C0 + DEL + backslash) for safe terminal display. +/// +/// Iterates the input string's characters and applies `char::escape_default` +/// only when the character matches `char::is_ascii_control()` or is a +/// backslash. All other characters — printable ASCII and valid non-ASCII +/// Unicode (Cyrillic, CJK, emoji) — pass through unchanged. +/// +/// Why not `str::escape_default`? It routes *every* character through +/// `char::escape_default`, which escapes non-ASCII as `\u{...}` and +/// would mangle a Cyrillic hostname like `пример.рф` into +/// `\u{43f}\u{440}...`. See ADR 0003 (`docs/adr/0003-reporting-pipeline-layering.md`) +/// for the layering rationale and the empirical verification. +fn escape_for_terminal(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + for c in s.chars() { + if c.is_ascii_control() || c == '\\' { + for e in c.escape_default() { + out.push(e); + } + } else { + out.push(c); + } + } + out +} +``` + +- [ ] **Step 4: Run the helper unit tests** + +```bash +cargo test --lib reporter::terminal::tests 2>&1 | tail -20 +``` + +Expected: all 9 tests PASS. + +- [ ] **Step 5: Commit** + +```bash +git add src/reporter/terminal.rs +git commit -m "feat(reporter): add escape_for_terminal helper + +Private helper in src/reporter/terminal.rs built on stdlib +char::escape_default, gated by char::is_ascii_control || backslash. +Escapes C0 control bytes, DEL, and backslash while preserving all +valid non-ASCII Unicode (Cyrillic, CJK, emoji). Unit tested for ESC, +BEL, DEL, tab/newline/CR short escapes, backslash, printable ASCII, +Cyrillic, emoji, mixed content, and empty string. + +Not yet applied to the rendering loop — that's the next commit, so +the helper can land with dedicated tests before it's integrated. + +See ADR 0003 for the layering rationale." +``` + +--- + +## Task 4: Apply helper in terminal reporter rendering loop + +**Files:** +- Modify: `src/reporter/terminal.rs:63-82` (the `FINDINGS` rendering block) +- Modify: `tests/reporter_tests.rs` (add a failing test first) + +- [ ] **Step 1: Write a failing test in `tests/reporter_tests.rs`** + +Add this test at the end of `tests/reporter_tests.rs`: + +```rust +#[test] +fn test_terminal_reporter_escapes_esc_bytes_in_summary() { + // Regression: a Finding whose summary contains an ESC byte must not + // propagate the raw byte to terminal output, where it would be + // interpreted as an ANSI escape sequence. Per ADR 0003, the terminal + // reporter is responsible for this escaping. + let reporter = TerminalReporter { use_color: false }; + let summary = Summary::new(); + let findings = vec![Finding { + category: ThreatCategory::Anomaly, + verdict: Verdict::Inconclusive, + confidence: Confidence::Low, + summary: "attacker payload: \x1b[31mRED\x1b[0m".into(), + evidence: vec!["raw evidence: \x1b[32mGREEN".into()], + mitre_technique: None, + source_ip: None, + timestamp: None, + }]; + + let output = reporter.render(&summary, &findings, &[]); + + assert!( + !output.as_bytes().contains(&0x1b), + "terminal output must not contain raw ESC (0x1b) bytes, got: {output:?}" + ); + assert!( + output.contains("\\u{1b}[31mRED"), + "terminal output should contain escaped form of ESC sequence in summary, got: {output}" + ); + assert!( + output.contains("\\u{1b}[32mGREEN"), + "terminal output should contain escaped form in evidence line, got: {output}" + ); +} +``` + +- [ ] **Step 2: Run the test to verify it fails** + +```bash +cargo test --test reporter_tests test_terminal_reporter_escapes_esc_bytes_in_summary 2>&1 | tail -15 +``` + +Expected: FAIL — the assertion `!output.as_bytes().contains(&0x1b)` fails because the current renderer interpolates `f.summary` and `ev` raw. + +- [ ] **Step 3: Wrap `f.summary` and each `ev` in the render loop** + +In `src/reporter/terminal.rs`, find the findings rendering block (around lines 60-86). The current code reads: + +```rust + // Findings + if !findings.is_empty() { + out.push_str(&self.section("FINDINGS")); + for f in findings { + let line = format!( + "[{}] {} ({}) - {}", + f.category, f.verdict, f.confidence, f.summary + ); + let colored = if self.use_color { + match f.verdict { + Verdict::Likely => match f.confidence { + Confidence::High => line.red().bold().to_string(), + _ => line.yellow().to_string(), + }, + Verdict::Inconclusive => line.cyan().to_string(), + Verdict::Unlikely => line.dimmed().to_string(), + } + } else { + line + }; + out.push_str(&format!(" {colored}\n")); + for ev in &f.evidence { + out.push_str(&format!(" > {ev}\n")); + } + if let Some(ref t) = f.mitre_technique { + out.push_str(&format!(" MITRE: {t}\n")); + } + } + out.push('\n'); + } +``` + +Change it to wrap `f.summary` and each `ev` through `escape_for_terminal`: + +```rust + // Findings + if !findings.is_empty() { + out.push_str(&self.section("FINDINGS")); + for f in findings { + // Per ADR 0003: the Finding struct stores raw bytes; the + // terminal reporter is responsible for escaping untrusted + // content (summary + evidence) before writing to a TTY. + let safe_summary = escape_for_terminal(&f.summary); + let line = format!( + "[{}] {} ({}) - {}", + f.category, f.verdict, f.confidence, safe_summary + ); + let colored = if self.use_color { + match f.verdict { + Verdict::Likely => match f.confidence { + Confidence::High => line.red().bold().to_string(), + _ => line.yellow().to_string(), + }, + Verdict::Inconclusive => line.cyan().to_string(), + Verdict::Unlikely => line.dimmed().to_string(), + } + } else { + line + }; + out.push_str(&format!(" {colored}\n")); + for ev in &f.evidence { + let safe_ev = escape_for_terminal(ev); + out.push_str(&format!(" > {safe_ev}\n")); + } + if let Some(ref t) = f.mitre_technique { + out.push_str(&format!(" MITRE: {t}\n")); + } + } + out.push('\n'); + } +``` + +- [ ] **Step 4: Run the new test to verify it passes** + +```bash +cargo test --test reporter_tests test_terminal_reporter_escapes_esc_bytes_in_summary 2>&1 | tail -10 +``` + +Expected: PASS. Also run all reporter tests to make sure nothing else broke: + +```bash +cargo test --test reporter_tests 2>&1 | tail -15 +``` + +Expected: all tests PASS. + +- [ ] **Step 5: Commit** + +```bash +git add src/reporter/terminal.rs tests/reporter_tests.rs +git commit -m "feat(reporter): escape finding summary and evidence at terminal render + +Apply escape_for_terminal to every Finding.summary and every evidence +entry in the terminal reporter's rendering loop. This moves output +sanitization from the analyzer construction site (where PR #49 placed +it) to the display layer, per ADR 0003. + +JSON output remains unchanged and is already safe via serde_json's +automatic RFC 8259 escaping. HTTP findings (path traversal, web shell, +admin panel, unusual method, etc.) — previously vulnerable to terminal +injection via attacker-controlled URIs — are fixed by this change with +zero analyzer-side code modifications." +``` + +--- + +## Task 5: Roll back TLS Debug formatter escaping + +**Files:** +- Modify: `tests/tls_analyzer_tests.rs:397-438` (invert `test_non_utf8_sni_escapes_control_bytes_in_summary`) +- Modify: `src/analyzer/tls.rs:343-376` (remove `{:?}` Debug-format interpolation) + +- [ ] **Step 1: Invert the failing-contract test first** + +Replace the body of `test_non_utf8_sni_escapes_control_bytes_in_summary` in `tests/tls_analyzer_tests.rs` (lines 396-438) with its new contract. Rename it to reflect the new meaning: + +```rust +#[test] +fn test_non_utf8_sni_preserves_raw_bytes_in_summary() { + // Per ADR 0003: the Finding struct is the data layer — it stores the + // raw post-from_utf8_lossy bytes from the attacker's SNI, including + // any ASCII control codes. Terminal-safety is the reporter's job, not + // the analyzer's. This test enforces that contract: raw ESC must + // survive to the struct; downstream rendering tests (in reporter + // tests) verify the terminal reporter escapes it on display. + let mut analyzer = TlsAnalyzer::new(); + let fk = test_flow_key(); + + // 0xff makes from_utf8 fail; 0x1b [ 3 1 m is the ANSI "red" CSI sequence; + // "pwnd" is the visible payload an attacker would inject. + let raw_sni: &[u8] = &[0xff, 0x1b, b'[', b'3', b'1', b'm', b'p', b'w', b'n', b'd']; + let record = build_client_hello_raw_sni(raw_sni, &[0x1301]); + analyzer.on_data(&fk, Direction::ClientToServer, &record, 0); + + let findings = analyzer.findings(); + let f = findings + .iter() + .find(|f| f.summary.contains("non-UTF-8 bytes")) + .expect("expected non-UTF-8 SNI finding"); + + // The summary MUST contain the raw ESC byte — the analyzer does not + // escape. Forensic preservation is a load-bearing property of the + // data layer (ADR 0003). + assert!( + f.summary.as_bytes().contains(&0x1b), + "summary must preserve raw ESC byte for forensics, got: {:?}", + f.summary + ); + // And it must NOT contain the Debug-formatted escape form (which + // would indicate a regression to construction-site escaping). + assert!( + !f.summary.contains("\\u{1b}"), + "summary must not contain Debug-formatted escape (regression to construction-site), got: {}", + f.summary + ); + + // Hex evidence is unchanged — that's the lossless record. + assert!( + f.evidence + .iter() + .any(|e| e.contains("ff1b5b33316d70776e64")), + "expected raw bytes in hex evidence, got: {:?}", + f.evidence + ); +} +``` + +- [ ] **Step 2: Run the test to verify it fails** + +```bash +cargo test --test tls_analyzer_tests test_non_utf8_sni_preserves_raw_bytes_in_summary 2>&1 | tail -15 +``` + +Expected: FAIL — the current TLS code still uses `{lossy:?}`, so `f.summary` contains `\u{1b}` (escaped) instead of the raw `0x1b` byte. + +- [ ] **Step 3: Remove the Debug formatter in `src/analyzer/tls.rs`** + +In `src/analyzer/tls.rs`, find the NonAsciiUtf8 match arm (around lines 338-356). The current code has: + +```rust + SniValue::NonAsciiUtf8 { hostname, hex } => { + self.all_findings.push(Finding { + category: ThreatCategory::Anomaly, + verdict: Verdict::Inconclusive, + confidence: Confidence::Low, + // Use Debug formatter ({:?}) to escape any control codepoints + // that might survive UTF-8 decoding (e.g. U+0085 NEL); the + // hostname here is valid UTF-8 but printable-script content + // is not guaranteed. + summary: format!( + "TLS SNI contains non-ASCII characters (RFC 6066 requires \ + A-labels per RFC 5890): {hostname:?}" + ), + evidence: vec![format!("hex: {hex}")], + mitre_technique: None, + source_ip: None, + timestamp: None, + }); + } +``` + +Replace with (remove the `:?` and update the comment): + +```rust + SniValue::NonAsciiUtf8 { hostname, hex } => { + self.all_findings.push(Finding { + category: ThreatCategory::Anomaly, + verdict: Verdict::Inconclusive, + confidence: Confidence::Low, + // Raw hostname interpolation — the data layer stores raw + // bytes per ADR 0003. Terminal-safety (escaping control + // codes, etc.) is applied by the terminal reporter at + // render time, not here. + summary: format!( + "TLS SNI contains non-ASCII characters (RFC 6066 requires \ + A-labels per RFC 5890): {hostname}" + ), + evidence: vec![format!("hex: {hex}")], + mitre_technique: None, + source_ip: None, + timestamp: None, + }); + } +``` + +Now find the NonUtf8 match arm (around lines 357-376) with the current code: + +```rust + SniValue::NonUtf8 { lossy, hex } => { + self.all_findings.push(Finding { + category: ThreatCategory::Anomaly, + verdict: Verdict::Inconclusive, + confidence: Confidence::Low, + // Use Debug formatter ({:?}) to escape control bytes (e.g. + // ESC 0x1b) that String::from_utf8_lossy preserves but the + // analyst's terminal would interpret as ANSI control + // sequences. Without this an attacker could craft a + // malformed SNI like b"\x1b[31m..." that recolors or + // overwrites the rendered finding line. + summary: format!( + "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy:?}" + ), + evidence: vec![format!("hex: {hex}")], + mitre_technique: None, + source_ip: None, + timestamp: None, + }); + } +``` + +Replace with (remove the `:?` and update the comment): + +```rust + SniValue::NonUtf8 { lossy, hex } => { + self.all_findings.push(Finding { + category: ThreatCategory::Anomaly, + verdict: Verdict::Inconclusive, + confidence: Confidence::Low, + // Raw lossy interpolation — the data layer stores raw + // bytes (including any embedded ASCII control codes) per + // ADR 0003. The terminal reporter is responsible for + // escaping these for safe display; JSON output is already + // safe via serde_json's automatic RFC 8259 escaping. + summary: format!( + "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy}" + ), + evidence: vec![format!("hex: {hex}")], + mitre_technique: None, + source_ip: None, + timestamp: None, + }); + } +``` + +- [ ] **Step 4: Run the inverted test to verify it passes** + +```bash +cargo test --test tls_analyzer_tests test_non_utf8_sni_preserves_raw_bytes_in_summary 2>&1 | tail -10 +``` + +Expected: PASS. The raw ESC byte now survives to `f.summary`. + +- [ ] **Step 5: Run the full TLS test suite to confirm no other tests broke** + +```bash +cargo test --test tls_analyzer_tests 2>&1 | tail -20 +``` + +Expected: all tests PASS. The non-ASCII tests (Cyrillic, emoji, café.example) don't assert on escaped form directly — they check for substrings like `"RFC 6066"` and `"non-ASCII characters"` which are still present in the summary. The change only affects how the user-controlled `{hostname}` / `{lossy}` portion is interpolated. + +- [ ] **Step 6: Commit** + +```bash +git add src/analyzer/tls.rs tests/tls_analyzer_tests.rs +git commit -m "fix(tls): roll back construction-site escaping of SNI summaries + +Remove {:?} Debug-format interpolation in both NonAsciiUtf8 and NonUtf8 +SNI finding arms. The data layer now stores raw post-from_utf8_lossy +bytes; the terminal reporter is responsible for escaping at render +time per ADR 0003. + +Inverts test_non_utf8_sni_escapes_control_bytes_in_summary — renamed +to test_non_utf8_sni_preserves_raw_bytes_in_summary — to enforce the +new contract: raw ESC bytes MUST survive to Finding.summary. A +separate regression test (next commit) verifies the end-to-end +pipeline: terminal reporter escapes, JSON reporter preserves via +serde, struct holds the raw byte. + +Behavioral change visible to JSON consumers: Cyrillic SNIs now appear +as readable Unicode (\"пример.рф\") instead of hex-escaped +placeholders (\"\\u{43f}\\u{440}...\")." +``` + +--- + +## Task 6: End-to-end regression test for the layering contract + +**Files:** +- Modify: `tests/reporter_tests.rs` (add one integration test) + +- [ ] **Step 1: Write the end-to-end test** + +Add this test at the end of `tests/reporter_tests.rs`: + +```rust +#[test] +fn test_output_sanitization_layering_contract() { + // End-to-end contract test for ADR 0003. A single Finding flows through + // the data layer and both reporters; all three assertions must hold: + // 1. The struct itself keeps the raw ESC byte (forensic layer). + // 2. The terminal reporter escapes the ESC byte (terminal display layer). + // 3. The JSON reporter escapes via serde's RFC 8259 \u001b form (JSON layer). + // + // Any future regression that breaks one of these — e.g., re-introducing + // construction-site escaping, removing the terminal reporter's helper, + // or swapping to a JSON crate that doesn't escape control chars — will + // fail this test. + let finding = Finding { + category: ThreatCategory::Anomaly, + verdict: Verdict::Inconclusive, + confidence: Confidence::Low, + summary: "attacker payload: \x1b[31mRED\x1b[0m".into(), + evidence: vec!["ev: \x1b[32mGREEN".into()], + mitre_technique: None, + source_ip: None, + timestamp: None, + }; + + // Layer 1: the struct preserves the raw ESC byte (forensic ground truth). + assert!( + finding.summary.as_bytes().contains(&0x1b), + "Finding.summary must preserve raw ESC for forensics" + ); + assert!( + finding.evidence[0].as_bytes().contains(&0x1b), + "Finding.evidence must preserve raw ESC for forensics" + ); + + // Layer 2: terminal reporter escapes on display. + let terminal_output = TerminalReporter { use_color: false }.render( + &Summary::new(), + std::slice::from_ref(&finding), + &[], + ); + assert!( + !terminal_output.as_bytes().contains(&0x1b), + "terminal reporter must not emit raw ESC bytes, got: {terminal_output:?}" + ); + assert!( + terminal_output.contains("\\u{1b}[31mRED"), + "terminal reporter should emit the escaped summary form, got: {terminal_output}" + ); + assert!( + terminal_output.contains("\\u{1b}[32mGREEN"), + "terminal reporter should emit the escaped evidence form, got: {terminal_output}" + ); + + // Layer 3: JSON reporter escapes via serde's RFC 8259 \u001b form. + let json_output = JsonReporter.render( + &Summary::new(), + std::slice::from_ref(&finding), + &[], + ); + assert!( + !json_output.as_bytes().contains(&0x1b), + "JSON reporter must not emit raw ESC bytes, got: {json_output:?}" + ); + assert!( + json_output.contains("\\u001b"), + "JSON reporter should serialize ESC as \\u001b per RFC 8259, got: {json_output}" + ); + // Round-trip through serde_json::from_str: the deserialized summary + // must match the original raw ESC byte. This proves the JSON escape + // is reversible, which is what downstream tooling relies on. + let parsed: serde_json::Value = serde_json::from_str(&json_output).unwrap(); + let parsed_summary = parsed["findings"][0]["summary"].as_str().unwrap(); + assert_eq!(parsed_summary, finding.summary); +} +``` + +- [ ] **Step 2: Run the test** + +```bash +cargo test --test reporter_tests test_output_sanitization_layering_contract 2>&1 | tail -15 +``` + +Expected: PASS immediately — Tasks 3, 4, and 5 already established the behavior. This test codifies the contract so it can't silently regress. + +- [ ] **Step 3: Commit** + +```bash +git add tests/reporter_tests.rs +git commit -m "test(reporter): end-to-end contract test for ADR 0003 layering + +Single Finding flows through the data layer and both reporters. Asserts: + + 1. Finding struct preserves raw ESC byte (forensic ground truth). + 2. Terminal reporter escapes ESC to \\u{1b} on display. + 3. JSON reporter serializes ESC as \\u001b per RFC 8259. + 4. JSON round-trips back to the original raw byte (reversibility). + +Any regression that re-introduces construction-site escaping, removes +the terminal reporter's helper, or changes the JSON path will fail +this test." +``` + +--- + +## Task 7: Full test suite and clippy + +**Files:** none modified + +- [ ] **Step 1: Run cargo test** + +```bash +cargo test --all 2>&1 | tail -30 +``` + +Expected: all tests PASS. Count total — should be approximately the same as before plus the new tests added in this plan (9 helper unit tests in `src/reporter/terminal.rs`, 1 new reporter test, 1 new end-to-end test, plus 1 inverted TLS test — net +11 new, +1 rewritten). + +- [ ] **Step 2: Run cargo clippy** + +```bash +cargo clippy --all-targets -- -D warnings 2>&1 | tail -20 +``` + +Expected: zero warnings. The new code should be idiomatic Rust. + +- [ ] **Step 3: Run cargo fmt check (no changes)** + +```bash +cargo fmt -- --check 2>&1 | tail -10 +``` + +Expected: zero diff. If formatting drifted, run `cargo fmt` and include the changes in the next commit. + +- [ ] **Step 4: No commit unless fmt flagged changes** + +If Step 3 reported no changes, no commit is needed. If it did: + +```bash +cargo fmt +git add -u +git commit -m "chore: cargo fmt" +``` + +--- + +## Summary of commits + +After all tasks complete, the branch should have (on top of `develop`): + +1. `f4c6098` docs(adr): add ADR 0003 — reporting pipeline layering *(already committed before plan execution)* +2. docs(adr): cross-reference ADR 0003 from ADR 0002 *(Task 1)* +3. docs(findings): warn that Finding::Display is not terminal-safe *(Task 2)* +4. feat(reporter): add escape_for_terminal helper *(Task 3)* +5. feat(reporter): escape finding summary and evidence at terminal render *(Task 4)* +6. fix(tls): roll back construction-site escaping of SNI summaries *(Task 5)* +7. test(reporter): end-to-end contract test for ADR 0003 layering *(Task 6)* +8. (optional) chore: cargo fmt *(Task 7, only if needed)* + +Total: 6–7 commits beyond the ADR, ~100 lines of production-code change, ~120 lines of new/modified test code. + +## Post-implementation + +After all tasks complete and tests pass, proceed to: +- Multi-agent PR review (`/pr-review-toolkit:review-pr`) +- Apply review triage pattern +- Push branch + create PR + request Copilot review +- Address Copilot feedback From 902827f4e3ebe9851799e1a02ff117788713385b Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 10:56:38 -0400 Subject: [PATCH 04/16] docs(adr): cross-reference ADR 0003 from ADR 0002 Point analyzer authors at the layering principle so they know output sanitization is the reporter's responsibility, not theirs. --- docs/adr/0002-modular-protocol-analyzers.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/adr/0002-modular-protocol-analyzers.md b/docs/adr/0002-modular-protocol-analyzers.md index 656a8a3..1a1cbe4 100644 --- a/docs/adr/0002-modular-protocol-analyzers.md +++ b/docs/adr/0002-modular-protocol-analyzers.md @@ -107,6 +107,7 @@ The `detail` map contains protocol-specific fields as `serde_json::Value`. This - Include MITRE ATT&CK technique ID only when there's a clean mapping; `None` is better than a forced fit - Include actionable evidence (specific values, not just "something was wrong") - Cap findings with `MAX_FINDINGS` to prevent memory exhaustion on adversarial input +- **Output sanitization is a reporter responsibility, not an analyzer responsibility.** Store raw bytes (post-`from_utf8_lossy`) in `Finding.summary` and `Finding.evidence`. Do not escape, debug-format, or otherwise pre-encode untrusted bytes at the analyzer. See ADR 0003 (`docs/adr/0003-reporting-pipeline-layering.md`) for the full layering principle. ## Alternatives Considered From 0775b6fc10dfc5d86e169e9b2931a0985dfadda6 Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 11:05:03 -0400 Subject: [PATCH 05/16] docs(findings): warn that Finding::Display is not terminal-safe Doc comment on the impl Display block explaining that the raw text representation is for logging/debugging only and may contain attacker-controlled control bytes. Terminal rendering should go through the terminal reporter, which escapes per ADR 0003. --- src/findings.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/findings.rs b/src/findings.rs index 4fc99ec..deafe53 100644 --- a/src/findings.rs +++ b/src/findings.rs @@ -69,6 +69,15 @@ pub struct Finding { pub timestamp: Option>, } +/// Produces the raw text representation of a finding for logging, debugging, +/// or machine-readable output. **Not safe for direct terminal display** — the +/// `summary` field may contain attacker-controlled bytes from packet payloads +/// (including ASCII control codes like ESC `0x1b`) that a terminal would +/// interpret as control sequences. For safe terminal rendering, use the +/// terminal reporter (`src/reporter/terminal.rs`), which escapes every +/// `summary` and `evidence` entry before writing to the output buffer. +/// See ADR 0003 (`docs/adr/0003-reporting-pipeline-layering.md`) for the +/// full layering principle. impl fmt::Display for Finding { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( From 1f7e556ceeafbe3b0ff0f7367d72642cd36d0fae Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 11:11:09 -0400 Subject: [PATCH 06/16] feat(reporter): add escape_for_terminal helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Private helper in src/reporter/terminal.rs built on stdlib char::escape_default, gated by char::is_ascii_control || backslash. Escapes C0 control bytes, DEL, and backslash while preserving all valid non-ASCII Unicode (Cyrillic, CJK, emoji). Unit tested for ESC, BEL, DEL, tab/newline/CR short escapes, backslash, printable ASCII, Cyrillic, emoji, mixed content, and empty string. Not yet applied to the rendering loop — that's the next commit, so the helper can land with dedicated tests before it's integrated. See ADR 0003 for the layering rationale. --- src/reporter/terminal.rs | 94 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/src/reporter/terminal.rs b/src/reporter/terminal.rs index 3bda246..f46ea1a 100644 --- a/src/reporter/terminal.rs +++ b/src/reporter/terminal.rs @@ -5,6 +5,32 @@ use crate::findings::{Confidence, Finding, Verdict}; use crate::reporter::Reporter; use crate::summary::Summary; +/// Escape control bytes (C0 + DEL + backslash) for safe terminal display. +/// +/// Iterates the input string's characters and applies `char::escape_default` +/// only when the character matches `char::is_ascii_control()` or is a +/// backslash. All other characters — printable ASCII and valid non-ASCII +/// Unicode (Cyrillic, CJK, emoji) — pass through unchanged. +/// +/// Why not `str::escape_default`? It routes *every* character through +/// `char::escape_default`, which escapes non-ASCII as `\u{...}` and +/// would mangle a Cyrillic hostname like `пример.рф` into +/// `\u{43f}\u{440}...`. See ADR 0003 (`docs/adr/0003-reporting-pipeline-layering.md`) +/// for the layering rationale and the empirical verification. +fn escape_for_terminal(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + for c in s.chars() { + if c.is_ascii_control() || c == '\\' { + for e in c.escape_default() { + out.push(e); + } + } else { + out.push(c); + } + } + out +} + pub struct TerminalReporter { pub use_color: bool, } @@ -113,3 +139,71 @@ impl TerminalReporter { } } } + +#[cfg(test)] +mod tests { + use super::escape_for_terminal; + + #[test] + fn escapes_esc_byte() { + assert_eq!( + escape_for_terminal("\x1b[31mRED\x1b[0m"), + "\\u{1b}[31mRED\\u{1b}[0m" + ); + } + + #[test] + fn escapes_bel_and_del() { + assert_eq!( + escape_for_terminal("ring\x07bye\x7f"), + "ring\\u{7}bye\\u{7f}" + ); + } + + #[test] + fn escapes_tab_newline_cr_as_short_forms() { + // char::escape_default uses short escapes for these three. + assert_eq!( + escape_for_terminal("tab\there\nnewline\rreturn"), + "tab\\there\\nnewline\\rreturn" + ); + } + + #[test] + fn escapes_backslash() { + assert_eq!(escape_for_terminal("a\\b"), "a\\\\b"); + } + + #[test] + fn preserves_printable_ascii() { + assert_eq!( + escape_for_terminal("hello world 123 !@#"), + "hello world 123 !@#" + ); + } + + #[test] + fn preserves_cyrillic() { + assert_eq!(escape_for_terminal("пример.рф"), "пример.рф"); + } + + #[test] + fn preserves_emoji() { + assert_eq!(escape_for_terminal("crab 🦀 rust"), "crab 🦀 rust"); + } + + #[test] + fn mixed_content_escapes_only_dangerous_bytes() { + // Cyrillic + ESC injection + emoji — Cyrillic and emoji must survive, + // only the ESC sequence should be escaped. + assert_eq!( + escape_for_terminal("пример\x1b[31m🦀"), + "пример\\u{1b}[31m🦀" + ); + } + + #[test] + fn empty_string_is_empty() { + assert_eq!(escape_for_terminal(""), ""); + } +} From 755837b25eca01b32f5d7481dc2deb9597ae43b3 Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 11:19:29 -0400 Subject: [PATCH 07/16] feat(reporter): escape finding summary and evidence at terminal render MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply escape_for_terminal to every Finding.summary and every evidence entry in the terminal reporter's rendering loop. This moves output sanitization from the analyzer construction site (where PR #49 placed it) to the display layer, per ADR 0003. JSON output remains unchanged and is already safe via serde_json's automatic RFC 8259 escaping. HTTP findings (path traversal, web shell, admin panel, unusual method, etc.) — previously vulnerable to terminal injection via attacker-controlled URIs — are fixed by this change with zero analyzer-side code modifications. --- src/reporter/terminal.rs | 9 +++++++-- tests/reporter_tests.rs | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/src/reporter/terminal.rs b/src/reporter/terminal.rs index f46ea1a..c32d2f6 100644 --- a/src/reporter/terminal.rs +++ b/src/reporter/terminal.rs @@ -86,9 +86,13 @@ impl Reporter for TerminalReporter { if !findings.is_empty() { out.push_str(&self.section("FINDINGS")); for f in findings { + // Per ADR 0003: the Finding struct stores raw bytes; the + // terminal reporter is responsible for escaping untrusted + // content (summary + evidence) before writing to a TTY. + let safe_summary = escape_for_terminal(&f.summary); let line = format!( "[{}] {} ({}) - {}", - f.category, f.verdict, f.confidence, f.summary + f.category, f.verdict, f.confidence, safe_summary ); let colored = if self.use_color { match f.verdict { @@ -104,7 +108,8 @@ impl Reporter for TerminalReporter { }; out.push_str(&format!(" {colored}\n")); for ev in &f.evidence { - out.push_str(&format!(" > {ev}\n")); + let safe_ev = escape_for_terminal(ev); + out.push_str(&format!(" > {safe_ev}\n")); } if let Some(ref t) = f.mitre_technique { out.push_str(&format!(" MITRE: {t}\n")); diff --git a/tests/reporter_tests.rs b/tests/reporter_tests.rs index 137b60c..c636944 100644 --- a/tests/reporter_tests.rs +++ b/tests/reporter_tests.rs @@ -77,3 +77,38 @@ fn test_terminal_reporter_hides_skipped_when_zero() { "Terminal output should NOT show 'Skipped' when zero, got: {output}" ); } + +#[test] +fn test_terminal_reporter_escapes_esc_bytes_in_summary() { + // Regression: a Finding whose summary contains an ESC byte must not + // propagate the raw byte to terminal output, where it would be + // interpreted as an ANSI escape sequence. Per ADR 0003, the terminal + // reporter is responsible for this escaping. + let reporter = TerminalReporter { use_color: false }; + let summary = Summary::new(); + let findings = vec![Finding { + category: ThreatCategory::Anomaly, + verdict: Verdict::Inconclusive, + confidence: Confidence::Low, + summary: "attacker payload: \x1b[31mRED\x1b[0m".into(), + evidence: vec!["raw evidence: \x1b[32mGREEN".into()], + mitre_technique: None, + source_ip: None, + timestamp: None, + }]; + + let output = reporter.render(&summary, &findings, &[]); + + assert!( + !output.as_bytes().contains(&0x1b), + "terminal output must not contain raw ESC (0x1b) bytes, got: {output:?}" + ); + assert!( + output.contains("\\u{1b}[31mRED"), + "terminal output should contain escaped form of ESC sequence in summary, got: {output}" + ); + assert!( + output.contains("\\u{1b}[32mGREEN"), + "terminal output should contain escaped form in evidence line, got: {output}" + ); +} From 106310fb25a31e7ab5bd46e1494fe0b99e583ed9 Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 11:23:47 -0400 Subject: [PATCH 08/16] style(reporter): rename safe_summary/safe_ev to escaped_summary/escaped_ev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Minor clarity improvement from Task 4 code quality review. "safe" is ambiguous — the escaped string still contains the escape sequences in textual form, which is what the terminal wants. "escaped" names the action (control-byte escaping via char::escape_default) rather than a misleading safety property. Zero behavioral change. --- src/reporter/terminal.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/reporter/terminal.rs b/src/reporter/terminal.rs index c32d2f6..3d21c8e 100644 --- a/src/reporter/terminal.rs +++ b/src/reporter/terminal.rs @@ -89,10 +89,10 @@ impl Reporter for TerminalReporter { // Per ADR 0003: the Finding struct stores raw bytes; the // terminal reporter is responsible for escaping untrusted // content (summary + evidence) before writing to a TTY. - let safe_summary = escape_for_terminal(&f.summary); + let escaped_summary = escape_for_terminal(&f.summary); let line = format!( "[{}] {} ({}) - {}", - f.category, f.verdict, f.confidence, safe_summary + f.category, f.verdict, f.confidence, escaped_summary ); let colored = if self.use_color { match f.verdict { @@ -108,8 +108,8 @@ impl Reporter for TerminalReporter { }; out.push_str(&format!(" {colored}\n")); for ev in &f.evidence { - let safe_ev = escape_for_terminal(ev); - out.push_str(&format!(" > {safe_ev}\n")); + let escaped_ev = escape_for_terminal(ev); + out.push_str(&format!(" > {escaped_ev}\n")); } if let Some(ref t) = f.mitre_technique { out.push_str(&format!(" MITRE: {t}\n")); From 0e42aad26c2cdca2d6a6b65519d0d8355aa12e32 Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 11:26:52 -0400 Subject: [PATCH 09/16] fix(tls): roll back construction-site escaping of SNI summaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove {:?} Debug-format interpolation in both NonAsciiUtf8 and NonUtf8 SNI finding arms. The data layer now stores raw post-from_utf8_lossy bytes; the terminal reporter is responsible for escaping at render time per ADR 0003. Inverts test_non_utf8_sni_escapes_control_bytes_in_summary — renamed to test_non_utf8_sni_preserves_raw_bytes_in_summary — to enforce the new contract: raw ESC bytes MUST survive to Finding.summary. A separate regression test (next commit) verifies the end-to-end pipeline: terminal reporter escapes, JSON reporter preserves via serde, struct holds the raw byte. Behavioral change visible to JSON consumers: Cyrillic SNIs now appear as readable Unicode ("пример.рф") instead of hex-escaped placeholders ("\u{43f}\u{440}..."). --- src/analyzer/tls.rs | 23 +++++++++++------------ tests/tls_analyzer_tests.rs | 27 ++++++++++++++++----------- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/analyzer/tls.rs b/src/analyzer/tls.rs index a1193f4..3eecad4 100644 --- a/src/analyzer/tls.rs +++ b/src/analyzer/tls.rs @@ -340,13 +340,13 @@ impl TlsAnalyzer { category: ThreatCategory::Anomaly, verdict: Verdict::Inconclusive, confidence: Confidence::Low, - // Use Debug formatter ({:?}) to escape any control codepoints - // that might survive UTF-8 decoding (e.g. U+0085 NEL); the - // hostname here is valid UTF-8 but printable-script content - // is not guaranteed. + // Raw hostname interpolation — the data layer stores raw + // bytes per ADR 0003. Terminal-safety (escaping control + // codes, etc.) is applied by the terminal reporter at + // render time, not here. summary: format!( "TLS SNI contains non-ASCII characters (RFC 6066 requires \ - A-labels per RFC 5890): {hostname:?}" + A-labels per RFC 5890): {hostname}" ), evidence: vec![format!("hex: {hex}")], mitre_technique: None, @@ -359,14 +359,13 @@ impl TlsAnalyzer { category: ThreatCategory::Anomaly, verdict: Verdict::Inconclusive, confidence: Confidence::Low, - // Use Debug formatter ({:?}) to escape control bytes (e.g. - // ESC 0x1b) that String::from_utf8_lossy preserves but the - // analyst's terminal would interpret as ANSI control - // sequences. Without this an attacker could craft a - // malformed SNI like b"\x1b[31m..." that recolors or - // overwrites the rendered finding line. + // Raw lossy interpolation — the data layer stores raw + // bytes (including any embedded ASCII control codes) per + // ADR 0003. The terminal reporter is responsible for + // escaping these for safe display; JSON output is already + // safe via serde_json's automatic RFC 8259 escaping. summary: format!( - "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy:?}" + "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy}" ), evidence: vec![format!("hex: {hex}")], mitre_technique: None, diff --git a/tests/tls_analyzer_tests.rs b/tests/tls_analyzer_tests.rs index c4408bd..897cc05 100644 --- a/tests/tls_analyzer_tests.rs +++ b/tests/tls_analyzer_tests.rs @@ -394,11 +394,13 @@ fn test_ascii_sni_does_not_emit_non_utf8_finding() { } #[test] -fn test_non_utf8_sni_escapes_control_bytes_in_summary() { - // Security regression: a malformed SNI containing raw ESC (0x1b) plus an - // ANSI CSI sequence must NOT propagate the literal control byte into the - // finding summary, where it would be interpreted by an analyst's terminal - // and could recolor or overwrite the rendered report line. +fn test_non_utf8_sni_preserves_raw_bytes_in_summary() { + // Per ADR 0003: the Finding struct is the data layer — it stores the + // raw post-from_utf8_lossy bytes from the attacker's SNI, including + // any ASCII control codes. Terminal-safety is the reporter's job, not + // the analyzer's. This test enforces that contract: raw ESC must + // survive to the struct; downstream rendering tests (in reporter + // tests) verify the terminal reporter escapes it on display. let mut analyzer = TlsAnalyzer::new(); let fk = test_flow_key(); @@ -414,16 +416,19 @@ fn test_non_utf8_sni_escapes_control_bytes_in_summary() { .find(|f| f.summary.contains("non-UTF-8 bytes")) .expect("expected non-UTF-8 SNI finding"); - // The summary must not contain the raw ESC byte. Debug formatting ({:?}) - // turns 0x1b into the literal escape sequence "\u{1b}" instead. + // The summary MUST contain the raw ESC byte — the analyzer does not + // escape. Forensic preservation is a load-bearing property of the + // data layer (ADR 0003). assert!( - !f.summary.as_bytes().contains(&0x1b), - "summary contains raw ESC byte (terminal injection vector): {:?}", + f.summary.as_bytes().contains(&0x1b), + "summary must preserve raw ESC byte for forensics, got: {:?}", f.summary ); + // And it must NOT contain the Debug-formatted escape form (which + // would indicate a regression to construction-site escaping). assert!( - f.summary.contains("\\u{1b}"), - "summary should contain escaped ESC sequence \\u{{1b}}, got: {}", + !f.summary.contains("\\u{1b}"), + "summary must not contain Debug-formatted escape (regression to construction-site), got: {}", f.summary ); From 7e4f1df0f632097089e912276de073edd47bf1c7 Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 11:33:19 -0400 Subject: [PATCH 10/16] test(reporter): end-to-end contract test for ADR 0003 layering Single Finding flows through the data layer and both reporters. Asserts: 1. Finding struct preserves raw ESC byte (forensic ground truth). 2. Terminal reporter escapes ESC to \u{1b} on display. 3. JSON reporter serializes ESC as \u001b per RFC 8259. 4. JSON round-trips back to the original raw byte (reversibility). Any regression that re-introduces construction-site escaping, removes the terminal reporter's helper, or changes the JSON path will fail this test. --- tests/reporter_tests.rs | 74 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/tests/reporter_tests.rs b/tests/reporter_tests.rs index c636944..9b84eb7 100644 --- a/tests/reporter_tests.rs +++ b/tests/reporter_tests.rs @@ -112,3 +112,77 @@ fn test_terminal_reporter_escapes_esc_bytes_in_summary() { "terminal output should contain escaped form in evidence line, got: {output}" ); } + +#[test] +fn test_output_sanitization_layering_contract() { + // End-to-end contract test for ADR 0003. A single Finding flows through + // the data layer and both reporters; all three assertions must hold: + // 1. The struct itself keeps the raw ESC byte (forensic layer). + // 2. The terminal reporter escapes the ESC byte (terminal display layer). + // 3. The JSON reporter escapes via serde's RFC 8259 \u001b form (JSON layer). + // + // Any future regression that breaks one of these — e.g., re-introducing + // construction-site escaping, removing the terminal reporter's helper, + // or swapping to a JSON crate that doesn't escape control chars — will + // fail this test. + let finding = Finding { + category: ThreatCategory::Anomaly, + verdict: Verdict::Inconclusive, + confidence: Confidence::Low, + summary: "attacker payload: \x1b[31mRED\x1b[0m".into(), + evidence: vec!["ev: \x1b[32mGREEN".into()], + mitre_technique: None, + source_ip: None, + timestamp: None, + }; + + // Layer 1: the struct preserves the raw ESC byte (forensic ground truth). + assert!( + finding.summary.as_bytes().contains(&0x1b), + "Finding.summary must preserve raw ESC for forensics" + ); + assert!( + finding.evidence[0].as_bytes().contains(&0x1b), + "Finding.evidence must preserve raw ESC for forensics" + ); + + // Layer 2: terminal reporter escapes on display. + let terminal_output = TerminalReporter { use_color: false }.render( + &Summary::new(), + std::slice::from_ref(&finding), + &[], + ); + assert!( + !terminal_output.as_bytes().contains(&0x1b), + "terminal reporter must not emit raw ESC bytes, got: {terminal_output:?}" + ); + assert!( + terminal_output.contains("\\u{1b}[31mRED"), + "terminal reporter should emit the escaped summary form, got: {terminal_output}" + ); + assert!( + terminal_output.contains("\\u{1b}[32mGREEN"), + "terminal reporter should emit the escaped evidence form, got: {terminal_output}" + ); + + // Layer 3: JSON reporter escapes via serde's RFC 8259 \u001b form. + let json_output = JsonReporter.render( + &Summary::new(), + std::slice::from_ref(&finding), + &[], + ); + assert!( + !json_output.as_bytes().contains(&0x1b), + "JSON reporter must not emit raw ESC bytes, got: {json_output:?}" + ); + assert!( + json_output.contains("\\u001b"), + "JSON reporter should serialize ESC as \\u001b per RFC 8259, got: {json_output}" + ); + // Round-trip through serde_json::from_str: the deserialized summary + // must match the original raw ESC byte. This proves the JSON escape + // is reversible, which is what downstream tooling relies on. + let parsed: serde_json::Value = serde_json::from_str(&json_output).unwrap(); + let parsed_summary = parsed["findings"][0]["summary"].as_str().unwrap(); + assert_eq!(parsed_summary, finding.summary); +} From 96ed07dc812afb0f3e340039cefe01fdd5c1465a Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 11:34:58 -0400 Subject: [PATCH 11/16] style: cargo fmt --- tests/reporter_tests.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/reporter_tests.rs b/tests/reporter_tests.rs index 9b84eb7..ba1ece2 100644 --- a/tests/reporter_tests.rs +++ b/tests/reporter_tests.rs @@ -166,11 +166,7 @@ fn test_output_sanitization_layering_contract() { ); // Layer 3: JSON reporter escapes via serde's RFC 8259 \u001b form. - let json_output = JsonReporter.render( - &Summary::new(), - std::slice::from_ref(&finding), - &[], - ); + let json_output = JsonReporter.render(&Summary::new(), std::slice::from_ref(&finding), &[]); assert!( !json_output.as_bytes().contains(&0x1b), "JSON reporter must not emit raw ESC bytes, got: {json_output:?}" From 7d2cd3c9282a6d7ae6a4ffd71062d63882de6745 Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 11:55:05 -0400 Subject: [PATCH 12/16] fix(reporter): escape C1 control codepoints (U+0080-U+009F) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The escape_for_terminal helper gated only on char::is_ascii_control(), which matches C0 + DEL but NOT the C1 range (U+0080-U+009F). C1 codepoints like U+009B (CSI) and U+0085 (NEL) can appear in a valid UTF-8 String — they're encoded as two-byte sequences (e.g., 0xC2 0x9B for U+009B) that survive from_utf8_lossy. A terminal in DEC S8C1T mode interprets U+009B as the 8-bit equivalent of ESC[, so an attacker-supplied SNI or HTTP URI containing 0xC2 0x9B could bypass the escape and inject ANSI control sequences. Modern terminals in UTF-8 mode default to NOT interpreting C1 this way (verified via Perplexity for xterm, iTerm2, gnome-terminal, Windows Terminal, VS Code, tmux), but the DEC mode exists and some legacy terminals honor it. Escape C1 as cheap insurance. The ADR's Coverage of the threat model section contained a subtly wrong claim: 'a standalone byte in the 0x80-0x9f range cannot appear in a valid UTF-8 string'. That's correct for a single raw byte but irrelevant — the codepoint U+009B is a valid multi-byte UTF-8 sequence and can absolutely appear in a String. ADR updated to reflect the actual helper behavior and the corrected reasoning. Adds two new unit tests (escapes_c1_nel_and_csi, escapes_c1_range_boundaries) covering U+0080, U+0085, U+009B, U+009F, plus the U+00A0 (NBSP) boundary — which is just past C1 and must pass through unchanged. --- docs/adr/0003-reporting-pipeline-layering.md | 6 +- src/reporter/terminal.rs | 59 ++++++++++++++++---- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/docs/adr/0003-reporting-pipeline-layering.md b/docs/adr/0003-reporting-pipeline-layering.md index d0c287c..4d8c16d 100644 --- a/docs/adr/0003-reporting-pipeline-layering.md +++ b/docs/adr/0003-reporting-pipeline-layering.md @@ -84,7 +84,9 @@ It preserves: The output is always valid UTF-8 and contains no bytes that a modern terminal will interpret as control sequences. -C1 control bytes (`0x80`–`0x9f`) are not a coverage gap. `Finding.summary` and `Finding.evidence` are `String` (guaranteed valid UTF-8) populated via `String::from_utf8_lossy` (which replaces invalid bytes with `U+FFFD`). A standalone byte in the C1 range cannot appear in a valid UTF-8 string, so C0 + DEL coverage is complete. +C1 control codepoints (U+0080–U+009F) are also dangerous: U+009B (CSI) is the 8-bit equivalent of ESC[ and is interpreted as a Control Sequence Introducer by terminals in DEC S8C1T mode, and U+0085 (NEL) has similar semantics. **These codepoints CAN appear in a valid UTF-8 `String`**, encoded as two-byte sequences (e.g., `0xC2 0x9B` for U+009B). `String::from_utf8_lossy` passes such sequences through unchanged — they are valid UTF-8. An earlier draft of this ADR claimed that "a standalone byte in the 0x80–0x9f range cannot appear in a valid UTF-8 string"; that's correct for a single raw byte but misleading for the codepoint, which is the relevant question for a `String` produced by `from_utf8_lossy`. + +The helper therefore escapes both C0 + DEL (via `char::is_ascii_control()`) AND the C1 range `U+0080..=U+009F` (via an explicit range check), plus backslash. Modern terminals in UTF-8 mode default to NOT interpreting C1 codepoints as controls (xterm, iTerm2, gnome-terminal, Windows Terminal, VS Code, tmux all decode UTF-8 first), so the practical threat is narrow; but S8C1T exists, can be enabled, and some legacy terminals honor it. Escaping C1 is cheap insurance and closes the gap the earlier reasoning overlooked. ### Other formatting concerns (same principle, deferred scope) @@ -218,4 +220,4 @@ This decision was validated through targeted Perplexity queries on 2026-04-08: - **Output encoding placement:** OWASP guidance (XSS prevention cheat sheet, CWE-117 log injection) recommends encoding at display time, not at storage time. Encoding at construction creates premature commitment to one output context and breaks others. Confirmed. - **`serde_json` control byte handling:** `serde_json` automatically escapes control bytes (including ESC `0x1b`) as `\u001b`, per RFC 8259 §7. JSON output is safe with no analyzer code. Confirmed. - **Escape primitive selection:** An initial Perplexity answer claimed `str::escape_default` preserves multi-byte UTF-8. A follow-up empirical check (`rustc`-compiled program, 2026-04-09) falsified this: `str::escape_default` internally routes every character through `char::escape_default`, which escapes *all* non-ASCII Unicode as `\u{...}`. A custom helper iterating `chars()` and gating `escape_default` on `is_ascii_control() || '\\'` was then verified empirically against the same test inputs (ESC, BEL, DEL, backslash, Cyrillic, emoji, mixed content, tabs/newlines/CR) and produced the correct output: control bytes escaped as `\u{}`, backslash doubled, Cyrillic and emoji preserved byte-for-byte. -- **C1 control byte risk:** Standalone C1 bytes (`0x80`–`0x9f`) cannot appear in a valid UTF-8 `String`. Since `Finding.summary` is `String` and analyzers populate it via `String::from_utf8_lossy`, C0 + DEL coverage is complete. +- **C1 control codepoint risk:** C1 codepoints (U+0080–U+009F) CAN appear as valid UTF-8 in a `String` — encoded as two-byte sequences (e.g., 0xC2 0x9B for U+009B, the 8-bit CSI). The earlier claim that "a standalone byte in the 0x80–0x9f range cannot appear" was correct for a single raw byte but conflates the distinction between byte and codepoint. The helper now explicitly escapes the C1 range via a range check, in addition to C0 + DEL via `char::is_ascii_control()`. Empirical verification (2026-04-09) confirmed that a Cyrillic SNI containing 0xC2 0x9B bytes (U+009B) produces a String whose char iteration yields U+009B unescaped by the old predicate; the fix adds a range check to catch it. diff --git a/src/reporter/terminal.rs b/src/reporter/terminal.rs index 3d21c8e..0cb045f 100644 --- a/src/reporter/terminal.rs +++ b/src/reporter/terminal.rs @@ -5,22 +5,35 @@ use crate::findings::{Confidence, Finding, Verdict}; use crate::reporter::Reporter; use crate::summary::Summary; -/// Escape control bytes (C0 + DEL + backslash) for safe terminal display. +/// Escape control bytes (C0 + DEL + C1 + backslash) for safe terminal display. /// /// Iterates the input string's characters and applies `char::escape_default` -/// only when the character matches `char::is_ascii_control()` or is a -/// backslash. All other characters — printable ASCII and valid non-ASCII -/// Unicode (Cyrillic, CJK, emoji) — pass through unchanged. +/// when the character matches `char::is_ascii_control()` (C0 + DEL), falls in +/// the C1 range `U+0080..=U+009F`, or is a backslash. All other characters — +/// printable ASCII and valid non-ASCII Unicode (Cyrillic, CJK, emoji) — pass +/// through unchanged. /// -/// Why not `str::escape_default`? It routes *every* character through -/// `char::escape_default`, which escapes non-ASCII as `\u{...}` and -/// would mangle a Cyrillic hostname like `пример.рф` into -/// `\u{43f}\u{440}...`. See ADR 0003 (`docs/adr/0003-reporting-pipeline-layering.md`) -/// for the layering rationale and the empirical verification. +/// **Why C1?** Codepoints U+0080–U+009F (C1 controls like NEL U+0085 and CSI +/// U+009B) can be encoded in valid UTF-8 as two-byte sequences (e.g., U+009B +/// as `0xC2 0x9B`) and survive `String::from_utf8_lossy`. Most modern +/// terminals in UTF-8 mode do NOT interpret these as controls by default, but +/// the DEC S8C1T mode and some legacy terminals can treat U+009B as an 8-bit +/// equivalent of ESC[. Escaping them closes a narrow but real vector. +/// +/// **Why not `str::escape_default`?** It routes *every* character through +/// `char::escape_default`, which escapes non-ASCII as `\u{...}` and would +/// mangle a Cyrillic hostname like `пример.рф` into `\u{43f}\u{440}...`. +/// See ADR 0003 (`docs/adr/0003-reporting-pipeline-layering.md`) for the +/// layering rationale and the empirical verification. fn escape_for_terminal(s: &str) -> String { let mut out = String::with_capacity(s.len()); for c in s.chars() { - if c.is_ascii_control() || c == '\\' { + // Escape C0 (0x00-0x1f), DEL (0x7f), C1 (U+0080-U+009F), and backslash. + // C1 codepoints like U+0085 (NEL) and U+009B (CSI — 8-bit equivalent of + // ESC[) are valid multi-byte UTF-8 in a `String` but can be interpreted + // as controls by terminals in 8-bit C1 mode (DEC S8C1T). Escaping them + // is cheap insurance. + if c.is_ascii_control() || ('\u{80}'..='\u{9f}').contains(&c) || c == '\\' { for e in c.escape_default() { out.push(e); } @@ -211,4 +224,30 @@ mod tests { fn empty_string_is_empty() { assert_eq!(escape_for_terminal(""), ""); } + + #[test] + fn escapes_c1_nel_and_csi() { + // U+0085 (NEL, Next Line) and U+009B (CSI, 8-bit Control Sequence + // Introducer). Both are valid multi-byte UTF-8 and survive + // String::from_utf8_lossy — must be escaped to avoid 8-bit terminal + // control interpretation. + assert_eq!( + escape_for_terminal("line1\u{85}line2"), + "line1\\u{85}line2" + ); + assert_eq!( + escape_for_terminal("before\u{9b}31mafter"), + "before\\u{9b}31mafter" + ); + } + + #[test] + fn escapes_c1_range_boundaries() { + // U+0080 (start of C1) and U+009F (end of C1) must both escape. + // U+00A0 (NBSP, just past C1) must pass through unchanged — it's a + // legitimate printable whitespace character, not a control code. + assert_eq!(escape_for_terminal("\u{80}"), "\\u{80}"); + assert_eq!(escape_for_terminal("\u{9f}"), "\\u{9f}"); + assert_eq!(escape_for_terminal("\u{a0}"), "\u{a0}"); + } } From b9cc82051e17e248c944c13f27ac984f1cf6adf3 Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 11:56:19 -0400 Subject: [PATCH 13/16] test: augment coverage per pr-test-analyzer review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three additions from the multi-agent PR review triage: 1. test_json_reporter_preserves_cyrillic_as_readable_unicode — pins the ADR 0003 user-visible claim that JSON consumers now see raw Cyrillic hostnames instead of the PR #49 hex-mangled form. Includes a round-trip serde_json::from_str check proving the Cyrillic survives the full trip. 2. test_terminal_reporter_escapes_esc_bytes_in_summary augmented with Cyrillic content in the summary payload. Previously only covered ASCII + ESC — now proves the render loop preserves Cyrillic while still escaping control bytes (catches accidental double-escape bugs in the for-findings loop). 3. test_cyrillic_sni_emits_non_ascii_finding augmented with raw- Cyrillic-substring assertions on Finding.summary. This directly guards the {hostname:?} → {hostname} rollback on the NonAsciiUtf8 match arm in src/analyzer/tls.rs, which was otherwise only structurally similar (and untested) compared to the non-UTF-8 branch that has the inverted dedicated test. --- tests/reporter_tests.rs | 51 ++++++++++++++++++++++++++++++++++++- tests/tls_analyzer_tests.rs | 21 +++++++++++++++ 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/tests/reporter_tests.rs b/tests/reporter_tests.rs index ba1ece2..b7dcc04 100644 --- a/tests/reporter_tests.rs +++ b/tests/reporter_tests.rs @@ -90,7 +90,7 @@ fn test_terminal_reporter_escapes_esc_bytes_in_summary() { category: ThreatCategory::Anomaly, verdict: Verdict::Inconclusive, confidence: Confidence::Low, - summary: "attacker payload: \x1b[31mRED\x1b[0m".into(), + summary: "attacker payload: пример\x1b[31mRED\x1b[0m".into(), evidence: vec!["raw evidence: \x1b[32mGREEN".into()], mitre_technique: None, source_ip: None, @@ -111,6 +111,10 @@ fn test_terminal_reporter_escapes_esc_bytes_in_summary() { output.contains("\\u{1b}[32mGREEN"), "terminal output should contain escaped form in evidence line, got: {output}" ); + assert!( + output.contains("пример"), + "terminal output should preserve Cyrillic in the summary, got: {output}" + ); } #[test] @@ -182,3 +186,48 @@ fn test_output_sanitization_layering_contract() { let parsed_summary = parsed["findings"][0]["summary"].as_str().unwrap(); assert_eq!(parsed_summary, finding.summary); } + +#[test] +fn test_json_reporter_preserves_cyrillic_as_readable_unicode() { + // ADR 0003's primary user-visible behavioral claim: JSON consumers see + // raw Cyrillic (and other non-ASCII Unicode) hostnames instead of the + // hex-mangled forms PR #49 produced (`\u{43f}\u{440}...`). This test + // pins that contract: any future regression that re-introduces + // construction-site Debug-formatting or swaps to a non-serde JSON + // writer that pre-escapes non-ASCII will fail here. + let finding = Finding { + category: ThreatCategory::Anomaly, + verdict: Verdict::Inconclusive, + confidence: Confidence::Low, + summary: "TLS SNI non-ASCII: пример.рф".into(), + evidence: vec!["hex: d0bfd180d0b8d0bcd0b5d1802ed180d184".into()], + mitre_technique: None, + source_ip: None, + timestamp: None, + }; + + let json_output = JsonReporter.render( + &Summary::new(), + std::slice::from_ref(&finding), + &[], + ); + + // Readable Cyrillic must appear in the JSON output — serde_json preserves + // non-ASCII Unicode by default and only escapes control characters. + assert!( + json_output.contains("пример.рф"), + "JSON output must contain readable Cyrillic, got: {json_output}" + ); + // And the old hex-escaped form must NOT appear — that would indicate a + // regression to construction-site escaping. + assert!( + !json_output.contains("\\u{43f}"), + "JSON output must not contain Debug-formatted Cyrillic escape (regression to construction-site), got: {json_output}" + ); + + // Round-trip: deserialize the JSON and confirm the summary matches + // byte-for-byte — proves the Cyrillic survives the full trip. + let parsed: serde_json::Value = serde_json::from_str(&json_output).unwrap(); + let parsed_summary = parsed["findings"][0]["summary"].as_str().unwrap(); + assert_eq!(parsed_summary, finding.summary); +} diff --git a/tests/tls_analyzer_tests.rs b/tests/tls_analyzer_tests.rs index 897cc05..7947dea 100644 --- a/tests/tls_analyzer_tests.rs +++ b/tests/tls_analyzer_tests.rs @@ -614,6 +614,27 @@ fn test_cyrillic_sni_emits_non_ascii_finding() { *analyzer.sni_counts().get("пример.example").unwrap_or(&0), 1 ); + + // Per ADR 0003: the data layer stores raw bytes (not Debug-escaped). + // Find the non-ASCII finding and assert it contains the raw Cyrillic + // hostname — this directly guards the {hostname:?} → {hostname} rollback + // on the NonAsciiUtf8 match arm (src/analyzer/tls.rs), which was + // otherwise only covered structurally by the NonUtf8 branch test. + let f = analyzer + .findings() + .into_iter() + .find(|f| f.summary.contains("non-ASCII characters")) + .expect("expected non-ASCII finding"); + assert!( + f.summary.contains("пример.example"), + "summary must contain raw Cyrillic hostname for forensic preservation, got: {}", + f.summary + ); + assert!( + !f.summary.contains("\\u{43f}"), + "summary must not contain Debug-formatted Cyrillic escape (regression to construction-site), got: {}", + f.summary + ); } #[test] From 5bdecc73f201ada116c685dc077ec1c5944a1d6d Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 11:58:27 -0400 Subject: [PATCH 14/16] style: cargo fmt --- src/reporter/terminal.rs | 5 +---- tests/reporter_tests.rs | 6 +----- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/reporter/terminal.rs b/src/reporter/terminal.rs index 0cb045f..1801d58 100644 --- a/src/reporter/terminal.rs +++ b/src/reporter/terminal.rs @@ -231,10 +231,7 @@ mod tests { // Introducer). Both are valid multi-byte UTF-8 and survive // String::from_utf8_lossy — must be escaped to avoid 8-bit terminal // control interpretation. - assert_eq!( - escape_for_terminal("line1\u{85}line2"), - "line1\\u{85}line2" - ); + assert_eq!(escape_for_terminal("line1\u{85}line2"), "line1\\u{85}line2"); assert_eq!( escape_for_terminal("before\u{9b}31mafter"), "before\\u{9b}31mafter" diff --git a/tests/reporter_tests.rs b/tests/reporter_tests.rs index b7dcc04..1307a99 100644 --- a/tests/reporter_tests.rs +++ b/tests/reporter_tests.rs @@ -206,11 +206,7 @@ fn test_json_reporter_preserves_cyrillic_as_readable_unicode() { timestamp: None, }; - let json_output = JsonReporter.render( - &Summary::new(), - std::slice::from_ref(&finding), - &[], - ); + let json_output = JsonReporter.render(&Summary::new(), std::slice::from_ref(&finding), &[]); // Readable Cyrillic must appear in the JSON output — serde_json preserves // non-ASCII Unicode by default and only escapes control characters. From ab9df8a6eeeeea179c7ce6929396444fd3514f8d Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 12:11:37 -0400 Subject: [PATCH 15/16] docs(adr,plan): propagate C1 predicate update through ADR and plan The C1 control codepoint fix in commit 7d2cd3c updated the helper's predicate to include the U+0080..=U+009F range, but only patched one section of the ADR ('Coverage of the threat model') and left four other places describing the pre-C1 predicate. Copilot's review of #57 caught all five mismatches (four in ADR 0003, one in the plan). Fixes: - ADR 0003 'Immediate scope' section: code sample updated to include the C1 range check, and the 'helper escapes' bullet list gains an explicit C1 bullet pointing to the later threat-model paragraph. - ADR 0003 'Rationale' bullet: 'Escapes exactly the bytes that constitute the threat (C0 + DEL + backslash)' -> 'C0 + C1 + DEL + backslash'. Also bumps the line count estimate (~8 -> ~15) to reflect the helper's actual size after the C1 addition. - ADR 0003 'Consequences' table row for src/reporter/terminal.rs: description now mentions C1 controls as part of the required predicate. - Plan Architecture paragraph: clarifies that the helper gates on ASCII control + C1 control + backslash. Adds a forward reference to commit 7d2cd3c so future plan-followers know the C1 range was added post-plan and Task 3's code samples reflect the pre-C1 predicate for historical accuracy. - Plan 'File Structure' clarification: 'No files are created' was technically inaccurate since this plan doc and ADR 0003 are both new files; reworded to 'No new code or runtime files are created' with an explicit note that the two docs are prerequisites committed before the implementation tasks begin. Per Copilot on #57. --- docs/adr/0003-reporting-pipeline-layering.md | 9 +++++---- .../plans/2026-04-09-reporting-pipeline-layering.md | 6 +++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/adr/0003-reporting-pipeline-layering.md b/docs/adr/0003-reporting-pipeline-layering.md index 4d8c16d..554da00 100644 --- a/docs/adr/0003-reporting-pipeline-layering.md +++ b/docs/adr/0003-reporting-pipeline-layering.md @@ -53,13 +53,13 @@ The data layer is raw and forensic; the display layer formats for its medium and ### Immediate scope: terminal-safe escaping -The first concrete consequence of the layering rule — and the motivating problem — is terminal-safe escaping. The terminal reporter defines a private `escape_for_terminal` helper that iterates the input `str`'s characters and applies `char::escape_default()` only to characters matching `char::is_ascii_control()` or the backslash. All other characters are passed through unchanged: +The first concrete consequence of the layering rule — and the motivating problem — is terminal-safe escaping. The terminal reporter defines a private `escape_for_terminal` helper that iterates the input `str`'s characters and applies `char::escape_default()` only to characters matching `char::is_ascii_control()`, the C1 control range `U+0080..=U+009F`, or the backslash. All other characters are passed through unchanged: ```rust fn escape_for_terminal(s: &str) -> String { let mut out = String::with_capacity(s.len()); for c in s.chars() { - if c.is_ascii_control() || c == '\\' { + if c.is_ascii_control() || ('\u{80}'..='\u{9f}').contains(&c) || c == '\\' { for e in c.escape_default() { out.push(e); } @@ -74,6 +74,7 @@ fn escape_for_terminal(s: &str) -> String { The helper escapes: - C0 control bytes (`0x00`–`0x1f`, including ESC `0x1b`, BEL `0x07`, LF `0x0a`, CR `0x0d`) — rendered as `\u{1b}`, `\n`, `\t`, etc. via `char::escape_default` - DEL (`0x7f`) — rendered as `\u{7f}` +- C1 control codepoints (`U+0080`–`U+009F`, including NEL `U+0085` and CSI `U+009B`) — rendered as `\u{85}`, `\u{9b}`, etc. via `char::escape_default`. See "C1 control codepoints" below for the rationale. - backslash (`\\`) It preserves: @@ -167,7 +168,7 @@ Add a `Finding::display_summary()` method that returns the escaped form, and hav - **Preserves forensic data.** The raw bytes are kept in the `Finding` struct, available for JSON export, future reporters, and downstream tooling. An analyst exporting to JSON sees the actual SNI bytes (with serde's standard JSON escaping); only the terminal reporter applies terminal-safe escaping. - **Single point of enforcement per medium.** Future analyzers don't need to remember any rule. Adding a new analyzer requires zero terminal-safety awareness. A new reporter (CSV, HTML, etc.) gets one place to apply its own escaping. - **Extensible.** When a future need appears — localization, HTML rendering, different truncation per medium — the pipeline already has the boundary in place. The work is in the display layer, not in every analyzer. -- **A small custom helper is the right primitive.** Built on stdlib `char::escape_default` + `char::is_ascii_control`, ~8 lines, no dependency. Gates the escape on control-ness so valid Unicode (Cyrillic, CJK, emoji) passes through unchanged. Escapes exactly the bytes that constitute the threat (C0 + DEL + backslash). The stdlib `str::escape_default` method was considered and rejected (it mangles all non-ASCII). +- **A small custom helper is the right primitive.** Built on stdlib `char::escape_default` + `char::is_ascii_control` plus an explicit C1 range check, ~15 lines, no dependency. Gates the escape on control-ness so valid Unicode (Cyrillic, CJK, emoji) passes through unchanged. Escapes exactly the characters that constitute the threat (C0 + C1 + DEL + backslash). The stdlib `str::escape_default` method was considered and rejected (it mangles all non-ASCII). - **Validated.** OWASP encoding guidance and RFC 8259 + serde_json behavior confirmed via Perplexity 2026-04-08. The escape primitive was re-verified empirically (`rustc`-compiled program, 2026-04-09) after an initial Perplexity answer about `str::escape_default` turned out to be wrong — see Validation. ## Consequences @@ -176,7 +177,7 @@ Add a `Finding::display_summary()` method that returns the escaped form, and hav | File | Change | |------|--------| -| `src/reporter/terminal.rs` | Add a private `escape_for_terminal(s: &str) -> String` helper at file scope that iterates `s.chars()`, applies `char::escape_default()` for chars matching `char::is_ascii_control()` or backslash, and passes all other chars through. Apply it to `f.summary` (line ~65, where `f.summary` is interpolated into the line `format!`) and to each `ev` in `f.evidence` (line ~81) before writing to the output buffer. The helper is private to the terminal reporter — other reporters that need it (e.g., a future CSV reporter) implement at their own boundary, since each output medium has different escaping rules. | +| `src/reporter/terminal.rs` | Add a private `escape_for_terminal(s: &str) -> String` helper at file scope that iterates `s.chars()`, applies `char::escape_default()` for chars that are ASCII controls (C0 + DEL), C1 controls (`U+0080..=U+009F`), or backslash, and passes all other chars through. Apply it to `f.summary` (line ~65, where `f.summary` is interpolated into the line `format!`) and to each `ev` in `f.evidence` (line ~81) before writing to the output buffer. The helper is private to the terminal reporter — other reporters that need it (e.g., a future CSV reporter) implement at their own boundary, since each output medium has different escaping rules. | | `src/analyzer/tls.rs` | Replace `{hostname:?}` (line ~349) and `{lossy:?}` (line ~369) with `{hostname}` / `{lossy}`. Update the inline doc comments that explain *why* the Debug formatter was used; replace them with a pointer to this ADR. | | `src/findings.rs` | Add a `///` doc comment on `impl Display for Finding` noting that it produces RAW text and is NOT safe for terminal display; consumers wanting safe display should go through the terminal reporter. | | `src/analyzer/http.rs` | **No changes required.** Existing raw interpolations are now correct under the new policy. | diff --git a/docs/superpowers/plans/2026-04-09-reporting-pipeline-layering.md b/docs/superpowers/plans/2026-04-09-reporting-pipeline-layering.md index 8e76d45..17e29d9 100644 --- a/docs/superpowers/plans/2026-04-09-reporting-pipeline-layering.md +++ b/docs/superpowers/plans/2026-04-09-reporting-pipeline-layering.md @@ -4,9 +4,9 @@ **Goal:** Move finding output sanitization from the analyzer construction site to the terminal reporter, per ADR 0003 (`docs/adr/0003-reporting-pipeline-layering.md`), fixing terminal injection in HTTP findings and restoring forensic data preservation in the `Finding` struct. -**Architecture:** The `Finding` struct stores raw post-`from_utf8_lossy` bytes; the terminal reporter escapes via a custom helper (~8 lines, built on stdlib `char::escape_default` gated by `char::is_ascii_control`) immediately before writing. The JSON reporter is already safe via `serde_json`'s automatic RFC 8259 control-byte escaping. No new dependencies, no new types. +**Architecture:** The `Finding` struct stores raw post-`from_utf8_lossy` bytes; the terminal reporter escapes via a custom helper (~15 lines, built on stdlib `char::escape_default` gated by ASCII control / C1 control / backslash detection) immediately before writing. The JSON reporter is already safe via `serde_json`'s automatic RFC 8259 control-byte escaping. No new dependencies, no new types. -**Important mechanism note:** `str::escape_default` was the initial choice but was rejected during plan self-review after empirical verification showed it escapes *all* non-ASCII characters (Cyrillic, emoji) — same UX problem as the PR #49 Debug formatter. See ADR 0003's updated Mechanism section and commit `45cf649`. The custom helper iterates `s.chars()` and only escapes when `char::is_ascii_control() || c == '\\'`. +**Important mechanism note:** `str::escape_default` was the initial choice but was rejected during plan self-review after empirical verification showed it escapes *all* non-ASCII characters (Cyrillic, emoji) — same UX problem as the PR #49 Debug formatter. See ADR 0003's updated Mechanism section and commit `45cf649`. The custom helper iterates `s.chars()` and only escapes when the character is an ASCII control (C0 + DEL), a C1 control (`U+0080..=U+009F`), or `'\\'`. The C1 range was added in commit `7d2cd3c` after the initial round of plan execution; the flow described in Task 3 below matches the pre-C1 predicate for historical accuracy — see commit `7d2cd3c` for the final predicate. **Tech Stack:** Rust 2024 edition, stdlib only for the escape primitive, `serde_json` for JSON output (already in tree), `owo_colors` for terminal styling (already in tree). @@ -24,7 +24,7 @@ | `tests/tls_analyzer_tests.rs` | TLS tests | Update `test_non_utf8_sni_escapes_control_bytes_in_summary` to assert the RAW ESC byte is now preserved in `f.summary` (contract inversion) | | `tests/reporter_tests.rs` | Reporter tests | Add end-to-end regression tests: terminal reporter escapes ESC bytes; JSON reporter preserves via serde's `\u001b`; `Finding.summary` keeps the raw byte | -No files are created. All changes are edits or test additions inside existing files. +No new code or runtime files are created as part of this implementation work. All implementation changes are edits or test additions inside existing files. (This plan document and `docs/adr/0003-reporting-pipeline-layering.md` are themselves new files committed as prerequisites before the implementation tasks begin.) --- From bee34596773f1453cd728131159fb4e65ee4b57f Mon Sep 17 00:00:00 2001 From: Zious Date: Thu, 9 Apr 2026 12:23:55 -0400 Subject: [PATCH 16/16] fix(reporter): escape analyzer summary detail values at terminal render MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TerminalReporter::render's ANALYZER SUMMARIES block was printing detail values (serde_json::Value) via {val} direct interpolation, relying on serde_json's Display impl to handle escaping. That works for C0 + DEL (serde_json escapes them per RFC 8259 §7) but NOT for C1 codepoints (U+0080-U+009F) — serde_json preserves those as raw UTF-8 two-byte sequences per RFC 8259, which is valid but leaves a C1 terminal injection vector open. Empirically verified: a serde_json::Value::Array containing "before\u{9b}31mafter" renders via Display to a JSON string that contains the raw 0xC2 0x9B bytes (U+009B CSI). The helper fix in commit 7d2cd3c closed this gap in the findings block but didn't reach the analyzer summaries block. Detail values include attacker-controlled strings from HTTP top_hosts / top_user_agents / recent_uris and TLS top_snis — these all come through String::from_utf8_lossy in the analyzers, so embedded C0 + C1 bytes can appear in the serde_json::Value leaves exposed to the terminal. Fix: apply escape_for_terminal to val.to_string() before writing to the output buffer, matching the findings-block pattern. The JSON rendering path now has a defense-in-depth double layer: serde_json's RFC 8259 escape for C0 (which produces \u001b that escape_for_terminal passes through as printable ASCII), plus escape_for_terminal's C1 gate catching any raw U+0080-U+009F bytes in the JSON string. Adds test_terminal_reporter_escapes_control_bytes_in_analyzer_summaries covering the C0 and C1 cases through the full ANALYZER SUMMARIES render path, plus a Cyrillic preservation assertion. Also fixes the plan's Task 2 doc-comment snippet which still mentioned str::escape_default — replaces with the correct escape_for_terminal reference (Copilot round 2 comment). Per Copilot round 2 review on #57. --- .../2026-04-09-reporting-pipeline-layering.md | 8 +-- src/reporter/terminal.rs | 10 +++- tests/reporter_tests.rs | 49 +++++++++++++++++++ 3 files changed, 63 insertions(+), 4 deletions(-) diff --git a/docs/superpowers/plans/2026-04-09-reporting-pipeline-layering.md b/docs/superpowers/plans/2026-04-09-reporting-pipeline-layering.md index 17e29d9..aeb293a 100644 --- a/docs/superpowers/plans/2026-04-09-reporting-pipeline-layering.md +++ b/docs/superpowers/plans/2026-04-09-reporting-pipeline-layering.md @@ -76,9 +76,11 @@ Insert these lines directly above `impl fmt::Display for Finding {`: /// `summary` field may contain attacker-controlled bytes from packet payloads /// (including ASCII control codes like ESC `0x1b`) that a terminal would /// interpret as control sequences. For safe terminal rendering, use the -/// terminal reporter (`src/reporter/terminal.rs`), which applies -/// `str::escape_default` to every `summary` and `evidence` entry before -/// writing to the output buffer. See ADR 0003. +/// terminal reporter (`src/reporter/terminal.rs`), which applies its +/// `escape_for_terminal` helper to every `summary` and `evidence` entry +/// before writing to the output buffer. See ADR 0003 +/// (`docs/adr/0003-reporting-pipeline-layering.md`) for the full layering +/// principle. ``` - [ ] **Step 3: Run existing `test_finding_display`** diff --git a/src/reporter/terminal.rs b/src/reporter/terminal.rs index 1801d58..06664e6 100644 --- a/src/reporter/terminal.rs +++ b/src/reporter/terminal.rs @@ -139,7 +139,15 @@ impl Reporter for TerminalReporter { asummary.packets_analyzed )); for (key, val) in &asummary.detail { - out.push_str(&format!(" {key}: {val}\n")); + // Per ADR 0003: analyzer summary detail values can contain + // attacker-controlled bytes (e.g., top_hosts, top_snis, + // recent_uris from HTTP/TLS). serde_json's Display impl + // escapes C0 + DEL per RFC 8259 but passes C1 codepoints + // (U+0080-U+009F) through as raw UTF-8 — so we still need + // to run the JSON rendering through escape_for_terminal to + // close the C1 gap that U+009B (CSI) exploits. + let escaped_val = escape_for_terminal(&val.to_string()); + out.push_str(&format!(" {key}: {escaped_val}\n")); } out.push('\n'); } diff --git a/tests/reporter_tests.rs b/tests/reporter_tests.rs index 1307a99..db2710f 100644 --- a/tests/reporter_tests.rs +++ b/tests/reporter_tests.rs @@ -227,3 +227,52 @@ fn test_json_reporter_preserves_cyrillic_as_readable_unicode() { let parsed_summary = parsed["findings"][0]["summary"].as_str().unwrap(); assert_eq!(parsed_summary, finding.summary); } + +#[test] +fn test_terminal_reporter_escapes_control_bytes_in_analyzer_summaries() { + // Regression: analyzer_summaries detail values can contain + // attacker-controlled strings (HTTP top_hosts, TLS top_snis, etc.). + // serde_json::Value's Display impl escapes C0 + DEL per RFC 8259 but + // passes C1 codepoints (U+0080-U+009F) through as raw UTF-8 — which + // is a terminal injection vector on the analyzer summary rendering + // path. Per ADR 0003, the terminal reporter must escape at the + // display boundary regardless of what the underlying serializer does. + use wirerust::analyzer::AnalysisSummary; + + let mut detail = std::collections::HashMap::new(); + detail.insert( + "top_snis".to_string(), + serde_json::json!([ + "\u{1b}[31mREDC0\u{1b}[0m", // C0 ESC injection + "before\u{9b}31mC1after", // C1 CSI injection (valid UTF-8, bypasses serde's RFC 8259 escape) + "пример.рф", // legitimate Cyrillic — must survive readably + ]), + ); + let analyzer_summary = AnalysisSummary { + analyzer_name: "TLS".to_string(), + packets_analyzed: 3, + detail, + }; + + let output = TerminalReporter { use_color: false }.render( + &Summary::new(), + &[], + std::slice::from_ref(&analyzer_summary), + ); + + // No raw C0 ESC bytes in the output. + assert!( + !output.as_bytes().contains(&0x1b), + "terminal output must not contain raw ESC (0x1b) bytes in analyzer summary section, got: {output:?}" + ); + // No raw C1 bytes (0xC2 0x9B encodes U+009B in UTF-8). + assert!( + !output.as_bytes().windows(2).any(|w| w == [0xc2, 0x9b]), + "terminal output must not contain raw C1 UTF-8 bytes (0xC2 0x9B) in analyzer summary section, got: {output:?}" + ); + // Cyrillic must still be readable — the escape must not mangle valid Unicode. + assert!( + output.contains("пример.рф"), + "analyzer summary section must preserve legitimate Cyrillic, got: {output}" + ); +}