From 7b3d31ec53eccf926fe81b4fb43aaf935ad0634c Mon Sep 17 00:00:00 2001 From: Zious Date: Wed, 8 Apr 2026 18:05:58 -0500 Subject: [PATCH 1/2] feat: flag valid-UTF-8 but non-ASCII SNI as RFC 6066 violation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #51. PR #49 surfaced non-UTF-8 SNI bytes as a finding but deliberately left a related RFC 6066 §3 violation unflagged: SNI hostnames that are valid UTF-8 but contain non-ASCII codepoints, e.g. raw U-labels like "café.example" or "пример.example". The spec requires HostName to be ASCII (with internationalized names sent as A-labels per RFC 5890 Punycode `xn--…` form), so a non-ASCII byte on the wire is a real protocol violation. Real-world false-positive risk ------------------------------ Validated via Perplexity that all major TLS clients auto-Punycode internationalized hostnames before sending the SNI: - rustls — rejects raw UTF-8 at the ServerName API level, requires Punycode upfront - Chrome / BoringSSL — always Punycode (per RFC 3492 + 6066) - Firefox / NSS — always Punycode (URL bar may display Unicode for user-friendliness, but the wire is always Punycode) - curl / libcurl — auto-converts via libidn2 before calling OpenSSL - OpenSSL raw — passes verbatim, but expects ASCII from the caller; applications using OpenSSL directly without IDNA prep are the only legitimate path to a raw U-label SNI on the wire So the false-positive surface is small: a buggy custom client, an attacker tool, or an OpenSSL-direct app that forgot to Punycode. Severity is `Anomaly` / `Inconclusive` / `Low` — same rationale as the non-UTF-8 finding. SniValue enum ------------- Renamed `Utf8(String)` → `Ascii(String)` (the old name was misleading since ASCII is also valid UTF-8) and added a new variant: enum SniValue { Ascii(String), // RFC compliant NonAsciiUtf8 { hostname: String, hex: String }, // valid UTF-8, non-ASCII NonUtf8 { lossy: String, hex: String }, // invalid UTF-8 (existing) } `extract_sni` now uses `s.is_ascii()` to split the UTF-8 case into `Ascii` and `NonAsciiUtf8`. The hostname for `NonAsciiUtf8` is the decoded String (always valid UTF-8 by definition); `hex` is the lossless representation for forensic evidence. `handle_client_hello` keys `sni_counts` on the hostname directly for both Ascii and NonAsciiUtf8 (no collision risk for valid UTF-8) and keeps the existing `` tagged form for NonUtf8. The finding summary uses `{hostname:?}` Debug formatter to escape any control codepoints that might survive UTF-8 decoding (e.g. U+0085 NEL). Tests ----- - Flipped `test_valid_utf8_non_ascii_sni_currently_not_flagged` to `test_valid_utf8_non_ascii_sni_emits_finding` — asserts the new finding fires for "café.example" with the right severity, RFC reference in summary, and hex evidence (`636166c3a92e6578616d706c65`). - Added `test_cyrillic_sni_emits_non_ascii_finding` for "пример.example". - Added `test_emoji_sni_emits_non_ascii_finding` for "🦀.example" (4-byte UTF-8 codepoint, all bytes ≥ 0x80). - Added `test_punycode_a_label_does_not_emit_non_ascii_finding` for "xn--caf-dma.example" — pins that the RFC-compliant Punycode form is NOT flagged. --- src/analyzer/tls.rs | 96 +++++++++++++++++------- tests/tls_analyzer_tests.rs | 142 ++++++++++++++++++++++++++++++++---- 2 files changed, 200 insertions(+), 38 deletions(-) diff --git a/src/analyzer/tls.rs b/src/analyzer/tls.rs index b32fcb2..a1193f4 100644 --- a/src/analyzer/tls.rs +++ b/src/analyzer/tls.rs @@ -148,12 +148,28 @@ fn compute_ja3s(version: u16, cipher: TlsCipherSuiteID, extensions: &[TlsExtensi /// Result of decoding a TLS SNI hostname. /// /// RFC 6066 §3 specifies that the `HostName` field is "represented as a byte -/// string using ASCII encoding". Internationalized names use A-labels (RFC 5890), -/// which are also ASCII. A `NonUtf8` SNI is therefore a protocol violation — -/// usually a buggy TLS client, but worth surfacing for forensic review. +/// string using ASCII encoding". Internationalized names use A-labels (RFC 5890, +/// `xn--…` Punycode form), which are also ASCII. Two RFC violations are tracked +/// separately: +/// +/// - `NonAsciiUtf8` — bytes decode as valid UTF-8 but contain non-ASCII codepoints +/// (e.g. a raw U-label "café.example"). RFC 6066 says these MUST be Punycode-encoded +/// before transmission. Major TLS clients (rustls, Chrome/BoringSSL, Firefox/NSS, +/// curl/libcurl) all do this conversion automatically — so a non-ASCII SNI on the +/// wire indicates either a buggy custom client (often raw OpenSSL without IDNA prep) +/// or an attacker tool. +/// - `NonUtf8` — bytes can't decode as UTF-8 at all. Strictly malformed. +/// +/// Both are surfaced as Anomaly findings; usually a client bug, but worth forensic +/// review. enum SniValue { - /// Hostname decoded cleanly as UTF-8 (which includes the RFC-compliant ASCII case). - Utf8(String), + /// Hostname is pure ASCII — RFC 6066 compliant. May be a literal hostname or + /// an A-label (Punycode) form like `xn--caf-dma.example`. + Ascii(String), + /// Hostname decodes as valid UTF-8 but contains at least one byte ≥ 0x80. + /// `hostname` is the decoded String (always valid UTF-8); `hex` is the lossless + /// lowercase hex of the raw bytes for forensic evidence. + NonAsciiUtf8 { hostname: String, hex: String }, /// Hostname bytes failed UTF-8 decoding. `lossy` is the U+FFFD-replaced form /// for human display; `hex` is the lossless lowercase hex of the raw bytes. NonUtf8 { lossy: String, hex: String }, @@ -171,7 +187,11 @@ fn extract_sni(extensions: &[TlsExtension<'_>]) -> Option { && let Some((_, hostname)) = list.first() { return Some(match std::str::from_utf8(hostname) { - Ok(s) => SniValue::Utf8(s.to_string()), + Ok(s) if s.is_ascii() => SniValue::Ascii(s.to_string()), + Ok(s) => SniValue::NonAsciiUtf8 { + hostname: s.to_string(), + hex: bytes_to_hex(hostname), + }, Err(_) => SniValue::NonUtf8 { lossy: String::from_utf8_lossy(hostname).into_owned(), hex: bytes_to_hex(hostname), @@ -304,30 +324,56 @@ impl TlsAnalyzer { // Choose a map key that preserves uniqueness for non-UTF-8 cases. // Using `lossy` as a key would collapse distinct byte sequences whose // U+FFFD replacements happen to align — bad for forensic counting. + // For Ascii and NonAsciiUtf8 the hostname is valid UTF-8 with no + // collision risk, so use it directly. let key = match &sni { - SniValue::Utf8(s) => s.clone(), + SniValue::Ascii(s) => s.clone(), + SniValue::NonAsciiUtf8 { hostname, .. } => hostname.clone(), SniValue::NonUtf8 { hex, .. } => format!(""), }; Self::increment(&mut self.sni_counts, key, MAX_MAP_ENTRIES); - if let SniValue::NonUtf8 { lossy, hex } = sni { - self.all_findings.push(Finding { - category: ThreatCategory::Anomaly, - verdict: Verdict::Inconclusive, - confidence: Confidence::Low, - // Use Debug formatter ({:?}) to escape control bytes (e.g. ESC 0x1b) - // that String::from_utf8_lossy preserves but the analyst's terminal - // would interpret as ANSI control sequences. Without this an attacker - // could craft a malformed SNI like b"\x1b[31m..." that recolors or - // overwrites the rendered finding line. - summary: format!( - "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy:?}" - ), - evidence: vec![format!("hex: {hex}")], - mitre_technique: None, - source_ip: None, - timestamp: None, - }); + match sni { + SniValue::Ascii(_) => {} // RFC-compliant, no finding + SniValue::NonAsciiUtf8 { hostname, hex } => { + self.all_findings.push(Finding { + category: ThreatCategory::Anomaly, + verdict: Verdict::Inconclusive, + confidence: Confidence::Low, + // Use Debug formatter ({:?}) to escape any control codepoints + // that might survive UTF-8 decoding (e.g. U+0085 NEL); the + // hostname here is valid UTF-8 but printable-script content + // is not guaranteed. + summary: format!( + "TLS SNI contains non-ASCII characters (RFC 6066 requires \ + A-labels per RFC 5890): {hostname:?}" + ), + evidence: vec![format!("hex: {hex}")], + mitre_technique: None, + source_ip: None, + timestamp: None, + }); + } + SniValue::NonUtf8 { lossy, hex } => { + self.all_findings.push(Finding { + category: ThreatCategory::Anomaly, + verdict: Verdict::Inconclusive, + confidence: Confidence::Low, + // Use Debug formatter ({:?}) to escape control bytes (e.g. + // ESC 0x1b) that String::from_utf8_lossy preserves but the + // analyst's terminal would interpret as ANSI control + // sequences. Without this an attacker could craft a + // malformed SNI like b"\x1b[31m..." that recolors or + // overwrites the rendered finding line. + summary: format!( + "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy:?}" + ), + evidence: vec![format!("hex: {hex}")], + mitre_technique: None, + source_ip: None, + timestamp: None, + }); + } } } diff --git a/tests/tls_analyzer_tests.rs b/tests/tls_analyzer_tests.rs index dfd338e..57c532c 100644 --- a/tests/tls_analyzer_tests.rs +++ b/tests/tls_analyzer_tests.rs @@ -512,38 +512,154 @@ fn test_sni_with_empty_hostname_bytes() { } #[test] -fn test_valid_utf8_non_ascii_sni_currently_not_flagged() { - // Pin: a SNI value that is valid UTF-8 but contains non-ASCII characters - // (e.g. a raw U-label "café.example") is *also* a RFC 6066 §3 violation — - // the spec requires A-labels (RFC 5890 Punycode `xn--` form). The current - // analyzer does not flag this case (tracked as a follow-up at - // https://github.com/Zious11/wirerust/issues/51). +fn test_valid_utf8_non_ascii_sni_emits_finding() { + // A SNI value that is valid UTF-8 but contains non-ASCII characters (e.g. a + // raw U-label "café.example") is a RFC 6066 §3 violation: the spec requires + // ASCII encoding, with internationalized names sent as A-labels (RFC 5890 + // Punycode `xn--` form). Major TLS clients (rustls, Chrome/BoringSSL, + // Firefox/NSS, curl/libcurl) all auto-Punycode internationalized hostnames + // before sending, so a non-ASCII SNI on the wire is a strong indicator of + // either a buggy custom client (often raw OpenSSL without IDNA prep) or an + // attacker tool. // - // This test pins the current "no finding emitted" behavior. When issue #51 - // lands, this assertion should be flipped to expect a finding, and this - // comment removed. + // This test was previously a pin-test for "no finding emitted" pending + // issue #51; that issue is now implemented and the assertion has been + // flipped to expect the finding. let mut analyzer = TlsAnalyzer::new(); let fk = test_flow_key(); let record = build_client_hello("café.example", &[0x1301]); analyzer.on_data(&fk, Direction::ClientToServer, &record, 0); + // Hostname is still counted under its valid-UTF-8 form (no collision risk). assert_eq!( *analyzer.sni_counts().get("café.example").unwrap_or(&0), 1, "expected café.example to be counted under its UTF-8 form, got {:?}", analyzer.sni_counts() ); + + let non_ascii_findings: Vec<_> = analyzer + .findings() + .into_iter() + .filter(|f| f.summary.contains("non-ASCII characters")) + .collect(); + assert_eq!( + non_ascii_findings.len(), + 1, + "expected exactly one non-ASCII SNI finding, got: {:?}", + analyzer.findings() + ); + let f = &non_ascii_findings[0]; + assert_eq!(f.category, wirerust::findings::ThreatCategory::Anomaly); + assert_eq!(f.verdict, wirerust::findings::Verdict::Inconclusive); + assert_eq!(f.confidence, wirerust::findings::Confidence::Low); + assert!( + f.summary.contains("RFC 6066"), + "summary should reference RFC 6066, got: {}", + f.summary + ); + assert!( + f.summary.contains("A-labels"), + "summary should mention A-labels, got: {}", + f.summary + ); + // Hex evidence is the lossless representation of the raw bytes. + // "café.example" UTF-8 = 63 61 66 c3 a9 2e 65 78 61 6d 70 6c 65 + assert!( + f.evidence + .iter() + .any(|e| e.contains("636166c3a92e6578616d706c65")), + "expected raw UTF-8 bytes in hex evidence, got: {:?}", + f.evidence + ); + + // Critical: this must NOT trip the non-UTF-8 finding (those are different + // cases — café.example is valid UTF-8, just non-ASCII). let non_utf8_findings = analyzer .findings() .iter() .filter(|f| f.summary.contains("non-UTF-8 bytes")) .count(); + assert_eq!(non_utf8_findings, 0); +} + +#[test] +fn test_cyrillic_sni_emits_non_ascii_finding() { + // Regression: a U-label using Cyrillic script should trip the non-ASCII + // finding identically to the Latin-accented case. + let mut analyzer = TlsAnalyzer::new(); + let fk = test_flow_key(); + + let record = build_client_hello("пример.example", &[0x1301]); + analyzer.on_data(&fk, Direction::ClientToServer, &record, 0); + + let non_ascii_count = analyzer + .findings() + .iter() + .filter(|f| f.summary.contains("non-ASCII characters")) + .count(); assert_eq!( - non_utf8_findings, 0, - "valid UTF-8 must not trip the non-UTF-8 finding (see \ - https://github.com/Zious11/wirerust/issues/51 for the planned follow-up; \ - flip this assertion when that issue lands)" + non_ascii_count, + 1, + "Cyrillic U-label must emit one non-ASCII finding, got {:?}", + analyzer.findings() + ); + assert_eq!( + *analyzer.sni_counts().get("пример.example").unwrap_or(&0), + 1 + ); +} + +#[test] +fn test_emoji_sni_emits_non_ascii_finding() { + // Regression: a 4-byte UTF-8 emoji codepoint also trips the finding. + // 🦀 = U+1F980 = 0xF0 0x9F 0xA6 0x80 (4 bytes, all ≥ 0x80). + let mut analyzer = TlsAnalyzer::new(); + let fk = test_flow_key(); + + let record = build_client_hello("🦀.example", &[0x1301]); + analyzer.on_data(&fk, Direction::ClientToServer, &record, 0); + + let non_ascii_count = analyzer + .findings() + .iter() + .filter(|f| f.summary.contains("non-ASCII characters")) + .count(); + assert_eq!( + non_ascii_count, + 1, + "emoji SNI must emit one non-ASCII finding, got {:?}", + analyzer.findings() + ); +} + +#[test] +fn test_punycode_a_label_does_not_emit_non_ascii_finding() { + // Regression: a Punycode A-label "xn--caf-dma.example" is the + // RFC-compliant way to encode "café.example" and is pure ASCII. + // The analyzer must NOT flag it. + let mut analyzer = TlsAnalyzer::new(); + let fk = test_flow_key(); + + let record = build_client_hello("xn--caf-dma.example", &[0x1301]); + analyzer.on_data(&fk, Direction::ClientToServer, &record, 0); + + let non_ascii_count = analyzer + .findings() + .iter() + .filter(|f| f.summary.contains("non-ASCII characters")) + .count(); + assert_eq!( + non_ascii_count, 0, + "Punycode A-label is RFC-compliant ASCII and must not be flagged" + ); + assert_eq!( + *analyzer + .sni_counts() + .get("xn--caf-dma.example") + .unwrap_or(&0), + 1 ); } From 82b31a985394818b36bc419e2e175306a00f3c65 Mon Sep 17 00:00:00 2001 From: Zious Date: Wed, 8 Apr 2026 18:46:11 -0500 Subject: [PATCH 2/2] test: fix grammar in non-ASCII SNI test comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses Copilot review on PR #55: "a RFC 6066" → "an RFC 6066". RFC is pronounced "arr-eff-see", which starts with a vowel sound, so the indefinite article is "an" — matches IETF style throughout the RFC corpus itself. --- tests/tls_analyzer_tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tls_analyzer_tests.rs b/tests/tls_analyzer_tests.rs index 57c532c..c4408bd 100644 --- a/tests/tls_analyzer_tests.rs +++ b/tests/tls_analyzer_tests.rs @@ -514,7 +514,7 @@ fn test_sni_with_empty_hostname_bytes() { #[test] fn test_valid_utf8_non_ascii_sni_emits_finding() { // A SNI value that is valid UTF-8 but contains non-ASCII characters (e.g. a - // raw U-label "café.example") is a RFC 6066 §3 violation: the spec requires + // raw U-label "café.example") is an RFC 6066 §3 violation: the spec requires // ASCII encoding, with internationalized names sent as A-labels (RFC 5890 // Punycode `xn--` form). Major TLS clients (rustls, Chrome/BoringSSL, // Firefox/NSS, curl/libcurl) all auto-Punycode internationalized hostnames