diff --git a/src/analyzer/tls.rs b/src/analyzer/tls.rs index b32fcb2..a1193f4 100644 --- a/src/analyzer/tls.rs +++ b/src/analyzer/tls.rs @@ -148,12 +148,28 @@ fn compute_ja3s(version: u16, cipher: TlsCipherSuiteID, extensions: &[TlsExtensi /// Result of decoding a TLS SNI hostname. /// /// RFC 6066 §3 specifies that the `HostName` field is "represented as a byte -/// string using ASCII encoding". Internationalized names use A-labels (RFC 5890), -/// which are also ASCII. A `NonUtf8` SNI is therefore a protocol violation — -/// usually a buggy TLS client, but worth surfacing for forensic review. +/// string using ASCII encoding". Internationalized names use A-labels (RFC 5890, +/// `xn--…` Punycode form), which are also ASCII. Two RFC violations are tracked +/// separately: +/// +/// - `NonAsciiUtf8` — bytes decode as valid UTF-8 but contain non-ASCII codepoints +/// (e.g. a raw U-label "café.example"). RFC 6066 says these MUST be Punycode-encoded +/// before transmission. Major TLS clients (rustls, Chrome/BoringSSL, Firefox/NSS, +/// curl/libcurl) all do this conversion automatically — so a non-ASCII SNI on the +/// wire indicates either a buggy custom client (often raw OpenSSL without IDNA prep) +/// or an attacker tool. +/// - `NonUtf8` — bytes can't decode as UTF-8 at all. Strictly malformed. +/// +/// Both are surfaced as Anomaly findings; usually a client bug, but worth forensic +/// review. enum SniValue { - /// Hostname decoded cleanly as UTF-8 (which includes the RFC-compliant ASCII case). - Utf8(String), + /// Hostname is pure ASCII — RFC 6066 compliant. May be a literal hostname or + /// an A-label (Punycode) form like `xn--caf-dma.example`. + Ascii(String), + /// Hostname decodes as valid UTF-8 but contains at least one byte ≥ 0x80. + /// `hostname` is the decoded String (always valid UTF-8); `hex` is the lossless + /// lowercase hex of the raw bytes for forensic evidence. 
+ NonAsciiUtf8 { hostname: String, hex: String }, /// Hostname bytes failed UTF-8 decoding. `lossy` is the U+FFFD-replaced form /// for human display; `hex` is the lossless lowercase hex of the raw bytes. NonUtf8 { lossy: String, hex: String }, @@ -171,7 +187,11 @@ fn extract_sni(extensions: &[TlsExtension<'_>]) -> Option<SniValue> { && let Some((_, hostname)) = list.first() { return Some(match std::str::from_utf8(hostname) { - Ok(s) => SniValue::Utf8(s.to_string()), + Ok(s) if s.is_ascii() => SniValue::Ascii(s.to_string()), + Ok(s) => SniValue::NonAsciiUtf8 { + hostname: s.to_string(), + hex: bytes_to_hex(hostname), + }, Err(_) => SniValue::NonUtf8 { lossy: String::from_utf8_lossy(hostname).into_owned(), hex: bytes_to_hex(hostname), @@ -304,30 +324,56 @@ impl TlsAnalyzer { // Choose a map key that preserves uniqueness for non-UTF-8 cases. // Using `lossy` as a key would collapse distinct byte sequences whose // U+FFFD replacements happen to align — bad for forensic counting. + // For Ascii and NonAsciiUtf8 the hostname is valid UTF-8 with no + // collision risk, so use it directly. let key = match &sni { - SniValue::Utf8(s) => s.clone(), + SniValue::Ascii(s) => s.clone(), + SniValue::NonAsciiUtf8 { hostname, .. } => hostname.clone(), SniValue::NonUtf8 { hex, .. } => format!("<non-utf8:{hex}>"), }; Self::increment(&mut self.sni_counts, key, MAX_MAP_ENTRIES); - if let SniValue::NonUtf8 { lossy, hex } = sni { - self.all_findings.push(Finding { - category: ThreatCategory::Anomaly, - verdict: Verdict::Inconclusive, - confidence: Confidence::Low, - // Use Debug formatter ({:?}) to escape control bytes (e.g. ESC 0x1b) - // that String::from_utf8_lossy preserves but the analyst's terminal - // would interpret as ANSI control sequences. Without this an attacker - // could craft a malformed SNI like b"\x1b[31m..." that recolors or - // overwrites the rendered finding line.
- summary: format!( - "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy:?}" - ), - evidence: vec![format!("hex: {hex}")], - mitre_technique: None, - source_ip: None, - timestamp: None, - }); + match sni { + SniValue::Ascii(_) => {} // RFC-compliant, no finding + SniValue::NonAsciiUtf8 { hostname, hex } => { + self.all_findings.push(Finding { + category: ThreatCategory::Anomaly, + verdict: Verdict::Inconclusive, + confidence: Confidence::Low, + // Use Debug formatter ({:?}) to escape any control codepoints + // that might survive UTF-8 decoding (e.g. U+0085 NEL); the + // hostname here is valid UTF-8 but printable-script content + // is not guaranteed. + summary: format!( + "TLS SNI contains non-ASCII characters (RFC 6066 requires \ + A-labels per RFC 5890): {hostname:?}" + ), + evidence: vec![format!("hex: {hex}")], + mitre_technique: None, + source_ip: None, + timestamp: None, + }); + } + SniValue::NonUtf8 { lossy, hex } => { + self.all_findings.push(Finding { + category: ThreatCategory::Anomaly, + verdict: Verdict::Inconclusive, + confidence: Confidence::Low, + // Use Debug formatter ({:?}) to escape control bytes (e.g. + // ESC 0x1b) that String::from_utf8_lossy preserves but the + // analyst's terminal would interpret as ANSI control + // sequences. Without this an attacker could craft a + // malformed SNI like b"\x1b[31m..." that recolors or + // overwrites the rendered finding line. 
+ summary: format!( + "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy:?}" + ), + evidence: vec![format!("hex: {hex}")], + mitre_technique: None, + source_ip: None, + timestamp: None, + }); + } } } diff --git a/tests/tls_analyzer_tests.rs b/tests/tls_analyzer_tests.rs index dfd338e..c4408bd 100644 --- a/tests/tls_analyzer_tests.rs +++ b/tests/tls_analyzer_tests.rs @@ -512,38 +512,154 @@ fn test_sni_with_empty_hostname_bytes() { } #[test] -fn test_valid_utf8_non_ascii_sni_currently_not_flagged() { - // Pin: a SNI value that is valid UTF-8 but contains non-ASCII characters - // (e.g. a raw U-label "café.example") is *also* a RFC 6066 §3 violation — - // the spec requires A-labels (RFC 5890 Punycode `xn--` form). The current - // analyzer does not flag this case (tracked as a follow-up at - // https://github.com/Zious11/wirerust/issues/51). +fn test_valid_utf8_non_ascii_sni_emits_finding() { + // A SNI value that is valid UTF-8 but contains non-ASCII characters (e.g. a + // raw U-label "café.example") is an RFC 6066 §3 violation: the spec requires + // ASCII encoding, with internationalized names sent as A-labels (RFC 5890 + // Punycode `xn--` form). Major TLS clients (rustls, Chrome/BoringSSL, + // Firefox/NSS, curl/libcurl) all auto-Punycode internationalized hostnames + // before sending, so a non-ASCII SNI on the wire is a strong indicator of + // either a buggy custom client (often raw OpenSSL without IDNA prep) or an + // attacker tool. // - // This test pins the current "no finding emitted" behavior. When issue #51 - // lands, this assertion should be flipped to expect a finding, and this - // comment removed. + // This test was previously a pin-test for "no finding emitted" pending + // issue #51; that issue is now implemented and the assertion has been + // flipped to expect the finding. 
let mut analyzer = TlsAnalyzer::new(); let fk = test_flow_key(); let record = build_client_hello("café.example", &[0x1301]); analyzer.on_data(&fk, Direction::ClientToServer, &record, 0); + // Hostname is still counted under its valid-UTF-8 form (no collision risk). assert_eq!( *analyzer.sni_counts().get("café.example").unwrap_or(&0), 1, "expected café.example to be counted under its UTF-8 form, got {:?}", analyzer.sni_counts() ); + + let non_ascii_findings: Vec<_> = analyzer + .findings() + .into_iter() + .filter(|f| f.summary.contains("non-ASCII characters")) + .collect(); + assert_eq!( + non_ascii_findings.len(), + 1, + "expected exactly one non-ASCII SNI finding, got: {:?}", + analyzer.findings() + ); + let f = &non_ascii_findings[0]; + assert_eq!(f.category, wirerust::findings::ThreatCategory::Anomaly); + assert_eq!(f.verdict, wirerust::findings::Verdict::Inconclusive); + assert_eq!(f.confidence, wirerust::findings::Confidence::Low); + assert!( + f.summary.contains("RFC 6066"), + "summary should reference RFC 6066, got: {}", + f.summary + ); + assert!( + f.summary.contains("A-labels"), + "summary should mention A-labels, got: {}", + f.summary + ); + // Hex evidence is the lossless representation of the raw bytes. + // "café.example" UTF-8 = 63 61 66 c3 a9 2e 65 78 61 6d 70 6c 65 + assert!( + f.evidence + .iter() + .any(|e| e.contains("636166c3a92e6578616d706c65")), + "expected raw UTF-8 bytes in hex evidence, got: {:?}", + f.evidence + ); + + // Critical: this must NOT trip the non-UTF-8 finding (those are different + // cases — café.example is valid UTF-8, just non-ASCII). let non_utf8_findings = analyzer .findings() .iter() .filter(|f| f.summary.contains("non-UTF-8 bytes")) .count(); + assert_eq!(non_utf8_findings, 0); +} + +#[test] +fn test_cyrillic_sni_emits_non_ascii_finding() { + // Regression: a U-label using Cyrillic script should trip the non-ASCII + // finding identically to the Latin-accented case. 
+ let mut analyzer = TlsAnalyzer::new(); + let fk = test_flow_key(); + + let record = build_client_hello("пример.example", &[0x1301]); + analyzer.on_data(&fk, Direction::ClientToServer, &record, 0); + + let non_ascii_count = analyzer + .findings() + .iter() + .filter(|f| f.summary.contains("non-ASCII characters")) + .count(); assert_eq!( - non_utf8_findings, 0, - "valid UTF-8 must not trip the non-UTF-8 finding (see \ - https://github.com/Zious11/wirerust/issues/51 for the planned follow-up; \ - flip this assertion when that issue lands)" + non_ascii_count, + 1, + "Cyrillic U-label must emit one non-ASCII finding, got {:?}", + analyzer.findings() + ); + assert_eq!( + *analyzer.sni_counts().get("пример.example").unwrap_or(&0), + 1 + ); +} + +#[test] +fn test_emoji_sni_emits_non_ascii_finding() { + // Regression: a 4-byte UTF-8 emoji codepoint also trips the finding. + // 🦀 = U+1F980 = 0xF0 0x9F 0xA6 0x80 (4 bytes, all ≥ 0x80). + let mut analyzer = TlsAnalyzer::new(); + let fk = test_flow_key(); + + let record = build_client_hello("🦀.example", &[0x1301]); + analyzer.on_data(&fk, Direction::ClientToServer, &record, 0); + + let non_ascii_count = analyzer + .findings() + .iter() + .filter(|f| f.summary.contains("non-ASCII characters")) + .count(); + assert_eq!( + non_ascii_count, + 1, + "emoji SNI must emit one non-ASCII finding, got {:?}", + analyzer.findings() + ); +} + +#[test] +fn test_punycode_a_label_does_not_emit_non_ascii_finding() { + // Regression: a Punycode A-label "xn--caf-dma.example" is the + // RFC-compliant way to encode "café.example" and is pure ASCII. + // The analyzer must NOT flag it. 
+ let mut analyzer = TlsAnalyzer::new(); + let fk = test_flow_key(); + + let record = build_client_hello("xn--caf-dma.example", &[0x1301]); + analyzer.on_data(&fk, Direction::ClientToServer, &record, 0); + + let non_ascii_count = analyzer + .findings() + .iter() + .filter(|f| f.summary.contains("non-ASCII characters")) + .count(); + assert_eq!( + non_ascii_count, 0, + "Punycode A-label is RFC-compliant ASCII and must not be flagged" + ); + assert_eq!( + *analyzer + .sni_counts() + .get("xn--caf-dma.example") + .unwrap_or(&0), + 1 ); }