From 7b3d31ec53eccf926fe81b4fb43aaf935ad0634c Mon Sep 17 00:00:00 2001
From: Zious <jaredbrichards@gmail.com>
Date: Wed, 8 Apr 2026 18:05:58 -0500
Subject: [PATCH 1/2] feat: flag valid-UTF-8 but non-ASCII SNI as RFC 6066
 violation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #51. PR #49 surfaced non-UTF-8 SNI bytes as a finding but
deliberately left a related RFC 6066 §3 violation unflagged: SNI
hostnames that are valid UTF-8 but contain non-ASCII codepoints,
e.g. raw U-labels like "café.example" or "пример.example". The spec
requires HostName to be ASCII (with internationalized names sent as
A-labels per RFC 5890 Punycode `xn--…` form), so a non-ASCII byte on
the wire is a real protocol violation.

Real-world false-positive risk
------------------------------
Validated via Perplexity that all major TLS clients auto-Punycode
internationalized hostnames before sending the SNI:

- rustls — rejects raw UTF-8 at the ServerName API level, requires
  Punycode upfront
- Chrome / BoringSSL — always Punycode (per RFC 3492 + 6066)
- Firefox / NSS — always Punycode (URL bar may display Unicode for
  user-friendliness, but the wire is always Punycode)
- curl / libcurl — auto-converts via libidn2 before calling OpenSSL
- OpenSSL raw — passes verbatim, but expects ASCII from the caller;
  applications using OpenSSL directly without IDNA prep are the only
  legitimate path to a raw U-label SNI on the wire

So the false-positive surface is small: a buggy custom client, an
attacker tool, or an OpenSSL-direct app that forgot to Punycode.
Severity is `Anomaly` / `Inconclusive` / `Low` — same rationale as
the non-UTF-8 finding.

SniValue enum
-------------
Renamed `Utf8(String)` → `Ascii(String)` (the old name was misleading
since ASCII is also valid UTF-8) and added a new variant:

    enum SniValue {
        Ascii(String),                              // RFC compliant
        NonAsciiUtf8 { hostname: String, hex: String }, // valid UTF-8, non-ASCII
        NonUtf8 { lossy: String, hex: String },     // invalid UTF-8 (existing)
    }

`extract_sni` now uses `s.is_ascii()` to split the UTF-8 case into
`Ascii` and `NonAsciiUtf8`. The hostname for `NonAsciiUtf8` is the
decoded String (always valid UTF-8 by definition); `hex` is the
lossless representation for forensic evidence.

`handle_client_hello` keys `sni_counts` on the hostname directly for
both Ascii and NonAsciiUtf8 (no collision risk for valid UTF-8) and
keeps the existing `<non-utf8:HEX>` tagged form for NonUtf8. The
finding summary uses `{hostname:?}` Debug formatter to escape any
control codepoints that might survive UTF-8 decoding (e.g. U+0085 NEL).

Tests
-----
- Flipped `test_valid_utf8_non_ascii_sni_currently_not_flagged` to
  `test_valid_utf8_non_ascii_sni_emits_finding` — asserts the new
  finding fires for "café.example" with the right severity, RFC
  reference in summary, and hex evidence (`636166c3a92e6578616d706c65`).
- Added `test_cyrillic_sni_emits_non_ascii_finding` for "пример.example".
- Added `test_emoji_sni_emits_non_ascii_finding` for "🦀.example"
  (4-byte UTF-8 codepoint, all bytes ≥ 0x80).
- Added `test_punycode_a_label_does_not_emit_non_ascii_finding` for
  "xn--caf-dma.example" — pins that the RFC-compliant Punycode form
  is NOT flagged.
---
 src/analyzer/tls.rs         |  96 +++++++++++++++++-------
 tests/tls_analyzer_tests.rs | 142 ++++++++++++++++++++++++++++++++----
 2 files changed, 200 insertions(+), 38 deletions(-)

diff --git a/src/analyzer/tls.rs b/src/analyzer/tls.rs
index b32fcb2..a1193f4 100644
--- a/src/analyzer/tls.rs
+++ b/src/analyzer/tls.rs
@@ -148,12 +148,28 @@ fn compute_ja3s(version: u16, cipher: TlsCipherSuiteID, extensions: &[TlsExtensi
 /// Result of decoding a TLS SNI hostname.
 ///
 /// RFC 6066 §3 specifies that the `HostName` field is "represented as a byte
-/// string using ASCII encoding". Internationalized names use A-labels (RFC 5890),
-/// which are also ASCII. A `NonUtf8` SNI is therefore a protocol violation —
-/// usually a buggy TLS client, but worth surfacing for forensic review.
+/// string using ASCII encoding". Internationalized names use A-labels (RFC 5890,
+/// `xn--…` Punycode form), which are also ASCII. Two RFC violations are tracked
+/// separately:
+///
+/// - `NonAsciiUtf8` — bytes decode as valid UTF-8 but contain non-ASCII codepoints
+///   (e.g. a raw U-label "café.example"). RFC 6066 says these MUST be Punycode-encoded
+///   before transmission. Major TLS clients (rustls, Chrome/BoringSSL, Firefox/NSS,
+///   curl/libcurl) all do this conversion automatically — so a non-ASCII SNI on the
+///   wire indicates either a buggy custom client (often raw OpenSSL without IDNA prep)
+///   or an attacker tool.
+/// - `NonUtf8` — bytes can't decode as UTF-8 at all. Strictly malformed.
+///
+/// Both are surfaced as Anomaly findings; usually a client bug, but worth forensic
+/// review.
 enum SniValue {
-    /// Hostname decoded cleanly as UTF-8 (which includes the RFC-compliant ASCII case).
-    Utf8(String),
+    /// Hostname is pure ASCII — RFC 6066 compliant. May be a literal hostname or
+    /// an A-label (Punycode) form like `xn--caf-dma.example`.
+    Ascii(String),
+    /// Hostname decodes as valid UTF-8 but contains at least one byte ≥ 0x80.
+    /// `hostname` is the decoded String (always valid UTF-8); `hex` is the lossless
+    /// lowercase hex of the raw bytes for forensic evidence.
+    NonAsciiUtf8 { hostname: String, hex: String },
     /// Hostname bytes failed UTF-8 decoding. `lossy` is the U+FFFD-replaced form
     /// for human display; `hex` is the lossless lowercase hex of the raw bytes.
     NonUtf8 { lossy: String, hex: String },
@@ -171,7 +187,11 @@ fn extract_sni(extensions: &[TlsExtension<'_>]) -> Option<SniValue> {
             && let Some((_, hostname)) = list.first()
         {
             return Some(match std::str::from_utf8(hostname) {
-                Ok(s) => SniValue::Utf8(s.to_string()),
+                Ok(s) if s.is_ascii() => SniValue::Ascii(s.to_string()),
+                Ok(s) => SniValue::NonAsciiUtf8 {
+                    hostname: s.to_string(),
+                    hex: bytes_to_hex(hostname),
+                },
                 Err(_) => SniValue::NonUtf8 {
                     lossy: String::from_utf8_lossy(hostname).into_owned(),
                     hex: bytes_to_hex(hostname),
@@ -304,30 +324,56 @@ impl TlsAnalyzer {
             // Choose a map key that preserves uniqueness for non-UTF-8 cases.
             // Using `lossy` as a key would collapse distinct byte sequences whose
             // U+FFFD replacements happen to align — bad for forensic counting.
+            // For Ascii and NonAsciiUtf8 the hostname is valid UTF-8 with no
+            // collision risk, so use it directly.
             let key = match &sni {
-                SniValue::Utf8(s) => s.clone(),
+                SniValue::Ascii(s) => s.clone(),
+                SniValue::NonAsciiUtf8 { hostname, .. } => hostname.clone(),
                 SniValue::NonUtf8 { hex, .. } => format!("<non-utf8:{hex}>"),
             };
             Self::increment(&mut self.sni_counts, key, MAX_MAP_ENTRIES);
 
-            if let SniValue::NonUtf8 { lossy, hex } = sni {
-                self.all_findings.push(Finding {
-                    category: ThreatCategory::Anomaly,
-                    verdict: Verdict::Inconclusive,
-                    confidence: Confidence::Low,
-                    // Use Debug formatter ({:?}) to escape control bytes (e.g. ESC 0x1b)
-                    // that String::from_utf8_lossy preserves but the analyst's terminal
-                    // would interpret as ANSI control sequences. Without this an attacker
-                    // could craft a malformed SNI like b"\x1b[31m..." that recolors or
-                    // overwrites the rendered finding line.
-                    summary: format!(
-                        "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy:?}"
-                    ),
-                    evidence: vec![format!("hex: {hex}")],
-                    mitre_technique: None,
-                    source_ip: None,
-                    timestamp: None,
-                });
+            match sni {
+                SniValue::Ascii(_) => {} // RFC-compliant, no finding
+                SniValue::NonAsciiUtf8 { hostname, hex } => {
+                    self.all_findings.push(Finding {
+                        category: ThreatCategory::Anomaly,
+                        verdict: Verdict::Inconclusive,
+                        confidence: Confidence::Low,
+                        // Use Debug formatter ({:?}) to escape any control codepoints
+                        // that might survive UTF-8 decoding (e.g. U+0085 NEL); the
+                        // hostname here is valid UTF-8 but printable-script content
+                        // is not guaranteed.
+                        summary: format!(
+                            "TLS SNI contains non-ASCII characters (RFC 6066 requires \
+                             A-labels per RFC 5890): {hostname:?}"
+                        ),
+                        evidence: vec![format!("hex: {hex}")],
+                        mitre_technique: None,
+                        source_ip: None,
+                        timestamp: None,
+                    });
+                }
+                SniValue::NonUtf8 { lossy, hex } => {
+                    self.all_findings.push(Finding {
+                        category: ThreatCategory::Anomaly,
+                        verdict: Verdict::Inconclusive,
+                        confidence: Confidence::Low,
+                        // Use Debug formatter ({:?}) to escape control bytes (e.g.
+                        // ESC 0x1b) that String::from_utf8_lossy preserves but the
+                        // analyst's terminal would interpret as ANSI control
+                        // sequences. Without this an attacker could craft a
+                        // malformed SNI like b"\x1b[31m..." that recolors or
+                        // overwrites the rendered finding line.
+                        summary: format!(
+                            "TLS SNI contains non-UTF-8 bytes (RFC 6066 violation): {lossy:?}"
+                        ),
+                        evidence: vec![format!("hex: {hex}")],
+                        mitre_technique: None,
+                        source_ip: None,
+                        timestamp: None,
+                    });
+                }
             }
         }
 
diff --git a/tests/tls_analyzer_tests.rs b/tests/tls_analyzer_tests.rs
index dfd338e..57c532c 100644
--- a/tests/tls_analyzer_tests.rs
+++ b/tests/tls_analyzer_tests.rs
@@ -512,38 +512,154 @@ fn test_sni_with_empty_hostname_bytes() {
 }
 
 #[test]
-fn test_valid_utf8_non_ascii_sni_currently_not_flagged() {
-    // Pin: a SNI value that is valid UTF-8 but contains non-ASCII characters
-    // (e.g. a raw U-label "café.example") is *also* a RFC 6066 §3 violation —
-    // the spec requires A-labels (RFC 5890 Punycode `xn--` form). The current
-    // analyzer does not flag this case (tracked as a follow-up at
-    // https://github.com/Zious11/wirerust/issues/51).
+fn test_valid_utf8_non_ascii_sni_emits_finding() {
+    // A SNI value that is valid UTF-8 but contains non-ASCII characters (e.g. a
+    // raw U-label "café.example") is a RFC 6066 §3 violation: the spec requires
+    // ASCII encoding, with internationalized names sent as A-labels (RFC 5890
+    // Punycode `xn--` form). Major TLS clients (rustls, Chrome/BoringSSL,
+    // Firefox/NSS, curl/libcurl) all auto-Punycode internationalized hostnames
+    // before sending, so a non-ASCII SNI on the wire is a strong indicator of
+    // either a buggy custom client (often raw OpenSSL without IDNA prep) or an
+    // attacker tool.
     //
-    // This test pins the current "no finding emitted" behavior. When issue #51
-    // lands, this assertion should be flipped to expect a finding, and this
-    // comment removed.
+    // This test was previously a pin-test for "no finding emitted" pending
+    // issue #51; that issue is now implemented and the assertion has been
+    // flipped to expect the finding.
     let mut analyzer = TlsAnalyzer::new();
     let fk = test_flow_key();
 
     let record = build_client_hello("café.example", &[0x1301]);
     analyzer.on_data(&fk, Direction::ClientToServer, &record, 0);
 
+    // Hostname is still counted under its valid-UTF-8 form (no collision risk).
     assert_eq!(
         *analyzer.sni_counts().get("café.example").unwrap_or(&0),
         1,
         "expected café.example to be counted under its UTF-8 form, got {:?}",
         analyzer.sni_counts()
     );
+
+    let non_ascii_findings: Vec<_> = analyzer
+        .findings()
+        .into_iter()
+        .filter(|f| f.summary.contains("non-ASCII characters"))
+        .collect();
+    assert_eq!(
+        non_ascii_findings.len(),
+        1,
+        "expected exactly one non-ASCII SNI finding, got: {:?}",
+        analyzer.findings()
+    );
+    let f = &non_ascii_findings[0];
+    assert_eq!(f.category, wirerust::findings::ThreatCategory::Anomaly);
+    assert_eq!(f.verdict, wirerust::findings::Verdict::Inconclusive);
+    assert_eq!(f.confidence, wirerust::findings::Confidence::Low);
+    assert!(
+        f.summary.contains("RFC 6066"),
+        "summary should reference RFC 6066, got: {}",
+        f.summary
+    );
+    assert!(
+        f.summary.contains("A-labels"),
+        "summary should mention A-labels, got: {}",
+        f.summary
+    );
+    // Hex evidence is the lossless representation of the raw bytes.
+    // "café.example" UTF-8 = 63 61 66 c3 a9 2e 65 78 61 6d 70 6c 65
+    assert!(
+        f.evidence
+            .iter()
+            .any(|e| e.contains("636166c3a92e6578616d706c65")),
+        "expected raw UTF-8 bytes in hex evidence, got: {:?}",
+        f.evidence
+    );
+
+    // Critical: this must NOT trip the non-UTF-8 finding (those are different
+    // cases — café.example is valid UTF-8, just non-ASCII).
     let non_utf8_findings = analyzer
         .findings()
         .iter()
         .filter(|f| f.summary.contains("non-UTF-8 bytes"))
         .count();
+    assert_eq!(non_utf8_findings, 0);
+}
+
+#[test]
+fn test_cyrillic_sni_emits_non_ascii_finding() {
+    // Regression: a U-label using Cyrillic script should trip the non-ASCII
+    // finding identically to the Latin-accented case.
+    let mut analyzer = TlsAnalyzer::new();
+    let fk = test_flow_key();
+
+    let record = build_client_hello("пример.example", &[0x1301]);
+    analyzer.on_data(&fk, Direction::ClientToServer, &record, 0);
+
+    let non_ascii_count = analyzer
+        .findings()
+        .iter()
+        .filter(|f| f.summary.contains("non-ASCII characters"))
+        .count();
     assert_eq!(
-        non_utf8_findings, 0,
-        "valid UTF-8 must not trip the non-UTF-8 finding (see \
-         https://github.com/Zious11/wirerust/issues/51 for the planned follow-up; \
-         flip this assertion when that issue lands)"
+        non_ascii_count,
+        1,
+        "Cyrillic U-label must emit one non-ASCII finding, got {:?}",
+        analyzer.findings()
+    );
+    assert_eq!(
+        *analyzer.sni_counts().get("пример.example").unwrap_or(&0),
+        1
+    );
+}
+
+#[test]
+fn test_emoji_sni_emits_non_ascii_finding() {
+    // Regression: a 4-byte UTF-8 emoji codepoint also trips the finding.
+    // 🦀 = U+1F980 = 0xF0 0x9F 0xA6 0x80 (4 bytes, all ≥ 0x80).
+    let mut analyzer = TlsAnalyzer::new();
+    let fk = test_flow_key();
+
+    let record = build_client_hello("🦀.example", &[0x1301]);
+    analyzer.on_data(&fk, Direction::ClientToServer, &record, 0);
+
+    let non_ascii_count = analyzer
+        .findings()
+        .iter()
+        .filter(|f| f.summary.contains("non-ASCII characters"))
+        .count();
+    assert_eq!(
+        non_ascii_count,
+        1,
+        "emoji SNI must emit one non-ASCII finding, got {:?}",
+        analyzer.findings()
+    );
+}
+
+#[test]
+fn test_punycode_a_label_does_not_emit_non_ascii_finding() {
+    // Regression: a Punycode A-label "xn--caf-dma.example" is the
+    // RFC-compliant way to encode "café.example" and is pure ASCII.
+    // The analyzer must NOT flag it.
+    let mut analyzer = TlsAnalyzer::new();
+    let fk = test_flow_key();
+
+    let record = build_client_hello("xn--caf-dma.example", &[0x1301]);
+    analyzer.on_data(&fk, Direction::ClientToServer, &record, 0);
+
+    let non_ascii_count = analyzer
+        .findings()
+        .iter()
+        .filter(|f| f.summary.contains("non-ASCII characters"))
+        .count();
+    assert_eq!(
+        non_ascii_count, 0,
+        "Punycode A-label is RFC-compliant ASCII and must not be flagged"
+    );
+    assert_eq!(
+        *analyzer
+            .sni_counts()
+            .get("xn--caf-dma.example")
+            .unwrap_or(&0),
+        1
     );
 }
 

From 82b31a985394818b36bc419e2e175306a00f3c65 Mon Sep 17 00:00:00 2001
From: Zious <jaredbrichards@gmail.com>
Date: Wed, 8 Apr 2026 18:46:11 -0500
Subject: [PATCH 2/2] test: fix grammar in non-ASCII SNI test comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses Copilot review on PR #55: "a RFC 6066" → "an RFC 6066".
RFC is pronounced "arr-eff-see", which starts with a vowel sound, so
the indefinite article is "an" — matches IETF style throughout the
RFC corpus itself.
---
 tests/tls_analyzer_tests.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/tls_analyzer_tests.rs b/tests/tls_analyzer_tests.rs
index 57c532c..c4408bd 100644
--- a/tests/tls_analyzer_tests.rs
+++ b/tests/tls_analyzer_tests.rs
@@ -514,7 +514,7 @@ fn test_sni_with_empty_hostname_bytes() {
 #[test]
 fn test_valid_utf8_non_ascii_sni_emits_finding() {
     // A SNI value that is valid UTF-8 but contains non-ASCII characters (e.g. a
-    // raw U-label "café.example") is a RFC 6066 §3 violation: the spec requires
+    // raw U-label "café.example") is an RFC 6066 §3 violation: the spec requires
     // ASCII encoding, with internationalized names sent as A-labels (RFC 5890
     // Punycode `xn--` form). Major TLS clients (rustls, Chrome/BoringSSL,
     // Firefox/NSS, curl/libcurl) all auto-Punycode internationalized hostnames