From 1e6b64831705ae7f852b85558c106c232b919e5c Mon Sep 17 00:00:00 2001 From: edgul Date: Wed, 30 Jul 2025 15:50:53 -0400 Subject: [PATCH] fix hostname tokenizing of newline, carriage return and tab --- src/testdata/urlpatterntestdata.json | 24 +++++++++++++++++++++--- src/tokenizer.rs | 8 ++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/testdata/urlpatterntestdata.json b/src/testdata/urlpatterntestdata.json index 5fcda0e..7cc5d71 100644 --- a/src/testdata/urlpatterntestdata.json +++ b/src/testdata/urlpatterntestdata.json @@ -2494,15 +2494,33 @@ }, { "pattern": [{ "hostname": "bad\nhostname" }], - "expected_obj": "error" + "inputs": [{ "hostname": "badhostname" }], + "expected_obj": { + "hostname": "badhostname" + }, + "expected_match": { + "hostname": { "input": "badhostname", "groups": {} } + } }, { "pattern": [{ "hostname": "bad\rhostname" }], - "expected_obj": "error" + "inputs": [{ "hostname": "badhostname" }], + "expected_obj": { + "hostname": "badhostname" + }, + "expected_match": { + "hostname": { "input": "badhostname", "groups": {} } + } }, { "pattern": [{ "hostname": "bad\thostname" }], - "expected_obj": "error" + "inputs": [{ "hostname": "badhostname" }], + "expected_obj": { + "hostname": "badhostname" + }, + "expected_match": { + "hostname": { "input": "badhostname", "groups": {} } + } }, { "pattern": [{}], diff --git a/src/tokenizer.rs b/src/tokenizer.rs index dd25cfb..b250d9c 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -160,6 +160,14 @@ pub fn tokenize( ); continue; } + if tokenizer.code_point == Some('\n') + || tokenizer.code_point == Some('\r') + || tokenizer.code_point == Some('\t') + { + // ignore newline, carriage return and tab + tokenizer.index = tokenizer.next_index; + continue; + } if tokenizer.code_point == Some('{') { tokenizer.add_token_with_default_pos_and_len(TokenType::Open); continue;