From 8ff61bde612bb36581f80ef2419f8fd6bd57e40c Mon Sep 17 00:00:00 2001 From: Nico Burns Date: Tue, 17 Dec 2024 06:06:25 +1300 Subject: [PATCH 1/2] Remove regex dependency Signed-off-by: Nico Burns --- Cargo.toml | 3 --- src/build.rs | 76 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 58 insertions(+), 21 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c8b70ba..77e10c5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,8 +8,5 @@ license = "MIT" build = "src/build.rs" -[build-dependencies] -regex = "1.0" - [dependencies] unicode-normalization = "0.1" diff --git a/src/build.rs b/src/build.rs index 80f619a..fc70743 100644 --- a/src/build.rs +++ b/src/build.rs @@ -1,11 +1,8 @@ -extern crate regex; - use std::char; use std::env; use std::fs::File; use std::io::Write; use std::path::Path; -use regex::Regex; // Case folding a single code point can give up to this many code points. const MAX_FOLDED_CODE_POINTS: usize = 3; @@ -13,17 +10,12 @@ const MAX_FOLDED_CODE_POINTS: usize = 3; fn main() { let mut lines = include_str!("../CaseFolding.txt").lines(); let first_line = lines.next().unwrap(); - let version_regex = Regex::new(r"^# CaseFolding-(\d+)\.(\d+)\.(\d+).txt$").unwrap(); - let unicode_version = &version_regex.captures(first_line).unwrap(); - let (major, minor, patch): (u64, u64, u64) = ( - unicode_version[1].parse().unwrap(), - unicode_version[2].parse().unwrap(), - unicode_version[3].parse().unwrap(), - ); + let (major, minor, patch) = parse_version(first_line); let dst = Path::new(&env::var("OUT_DIR").unwrap()).join("case_folding_data.rs"); let f = &mut File::create(&dst).unwrap(); + // Shorthand for `write!(f, ...).unwrap()` macro_rules! 
w { ($($args: tt)+) => { (write!(f, $($args)+)).unwrap(); } }; @@ -31,18 +23,16 @@ fn main() { w!("pub const UNICODE_VERSION: (u64, u64, u64) = ({}, {}, {});\n", major, minor, patch); w!("const CASE_FOLDING_TABLE: &'static [(char, [char; 3])] = &[\n"); - // Entry with C (common case folding) or F (full case folding) status - let c_or_f_entry = Regex::new(r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);").unwrap(); - for line in lines { - if let Some(captures) = c_or_f_entry.captures(line) { - let from = &captures[1]; - let to = captures[2].split(' ').map(hex_to_escaped).collect::<Vec<_>>(); + // Parse line. Skip if line is empty (or only comment). Skip if status is not F or C + if let Some((from, _, to)) = parse_line(line).filter(status_is_f_or_c) { assert!(to.len() <= MAX_FOLDED_CODE_POINTS); let blanks = MAX_FOLDED_CODE_POINTS - to.len(); + + // Write line let mut to = to.into_iter(); let first_to = to.next().unwrap(); - w!(" ('{}', ['{}'", hex_to_escaped(from), first_to); + w!(" ('{}', ['{}'", from, first_to); for c in to { w!(", '{}'", c); } @@ -55,9 +45,59 @@ fn main() { w!("];\n"); } - fn hex_to_escaped(hex: &str) -> String { let c = u32::from_str_radix(hex, 16).unwrap(); assert!(c != 0); char::from_u32(c).unwrap().escape_default().collect() } + +fn parse_version(first_line: &str) -> (u64, u64, u64) { + let (prefix, rest) = first_line.split_at(14); + assert_eq!(prefix, "# CaseFolding-"); + + let (rest, suffix) = rest.split_at(rest.len() - 4); + assert_eq!(suffix, ".txt"); + + let unicode_version: Vec<&str> = rest.split('.').collect(); + assert_eq!(unicode_version.len(), 3); + assert!(unicode_version + .iter() + .all(|part| part.chars().all(|c| c.is_ascii_digit()))); + + let (major, minor, patch): (u64, u64, u64) = ( + unicode_version[0].parse().unwrap(), + unicode_version[1].parse().unwrap(), + unicode_version[2].parse().unwrap(), + ); + + (major, minor, patch) +} + +fn parse_line(line: &str) -> Option<(String, char, Vec<String>)> { + // Handle comments: find content before the first # 
char (or whole line if there is no # char) + let pre_comment = if line.contains('#') { + line.split_once('#').unwrap().0 + } else { + line + }; + + // Skip line if non-comment content is empty + if pre_comment.is_empty() { + return None; + } + + let parts: Vec<&str> = pre_comment.split("; ").collect(); + assert!(parts.len() == 4); + assert!(["C", "F", "S", "T"].contains(&parts[1])); + assert!(parts[3] == ""); + + let from = hex_to_escaped(parts[0]); + let status = parts[1].chars().next().unwrap(); + let to = parts[2].split(' ').map(hex_to_escaped).collect::<Vec<_>>(); + + return Some((from, status, to)); +} + +fn status_is_f_or_c((_to, status, _from): &(String, char, Vec<String>)) -> bool { + *status == 'F' || *status == 'C' +} From 6e65c74805eb5498d9518720ce5a9e532d7d1f8d Mon Sep 17 00:00:00 2001 From: Nico Burns Date: Tue, 17 Dec 2024 06:27:43 +1300 Subject: [PATCH 2/2] Bump version to 0.2.2 Signed-off-by: Nico Burns --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 77e10c5..f41ca57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "caseless" -version = "0.2.1" +version = "0.2.2" authors = ["Simon Sapin "] description = "Unicode caseless matching" repository = "https://github.com/unicode-rs/rust-caseless"