Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
[package]
name = "caseless"
version = "0.2.1"
version = "0.2.2"
authors = ["Simon Sapin <simon.sapin@exyr.org>"]
description = "Unicode caseless matching"
repository = "https://github.com/unicode-rs/rust-caseless"
license = "MIT"

build = "src/build.rs"

[build-dependencies]
regex = "1.0"

[dependencies]
unicode-normalization = "0.1"
76 changes: 58 additions & 18 deletions src/build.rs
Original file line number Diff line number Diff line change
@@ -1,48 +1,38 @@
extern crate regex;

use std::char;
use std::env;
use std::fs::File;
use std::io::Write;
use std::path::Path;
use regex::Regex;

// Case folding a single code point can give up to this many code points.
const MAX_FOLDED_CODE_POINTS: usize = 3;

fn main() {
let mut lines = include_str!("../CaseFolding.txt").lines();
let first_line = lines.next().unwrap();
let version_regex = Regex::new(r"^# CaseFolding-(\d+)\.(\d+)\.(\d+).txt$").unwrap();
let unicode_version = &version_regex.captures(first_line).unwrap();
let (major, minor, patch): (u64, u64, u64) = (
unicode_version[1].parse().unwrap(),
unicode_version[2].parse().unwrap(),
unicode_version[3].parse().unwrap(),
);
let (major, minor, patch) = parse_version(first_line);

let dst = Path::new(&env::var("OUT_DIR").unwrap()).join("case_folding_data.rs");
let f = &mut File::create(&dst).unwrap();

// Shorthand for `write!(f, ...).unwrap()`
macro_rules! w {
($($args: tt)+) => { (write!(f, $($args)+)).unwrap(); }
};

w!("pub const UNICODE_VERSION: (u64, u64, u64) = ({}, {}, {});\n", major, minor, patch);
w!("const CASE_FOLDING_TABLE: &'static [(char, [char; 3])] = &[\n");

// Entry with C (common case folding) or F (full case folding) status
let c_or_f_entry = Regex::new(r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);").unwrap();

for line in lines {
if let Some(captures) = c_or_f_entry.captures(line) {
let from = &captures[1];
let to = captures[2].split(' ').map(hex_to_escaped).collect::<Vec<_>>();
// Parse line. Skip if line is empty (or only comment). Skip if status is not F or C
if let Some((from, _, to)) = parse_line(line).filter(status_is_f_or_c) {
assert!(to.len() <= MAX_FOLDED_CODE_POINTS);
let blanks = MAX_FOLDED_CODE_POINTS - to.len();

// Write line
let mut to = to.into_iter();
let first_to = to.next().unwrap();
w!(" ('{}', ['{}'", hex_to_escaped(from), first_to);
w!(" ('{}', ['{}'", from, first_to);
for c in to {
w!(", '{}'", c);
}
Expand All @@ -55,9 +45,59 @@ fn main() {
w!("];\n");
}


/// Converts a hexadecimal code point (e.g. `"0041"`) into the text that can be
/// placed between single quotes to form a Rust `char` literal for it.
///
/// # Panics
///
/// Panics if `hex` is not valid base-16, if it denotes code point zero, or if
/// it is not a valid Unicode scalar value.
fn hex_to_escaped(hex: &str) -> String {
    let code_point = u32::from_str_radix(hex, 16).unwrap();
    // U+0000 is rejected outright rather than being emitted into the output.
    assert!(code_point != 0);
    let decoded = char::from_u32(code_point).unwrap();
    // `escape_default` leaves printable ASCII alone (except quotes and
    // backslash) and renders everything else as an escape sequence.
    decoded.escape_default().to_string()
}

/// Extracts the Unicode version from the header line of `CaseFolding.txt`.
///
/// The line must have the exact shape
/// `# CaseFolding-<major>.<minor>.<patch>.txt`, where each component is a
/// non-empty run of ASCII digits. Returns `(major, minor, patch)`.
///
/// # Panics
///
/// Panics with a descriptive message if the prefix or suffix is missing, if
/// there are not exactly three dot-separated components, if a component is
/// empty or contains a non-digit, or if a component does not fit in a `u64`.
fn parse_version(first_line: &str) -> (u64, u64, u64) {
    // `strip_prefix` / `strip_suffix` cannot slice out of bounds or inside a
    // multi-byte character, so malformed input reaches the messages below
    // instead of an opaque slicing or arithmetic-overflow panic.
    let rest = first_line
        .strip_prefix("# CaseFolding-")
        .expect("first line must start with `# CaseFolding-`");
    let rest = rest
        .strip_suffix(".txt")
        .expect("first line must end with `.txt`");

    let unicode_version: Vec<&str> = rest.split('.').collect();
    assert_eq!(
        unicode_version.len(),
        3,
        "version must have exactly three dot-separated components"
    );

    // Check digits explicitly: `str::parse::<u64>` alone would also accept a
    // leading `+`, which is not part of the expected file-name format.
    let component = |part: &str| -> u64 {
        assert!(
            !part.is_empty() && part.chars().all(|c| c.is_ascii_digit()),
            "version component {:?} must be a non-empty run of ASCII digits",
            part
        );
        part.parse().expect("version component must fit in a u64")
    };

    (
        component(unicode_version[0]),
        component(unicode_version[1]),
        component(unicode_version[2]),
    )
}

/// Parses one line of `CaseFolding.txt` into `(from, status, to)`.
///
/// Everything from the first `#` onwards is treated as a comment and ignored.
/// Returns `None` when nothing but whitespace remains. Otherwise the remaining
/// content must consist of three fields, each terminated by `"; "`:
/// a hexadecimal code point, a status letter (`C`, `F`, `S` or `T`), and a
/// space-separated list of hexadecimal code points.
///
/// `from` and every element of `to` are passed through `hex_to_escaped`;
/// `status` is returned as a single `char`.
///
/// # Panics
///
/// Panics if the non-comment content does not have the shape described above,
/// or if `hex_to_escaped` rejects one of the code points.
fn parse_line(line: &str) -> Option<(String, char, Vec<String>)> {
    // Handle comments: keep the content before the first `#` char (or the
    // whole line if there is no `#` char).
    let pre_comment = line.split_once('#').map_or(line, |(content, _)| content);

    // Skip the line if the non-comment content is blank. Trimming only for
    // this check lets a whitespace-only line be skipped while leaving the
    // field layout below untouched.
    if pre_comment.trim().is_empty() {
        return None;
    }

    // Every field ends with "; ", so a well-formed line splits into the three
    // fields followed by one trailing empty piece.
    let parts: Vec<&str> = pre_comment.split("; ").collect();
    assert!(parts.len() == 4);
    assert!(["C", "F", "S", "T"].contains(&parts[1]));
    assert!(parts[3] == "");

    let from = hex_to_escaped(parts[0]);
    // The membership check above guarantees exactly one character here.
    let status = parts[1].chars().next().unwrap();
    let to = parts[2].split(' ').map(hex_to_escaped).collect::<Vec<_>>();

    Some((from, status, to))
}

/// Predicate over a parsed `(from, status, to)` entry: true when the status
/// (the middle element) is `'F'` or `'C'`, false for any other character.
fn status_is_f_or_c(entry: &(String, char, Vec<String>)) -> bool {
    matches!(entry.1, 'C' | 'F')
}