Skip to content

Commit 6d54954

Browse files
committed
Align flag parsing with Hunspell's HashMgr
Hunspell does not reject Unicode when the flag type is not `UTF-8`. For non `UTF-8` flag types, Hunspell operates in terms of bytes instead of characters and doesn't check whether the values fit the definition of the flag type. For example the short flag type (default) supports a two-byte (in UTF-8 representation) character 'À' by using its first byte and ignoring the rest. This change translates Hunspell's parsing code from its `HashMgr` type directly to ensure we follow the same rules.
1 parent 49603a2 commit 6d54954

File tree

2 files changed

+66
-76
lines changed

2 files changed

+66
-76
lines changed

src/aff/parser.rs

Lines changed: 66 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -1107,112 +1107,99 @@ fn try_flag_from_char(ch: char) -> core::result::Result<Flag, ParseFlagError> {
11071107
try_flag_from_u32(ch as u32)
11081108
}
11091109

1110+
// See Hunspell's `HashMgr::decode_flag`.
11101111
fn parse_flag_from_str(
11111112
flag_type: FlagType,
11121113
input: &str,
11131114
) -> core::result::Result<Flag, ParseFlagError> {
11141115
use ParseFlagError::*;
11151116
assert!(!input.is_empty());
11161117

1117-
match flag_type {
1118+
let u16 = match flag_type {
11181119
FlagType::Short => {
1119-
let mut chars = input.chars();
1120-
let ch = chars.next().expect("asserted to be non-empty above");
1121-
if ch.is_ascii() {
1122-
// The flag is ASCII: it's a valid `u8` so it can fit into a `u16`.
1123-
try_flag_from_u16(ch as u16)
1124-
} else {
1125-
Err(NonAscii(ch))
1126-
}
1120+
// NOTE: Hunspell and Nuspell do not validate that the flag is ASCII and will accept
1121+
// (and work correctly, for the most part) non-ASCII flags.
1122+
// Also see <https://github.com/helix-editor/spellbook/issues/8>.
1123+
input.as_bytes()[0] as u16
11271124
}
11281125
FlagType::Long => {
1129-
let mut chars = input.chars();
1130-
let c1 = chars.next().expect("asserted to be non-empty above");
1131-
if !c1.is_ascii() {
1132-
return Err(NonAscii(c1));
1133-
}
1134-
let c2 = match chars.next() {
1135-
Some(ch) => ch,
1136-
None => return Err(MissingSecondChar(c1)),
1137-
};
1138-
if !c2.is_ascii() {
1139-
return Err(NonAscii(c2));
1126+
// Same here: the first two bytes are taken and everything else is ignored.
1127+
let bytes = input.as_bytes();
1128+
if bytes.len() < 2 {
1129+
return Err(MissingSecondChar);
11401130
}
1141-
1142-
try_flag_from_u16(u16::from_ne_bytes([c1 as u8, c2 as u8]))
1143-
}
1144-
FlagType::Numeric => {
1145-
let number = input.parse::<u16>().map_err(ParseIntError)?;
1146-
try_flag_from_u16(number)
1131+
u16::from_ne_bytes([bytes[0], bytes[1]])
11471132
}
1133+
FlagType::Numeric => input.parse::<u16>().map_err(ParseIntError)?,
11481134
FlagType::Utf8 => {
11491135
// A u16 is not large enough to fit any Unicode scalar. Nuspell rejects scalars with
11501136
// codepoint values above `u16::MAX` but Hunspell accepts them. Hunspell converts the
11511137
// input string into UTF-16 and then takes the first u16.
1152-
let u16 = input
1138+
input
11531139
.encode_utf16()
11541140
.next()
1155-
.expect("asserted to be non-empty above");
1156-
try_flag_from_u16(u16)
1141+
.expect("asserted to be non-empty above")
11571142
}
1158-
}
1143+
};
1144+
try_flag_from_u16(u16)
11591145
}
11601146

1147+
// See Hunspell's `HashMgr::decode_flags`.
11611148
fn parse_flags_from_str(
11621149
flag_type: FlagType,
11631150
input: &str,
11641151
) -> core::result::Result<FlagSet, ParseFlagError> {
11651152
use ParseFlagError::*;
1153+
// See above notes about Unicode handling in `parse_flag_from_str`. Handling bytes here may
1154+
// cause the flags to handle Unicode poorly leading to "collisions" between two distinct
1155+
// Unicode characters.
1156+
1157+
if input.is_empty() {
1158+
return Ok(FlagSet::default());
1159+
}
11661160

11671161
match flag_type {
11681162
FlagType::Short => {
11691163
let flagset = input
1170-
.chars()
1171-
.map(|ch| {
1172-
if ch.is_ascii() {
1173-
// The flag is ASCII: it's a valid `u8` so it can fit into a `u16`.
1174-
try_flag_from_u16(ch as u16)
1175-
} else {
1176-
Err(ParseFlagError::NonAscii(ch))
1177-
}
1178-
})
1164+
.bytes()
1165+
.map(|b| try_flag_from_u16(b as u16))
11791166
.collect::<core::result::Result<Vec<Flag>, _>>()?;
11801167
Ok(flagset.into())
11811168
}
11821169
FlagType::Long => {
1183-
let mut chars = input.chars();
1184-
let mut flags = Vec::new();
1185-
while let Some(c1) = chars.next() {
1186-
let c2 = match chars.next() {
1187-
Some(ch) => ch,
1188-
None => return Err(MissingSecondChar(c1)),
1189-
};
1190-
let flag = try_flag_from_u16(u16::from_ne_bytes([c1 as u8, c2 as u8]))?;
1191-
flags.push(flag);
1170+
let bytes = input.as_bytes();
1171+
let mut len = bytes.len();
1172+
if (len & 1) == 1 {
1173+
return Err(MissingSecondChar);
1174+
}
1175+
len >>= 1;
1176+
let mut flags = Vec::with_capacity(len);
1177+
for i in 0..len {
1178+
let u16 = u16::from_ne_bytes([bytes[i << 1], bytes[(i << 1) | 1]]);
1179+
flags.push(try_flag_from_u16(u16)?);
11921180
}
11931181
Ok(flags.into())
11941182
}
11951183
FlagType::Numeric => {
11961184
let mut flags = Vec::new();
1197-
let mut number = String::new();
1198-
let mut separated = false;
1199-
for ch in input.chars() {
1200-
if ch == ',' {
1201-
if separated {
1202-
return Err(DuplicateComma);
1203-
}
1204-
let n = number.parse::<u16>().map_err(ParseIntError)?;
1205-
flags.push(try_flag_from_u16(n)?);
1206-
number.clear();
1207-
} else {
1208-
number.push(ch);
1185+
let mut start = 0;
1186+
while let Some(idx) = input[start..].find(',').map(|i| i + start) {
1187+
if input.as_bytes().get(idx + 1).copied() == Some(b',') {
1188+
return Err(DuplicateComma);
12091189
}
1210-
separated = ch == ',';
1211-
}
1212-
if !number.is_empty() {
1213-
let n = number.parse::<u16>().map_err(ParseIntError)?;
1214-
flags.push(try_flag_from_u16(n)?);
1190+
let flag = input[start..idx]
1191+
.parse::<u16>()
1192+
.map_err(ParseIntError)
1193+
.and_then(try_flag_from_u16)?;
1194+
flags.push(flag);
1195+
start = idx + 1;
12151196
}
1197+
let flag = input[start..]
1198+
.parse::<u16>()
1199+
.map_err(ParseIntError)
1200+
.and_then(try_flag_from_u16)?;
1201+
flags.push(flag);
1202+
12161203
Ok(flags.into())
12171204
}
12181205
FlagType::Utf8 => {
@@ -1693,7 +1680,7 @@ impl fmt::Display for UnknownFlagTypeError {
16931680
#[derive(Debug, PartialEq, Eq, Clone)]
16941681
pub enum ParseFlagError {
16951682
NonAscii(char),
1696-
MissingSecondChar(char),
1683+
MissingSecondChar,
16971684
ParseIntError(core::num::ParseIntError),
16981685
DuplicateComma,
16991686
ZeroFlag,
@@ -1704,9 +1691,7 @@ impl fmt::Display for ParseFlagError {
17041691
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
17051692
match self {
17061693
Self::NonAscii(ch) => write!(f, "expected ascii char, found {}", ch),
1707-
Self::MissingSecondChar(ch) => {
1708-
write!(f, "expected two chars, {} is missing its second", ch)
1709-
}
1694+
Self::MissingSecondChar => f.write_str("expected two chars, found one"),
17101695
Self::ParseIntError(err) => err.fmt(f),
17111696
Self::DuplicateComma => f.write_str("unexpected extra comma"),
17121697
Self::ZeroFlag => f.write_str("flag cannot be zero"),
@@ -1847,6 +1832,14 @@ mod test {
18471832
Ok(flag!('a' as u16)),
18481833
parse_flag_from_str(FlagType::Short, "a")
18491834
);
1835+
// Non-ASCII is accepted for short flags too. Only the first byte is taken.
1836+
assert_eq!(Ok(flag!(195)), parse_flag_from_str(FlagType::Short, "À"));
1837+
// As below, this can cause "collisions."
1838+
// Here both characters share the same first byte (`0xC3`).
1839+
assert_eq!(
1840+
parse_flag_from_str(FlagType::Short, "À"),
1841+
parse_flag_from_str(FlagType::Short, "Ã")
1842+
);
18501843
assert_eq!(Ok(flag!(1)), parse_flag_from_str(FlagType::Numeric, "1"));
18511844

18521845
// U+1F52D '🔭' is four bytes in UTF8 and two code units in UTF-16. Nuspell rejects flags
@@ -1872,6 +1865,10 @@ mod test {
18721865
Ok(flagset![214, 216, 54321]),
18731866
parse_flags_from_str(FlagType::Numeric, "214,216,54321")
18741867
);
1868+
assert_eq!(
1869+
Err(ParseFlagError::DuplicateComma),
1870+
parse_flags_from_str(FlagType::Numeric, "123,,789")
1871+
);
18751872
}
18761873

18771874
#[test]

src/lib.rs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -234,11 +234,6 @@ impl<S: BuildHasher> Dictionary<S> {
234234
/// dict.add("foobarbaz/G").unwrap();
235235
/// assert!(dict.check("foobarbaz"));
236236
/// assert!(dict.check("foobarbazing"));
237-
///
238-
/// // Adding to a dictionary might fail if the line cannot be parsed. For example, a flag
239-
/// // using a UTF-8 character that takes more than two bytes is not allowed.
240-
/// assert_eq!("😓".len(), 4); // 😓 is 4 bytes in UTF-8: 0xf0 0x9f 0x98 0x93
241-
/// assert!(dict.add("notallowed/😓").is_err());
242237
/// ```
243238
pub fn add(&mut self, input: &str) -> Result<(), ParseFlagError> {
244239
// This impl might be expanded in the future.
@@ -573,8 +568,6 @@ mod test {
573568
dict.add("foobarbaz/G").unwrap();
574569
assert!(dict.check("foobarbaz"));
575570
assert!(dict.check("foobarbazing"));
576-
577-
assert!(dict.add("notallowed/😓").is_err());
578571
}
579572

580573
#[test]

0 commit comments

Comments
 (0)