@@ -1107,112 +1107,99 @@ fn try_flag_from_char(ch: char) -> core::result::Result<Flag, ParseFlagError> {
1107
1107
try_flag_from_u32 ( ch as u32 )
1108
1108
}
1109
1109
1110
+ // See Hunspell's `HashMgr::decode_flag`.
1110
1111
fn parse_flag_from_str (
1111
1112
flag_type : FlagType ,
1112
1113
input : & str ,
1113
1114
) -> core:: result:: Result < Flag , ParseFlagError > {
1114
1115
use ParseFlagError :: * ;
1115
1116
assert ! ( !input. is_empty( ) ) ;
1116
1117
1117
- match flag_type {
1118
+ let u16 = match flag_type {
1118
1119
FlagType :: Short => {
1119
- let mut chars = input. chars ( ) ;
1120
- let ch = chars. next ( ) . expect ( "asserted to be non-empty above" ) ;
1121
- if ch. is_ascii ( ) {
1122
- // The flag is ASCII: it's a valid `u8` so it can fit into a `u16`.
1123
- try_flag_from_u16 ( ch as u16 )
1124
- } else {
1125
- Err ( NonAscii ( ch) )
1126
- }
1120
+ // NOTE: Hunspell and Nuspell do not validate that the flag is ASCII and will accept
1121
+ // (and work correctly, for the most part) non-ASCII flags.
1122
+ // Also see <https://github.com/helix-editor/spellbook/issues/8>.
1123
+ input. as_bytes ( ) [ 0 ] as u16
1127
1124
}
1128
1125
FlagType :: Long => {
1129
- let mut chars = input. chars ( ) ;
1130
- let c1 = chars. next ( ) . expect ( "asserted to be non-empty above" ) ;
1131
- if !c1. is_ascii ( ) {
1132
- return Err ( NonAscii ( c1) ) ;
1133
- }
1134
- let c2 = match chars. next ( ) {
1135
- Some ( ch) => ch,
1136
- None => return Err ( MissingSecondChar ( c1) ) ,
1137
- } ;
1138
- if !c2. is_ascii ( ) {
1139
- return Err ( NonAscii ( c2) ) ;
1126
+ // Same here: the first two bytes are taken and everything else is ignored.
1127
+ let bytes = input. as_bytes ( ) ;
1128
+ if bytes. len ( ) < 2 {
1129
+ return Err ( MissingSecondChar ) ;
1140
1130
}
1141
-
1142
- try_flag_from_u16 ( u16:: from_ne_bytes ( [ c1 as u8 , c2 as u8 ] ) )
1143
- }
1144
- FlagType :: Numeric => {
1145
- let number = input. parse :: < u16 > ( ) . map_err ( ParseIntError ) ?;
1146
- try_flag_from_u16 ( number)
1131
+ u16:: from_ne_bytes ( [ bytes[ 0 ] , bytes[ 1 ] ] )
1147
1132
}
1133
+ FlagType :: Numeric => input. parse :: < u16 > ( ) . map_err ( ParseIntError ) ?,
1148
1134
FlagType :: Utf8 => {
1149
1135
// A u16 is not large enough to fit any Unicode scalar. Nuspell rejects scalars with
1150
1136
// codepoint values above `u16::MAX` but Hunspell accepts them. Hunspell converts the
1151
1137
// input string into UTF-16 and then takes the first u16.
1152
- let u16 = input
1138
+ input
1153
1139
. encode_utf16 ( )
1154
1140
. next ( )
1155
- . expect ( "asserted to be non-empty above" ) ;
1156
- try_flag_from_u16 ( u16)
1141
+ . expect ( "asserted to be non-empty above" )
1157
1142
}
1158
- }
1143
+ } ;
1144
+ try_flag_from_u16 ( u16)
1159
1145
}
1160
1146
1147
+ // See Hunspell's `HashMgr::decode_flags`.
1161
1148
fn parse_flags_from_str (
1162
1149
flag_type : FlagType ,
1163
1150
input : & str ,
1164
1151
) -> core:: result:: Result < FlagSet , ParseFlagError > {
1165
1152
use ParseFlagError :: * ;
1153
+ // See above notes about Unicode handling in `parse_flag_from_str`. Handling bytes here may
1154
+ // cause the flags to handle Unicode poorly leading to "collisions" between two distinct
1155
+ // Unicode characters.
1156
+
1157
+ if input. is_empty ( ) {
1158
+ return Ok ( FlagSet :: default ( ) ) ;
1159
+ }
1166
1160
1167
1161
match flag_type {
1168
1162
FlagType :: Short => {
1169
1163
let flagset = input
1170
- . chars ( )
1171
- . map ( |ch| {
1172
- if ch. is_ascii ( ) {
1173
- // The flag is ASCII: it's a valid `u8` so it can fit into a `u16`.
1174
- try_flag_from_u16 ( ch as u16 )
1175
- } else {
1176
- Err ( ParseFlagError :: NonAscii ( ch) )
1177
- }
1178
- } )
1164
+ . bytes ( )
1165
+ . map ( |b| try_flag_from_u16 ( b as u16 ) )
1179
1166
. collect :: < core:: result:: Result < Vec < Flag > , _ > > ( ) ?;
1180
1167
Ok ( flagset. into ( ) )
1181
1168
}
1182
1169
FlagType :: Long => {
1183
- let mut chars = input. chars ( ) ;
1184
- let mut flags = Vec :: new ( ) ;
1185
- while let Some ( c1) = chars. next ( ) {
1186
- let c2 = match chars. next ( ) {
1187
- Some ( ch) => ch,
1188
- None => return Err ( MissingSecondChar ( c1) ) ,
1189
- } ;
1190
- let flag = try_flag_from_u16 ( u16:: from_ne_bytes ( [ c1 as u8 , c2 as u8 ] ) ) ?;
1191
- flags. push ( flag) ;
1170
+ let bytes = input. as_bytes ( ) ;
1171
+ let mut len = bytes. len ( ) ;
1172
+ if ( len & 1 ) == 1 {
1173
+ return Err ( MissingSecondChar ) ;
1174
+ }
1175
+ len >>= 1 ;
1176
+ let mut flags = Vec :: with_capacity ( len) ;
1177
+ for i in 0 ..len {
1178
+ let u16 = u16:: from_ne_bytes ( [ bytes[ i << 1 ] , bytes[ ( i << 1 ) | 1 ] ] ) ;
1179
+ flags. push ( try_flag_from_u16 ( u16) ?) ;
1192
1180
}
1193
1181
Ok ( flags. into ( ) )
1194
1182
}
1195
1183
FlagType :: Numeric => {
1196
1184
let mut flags = Vec :: new ( ) ;
1197
- let mut number = String :: new ( ) ;
1198
- let mut separated = false ;
1199
- for ch in input. chars ( ) {
1200
- if ch == ',' {
1201
- if separated {
1202
- return Err ( DuplicateComma ) ;
1203
- }
1204
- let n = number. parse :: < u16 > ( ) . map_err ( ParseIntError ) ?;
1205
- flags. push ( try_flag_from_u16 ( n) ?) ;
1206
- number. clear ( ) ;
1207
- } else {
1208
- number. push ( ch) ;
1185
+ let mut start = 0 ;
1186
+ while let Some ( idx) = input[ start..] . find ( ',' ) . map ( |i| i + start) {
1187
+ if input. as_bytes ( ) . get ( idx + 1 ) . copied ( ) == Some ( b',' ) {
1188
+ return Err ( DuplicateComma ) ;
1209
1189
}
1210
- separated = ch == ',' ;
1211
- }
1212
- if !number. is_empty ( ) {
1213
- let n = number. parse :: < u16 > ( ) . map_err ( ParseIntError ) ?;
1214
- flags. push ( try_flag_from_u16 ( n) ?) ;
1190
+ let flag = input[ start..idx]
1191
+ . parse :: < u16 > ( )
1192
+ . map_err ( ParseIntError )
1193
+ . and_then ( try_flag_from_u16) ?;
1194
+ flags. push ( flag) ;
1195
+ start = idx + 1 ;
1215
1196
}
1197
+ let flag = input[ start..]
1198
+ . parse :: < u16 > ( )
1199
+ . map_err ( ParseIntError )
1200
+ . and_then ( try_flag_from_u16) ?;
1201
+ flags. push ( flag) ;
1202
+
1216
1203
Ok ( flags. into ( ) )
1217
1204
}
1218
1205
FlagType :: Utf8 => {
@@ -1693,7 +1680,7 @@ impl fmt::Display for UnknownFlagTypeError {
1693
1680
#[ derive( Debug , PartialEq , Eq , Clone ) ]
1694
1681
pub enum ParseFlagError {
1695
1682
NonAscii ( char ) ,
1696
- MissingSecondChar ( char ) ,
1683
+ MissingSecondChar ,
1697
1684
ParseIntError ( core:: num:: ParseIntError ) ,
1698
1685
DuplicateComma ,
1699
1686
ZeroFlag ,
@@ -1704,9 +1691,7 @@ impl fmt::Display for ParseFlagError {
1704
1691
fn fmt ( & self , f : & mut fmt:: Formatter < ' _ > ) -> fmt:: Result {
1705
1692
match self {
1706
1693
Self :: NonAscii ( ch) => write ! ( f, "expected ascii char, found {}" , ch) ,
1707
- Self :: MissingSecondChar ( ch) => {
1708
- write ! ( f, "expected two chars, {} is missing its second" , ch)
1709
- }
1694
+ Self :: MissingSecondChar => f. write_str ( "expected two chars, found one" ) ,
1710
1695
Self :: ParseIntError ( err) => err. fmt ( f) ,
1711
1696
Self :: DuplicateComma => f. write_str ( "unexpected extra comma" ) ,
1712
1697
Self :: ZeroFlag => f. write_str ( "flag cannot be zero" ) ,
@@ -1847,6 +1832,14 @@ mod test {
1847
1832
Ok ( flag!( 'a' as u16 ) ) ,
1848
1833
parse_flag_from_str( FlagType :: Short , "a" )
1849
1834
) ;
1835
+ // Non-ASCII is accepted for short flags too. Only the first byte is taken.
1836
+ assert_eq ! ( Ok ( flag!( 195 ) ) , parse_flag_from_str( FlagType :: Short , "À" ) ) ;
1837
+ // As below, this can cause "collisions."
1838
+ // Here both characters share the same first byte (`0xC3`).
1839
+ assert_eq ! (
1840
+ parse_flag_from_str( FlagType :: Short , "À" ) ,
1841
+ parse_flag_from_str( FlagType :: Short , "Ã" )
1842
+ ) ;
1850
1843
assert_eq ! ( Ok ( flag!( 1 ) ) , parse_flag_from_str( FlagType :: Numeric , "1" ) ) ;
1851
1844
1852
1845
// U+1F52D '🔭' is four bytes in UTF8 and two code units in UTF-16. Nuspell rejects flags
@@ -1872,6 +1865,10 @@ mod test {
1872
1865
Ok ( flagset![ 214 , 216 , 54321 ] ) ,
1873
1866
parse_flags_from_str( FlagType :: Numeric , "214,216,54321" )
1874
1867
) ;
1868
+ assert_eq ! (
1869
+ Err ( ParseFlagError :: DuplicateComma ) ,
1870
+ parse_flags_from_str( FlagType :: Numeric , "123,,789" )
1871
+ ) ;
1875
1872
}
1876
1873
1877
1874
#[ test]
0 commit comments