@@ -156,12 +156,6 @@ extension Unicode.Scalar {
156156 // including and above the DEL character U+7F.
157157 return self . value >= 0x20 && self . value < 0x7F
158158 }
159-
160- var isStartOfUTF8Character : Bool {
161- // RFC 2279: The octet values FE and FF never appear.
162- // RFC 3629: The octet values C0, C1, F5 to FF never appear.
163- return self . value <= 0x80 || ( self . value >= 0xC2 && self . value < 0xF5 )
164- }
165159}
166160
167161extension Unicode . Scalar {
@@ -179,20 +173,25 @@ extension Unicode.Scalar {
179173 return Unicode . Scalar ( curByte)
180174 }
181175
182- // Read the number of high bits set, which indicates the number of bytes in
183- // the character.
184- let encodedBytes = ( ~ ( UInt32 ( curByte) << 24 ) ) . leadingZeroBitCount
185-
186- // If this is 0b10XXXXXX, then it is a continuation character.
187- if encodedBytes == 1 || !Unicode. Scalar ( curByte) . isStartOfUTF8Character {
176+ // If this is not the start of a UTF8 character,
177+ // then it is either a continuation byte or an invalid UTF8 code point.
178+ if !curByte. isStartOfUTF8Character {
188179 // Skip until we get the start of another character. This is guaranteed to
189180 // at least stop at the nul at the end of the buffer.
190- while let peeked = peek ( ) , !Unicode . Scalar ( peeked) . isStartOfUTF8Character {
181+ while let peeked = peek ( ) , !peeked. isStartOfUTF8Character {
191182 _ = advance ( )
192183 }
193184 return nil
194185 }
195186
187+ // Read the number of high bits set, which indicates the number of bytes in
188+ // the character.
189+ let encodedBytes = ( ~ curByte) . leadingZeroBitCount
190+ // We have a multi-byte UTF-8 scalar.
191+ // Single-byte UTF-8 scalars are handled at the start of the function by checking `curByte < 0x80`.
192+ // `isStartOfUTF8Character` guaranteed that the `curByte` has 2 to 4 leading ones.
193+ precondition ( encodedBytes >= 2 && encodedBytes <= 4 )
194+
196195 // Drop the high bits indicating the # bytes of the result.
197196 var charValue = UInt32 ( curByte << encodedBytes) >> encodedBytes
198197
@@ -252,3 +251,11 @@ extension Unicode.Scalar {
252251 return self . lexing ( advance: advance, peek: peek)
253252 }
254253}
254+
255+ extension UInt8 {
  /// Whether this byte may begin a UTF-8 encoded character.
  ///
  /// Returns `true` for values below 0x80 and for values in 0xC2 ..< 0xF5;
  /// returns `false` for every other value, i.e. 0x80 through 0xC1 and
  /// 0xF5 through 0xFF.
256+ var isStartOfUTF8Character : Bool {
257+ // RFC 2279: The octet values FE and FF never appear.
258+ // RFC 3629: The octet values C0, C1, F5 to FF never appear.
  // Bytes 0x80 through 0xBF have the bit pattern 0b10XXXXXX, so they are
  // rejected here as well: the lower bound of the second range is 0xC2 and
  // the first comparison is strictly less than 0x80.
259+ return self < 0x80 || ( self >= 0xC2 && self < 0xF5 )
260+ }
261+ }
0 commit comments