@@ -95,65 +95,32 @@ cfg_match! {
9595 if multibyte_mask == 0 {
9696 assert!( intra_chunk_offset == 0 ) ;
9797
98- // Check if there are any control characters in the chunk. All
99- // control characters that we can encounter at this point have a
100- // byte value less than 32 or ...
101- let control_char_test0 = unsafe { _mm_cmplt_epi8( chunk, _mm_set1_epi8( 32 ) ) } ;
102- let control_char_mask0 = unsafe { _mm_movemask_epi8( control_char_test0) } ;
103-
104- // ... it's the ASCII 'DEL' character with a value of 127.
105- let control_char_test1 = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( 127 ) ) } ;
106- let control_char_mask1 = unsafe { _mm_movemask_epi8( control_char_test1) } ;
107-
108- let control_char_mask = control_char_mask0 | control_char_mask1;
109-
110- if control_char_mask != 0 {
111- // Check for newlines in the chunk
112- let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
113- let newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
114-
115- if control_char_mask == newlines_mask {
116- // All control characters are newlines, record them
117- let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32 ;
118- let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
119-
120- loop {
121- let index = newlines_mask. trailing_zeros( ) ;
122-
123- if index >= CHUNK_SIZE as u32 {
124- // We have arrived at the end of the chunk.
125- break ;
126- }
127-
128- lines. push( RelativeBytePos ( index) + output_offset) ;
129-
130- // Clear the bit, so we can find the next one.
131- newlines_mask &= ( !1 ) << index;
132- }
133-
134- // We are done for this chunk. All control characters were
135- // newlines and we took care of those.
136- continue ;
137- } else {
138- // Some of the control characters are not newlines,
139- // fall through to the slow path below.
140- }
141- } else {
142- // No control characters, nothing to record for this chunk
143- continue ;
98+ // Check for newlines in the chunk
99+ let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
100+ let mut newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
101+
102+ let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
103+
104+ while newlines_mask != 0 {
105+ let index = newlines_mask. trailing_zeros( ) ;
106+
107+ lines. push( RelativeBytePos ( index) + output_offset) ;
108+
109+ // Clear the bit, so we can find the next one.
110+ newlines_mask &= newlines_mask - 1 ;
144111 }
112+ } else {
113+ // The slow path.
114+ // There are multibyte chars in here, fall back to generic decoding.
115+ let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
116+ intra_chunk_offset = analyze_source_file_generic(
117+ & src[ scan_start..] ,
118+ CHUNK_SIZE - intra_chunk_offset,
119+ RelativeBytePos :: from_usize( scan_start) ,
120+ lines,
121+ multi_byte_chars,
122+ ) ;
145123 }
146-
147- // The slow path.
148- // There are control chars in here, fallback to generic decoding.
149- let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
150- intra_chunk_offset = analyze_source_file_generic(
151- & src[ scan_start..] ,
152- CHUNK_SIZE - intra_chunk_offset,
153- RelativeBytePos :: from_usize( scan_start) ,
154- lines,
155- multi_byte_chars,
156- ) ;
157124 }
158125
159126 // There might still be a tail left to analyze
@@ -253,65 +220,32 @@ cfg_match! {
253220 if multibyte_mask == 0 {
254221 assert!( intra_chunk_offset == 0 ) ;
255222
256- // Check if there are any control characters in the chunk. All
257- // control characters that we can encounter at this point have a
258- // byte value less than 32 or ...
259- let control_char_test0 = unsafe { _mm_cmplt_epi8( chunk, _mm_set1_epi8( 32 ) ) } ;
260- let control_char_mask0 = unsafe { _mm_movemask_epi8( control_char_test0) } ;
261-
262- // ... it's the ASCII 'DEL' character with a value of 127.
263- let control_char_test1 = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( 127 ) ) } ;
264- let control_char_mask1 = unsafe { _mm_movemask_epi8( control_char_test1) } ;
265-
266- let control_char_mask = control_char_mask0 | control_char_mask1;
267-
268- if control_char_mask != 0 {
269- // Check for newlines in the chunk
270- let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
271- let newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
272-
273- if control_char_mask == newlines_mask {
274- // All control characters are newlines, record them
275- let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32 ;
276- let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
277-
278- loop {
279- let index = newlines_mask. trailing_zeros( ) ;
280-
281- if index >= CHUNK_SIZE as u32 {
282- // We have arrived at the end of the chunk.
283- break ;
284- }
285-
286- lines. push( RelativeBytePos ( index) + output_offset) ;
287-
288- // Clear the bit, so we can find the next one.
289- newlines_mask &= ( !1 ) << index;
290- }
291-
292- // We are done for this chunk. All control characters were
293- // newlines and we took care of those.
294- continue ;
295- } else {
296- // Some of the control characters are not newlines,
297- // fall through to the slow path below.
298- }
299- } else {
300- // No control characters, nothing to record for this chunk
301- continue ;
223+ // Check for newlines in the chunk
224+ let newlines_test = unsafe { _mm_cmpeq_epi8( chunk, _mm_set1_epi8( b'\n' as i8 ) ) } ;
225+ let mut newlines_mask = unsafe { _mm_movemask_epi8( newlines_test) } ;
226+
227+ let output_offset = RelativeBytePos :: from_usize( chunk_index * CHUNK_SIZE + 1 ) ;
228+
229+ while newlines_mask != 0 {
230+ let index = newlines_mask. trailing_zeros( ) ;
231+
232+ lines. push( RelativeBytePos ( index) + output_offset) ;
233+
234+ // Clear the bit, so we can find the next one.
235+ newlines_mask &= newlines_mask - 1 ;
302236 }
237+ } else {
238+ // The slow path.
239+ // There are multibyte chars in here, fall back to generic decoding.
240+ let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
241+ intra_chunk_offset = analyze_source_file_generic(
242+ & src[ scan_start..] ,
243+ CHUNK_SIZE - intra_chunk_offset,
244+ RelativeBytePos :: from_usize( scan_start) ,
245+ lines,
246+ multi_byte_chars,
247+ ) ;
303248 }
304-
305- // The slow path.
306- // There are control chars in here, fallback to generic decoding.
307- let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
308- intra_chunk_offset = analyze_source_file_generic(
309- & src[ scan_start..] ,
310- CHUNK_SIZE - intra_chunk_offset,
311- RelativeBytePos :: from_usize( scan_start) ,
312- lines,
313- multi_byte_chars,
314- ) ;
315249 }
316250
317251 // There might still be a tail left to analyze
@@ -369,29 +303,18 @@ fn analyze_source_file_generic(
369303 // string.
370304 let mut char_len = 1 ;
371305
372- if byte < 32 {
373- // This is an ASCII control character, it could be one of the cases
374- // that are interesting to us.
375-
306+ if byte == b'\n' {
376307 let pos = RelativeBytePos :: from_usize ( i) + output_offset;
377-
378- if let b'\n' = byte {
379- lines. push ( pos + RelativeBytePos ( 1 ) ) ;
380- }
381- } else if byte >= 127 {
382- // The slow path:
383- // This is either ASCII control character "DEL" or the beginning of
384- // a multibyte char. Just decode to `char`.
308+ lines. push ( pos + RelativeBytePos ( 1 ) ) ;
309+ } else if byte >= 128 {
310+ // This is the beginning of a multibyte char. Just decode to `char`.
385311 let c = src[ i..] . chars ( ) . next ( ) . unwrap ( ) ;
386312 char_len = c. len_utf8 ( ) ;
387313
388314 let pos = RelativeBytePos :: from_usize ( i) + output_offset;
389-
390- if char_len > 1 {
391- assert ! ( ( 2 ..=4 ) . contains( & char_len) ) ;
392- let mbc = MultiByteChar { pos, bytes : char_len as u8 } ;
393- multi_byte_chars. push ( mbc) ;
394- }
315+ assert ! ( ( 2 ..=4 ) . contains( & char_len) ) ;
316+ let mbc = MultiByteChar { pos, bytes : char_len as u8 } ;
317+ multi_byte_chars. push ( mbc) ;
395318 }
396319
397320 i += char_len;
0 commit comments