@@ -95,65 +95,32 @@ cfg_match! {
9595                if  multibyte_mask == 0  { 
9696                    assert!( intra_chunk_offset == 0 ) ; 
9797
98-                     // Check if there are any control characters in the chunk. All 
99-                     // control characters that we can encounter at this point have a 
100-                     // byte value less than 32 or ... 
101-                     let  control_char_test0 = unsafe  {  _mm_cmplt_epi8( chunk,  _mm_set1_epi8( 32 ) )  } ; 
102-                     let  control_char_mask0 = unsafe  {  _mm_movemask_epi8( control_char_test0)  } ; 
103- 
104-                     // ... it's the ASCII 'DEL' character with a value of 127. 
105-                     let  control_char_test1 = unsafe  {  _mm_cmpeq_epi8( chunk,  _mm_set1_epi8( 127 ) )  } ; 
106-                     let  control_char_mask1 = unsafe  {  _mm_movemask_epi8( control_char_test1)  } ; 
107- 
108-                     let  control_char_mask = control_char_mask0 | control_char_mask1; 
109- 
110-                     if  control_char_mask != 0  { 
111-                         // Check for newlines in the chunk 
112-                         let  newlines_test = unsafe  {  _mm_cmpeq_epi8( chunk,  _mm_set1_epi8( b'\n'  as  i8 ) )  } ; 
113-                         let  newlines_mask = unsafe  {  _mm_movemask_epi8( newlines_test)  } ; 
114- 
115-                         if  control_char_mask == newlines_mask { 
116-                             // All control characters are newlines, record them 
117-                             let  mut  newlines_mask = 0xFFFF0000  | newlines_mask as  u32 ; 
118-                             let  output_offset = RelativeBytePos :: from_usize( chunk_index *  CHUNK_SIZE  + 1 ) ; 
119- 
120-                             loop  { 
121-                                 let  index = newlines_mask. trailing_zeros( ) ; 
122- 
123-                                 if  index >= CHUNK_SIZE  as  u32  { 
124-                                     // We have arrived at the end of the chunk. 
125-                                     break ; 
126-                                 } 
127- 
128-                                 lines. push( RelativeBytePos ( index)  + output_offset) ; 
129- 
130-                                 // Clear the bit, so we can find the next one. 
131-                                 newlines_mask &= ( !1 )  << index; 
132-                             } 
133- 
134-                             // We are done for this chunk. All control characters were 
135-                             // newlines and we took care of those. 
136-                             continue ; 
137-                         }  else { 
138-                             // Some of the control characters are not newlines, 
139-                             // fall through to the slow path below. 
140-                         } 
141-                     }  else { 
142-                         // No control characters, nothing to record for this chunk 
143-                         continue ; 
98+                     // Check for newlines in the chunk 
99+                     let  newlines_test = unsafe  {  _mm_cmpeq_epi8( chunk,  _mm_set1_epi8( b'\n'  as  i8 ) )  } ; 
100+                     let  mut  newlines_mask = unsafe  {  _mm_movemask_epi8( newlines_test)  } ; 
101+ 
102+                     let  output_offset = RelativeBytePos :: from_usize( chunk_index *  CHUNK_SIZE  + 1 ) ; 
103+ 
104+                     while  newlines_mask != 0  { 
105+                         let  index = newlines_mask. trailing_zeros( ) ; 
106+ 
107+                         lines. push( RelativeBytePos ( index)  + output_offset) ; 
108+ 
109+                         // Clear the bit, so we can find the next one. 
110+                         newlines_mask &= newlines_mask - 1 ; 
144111                    } 
112+                 }  else { 
113+                     // The slow path. 
114+                     // There are multibyte chars in here, fallback to generic decoding. 
115+                     let  scan_start = chunk_index *  CHUNK_SIZE  + intra_chunk_offset; 
116+                     intra_chunk_offset = analyze_source_file_generic( 
117+                         & src[ scan_start..] , 
118+                         CHUNK_SIZE  - intra_chunk_offset, 
119+                         RelativeBytePos :: from_usize( scan_start) , 
120+                         lines, 
121+                         multi_byte_chars, 
122+                     ) ; 
145123                } 
146- 
147-                 // The slow path. 
148-                 // There are control chars in here, fallback to generic decoding. 
149-                 let  scan_start = chunk_index *  CHUNK_SIZE  + intra_chunk_offset; 
150-                 intra_chunk_offset = analyze_source_file_generic( 
151-                     & src[ scan_start..] , 
152-                     CHUNK_SIZE  - intra_chunk_offset, 
153-                     RelativeBytePos :: from_usize( scan_start) , 
154-                     lines, 
155-                     multi_byte_chars, 
156-                 ) ; 
157124            } 
158125
159126            // There might still be a tail left to analyze 
@@ -253,65 +220,32 @@ cfg_match! {
253220                if  multibyte_mask == 0  { 
254221                    assert!( intra_chunk_offset == 0 ) ; 
255222
256-                     // Check if there are any control characters in the chunk. All 
257-                     // control characters that we can encounter at this point have a 
258-                     // byte value less than 32 or ... 
259-                     let  control_char_test0 = unsafe  {  _mm_cmplt_epi8( chunk,  _mm_set1_epi8( 32 ) )  } ; 
260-                     let  control_char_mask0 = unsafe  {  _mm_movemask_epi8( control_char_test0)  } ; 
261- 
262-                     // ... it's the ASCII 'DEL' character with a value of 127. 
263-                     let  control_char_test1 = unsafe  {  _mm_cmpeq_epi8( chunk,  _mm_set1_epi8( 127 ) )  } ; 
264-                     let  control_char_mask1 = unsafe  {  _mm_movemask_epi8( control_char_test1)  } ; 
265- 
266-                     let  control_char_mask = control_char_mask0 | control_char_mask1; 
267- 
268-                     if  control_char_mask != 0  { 
269-                         // Check for newlines in the chunk 
270-                         let  newlines_test = unsafe  {  _mm_cmpeq_epi8( chunk,  _mm_set1_epi8( b'\n'  as  i8 ) )  } ; 
271-                         let  newlines_mask = unsafe  {  _mm_movemask_epi8( newlines_test)  } ; 
272- 
273-                         if  control_char_mask == newlines_mask { 
274-                             // All control characters are newlines, record them 
275-                             let  mut  newlines_mask = 0xFFFF0000  | newlines_mask as  u32 ; 
276-                             let  output_offset = RelativeBytePos :: from_usize( chunk_index *  CHUNK_SIZE  + 1 ) ; 
277- 
278-                             loop  { 
279-                                 let  index = newlines_mask. trailing_zeros( ) ; 
280- 
281-                                 if  index >= CHUNK_SIZE  as  u32  { 
282-                                     // We have arrived at the end of the chunk. 
283-                                     break ; 
284-                                 } 
285- 
286-                                 lines. push( RelativeBytePos ( index)  + output_offset) ; 
287- 
288-                                 // Clear the bit, so we can find the next one. 
289-                                 newlines_mask &= ( !1 )  << index; 
290-                             } 
291- 
292-                             // We are done for this chunk. All control characters were 
293-                             // newlines and we took care of those. 
294-                             continue ; 
295-                         }  else { 
296-                             // Some of the control characters are not newlines, 
297-                             // fall through to the slow path below. 
298-                         } 
299-                     }  else { 
300-                         // No control characters, nothing to record for this chunk 
301-                         continue ; 
223+                     // Check for newlines in the chunk 
224+                     let  newlines_test = unsafe  {  _mm_cmpeq_epi8( chunk,  _mm_set1_epi8( b'\n'  as  i8 ) )  } ; 
225+                     let  mut  newlines_mask = unsafe  {  _mm_movemask_epi8( newlines_test)  } ; 
226+ 
227+                     let  output_offset = RelativeBytePos :: from_usize( chunk_index *  CHUNK_SIZE  + 1 ) ; 
228+ 
229+                     while  newlines_mask != 0  { 
230+                         let  index = newlines_mask. trailing_zeros( ) ; 
231+ 
232+                         lines. push( RelativeBytePos ( index)  + output_offset) ; 
233+ 
234+                         // Clear the bit, so we can find the next one. 
235+                         newlines_mask &= newlines_mask - 1 ; 
302236                    } 
237+                 }  else { 
238+                     // The slow path. 
239+                     // There are multibyte chars in here, fallback to generic decoding. 
240+                     let  scan_start = chunk_index *  CHUNK_SIZE  + intra_chunk_offset; 
241+                     intra_chunk_offset = analyze_source_file_generic( 
242+                         & src[ scan_start..] , 
243+                         CHUNK_SIZE  - intra_chunk_offset, 
244+                         RelativeBytePos :: from_usize( scan_start) , 
245+                         lines, 
246+                         multi_byte_chars, 
247+                     ) ; 
303248                } 
304- 
305-                 // The slow path. 
306-                 // There are control chars in here, fallback to generic decoding. 
307-                 let  scan_start = chunk_index *  CHUNK_SIZE  + intra_chunk_offset; 
308-                 intra_chunk_offset = analyze_source_file_generic( 
309-                     & src[ scan_start..] , 
310-                     CHUNK_SIZE  - intra_chunk_offset, 
311-                     RelativeBytePos :: from_usize( scan_start) , 
312-                     lines, 
313-                     multi_byte_chars, 
314-                 ) ; 
315249            } 
316250
317251            // There might still be a tail left to analyze 
@@ -369,29 +303,18 @@ fn analyze_source_file_generic(
369303        // string. 
370304        let  mut  char_len = 1 ; 
371305
372-         if  byte < 32  { 
373-             // This is an ASCII control character, it could be one of the cases 
374-             // that are interesting to us. 
375- 
306+         if  byte == b'\n'  { 
376307            let  pos = RelativeBytePos :: from_usize ( i)  + output_offset; 
377- 
378-             if  let  b'\n'  = byte { 
379-                 lines. push ( pos + RelativeBytePos ( 1 ) ) ; 
380-             } 
381-         }  else  if  byte >= 127  { 
382-             // The slow path: 
383-             // This is either ASCII control character "DEL" or the beginning of 
384-             // a multibyte char. Just decode to `char`. 
308+             lines. push ( pos + RelativeBytePos ( 1 ) ) ; 
309+         }  else  if  byte >= 128  { 
310+             // This is the beginning of a multibyte char. Just decode to `char`. 
385311            let  c = src[ i..] . chars ( ) . next ( ) . unwrap ( ) ; 
386312            char_len = c. len_utf8 ( ) ; 
387313
388314            let  pos = RelativeBytePos :: from_usize ( i)  + output_offset; 
389- 
390-             if  char_len > 1  { 
391-                 assert ! ( ( 2 ..=4 ) . contains( & char_len) ) ; 
392-                 let  mbc = MultiByteChar  {  pos,  bytes :  char_len as  u8  } ; 
393-                 multi_byte_chars. push ( mbc) ; 
394-             } 
315+             assert ! ( ( 2 ..=4 ) . contains( & char_len) ) ; 
316+             let  mbc = MultiByteChar  {  pos,  bytes :  char_len as  u8  } ; 
317+             multi_byte_chars. push ( mbc) ; 
395318        } 
396319
397320        i += char_len; 
0 commit comments