@@ -18,7 +18,65 @@ extern size_t Stringrchr(char *str,char ch, size_t stride,size_t len);
1818extern size_t Stringrchr2 (unsigned short * str , unsigned short ch , size_t stride ,size_t len );
1919extern size_t Stringrchr4 (unsigned int * str , unsigned int ch , size_t stride ,size_t len );
2020
21- #if defined(__SSE2__ ) || EMU_AVX
21+ #if C_AVX2 || EMU_AVX2
22+
#if (MMSC_VER)   // MMSC_VER is defined outside this block; presumably it flags the Microsoft compiler - confirm
#include <intrin.h>   // declares _BitScanReverse used by srchr_msb32
#endif

// Return the bit position (0 = least significant ... 31 = most significant) of
// the highest set bit of mask.  mask must be nonzero: neither __builtin_clz nor
// _BitScanReverse produces a defined result for 0.
static int srchr_msb32 (unsigned int mask ){
#if (MMSC_VER )
  unsigned long pos ;
  _BitScanReverse (& pos , (unsigned long )mask );   // pos = index of highest set bit
  return (int )pos ;
#elif defined(__clang__ ) || ((__GNUC__ >= 4 ) || ((__GNUC__ == 3 ) && (__GNUC_MINOR__ >= 4 )))   // modern GCC/clang have built-in __builtin_clz
  return 31 - __builtin_clz (mask );   // clz counts zero bits above the highest set bit
#else   // none of the choices exist
#error __builtin_clz
#endif
}

// Scan str[0..len) backwards, skipping trailing bytes equal to ch.
// Result: one past the index of the last byte that differs from ch, i.e. the
// length of the data once trailing ch bytes are dropped; 0 if every byte equals
// ch or len is 0.
// Only bytes inside str[0..len) are read: the unaligned tail and the final
// short head are handled one byte at a time, the rest with aligned vector loads.
static size_t srchr (char * str , char ch , size_t len ){
  size_t i = len ;
  // step back one byte at a time until str+i is on a 32-byte boundary, so the
  // vector loops below can use aligned loads
  while ((i > 0 ) && ((((intptr_t )str + i ) & 31 ) != 0 )){if (ch != str [i - 1 ]) return i ; else -- i ;}
  if (!i ) return 0 ;
  /* loops compare i against the block size; testing i>=0 would always be true because size_t is unsigned */
  const __m256i ch32 = _mm256_set1_epi8 ( ch );
  while (i > 32 ) {
    // compare the 32 bytes ending at str+i with ch; after inversion, bit k of
    // diff is set when byte str[i-32+k] is NOT ch
    __m256i blk = _mm256_load_si256 ((__m256i * )(str + i - 32 ));
    unsigned int diff = ~ (unsigned int )_mm256_movemask_epi8 (_mm256_cmpeq_epi8 (blk , ch32 ));
    if (diff != 0 ) {
      // highest set bit p marks the last differing byte, at index i-32+p;
      // the result is one past it: i-(31-p)
      return i - (size_t )(31 - srchr_msb32 (diff ));
    }
    i -= 32 ;
  }
  // at most 32 bytes remain and str+i is still 32-byte aligned, so 16-byte
  // aligned loads are valid here
  const __m128i ch16 = _mm_set1_epi8 ( ch );
  while (i > 16 ) {
    __m128i blk = _mm_load_si128 ((__m128i * )(str + i - 16 ));
    // only the low 16 bits of the movemask are significant
    unsigned int diff = ~ (unsigned int )_mm_movemask_epi8 (_mm_cmpeq_epi8 (blk , ch16 )) & 0xFFFFu ;
    if (diff != 0 ) {
      // highest set bit p (0..15) marks index i-16+p; result is i-(15-p)
      return i - (size_t )(15 - srchr_msb32 (diff ));
    }
    i -= 16 ;
  }

  // remaining head (16 bytes or fewer): plain byte scan
  while (i > 0 ){if (ch != str [i - 1 ]) return i ; else -- i ;}
  return 0 ;
}
79+ #elif defined(__SSE2__ ) || EMU_AVX
2280
2381static size_t srchr (char * str , char ch , size_t len ){
2482 size_t i = len ;
0 commit comments