diff --git a/htscodecs/rANS_static32x16pr.c b/htscodecs/rANS_static32x16pr.c
index 51ea554..2b3acc4 100644
--- a/htscodecs/rANS_static32x16pr.c
+++ b/htscodecs/rANS_static32x16pr.c
@@ -165,7 +165,6 @@ unsigned char *rans_compress_O0_32x16(unsigned char *in,
     } else {
         // Branchless version optimises poorly with gcc unless we have
         // AVX2 capability, so have a custom rewrite of it.
-        uint16_t* ptr16 = (uint16_t *)ptr;
         for (i=(in_size &~(NX-1)); likely(i>0); i-=NX) {
             // Unrolled copy of below, because gcc doesn't optimise this
             // well in the original form.
@@ -197,15 +196,15 @@ unsigned char *rans_compress_O0_32x16(unsigned char *in,
             int c1 = rp[3-1] > sy[1]->x_max;
 #ifdef HTSCODECS_LITTLE_ENDIAN
-            ptr16[-1] = rp[3-0]; ptr16 -= c0;
-            ptr16[-1] = rp[3-1]; ptr16 -= c1;
+            memcpy(&ptr[-2], &rp[3-0], 2); ptr -= c0 * 2;
+            memcpy(&ptr[-2], &rp[3-1], 2); ptr -= c1 * 2;
 #else
-            ((uint8_t *)&ptr16[-1])[0] = rp[3-0];
-            ((uint8_t *)&ptr16[-1])[1] = rp[3-0]>>8;
-            ptr16 -= c0;
-            ((uint8_t *)&ptr16[-1])[0] = rp[3-1];
-            ((uint8_t *)&ptr16[-1])[1] = rp[3-1]>>8;
-            ptr16 -= c1;
+            ptr[-2] = rp[3-0];
+            ptr[-1] = rp[3-0]>>8;
+            ptr -= c0 * 2;
+            ptr[-2] = rp[3-1];
+            ptr[-1] = rp[3-1]>>8;
+            ptr -= c1 * 2;
 #endif
             rp[3-0] = c0 ? rp[3-0]>>16 : rp[3-0];
@@ -217,15 +216,15 @@ unsigned char *rans_compress_O0_32x16(unsigned char *in,
             int c2 = rp[3-2] > sy[2]->x_max;
             int c3 = rp[3-3] > sy[3]->x_max;
 #ifdef HTSCODECS_LITTLE_ENDIAN
-            ptr16[-1] = rp[3-2]; ptr16 -= c2;
-            ptr16[-1] = rp[3-3]; ptr16 -= c3;
+            memcpy(&ptr[-2], &rp[3-2], 2); ptr -= c2 * 2;
+            memcpy(&ptr[-2], &rp[3-3], 2); ptr -= c3 * 2;
 #else
-            ((uint8_t *)&ptr16[-1])[0] = rp[3-2];
-            ((uint8_t *)&ptr16[-1])[1] = rp[3-2]>>8;
-            ptr16 -= c2;
-            ((uint8_t *)&ptr16[-1])[0] = rp[3-3];
-            ((uint8_t *)&ptr16[-1])[1] = rp[3-3]>>8;
-            ptr16 -= c3;
+            ptr[-2] = rp[3-2];
+            ptr[-1] = rp[3-2]>>8;
+            ptr -= c2 * 2;
+            ptr[-2] = rp[3-3];
+            ptr[-1] = rp[3-3]>>8;
+            ptr -= c3 * 2;
 #endif
             rp[3-2] = c2 ? rp[3-2]>>16 : rp[3-2];
             rp[3-3] = c3 ? rp[3-3]>>16 : rp[3-3];
@@ -239,7 +238,6 @@ unsigned char *rans_compress_O0_32x16(unsigned char *in,
             }
             if (z < -1) abort();
         }
-        ptr = (uint8_t *)ptr16;
     }
 
     for (z = NX-1; z >= 0; z--)
         RansEncFlush(&ransN[z], &ptr);
@@ -476,7 +474,6 @@ unsigned char *rans_compress_O1_32x16(unsigned char *in,
         i32[i] = &in[iN[i]];
 
     for (; likely(i32[0] >= in); ) {
-        uint16_t *ptr16 = (uint16_t *)ptr;
         for (z = NX-1; z >= 0; z-=4) {
             RansEncSymbol *sy[4];
             int k;
@@ -490,12 +487,12 @@ unsigned char *rans_compress_O1_32x16(unsigned char *in,
             for (k = 0; k < 4; k++) {
                 int c = ransN[z-k] > sy[k]->x_max;
 #ifdef HTSCODECS_LITTLE_ENDIAN
-                ptr16[-1] = ransN[z-k];
+                memcpy(&ptr[-2], &ransN[z-k], 2);
 #else
-                ((uint8_t *)&ptr16[-1])[0] = ransN[z-k];
-                ((uint8_t *)&ptr16[-1])[1] = ransN[z-k]>>8;
+                ptr[-2] = ransN[z-k];
+                ptr[-1] = ransN[z-k]>>8;
 #endif
-                ptr16 -= c;
+                ptr -= c * 2;
                 //ransN[z-k] >>= c<<4;
                 ransN[z-k] = c ? ransN[z-k]>>16 : ransN[z-k];
             }
@@ -506,7 +503,6 @@ unsigned char *rans_compress_O1_32x16(unsigned char *in,
                 ransN[z-k] += sy[k]->bias + q*sy[k]->cmpl_freq;
             }
         }
-        ptr = (uint8_t *)ptr16;
     }
 
     for (z = NX-1; z>=0; z--)
diff --git a/htscodecs/rANS_static32x16pr_avx2.c b/htscodecs/rANS_static32x16pr_avx2.c
index cf04578..702d513 100644
--- a/htscodecs/rANS_static32x16pr_avx2.c
+++ b/htscodecs/rANS_static32x16pr_avx2.c
@@ -220,8 +220,6 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in,
     while (z-- > 0)
         RansEncPutSymbol(&ransN[z], &ptr, &syms[in[in_size-(i-z)]]);
 
-    uint16_t *ptr16 = (uint16_t *)ptr;
-
     LOAD(Rv, ransN);
 
     for (i=(in_size &~(NX-1)); i>0; i-=NX) {
@@ -306,7 +304,7 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in,
             _mm256_and_si256(sh[x+3], D)))
 
         // Renorm:
-        // if (x > x_max) {*--ptr16 = x & 0xffff; x >>= 16;}
+        // if (x > x_max) { ptr -= 2; *((uint16_t *)ptr) = x & 0xffff; x >>= 16; }
         __m256i xmax1 = SYM_LOAD( 0, xA, xB, xC, xD);
         __m256i xmax2 = SYM_LOAD( 4, xA, xB, xC, xD);
         __m256i xmax3 = SYM_LOAD( 8, xA, xB, xC, xD);
@@ -317,7 +315,7 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in,
         __m256i cv3 = _mm256_cmpgt_epi32(Rv3, xmax3);
         __m256i cv4 = _mm256_cmpgt_epi32(Rv4, xmax4);
 
-        // Store bottom 16-bits at ptr16
+        // Store bottom 16-bits at ptr
         unsigned int imask1 = _mm256_movemask_ps((__m256)cv1);
         unsigned int imask2 = _mm256_movemask_ps((__m256)cv2);
         unsigned int imask3 = _mm256_movemask_ps((__m256)cv3);
@@ -349,20 +347,20 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in,
 
         // Now we have bottom N 16-bit values in each V12/V34 to flush
         __m128i f = _mm256_extractf128_si256(V34, 1);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask4);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask4) * 2;
 
         f = _mm256_extractf128_si256(V34, 0);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask3);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask3) * 2;
 
         f = _mm256_extractf128_si256(V12, 1);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask2);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask2) * 2;
 
         f = _mm256_extractf128_si256(V12, 0);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask1);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask1) * 2;
 
         __m256i Rs;
         Rs = _mm256_srli_epi32(Rv1,16);
         Rv1 = _mm256_blendv_epi8(Rv1, Rs, cv1);
@@ -437,7 +435,6 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in,
 
     STORE(Rv, ransN);
 
-    ptr = (uint8_t *)ptr16;
     for (z = NX-1; z >= 0; z--)
         RansEncFlush(&ransN[z], &ptr);
 
@@ -506,15 +503,15 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         goto err;
     }
 
-    uint16_t *sp = (uint16_t *)cp;
+    uint8_t *sp = cp;
     uint8_t overflow[64+64] = {0};
     cp_end -= 64;
 
     // Protect against running off the end of in buffer.
     // We copy it to a worst-case local buffer when near the end.
-    if ((uint8_t *)sp > cp_end) {
-        memmove(overflow, sp, cp_end+64 - (uint8_t *)sp);
-        sp = (uint16_t *)overflow;
+    if (sp > cp_end) {
+        memmove(overflow, sp, cp_end+64 - sp);
+        sp = overflow;
         cp_end = overflow + sizeof(overflow) - 64;
     }
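On the decode side the same discipline applies in the opposite direction: `sp` is now a plain byte pointer, and every renorm consumes little-endian 16-bit words via unaligned-safe loads, advancing two bytes per word consumed. A minimal scalar sketch of that read path (the helper names here are illustrative, not part of the library; only `HTSCODECS_LITTLE_ENDIAN` comes from the code above):

    #include <stdint.h>
    #include <string.h>

    // Unaligned-safe read of one little-endian 16-bit renorm word.
    // memcpy typically compiles to a single load, but unlike
    // dereferencing a uint16_t * it imposes no alignment requirement.
    static inline uint32_t read_u16_le(const uint8_t *sp) {
    #ifdef HTSCODECS_LITTLE_ENDIAN
        uint16_t v;
        memcpy(&v, sp, 2);
        return v;
    #else
        return (uint32_t)sp[0] | ((uint32_t)sp[1] << 8);
    #endif
    }

    // Scalar model of one lane's renorm: a state below the lower
    // bound (RANS_BYTE_L in the real code) pulls in the next word.
    // Sketch only; the SIMD paths below do 32 lanes at a time with
    // masked loads and shuffles.
    static inline const uint8_t *renorm1(uint32_t *R, const uint8_t *sp,
                                         uint32_t lower_bound) {
        if (*R < lower_bound) {
            *R = (*R << 16) | read_u16_le(sp);
            sp += 2;
        }
        return sp;
    }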
@@ -569,9 +566,9 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
 
         // Protect against running off the end of in buffer.
         // We copy it to a worst-case local buffer when near the end.
-        if ((uint8_t *)sp > cp_end) {
-            memmove(overflow, sp, cp_end+64 - (uint8_t *)sp);
-            sp = (uint16_t *)overflow;
+        if (sp > cp_end) {
+            memmove(overflow, sp, cp_end+64 - sp);
+            sp = overflow;
             cp_end = overflow + sizeof(overflow) - 64;
         }
 
@@ -600,11 +597,11 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
 
         // Shuffle the renorm values to correct lanes and incr sp pointer
         unsigned int imask2 = _mm256_movemask_ps((__m256)renorm_mask2);
-        sp += _mm_popcnt_u32(imask1);
+        sp += _mm_popcnt_u32(imask1) * 2;
 
         __m256i idx2 = _mm256_load_si256((const __m256i*)permute[imask2]);
         __m256i Vv2 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
-        sp += _mm_popcnt_u32(imask2);
+        sp += _mm_popcnt_u32(imask2) * 2;
 
         Yv1 = _mm256_or_si256(Yv1, Vv1);
         Vv2 = _mm256_permutevar8x32_epi32(Vv2, idx2);
@@ -662,7 +659,7 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         Vv3 = _mm256_permutevar8x32_epi32(Vv3, idx3);
         __m256i Yv4 = _mm256_slli_epi32(Rv4, 16);
         unsigned int imask4 = _mm256_movemask_ps((__m256)renorm_mask4);
-        sp += _mm_popcnt_u32(imask3);
+        sp += _mm_popcnt_u32(imask3) * 2;
         __m256i idx4 = _mm256_load_si256((const __m256i*)permute[imask4]);
         __m256i Vv4 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
 
@@ -671,7 +668,7 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         Yv3 = _mm256_or_si256(Yv3, Vv3);
         Vv4 = _mm256_permutevar8x32_epi32(Vv4, idx4);
         Yv4 = _mm256_or_si256(Yv4, Vv4);
-        sp += _mm_popcnt_u32(imask4);
+        sp += _mm_popcnt_u32(imask4) * 2;
 
         // R[z] = c ? Y[z] : R[z];
         Rv3 = _mm256_blendv_epi8(Rv3, Yv3, renorm_mask3);
@@ -757,8 +754,6 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         lN[z] = c;
     }
 
-    uint16_t *ptr16 = (uint16_t *)ptr;
-
     // clang16              clang10              gcc7                 gcc13
     // 587 435 381          588 438 403          504 386 415          527 381 394
     // simT 611 432 402     475 401 367          472 422 386          486 353 324
@@ -865,7 +860,8 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         // ------------------------------------------------------------
         // for (z = NX-1; z >= 0; z--) {
         //     if (ransN[z] >= x_max[z]) {
-        //         *--ptr16 = ransN[z] & 0xffff;
+        //         ptr -= 2;
+        //         *((uint16_t *)ptr) = ransN[z] & 0xffff;
         //         ransN[z] >>= 16;
         //     }
         // }
@@ -874,10 +870,13 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         __m256i cv3 = _mm256_cmpgt_epi32(Rv3, xmaxv[2]);
         __m256i cv4 = _mm256_cmpgt_epi32(Rv4, xmaxv[3]);
 
-        // Store bottom 16-bits at ptr16
+        // Store bottom 16-bits at ptr
         //
         // for (z = NX-1; z >= 0; z--) {
-        //     if (cond[z]) *--ptr16 = (uint16_t )(ransN[z] & 0xffff);
+        //     if (cond[z]) {
+        //         ptr -= 2;
+        //         *((uint16_t *)ptr) = (ransN[z] & 0xffff);
+        //     }
         // }
         unsigned int imask1 = _mm256_movemask_ps((__m256)cv1);
         unsigned int imask2 = _mm256_movemask_ps((__m256)cv2);
@@ -910,20 +909,20 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
 
         // Now we have bottom N 16-bit values in each V12/V34 to flush
         __m128i f = _mm256_extractf128_si256(V34, 1);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask4);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask4) * 2;
 
         f = _mm256_extractf128_si256(V34, 0);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask3);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask3) * 2;
 
         f = _mm256_extractf128_si256(V12, 1);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask2);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask2) * 2;
 
         f = _mm256_extractf128_si256(V12, 0);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask1);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask1) * 2;
 
         __m256i Rs1, Rs2, Rs3, Rs4;
         Rs1 = _mm256_srli_epi32(Rv1,16);
@@ -995,8 +994,6 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
 
     STORE(Rv, ransN);
 
-    ptr = (uint8_t *)ptr16;
-
     for (z = NX-1; z>=0; z--)
         RansEncPutSymbol(&ransN[z], &ptr, &syms[0][lN[z]]);
 
@@ -1106,7 +1103,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
     for (z = 0; z < NX; z++)
         iN[z] = z*isz4;
 
-    uint16_t *sp = (uint16_t *)ptr;
+    uint8_t *sp = ptr;
     const uint32_t mask = (1u << shift)-1;
 
     __m256i maskv = _mm256_set1_epi32(mask);
@@ -1127,7 +1124,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
     if (shift == TF_SHIFT_O1) {
         isz4 -= 64;
-        for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
+        for (; iN[0] < isz4 && ptr_end - sp > 64; ) {
            // m[z] = R[z] & mask;
            __m256i masked1 = _mm256_and_si256(Rv1, maskv);
            __m256i masked2 = _mm256_and_si256(Rv2, maskv);
@@ -1240,12 +1237,12 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
            unsigned int imask2 = _mm256_movemask_ps((__m256)renorm_mask2);
            Vv1 = _mm256_permutevar8x32_epi32(Vv1, idx1);
-           sp += _mm_popcnt_u32(imask1);
+           sp += _mm_popcnt_u32(imask1) * 2;
            __m256i idx2 = _mm256_load_si256((const __m256i*)permute[imask2]);
            __m256i Vv2 = _mm256_cvtepu16_epi32(
                _mm_loadu_si128((__m128i *)sp));
-           sp += _mm_popcnt_u32(imask2);
+           sp += _mm_popcnt_u32(imask2) * 2;
            Vv2 = _mm256_permutevar8x32_epi32(Vv2, idx2);
 
            Yv1 = _mm256_or_si256(Yv1, Vv1);
@@ -1300,7 +1297,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
            unsigned int imask3 = _mm256_movemask_ps((__m256)renorm_mask3);
            unsigned int imask4 = _mm256_movemask_ps((__m256)renorm_mask4);
            __m256i idx3 = _mm256_load_si256((const __m256i*)permute[imask3]);
-           sp += _mm_popcnt_u32(imask3);
+           sp += _mm_popcnt_u32(imask3) * 2;
            Vv3 = _mm256_permutevar8x32_epi32(Vv3, idx3);
 
            sv3 = _mm256_packus_epi32(sv3, sv4);
@@ -1328,7 +1325,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
 
            Vv4 = _mm256_permutevar8x32_epi32(Vv4, idx4);
            Yv4 = _mm256_or_si256(Yv4, Vv4);
-           sp += _mm_popcnt_u32(imask4);
+           sp += _mm_popcnt_u32(imask4) * 2;
 
            Rv3 = _mm256_blendv_epi8(Rv3, Yv3, renorm_mask3);
            Rv4 = _mm256_blendv_epi8(Rv4, Yv4, renorm_mask4);
@@ -1338,7 +1335,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
 
        STORE(Rv, R);
        STORE(Lv, lN);
-       ptr = (uint8_t *)sp;
+       ptr = sp;
 
        if (1) {
            iN[0]-=tidx;
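Rewriting the loop guard from `(uint8_t *)sp+64 < ptr_end` to `ptr_end - sp > 64` is not purely cosmetic. Once `sp` is within 64 bytes of the end of the allocation, forming `sp + 64` can point beyond one-past-the-end, which is undefined behaviour in C even if the pointer is never dereferenced; the subtraction form stays within defined pointer arithmetic whenever `sp <= ptr_end`. A small sketch of the safe form (function name is illustrative):

    #include <stdint.h>

    // "More than 64 bytes left" without ever forming an
    // out-of-bounds pointer.  Assumes sp and ptr_end address the
    // same buffer with sp <= ptr_end, which the callers establish.
    static inline int have_renorm_bytes(const uint8_t *sp,
                                        const uint8_t *ptr_end) {
        // NOT: sp + 64 < ptr_end  (sp+64 may be out of bounds)
        return ptr_end - sp > 64;
    }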
@@ -1383,7 +1380,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
        // SIMD version ends decoding early as it reads at most 64 bytes
        // from input via 4 vectorised loads.
        isz4 -= 64;
-       for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
+       for (; iN[0] < isz4 && ptr_end - sp > 64; ) {
           // m[z] = R[z] & mask;
           __m256i masked1 = _mm256_and_si256(Rv1, maskv);
           __m256i masked2 = _mm256_and_si256(Rv2, maskv);
@@ -1475,12 +1472,12 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
           unsigned int imask2 = _mm256_movemask_ps((__m256)renorm_mask2);
           Vv1 = _mm256_permutevar8x32_epi32(Vv1, idx1);
-          sp += _mm_popcnt_u32(imask1);
+          sp += _mm_popcnt_u32(imask1) * 2;
           __m256i idx2 = _mm256_load_si256((const __m256i*)permute[imask2]);
           __m256i Vv2 = _mm256_cvtepu16_epi32(
               _mm_loadu_si128((__m128i *)sp));
-          sp += _mm_popcnt_u32(imask2);
+          sp += _mm_popcnt_u32(imask2) * 2;
           Vv2 = _mm256_permutevar8x32_epi32(Vv2, idx2);
 
           Yv1 = _mm256_or_si256(Yv1, Vv1);
@@ -1525,7 +1522,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
           unsigned int imask3 = _mm256_movemask_ps((__m256)renorm_mask3);
           unsigned int imask4 = _mm256_movemask_ps((__m256)renorm_mask4);
           __m256i idx3 = _mm256_load_si256((const __m256i*)permute[imask3]);
-          sp += _mm_popcnt_u32(imask3);
+          sp += _mm_popcnt_u32(imask3) * 2;
           Vv3 = _mm256_permutevar8x32_epi32(Vv3, idx3);
 
           // sv3 sv4 are 32-bit ints with lowest bit being char
@@ -1562,7 +1559,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
 
           Vv4 = _mm256_permutevar8x32_epi32(Vv4, idx4);
           Yv4 = _mm256_or_si256(Yv4, Vv4);
-          sp += _mm_popcnt_u32(imask4);
+          sp += _mm_popcnt_u32(imask4) * 2;
 
           Rv3 = _mm256_blendv_epi8(Rv3, Yv3, renorm_mask3);
           Rv4 = _mm256_blendv_epi8(Rv4, Yv4, renorm_mask4);
@@ -1571,7 +1568,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
 
        STORE(Rv, R);
        STORE(Lv, lN);
-       ptr = (uint8_t *)sp;
+       ptr = sp;
 
        if (1) {
            iN[0]-=tidx;
diff --git a/htscodecs/rANS_static32x16pr_avx512.c b/htscodecs/rANS_static32x16pr_avx512.c
index 3563f83..c9717f0 100644
--- a/htscodecs/rANS_static32x16pr_avx512.c
+++ b/htscodecs/rANS_static32x16pr_avx512.c
@@ -204,7 +204,6 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
 
     LOAD512(Rv, ransN);
 
-    uint16_t *ptr16 = (uint16_t *)ptr;
     for (i=(in_size &~(32-1)); i>0; i-=32) {
         uint8_t *c = &in[i-32];
 
@@ -223,20 +222,20 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
         SET512(xmax, SB);
 
         uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1);
-        int pc1 = _mm_popcnt_u32(gt_mask1);
+        int pc1 = _mm_popcnt_u32(gt_mask1) * 2;
         __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff));
         __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff));
         uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2);
         SET512(SDv, SD);
-        int pc2 = _mm_popcnt_u32(gt_mask2);
+        int pc2 = _mm_popcnt_u32(gt_mask2) * 2;
 
         Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
         Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);
 
-        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2);
-        ptr16 -= pc2;
-        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1);
-        ptr16 -= pc1;
+        _mm512_mask_cvtepi32_storeu_epi16(ptr-pc2, (1<<(pc2>>1))-1, Rp2);
+        ptr -= pc2;
+        _mm512_mask_cvtepi32_storeu_epi16(ptr-pc1, (1<<(pc1>>1))-1, Rp1);
+        ptr -= pc1;
@@ … @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
 
     STORE512(Rv, ransN);
 
-    ptr = (uint8_t *)ptr16;
     for (z = NX-1; z >= 0; z--)
         RansEncFlush(&ransN[z], &ptr);
@@ -359,7 +357,7 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
         goto err;
     }
 
-    uint16_t *sp = (uint16_t *)cp;
+    uint8_t *sp = cp;
     int out_end = (out_sz&~(32-1));
     const uint32_t mask = (1u << TF_SHIFT)-1;
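With `pc1`/`pc2` now counting bytes rather than elements, the compress-store mask is rebuilt from the element count (`pc >> 1`) while the pointer moves by the byte count. A scalar model of what the masked compress plus 32-to-16-bit store computes for one vector of lanes, illustrative only (the helper name is not the library's):

    #include <stdint.h>
    #include <string.h>

    // Scalar model of the AVX-512 renorm flush: for lanes whose
    // state exceeds x_max, emit the low 16 bits, highest lane first
    // (the stream is written backwards).  Equivalent to
    // _mm512_maskz_compress_epi32 followed by a masked 32->16 bit
    // store at ptr - pc, then ptr -= pc.
    static uint8_t *flush_lanes16(uint32_t R[16], const uint32_t x_max[16],
                                  uint8_t *ptr) {
        for (int z = 15; z >= 0; z--) {
            if (R[z] > x_max[z]) {
                uint16_t w = (uint16_t)(R[z] & 0xffff);
                ptr -= 2;
                memcpy(ptr, &w, 2);   // byte-level store, no alignment need
                R[z] >>= 16;
            }
        }
        return ptr;
    }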
@@ -381,9 +379,9 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
 
         // Protect against running off the end of in buffer.
         // We copy it to a worst-case local buffer when near the end.
-        if ((uint8_t *)sp+64 > cp_end) {
-            memmove(overflow, sp, cp_end - (uint8_t *)sp);
-            sp = (uint16_t *)overflow;
+        if (cp_end - sp < 64) {
+            memmove(overflow, sp, cp_end - sp);
+            sp = overflow;
             cp_end = overflow + sizeof(overflow);
         }
 
@@ -410,7 +408,7 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
         // renorm. this is the interesting part:
         renorm_mask2=_mm512_cmplt_epu32_mask(R2, _mm512_set1_epi32(RANS_BYTE_L));
         // advance by however many words we actually read
-        sp += _mm_popcnt_u32(renorm_mask1);
+        sp += _mm_popcnt_u32(renorm_mask1) * 2;
         __m512i renorm_words2 = _mm512_cvtepu16_epi32(_mm256_loadu_si256(
                                     (const __m256i *)sp));
 
@@ -440,7 +438,7 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
         renorm_vals2 = _mm512_maskz_and_epi32(renorm_mask2, renorm_vals2, m16);
         // advance by however many words we actually read
-        sp += _mm_popcnt_u32(renorm_mask2);
+        sp += _mm_popcnt_u32(renorm_mask2) * 2;
 
         R1 = _mm512_add_epi32(R1, renorm_vals1);
         R2 = _mm512_add_epi32(R2, renorm_vals2);
@@ -575,7 +573,6 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
 
     LOAD512(Rv, ransN);
 
-    uint16_t *ptr16 = (uint16_t *)ptr;
     LOAD512(iN, iN);
     LOAD512(last, lN);
 
@@ -621,7 +618,8 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
         // ------------------------------------------------------------
         // for (z = NX-1; z >= 0; z--) {
         //     if (ransN[z] >= x_max[z]) {
-        //         *--ptr16 = ransN[z] & 0xffff;
+        //         ptr -= 2;
+        //         *((uint16_t *) ptr) = ransN[z] & 0xffff;
         //         ransN[z] >>= 16;
         //     }
         // }
@@ -690,20 +688,20 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
         SET512x(xmax, x_max); // high latency
 
         uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1);
-        int pc1 = _mm_popcnt_u32(gt_mask1);
+        int pc1 = _mm_popcnt_u32(gt_mask1) * 2;
         __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff));
         __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff));
         uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2);
         SET512x(SDv, cmpl_freq); // good
-        int pc2 = _mm_popcnt_u32(gt_mask2);
+        int pc2 = _mm_popcnt_u32(gt_mask2) * 2;
 
         Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
         Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);
 
-        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2);
-        ptr16 -= pc2;
-        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1);
-        ptr16 -= pc1;
+        _mm512_mask_cvtepi32_storeu_epi16(ptr-pc2, (1<<(pc2>>1))-1, Rp2);
+        ptr -= pc2;
+        _mm512_mask_cvtepi32_storeu_epi16(ptr-pc1, (1<<(pc1>>1))-1, Rp1);
+        ptr -= pc1;
@@ … @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
 
     STORE512(Rv, ransN);
 
-    ptr = (uint8_t *)ptr16;
-
     for (z = NX-1; z>=0; z--)
         RansEncPutSymbol(&ransN[z], &ptr, &syms[0][lN[z]]);
@@ -845,7 +841,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
     for (z = 0; z < NX; z++)
         iN[z] = z*isz4;
 
-    uint16_t *sp = (uint16_t *)ptr;
+    uint8_t *sp = ptr;
     const uint32_t mask = (1u << shift)-1;
 
     __m512i _maskv = _mm512_set1_epi32(mask);
@@ -865,7 +861,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
     if (shift == TF_SHIFT_O1) {
         isz4 -= 64;
-        for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
+        for (; iN[0] < isz4 && ptr_end - sp > 64; ) {
            // m[z] = R[z] & mask;
            __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv);
            __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv);
@@ -939,11 +935,11 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
 
            __m512i renorm_words1 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
-           sp += _mm_popcnt_u32(_imask1);
+           sp += _mm_popcnt_u32(_imask1) * 2;
 
            __m512i renorm_words2 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
-           sp += _mm_popcnt_u32(_imask2);
+           sp += _mm_popcnt_u32(_imask2) * 2;
 
            __m512i _renorm_vals1 =
                _mm512_maskz_expand_epi32(_imask1, renorm_words1);
@@ -985,7 +981,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
 
        STORE512(_Rv, R);
        STORE512(_Lv, lN);
-       ptr = (uint8_t *)sp;
+       ptr = sp;
 
        if (1) {
            iN[0]-=tidx;
@@ -1034,7 +1030,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
        // SIMD version ends decoding early as it reads at most 64 bytes
        // from input via 4 vectorised loads.
        isz4 -= 64;
-       for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
+       for (; iN[0] < isz4 && ptr_end - sp > 64; ) {
           // m[z] = R[z] & mask;
           __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv);
           __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv);
@@ -1133,7 +1129,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
 
        STORE512(_Rv, R);
        STORE512(_Lv, lN);
-       ptr = (uint8_t *)sp;
+       ptr = sp;
 
        if (1) {
            iN[0]-=tidx;
diff --git a/htscodecs/rANS_static32x16pr_neon.c b/htscodecs/rANS_static32x16pr_neon.c
index 52e7d2f..817bb30 100644
--- a/htscodecs/rANS_static32x16pr_neon.c
+++ b/htscodecs/rANS_static32x16pr_neon.c
@@ -665,7 +665,7 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in,
     // 500MB/s.  Clang does a lot of reordering of this code, removing some
     // of the manual tuning benefits.  Short of dropping to assembly, for now
     // I would recommend using gcc to compile this file.
-    uint16_t *sp = (uint16_t *)cp;
+    uint8_t *sp = cp;
     uint8_t overflow[64+64] = {0};
     for (i=0; i < out_end; i+=NX) {
         // Decode freq, bias and symbol from s3 lookups
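On NEON the byte pointer pairs naturally with `vld1q_u8`, which carries no alignment expectation, and the `vqtbl1_u8` table lookup used below operates on byte vectors anyway, so the renorm data can stay in `uint8x16_t` form until it is reinterpreted as 16-bit lanes. A small sketch of that sequence (assumes a little-endian AArch64 target, as the surrounding lane arithmetic already does; the function name and idx table shape are illustrative):

    #include <arm_neon.h>
    #include <stdint.h>

    // Load 8 renorm words as 16 bytes, route the wanted words to
    // their lanes with a byte shuffle, then view them as 4 x u16.
    static inline uint16x4_t load_norm(const uint8_t *sp, uint8x8_t idx) {
        uint8x16_t raw = vld1q_u8(sp);          // unaligned-safe load
        uint8x8_t  sel = vqtbl1_u8(raw, idx);   // per-byte shuffle
        return vreinterpret_u16_u8(sel);        // reinterpret, no move
    }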
@@ -770,16 +770,20 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in,
 
         // Protect against running off the end of in buffer.
         // We copy it to a worst-case local buffer when near the end.
-        if ((uint8_t *)sp+64 > cp_end) {
-            memmove(overflow, sp, cp_end - (uint8_t *)sp);
-            sp = (uint16_t *)overflow;
+        if (cp_end - sp < 64) {
+            memmove(overflow, sp, cp_end - sp);
+            sp = overflow;
             cp_end = overflow + sizeof(overflow);
         }
 
-        uint16x8_t norm12 = vld1q_u16(sp);
-        sp += nbits[imask1] + nbits[imask2];
-        uint16x8_t norm34 = vld1q_u16(sp);
-        sp += nbits[imask3] + nbits[imask4];
+        // load 8 16-bit lanes of renorm data (loaded as 8-bit for
+        // alignment purposes, but will be shuffled and cast to
+        // 16 bit below).
+
+        uint8x16_t norm12 = vld1q_u8(sp);
+        sp += (nbits[imask1] + nbits[imask2]) * 2;
+        uint8x16_t norm34 = vld1q_u8(sp);
+        sp += (nbits[imask3] + nbits[imask4]) * 2;
 
         Bv5 = vandq_u32(Bv5, maskv);
         Bv6 = vandq_u32(Bv6, maskv);
@@ -799,14 +803,14 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in,
         uint32_t imask12 = (imask1<<4)|imask2;
         uint32_t imask34 = (imask3<<4)|imask4;
 
+        // Shuffle norm to the corresponding R lanes, via imask
         // #define for brevity and formatting
 #define cast_u16_u8 vreinterpret_u16_u8
-#define cast_u8_u16 vreinterpretq_u8_u16
         uint16x4_t norm1, norm2, norm3, norm4, norm5, norm6, norm7, norm8;
-        norm1 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),idx [imask1]));
-        norm2 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),idx2[imask12]));
-        norm3 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),idx [imask3]));
-        norm4 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),idx2[imask34]));
+        norm1 = cast_u16_u8(vqtbl1_u8(norm12,idx [imask1]));
+        norm2 = cast_u16_u8(vqtbl1_u8(norm12,idx2[imask12]));
+        norm3 = cast_u16_u8(vqtbl1_u8(norm34,idx [imask3]));
+        norm4 = cast_u16_u8(vqtbl1_u8(norm34,idx2[imask34]));
 
         uint32x4_t Rlt5 = vcltq_u32(Rv5, vdupq_n_u32(RANS_BYTE_L));
         uint32x4_t Rlt6 = vcltq_u32(Rv6, vdupq_n_u32(RANS_BYTE_L));
@@ -819,15 +823,15 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in,
         uint32x4_t Rsl3 = vshlq_n_u32(Rv3, 16);
         uint32x4_t Rsl4 = vshlq_n_u32(Rv4, 16);
 
-        uint16x8_t norm56 = vld1q_u16(sp);
+        uint8x16_t norm56 = vld1q_u8(sp);
         uint32_t imask5 = vaddvq_u32(vandq_u32(Rlt5, bit));
         uint32_t imask6 = vaddvq_u32(vandq_u32(Rlt6, bit));
         uint32_t imask7 = vaddvq_u32(vandq_u32(Rlt7, bit));
         uint32_t imask8 = vaddvq_u32(vandq_u32(Rlt8, bit));
-        sp += nbits[imask5] + nbits[imask6];
-        uint16x8_t norm78 = vld1q_u16(sp);
-        sp += nbits[imask7] + nbits[imask8];
+        sp += (nbits[imask5] + nbits[imask6]) * 2;
+        uint8x16_t norm78 = vld1q_u8(sp);
+        sp += (nbits[imask7] + nbits[imask8]) * 2;
 
         Rsl1 = vaddw_u16(Rsl1, norm1); // Rsl += norm
         Rsl2 = vaddw_u16(Rsl2, norm2);
@@ -842,10 +846,10 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in,
         Rv3 = vbslq_u32(Rlt3, Rsl3, Rv3);
         Rv4 = vbslq_u32(Rlt4, Rsl4, Rv4);
 
-        norm5 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm56),idx [imask5]));
-        norm6 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm56),idx2[imask56]));
-        norm7 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm78),idx [imask7]));
-        norm8 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm78),idx2[imask78]));
+        norm5 = cast_u16_u8(vqtbl1_u8(norm56,idx [imask5]));
+        norm6 = cast_u16_u8(vqtbl1_u8(norm56,idx2[imask56]));
+        norm7 = cast_u16_u8(vqtbl1_u8(norm78,idx [imask7]));
+        norm8 = cast_u16_u8(vqtbl1_u8(norm78,idx2[imask78]));
 
         uint32x4_t Rsl5 = vshlq_n_u32(Rv5, 16);
         uint32x4_t Rsl6 = vshlq_n_u32(Rv6, 16);
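The `vaddw_u16(Rsl, norm)` lines above are the renorm merge itself: `Rsl` holds `R << 16`, whose low 16 bits are zero, so a widening add of the 16-bit word is identical to OR-ing it into the low half. A sketch of the identity, with a hypothetical helper name for clarity:

    #include <arm_neon.h>
    #include <stdint.h>

    // (R << 16) | norm  ==  (R << 16) + norm  because the low 16
    // bits of R<<16 are zero and norm < 2^16; vaddw_u16 widens the
    // u16 lanes and adds, giving the OR in a single instruction.
    static inline uint32x4_t renorm_merge(uint32x4_t R, uint16x4_t norm) {
        uint32x4_t Rsl = vshlq_n_u32(R, 16);  // low half all zero
        return vaddw_u16(Rsl, norm);          // == Rsl | widen(norm)
    }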
@@ -1551,7 +1555,7 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in,
     // Follow with 2nd copy doing scalar code instead?
     unsigned char tbuf[32][32] = {0};
     int tidx = 0;
-    for (; i4[0] < isz4 && ptr+64 < ptr_end;) {
+    for (; i4[0] < isz4 && ptr_end - ptr > 64;) {
        for (z = 0; z < NX; z+=16) {
            uint32x4_t Rv1 = vld1q_u32(&R[z+0]);
            uint32x4_t Rv2 = vld1q_u32(&R[z+4]);
@@ -1643,12 +1647,14 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in,
            uint32x4_t Rlt3 = vcltq_u32(Rv3, vdupq_n_u32(RANS_BYTE_L));
            uint32x4_t Rlt4 = vcltq_u32(Rv4, vdupq_n_u32(RANS_BYTE_L));
            uint32x4_t all2 = {2,2,2,2};
 
-           // load 8 lanes of renorm data
-           uint16x8_t norm12 = vld1q_u16((uint16_t *)ptr);
+           // load 8 16-bit lanes of renorm data (loaded as 8-bit for
+           // alignment purposes, but will be shuffled and cast to
+           // 16 bit below).
+           uint8x16_t norm12 = vld1q_u8(ptr);
            // move ptr by no. renorm lanes used
            ptr += vaddvq_u32(vandq_u32(Rlt1, all2))
                +  vaddvq_u32(vandq_u32(Rlt2, all2));
-           uint16x8_t norm34 = vld1q_u16((uint16_t *)ptr);
+           uint8x16_t norm34 = vld1q_u8(ptr);
            ptr += vaddvq_u32(vandq_u32(Rlt3, all2))
                +  vaddvq_u32(vandq_u32(Rlt4, all2));
@@ -1663,16 +1669,12 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in,
            uint32_t imask34 = (imask3<<4)|imask4;
 
            // Shuffle norm to the corresponding R lanes, via imask
-           // #define for brevity and formatting
+           // cast_u16_u8 #define for brevity and formatting
            uint16x4_t norm1, norm2, norm3, norm4;
-           norm1 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),
-                                         idx [imask1]));
-           norm2 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),
-                                         idx2[imask12]));
-           norm3 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),
-                                         idx [imask3]));
-           norm4 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),
-                                         idx2[imask34]));
+           norm1 = cast_u16_u8(vqtbl1_u8(norm12, idx [imask1]));
+           norm2 = cast_u16_u8(vqtbl1_u8(norm12, idx2[imask12]));
+           norm3 = cast_u16_u8(vqtbl1_u8(norm34, idx [imask3]));
+           norm4 = cast_u16_u8(vqtbl1_u8(norm34, idx2[imask34]));
 
            // Add norm to R<<16 and blend back in with R
            uint32x4_t Rsl1 = vshlq_n_u32(Rv1, 16); // Rsl = R << 16
@@ -1776,7 +1778,7 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in,
 
     uint32_t *S3 = (uint32_t *)s3;
 
-    for (; i4[0] < isz4 && ptr+64 < ptr_end;) {
+    for (; i4[0] < isz4 && ptr_end - ptr > 64;) {
        int Z = 0;
        for (z = 0; z < NX; z+=16, Z+=4) {
            // streamline these.  Could swap between two banks and pre-load
@@ -1852,23 +1854,25 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in,
            uint32_t imask3 = vaddvq_u32(vandq_u32(Rlt3, bit));
            uint32_t imask4 = vaddvq_u32(vandq_u32(Rlt4, bit));
 
-           // load 8 lanes of renorm data
-           uint16x8_t norm12 = vld1q_u16((uint16_t *)ptr);
+           // load 8 16-bit lanes of renorm data (loaded as 8-bit for
+           // alignment purposes, but will be shuffled and cast to
+           // 16 bit below).
+           uint8x16_t norm12 = vld1q_u8(ptr);
            // move ptr by no. renorm lanes used
            ptr += nbits[imask1] + nbits[imask2];
-           uint16x8_t norm34 = vld1q_u16((uint16_t *)ptr);
+           uint8x16_t norm34 = vld1q_u8(ptr);
            ptr += nbits[imask3] + nbits[imask4];
 
            uint32_t imask12 = (imask1<<4)|imask2;
            uint32_t imask34 = (imask3<<4)|imask4;
 
            // Shuffle norm to the corresponding R lanes, via imask
-           // #define for brevity and formatting
+           // cast_u16_u8 #define for brevity and formatting
            uint16x4_t norm1, norm2, norm3, norm4;
-           norm1 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),idx [imask1]));
-           norm2 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),idx2[imask12]));
-           norm3 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),idx [imask3]));
-           norm4 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),idx2[imask34]));
+           norm1 = cast_u16_u8(vqtbl1_u8(norm12,idx [imask1]));
+           norm2 = cast_u16_u8(vqtbl1_u8(norm12,idx2[imask12]));
+           norm3 = cast_u16_u8(vqtbl1_u8(norm34,idx [imask3]));
+           norm4 = cast_u16_u8(vqtbl1_u8(norm34,idx2[imask34]));
 
            // Add norm to R<<16 and blend back in with R
            uint32x4_t Rsl1 = vshlq_n_u32(RV[Z+0], 16); // Rsl = R << 16
diff --git a/htscodecs/rANS_static32x16pr_sse4.c b/htscodecs/rANS_static32x16pr_sse4.c
index a9b9c4b..7de193b 100644
--- a/htscodecs/rANS_static32x16pr_sse4.c
+++ b/htscodecs/rANS_static32x16pr_sse4.c
@@ -275,7 +275,6 @@ unsigned char *rans_compress_O0_32x16_sse4(unsigned char *in,
     uint32_t SB[256], SA[256], SD[256], SC[256];
 
     // Build lookup tables for SIMD encoding
-    uint16_t *ptr16 = (uint16_t *)ptr;
     for (i = 0; i < 256; i++) {
         SB[i] = syms[i].x_max;
         SA[i] = syms[i].rcp_freq;
@@ -315,7 +314,7 @@ unsigned char *rans_compress_O0_32x16_sse4(unsigned char *in,
         __m128i cv6 = _mm_cmpgt_epi32(Rv[H+1], xmax6);
         __m128i cv5 = _mm_cmpgt_epi32(Rv[H+0], xmax5);
 
-        // Store bottom 16-bits at ptr16
+        // Store bottom 16-bits at ptr
         unsigned int imask8 = _mm_movemask_ps((__m128)cv8);
         unsigned int imask7 = _mm_movemask_ps((__m128)cv7);
         unsigned int imask6 = _mm_movemask_ps((__m128)cv6);
@@ -367,10 +366,10 @@ unsigned char *rans_compress_O0_32x16_sse4(unsigned char *in,
         V6 = _mm_shuffle_epi8(V6, shuf);
         V5 = _mm_shuffle_epi8(V5, shuf);
 
-        _mm_storeu_si64(ptr16-4, V8); ptr16 -= _mm_popcnt_u32(imask8);
-        _mm_storeu_si64(ptr16-4, V7); ptr16 -= _mm_popcnt_u32(imask7);
-        _mm_storeu_si64(ptr16-4, V6); ptr16 -= _mm_popcnt_u32(imask6);
-        _mm_storeu_si64(ptr16-4, V5); ptr16 -= _mm_popcnt_u32(imask5);
+        _mm_storeu_si64(ptr-8, V8); ptr -= _mm_popcnt_u32(imask8) * 2;
+        _mm_storeu_si64(ptr-8, V7); ptr -= _mm_popcnt_u32(imask7) * 2;
+        _mm_storeu_si64(ptr-8, V6); ptr -= _mm_popcnt_u32(imask6) * 2;
+        _mm_storeu_si64(ptr-8, V5); ptr -= _mm_popcnt_u32(imask5) * 2;
 
         Rv[H+3] = _mm_blendv_epi8(Rv[H+3], _mm_srli_epi32(Rv[H+3], 16), cv8);
         Rv[H+2] = _mm_blendv_epi8(Rv[H+2], _mm_srli_epi32(Rv[H+2], 16), cv7);
@@ -441,8 +440,6 @@ unsigned char *rans_compress_O0_32x16_sse4(unsigned char *in,
 
     STORE128v(Rv, ransN);
 
-    ptr = (uint8_t *)ptr16;
-
     for (z = NX-1; z >= 0; z--)
         RansEncFlush(&ransN[z], &ptr);
 
@@ -512,7 +509,7 @@ unsigned char *rans_uncompress_O0_32x16_sse4(unsigned char *in,
         goto err;
     }
 
-    uint16_t *sp = (uint16_t *)cp;
+    uint8_t *sp = cp;
     int out_end = (out_sz&~(NX-1));
     const uint32_t mask = (1u << TF_SHIFT)-1;
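In the SSE4 decoder below, each group of four lanes pulls its renorm words with an unaligned 128-bit load from the byte pointer, widens them to 32 bits, shuffles them to the lanes that need them, and advances `sp` by two bytes per consumed word. A reduced sketch of that sequence for one group (the `pidx` shuffle table is the library's; the function name and its exact parameter types are illustrative):

    #include <smmintrin.h>  // SSE4.1: _mm_cvtepu16_epi32
    #include <nmmintrin.h>  // SSE4.2: _mm_popcnt_u32
    #include <stdint.h>

    // One renorm gather: load 8 candidate 16-bit words, route the
    // ones whose lane-mask bit is set, advance sp by 2 bytes per
    // word actually used.
    static inline __m128i gather_renorm(const uint8_t **sp_io,
                                        unsigned mask,
                                        const uint8_t pidx[16][16]) {
        const uint8_t *sp = *sp_io;
        __m128i v = _mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)sp));
        v = _mm_shuffle_epi8(v, _mm_loadu_si128((const __m128i *)pidx[mask]));
        *sp_io = sp + _mm_popcnt_u32(mask) * 2;  // bytes, not words
        return v;
    }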
@@ -628,31 +625,31 @@ unsigned char *rans_uncompress_O0_32x16_sse4(unsigned char *in,
 
        // 72 = 7*8(imask1..7) + 16; worse case for 8th _mm_loadu_si128 call.
        // An extra 64 bytes is to avoid triggering this multiple times
        // after we swap sp/cp_end over.
-       if ((uint8_t *)sp+72 > cp_end) {
-           memmove(overflow, sp, cp_end - (uint8_t *)sp);
-           sp = (uint16_t *)overflow;
-           cp_end = (uint8_t *)overflow + sizeof(overflow);
+       if (cp_end - sp < 72) {
+           memmove(overflow, sp, cp_end - sp);
+           sp = overflow;
+           cp_end = overflow + sizeof(overflow);
        }
 
        // Shuffle the renorm values to correct lanes and incr sp pointer
        __m128i Vv1 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask1 = _mm_movemask_ps((__m128)renorm_mask1);
        Vv1 = _mm_shuffle_epi8(Vv1, _mm_load_si128((__m128i*)pidx[imask1]));
-       sp += _mm_popcnt_u32(imask1);
+       sp += _mm_popcnt_u32(imask1) * 2;
 
        __m128i Vv2 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask2 = _mm_movemask_ps((__m128)renorm_mask2);
-       sp += _mm_popcnt_u32(imask2);
+       sp += _mm_popcnt_u32(imask2) * 2;
        Vv2 = _mm_shuffle_epi8(Vv2, _mm_load_si128((__m128i*)pidx[imask2]));
 
        __m128i Vv3 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask3 = _mm_movemask_ps((__m128)renorm_mask3);
        Vv3 = _mm_shuffle_epi8(Vv3, _mm_load_si128((__m128i*)pidx[imask3]));
-       sp += _mm_popcnt_u32(imask3);
+       sp += _mm_popcnt_u32(imask3) * 2;
 
        __m128i Vv4 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask4 = _mm_movemask_ps((__m128)renorm_mask4);
-       sp += _mm_popcnt_u32(imask4);
+       sp += _mm_popcnt_u32(imask4) * 2;
        Vv4 = _mm_shuffle_epi8(Vv4, _mm_load_si128((__m128i*)pidx[imask4]));
 
        __m128i Yv1 = _mm_slli_epi32(Rv1, 16);
@@ -738,21 +735,21 @@ unsigned char *rans_uncompress_O0_32x16_sse4(unsigned char *in,
        __m128i Vv5 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask5 = _mm_movemask_ps((__m128)renorm_mask5);
        Vv5 = _mm_shuffle_epi8(Vv5, _mm_load_si128((__m128i*)pidx[imask5]));
-       sp += _mm_popcnt_u32(imask5);
+       sp += _mm_popcnt_u32(imask5) * 2;
 
        __m128i Vv6 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask6 = _mm_movemask_ps((__m128)renorm_mask6);
-       sp += _mm_popcnt_u32(imask6);
+       sp += _mm_popcnt_u32(imask6) * 2;
        Vv6 = _mm_shuffle_epi8(Vv6, _mm_load_si128((__m128i*)pidx[imask6]));
 
        __m128i Vv7 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask7 = _mm_movemask_ps((__m128)renorm_mask7);
        Vv7 = _mm_shuffle_epi8(Vv7, _mm_load_si128((__m128i*)pidx[imask7]));
-       sp += _mm_popcnt_u32(imask7);
+       sp += _mm_popcnt_u32(imask7) * 2;
 
        __m128i Vv8 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask8 = _mm_movemask_ps((__m128)renorm_mask8);
-       sp += _mm_popcnt_u32(imask8);
+       sp += _mm_popcnt_u32(imask8) * 2;
        Vv8 = _mm_shuffle_epi8(Vv8, _mm_load_si128((__m128i*)pidx[imask8]));
 
        __m128i Yv5 = _mm_slli_epi32(Rv5, 16);
@@ -1108,7 +1105,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
     // loop with shift as a variable.
     if (shift == TF_SHIFT_O1) { // TF_SHIFT_O1 = 12
 
-       uint16_t *sp = (uint16_t *)ptr;
+       uint8_t *sp = ptr;
        const uint32_t mask = ((1u << TF_SHIFT_O1)-1);
        __m128i maskv = _mm_set1_epi32(mask); // set mask in all lanes
        uint8_t tbuf[32][32];
@@ -1117,7 +1114,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
        LOAD128(Lv, l);
 
        isz4 -= 64;
-       for (; i4[0] < isz4 && (uint8_t *)sp+72 < ptr_end; ) {
+       for (; i4[0] < isz4 && ptr_end - sp > 72; ) {
           //for (z = 0; z < NX; z++)
           //  m[z] = R[z] & mask;
           __m128i masked1 = _mm_and_si128(Rv1, maskv);
@@ -1257,21 +1254,21 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
           __m128i Vv1 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask1 = _mm_movemask_ps((__m128)renorm_mask1);
           Vv1 = _mm_shuffle_epi8(Vv1, _mm_load_si128((__m128i*)pidx[imask1]));
-          sp += _mm_popcnt_u32(imask1);
+          sp += _mm_popcnt_u32(imask1) * 2;
 
           __m128i Vv2 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask2 = _mm_movemask_ps((__m128)renorm_mask2);
-          sp += _mm_popcnt_u32(imask2);
+          sp += _mm_popcnt_u32(imask2) * 2;
           Vv2 = _mm_shuffle_epi8(Vv2, _mm_load_si128((__m128i*)pidx[imask2]));
 
           __m128i Vv3 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask3 = _mm_movemask_ps((__m128)renorm_mask3);
           Vv3 = _mm_shuffle_epi8(Vv3, _mm_load_si128((__m128i*)pidx[imask3]));
-          sp += _mm_popcnt_u32(imask3);
+          sp += _mm_popcnt_u32(imask3) * 2;
 
           __m128i Vv4 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask4 = _mm_movemask_ps((__m128)renorm_mask4);
-          sp += _mm_popcnt_u32(imask4);
+          sp += _mm_popcnt_u32(imask4) * 2;
           Vv4 = _mm_shuffle_epi8(Vv4, _mm_load_si128((__m128i*)pidx[imask4]));
 
           __m128i Yv1 = _mm_slli_epi32(Rv1, 16);
@@ -1382,21 +1379,21 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
           __m128i Vv5 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask5 = _mm_movemask_ps((__m128)renorm_mask5);
           Vv5 = _mm_shuffle_epi8(Vv5, _mm_load_si128((__m128i*)pidx[imask5]));
-          sp += _mm_popcnt_u32(imask5);
+          sp += _mm_popcnt_u32(imask5) * 2;
 
           __m128i Vv6 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask6 = _mm_movemask_ps((__m128)renorm_mask6);
-          sp += _mm_popcnt_u32(imask6);
+          sp += _mm_popcnt_u32(imask6) * 2;
           Vv6 = _mm_shuffle_epi8(Vv6, _mm_load_si128((__m128i*)pidx[imask6]));
 
           __m128i Vv7 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask7 = _mm_movemask_ps((__m128)renorm_mask7);
           Vv7 = _mm_shuffle_epi8(Vv7, _mm_load_si128((__m128i*)pidx[imask7]));
-          sp += _mm_popcnt_u32(imask7);
+          sp += _mm_popcnt_u32(imask7) * 2;
 
           __m128i Vv8 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask8 = _mm_movemask_ps((__m128)renorm_mask8);
-          sp += _mm_popcnt_u32(imask8);
+          sp += _mm_popcnt_u32(imask8) * 2;
           Vv8 = _mm_shuffle_epi8(Vv8, _mm_load_si128((__m128i*)pidx[imask8]));
 
           __m128i Yv5 = _mm_slli_epi32(Rv5, 16);
@@ -1438,7 +1435,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
 
        STORE128(Rv, R);
        STORE128(Lv, l);
-       ptr = (uint8_t *)sp;
+       ptr = sp;
 
        i4[0]-=tidx;
        int T;
@@ -1478,7 +1475,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
        }
     } else { // TF_SHIFT_O1 = 10
-       uint16_t *sp = (uint16_t *)ptr;
+       uint8_t *sp = ptr;
        const uint32_t mask = ((1u << TF_SHIFT_O1_FAST)-1);
        __m128i maskv = _mm_set1_epi32(mask); // set mask in all lanes
        uint8_t tbuf[32][32] __attribute__((aligned(32)));
@@ -1487,7 +1484,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
        LOAD128(Lv, l);
 
        isz4 -= 64;
-       for (; i4[0] < isz4 && (uint8_t *)sp+72 < ptr_end; ) {
+       for (; i4[0] < isz4 && ptr_end - sp > 72; ) {
           //for (z = 0; z < NX; z++)
           //  m[z] = R[z] & mask;
           __m128i masked1 = _mm_and_si128(Rv1, maskv);
@@ -1607,21 +1604,21 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
           __m128i Vv1 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask1 = _mm_movemask_ps((__m128)renorm_mask1);
           Vv1 = _mm_shuffle_epi8(Vv1, _mm_load_si128((__m128i*)pidx[imask1]));
-          sp += _mm_popcnt_u32(imask1);
+          sp += _mm_popcnt_u32(imask1) * 2;
 
           __m128i Vv2 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask2 = _mm_movemask_ps((__m128)renorm_mask2);
-          sp += _mm_popcnt_u32(imask2);
+          sp += _mm_popcnt_u32(imask2) * 2;
           Vv2 = _mm_shuffle_epi8(Vv2, _mm_load_si128((__m128i*)pidx[imask2]));
 
           __m128i Vv3 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask3 = _mm_movemask_ps((__m128)renorm_mask3);
           Vv3 = _mm_shuffle_epi8(Vv3, _mm_load_si128((__m128i*)pidx[imask3]));
-          sp += _mm_popcnt_u32(imask3);
+          sp += _mm_popcnt_u32(imask3) * 2;
 
           __m128i Vv4 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask4 = _mm_movemask_ps((__m128)renorm_mask4);
-          sp += _mm_popcnt_u32(imask4);
+          sp += _mm_popcnt_u32(imask4) * 2;
           Vv4 = _mm_shuffle_epi8(Vv4, _mm_load_si128((__m128i*)pidx[imask4]));
 
           __m128i Yv1 = _mm_slli_epi32(Rv1, 16);
@@ -1722,21 +1719,21 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
           __m128i Vv5 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask5 = _mm_movemask_ps((__m128)renorm_mask5);
           Vv5 = _mm_shuffle_epi8(Vv5, _mm_load_si128((__m128i*)pidx[imask5]));
-          sp += _mm_popcnt_u32(imask5);
+          sp += _mm_popcnt_u32(imask5) * 2;
 
           __m128i Vv6 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask6 = _mm_movemask_ps((__m128)renorm_mask6);
-          sp += _mm_popcnt_u32(imask6);
+          sp += _mm_popcnt_u32(imask6) * 2;
           Vv6 = _mm_shuffle_epi8(Vv6, _mm_load_si128((__m128i*)pidx[imask6]));
 
           __m128i Vv7 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask7 = _mm_movemask_ps((__m128)renorm_mask7);
           Vv7 = _mm_shuffle_epi8(Vv7, _mm_load_si128((__m128i*)pidx[imask7]));
-          sp += _mm_popcnt_u32(imask7);
+          sp += _mm_popcnt_u32(imask7) * 2;
 
           __m128i Vv8 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask8 = _mm_movemask_ps((__m128)renorm_mask8);
-          sp += _mm_popcnt_u32(imask8);
+          sp += _mm_popcnt_u32(imask8) * 2;
           Vv8 = _mm_shuffle_epi8(Vv8, _mm_load_si128((__m128i*)pidx[imask8]));
 
           __m128i Yv5 = _mm_slli_epi32(Rv5, 16);
@@ -1777,7 +1774,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
 
        STORE128(Rv, R);
        STORE128(Lv, l);
-       ptr = (uint8_t *)sp;
+       ptr = sp;
 
        i4[0]-=tidx;
        int T;
diff --git a/htscodecs/rANS_static4x16pr.c b/htscodecs/rANS_static4x16pr.c
index d7fd00c..7b4d9bb 100644
--- a/htscodecs/rANS_static4x16pr.c
+++ b/htscodecs/rANS_static4x16pr.c
@@ -190,31 +190,10 @@ unsigned char *rans_compress_O0_4x16(unsigned char *in, unsigned int in_size,
         RansEncSymbol *s1 = &syms[in[i-3]];
         RansEncSymbol *s0 = &syms[in[i-4]];
 
-#if 1
         RansEncPutSymbol(&rans3, &ptr, s3);
         RansEncPutSymbol(&rans2, &ptr, s2);
         RansEncPutSymbol(&rans1, &ptr, s1);
         RansEncPutSymbol(&rans0, &ptr, s0);
-#else
-        // Slightly beter on gcc, much better on clang
-        uint16_t *ptr16 = (uint16_t *)ptr;
-
-        if (rans3 >= s3->x_max) *--ptr16 = (uint16_t)rans3, rans3 >>= 16;
-        if (rans2 >= s2->x_max) *--ptr16 = (uint16_t)rans2, rans2 >>= 16;
-        uint32_t q3 = (uint32_t) (((uint64_t)rans3 * s3->rcp_freq) >> s3->rcp_shift);
-        uint32_t q2 = (uint32_t) (((uint64_t)rans2 * s2->rcp_freq) >> s2->rcp_shift);
-        rans3 += s3->bias + q3 * s3->cmpl_freq;
-        rans2 += s2->bias + q2 * s2->cmpl_freq;
-
-        if (rans1 >= s1->x_max) *--ptr16 = (uint16_t)rans1, rans1 >>= 16;
-        if (rans0 >= s0->x_max) *--ptr16 = (uint16_t)rans0, rans0 >>= 16;
-        uint32_t q1 = (uint32_t) (((uint64_t)rans1 * s1->rcp_freq) >> s1->rcp_shift);
-        uint32_t q0 = (uint32_t) (((uint64_t)rans0 * s0->rcp_freq) >> s0->rcp_shift);
-        rans1 += s1->bias + q1 * s1->cmpl_freq;
-        rans0 += s0->bias + q0 * s0->cmpl_freq;
-
-        ptr = (uint8_t *)ptr16;
-#endif
     }
 
     RansEncFlush(&rans3, &ptr);
diff --git a/htscodecs/rANS_word.h b/htscodecs/rANS_word.h
index db60b04..a537cb0 100644
--- a/htscodecs/rANS_word.h
+++ b/htscodecs/rANS_word.h
@@ -77,8 +77,10 @@ static inline RansState RansEncRenorm(RansState x, uint8_t** pptr, uint32_t freq
 {
     uint32_t x_max = ((RANS_BYTE_L >> scale_bits) << 16) * freq-1; // this turns into a shift.
     if (x > x_max) {
-       uint16_t* ptr = (uint16_t *)*pptr;
-       *--ptr = (uint16_t) (x & 0xffff);
+       uint8_t* ptr = *pptr;
+       ptr -= 2;
+       ptr[0] = x & 0xff;
+       ptr[1] = (x >> 8) & 0xff;
        x >>= 16;
        *pptr = (uint8_t *)ptr;
     }
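The rANS_word.h hunk fixes the last place where a stream word was written through a `uint16_t *`, which is both an unaligned store and a native-endian one; writing the two bytes explicitly keeps the on-disk format little-endian on every host. On little-endian targets an equivalent form, which typically optimises to a single 16-bit store, uses memcpy instead. A hedged sketch of both (the helper name is illustrative; only `HTSCODECS_LITTLE_ENDIAN` is the library's):

    #include <stdint.h>
    #include <string.h>

    // Emit one little-endian 16-bit word, growing the stream
    // downwards.  The byte-wise form is endian-independent; the
    // memcpy form assumes a little-endian host but compiles to a
    // single unaligned-safe 16-bit store.
    static inline uint8_t *put_u16_le(uint8_t *ptr, uint32_t x) {
        ptr -= 2;
    #ifdef HTSCODECS_LITTLE_ENDIAN
        uint16_t w = (uint16_t)(x & 0xffff);
        memcpy(ptr, &w, 2);
    #else
        ptr[0] = x & 0xff;
        ptr[1] = (x >> 8) & 0xff;
    #endif
        return ptr;
    }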