diff --git a/htscodecs/rANS_static32x16pr.c b/htscodecs/rANS_static32x16pr.c
index 51ea554..2b3acc4 100644
--- a/htscodecs/rANS_static32x16pr.c
+++ b/htscodecs/rANS_static32x16pr.c
@@ -165,7 +165,6 @@ unsigned char *rans_compress_O0_32x16(unsigned char *in,
     } else {
         // Branchless version optimises poorly with gcc unless we have
         // AVX2 capability, so have a custom rewrite of it.
-        uint16_t* ptr16 = (uint16_t *)ptr;
         for (i=(in_size &~(NX-1)); likely(i>0); i-=NX) {
             // Unrolled copy of below, because gcc doesn't optimise this
             // well in the original form.
@@ -197,15 +196,15 @@ unsigned char *rans_compress_O0_32x16(unsigned char *in,
             int c1 = rp[3-1] > sy[1]->x_max;
 #ifdef HTSCODECS_LITTLE_ENDIAN
-            ptr16[-1] = rp[3-0]; ptr16 -= c0;
-            ptr16[-1] = rp[3-1]; ptr16 -= c1;
+            memcpy(&ptr[-2], &rp[3-0], 2); ptr -= c0 * 2;
+            memcpy(&ptr[-2], &rp[3-1], 2); ptr -= c1 * 2;
 #else
-            ((uint8_t *)&ptr16[-1])[0] = rp[3-0];
-            ((uint8_t *)&ptr16[-1])[1] = rp[3-0]>>8;
-            ptr16 -= c0;
-            ((uint8_t *)&ptr16[-1])[0] = rp[3-1];
-            ((uint8_t *)&ptr16[-1])[1] = rp[3-1]>>8;
-            ptr16 -= c1;
+            ptr[-2] = rp[3-0];
+            ptr[-1] = rp[3-0]>>8;
+            ptr -= c0 * 2;
+            ptr[-2] = rp[3-1];
+            ptr[-1] = rp[3-1]>>8;
+            ptr -= c1 * 2;
 #endif
             rp[3-0] = c0 ? rp[3-0]>>16 : rp[3-0];
@@ -217,15 +216,15 @@ unsigned char *rans_compress_O0_32x16(unsigned char *in,
             int c2 = rp[3-2] > sy[2]->x_max;
             int c3 = rp[3-3] > sy[3]->x_max;
 #ifdef HTSCODECS_LITTLE_ENDIAN
-            ptr16[-1] = rp[3-2]; ptr16 -= c2;
-            ptr16[-1] = rp[3-3]; ptr16 -= c3;
+            memcpy(&ptr[-2], &rp[3-2], 2); ptr -= c2 * 2;
+            memcpy(&ptr[-2], &rp[3-3], 2); ptr -= c3 * 2;
 #else
-            ((uint8_t *)&ptr16[-1])[0] = rp[3-2];
-            ((uint8_t *)&ptr16[-1])[1] = rp[3-2]>>8;
-            ptr16 -= c2;
-            ((uint8_t *)&ptr16[-1])[0] = rp[3-3];
-            ((uint8_t *)&ptr16[-1])[1] = rp[3-3]>>8;
-            ptr16 -= c3;
+            ptr[-2] = rp[3-2];
+            ptr[-1] = rp[3-2]>>8;
+            ptr -= c2 * 2;
+            ptr[-2] = rp[3-3];
+            ptr[-1] = rp[3-3]>>8;
+            ptr -= c3 * 2;
 #endif
             rp[3-2] = c2 ? rp[3-2]>>16 : rp[3-2];
             rp[3-3] = c3 ? rp[3-3]>>16 : rp[3-3];
@@ -239,7 +238,6 @@ unsigned char *rans_compress_O0_32x16(unsigned char *in,
             }
             if (z < -1) abort();
         }
-        ptr = (uint8_t *)ptr16;
     }
 
     for (z = NX-1; z >= 0; z--)
         RansEncFlush(&ransN[z], &ptr);
@@ -476,7 +474,6 @@ unsigned char *rans_compress_O1_32x16(unsigned char *in,
         i32[i] = &in[iN[i]];
 
     for (; likely(i32[0] >= in); ) {
-        uint16_t *ptr16 = (uint16_t *)ptr;
         for (z = NX-1; z >= 0; z-=4) {
             RansEncSymbol *sy[4];
             int k;
@@ -490,12 +487,12 @@ unsigned char *rans_compress_O1_32x16(unsigned char *in,
             for (k = 0; k < 4; k++) {
                 int c = ransN[z-k] > sy[k]->x_max;
 #ifdef HTSCODECS_LITTLE_ENDIAN
-                ptr16[-1] = ransN[z-k];
+                memcpy(&ptr[-2], &ransN[z-k], 2);
 #else
-                ((uint8_t *)&ptr16[-1])[0] = ransN[z-k];
-                ((uint8_t *)&ptr16[-1])[1] = ransN[z-k]>>8;
+                ptr[-2] = ransN[z-k];
+                ptr[-1] = ransN[z-k]>>8;
 #endif
-                ptr16 -= c;
+                ptr -= c * 2;
                 //ransN[z-k] >>= c<<4;
                 ransN[z-k] = c ? ransN[z-k]>>16 : ransN[z-k];
             }
@@ -506,7 +503,6 @@ unsigned char *rans_compress_O1_32x16(unsigned char *in,
                 ransN[z-k] += sy[k]->bias + q*sy[k]->cmpl_freq;
             }
         }
-        ptr = (uint8_t *)ptr16;
     }
 
     for (z = NX-1; z>=0; z--)
diff --git a/htscodecs/rANS_static32x16pr_avx2.c b/htscodecs/rANS_static32x16pr_avx2.c
index cf04578..702d513 100644
--- a/htscodecs/rANS_static32x16pr_avx2.c
+++ b/htscodecs/rANS_static32x16pr_avx2.c
@@ -220,8 +220,6 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in,
     while (z-- > 0)
         RansEncPutSymbol(&ransN[z], &ptr, &syms[in[in_size-(i-z)]]);
 
-    uint16_t *ptr16 = (uint16_t *)ptr;
-
     LOAD(Rv, ransN);
 
     for (i=(in_size &~(NX-1)); i>0; i-=NX) {
@@ -306,7 +304,7 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in,
             _mm256_and_si256(sh[x+3], D)))
 
         // Renorm:
-        // if (x > x_max) {*--ptr16 = x & 0xffff; x >>= 16;}
+        // if (x > x_max) { ptr -= 2; *((uint16_t *)ptr) = x & 0xffff; x >>= 16; }
         __m256i xmax1 = SYM_LOAD( 0, xA, xB, xC, xD);
         __m256i xmax2 = SYM_LOAD( 4, xA, xB, xC, xD);
         __m256i xmax3 = SYM_LOAD( 8, xA, xB, xC, xD);
@@ -317,7 +315,7 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in,
         __m256i cv3 = _mm256_cmpgt_epi32(Rv3, xmax3);
         __m256i cv4 = _mm256_cmpgt_epi32(Rv4, xmax4);
 
-        // Store bottom 16-bits at ptr16
+        // Store bottom 16-bits at ptr
         unsigned int imask1 = _mm256_movemask_ps((__m256)cv1);
         unsigned int imask2 = _mm256_movemask_ps((__m256)cv2);
         unsigned int imask3 = _mm256_movemask_ps((__m256)cv3);
@@ -349,20 +347,20 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in,
 
         // Now we have bottom N 16-bit values in each V12/V34 to flush
         __m128i f = _mm256_extractf128_si256(V34, 1);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask4);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask4) * 2;
 
         f = _mm256_extractf128_si256(V34, 0);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask3);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask3) * 2;
 
         f = _mm256_extractf128_si256(V12, 1);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask2);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask2) * 2;
 
         f = _mm256_extractf128_si256(V12, 0);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask1);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask1) * 2;
 
         __m256i Rs;
         Rs = _mm256_srli_epi32(Rv1,16);
         Rv1 = _mm256_blendv_epi8(Rv1, Rs, cv1);
@@ -437,7 +435,6 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in,
 
     STORE(Rv, ransN);
 
-    ptr = (uint8_t *)ptr16;
     for (z = NX-1; z >= 0; z--)
         RansEncFlush(&ransN[z], &ptr);
 
@@ -506,15 +503,15 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         goto err;
     }
 
-    uint16_t *sp = (uint16_t *)cp;
+    uint8_t *sp = cp;
     uint8_t overflow[64+64] = {0};
     cp_end -= 64;
 
     // Protect against running off the end of in buffer.
     // We copy it to a worst-case local buffer when near the end.
-    if ((uint8_t *)sp > cp_end) {
-        memmove(overflow, sp, cp_end+64 - (uint8_t *)sp);
-        sp = (uint16_t *)overflow;
+    if (sp > cp_end) {
+        memmove(overflow, sp, cp_end+64 - sp);
+        sp = overflow;
         cp_end = overflow + sizeof(overflow) - 64;
     }
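On the decode side the same discipline applies in the opposite direction: `sp` is now a plain byte pointer, and every renorm consumes little-endian 16-bit words via unaligned-safe loads, advancing two bytes per word consumed. A minimal scalar sketch of that read path (the helper names here are illustrative, not part of the library; only `HTSCODECS_LITTLE_ENDIAN` comes from the code above):

    #include <stdint.h>
    #include <string.h>

    // Unaligned-safe read of one little-endian 16-bit renorm word.
    // memcpy typically compiles to a single load, but unlike
    // dereferencing a uint16_t * it imposes no alignment requirement.
    static inline uint32_t read_u16_le(const uint8_t *sp) {
    #ifdef HTSCODECS_LITTLE_ENDIAN
        uint16_t v;
        memcpy(&v, sp, 2);
        return v;
    #else
        return (uint32_t)sp[0] | ((uint32_t)sp[1] << 8);
    #endif
    }

    // Scalar model of one lane's renorm: a state below the lower
    // bound (RANS_BYTE_L in the real code) pulls in the next word.
    // Sketch only; the SIMD paths below do 32 lanes at a time with
    // masked loads and shuffles.
    static inline const uint8_t *renorm1(uint32_t *R, const uint8_t *sp,
                                         uint32_t lower_bound) {
        if (*R < lower_bound) {
            *R = (*R << 16) | read_u16_le(sp);
            sp += 2;
        }
        return sp;
    }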
@@ -569,9 +566,9 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
 
         // Protect against running off the end of in buffer.
         // We copy it to a worst-case local buffer when near the end.
-        if ((uint8_t *)sp > cp_end) {
-            memmove(overflow, sp, cp_end+64 - (uint8_t *)sp);
-            sp = (uint16_t *)overflow;
+        if (sp > cp_end) {
+            memmove(overflow, sp, cp_end+64 - sp);
+            sp = overflow;
             cp_end = overflow + sizeof(overflow) - 64;
         }
 
@@ -600,11 +597,11 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
 
         // Shuffle the renorm values to correct lanes and incr sp pointer
         unsigned int imask2 = _mm256_movemask_ps((__m256)renorm_mask2);
-        sp += _mm_popcnt_u32(imask1);
+        sp += _mm_popcnt_u32(imask1) * 2;
 
         __m256i idx2 = _mm256_load_si256((const __m256i*)permute[imask2]);
         __m256i Vv2 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
-        sp += _mm_popcnt_u32(imask2);
+        sp += _mm_popcnt_u32(imask2) * 2;
 
         Yv1 = _mm256_or_si256(Yv1, Vv1);
         Vv2 = _mm256_permutevar8x32_epi32(Vv2, idx2);
@@ -662,7 +659,7 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         Vv3 = _mm256_permutevar8x32_epi32(Vv3, idx3);
         __m256i Yv4 = _mm256_slli_epi32(Rv4, 16);
         unsigned int imask4 = _mm256_movemask_ps((__m256)renorm_mask4);
-        sp += _mm_popcnt_u32(imask3);
+        sp += _mm_popcnt_u32(imask3) * 2;
         __m256i idx4 = _mm256_load_si256((const __m256i*)permute[imask4]);
         __m256i Vv4 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
 
@@ -671,7 +668,7 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in,
         Yv3 = _mm256_or_si256(Yv3, Vv3);
         Vv4 = _mm256_permutevar8x32_epi32(Vv4, idx4);
         Yv4 = _mm256_or_si256(Yv4, Vv4);
-        sp += _mm_popcnt_u32(imask4);
+        sp += _mm_popcnt_u32(imask4) * 2;
 
         // R[z] = c ? Y[z] : R[z];
         Rv3 = _mm256_blendv_epi8(Rv3, Yv3, renorm_mask3);
@@ -757,8 +754,6 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         lN[z] = c;
     }
 
-    uint16_t *ptr16 = (uint16_t *)ptr;
-
     // clang16              clang10              gcc7                 gcc13
     // 587 435 381          588 438 403          504 386 415          527 381 394
     // simT 611 432 402     475 401 367          472 422 386          486 353 324
@@ -865,7 +860,8 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         // ------------------------------------------------------------
         // for (z = NX-1; z >= 0; z--) {
         //     if (ransN[z] >= x_max[z]) {
-        //         *--ptr16 = ransN[z] & 0xffff;
+        //         ptr -= 2;
+        //         *((uint16_t *)ptr) = ransN[z] & 0xffff;
         //         ransN[z] >>= 16;
         //     }
         // }
@@ -874,10 +870,13 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
         __m256i cv3 = _mm256_cmpgt_epi32(Rv3, xmaxv[2]);
         __m256i cv4 = _mm256_cmpgt_epi32(Rv4, xmaxv[3]);
 
-        // Store bottom 16-bits at ptr16
+        // Store bottom 16-bits at ptr
         //
         // for (z = NX-1; z >= 0; z--) {
-        //     if (cond[z]) *--ptr16 = (uint16_t )(ransN[z] & 0xffff);
+        //     if (cond[z]) {
+        //         ptr -= 2;
+        //         *((uint16_t *)ptr) = (ransN[z] & 0xffff);
+        //     }
         // }
         unsigned int imask1 = _mm256_movemask_ps((__m256)cv1);
         unsigned int imask2 = _mm256_movemask_ps((__m256)cv2);
@@ -910,20 +909,20 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
 
         // Now we have bottom N 16-bit values in each V12/V34 to flush
         __m128i f = _mm256_extractf128_si256(V34, 1);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask4);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask4) * 2;
 
         f = _mm256_extractf128_si256(V34, 0);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask3);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask3) * 2;
 
         f = _mm256_extractf128_si256(V12, 1);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask2);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask2) * 2;
 
         f = _mm256_extractf128_si256(V12, 0);
-        _mm_storeu_si128((__m128i *)(ptr16-8), f);
-        ptr16 -= _mm_popcnt_u32(imask1);
+        _mm_storeu_si128((__m128i *)(ptr-16), f);
+        ptr -= _mm_popcnt_u32(imask1) * 2;
 
         __m256i Rs1, Rs2, Rs3, Rs4;
         Rs1 = _mm256_srli_epi32(Rv1,16);
@@ -995,8 +994,6 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si
 
     STORE(Rv, ransN);
 
-    ptr = (uint8_t *)ptr16;
-
     for (z = NX-1; z>=0; z--)
         RansEncPutSymbol(&ransN[z], &ptr, &syms[0][lN[z]]);
 
@@ -1106,7 +1103,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
     for (z = 0; z < NX; z++)
         iN[z] = z*isz4;
 
-    uint16_t *sp = (uint16_t *)ptr;
+    uint8_t *sp = ptr;
     const uint32_t mask = (1u << shift)-1;
 
     __m256i maskv = _mm256_set1_epi32(mask);
@@ -1127,7 +1124,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
     if (shift == TF_SHIFT_O1) {
         isz4 -= 64;
-        for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
+        for (; iN[0] < isz4 && ptr_end - sp > 64; ) {
            // m[z] = R[z] & mask;
            __m256i masked1 = _mm256_and_si256(Rv1, maskv);
            __m256i masked2 = _mm256_and_si256(Rv2, maskv);
@@ -1240,12 +1237,12 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
            unsigned int imask2 = _mm256_movemask_ps((__m256)renorm_mask2);
            Vv1 = _mm256_permutevar8x32_epi32(Vv1, idx1);
-           sp += _mm_popcnt_u32(imask1);
+           sp += _mm_popcnt_u32(imask1) * 2;
            __m256i idx2 = _mm256_load_si256((const __m256i*)permute[imask2]);
            __m256i Vv2 = _mm256_cvtepu16_epi32(
                _mm_loadu_si128((__m128i *)sp));
-           sp += _mm_popcnt_u32(imask2);
+           sp += _mm_popcnt_u32(imask2) * 2;
            Vv2 = _mm256_permutevar8x32_epi32(Vv2, idx2);
 
            Yv1 = _mm256_or_si256(Yv1, Vv1);
@@ -1300,7 +1297,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
            unsigned int imask3 = _mm256_movemask_ps((__m256)renorm_mask3);
            unsigned int imask4 = _mm256_movemask_ps((__m256)renorm_mask4);
            __m256i idx3 = _mm256_load_si256((const __m256i*)permute[imask3]);
-           sp += _mm_popcnt_u32(imask3);
+           sp += _mm_popcnt_u32(imask3) * 2;
            Vv3 = _mm256_permutevar8x32_epi32(Vv3, idx3);
 
            sv3 = _mm256_packus_epi32(sv3, sv4);
@@ -1328,7 +1325,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
 
            Vv4 = _mm256_permutevar8x32_epi32(Vv4, idx4);
            Yv4 = _mm256_or_si256(Yv4, Vv4);
-           sp += _mm_popcnt_u32(imask4);
+           sp += _mm_popcnt_u32(imask4) * 2;
 
            Rv3 = _mm256_blendv_epi8(Rv3, Yv3, renorm_mask3);
            Rv4 = _mm256_blendv_epi8(Rv4, Yv4, renorm_mask4);
@@ -1338,7 +1335,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
 
        STORE(Rv, R);
        STORE(Lv, lN);
-       ptr = (uint8_t *)sp;
+       ptr = sp;
 
        if (1) {
            iN[0]-=tidx;
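Rewriting the loop guard from `(uint8_t *)sp+64 < ptr_end` to `ptr_end - sp > 64` is not purely cosmetic. Once `sp` is within 64 bytes of the end of the allocation, forming `sp + 64` can point beyond one-past-the-end, which is undefined behaviour in C even if the pointer is never dereferenced; the subtraction form stays within defined pointer arithmetic whenever `sp <= ptr_end`. A small sketch of the safe form (function name is illustrative):

    #include <stdint.h>

    // "More than 64 bytes left" without ever forming an
    // out-of-bounds pointer.  Assumes sp and ptr_end address the
    // same buffer with sp <= ptr_end, which the callers establish.
    static inline int have_renorm_bytes(const uint8_t *sp,
                                        const uint8_t *ptr_end) {
        // NOT: sp + 64 < ptr_end  (sp+64 may be out of bounds)
        return ptr_end - sp > 64;
    }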
@@ -1383,7 +1380,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
        // SIMD version ends decoding early as it reads at most 64 bytes
        // from input via 4 vectorised loads.
        isz4 -= 64;
-       for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
+       for (; iN[0] < isz4 && ptr_end - sp > 64; ) {
           // m[z] = R[z] & mask;
           __m256i masked1 = _mm256_and_si256(Rv1, maskv);
           __m256i masked2 = _mm256_and_si256(Rv2, maskv);
@@ -1475,12 +1472,12 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
           unsigned int imask2 = _mm256_movemask_ps((__m256)renorm_mask2);
           Vv1 = _mm256_permutevar8x32_epi32(Vv1, idx1);
-          sp += _mm_popcnt_u32(imask1);
+          sp += _mm_popcnt_u32(imask1) * 2;
           __m256i idx2 = _mm256_load_si256((const __m256i*)permute[imask2]);
           __m256i Vv2 = _mm256_cvtepu16_epi32(
               _mm_loadu_si128((__m128i *)sp));
-          sp += _mm_popcnt_u32(imask2);
+          sp += _mm_popcnt_u32(imask2) * 2;
           Vv2 = _mm256_permutevar8x32_epi32(Vv2, idx2);
 
           Yv1 = _mm256_or_si256(Yv1, Vv1);
@@ -1525,7 +1522,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
           unsigned int imask3 = _mm256_movemask_ps((__m256)renorm_mask3);
           unsigned int imask4 = _mm256_movemask_ps((__m256)renorm_mask4);
           __m256i idx3 = _mm256_load_si256((const __m256i*)permute[imask3]);
-          sp += _mm_popcnt_u32(imask3);
+          sp += _mm_popcnt_u32(imask3) * 2;
           Vv3 = _mm256_permutevar8x32_epi32(Vv3, idx3);
 
           // sv3 sv4 are 32-bit ints with lowest bit being char
@@ -1562,7 +1559,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
 
           Vv4 = _mm256_permutevar8x32_epi32(Vv4, idx4);
           Yv4 = _mm256_or_si256(Yv4, Vv4);
-          sp += _mm_popcnt_u32(imask4);
+          sp += _mm_popcnt_u32(imask4) * 2;
 
           Rv3 = _mm256_blendv_epi8(Rv3, Yv3, renorm_mask3);
           Rv4 = _mm256_blendv_epi8(Rv4, Yv4, renorm_mask4);
@@ -1571,7 +1568,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
 
        STORE(Rv, R);
        STORE(Lv, lN);
-       ptr = (uint8_t *)sp;
+       ptr = sp;
 
        if (1) {
            iN[0]-=tidx;
diff --git a/htscodecs/rANS_static32x16pr_avx512.c b/htscodecs/rANS_static32x16pr_avx512.c
index 3563f83..c9717f0 100644
--- a/htscodecs/rANS_static32x16pr_avx512.c
+++ b/htscodecs/rANS_static32x16pr_avx512.c
@@ -204,7 +204,6 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
 
     LOAD512(Rv, ransN);
 
-    uint16_t *ptr16 = (uint16_t *)ptr;
     for (i=(in_size &~(32-1)); i>0; i-=32) {
         uint8_t *c = &in[i-32];
 
@@ -223,20 +222,20 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
         SET512(xmax, SB);
 
         uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1);
-        int pc1 = _mm_popcnt_u32(gt_mask1);
+        int pc1 = _mm_popcnt_u32(gt_mask1) * 2;
         __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff));
         __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff));
         uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2);
         SET512(SDv, SD);
-        int pc2 = _mm_popcnt_u32(gt_mask2);
+        int pc2 = _mm_popcnt_u32(gt_mask2) * 2;
 
         Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
         Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);
 
-        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2);
-        ptr16 -= pc2;
-        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1);
-        ptr16 -= pc1;
+        _mm512_mask_cvtepi32_storeu_epi16(ptr-pc2, (1<<(pc2>>1))-1, Rp2);
+        ptr -= pc2;
+        _mm512_mask_cvtepi32_storeu_epi16(ptr-pc1, (1<<(pc1>>1))-1, Rp1);
+        ptr -= pc1;
@@ … @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
 
     STORE512(Rv, ransN);
 
-    ptr = (uint8_t *)ptr16;
     for (z = NX-1; z >= 0; z--)
         RansEncFlush(&ransN[z], &ptr);
@@ -359,7 +357,7 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
         goto err;
     }
 
-    uint16_t *sp = (uint16_t *)cp;
+    uint8_t *sp = cp;
     int out_end = (out_sz&~(32-1));
     const uint32_t mask = (1u << TF_SHIFT)-1;
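With `pc1`/`pc2` now counting bytes rather than elements, the compress-store mask is rebuilt from the element count (`pc >> 1`) while the pointer moves by the byte count. A scalar model of what the masked compress plus 32-to-16-bit store computes for one vector of lanes, illustrative only (the helper name is not the library's):

    #include <stdint.h>
    #include <string.h>

    // Scalar model of the AVX-512 renorm flush: for lanes whose
    // state exceeds x_max, emit the low 16 bits, highest lane first
    // (the stream is written backwards).  Equivalent to
    // _mm512_maskz_compress_epi32 followed by a masked 32->16 bit
    // store at ptr - pc, then ptr -= pc.
    static uint8_t *flush_lanes16(uint32_t R[16], const uint32_t x_max[16],
                                  uint8_t *ptr) {
        for (int z = 15; z >= 0; z--) {
            if (R[z] > x_max[z]) {
                uint16_t w = (uint16_t)(R[z] & 0xffff);
                ptr -= 2;
                memcpy(ptr, &w, 2);   // byte-level store, no alignment need
                R[z] >>= 16;
            }
        }
        return ptr;
    }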
@@ -381,9 +379,9 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
 
         // Protect against running off the end of in buffer.
         // We copy it to a worst-case local buffer when near the end.
-        if ((uint8_t *)sp+64 > cp_end) {
-            memmove(overflow, sp, cp_end - (uint8_t *)sp);
-            sp = (uint16_t *)overflow;
+        if (cp_end - sp < 64) {
+            memmove(overflow, sp, cp_end - sp);
+            sp = overflow;
             cp_end = overflow + sizeof(overflow);
         }
 
@@ -410,7 +408,7 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
         // renorm. this is the interesting part:
         renorm_mask2=_mm512_cmplt_epu32_mask(R2, _mm512_set1_epi32(RANS_BYTE_L));
         // advance by however many words we actually read
-        sp += _mm_popcnt_u32(renorm_mask1);
+        sp += _mm_popcnt_u32(renorm_mask1) * 2;
         __m512i renorm_words2 = _mm512_cvtepu16_epi32(_mm256_loadu_si256(
                                     (const __m256i *)sp));
 
@@ -440,7 +438,7 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
         renorm_vals2 = _mm512_maskz_and_epi32(renorm_mask2, renorm_vals2, m16);
         // advance by however many words we actually read
-        sp += _mm_popcnt_u32(renorm_mask2);
+        sp += _mm_popcnt_u32(renorm_mask2) * 2;
 
         R1 = _mm512_add_epi32(R1, renorm_vals1);
         R2 = _mm512_add_epi32(R2, renorm_vals2);
@@ -575,7 +573,6 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
 
     LOAD512(Rv, ransN);
 
-    uint16_t *ptr16 = (uint16_t *)ptr;
     LOAD512(iN, iN);
     LOAD512(last, lN);
 
@@ -621,7 +618,8 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
         // ------------------------------------------------------------
         // for (z = NX-1; z >= 0; z--) {
         //     if (ransN[z] >= x_max[z]) {
-        //         *--ptr16 = ransN[z] & 0xffff;
+        //         ptr -= 2;
+        //         *((uint16_t *) ptr) = ransN[z] & 0xffff;
         //         ransN[z] >>= 16;
         //     }
         // }
@@ -690,20 +688,20 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
         SET512x(xmax, x_max); // high latency
 
         uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1);
-        int pc1 = _mm_popcnt_u32(gt_mask1);
+        int pc1 = _mm_popcnt_u32(gt_mask1) * 2;
         __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff));
         __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff));
         uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2);
         SET512x(SDv, cmpl_freq); // good
-        int pc2 = _mm_popcnt_u32(gt_mask2);
+        int pc2 = _mm_popcnt_u32(gt_mask2) * 2;
 
         Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
         Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);
 
-        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2);
-        ptr16 -= pc2;
-        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1);
-        ptr16 -= pc1;
+        _mm512_mask_cvtepi32_storeu_epi16(ptr-pc2, (1<<(pc2>>1))-1, Rp2);
+        ptr -= pc2;
+        _mm512_mask_cvtepi32_storeu_epi16(ptr-pc1, (1<<(pc1>>1))-1, Rp1);
+        ptr -= pc1;
@@ … @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
 
     STORE512(Rv, ransN);
 
-    ptr = (uint8_t *)ptr16;
-
     for (z = NX-1; z>=0; z--)
         RansEncPutSymbol(&ransN[z], &ptr, &syms[0][lN[z]]);
@@ -845,7 +841,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
     for (z = 0; z < NX; z++)
         iN[z] = z*isz4;
 
-    uint16_t *sp = (uint16_t *)ptr;
+    uint8_t *sp = ptr;
     const uint32_t mask = (1u << shift)-1;
 
     __m512i _maskv = _mm512_set1_epi32(mask);
@@ -865,7 +861,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
     if (shift == TF_SHIFT_O1) {
         isz4 -= 64;
-        for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
+        for (; iN[0] < isz4 && ptr_end - sp > 64; ) {
            // m[z] = R[z] & mask;
            __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv);
            __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv);
@@ -939,11 +935,11 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
 
            __m512i renorm_words1 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
-           sp += _mm_popcnt_u32(_imask1);
+           sp += _mm_popcnt_u32(_imask1) * 2;
 
            __m512i renorm_words2 = _mm512_cvtepu16_epi32
                (_mm256_loadu_si256((const __m256i *)sp));
-           sp += _mm_popcnt_u32(_imask2);
+           sp += _mm_popcnt_u32(_imask2) * 2;
 
            __m512i _renorm_vals1 =
                _mm512_maskz_expand_epi32(_imask1, renorm_words1);
@@ -985,7 +981,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
 
        STORE512(_Rv, R);
        STORE512(_Lv, lN);
-       ptr = (uint8_t *)sp;
+       ptr = sp;
 
        if (1) {
            iN[0]-=tidx;
@@ -1034,7 +1030,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
        // SIMD version ends decoding early as it reads at most 64 bytes
        // from input via 4 vectorised loads.
        isz4 -= 64;
-       for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
+       for (; iN[0] < isz4 && ptr_end - sp > 64; ) {
           // m[z] = R[z] & mask;
           __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv);
           __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv);
@@ -1133,7 +1129,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
 
        STORE512(_Rv, R);
        STORE512(_Lv, lN);
-       ptr = (uint8_t *)sp;
+       ptr = sp;
 
        if (1) {
            iN[0]-=tidx;
diff --git a/htscodecs/rANS_static32x16pr_neon.c b/htscodecs/rANS_static32x16pr_neon.c
index 52e7d2f..817bb30 100644
--- a/htscodecs/rANS_static32x16pr_neon.c
+++ b/htscodecs/rANS_static32x16pr_neon.c
@@ -665,7 +665,7 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in,
     // 500MB/s.  Clang does a lot of reordering of this code, removing some
     // of the manual tuning benefits.  Short of dropping to assembly, for now
     // I would recommend using gcc to compile this file.
-    uint16_t *sp = (uint16_t *)cp;
+    uint8_t *sp = cp;
     uint8_t overflow[64+64] = {0};
     for (i=0; i < out_end; i+=NX) {
         // Decode freq, bias and symbol from s3 lookups
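On NEON the byte pointer pairs naturally with `vld1q_u8`, which carries no alignment expectation, and the `vqtbl1_u8` table lookup used below operates on byte vectors anyway, so the renorm data can stay in `uint8x16_t` form until it is reinterpreted as 16-bit lanes. A small sketch of that sequence (assumes a little-endian AArch64 target, as the surrounding lane arithmetic already does; the function name and idx table shape are illustrative):

    #include <arm_neon.h>
    #include <stdint.h>

    // Load 8 renorm words as 16 bytes, route the wanted words to
    // their lanes with a byte shuffle, then view them as 4 x u16.
    static inline uint16x4_t load_norm(const uint8_t *sp, uint8x8_t idx) {
        uint8x16_t raw = vld1q_u8(sp);          // unaligned-safe load
        uint8x8_t  sel = vqtbl1_u8(raw, idx);   // per-byte shuffle
        return vreinterpret_u16_u8(sel);        // reinterpret, no move
    }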
@@ -770,16 +770,20 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in,
 
         // Protect against running off the end of in buffer.
         // We copy it to a worst-case local buffer when near the end.
-        if ((uint8_t *)sp+64 > cp_end) {
-            memmove(overflow, sp, cp_end - (uint8_t *)sp);
-            sp = (uint16_t *)overflow;
+        if (cp_end - sp < 64) {
+            memmove(overflow, sp, cp_end - sp);
+            sp = overflow;
             cp_end = overflow + sizeof(overflow);
         }
 
-        uint16x8_t norm12 = vld1q_u16(sp);
-        sp += nbits[imask1] + nbits[imask2];
-        uint16x8_t norm34 = vld1q_u16(sp);
-        sp += nbits[imask3] + nbits[imask4];
+        // load 8 16-bit lanes of renorm data (loaded as 8-bit for
+        // alignment purposes, but will be shuffled and cast to
+        // 16 bit below).
+
+        uint8x16_t norm12 = vld1q_u8(sp);
+        sp += (nbits[imask1] + nbits[imask2]) * 2;
+        uint8x16_t norm34 = vld1q_u8(sp);
+        sp += (nbits[imask3] + nbits[imask4]) * 2;
 
         Bv5 = vandq_u32(Bv5, maskv);
         Bv6 = vandq_u32(Bv6, maskv);
@@ -799,14 +803,14 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in,
         uint32_t imask12 = (imask1<<4)|imask2;
         uint32_t imask34 = (imask3<<4)|imask4;
 
+        // Shuffle norm to the corresponding R lanes, via imask
         // #define for brevity and formatting
 #define cast_u16_u8 vreinterpret_u16_u8
-#define cast_u8_u16 vreinterpretq_u8_u16
         uint16x4_t norm1, norm2, norm3, norm4, norm5, norm6, norm7, norm8;
-        norm1 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),idx [imask1]));
-        norm2 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),idx2[imask12]));
-        norm3 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),idx [imask3]));
-        norm4 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),idx2[imask34]));
+        norm1 = cast_u16_u8(vqtbl1_u8(norm12,idx [imask1]));
+        norm2 = cast_u16_u8(vqtbl1_u8(norm12,idx2[imask12]));
+        norm3 = cast_u16_u8(vqtbl1_u8(norm34,idx [imask3]));
+        norm4 = cast_u16_u8(vqtbl1_u8(norm34,idx2[imask34]));
 
         uint32x4_t Rlt5 = vcltq_u32(Rv5, vdupq_n_u32(RANS_BYTE_L));
         uint32x4_t Rlt6 = vcltq_u32(Rv6, vdupq_n_u32(RANS_BYTE_L));
@@ -819,15 +823,15 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in,
         uint32x4_t Rsl3 = vshlq_n_u32(Rv3, 16);
         uint32x4_t Rsl4 = vshlq_n_u32(Rv4, 16);
 
-        uint16x8_t norm56 = vld1q_u16(sp);
+        uint8x16_t norm56 = vld1q_u8(sp);
         uint32_t imask5 = vaddvq_u32(vandq_u32(Rlt5, bit));
         uint32_t imask6 = vaddvq_u32(vandq_u32(Rlt6, bit));
         uint32_t imask7 = vaddvq_u32(vandq_u32(Rlt7, bit));
         uint32_t imask8 = vaddvq_u32(vandq_u32(Rlt8, bit));
-        sp += nbits[imask5] + nbits[imask6];
-        uint16x8_t norm78 = vld1q_u16(sp);
-        sp += nbits[imask7] + nbits[imask8];
+        sp += (nbits[imask5] + nbits[imask6]) * 2;
+        uint8x16_t norm78 = vld1q_u8(sp);
+        sp += (nbits[imask7] + nbits[imask8]) * 2;
 
         Rsl1 = vaddw_u16(Rsl1, norm1); // Rsl += norm
         Rsl2 = vaddw_u16(Rsl2, norm2);
@@ -842,10 +846,10 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in,
         Rv3 = vbslq_u32(Rlt3, Rsl3, Rv3);
         Rv4 = vbslq_u32(Rlt4, Rsl4, Rv4);
 
-        norm5 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm56),idx [imask5]));
-        norm6 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm56),idx2[imask56]));
-        norm7 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm78),idx [imask7]));
-        norm8 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm78),idx2[imask78]));
+        norm5 = cast_u16_u8(vqtbl1_u8(norm56,idx [imask5]));
+        norm6 = cast_u16_u8(vqtbl1_u8(norm56,idx2[imask56]));
+        norm7 = cast_u16_u8(vqtbl1_u8(norm78,idx [imask7]));
+        norm8 = cast_u16_u8(vqtbl1_u8(norm78,idx2[imask78]));
 
         uint32x4_t Rsl5 = vshlq_n_u32(Rv5, 16);
         uint32x4_t Rsl6 = vshlq_n_u32(Rv6, 16);
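The `vaddw_u16(Rsl, norm)` lines above are the renorm merge itself: `Rsl` holds `R << 16`, whose low 16 bits are zero, so a widening add of the 16-bit word is identical to OR-ing it into the low half. A sketch of the identity, with a hypothetical helper name for clarity:

    #include <arm_neon.h>
    #include <stdint.h>

    // (R << 16) | norm  ==  (R << 16) + norm  because the low 16
    // bits of R<<16 are zero and norm < 2^16; vaddw_u16 widens the
    // u16 lanes and adds, giving the OR in a single instruction.
    static inline uint32x4_t renorm_merge(uint32x4_t R, uint16x4_t norm) {
        uint32x4_t Rsl = vshlq_n_u32(R, 16);  // low half all zero
        return vaddw_u16(Rsl, norm);          // == Rsl | widen(norm)
    }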
@@ -1551,7 +1555,7 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in,
     // Follow with 2nd copy doing scalar code instead?
     unsigned char tbuf[32][32] = {0};
     int tidx = 0;
-    for (; i4[0] < isz4 && ptr+64 < ptr_end;) {
+    for (; i4[0] < isz4 && ptr_end - ptr > 64;) {
        for (z = 0; z < NX; z+=16) {
            uint32x4_t Rv1 = vld1q_u32(&R[z+0]);
            uint32x4_t Rv2 = vld1q_u32(&R[z+4]);
@@ -1643,12 +1647,14 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in,
            uint32x4_t Rlt3 = vcltq_u32(Rv3, vdupq_n_u32(RANS_BYTE_L));
            uint32x4_t Rlt4 = vcltq_u32(Rv4, vdupq_n_u32(RANS_BYTE_L));
            uint32x4_t all2 = {2,2,2,2};
 
-           // load 8 lanes of renorm data
-           uint16x8_t norm12 = vld1q_u16((uint16_t *)ptr);
+           // load 8 16-bit lanes of renorm data (loaded as 8-bit for
+           // alignment purposes, but will be shuffled and cast to
+           // 16 bit below).
+           uint8x16_t norm12 = vld1q_u8(ptr);
            // move ptr by no. renorm lanes used
            ptr += vaddvq_u32(vandq_u32(Rlt1, all2))
                +  vaddvq_u32(vandq_u32(Rlt2, all2));
-           uint16x8_t norm34 = vld1q_u16((uint16_t *)ptr);
+           uint8x16_t norm34 = vld1q_u8(ptr);
            ptr += vaddvq_u32(vandq_u32(Rlt3, all2))
                +  vaddvq_u32(vandq_u32(Rlt4, all2));
@@ -1663,16 +1669,12 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in,
            uint32_t imask34 = (imask3<<4)|imask4;
 
            // Shuffle norm to the corresponding R lanes, via imask
-           // #define for brevity and formatting
+           // cast_u16_u8 #define for brevity and formatting
            uint16x4_t norm1, norm2, norm3, norm4;
-           norm1 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),
-                                         idx [imask1]));
-           norm2 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),
-                                         idx2[imask12]));
-           norm3 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),
-                                         idx [imask3]));
-           norm4 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),
-                                         idx2[imask34]));
+           norm1 = cast_u16_u8(vqtbl1_u8(norm12, idx [imask1]));
+           norm2 = cast_u16_u8(vqtbl1_u8(norm12, idx2[imask12]));
+           norm3 = cast_u16_u8(vqtbl1_u8(norm34, idx [imask3]));
+           norm4 = cast_u16_u8(vqtbl1_u8(norm34, idx2[imask34]));
 
            // Add norm to R<<16 and blend back in with R
            uint32x4_t Rsl1 = vshlq_n_u32(Rv1, 16); // Rsl = R << 16
@@ -1776,7 +1778,7 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in,
 
     uint32_t *S3 = (uint32_t *)s3;
 
-    for (; i4[0] < isz4 && ptr+64 < ptr_end;) {
+    for (; i4[0] < isz4 && ptr_end - ptr > 64;) {
        int Z = 0;
        for (z = 0; z < NX; z+=16, Z+=4) {
            // streamline these.  Could swap between two banks and pre-load
@@ -1852,23 +1854,25 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in,
            uint32_t imask3 = vaddvq_u32(vandq_u32(Rlt3, bit));
            uint32_t imask4 = vaddvq_u32(vandq_u32(Rlt4, bit));
 
-           // load 8 lanes of renorm data
-           uint16x8_t norm12 = vld1q_u16((uint16_t *)ptr);
+           // load 8 16-bit lanes of renorm data (loaded as 8-bit for
+           // alignment purposes, but will be shuffled and cast to
+           // 16 bit below).
+           uint8x16_t norm12 = vld1q_u8(ptr);
            // move ptr by no. renorm lanes used
            ptr += nbits[imask1] + nbits[imask2];
-           uint16x8_t norm34 = vld1q_u16((uint16_t *)ptr);
+           uint8x16_t norm34 = vld1q_u8(ptr);
            ptr += nbits[imask3] + nbits[imask4];
 
            uint32_t imask12 = (imask1<<4)|imask2;
            uint32_t imask34 = (imask3<<4)|imask4;
 
            // Shuffle norm to the corresponding R lanes, via imask
-           // #define for brevity and formatting
+           // cast_u16_u8 #define for brevity and formatting
            uint16x4_t norm1, norm2, norm3, norm4;
-           norm1 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),idx [imask1]));
-           norm2 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),idx2[imask12]));
-           norm3 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),idx [imask3]));
-           norm4 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),idx2[imask34]));
+           norm1 = cast_u16_u8(vqtbl1_u8(norm12,idx [imask1]));
+           norm2 = cast_u16_u8(vqtbl1_u8(norm12,idx2[imask12]));
+           norm3 = cast_u16_u8(vqtbl1_u8(norm34,idx [imask3]));
+           norm4 = cast_u16_u8(vqtbl1_u8(norm34,idx2[imask34]));
 
            // Add norm to R<<16 and blend back in with R
            uint32x4_t Rsl1 = vshlq_n_u32(RV[Z+0], 16); // Rsl = R << 16
diff --git a/htscodecs/rANS_static32x16pr_sse4.c b/htscodecs/rANS_static32x16pr_sse4.c
index a9b9c4b..7de193b 100644
--- a/htscodecs/rANS_static32x16pr_sse4.c
+++ b/htscodecs/rANS_static32x16pr_sse4.c
@@ -275,7 +275,6 @@ unsigned char *rans_compress_O0_32x16_sse4(unsigned char *in,
     uint32_t SB[256], SA[256], SD[256], SC[256];
 
     // Build lookup tables for SIMD encoding
-    uint16_t *ptr16 = (uint16_t *)ptr;
     for (i = 0; i < 256; i++) {
         SB[i] = syms[i].x_max;
         SA[i] = syms[i].rcp_freq;
@@ -315,7 +314,7 @@ unsigned char *rans_compress_O0_32x16_sse4(unsigned char *in,
         __m128i cv6 = _mm_cmpgt_epi32(Rv[H+1], xmax6);
         __m128i cv5 = _mm_cmpgt_epi32(Rv[H+0], xmax5);
 
-        // Store bottom 16-bits at ptr16
+        // Store bottom 16-bits at ptr
         unsigned int imask8 = _mm_movemask_ps((__m128)cv8);
         unsigned int imask7 = _mm_movemask_ps((__m128)cv7);
         unsigned int imask6 = _mm_movemask_ps((__m128)cv6);
@@ -367,10 +366,10 @@ unsigned char *rans_compress_O0_32x16_sse4(unsigned char *in,
         V6 = _mm_shuffle_epi8(V6, shuf);
         V5 = _mm_shuffle_epi8(V5, shuf);
 
-        _mm_storeu_si64(ptr16-4, V8); ptr16 -= _mm_popcnt_u32(imask8);
-        _mm_storeu_si64(ptr16-4, V7); ptr16 -= _mm_popcnt_u32(imask7);
-        _mm_storeu_si64(ptr16-4, V6); ptr16 -= _mm_popcnt_u32(imask6);
-        _mm_storeu_si64(ptr16-4, V5); ptr16 -= _mm_popcnt_u32(imask5);
+        _mm_storeu_si64(ptr-8, V8); ptr -= _mm_popcnt_u32(imask8) * 2;
+        _mm_storeu_si64(ptr-8, V7); ptr -= _mm_popcnt_u32(imask7) * 2;
+        _mm_storeu_si64(ptr-8, V6); ptr -= _mm_popcnt_u32(imask6) * 2;
+        _mm_storeu_si64(ptr-8, V5); ptr -= _mm_popcnt_u32(imask5) * 2;
 
         Rv[H+3] = _mm_blendv_epi8(Rv[H+3], _mm_srli_epi32(Rv[H+3], 16), cv8);
         Rv[H+2] = _mm_blendv_epi8(Rv[H+2], _mm_srli_epi32(Rv[H+2], 16), cv7);
@@ -441,8 +440,6 @@ unsigned char *rans_compress_O0_32x16_sse4(unsigned char *in,
 
     STORE128v(Rv, ransN);
 
-    ptr = (uint8_t *)ptr16;
-
     for (z = NX-1; z >= 0; z--)
         RansEncFlush(&ransN[z], &ptr);
 
@@ -512,7 +509,7 @@ unsigned char *rans_uncompress_O0_32x16_sse4(unsigned char *in,
         goto err;
     }
 
-    uint16_t *sp = (uint16_t *)cp;
+    uint8_t *sp = cp;
     int out_end = (out_sz&~(NX-1));
     const uint32_t mask = (1u << TF_SHIFT)-1;
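In the SSE4 decoder below, each group of four lanes pulls its renorm words with an unaligned 128-bit load from the byte pointer, widens them to 32 bits, shuffles them to the lanes that need them, and advances `sp` by two bytes per consumed word. A reduced sketch of that sequence for one group (the `pidx` shuffle table is the library's; the function name and its exact parameter types are illustrative):

    #include <smmintrin.h>  // SSE4.1: _mm_cvtepu16_epi32
    #include <nmmintrin.h>  // SSE4.2: _mm_popcnt_u32
    #include <stdint.h>

    // One renorm gather: load 8 candidate 16-bit words, route the
    // ones whose lane-mask bit is set, advance sp by 2 bytes per
    // word actually used.
    static inline __m128i gather_renorm(const uint8_t **sp_io,
                                        unsigned mask,
                                        const uint8_t pidx[16][16]) {
        const uint8_t *sp = *sp_io;
        __m128i v = _mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)sp));
        v = _mm_shuffle_epi8(v, _mm_loadu_si128((const __m128i *)pidx[mask]));
        *sp_io = sp + _mm_popcnt_u32(mask) * 2;  // bytes, not words
        return v;
    }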
@@ -628,31 +625,31 @@ unsigned char *rans_uncompress_O0_32x16_sse4(unsigned char *in,
 
        // 72 = 7*8(imask1..7) + 16; worse case for 8th _mm_loadu_si128 call.
        // An extra 64 bytes is to avoid triggering this multiple times
        // after we swap sp/cp_end over.
-       if ((uint8_t *)sp+72 > cp_end) {
-           memmove(overflow, sp, cp_end - (uint8_t *)sp);
-           sp = (uint16_t *)overflow;
-           cp_end = (uint8_t *)overflow + sizeof(overflow);
+       if (cp_end - sp < 72) {
+           memmove(overflow, sp, cp_end - sp);
+           sp = overflow;
+           cp_end = overflow + sizeof(overflow);
        }
 
        // Shuffle the renorm values to correct lanes and incr sp pointer
        __m128i Vv1 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask1 = _mm_movemask_ps((__m128)renorm_mask1);
        Vv1 = _mm_shuffle_epi8(Vv1, _mm_load_si128((__m128i*)pidx[imask1]));
-       sp += _mm_popcnt_u32(imask1);
+       sp += _mm_popcnt_u32(imask1) * 2;
 
        __m128i Vv2 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask2 = _mm_movemask_ps((__m128)renorm_mask2);
-       sp += _mm_popcnt_u32(imask2);
+       sp += _mm_popcnt_u32(imask2) * 2;
        Vv2 = _mm_shuffle_epi8(Vv2, _mm_load_si128((__m128i*)pidx[imask2]));
 
        __m128i Vv3 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask3 = _mm_movemask_ps((__m128)renorm_mask3);
        Vv3 = _mm_shuffle_epi8(Vv3, _mm_load_si128((__m128i*)pidx[imask3]));
-       sp += _mm_popcnt_u32(imask3);
+       sp += _mm_popcnt_u32(imask3) * 2;
 
        __m128i Vv4 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask4 = _mm_movemask_ps((__m128)renorm_mask4);
-       sp += _mm_popcnt_u32(imask4);
+       sp += _mm_popcnt_u32(imask4) * 2;
        Vv4 = _mm_shuffle_epi8(Vv4, _mm_load_si128((__m128i*)pidx[imask4]));
 
        __m128i Yv1 = _mm_slli_epi32(Rv1, 16);
@@ -738,21 +735,21 @@ unsigned char *rans_uncompress_O0_32x16_sse4(unsigned char *in,
        __m128i Vv5 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask5 = _mm_movemask_ps((__m128)renorm_mask5);
        Vv5 = _mm_shuffle_epi8(Vv5, _mm_load_si128((__m128i*)pidx[imask5]));
-       sp += _mm_popcnt_u32(imask5);
+       sp += _mm_popcnt_u32(imask5) * 2;
 
        __m128i Vv6 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask6 = _mm_movemask_ps((__m128)renorm_mask6);
-       sp += _mm_popcnt_u32(imask6);
+       sp += _mm_popcnt_u32(imask6) * 2;
        Vv6 = _mm_shuffle_epi8(Vv6, _mm_load_si128((__m128i*)pidx[imask6]));
 
        __m128i Vv7 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask7 = _mm_movemask_ps((__m128)renorm_mask7);
        Vv7 = _mm_shuffle_epi8(Vv7, _mm_load_si128((__m128i*)pidx[imask7]));
-       sp += _mm_popcnt_u32(imask7);
+       sp += _mm_popcnt_u32(imask7) * 2;
 
        __m128i Vv8 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
        unsigned int imask8 = _mm_movemask_ps((__m128)renorm_mask8);
-       sp += _mm_popcnt_u32(imask8);
+       sp += _mm_popcnt_u32(imask8) * 2;
        Vv8 = _mm_shuffle_epi8(Vv8, _mm_load_si128((__m128i*)pidx[imask8]));
 
        __m128i Yv5 = _mm_slli_epi32(Rv5, 16);
@@ -1108,7 +1105,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
     // loop with shift as a variable.
     if (shift == TF_SHIFT_O1) { // TF_SHIFT_O1 = 12
 
-       uint16_t *sp = (uint16_t *)ptr;
+       uint8_t *sp = ptr;
        const uint32_t mask = ((1u << TF_SHIFT_O1)-1);
        __m128i maskv = _mm_set1_epi32(mask); // set mask in all lanes
        uint8_t tbuf[32][32];
@@ -1117,7 +1114,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
        LOAD128(Lv, l);
 
        isz4 -= 64;
-       for (; i4[0] < isz4 && (uint8_t *)sp+72 < ptr_end; ) {
+       for (; i4[0] < isz4 && ptr_end - sp > 72; ) {
           //for (z = 0; z < NX; z++)
           //  m[z] = R[z] & mask;
           __m128i masked1 = _mm_and_si128(Rv1, maskv);
@@ -1257,21 +1254,21 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
           __m128i Vv1 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask1 = _mm_movemask_ps((__m128)renorm_mask1);
           Vv1 = _mm_shuffle_epi8(Vv1, _mm_load_si128((__m128i*)pidx[imask1]));
-          sp += _mm_popcnt_u32(imask1);
+          sp += _mm_popcnt_u32(imask1) * 2;
 
           __m128i Vv2 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask2 = _mm_movemask_ps((__m128)renorm_mask2);
-          sp += _mm_popcnt_u32(imask2);
+          sp += _mm_popcnt_u32(imask2) * 2;
           Vv2 = _mm_shuffle_epi8(Vv2, _mm_load_si128((__m128i*)pidx[imask2]));
 
           __m128i Vv3 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask3 = _mm_movemask_ps((__m128)renorm_mask3);
           Vv3 = _mm_shuffle_epi8(Vv3, _mm_load_si128((__m128i*)pidx[imask3]));
-          sp += _mm_popcnt_u32(imask3);
+          sp += _mm_popcnt_u32(imask3) * 2;
 
           __m128i Vv4 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask4 = _mm_movemask_ps((__m128)renorm_mask4);
-          sp += _mm_popcnt_u32(imask4);
+          sp += _mm_popcnt_u32(imask4) * 2;
           Vv4 = _mm_shuffle_epi8(Vv4, _mm_load_si128((__m128i*)pidx[imask4]));
 
           __m128i Yv1 = _mm_slli_epi32(Rv1, 16);
@@ -1382,21 +1379,21 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
           __m128i Vv5 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask5 = _mm_movemask_ps((__m128)renorm_mask5);
           Vv5 = _mm_shuffle_epi8(Vv5, _mm_load_si128((__m128i*)pidx[imask5]));
-          sp += _mm_popcnt_u32(imask5);
+          sp += _mm_popcnt_u32(imask5) * 2;
 
           __m128i Vv6 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask6 = _mm_movemask_ps((__m128)renorm_mask6);
-          sp += _mm_popcnt_u32(imask6);
+          sp += _mm_popcnt_u32(imask6) * 2;
           Vv6 = _mm_shuffle_epi8(Vv6, _mm_load_si128((__m128i*)pidx[imask6]));
 
           __m128i Vv7 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask7 = _mm_movemask_ps((__m128)renorm_mask7);
           Vv7 = _mm_shuffle_epi8(Vv7, _mm_load_si128((__m128i*)pidx[imask7]));
-          sp += _mm_popcnt_u32(imask7);
+          sp += _mm_popcnt_u32(imask7) * 2;
 
           __m128i Vv8 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask8 = _mm_movemask_ps((__m128)renorm_mask8);
-          sp += _mm_popcnt_u32(imask8);
+          sp += _mm_popcnt_u32(imask8) * 2;
           Vv8 = _mm_shuffle_epi8(Vv8, _mm_load_si128((__m128i*)pidx[imask8]));
 
           __m128i Yv5 = _mm_slli_epi32(Rv5, 16);
@@ -1438,7 +1435,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
 
        STORE128(Rv, R);
        STORE128(Lv, l);
-       ptr = (uint8_t *)sp;
+       ptr = sp;
 
        i4[0]-=tidx;
        int T;
@@ -1478,7 +1475,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
        }
     } else { // TF_SHIFT_O1 = 10
-       uint16_t *sp = (uint16_t *)ptr;
+       uint8_t *sp = ptr;
        const uint32_t mask = ((1u << TF_SHIFT_O1_FAST)-1);
        __m128i maskv = _mm_set1_epi32(mask); // set mask in all lanes
        uint8_t tbuf[32][32] __attribute__((aligned(32)));
@@ -1487,7 +1484,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
        LOAD128(Lv, l);
 
        isz4 -= 64;
-       for (; i4[0] < isz4 && (uint8_t *)sp+72 < ptr_end; ) {
+       for (; i4[0] < isz4 && ptr_end - sp > 72; ) {
           //for (z = 0; z < NX; z++)
           //  m[z] = R[z] & mask;
           __m128i masked1 = _mm_and_si128(Rv1, maskv);
@@ -1607,21 +1604,21 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
           __m128i Vv1 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask1 = _mm_movemask_ps((__m128)renorm_mask1);
           Vv1 = _mm_shuffle_epi8(Vv1, _mm_load_si128((__m128i*)pidx[imask1]));
-          sp += _mm_popcnt_u32(imask1);
+          sp += _mm_popcnt_u32(imask1) * 2;
 
           __m128i Vv2 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask2 = _mm_movemask_ps((__m128)renorm_mask2);
-          sp += _mm_popcnt_u32(imask2);
+          sp += _mm_popcnt_u32(imask2) * 2;
           Vv2 = _mm_shuffle_epi8(Vv2, _mm_load_si128((__m128i*)pidx[imask2]));
 
           __m128i Vv3 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask3 = _mm_movemask_ps((__m128)renorm_mask3);
           Vv3 = _mm_shuffle_epi8(Vv3, _mm_load_si128((__m128i*)pidx[imask3]));
-          sp += _mm_popcnt_u32(imask3);
+          sp += _mm_popcnt_u32(imask3) * 2;
 
           __m128i Vv4 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask4 = _mm_movemask_ps((__m128)renorm_mask4);
-          sp += _mm_popcnt_u32(imask4);
+          sp += _mm_popcnt_u32(imask4) * 2;
           Vv4 = _mm_shuffle_epi8(Vv4, _mm_load_si128((__m128i*)pidx[imask4]));
 
           __m128i Yv1 = _mm_slli_epi32(Rv1, 16);
@@ -1722,21 +1719,21 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
           __m128i Vv5 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask5 = _mm_movemask_ps((__m128)renorm_mask5);
           Vv5 = _mm_shuffle_epi8(Vv5, _mm_load_si128((__m128i*)pidx[imask5]));
-          sp += _mm_popcnt_u32(imask5);
+          sp += _mm_popcnt_u32(imask5) * 2;
 
           __m128i Vv6 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask6 = _mm_movemask_ps((__m128)renorm_mask6);
-          sp += _mm_popcnt_u32(imask6);
+          sp += _mm_popcnt_u32(imask6) * 2;
           Vv6 = _mm_shuffle_epi8(Vv6, _mm_load_si128((__m128i*)pidx[imask6]));
 
           __m128i Vv7 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask7 = _mm_movemask_ps((__m128)renorm_mask7);
           Vv7 = _mm_shuffle_epi8(Vv7, _mm_load_si128((__m128i*)pidx[imask7]));
-          sp += _mm_popcnt_u32(imask7);
+          sp += _mm_popcnt_u32(imask7) * 2;
 
           __m128i Vv8 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp));
           unsigned int imask8 = _mm_movemask_ps((__m128)renorm_mask8);
-          sp += _mm_popcnt_u32(imask8);
+          sp += _mm_popcnt_u32(imask8) * 2;
           Vv8 = _mm_shuffle_epi8(Vv8, _mm_load_si128((__m128i*)pidx[imask8]));
 
           __m128i Yv5 = _mm_slli_epi32(Rv5, 16);
@@ -1777,7 +1774,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
 
        STORE128(Rv, R);
        STORE128(Lv, l);
-       ptr = (uint8_t *)sp;
+       ptr = sp;
 
        i4[0]-=tidx;
        int T;
diff --git a/htscodecs/rANS_static4x16pr.c b/htscodecs/rANS_static4x16pr.c
index d7fd00c..7b4d9bb 100644
--- a/htscodecs/rANS_static4x16pr.c
+++ b/htscodecs/rANS_static4x16pr.c
@@ -190,31 +190,10 @@ unsigned char *rans_compress_O0_4x16(unsigned char *in, unsigned int in_size,
         RansEncSymbol *s1 = &syms[in[i-3]];
         RansEncSymbol *s0 = &syms[in[i-4]];
 
-#if 1
         RansEncPutSymbol(&rans3, &ptr, s3);
         RansEncPutSymbol(&rans2, &ptr, s2);
         RansEncPutSymbol(&rans1, &ptr, s1);
         RansEncPutSymbol(&rans0, &ptr, s0);
-#else
-        // Slightly beter on gcc, much better on clang
-        uint16_t *ptr16 = (uint16_t *)ptr;
-
-        if (rans3 >= s3->x_max) *--ptr16 = (uint16_t)rans3, rans3 >>= 16;
-        if (rans2 >= s2->x_max) *--ptr16 = (uint16_t)rans2, rans2 >>= 16;
-        uint32_t q3 = (uint32_t) (((uint64_t)rans3 * s3->rcp_freq) >> s3->rcp_shift);
-        uint32_t q2 = (uint32_t) (((uint64_t)rans2 * s2->rcp_freq) >> s2->rcp_shift);
-        rans3 += s3->bias + q3 * s3->cmpl_freq;
-        rans2 += s2->bias + q2 * s2->cmpl_freq;
-
-        if (rans1 >= s1->x_max) *--ptr16 = (uint16_t)rans1, rans1 >>= 16;
-        if (rans0 >= s0->x_max) *--ptr16 = (uint16_t)rans0, rans0 >>= 16;
-        uint32_t q1 = (uint32_t) (((uint64_t)rans1 * s1->rcp_freq) >> s1->rcp_shift);
-        uint32_t q0 = (uint32_t) (((uint64_t)rans0 * s0->rcp_freq) >> s0->rcp_shift);
-        rans1 += s1->bias + q1 * s1->cmpl_freq;
-        rans0 += s0->bias + q0 * s0->cmpl_freq;
-
-        ptr = (uint8_t *)ptr16;
-#endif
     }
 
     RansEncFlush(&rans3, &ptr);
diff --git a/htscodecs/rANS_word.h b/htscodecs/rANS_word.h
index db60b04..a537cb0 100644
--- a/htscodecs/rANS_word.h
+++ b/htscodecs/rANS_word.h
@@ -77,8 +77,10 @@ static inline RansState RansEncRenorm(RansState x, uint8_t** pptr, uint32_t freq
 {
     uint32_t x_max = ((RANS_BYTE_L >> scale_bits) << 16) * freq-1; // this turns into a shift.
     if (x > x_max) {
-       uint16_t* ptr = (uint16_t *)*pptr;
-       *--ptr = (uint16_t) (x & 0xffff);
+       uint8_t* ptr = *pptr;
+       ptr -= 2;
+       ptr[0] = x & 0xff;
+       ptr[1] = (x >> 8) & 0xff;
        x >>= 16;
        *pptr = (uint8_t *)ptr;
     }
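The rANS_word.h hunk fixes the last place where a stream word was written through a `uint16_t *`, which is both an unaligned store and a native-endian one; writing the two bytes explicitly keeps the on-disk format little-endian on every host. On little-endian targets an equivalent form, which typically optimises to a single 16-bit store, uses memcpy instead. A hedged sketch of both (the helper name is illustrative; only `HTSCODECS_LITTLE_ENDIAN` is the library's):

    #include <stdint.h>
    #include <string.h>

    // Emit one little-endian 16-bit word, growing the stream
    // downwards.  The byte-wise form is endian-independent; the
    // memcpy form assumes a little-endian host but compiles to a
    // single unaligned-safe 16-bit store.
    static inline uint8_t *put_u16_le(uint8_t *ptr, uint32_t x) {
        ptr -= 2;
    #ifdef HTSCODECS_LITTLE_ENDIAN
        uint16_t w = (uint16_t)(x & 0xffff);
        memcpy(ptr, &w, 2);
    #else
        ptr[0] = x & 0xff;
        ptr[1] = (x >> 8) & 0xff;
    #endif
        return ptr;
    }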