From e801ab4466aa16673bbf2f39a17e90fd48d8aed5 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 13 Jan 2026 15:07:43 +0000 Subject: [PATCH] Prevent use of unaligned uint16_t pointers UBSan on clang complains about dereferencing unaligned uint16_t pointers even when the dereference is being done by memmove(). Strictly it's correct as even setting the pointer to such a value is undefined behaviour according to the standard. Silence the noise by changing the pointers to uint8_t and adjusting the arithmetic on them as necessary. Some bounds checks to ensure fast code won't read beyond the end of its input are also adjusted to prevent any possibility of generating an address beyond the limits of the input memory. --- htscodecs/rANS_static32x16pr.c | 44 +++++------ htscodecs/rANS_static32x16pr_avx2.c | 101 +++++++++++++------------- htscodecs/rANS_static32x16pr_avx512.c | 58 +++++++-------- htscodecs/rANS_static32x16pr_neon.c | 90 ++++++++++++----------- htscodecs/rANS_static32x16pr_sse4.c | 83 ++++++++++----------- htscodecs/rANS_static4x16pr.c | 21 ------ htscodecs/rANS_word.h | 6 +- 7 files changed, 187 insertions(+), 216 deletions(-) diff --git a/htscodecs/rANS_static32x16pr.c b/htscodecs/rANS_static32x16pr.c index 51ea554..2b3acc4 100644 --- a/htscodecs/rANS_static32x16pr.c +++ b/htscodecs/rANS_static32x16pr.c @@ -165,7 +165,6 @@ unsigned char *rans_compress_O0_32x16(unsigned char *in, } else { // Branchless version optimises poorly with gcc unless we have // AVX2 capability, so have a custom rewrite of it. - uint16_t* ptr16 = (uint16_t *)ptr; for (i=(in_size &~(NX-1)); likely(i>0); i-=NX) { // Unrolled copy of below, because gcc doesn't optimise this // well in the original form. @@ -197,15 +196,15 @@ unsigned char *rans_compress_O0_32x16(unsigned char *in, int c1 = rp[3-1] > sy[1]->x_max; #ifdef HTSCODECS_LITTLE_ENDIAN - ptr16[-1] = rp[3-0]; ptr16 -= c0; - ptr16[-1] = rp[3-1]; ptr16 -= c1; + memcpy(&ptr[-2], &rp[3-0], 2); ptr -= c0 * 2; + memcpy(&ptr[-2], &rp[3-1], 2); ptr -= c1 * 2; #else - ((uint8_t *)&ptr16[-1])[0] = rp[3-0]; - ((uint8_t *)&ptr16[-1])[1] = rp[3-0]>>8; - ptr16 -= c0; - ((uint8_t *)&ptr16[-1])[0] = rp[3-1]; - ((uint8_t *)&ptr16[-1])[1] = rp[3-1]>>8; - ptr16 -= c1; + ptr[-2] = rp[3-0]; + ptr[-1] = rp[3-0]>>8; + ptr -= c0 * 2; + ptr[-2] = rp[3-1]; + ptr[-1] = rp[3-1]>>8; + ptr -= c1 * 2; #endif rp[3-0] = c0 ? rp[3-0]>>16 : rp[3-0]; @@ -217,15 +216,15 @@ unsigned char *rans_compress_O0_32x16(unsigned char *in, int c2 = rp[3-2] > sy[2]->x_max; int c3 = rp[3-3] > sy[3]->x_max; #ifdef HTSCODECS_LITTLE_ENDIAN - ptr16[-1] = rp[3-2]; ptr16 -= c2; - ptr16[-1] = rp[3-3]; ptr16 -= c3; + memcpy(&ptr[-2], &rp[3-2], 2); ptr -= c2 * 2; + memcpy(&ptr[-2], &rp[3-3], 2); ptr -= c3 * 2; #else - ((uint8_t *)&ptr16[-1])[0] = rp[3-2]; - ((uint8_t *)&ptr16[-1])[1] = rp[3-2]>>8; - ptr16 -= c2; - ((uint8_t *)&ptr16[-1])[0] = rp[3-3]; - ((uint8_t *)&ptr16[-1])[1] = rp[3-3]>>8; - ptr16 -= c3; + ptr[-2] = rp[3-2]; + ptr[-1] = rp[3-2]>>8; + ptr -= c2 * 2; + ptr[-2] = rp[3-3]; + ptr[-1] = rp[3-3]>>8; + ptr -= c3 * 2; #endif rp[3-2] = c2 ? rp[3-2]>>16 : rp[3-2]; rp[3-3] = c3 ? 
rp[3-3]>>16 : rp[3-3]; @@ -239,7 +238,6 @@ unsigned char *rans_compress_O0_32x16(unsigned char *in, } if (z < -1) abort(); } - ptr = (uint8_t *)ptr16; } for (z = NX-1; z >= 0; z--) RansEncFlush(&ransN[z], &ptr); @@ -476,7 +474,6 @@ unsigned char *rans_compress_O1_32x16(unsigned char *in, i32[i] = &in[iN[i]]; for (; likely(i32[0] >= in); ) { - uint16_t *ptr16 = (uint16_t *)ptr; for (z = NX-1; z >= 0; z-=4) { RansEncSymbol *sy[4]; int k; @@ -490,12 +487,12 @@ unsigned char *rans_compress_O1_32x16(unsigned char *in, for (k = 0; k < 4; k++) { int c = ransN[z-k] > sy[k]->x_max; #ifdef HTSCODECS_LITTLE_ENDIAN - ptr16[-1] = ransN[z-k]; + memcpy(&ptr[-2], &ransN[z-k], 2); #else - ((uint8_t *)&ptr16[-1])[0] = ransN[z-k]; - ((uint8_t *)&ptr16[-1])[1] = ransN[z-k]>>8; + ptr[-2] = ransN[z-k]; + ptr[-1] = ransN[z-k]>>8; #endif - ptr16 -= c; + ptr -= c * 2; //ransN[z-k] >>= c<<4; ransN[z-k] = c ? ransN[z-k]>>16 : ransN[z-k]; } @@ -506,7 +503,6 @@ unsigned char *rans_compress_O1_32x16(unsigned char *in, ransN[z-k] += sy[k]->bias + q*sy[k]->cmpl_freq; } } - ptr = (uint8_t *)ptr16; } for (z = NX-1; z>=0; z--) diff --git a/htscodecs/rANS_static32x16pr_avx2.c b/htscodecs/rANS_static32x16pr_avx2.c index cf04578..702d513 100644 --- a/htscodecs/rANS_static32x16pr_avx2.c +++ b/htscodecs/rANS_static32x16pr_avx2.c @@ -220,8 +220,6 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in, while (z-- > 0) RansEncPutSymbol(&ransN[z], &ptr, &syms[in[in_size-(i-z)]]); - uint16_t *ptr16 = (uint16_t *)ptr; - LOAD(Rv, ransN); for (i=(in_size &~(NX-1)); i>0; i-=NX) { @@ -306,7 +304,7 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in, _mm256_and_si256(sh[x+3], D))) // Renorm: - // if (x > x_max) {*--ptr16 = x & 0xffff; x >>= 16;} + // if (x > x_max) { ptr -=2; *((uint16_t *)ptr) = x & 0xffff; x >>= 16;} __m256i xmax1 = SYM_LOAD( 0, xA, xB, xC, xD); __m256i xmax2 = SYM_LOAD( 4, xA, xB, xC, xD); __m256i xmax3 = SYM_LOAD( 8, xA, xB, xC, xD); @@ -317,7 +315,7 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in, __m256i cv3 = _mm256_cmpgt_epi32(Rv3, xmax3); __m256i cv4 = _mm256_cmpgt_epi32(Rv4, xmax4); - // Store bottom 16-bits at ptr16 + // Store bottom 16-bits at ptr unsigned int imask1 = _mm256_movemask_ps((__m256)cv1); unsigned int imask2 = _mm256_movemask_ps((__m256)cv2); unsigned int imask3 = _mm256_movemask_ps((__m256)cv3); unsigned int imask4 = _mm256_movemask_ps((__m256)cv4); @@ -349,20 +347,20 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in, // Now we have bottom N 16-bit values in each V12/V34 to flush __m128i f = _mm256_extractf128_si256(V34, 1); - _mm_storeu_si128((__m128i *)(ptr16-8), f); - ptr16 -= _mm_popcnt_u32(imask4); + _mm_storeu_si128((__m128i *)(ptr-16), f); + ptr -= _mm_popcnt_u32(imask4) * 2; f = _mm256_extractf128_si256(V34, 0); - _mm_storeu_si128((__m128i *)(ptr16-8), f); - ptr16 -= _mm_popcnt_u32(imask3); + _mm_storeu_si128((__m128i *)(ptr-16), f); + ptr -= _mm_popcnt_u32(imask3) * 2; f = _mm256_extractf128_si256(V12, 1); - _mm_storeu_si128((__m128i *)(ptr16-8), f); - ptr16 -= _mm_popcnt_u32(imask2); + _mm_storeu_si128((__m128i *)(ptr-16), f); + ptr -= _mm_popcnt_u32(imask2) * 2; f = _mm256_extractf128_si256(V12, 0); - _mm_storeu_si128((__m128i *)(ptr16-8), f); - ptr16 -= _mm_popcnt_u32(imask1); + _mm_storeu_si128((__m128i *)(ptr-16), f); + ptr -= _mm_popcnt_u32(imask1) * 2; __m256i Rs; Rs = _mm256_srli_epi32(Rv1,16); Rv1 = _mm256_blendv_epi8(Rv1, Rs, cv1); @@ -437,7 +435,6 @@ unsigned char *rans_compress_O0_32x16_avx2(unsigned char *in, STORE(Rv, ransN); - ptr = (uint8_t *)ptr16; for (z = NX-1; z >= 0;
z--) RansEncFlush(&ransN[z], &ptr); @@ -506,15 +503,15 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in, goto err; } - uint16_t *sp = (uint16_t *)cp; + uint8_t *sp = cp; uint8_t overflow[64+64] = {0}; cp_end -= 64; // Protect against running off the end of in buffer. // We copy it to a worst-case local buffer when near the end. - if ((uint8_t *)sp > cp_end) { - memmove(overflow, sp, cp_end+64 - (uint8_t *)sp); - sp = (uint16_t *)overflow; + if (sp > cp_end) { + memmove(overflow, sp, cp_end+64 - sp); + sp = overflow; cp_end = overflow + sizeof(overflow) - 64; } @@ -569,9 +566,9 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in, // Protect against running off the end of in buffer. // We copy it to a worst-case local buffer when near the end. - if ((uint8_t *)sp > cp_end) { - memmove(overflow, sp, cp_end+64 - (uint8_t *)sp); - sp = (uint16_t *)overflow; + if (sp > cp_end) { + memmove(overflow, sp, cp_end+64 - sp); + sp = overflow; cp_end = overflow + sizeof(overflow) - 64; } @@ -600,11 +597,11 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in, // Shuffle the renorm values to correct lanes and incr sp pointer unsigned int imask2 = _mm256_movemask_ps((__m256)renorm_mask2); - sp += _mm_popcnt_u32(imask1); + sp += _mm_popcnt_u32(imask1) * 2; __m256i idx2 = _mm256_load_si256((const __m256i*)permute[imask2]); __m256i Vv2 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); - sp += _mm_popcnt_u32(imask2); + sp += _mm_popcnt_u32(imask2) * 2; Yv1 = _mm256_or_si256(Yv1, Vv1); Vv2 = _mm256_permutevar8x32_epi32(Vv2, idx2); @@ -662,7 +659,7 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in, Vv3 = _mm256_permutevar8x32_epi32(Vv3, idx3); __m256i Yv4 = _mm256_slli_epi32(Rv4, 16); unsigned int imask4 = _mm256_movemask_ps((__m256)renorm_mask4); - sp += _mm_popcnt_u32(imask3); + sp += _mm_popcnt_u32(imask3) * 2; __m256i idx4 = _mm256_load_si256((const __m256i*)permute[imask4]); __m256i Vv4 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); @@ -671,7 +668,7 @@ unsigned char *rans_uncompress_O0_32x16_avx2(unsigned char *in, Yv3 = _mm256_or_si256(Yv3, Vv3); Vv4 = _mm256_permutevar8x32_epi32(Vv4, idx4); Yv4 = _mm256_or_si256(Yv4, Vv4); - sp += _mm_popcnt_u32(imask4); + sp += _mm_popcnt_u32(imask4) * 2; // R[z] = c ? 
Y[z] : R[z]; Rv3 = _mm256_blendv_epi8(Rv3, Yv3, renorm_mask3); @@ -757,8 +754,6 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si lN[z] = c; } - uint16_t *ptr16 = (uint16_t *)ptr; - // clang16 clang10 gcc7 gcc13 // 587 435 381 588 438 403 504 386 415 527 381 394 // simT 611 432 402 475 401 367 472 422 386 486 353 324 @@ -865,7 +860,8 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si // ------------------------------------------------------------ // for (z = NX-1; z >= 0; z--) { // if (ransN[z] >= x_max[z]) { - // *--ptr16 = ransN[z] & 0xffff; + // ptr -= 2; + // *((uint16_t *)ptr) = ransN[z] & 0xffff; // ransN[z] >>= 16; // } // } @@ -874,10 +870,13 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si __m256i cv3 = _mm256_cmpgt_epi32(Rv3, xmaxv[2]); __m256i cv4 = _mm256_cmpgt_epi32(Rv4, xmaxv[3]); - // Store bottom 16-bits at ptr16 + // Store bottom 16-bits at ptr // // for (z = NX-1; z >= 0; z--) { - // if (cond[z]) *--ptr16 = (uint16_t )(ransN[z] & 0xffff); + // if (cond[z]) { + // ptr -=2; + // *((uint16_t *) ptr) = (ransN[z] & 0xffff); + // } // } unsigned int imask1 = _mm256_movemask_ps((__m256)cv1); unsigned int imask2 = _mm256_movemask_ps((__m256)cv2); @@ -910,20 +909,20 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si // Now we have bottom N 16-bit values in each V12/V34 to flush __m128i f = _mm256_extractf128_si256(V34, 1); - _mm_storeu_si128((__m128i *)(ptr16-8), f); - ptr16 -= _mm_popcnt_u32(imask4); + _mm_storeu_si128((__m128i *)(ptr-16), f); + ptr -= _mm_popcnt_u32(imask4) * 2; f = _mm256_extractf128_si256(V34, 0); - _mm_storeu_si128((__m128i *)(ptr16-8), f); - ptr16 -= _mm_popcnt_u32(imask3); + _mm_storeu_si128((__m128i *)(ptr-16), f); + ptr -= _mm_popcnt_u32(imask3) * 2; f = _mm256_extractf128_si256(V12, 1); - _mm_storeu_si128((__m128i *)(ptr16-8), f); - ptr16 -= _mm_popcnt_u32(imask2); + _mm_storeu_si128((__m128i *)(ptr-16), f); + ptr -= _mm_popcnt_u32(imask2) * 2; f = _mm256_extractf128_si256(V12, 0); - _mm_storeu_si128((__m128i *)(ptr16-8), f); - ptr16 -= _mm_popcnt_u32(imask1); + _mm_storeu_si128((__m128i *)(ptr-16), f); + ptr -= _mm_popcnt_u32(imask1) * 2; __m256i Rs1, Rs2, Rs3, Rs4; Rs1 = _mm256_srli_epi32(Rv1,16); @@ -995,8 +994,6 @@ unsigned char *rans_compress_O1_32x16_avx2(unsigned char *in, unsigned int in_si STORE(Rv, ransN); - ptr = (uint8_t *)ptr16; - for (z = NX-1; z>=0; z--) RansEncPutSymbol(&ransN[z], &ptr, &syms[0][lN[z]]); @@ -1106,7 +1103,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in, for (z = 0; z < NX; z++) iN[z] = z*isz4; - uint16_t *sp = (uint16_t *)ptr; + uint8_t *sp = ptr; const uint32_t mask = (1u << shift)-1; __m256i maskv = _mm256_set1_epi32(mask); @@ -1127,7 +1124,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in, if (shift == TF_SHIFT_O1) { isz4 -= 64; - for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) { + for (; iN[0] < isz4 && ptr_end - sp > 64; ) { // m[z] = R[z] & mask; __m256i masked1 = _mm256_and_si256(Rv1, maskv); __m256i masked2 = _mm256_and_si256(Rv2, maskv); @@ -1240,12 +1237,12 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in, unsigned int imask2 = _mm256_movemask_ps((__m256)renorm_mask2); Vv1 = _mm256_permutevar8x32_epi32(Vv1, idx1); - sp += _mm_popcnt_u32(imask1); + sp += _mm_popcnt_u32(imask1) * 2; __m256i idx2 = _mm256_load_si256((const __m256i*)permute[imask2]); __m256i Vv2 = _mm256_cvtepu16_epi32( _mm_loadu_si128((__m128i *)sp)); - sp 
+= _mm_popcnt_u32(imask2); + sp += _mm_popcnt_u32(imask2) * 2; Vv2 = _mm256_permutevar8x32_epi32(Vv2, idx2); Yv1 = _mm256_or_si256(Yv1, Vv1); @@ -1300,7 +1297,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in, unsigned int imask3 = _mm256_movemask_ps((__m256)renorm_mask3); unsigned int imask4 = _mm256_movemask_ps((__m256)renorm_mask4); __m256i idx3 = _mm256_load_si256((const __m256i*)permute[imask3]); - sp += _mm_popcnt_u32(imask3); + sp += _mm_popcnt_u32(imask3) * 2; Vv3 = _mm256_permutevar8x32_epi32(Vv3, idx3); sv3 = _mm256_packus_epi32(sv3, sv4); @@ -1328,7 +1325,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in, Vv4 = _mm256_permutevar8x32_epi32(Vv4, idx4); Yv4 = _mm256_or_si256(Yv4, Vv4); - sp += _mm_popcnt_u32(imask4); + sp += _mm_popcnt_u32(imask4) * 2; Rv3 = _mm256_blendv_epi8(Rv3, Yv3, renorm_mask3); Rv4 = _mm256_blendv_epi8(Rv4, Yv4, renorm_mask4); @@ -1338,7 +1335,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in, STORE(Rv, R); STORE(Lv, lN); - ptr = (uint8_t *)sp; + ptr = sp; if (1) { iN[0]-=tidx; @@ -1383,7 +1380,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in, // SIMD version ends decoding early as it reads at most 64 bytes // from input via 4 vectorised loads. isz4 -= 64; - for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) { + for (; iN[0] < isz4 && ptr_end - sp > 64; ) { // m[z] = R[z] & mask; __m256i masked1 = _mm256_and_si256(Rv1, maskv); __m256i masked2 = _mm256_and_si256(Rv2, maskv); @@ -1475,12 +1472,12 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in, unsigned int imask2 = _mm256_movemask_ps((__m256)renorm_mask2); Vv1 = _mm256_permutevar8x32_epi32(Vv1, idx1); - sp += _mm_popcnt_u32(imask1); + sp += _mm_popcnt_u32(imask1) * 2; __m256i idx2 = _mm256_load_si256((const __m256i*)permute[imask2]); __m256i Vv2 = _mm256_cvtepu16_epi32( _mm_loadu_si128((__m128i *)sp)); - sp += _mm_popcnt_u32(imask2); + sp += _mm_popcnt_u32(imask2) * 2; Vv2 = _mm256_permutevar8x32_epi32(Vv2, idx2); Yv1 = _mm256_or_si256(Yv1, Vv1); @@ -1525,7 +1522,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in, unsigned int imask3 = _mm256_movemask_ps((__m256)renorm_mask3); unsigned int imask4 = _mm256_movemask_ps((__m256)renorm_mask4); __m256i idx3 = _mm256_load_si256((const __m256i*)permute[imask3]); - sp += _mm_popcnt_u32(imask3); + sp += _mm_popcnt_u32(imask3) * 2; Vv3 = _mm256_permutevar8x32_epi32(Vv3, idx3); // sv3 sv4 are 32-bit ints with lowest bit being char @@ -1562,7 +1559,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in, Vv4 = _mm256_permutevar8x32_epi32(Vv4, idx4); Yv4 = _mm256_or_si256(Yv4, Vv4); - sp += _mm_popcnt_u32(imask4); + sp += _mm_popcnt_u32(imask4) * 2; Rv3 = _mm256_blendv_epi8(Rv3, Yv3, renorm_mask3); Rv4 = _mm256_blendv_epi8(Rv4, Yv4, renorm_mask4); @@ -1571,7 +1568,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in, STORE(Rv, R); STORE(Lv, lN); - ptr = (uint8_t *)sp; + ptr = sp; if (1) { iN[0]-=tidx; diff --git a/htscodecs/rANS_static32x16pr_avx512.c b/htscodecs/rANS_static32x16pr_avx512.c index 3563f83..c9717f0 100644 --- a/htscodecs/rANS_static32x16pr_avx512.c +++ b/htscodecs/rANS_static32x16pr_avx512.c @@ -204,7 +204,6 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in, LOAD512(Rv, ransN); - uint16_t *ptr16 = (uint16_t *)ptr; for (i=(in_size &~(32-1)); i>0; i-=32) { uint8_t *c = &in[i-32]; @@ -223,20 +222,20 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in, SET512(xmax, SB); uint16_t gt_mask1 = 
_mm512_cmpgt_epi32_mask(Rv1, xmax1); - int pc1 = _mm_popcnt_u32(gt_mask1); + int pc1 = _mm_popcnt_u32(gt_mask1) * 2; __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff)); __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff)); uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2); SET512(SDv, SD); - int pc2 = _mm_popcnt_u32(gt_mask2); + int pc2 = _mm_popcnt_u32(gt_mask2) * 2; Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1); Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2); - _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<= 0; z--) @@ -359,7 +357,7 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in, goto err; } - uint16_t *sp = (uint16_t *)cp; + uint8_t *sp = cp; int out_end = (out_sz&~(32-1)); const uint32_t mask = (1u << TF_SHIFT)-1; @@ -381,9 +379,9 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in, // Protect against running off the end of in buffer. // We copy it to a worst-case local buffer when near the end. - if ((uint8_t *)sp+64 > cp_end) { - memmove(overflow, sp, cp_end - (uint8_t *)sp); - sp = (uint16_t *)overflow; + if (cp_end - sp < 64) { + memmove(overflow, sp, cp_end - sp); + sp = overflow; cp_end = overflow + sizeof(overflow); } @@ -410,7 +408,7 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in, // renorm. this is the interesting part: renorm_mask2=_mm512_cmplt_epu32_mask(R2, _mm512_set1_epi32(RANS_BYTE_L)); // advance by however many words we actually read - sp += _mm_popcnt_u32(renorm_mask1); + sp += _mm_popcnt_u32(renorm_mask1) * 2; __m512i renorm_words2 = _mm512_cvtepu16_epi32(_mm256_loadu_si256( (const __m256i *)sp)); @@ -440,7 +438,7 @@ unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in, renorm_vals2 = _mm512_maskz_and_epi32(renorm_mask2, renorm_vals2, m16); // advance by however many words we actually read - sp += _mm_popcnt_u32(renorm_mask2); + sp += _mm_popcnt_u32(renorm_mask2) * 2; R1 = _mm512_add_epi32(R1, renorm_vals1); R2 = _mm512_add_epi32(R2, renorm_vals2); @@ -575,7 +573,6 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in, LOAD512(Rv, ransN); - uint16_t *ptr16 = (uint16_t *)ptr; LOAD512(iN, iN); LOAD512(last, lN); @@ -621,7 +618,8 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in, // ------------------------------------------------------------ // for (z = NX-1; z >= 0; z--) { // if (ransN[z] >= x_max[z]) { - // *--ptr16 = ransN[z] & 0xffff; + // ptr -= 2; + // *((uint16_t *) ptr) = ransN[z] & 0xffff; // ransN[z] >>= 16; // } // } @@ -690,20 +688,20 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in, SET512x(xmax, x_max); // high latency uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1); - int pc1 = _mm_popcnt_u32(gt_mask1); + int pc1 = _mm_popcnt_u32(gt_mask1) * 2; __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff)); __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff)); uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2); SET512x(SDv, cmpl_freq); // good - int pc2 = _mm_popcnt_u32(gt_mask2); + int pc2 = _mm_popcnt_u32(gt_mask2) * 2; Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1); Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2); - _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<=0; z--) RansEncPutSymbol(&ransN[z], &ptr, &syms[0][lN[z]]); @@ -845,7 +841,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in, for (z = 0; z < NX; z++) iN[z] = z*isz4; - uint16_t *sp = (uint16_t *)ptr; + uint8_t *sp = ptr; const uint32_t mask = (1u << shift)-1; __m512i _maskv =
_mm512_set1_epi32(mask); @@ -865,7 +861,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in, if (shift == TF_SHIFT_O1) { isz4 -= 64; - for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) { + for (; iN[0] < isz4 && ptr_end - sp > 64; ) { // m[z] = R[z] & mask; __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv); __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv); @@ -939,11 +935,11 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in, __m512i renorm_words1 = _mm512_cvtepu16_epi32 (_mm256_loadu_si256((const __m256i *)sp)); - sp += _mm_popcnt_u32(_imask1); + sp += _mm_popcnt_u32(_imask1) * 2; __m512i renorm_words2 = _mm512_cvtepu16_epi32 (_mm256_loadu_si256((const __m256i *)sp)); - sp += _mm_popcnt_u32(_imask2); + sp += _mm_popcnt_u32(_imask2) * 2; __m512i _renorm_vals1 = _mm512_maskz_expand_epi32(_imask1, renorm_words1); @@ -985,7 +981,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in, STORE512(_Rv, R); STORE512(_Lv, lN); - ptr = (uint8_t *)sp; + ptr = sp; if (1) { iN[0]-=tidx; @@ -1034,7 +1030,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in, // SIMD version ends decoding early as it reads at most 64 bytes // from input via 4 vectorised loads. isz4 -= 64; - for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) { + for (; iN[0] < isz4 && ptr_end - sp > 64; ) { // m[z] = R[z] & mask; __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv); __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv); @@ -1133,7 +1129,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in, STORE512(_Rv, R); STORE512(_Lv, lN); - ptr = (uint8_t *)sp; + ptr = sp; if (1) { iN[0]-=tidx; diff --git a/htscodecs/rANS_static32x16pr_neon.c b/htscodecs/rANS_static32x16pr_neon.c index 52e7d2f..817bb30 100644 --- a/htscodecs/rANS_static32x16pr_neon.c +++ b/htscodecs/rANS_static32x16pr_neon.c @@ -665,7 +665,7 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in, // 500MB/s. Clang does a lot of reordering of this code, removing some // of the manual tuning benefits. Short of dropping to assembly, for now // I would recommend using gcc to compile this file. - uint16_t *sp = (uint16_t *)cp; + uint8_t *sp = cp; uint8_t overflow[64+64] = {0}; for (i=0; i < out_end; i+=NX) { // Decode freq, bias and symbol from s3 lookups @@ -770,16 +770,20 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in, // Protect against running off the end of in buffer. // We copy it to a worst-case local buffer when near the end. - if ((uint8_t *)sp+64 > cp_end) { - memmove(overflow, sp, cp_end - (uint8_t *)sp); - sp = (uint16_t *)overflow; + if (cp_end - sp < 64) { + memmove(overflow, sp, cp_end - sp); + sp = overflow; cp_end = overflow + sizeof(overflow); } - uint16x8_t norm12 = vld1q_u16(sp); - sp += nbits[imask1] + nbits[imask2]; - uint16x8_t norm34 = vld1q_u16(sp); - sp += nbits[imask3] + nbits[imask4]; + // load 8 16-bit lanes of renorm data (loaded as 8-bit for + // alignment purposes, but will be shuffled and cast to + // 16 bit below). 
+ + uint8x16_t norm12 = vld1q_u8(sp); + sp += (nbits[imask1] + nbits[imask2]) * 2; + uint8x16_t norm34 = vld1q_u8(sp); + sp += (nbits[imask3] + nbits[imask4]) * 2; Bv5 = vandq_u32(Bv5, maskv); Bv6 = vandq_u32(Bv6, maskv); @@ -799,14 +803,14 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in, uint32_t imask12 = (imask1<<4)|imask2; uint32_t imask34 = (imask3<<4)|imask4; + // Shuffle norm to the corresponding R lanes, via imask // #define for brevity and formatting #define cast_u16_u8 vreinterpret_u16_u8 -#define cast_u8_u16 vreinterpretq_u8_u16 uint16x4_t norm1, norm2, norm3, norm4, norm5, norm6, norm7, norm8; - norm1 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),idx [imask1])); - norm2 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),idx2[imask12])); - norm3 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),idx [imask3])); - norm4 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),idx2[imask34])); + norm1 = cast_u16_u8(vqtbl1_u8(norm12,idx [imask1])); + norm2 = cast_u16_u8(vqtbl1_u8(norm12,idx2[imask12])); + norm3 = cast_u16_u8(vqtbl1_u8(norm34,idx [imask3])); + norm4 = cast_u16_u8(vqtbl1_u8(norm34,idx2[imask34])); uint32x4_t Rlt5 = vcltq_u32(Rv5, vdupq_n_u32(RANS_BYTE_L)); uint32x4_t Rlt6 = vcltq_u32(Rv6, vdupq_n_u32(RANS_BYTE_L)); @@ -819,15 +823,15 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in, uint32x4_t Rsl3 = vshlq_n_u32(Rv3, 16); uint32x4_t Rsl4 = vshlq_n_u32(Rv4, 16); - uint16x8_t norm56 = vld1q_u16(sp); + uint8x16_t norm56 = vld1q_u8(sp); uint32_t imask5 = vaddvq_u32(vandq_u32(Rlt5, bit)); uint32_t imask6 = vaddvq_u32(vandq_u32(Rlt6, bit)); uint32_t imask7 = vaddvq_u32(vandq_u32(Rlt7, bit)); uint32_t imask8 = vaddvq_u32(vandq_u32(Rlt8, bit)); - sp += nbits[imask5] + nbits[imask6]; - uint16x8_t norm78 = vld1q_u16(sp); - sp += nbits[imask7] + nbits[imask8]; + sp += (nbits[imask5] + nbits[imask6]) * 2; + uint8x16_t norm78 = vld1q_u8(sp); + sp += (nbits[imask7] + nbits[imask8]) * 2; Rsl1 = vaddw_u16(Rsl1, norm1); // Rsl += norm Rsl2 = vaddw_u16(Rsl2, norm2); @@ -842,10 +846,10 @@ unsigned char *rans_uncompress_O0_32x16_neon(unsigned char *in, Rv3 = vbslq_u32(Rlt3, Rsl3, Rv3); Rv4 = vbslq_u32(Rlt4, Rsl4, Rv4); - norm5 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm56),idx [imask5])); - norm6 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm56),idx2[imask56])); - norm7 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm78),idx [imask7])); - norm8 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm78),idx2[imask78])); + norm5 = cast_u16_u8(vqtbl1_u8(norm56,idx [imask5])); + norm6 = cast_u16_u8(vqtbl1_u8(norm56,idx2[imask56])); + norm7 = cast_u16_u8(vqtbl1_u8(norm78,idx [imask7])); + norm8 = cast_u16_u8(vqtbl1_u8(norm78,idx2[imask78])); uint32x4_t Rsl5 = vshlq_n_u32(Rv5, 16); uint32x4_t Rsl6 = vshlq_n_u32(Rv6, 16); @@ -1551,7 +1555,7 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in, // Follow with 2nd copy doing scalar code instead? 
unsigned char tbuf[32][32] = {0}; int tidx = 0; - for (; i4[0] < isz4 && ptr+64 < ptr_end;) { + for (; i4[0] < isz4 && ptr_end - ptr > 64;) { for (z = 0; z < NX; z+=16) { uint32x4_t Rv1 = vld1q_u32(&R[z+0]); uint32x4_t Rv2 = vld1q_u32(&R[z+4]); @@ -1643,12 +1647,14 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in, uint32x4_t Rlt3 = vcltq_u32(Rv3, vdupq_n_u32(RANS_BYTE_L)); uint32x4_t Rlt4 = vcltq_u32(Rv4, vdupq_n_u32(RANS_BYTE_L)); uint32x4_t all2 = {2,2,2,2}; - // load 8 lanes of renorm data - uint16x8_t norm12 = vld1q_u16((uint16_t *)ptr); + // load 8 16-bit lanes of renorm data (loaded as 8-bit for + // alignment purposes, but will be shuffled and cast to + // 16 bit below). + uint8x16_t norm12 = vld1q_u8(ptr); // move ptr by no. renorm lanes used ptr += vaddvq_u32(vandq_u32(Rlt1, all2)) + vaddvq_u32(vandq_u32(Rlt2, all2)); - uint16x8_t norm34 = vld1q_u16((uint16_t *)ptr); + uint8x16_t norm34 = vld1q_u8(ptr); ptr += vaddvq_u32(vandq_u32(Rlt3, all2)) + vaddvq_u32(vandq_u32(Rlt4, all2)); @@ -1663,16 +1669,12 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in, uint32_t imask34 = (imask3<<4)|imask4; // Shuffle norm to the corresponding R lanes, via imask - // #define for brevity and formatting + // cast_u16_u8 #define for brevity and formatting uint16x4_t norm1, norm2, norm3, norm4; - norm1 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12), - idx [imask1])); - norm2 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12), - idx2[imask12])); - norm3 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34), - idx [imask3])); - norm4 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34), - idx2[imask34])); + norm1 = cast_u16_u8(vqtbl1_u8(norm12, idx [imask1])); + norm2 = cast_u16_u8(vqtbl1_u8(norm12, idx2[imask12])); + norm3 = cast_u16_u8(vqtbl1_u8(norm34, idx [imask3])); + norm4 = cast_u16_u8(vqtbl1_u8(norm34, idx2[imask34])); // Add norm to R<<16 and blend back in with R uint32x4_t Rsl1 = vshlq_n_u32(Rv1, 16); // Rsl = R << 16 @@ -1776,7 +1778,7 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in, uint32_t *S3 = (uint32_t *)s3; - for (; i4[0] < isz4 && ptr+64 < ptr_end;) { + for (; i4[0] < isz4 && ptr_end - ptr > 64;) { int Z = 0; for (z = 0; z < NX; z+=16, Z+=4) { // streamline these. Could swap between two banks and pre-load @@ -1852,23 +1854,25 @@ unsigned char *rans_uncompress_O1_32x16_neon(unsigned char *in, uint32_t imask3 = vaddvq_u32(vandq_u32(Rlt3, bit)); uint32_t imask4 = vaddvq_u32(vandq_u32(Rlt4, bit)); - // load 8 lanes of renorm data - uint16x8_t norm12 = vld1q_u16((uint16_t *)ptr); + // load 8 16-bit lanes of renorm data (loaded as 8-bit for + // alignment purposes, but will be shuffled and cast to + // 16 bit below). + uint8x16_t norm12 = vld1q_u8(ptr); // move ptr by no. 
renorm lanes used ptr += nbits[imask1] + nbits[imask2]; - uint16x8_t norm34 = vld1q_u16((uint16_t *)ptr); + uint8x16_t norm34 = vld1q_u8(ptr); ptr += nbits[imask3] + nbits[imask4]; uint32_t imask12 = (imask1<<4)|imask2; uint32_t imask34 = (imask3<<4)|imask4; // Shuffle norm to the corresponding R lanes, via imask - // #define for brevity and formatting + // cast_u16_u8 #define for brevity and formatting uint16x4_t norm1, norm2, norm3, norm4; - norm1 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),idx [imask1])); - norm2 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm12),idx2[imask12])); - norm3 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),idx [imask3])); - norm4 = cast_u16_u8(vqtbl1_u8(cast_u8_u16(norm34),idx2[imask34])); + norm1 = cast_u16_u8(vqtbl1_u8(norm12,idx [imask1])); + norm2 = cast_u16_u8(vqtbl1_u8(norm12,idx2[imask12])); + norm3 = cast_u16_u8(vqtbl1_u8(norm34,idx [imask3])); + norm4 = cast_u16_u8(vqtbl1_u8(norm34,idx2[imask34])); // Add norm to R<<16 and blend back in with R uint32x4_t Rsl1 = vshlq_n_u32(RV[Z+0], 16); // Rsl = R << 16 diff --git a/htscodecs/rANS_static32x16pr_sse4.c b/htscodecs/rANS_static32x16pr_sse4.c index a9b9c4b..7de193b 100644 --- a/htscodecs/rANS_static32x16pr_sse4.c +++ b/htscodecs/rANS_static32x16pr_sse4.c @@ -275,7 +275,6 @@ unsigned char *rans_compress_O0_32x16_sse4(unsigned char *in, uint32_t SB[256], SA[256], SD[256], SC[256]; // Build lookup tables for SIMD encoding - uint16_t *ptr16 = (uint16_t *)ptr; for (i = 0; i < 256; i++) { SB[i] = syms[i].x_max; SA[i] = syms[i].rcp_freq; @@ -315,7 +314,7 @@ unsigned char *rans_compress_O0_32x16_sse4(unsigned char *in, __m128i cv6 = _mm_cmpgt_epi32(Rv[H+1], xmax6); __m128i cv5 = _mm_cmpgt_epi32(Rv[H+0], xmax5); - // Store bottom 16-bits at ptr16 + // Store bottom 16-bits at ptr unsigned int imask8 = _mm_movemask_ps((__m128)cv8); unsigned int imask7 = _mm_movemask_ps((__m128)cv7); unsigned int imask6 = _mm_movemask_ps((__m128)cv6); @@ -367,10 +366,10 @@ unsigned char *rans_compress_O0_32x16_sse4(unsigned char *in, V6 = _mm_shuffle_epi8(V6, shuf); V5 = _mm_shuffle_epi8(V5, shuf); - _mm_storeu_si64(ptr16-4, V8); ptr16 -= _mm_popcnt_u32(imask8); - _mm_storeu_si64(ptr16-4, V7); ptr16 -= _mm_popcnt_u32(imask7); - _mm_storeu_si64(ptr16-4, V6); ptr16 -= _mm_popcnt_u32(imask6); - _mm_storeu_si64(ptr16-4, V5); ptr16 -= _mm_popcnt_u32(imask5); + _mm_storeu_si64(ptr-8, V8); ptr -= _mm_popcnt_u32(imask8) * 2; + _mm_storeu_si64(ptr-8, V7); ptr -= _mm_popcnt_u32(imask7) * 2; + _mm_storeu_si64(ptr-8, V6); ptr -= _mm_popcnt_u32(imask6) * 2; + _mm_storeu_si64(ptr-8, V5); ptr -= _mm_popcnt_u32(imask5) * 2; Rv[H+3] = _mm_blendv_epi8(Rv[H+3], _mm_srli_epi32(Rv[H+3], 16), cv8); Rv[H+2] = _mm_blendv_epi8(Rv[H+2], _mm_srli_epi32(Rv[H+2], 16), cv7); @@ -441,8 +440,6 @@ unsigned char *rans_compress_O0_32x16_sse4(unsigned char *in, STORE128v(Rv, ransN); - ptr = (uint8_t *)ptr16; - for (z = NX-1; z >= 0; z--) RansEncFlush(&ransN[z], &ptr); @@ -512,7 +509,7 @@ unsigned char *rans_uncompress_O0_32x16_sse4(unsigned char *in, goto err; } - uint16_t *sp = (uint16_t *)cp; + uint8_t *sp = cp; int out_end = (out_sz&~(NX-1)); const uint32_t mask = (1u << TF_SHIFT)-1; @@ -628,31 +625,31 @@ unsigned char *rans_uncompress_O0_32x16_sse4(unsigned char *in, // 72 = 7*8(imask1..7) + 16; worse case for 8th _mm_loadu_si128 call. // An extra 64 bytes is to avoid triggering this multiple times // after we swap sp/cp_end over. 
- if ((uint8_t *)sp+72 > cp_end) { - memmove(overflow, sp, cp_end - (uint8_t *)sp); - sp = (uint16_t *)overflow; - cp_end = (uint8_t *)overflow + sizeof(overflow); + if (cp_end - sp < 72) { + memmove(overflow, sp, cp_end - sp); + sp = overflow; + cp_end = overflow + sizeof(overflow); } // Shuffle the renorm values to correct lanes and incr sp pointer __m128i Vv1 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask1 = _mm_movemask_ps((__m128)renorm_mask1); Vv1 = _mm_shuffle_epi8(Vv1, _mm_load_si128((__m128i*)pidx[imask1])); - sp += _mm_popcnt_u32(imask1); + sp += _mm_popcnt_u32(imask1) * 2; __m128i Vv2 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask2 = _mm_movemask_ps((__m128)renorm_mask2); - sp += _mm_popcnt_u32(imask2); + sp += _mm_popcnt_u32(imask2) * 2; Vv2 = _mm_shuffle_epi8(Vv2, _mm_load_si128((__m128i*)pidx[imask2])); __m128i Vv3 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask3 = _mm_movemask_ps((__m128)renorm_mask3); Vv3 = _mm_shuffle_epi8(Vv3, _mm_load_si128((__m128i*)pidx[imask3])); - sp += _mm_popcnt_u32(imask3); + sp += _mm_popcnt_u32(imask3) * 2; __m128i Vv4 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask4 = _mm_movemask_ps((__m128)renorm_mask4); - sp += _mm_popcnt_u32(imask4); + sp += _mm_popcnt_u32(imask4) * 2; Vv4 = _mm_shuffle_epi8(Vv4, _mm_load_si128((__m128i*)pidx[imask4])); __m128i Yv1 = _mm_slli_epi32(Rv1, 16); @@ -738,21 +735,21 @@ unsigned char *rans_uncompress_O0_32x16_sse4(unsigned char *in, __m128i Vv5 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask5 = _mm_movemask_ps((__m128)renorm_mask5); Vv5 = _mm_shuffle_epi8(Vv5, _mm_load_si128((__m128i*)pidx[imask5])); - sp += _mm_popcnt_u32(imask5); + sp += _mm_popcnt_u32(imask5) * 2; __m128i Vv6 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask6 = _mm_movemask_ps((__m128)renorm_mask6); - sp += _mm_popcnt_u32(imask6); + sp += _mm_popcnt_u32(imask6) * 2; Vv6 = _mm_shuffle_epi8(Vv6, _mm_load_si128((__m128i*)pidx[imask6])); __m128i Vv7 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask7 = _mm_movemask_ps((__m128)renorm_mask7); Vv7 = _mm_shuffle_epi8(Vv7, _mm_load_si128((__m128i*)pidx[imask7])); - sp += _mm_popcnt_u32(imask7); + sp += _mm_popcnt_u32(imask7) * 2; __m128i Vv8 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask8 = _mm_movemask_ps((__m128)renorm_mask8); - sp += _mm_popcnt_u32(imask8); + sp += _mm_popcnt_u32(imask8) * 2; Vv8 = _mm_shuffle_epi8(Vv8, _mm_load_si128((__m128i*)pidx[imask8])); __m128i Yv5 = _mm_slli_epi32(Rv5, 16); @@ -1108,7 +1105,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in, // loop with shift as a variable. 
if (shift == TF_SHIFT_O1) { // TF_SHIFT_O1 = 12 - uint16_t *sp = (uint16_t *)ptr; + uint8_t *sp = ptr; const uint32_t mask = ((1u << TF_SHIFT_O1)-1); __m128i maskv = _mm_set1_epi32(mask); // set mask in all lanes uint8_t tbuf[32][32]; @@ -1117,7 +1114,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in, LOAD128(Lv, l); isz4 -= 64; - for (; i4[0] < isz4 && (uint8_t *)sp+72 < ptr_end; ) { + for (; i4[0] < isz4 && ptr_end - sp > 72; ) { //for (z = 0; z < NX; z++) // m[z] = R[z] & mask; __m128i masked1 = _mm_and_si128(Rv1, maskv); @@ -1257,21 +1254,21 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in, __m128i Vv1 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask1 = _mm_movemask_ps((__m128)renorm_mask1); Vv1 = _mm_shuffle_epi8(Vv1, _mm_load_si128((__m128i*)pidx[imask1])); - sp += _mm_popcnt_u32(imask1); + sp += _mm_popcnt_u32(imask1) * 2; __m128i Vv2 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask2 = _mm_movemask_ps((__m128)renorm_mask2); - sp += _mm_popcnt_u32(imask2); + sp += _mm_popcnt_u32(imask2) * 2; Vv2 = _mm_shuffle_epi8(Vv2, _mm_load_si128((__m128i*)pidx[imask2])); __m128i Vv3 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask3 = _mm_movemask_ps((__m128)renorm_mask3); Vv3 = _mm_shuffle_epi8(Vv3, _mm_load_si128((__m128i*)pidx[imask3])); - sp += _mm_popcnt_u32(imask3); + sp += _mm_popcnt_u32(imask3) * 2; __m128i Vv4 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask4 = _mm_movemask_ps((__m128)renorm_mask4); - sp += _mm_popcnt_u32(imask4); + sp += _mm_popcnt_u32(imask4) * 2; Vv4 = _mm_shuffle_epi8(Vv4, _mm_load_si128((__m128i*)pidx[imask4])); __m128i Yv1 = _mm_slli_epi32(Rv1, 16); @@ -1382,21 +1379,21 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in, __m128i Vv5 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask5 = _mm_movemask_ps((__m128)renorm_mask5); Vv5 = _mm_shuffle_epi8(Vv5, _mm_load_si128((__m128i*)pidx[imask5])); - sp += _mm_popcnt_u32(imask5); + sp += _mm_popcnt_u32(imask5) * 2; __m128i Vv6 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask6 = _mm_movemask_ps((__m128)renorm_mask6); - sp += _mm_popcnt_u32(imask6); + sp += _mm_popcnt_u32(imask6) * 2; Vv6 = _mm_shuffle_epi8(Vv6, _mm_load_si128((__m128i*)pidx[imask6])); __m128i Vv7 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask7 = _mm_movemask_ps((__m128)renorm_mask7); Vv7 = _mm_shuffle_epi8(Vv7, _mm_load_si128((__m128i*)pidx[imask7])); - sp += _mm_popcnt_u32(imask7); + sp += _mm_popcnt_u32(imask7) * 2; __m128i Vv8 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask8 = _mm_movemask_ps((__m128)renorm_mask8); - sp += _mm_popcnt_u32(imask8); + sp += _mm_popcnt_u32(imask8) * 2; Vv8 = _mm_shuffle_epi8(Vv8, _mm_load_si128((__m128i*)pidx[imask8])); __m128i Yv5 = _mm_slli_epi32(Rv5, 16); @@ -1438,7 +1435,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in, STORE128(Rv, R); STORE128(Lv, l); - ptr = (uint8_t *)sp; + ptr = sp; i4[0]-=tidx; int T; @@ -1478,7 +1475,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in, } } else { // TF_SHIFT_O1 = 10 - uint16_t *sp = (uint16_t *)ptr; + uint8_t *sp = ptr; const uint32_t mask = ((1u << TF_SHIFT_O1_FAST)-1); __m128i maskv = _mm_set1_epi32(mask); // set mask in all lanes uint8_t tbuf[32][32] __attribute__((aligned(32))); @@ -1487,7 +1484,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in, LOAD128(Lv, l); isz4 -= 64; - for (; 
i4[0] < isz4 && (uint8_t *)sp+72 < ptr_end; ) { + for (; i4[0] < isz4 && ptr_end - sp > 72; ) { //for (z = 0; z < NX; z++) // m[z] = R[z] & mask; __m128i masked1 = _mm_and_si128(Rv1, maskv); @@ -1607,21 +1604,21 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in, __m128i Vv1 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask1 = _mm_movemask_ps((__m128)renorm_mask1); Vv1 = _mm_shuffle_epi8(Vv1, _mm_load_si128((__m128i*)pidx[imask1])); - sp += _mm_popcnt_u32(imask1); + sp += _mm_popcnt_u32(imask1) * 2; __m128i Vv2 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask2 = _mm_movemask_ps((__m128)renorm_mask2); - sp += _mm_popcnt_u32(imask2); + sp += _mm_popcnt_u32(imask2) * 2; Vv2 = _mm_shuffle_epi8(Vv2, _mm_load_si128((__m128i*)pidx[imask2])); __m128i Vv3 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask3 = _mm_movemask_ps((__m128)renorm_mask3); Vv3 = _mm_shuffle_epi8(Vv3, _mm_load_si128((__m128i*)pidx[imask3])); - sp += _mm_popcnt_u32(imask3); + sp += _mm_popcnt_u32(imask3) * 2; __m128i Vv4 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask4 = _mm_movemask_ps((__m128)renorm_mask4); - sp += _mm_popcnt_u32(imask4); + sp += _mm_popcnt_u32(imask4) * 2; Vv4 = _mm_shuffle_epi8(Vv4, _mm_load_si128((__m128i*)pidx[imask4])); __m128i Yv1 = _mm_slli_epi32(Rv1, 16); @@ -1722,21 +1719,21 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in, __m128i Vv5 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask5 = _mm_movemask_ps((__m128)renorm_mask5); Vv5 = _mm_shuffle_epi8(Vv5, _mm_load_si128((__m128i*)pidx[imask5])); - sp += _mm_popcnt_u32(imask5); + sp += _mm_popcnt_u32(imask5) * 2; __m128i Vv6 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask6 = _mm_movemask_ps((__m128)renorm_mask6); - sp += _mm_popcnt_u32(imask6); + sp += _mm_popcnt_u32(imask6) * 2; Vv6 = _mm_shuffle_epi8(Vv6, _mm_load_si128((__m128i*)pidx[imask6])); __m128i Vv7 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask7 = _mm_movemask_ps((__m128)renorm_mask7); Vv7 = _mm_shuffle_epi8(Vv7, _mm_load_si128((__m128i*)pidx[imask7])); - sp += _mm_popcnt_u32(imask7); + sp += _mm_popcnt_u32(imask7) * 2; __m128i Vv8 = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)sp)); unsigned int imask8 = _mm_movemask_ps((__m128)renorm_mask8); - sp += _mm_popcnt_u32(imask8); + sp += _mm_popcnt_u32(imask8) * 2; Vv8 = _mm_shuffle_epi8(Vv8, _mm_load_si128((__m128i*)pidx[imask8])); __m128i Yv5 = _mm_slli_epi32(Rv5, 16); @@ -1777,7 +1774,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in, STORE128(Rv, R); STORE128(Lv, l); - ptr = (uint8_t *)sp; + ptr = sp; i4[0]-=tidx; int T; diff --git a/htscodecs/rANS_static4x16pr.c b/htscodecs/rANS_static4x16pr.c index d7fd00c..7b4d9bb 100644 --- a/htscodecs/rANS_static4x16pr.c +++ b/htscodecs/rANS_static4x16pr.c @@ -190,31 +190,10 @@ unsigned char *rans_compress_O0_4x16(unsigned char *in, unsigned int in_size, RansEncSymbol *s1 = &syms[in[i-3]]; RansEncSymbol *s0 = &syms[in[i-4]]; -#if 1 RansEncPutSymbol(&rans3, &ptr, s3); RansEncPutSymbol(&rans2, &ptr, s2); RansEncPutSymbol(&rans1, &ptr, s1); RansEncPutSymbol(&rans0, &ptr, s0); -#else - // Slightly beter on gcc, much better on clang - uint16_t *ptr16 = (uint16_t *)ptr; - - if (rans3 >= s3->x_max) *--ptr16 = (uint16_t)rans3, rans3 >>= 16; - if (rans2 >= s2->x_max) *--ptr16 = (uint16_t)rans2, rans2 >>= 16; - uint32_t q3 = (uint32_t) (((uint64_t)rans3 * s3->rcp_freq) >> s3->rcp_shift); - uint32_t 
q2 = (uint32_t) (((uint64_t)rans2 * s2->rcp_freq) >> s2->rcp_shift); - rans3 += s3->bias + q3 * s3->cmpl_freq; - rans2 += s2->bias + q2 * s2->cmpl_freq; - - if (rans1 >= s1->x_max) *--ptr16 = (uint16_t)rans1, rans1 >>= 16; - if (rans0 >= s0->x_max) *--ptr16 = (uint16_t)rans0, rans0 >>= 16; - uint32_t q1 = (uint32_t) (((uint64_t)rans1 * s1->rcp_freq) >> s1->rcp_shift); - uint32_t q0 = (uint32_t) (((uint64_t)rans0 * s0->rcp_freq) >> s0->rcp_shift); - rans1 += s1->bias + q1 * s1->cmpl_freq; - rans0 += s0->bias + q0 * s0->cmpl_freq; - - ptr = (uint8_t *)ptr16; -#endif } RansEncFlush(&rans3, &ptr); diff --git a/htscodecs/rANS_word.h b/htscodecs/rANS_word.h index db60b04..a537cb0 100644 --- a/htscodecs/rANS_word.h +++ b/htscodecs/rANS_word.h @@ -77,8 +77,10 @@ static inline RansState RansEncRenorm(RansState x, uint8_t** pptr, uint32_t freq { uint32_t x_max = ((RANS_BYTE_L >> scale_bits) << 16) * freq-1; // this turns into a shift. if (x > x_max) { - uint16_t* ptr = (uint16_t *)*pptr; - *--ptr = (uint16_t) (x & 0xffff); + uint8_t* ptr = *pptr; + ptr -= 2; + ptr[0] = x & 0xff; + ptr[1] = (x >> 8) & 0xff; x >>= 16; *pptr = (uint8_t *)ptr; }
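
Reviewer note, not part of the patch: the recurring change above is to stop forming (and storing through) a possibly misaligned uint16_t *, and instead write the two bytes through the existing uint8_t * with memcpy() or explicit byte stores. The sketch below is a minimal, self-contained illustration of that pattern only; the helper name put_u16_le and the demo buffer are invented, and only the HTSCODECS_LITTLE_ENDIAN macro is taken from the patch. The real encoders stream 16-bit words downwards through the output buffer exactly as in the hunks above.

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* Hypothetical helper: emit the low 16 bits of x at ptr[-2]..ptr[-1] and
     * step the write pointer back two bytes.  No uint16_t pointer is ever
     * created, so no alignment assumption is made; compilers can typically
     * still emit a single 16-bit store where that is legal. */
    static inline uint8_t *put_u16_le(uint8_t *ptr, uint32_t x) {
    #ifdef HTSCODECS_LITTLE_ENDIAN
        memcpy(&ptr[-2], &x, 2);        /* low two bytes, already LE in memory */
    #else
        ptr[-2] = x & 0xff;             /* explicit little-endian byte order */
        ptr[-1] = (x >> 8) & 0xff;
    #endif
        return ptr - 2;
    }

    int main(void) {
        uint8_t buf[8] = {0};
        uint8_t *ptr = buf + sizeof(buf);   /* rANS encoders write backwards */
        ptr = put_u16_le(ptr, 0x1234);
        ptr = put_u16_le(ptr, 0xabcd);
        printf("%02x %02x %02x %02x\n", ptr[0], ptr[1], ptr[2], ptr[3]);
        return 0;                           /* prints: cd ab 34 12 */
    }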
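
Reviewer note, not part of the patch: the bounds-check rewrites (for example "(uint8_t *)sp+64 > cp_end" becoming "cp_end - sp < 64") give the same answer, but the new form never computes an address beyond the end of the input, so the check itself cannot be undefined behaviour. A small sketch with invented names; the oversized demo buffer is only there so the old-style test stays defined in this example:

    #include <stdint.h>
    #include <stdio.h>

    /* May evaluate sp+64, which is undefined if that address lies beyond
     * one-past-the-end of the buffer sp points into. */
    static int near_end_old(const uint8_t *sp, const uint8_t *cp_end) {
        return sp + 64 > cp_end;
    }

    /* Same result, but only subtracts two pointers already known to be
     * inside (or one past the end of) the same buffer. */
    static int near_end_new(const uint8_t *sp, const uint8_t *cp_end) {
        return cp_end - sp < 64;
    }

    int main(void) {
        uint8_t buf[256];                  /* slack keeps sp+64 in range here */
        const uint8_t *cp_end = buf + 128;
        const uint8_t *sp = buf + 100;
        printf("%d %d\n", near_end_old(sp, cp_end), near_end_new(sp, cp_end));
        return 0;                          /* prints: 1 1 */
    }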