From f0e417bb8be8b062dd8b407172508c0e8b0da60b Mon Sep 17 00:00:00 2001 From: Sandor Zsombor Vegh Date: Wed, 6 Nov 2024 12:59:02 +0100 Subject: [PATCH] Arm: Speed up iamf_decoder_plane2stride_out Extract zipping channels with conversion to int16, int24 and int32 outputs as separate functions, with C and Neon implementations. Reorder loops in C implementation to be more cache-friendly and autovectorizable. --- code/src/iamf_dec/IAMF_decoder.c | 81 +- code/src/iamf_dec/arch.h | 13 + code/src/iamf_dec/arch/arch_common.c | 70 ++ code/src/iamf_dec/arch/arch_common.h | 11 + code/src/iamf_dec/arch/arch_init.c | 4 + code/src/iamf_dec/arch/arm/arm_zip_channels.c | 811 ++++++++++++++++++ code/src/iamf_dec/arch/arm/arm_zip_channels.h | 42 + code/src/iamf_dec/arch/arm/override_arm.c | 5 + code/win64/VS2022/iamf/iamf.vcxproj | 2 + code/win64/VS2022/iamf/iamf.vcxproj.filters | 6 + 10 files changed, 981 insertions(+), 64 deletions(-) create mode 100644 code/src/iamf_dec/arch/arm/arm_zip_channels.c create mode 100644 code/src/iamf_dec/arch/arm/arm_zip_channels.h diff --git a/code/src/iamf_dec/IAMF_decoder.c b/code/src/iamf_dec/IAMF_decoder.c index 675cbd4b..51151b6d 100644 --- a/code/src/iamf_dec/IAMF_decoder.c +++ b/code/src/iamf_dec/IAMF_decoder.c @@ -22,7 +22,6 @@ #endif #include -#include #include "IAMF_OBU.h" #include "IAMF_debug.h" @@ -76,74 +75,28 @@ static int64_t time_transform(int64_t t1, int s1, int s2) { } /* ----------------------------- Internal methods ------------------ */ -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) -static int16_t FLOAT2INT16(float x) { - x = x * 32768.f; - x = MAX(x, -32768.f); - x = MIN(x, 32767.f); - return (int16_t)lrintf(x); -} - -static int32_t FLOAT2INT24(float x) { - x = x * 8388608.f; - x = MAX(x, -8388608.f); - x = MIN(x, 8388607.f); - return (int32_t)lrintf(x); -} -static int32_t FLOAT2INT32(float x) { - x = x * 2147483648.f; - if (x > -2147483648.f && x < 2147483647.f) - return (int32_t)lrintf(x); - else - return (x > 0.0f ? 
2147483647 : (-2147483647 - 1)); -} +static void iamf_decoder_plane2stride_out(const Arch *arch, void *dst, + const float *src, int frame_size, + int channels, uint32_t bit_depth) { + if (!src) { + if (bit_depth == 16 || bit_depth == 24 || bit_depth == 32) + memset(dst, 0x0, frame_size * channels * (bit_depth / 8)); + return; + } -static void iamf_decoder_plane2stride_out(void *dst, const float *src, - int frame_size, int channels, - uint32_t bit_depth) { if (bit_depth == 16) { - int16_t *int16_dst = (int16_t *)dst; - for (int c = 0; c < channels; ++c) { - for (int i = 0; i < frame_size; i++) { - if (src) { - int16_dst[i * channels + c] = FLOAT2INT16(src[frame_size * c + i]); - } else { - int16_dst[i * channels + c] = 0; - } - } - } + (*arch->output.float2int16_zip_channels)(src, frame_size, channels, + (int16_t *)dst, frame_size); } else if (bit_depth == 24) { - uint8_t *int24_dst = (uint8_t *)dst; - for (int c = 0; c < channels; ++c) { - for (int i = 0; i < frame_size; i++) { - if (src) { - int32_t tmp = FLOAT2INT24(src[frame_size * c + i]); - int24_dst[(i * channels + c) * 3] = tmp & 0xff; - int24_dst[(i * channels + c) * 3 + 1] = (tmp >> 8) & 0xff; - int24_dst[(i * channels + c) * 3 + 2] = - ((tmp >> 16) & 0x7f) | ((tmp >> 24) & 0x80); - } else { - int24_dst[(i * channels + c) * 3] = 0; - int24_dst[(i * channels + c) * 3 + 1] = 0; - int24_dst[(i * channels + c) * 3 + 2] = 0; - } - } - } + (*arch->output.float2int24_zip_channels)(src, frame_size, channels, + (uint8_t *)dst, frame_size); } else if (bit_depth == 32) { - int32_t *int32_dst = (int32_t *)dst; - for (int c = 0; c < channels; ++c) { - for (int i = 0; i < frame_size; i++) { - if (src) { - int32_dst[i * channels + c] = FLOAT2INT32(src[frame_size * c + i]); - } else { - int32_dst[i * channels + c] = 0; - } - } - } + (*arch->output.float2int32_zip_channels)(src, frame_size, channels, + (int32_t *)dst, frame_size); } } + static void ia_decoder_stride2plane_out_float(void *dst, const float *src, int frame_size, int channels) { float *float_dst = (float *)dst; @@ -3713,7 +3666,7 @@ static int iamf_delay_buffer_handle(IAMF_DecoderHandle handle, void *pcm) { audio_effect_peak_limiter_process_block(limiter, in, out, frame_size); } - iamf_decoder_plane2stride_out(pcm, out, frame_size, + iamf_decoder_plane2stride_out(handle->arch, pcm, out, frame_size, ctx->output_layout->channels, ctx->bit_depth); free(in); free(out); @@ -3921,7 +3874,7 @@ static int iamf_decoder_internal_decode(IAMF_DecoderHandle handle, swap((void **)&f->data, (void **)&out); } - iamf_decoder_plane2stride_out(pcm, f->data, real_frame_size, + iamf_decoder_plane2stride_out(handle->arch, pcm, f->data, real_frame_size, ctx->output_layout->channels, ctx->bit_depth); #if SR diff --git a/code/src/iamf_dec/arch.h b/code/src/iamf_dec/arch.h index ef2e6e48..e21411ed 100644 --- a/code/src/iamf_dec/arch.h +++ b/code/src/iamf_dec/arch.h @@ -20,6 +20,8 @@ #ifndef ARCH_H_ #define ARCH_H_ +#include + typedef struct ArchCallbacks { // Functions with possible architecture-specific optimizations struct { @@ -28,6 +30,17 @@ typedef struct ArchCallbacks { int out_next, float **in, float **out, int nsamples); } rendering; + struct { + void (*float2int16_zip_channels)(const float *src, int next_channel, + int channels, int16_t *int16_dst, + int nsamples); + void (*float2int24_zip_channels)(const float *src, int next_channel, + int channels, uint8_t *int24_dst, + int nsamples); + void (*float2int32_zip_channels)(const float *src, int next_channel, + int channels, int32_t *int32_dst, + int 
nsamples); + } output; } Arch; Arch *arch_create(); diff --git a/code/src/iamf_dec/arch/arch_common.c b/code/src/iamf_dec/arch/arch_common.c index 157c7a71..4dc50ab0 100644 --- a/code/src/iamf_dec/arch/arch_common.c +++ b/code/src/iamf_dec/arch/arch_common.c @@ -19,6 +19,8 @@ #include "arch_common.h" +#include + void multiply_channels_by_matrix_c(float *mat, int in_dim, int in_next, int *in_idx_map, int out_dim, int out_next, float **in, float **out, int nsamples) { @@ -41,3 +43,71 @@ void multiply_channels_by_matrix_c(float *mat, int in_dim, int in_next, } } } + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +static int16_t FLOAT2INT16(float x) { + x = x * (float)(1 << 15); + x = MAX(x, INT16_MIN); + x = MIN(x, INT16_MAX); + return (int16_t)lrintf(x); +} + +static int32_t FLOAT2INT24(float x) { + #define INT24_MAX (8388607) + #define INT24_MIN (-8388608) + + x = x * (float)(1 << 23); + x = MAX(x, (float)INT24_MIN); + x = MIN(x, (float)INT24_MAX); + return (int32_t)lrintf(x); +} + +static int32_t FLOAT2INT32(float x) { + // unary minus applied to maintain correct signedness + x = x * -(float)(1 << 31); + if (x > (float)INT32_MIN && x < (float)INT32_MAX) + return (int32_t)lrintf(x); + else + return (x > 0.0f ? INT32_MAX : INT32_MIN); +} + +void float2int16_zip_channels_c(const float *src, int next_channel, + int channels, int16_t *int16_dst, + int nsamples) { + int i, c; + + for (c = 0; c < channels; ++c) { + for (i = 0; i < nsamples; i++) { + int16_dst[i * channels + c] = FLOAT2INT16(src[next_channel * c + i]); + } + } +} + +void float2int24_zip_channels_c(const float *src, int next_channel, + int channels, uint8_t *int24_dst, + int nsamples) { + int i, c; + + for (c = 0; c < channels; ++c) { + for (i = 0; i < nsamples; i++) { + int32_t tmp = FLOAT2INT24(src[next_channel * c + i]); + int24_dst[(i * channels + c) * 3 + 0] = tmp & 0xff; + int24_dst[(i * channels + c) * 3 + 1] = (tmp >> 8) & 0xff; + int24_dst[(i * channels + c) * 3 + 2] = (tmp >> 16) & 0xff; + } + } +} + +void float2int32_zip_channels_c(const float *src, int next_channel, + int channels, int32_t *int32_dst, + int nsamples) { + int i, c; + + for (c = 0; c < channels; ++c) { + for (i = 0; i < nsamples; i++) { + int32_dst[i * channels + c] = FLOAT2INT32(src[next_channel * c + i]); + } + } +} diff --git a/code/src/iamf_dec/arch/arch_common.h b/code/src/iamf_dec/arch/arch_common.h index 8b92ba62..f3cc7efb 100644 --- a/code/src/iamf_dec/arch/arch_common.h +++ b/code/src/iamf_dec/arch/arch_common.h @@ -20,8 +20,19 @@ #ifndef ARCH_COMMON_H_ #define ARCH_COMMON_H_ +#include + void multiply_channels_by_matrix_c(float *mat, int in_dim, int in_next, int *in_idx_map, int out_dim, int out_next, float **in, float **out, int nsamples); +void float2int16_zip_channels_c(const float *src, int next_channel, + int channels, int16_t *int16_dst, int nsamples); + +void float2int24_zip_channels_c(const float *src, int next_channel, + int channels, uint8_t *int24_dst, int nsamples); + +void float2int32_zip_channels_c(const float *src, int next_channel, + int channels, int32_t *int32_dst, int nsamples); + #endif /* ARCH_COMMON_H_ */ diff --git a/code/src/iamf_dec/arch/arch_init.c b/code/src/iamf_dec/arch/arch_init.c index d4095e1a..6b6a7ea2 100644 --- a/code/src/iamf_dec/arch/arch_init.c +++ b/code/src/iamf_dec/arch/arch_init.c @@ -48,6 +48,10 @@ void arch_init(Arch* arch) { // Fill with reference implementations arch->rendering.multiply_channels_by_matrix = &multiply_channels_by_matrix_c; + 
arch->output.float2int16_zip_channels = &float2int16_zip_channels_c; + arch->output.float2int24_zip_channels = &float2int24_zip_channels_c; + arch->output.float2int32_zip_channels = &float2int32_zip_channels_c; + #if defined(HAS_ARCH_OVERRIDE) // Override with platform-specific functions, if available arch_override(arch); diff --git a/code/src/iamf_dec/arch/arm/arm_zip_channels.c b/code/src/iamf_dec/arch/arm/arm_zip_channels.c new file mode 100644 index 00000000..e2def168 --- /dev/null +++ b/code/src/iamf_dec/arch/arm/arm_zip_channels.c @@ -0,0 +1,811 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at www.aomedia.org/license/software-license/bsd-3-c-c. If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * www.aomedia.org/license/patent. + */ + +/** + * @file arm_zip_channels.c + * @brief Arm implementation for zipping channels. + * @version 0.1 + * @date Created 10/24/2024 + **/ + +#include "arm_zip_channels.h" + +#if defined(IAMF_ARCH_DETECTED_ARM) + +#include +#include + +#include "../arch_common.h" + +#define MUL_16BIT 32768.0f + +#define MUL_24BIT 8388608.f +#define RANGE_MIN_24BIT -8388608 +#define RANGE_MAX_24BIT 8388607 + +#define MUL_32BIT 2147483648.f + +static inline int32x4_t vroundf(float32x4_t x) +{ +#if defined(__ARM_ARCH) && __ARM_ARCH >= 8 + return vcvtaq_s32_f32(x); +#else + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), vdupq_n_u32(0x80000000)); + uint32x4_t bias = vdupq_n_u32(0x3F000000); + return vcvtq_s32_f32(vaddq_f32(x, vreinterpretq_f32_u32(vorrq_u32(bias, sign)))); +#endif +} + +static inline int16x4_t cvt_for_int16(float32x4_t vals) { + return vqmovn_s32(vroundf(vmulq_n_f32(vals, MUL_16BIT))); +} + +static inline int16x8x2_t cvt_for_int16_x2(float32x4x2_t vals0, + float32x4x2_t vals1) { + int16x8x2_t ret; + ret.val[0] = + vcombine_s16(cvt_for_int16(vals0.val[0]), cvt_for_int16(vals0.val[1])); + ret.val[1] = + vcombine_s16(cvt_for_int16(vals1.val[0]), cvt_for_int16(vals1.val[1])); + return ret; +} + +static inline int16x8x3_t cvt_for_int16_x3(float32x4x2_t vals0, + float32x4x2_t vals1, + float32x4x2_t vals2) { + int16x8x3_t ret; + ret.val[0] = + vcombine_s16(cvt_for_int16(vals0.val[0]), cvt_for_int16(vals0.val[1])); + ret.val[1] = + vcombine_s16(cvt_for_int16(vals1.val[0]), cvt_for_int16(vals1.val[1])); + ret.val[2] = + vcombine_s16(cvt_for_int16(vals2.val[0]), cvt_for_int16(vals2.val[1])); + return ret; +} + +static inline int32x4_t cvt_clamp_for_int24_s32(int32x4_t lower, + float32x4_t vals, + int32x4_t upper) { + return vmaxq_s32( + vminq_s32(vroundf(vmulq_n_f32(vals, MUL_24BIT)), upper), lower); +} + +static inline uint8x16_t cvt_clamp_for_int24_u8(int32x4_t lower, + float32x4_t vals, + int32x4_t upper) { + return vreinterpretq_u8_s32(vmaxq_s32( + vminq_s32(vroundf(vmulq_n_f32(vals, MUL_24BIT)), upper), lower)); +} + +static inline void write_consecutive_int24(uint8_t *ptr, uint8x8_t firstPart, + uint8_t secondPart) { + vst1_u8(ptr, firstPart); + ptr[8] = secondPart; +} + +static inline void write_pair_int24(uint8_t *ptr, int step, uint64_t val) { + ptr[0] = (uint8_t)((val >> 0) & 0xff); + ptr[1] = (uint8_t)((val >> 8) & 0xff); + ptr[2] = (uint8_t)((val >> 16) & 0xff); + + 
ptr[step + 0] = (uint8_t)((val >> 32) & 0xff); + ptr[step + 1] = (uint8_t)((val >> 40) & 0xff); + ptr[step + 2] = (uint8_t)((val >> 48) & 0xff); +} + +static inline int32x4_t cvt_for_int32(float32x4_t vals) { + return vroundf(vmulq_n_f32(vals, MUL_32BIT)); +} + +static inline int32x4x2_t cvt_for_int32_x2(float32x4_t vals0, + float32x4_t vals1) { + int32x4x2_t ret; + ret.val[0] = cvt_for_int32(vals0); + ret.val[1] = cvt_for_int32(vals1); + return ret; +} + +static inline int32x4x3_t cvt_for_int32_x3(float32x4_t vals0, float32x4_t vals1, + float32x4_t vals2) { + int32x4x3_t ret; + ret.val[0] = cvt_for_int32(vals0); + ret.val[1] = cvt_for_int32(vals1); + ret.val[2] = cvt_for_int32(vals2); + return ret; +} + +static inline int16x4x4_t transpose_s16_4x4(const int16x4_t a, const int16x4_t b, const int16x4_t c, + const int16x4_t d) { + int16x8_t aq = vcombine_s16(a, vdup_n_s16(0)); + int16x8_t bq = vcombine_s16(b, vdup_n_s16(0)); + int16x8_t cq = vcombine_s16(c, vdup_n_s16(0)); + int16x8_t dq = vcombine_s16(d, vdup_n_s16(0)); + + int16x8_t ac = vzipq_s16(aq, cq).val[0]; + int16x8_t bd = vzipq_s16(bq, dq).val[0]; + + int16x8x2_t abcd = vzipq_s16(ac, bd); + + int16x4x4_t ret = {{ + vget_low_s16(abcd.val[0]), + vget_high_s16(abcd.val[0]), + vget_low_s16(abcd.val[1]), + vget_high_s16(abcd.val[1]) + }}; + return ret; +} + +static inline int32x4x2_t vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { + int32x4x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vreinterpretq_s32_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s32_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else + b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); + b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); +#endif + return b0; +} + +static inline int32x4x4_t transpose_s32_4x4(int32x4_t a, int32x4_t b, int32x4_t c, + int32x4_t d) { + const int32x4x2_t trn_ab = vtrnq_s32(a, b); + const int32x4x2_t trn_cd = vtrnq_s32(c, d); + + const int32x4x2_t r0 = vtrnq_s64_to_s32(trn_ab.val[0], trn_cd.val[0]); + const int32x4x2_t r1 = vtrnq_s64_to_s32(trn_ab.val[1], trn_cd.val[1]); + + int32x4x4_t ret; + ret.val[0] = r0.val[0]; + ret.val[1] = r1.val[0]; + ret.val[2] = r0.val[1]; + ret.val[3] = r1.val[1]; + return ret; +} + +static int float2int16_zip_1channels(const float *src, int16_t *int16_dst, + int nsamples) { + const int BLOCK_SIZE = 32; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + int i; + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + i + 0), vld1q_f32(src + i + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + i + 8), vld1q_f32(src + i + 12) } }; + float32x4x2_t in_a_45 = { { vld1q_f32(src + i + 16), vld1q_f32(src + i + 20) } }; + float32x4x2_t in_a_67 = { { vld1q_f32(src + i + 24), vld1q_f32(src + i + 28) } }; + + int16x8x2_t out_01 = cvt_for_int16_x2(in_a_01, in_a_23); + int16x8x2_t out_23 = cvt_for_int16_x2(in_a_45, in_a_67); + + vst1q_s16(int16_dst + i + 0, out_01.val[0]); + vst1q_s16(int16_dst + i + 8, out_01.val[1]); + vst1q_s16(int16_dst + i + 16, out_23.val[0]); + vst1q_s16(int16_dst + i + 24, out_23.val[1]); + } + + return blocked_size; +} + +static int float2int16_zip_2channels(const float *src, int next_channel, + int16_t *int16_dst, int nsamples) { + const int BLOCK_SIZE = 16; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + int i; + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + i + 
0), vld1q_f32(src + i + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + i + 8), vld1q_f32(src + i + 12) } }; + float32x4x2_t in_b_01 = { { vld1q_f32(src + i + next_channel + 0), vld1q_f32(src + i + next_channel + 4) } }; + float32x4x2_t in_b_23 = { { vld1q_f32(src + i + next_channel + 8), vld1q_f32(src + i + next_channel + 12) } }; + + int16x8x2_t out_ab_01 = cvt_for_int16_x2(in_a_01, in_b_01); + int16x8x2_t out_ab_23 = cvt_for_int16_x2(in_a_23, in_b_23); + + vst2q_s16(int16_dst + i * 2 + 0, out_ab_01); + vst2q_s16(int16_dst + i * 2 + 16, out_ab_23); + } + + return blocked_size; +} + +static int float2int16_zip_3channels(const float *src, int next_channel, + int16_t *int16_dst, int nsamples) { + const int BLOCK_SIZE = 16; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + int i; + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + i + 0 * next_channel + 0), vld1q_f32(src + i + 0 * next_channel + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + i + 0 * next_channel + 8), vld1q_f32(src + i + 0 * next_channel + 12) } }; + float32x4x2_t in_b_01 = { { vld1q_f32(src + i + 1 * next_channel + 0), vld1q_f32(src + i + 1 * next_channel + 4) } }; + float32x4x2_t in_b_23 = { { vld1q_f32(src + i + 1 * next_channel + 8), vld1q_f32(src + i + 1 * next_channel + 12) } }; + float32x4x2_t in_c_01 = { { vld1q_f32(src + i + 2 * next_channel + 0), vld1q_f32(src + i + 2 * next_channel + 4) } }; + float32x4x2_t in_c_23 = { { vld1q_f32(src + i + 2 * next_channel + 8), vld1q_f32(src + i + 2 * next_channel + 12) } }; + + int16x8x3_t out_abc_01 = cvt_for_int16_x3(in_a_01, in_b_01, in_c_01); + int16x8x3_t out_abc_23 = cvt_for_int16_x3(in_a_23, in_b_23, in_c_23); + + vst3q_s16(int16_dst + i * 3 + 0, out_abc_01); + vst3q_s16(int16_dst + i * 3 + 24, out_abc_23); + } + + return blocked_size; +} + +static int float2int16_zip_nchannels(const float *src, int next_channel, + int channels, int16_t *int16_dst, + int nsamples) { + const int BATCH = 4; + const int BATCHED_BLOCK_SIZE = 8; + const int SINGLE_BLOCK_SIZE = 16; + const int bathed_channels = channels / BATCH * BATCH; + const int blocked_size = nsamples / BATCHED_BLOCK_SIZE * BATCHED_BLOCK_SIZE / + SINGLE_BLOCK_SIZE * SINGLE_BLOCK_SIZE; + + int i, c; + + for (c = 0; c < bathed_channels; c += BATCH) { + for (i = 0; i < blocked_size; i += BATCHED_BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + next_channel * (c + 0) + i), vld1q_f32(src + next_channel * (c + 0) + i + 4) } }; + float32x4x2_t in_b_01 = { { vld1q_f32(src + next_channel * (c + 1) + i), vld1q_f32(src + next_channel * (c + 1) + i + 4) } }; + float32x4x2_t in_c_01 = { { vld1q_f32(src + next_channel * (c + 2) + i), vld1q_f32(src + next_channel * (c + 2) + i + 4) } }; + float32x4x2_t in_d_01 = { { vld1q_f32(src + next_channel * (c + 3) + i), vld1q_f32(src + next_channel * (c + 3) + i + 4) } }; + + int16x4_t s32_a_0 = cvt_for_int16(in_a_01.val[0]); + int16x4_t s32_b_0 = cvt_for_int16(in_b_01.val[0]); + int16x4_t s32_c_0 = cvt_for_int16(in_c_01.val[0]); + int16x4_t s32_d_0 = cvt_for_int16(in_d_01.val[0]); + int16x4_t s32_a_1 = cvt_for_int16(in_a_01.val[1]); + int16x4_t s32_b_1 = cvt_for_int16(in_b_01.val[1]); + int16x4_t s32_c_1 = cvt_for_int16(in_c_01.val[1]); + int16x4_t s32_d_1 = cvt_for_int16(in_d_01.val[1]); + + int16x4x4_t transposed_abcd_0 = + transpose_s16_4x4(s32_a_0, s32_b_0, s32_c_0, s32_d_0); + int16x4x4_t transposed_abcd_1 = + transpose_s16_4x4(s32_a_1, s32_b_1, s32_c_1, s32_d_1); + + int16_t *ptr = int16_dst + i * channels 
+ c; + const int step = channels; + vst1_s16(ptr + step * 0, transposed_abcd_0.val[0]); + vst1_s16(ptr + step * 1, transposed_abcd_0.val[1]); + vst1_s16(ptr + step * 2, transposed_abcd_0.val[2]); + vst1_s16(ptr + step * 3, transposed_abcd_0.val[3]); + vst1_s16(ptr + step * 4, transposed_abcd_1.val[0]); + vst1_s16(ptr + step * 5, transposed_abcd_1.val[1]); + vst1_s16(ptr + step * 6, transposed_abcd_1.val[2]); + vst1_s16(ptr + step * 7, transposed_abcd_1.val[3]); + } + } + + for (c = bathed_channels; c < channels; ++c) { + for (i = 0; i < blocked_size; i += SINGLE_BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + next_channel * c + i + 0), vld1q_f32(src + next_channel * c + i + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + next_channel * c + i + 8), vld1q_f32(src + next_channel * c + i + 12) } }; + + uint64_t out_a_0 = + vget_lane_u64(vreinterpret_u64_s16(cvt_for_int16(in_a_01.val[0])), 0); + int64_t out_a_1 = + vget_lane_u64(vreinterpret_u64_s16(cvt_for_int16(in_a_01.val[1])), 0); + int64_t out_a_2 = + vget_lane_u64(vreinterpret_u64_s16(cvt_for_int16(in_a_23.val[0])), 0); + int64_t out_a_3 = + vget_lane_u64(vreinterpret_u64_s16(cvt_for_int16(in_a_23.val[1])), 0); + + int16_t *ptr = int16_dst + i * channels + c; + const int step = channels; + ptr[step * 0] = (int16_t)((out_a_0 >> 0) & 0xffff); + ptr[step * 1] = (int16_t)((out_a_0 >> 16) & 0xffff); + ptr[step * 2] = (int16_t)((out_a_0 >> 32) & 0xffff); + ptr[step * 3] = (int16_t)((out_a_0 >> 48) & 0xffff); + ptr[step * 4] = (int16_t)((out_a_1 >> 0) & 0xffff); + ptr[step * 5] = (int16_t)((out_a_1 >> 16) & 0xffff); + ptr[step * 6] = (int16_t)((out_a_1 >> 32) & 0xffff); + ptr[step * 7] = (int16_t)((out_a_1 >> 48) & 0xffff); + ptr[step * 8] = (int16_t)((out_a_2 >> 0) & 0xffff); + ptr[step * 9] = (int16_t)((out_a_2 >> 16) & 0xffff); + ptr[step * 10] = (int16_t)((out_a_2 >> 32) & 0xffff); + ptr[step * 11] = (int16_t)((out_a_2 >> 48) & 0xffff); + ptr[step * 12] = (int16_t)((out_a_3 >> 0) & 0xffff); + ptr[step * 13] = (int16_t)((out_a_3 >> 16) & 0xffff); + ptr[step * 14] = (int16_t)((out_a_3 >> 32) & 0xffff); + ptr[step * 15] = (int16_t)((out_a_3 >> 48) & 0xffff); + } + } + + return blocked_size; +} + +static inline uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) { +#if defined(__aarch64__) + uint8x16x2_t table = { { a, b } }; + return vqtbl2_u8(table, idx); +#else + uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b), + vget_high_u8(b) } }; + return vtbl4_u8(table, idx); +#endif +} + +static inline uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) { +#if defined(__aarch64__) + uint8x16x2_t table = { { a, b } }; + return vqtbl2q_u8(table, idx); +#else + uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b), + vget_high_u8(b) } }; + return vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)), + vtbl4_u8(table, vget_high_u8(idx))); +#endif +} + +static int float2int24_zip_1channels(const float *src, uint8_t *int24_dst, + int nsamples) { + const int BLOCK_SIZE = 8; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + + int i; + + static uint8_t MAP01[] = {0, 1, 2, 4, 5, 6, 8, 9, + 10, 12, 13, 14, 16, 17, 18, 20}; + static uint8_t MAP2[] = {21, 22, 24, 25, 26, 28, 29, 30, + 0, 0, 0, 0, 0, 0, 0, 0}; + uint8x16_t map01 = vld1q_u8(MAP01); + uint8x16_t map2 = vld1q_u8(MAP2); + int32x4_t min24 = vdupq_n_s32(RANGE_MIN_24BIT); + int32x4_t max24 = vdupq_n_s32(RANGE_MAX_24BIT); + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + i), 
vld1q_f32(src + i + 4) } }; + + uint8x16x2_t u8_a_01; + u8_a_01.val[0] = cvt_clamp_for_int24_u8(min24, in_a_01.val[0], max24); + u8_a_01.val[1] = cvt_clamp_for_int24_u8(min24, in_a_01.val[1], max24); + + uint8x16_t out_01 = tbl2q(u8_a_01.val[0], u8_a_01.val[1], map01); + uint8x8_t out_2 = vget_low_u8(tbl2q(u8_a_01.val[0], u8_a_01.val[1], map2)); + + vst1q_u8(int24_dst + i * 3 + 0, out_01); + vst1_u8(int24_dst + i * 3 + 16, out_2); + } + + return blocked_size; +} + +static int float2int24_zip_2channels(const float *src, int next_channel, + uint8_t *int24_dst, int nsamples) { + const int BLOCK_SIZE = 4; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + + int i; + + static uint8_t MAP01[] = {0, 1, 2, 16, 17, 18, 4, 5, + 6, 20, 21, 22, 8, 9, 10, 24}; + static uint8_t MAP2[] = {25, 26, 12, 13, 14, 28, 29, 30, + 0, 0, 0, 0, 0, 0, 0, 0}; + uint8x16_t map01 = vld1q_u8(MAP01); + uint8x16_t map2 = vld1q_u8(MAP2); + int32x4_t min24 = vdupq_n_s32(RANGE_MIN_24BIT); + int32x4_t max24 = vdupq_n_s32(RANGE_MAX_24BIT); + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4_t in_a_0 = vld1q_f32(src + i); + float32x4_t in_b_0 = vld1q_f32(src + next_channel + i); + + uint8x16x2_t u8_ab_0; + u8_ab_0.val[0] = cvt_clamp_for_int24_u8(min24, in_a_0, max24); + u8_ab_0.val[1] = cvt_clamp_for_int24_u8(min24, in_b_0, max24); + + uint8x16_t out_firstpart_ab_0 = tbl2q(u8_ab_0.val[0], u8_ab_0.val[1], map01); + uint8x8_t out_secondpart_ab_0 = vget_low_u8(tbl2q(u8_ab_0.val[0], u8_ab_0.val[1], map2)); + + vst1q_u8(int24_dst + (i * 2 + 0) * 3 + 0 + 0, out_firstpart_ab_0); + vst1_u8(int24_dst + (i * 2 + 0) * 3 + 15 + 1, out_secondpart_ab_0); + } + + return blocked_size; +} + +static inline uint8x16_t tbl1q(uint8x16_t a, uint8x16_t idx) { +#if defined(__aarch64__) + return vqtbl1q_u8(a, idx); +#else + uint8x8x2_t table = { { vget_low_u8(a), vget_high_u8(a) } }; + uint8x8_t lo = vtbl2_u8(table, vget_low_u8(idx)); + uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx)); + return vcombine_u8(lo, hi); +#endif +} + +static int float2int24_zip_nchannels(const float *src, int next_channel, + int channels, uint8_t *int24_dst, + int nsamples) { + const int BATCH = 3; + const int BATCHED_BLOCK_SIZE = 8; + const int SINGLE_BLOCK_SIZE = 8; + const int bathed_channels = channels / BATCH * BATCH; + const int blocked_size = nsamples / BATCHED_BLOCK_SIZE * BATCHED_BLOCK_SIZE / + SINGLE_BLOCK_SIZE * SINGLE_BLOCK_SIZE; + + int i, c; + + static uint8_t MAP01[] = {0, 1, 2, 4, 5, 6, 8, 9, + 10, 12, 13, 14, 16, 17, 18, 20}; + uint8x16_t map01 = vld1q_u8(MAP01); + int32x4_t min24 = vdupq_n_s32(RANGE_MIN_24BIT); + int32x4_t max24 = vdupq_n_s32(RANGE_MAX_24BIT); + + for (c = 0; c < bathed_channels; c += BATCH) { + for (i = 0; i < blocked_size; i += BATCHED_BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + next_channel * (c + 0) + i), vld1q_f32(src + next_channel * (c + 0) + i + 4) } }; + float32x4x2_t in_b_01 = { { vld1q_f32(src + next_channel * (c + 1) + i), vld1q_f32(src + next_channel * (c + 1) + i + 4) } }; + float32x4x2_t in_c_01 = { { vld1q_f32(src + next_channel * (c + 2) + i), vld1q_f32(src + next_channel * (c + 2) + i + 4) } }; + + int32x4_t s32_a_0 = cvt_clamp_for_int24_s32(min24, in_a_01.val[0], max24); + int32x4_t s32_b_0 = cvt_clamp_for_int24_s32(min24, in_b_01.val[0], max24); + int32x4_t s32_c_0 = cvt_clamp_for_int24_s32(min24, in_c_01.val[0], max24); + + int32x4_t s32_a_1 = cvt_clamp_for_int24_s32(min24, in_a_01.val[1], max24); + int32x4_t s32_b_1 = cvt_clamp_for_int24_s32(min24, in_b_01.val[1], max24); + 
int32x4_t s32_c_1 = cvt_clamp_for_int24_s32(min24, in_c_01.val[1], max24); + int32x4_t zeros = vdupq_n_s32(0); + + int32x4x4_t transposed_abcd_0 = + transpose_s32_4x4(s32_a_0, s32_b_0, s32_c_0, zeros); + uint8x16_t out_full_0 = vreinterpretq_u8_s32(transposed_abcd_0.val[0]); + uint8x16_t out_full_1 = vreinterpretq_u8_s32(transposed_abcd_0.val[1]); + uint8x16_t out_full_2 = vreinterpretq_u8_s32(transposed_abcd_0.val[2]); + uint8x16_t out_full_3 = vreinterpretq_u8_s32(transposed_abcd_0.val[3]); + + int32x4x4_t transposed_abcd_1 = + transpose_s32_4x4(s32_a_1, s32_b_1, s32_c_1, zeros); + uint8x16_t out_full_4 = vreinterpretq_u8_s32(transposed_abcd_1.val[0]); + uint8x16_t out_full_5 = vreinterpretq_u8_s32(transposed_abcd_1.val[1]); + uint8x16_t out_full_6 = vreinterpretq_u8_s32(transposed_abcd_1.val[2]); + uint8x16_t out_full_7 = vreinterpretq_u8_s32(transposed_abcd_1.val[3]); + + const uint8_t out_secondpart_0 = vgetq_lane_u8(out_full_0, 10); + const uint8_t out_secondpart_1 = vgetq_lane_u8(out_full_1, 10); + const uint8_t out_secondpart_2 = vgetq_lane_u8(out_full_2, 10); + const uint8_t out_secondpart_3 = vgetq_lane_u8(out_full_3, 10); + const uint8_t out_secondpart_4 = vgetq_lane_u8(out_full_4, 10); + const uint8_t out_secondpart_5 = vgetq_lane_u8(out_full_5, 10); + const uint8_t out_secondpart_6 = vgetq_lane_u8(out_full_6, 10); + const uint8_t out_secondpart_7 = vgetq_lane_u8(out_full_7, 10); + + uint8x8_t out_firstpart_0 = vget_low_u8(tbl1q(out_full_0, map01)); + uint8x8_t out_firstpart_1 = vget_low_u8(tbl1q(out_full_1, map01)); + uint8x8_t out_firstpart_2 = vget_low_u8(tbl1q(out_full_2, map01)); + uint8x8_t out_firstpart_3 = vget_low_u8(tbl1q(out_full_3, map01)); + uint8x8_t out_firstpart_4 = vget_low_u8(tbl1q(out_full_4, map01)); + uint8x8_t out_firstpart_5 = vget_low_u8(tbl1q(out_full_5, map01)); + uint8x8_t out_firstpart_6 = vget_low_u8(tbl1q(out_full_6, map01)); + uint8x8_t out_firstpart_7 = vget_low_u8(tbl1q(out_full_7, map01)); + + uint8_t *ptr = int24_dst + ((i + 0) * channels + c) * 3; + const int step = channels * 3; + write_consecutive_int24(ptr + step * 0, out_firstpart_0, + out_secondpart_0); + write_consecutive_int24(ptr + step * 1, out_firstpart_1, + out_secondpart_1); + write_consecutive_int24(ptr + step * 2, out_firstpart_2, + out_secondpart_2); + write_consecutive_int24(ptr + step * 3, out_firstpart_3, + out_secondpart_3); + write_consecutive_int24(ptr + step * 4, out_firstpart_4, + out_secondpart_4); + write_consecutive_int24(ptr + step * 5, out_firstpart_5, + out_secondpart_5); + write_consecutive_int24(ptr + step * 6, out_firstpart_6, + out_secondpart_6); + write_consecutive_int24(ptr + step * 7, out_firstpart_7, + out_secondpart_7); + } + } + + for (c = bathed_channels; c < channels; ++c) { + for (i = 0; i < blocked_size; i += SINGLE_BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + next_channel * c + i), vld1q_f32(src + next_channel * c + i + 4) } }; + + uint8x16_t out_full_0 = vreinterpretq_u8_s32( + cvt_clamp_for_int24_s32(min24, in_a_01.val[0], max24)); + uint8x16_t out_full_1 = vreinterpretq_u8_s32( + cvt_clamp_for_int24_s32(min24, in_a_01.val[1], max24)); + + uint64_t out_low_0 = vgetq_lane_u64(vreinterpretq_u64_u8(out_full_0), 0); + uint64_t out_high_0 = vgetq_lane_u64(vreinterpretq_u64_u8(out_full_0), 1); + uint64_t out_low_1 = vgetq_lane_u64(vreinterpretq_u64_u8(out_full_1), 0); + uint64_t out_high_1 = vgetq_lane_u64(vreinterpretq_u64_u8(out_full_1), 1); + + uint8_t *ptr = int24_dst + ((i + 0) * channels + c) * 3; + const int step = channels * 3; + 
write_pair_int24(ptr + step * 0, step, out_low_0); + write_pair_int24(ptr + step * 2, step, out_high_0); + write_pair_int24(ptr + step * 4, step, out_low_1); + write_pair_int24(ptr + step * 6, step, out_high_1); + } + } + + return blocked_size; +} + +static int float2int32_zip_1channels(const float *src, int32_t *int32_dst, + int nsamples) { + const int BLOCK_SIZE = 32; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + + int i; + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + i + 0), vld1q_f32(src + i + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + i + 8), vld1q_f32(src + i + 12) } }; + float32x4x2_t in_a_45 = { { vld1q_f32(src + i + 16), vld1q_f32(src + i + 20) } }; + float32x4x2_t in_a_67 = { { vld1q_f32(src + i + 24), vld1q_f32(src + i + 28) } }; + + int32x4x2_t out_01 = cvt_for_int32_x2(in_a_01.val[0], in_a_01.val[1]); + int32x4x2_t out_23 = cvt_for_int32_x2(in_a_23.val[0], in_a_23.val[1]); + int32x4x2_t out_45 = cvt_for_int32_x2(in_a_45.val[0], in_a_45.val[1]); + int32x4x2_t out_67 = cvt_for_int32_x2(in_a_67.val[0], in_a_67.val[1]); + + vst1q_s32(int32_dst + i + 0, out_01.val[0]); + vst1q_s32(int32_dst + i + 4, out_01.val[1]); + vst1q_s32(int32_dst + i + 8, out_23.val[0]); + vst1q_s32(int32_dst + i + 12, out_23.val[1]); + vst1q_s32(int32_dst + i + 16, out_45.val[0]); + vst1q_s32(int32_dst + i + 20, out_45.val[1]); + vst1q_s32(int32_dst + i + 24, out_67.val[0]); + vst1q_s32(int32_dst + i + 28, out_67.val[1]); + } + + return blocked_size; +} + +static int float2int32_zip_2channels(const float *src, int next_channel, + int32_t *int32_dst, int nsamples) { + const int BLOCK_SIZE = 16; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + + int i; + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + i + 0), vld1q_f32(src + i + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + i + 8), vld1q_f32(src + i + 12) } }; + float32x4x2_t in_b_01 = { { vld1q_f32(src + next_channel + i + 0), vld1q_f32(src + next_channel + i + 4) } }; + float32x4x2_t in_b_23 = { { vld1q_f32(src + next_channel + i + 8), vld1q_f32(src + next_channel + i + 12) } }; + + int32x4x2_t out_ab_0 = cvt_for_int32_x2(in_a_01.val[0], in_b_01.val[0]); + int32x4x2_t out_ab_1 = cvt_for_int32_x2(in_a_01.val[1], in_b_01.val[1]); + int32x4x2_t out_ab_2 = cvt_for_int32_x2(in_a_23.val[0], in_b_23.val[0]); + int32x4x2_t out_ab_3 = cvt_for_int32_x2(in_a_23.val[1], in_b_23.val[1]); + + vst2q_s32(int32_dst + i * 2 + 0, out_ab_0); + vst2q_s32(int32_dst + i * 2 + 8, out_ab_1); + vst2q_s32(int32_dst + i * 2 + 16, out_ab_2); + vst2q_s32(int32_dst + i * 2 + 24, out_ab_3); + } + + return blocked_size; +} + +static int float2int32_zip_3channels(const float *src, int next_channel, + int32_t *int32_dst, int nsamples) { + const int BLOCK_SIZE = 16; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + + int i; + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + i + 0), vld1q_f32(src + i + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + i + 8), vld1q_f32(src + i + 12) } }; + float32x4x2_t in_b_01 = { { vld1q_f32(src + next_channel + i + 0), vld1q_f32(src + next_channel + i + 4) } }; + float32x4x2_t in_b_23 = { { vld1q_f32(src + next_channel + i + 8), vld1q_f32(src + next_channel + i + 12) } }; + float32x4x2_t in_c_01 = { { vld1q_f32(src + next_channel * 2 + i + 0), vld1q_f32(src + next_channel * 2 + i + 4) } }; + float32x4x2_t in_c_23 = { { vld1q_f32(src + 
next_channel * 2 + i + 8), vld1q_f32(src + next_channel * 2 + i + 12) } }; + + int32x4x3_t out_abc_0 = + cvt_for_int32_x3(in_a_01.val[0], in_b_01.val[0], in_c_01.val[0]); + int32x4x3_t out_abc_1 = + cvt_for_int32_x3(in_a_01.val[1], in_b_01.val[1], in_c_01.val[1]); + int32x4x3_t out_abc_2 = + cvt_for_int32_x3(in_a_23.val[0], in_b_23.val[0], in_c_23.val[0]); + int32x4x3_t out_abc_3 = + cvt_for_int32_x3(in_a_23.val[1], in_b_23.val[1], in_c_23.val[1]); + + vst3q_s32(int32_dst + i * 3 + 0, out_abc_0); + vst3q_s32(int32_dst + i * 3 + 12, out_abc_1); + vst3q_s32(int32_dst + i * 3 + 24, out_abc_2); + vst3q_s32(int32_dst + i * 3 + 36, out_abc_3); + } + + return blocked_size; +} + +static int float2int32_zip_nchannels(const float *src, int next_channel, + int channels, int32_t *int32_dst, + int nsamples) { + const int BATCH = 4; + const int BATCHED_BLOCK_SIZE = 8; + const int SINGLE_BLOCK_SIZE = 16; + const int bathed_channels = channels / BATCH * BATCH; + const int blocked_size = nsamples / BATCHED_BLOCK_SIZE * BATCHED_BLOCK_SIZE / + SINGLE_BLOCK_SIZE * SINGLE_BLOCK_SIZE; + + int i, c; + + for (c = 0; c < bathed_channels; c += BATCH) { + for (i = 0; i < blocked_size; i += BATCHED_BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + next_channel * (c + 0) + i), vld1q_f32(src + next_channel * (c + 0) + i + 4) } }; + float32x4x2_t in_b_01 = { { vld1q_f32(src + next_channel * (c + 1) + i), vld1q_f32(src + next_channel * (c + 1) + i + 4) } }; + float32x4x2_t in_c_01 = { { vld1q_f32(src + next_channel * (c + 2) + i), vld1q_f32(src + next_channel * (c + 2) + i + 4) } }; + float32x4x2_t in_d_01 = { { vld1q_f32(src + next_channel * (c + 3) + i), vld1q_f32(src + next_channel * (c + 3) + i + 4) } }; + + int32x4x2_t s32_a_01 = cvt_for_int32_x2(in_a_01.val[0], in_a_01.val[1]); + int32x4x2_t s32_b_01 = cvt_for_int32_x2(in_b_01.val[0], in_b_01.val[1]); + int32x4x2_t s32_c_01 = cvt_for_int32_x2(in_c_01.val[0], in_c_01.val[1]); + int32x4x2_t s32_d_01 = cvt_for_int32_x2(in_d_01.val[0], in_d_01.val[1]); + + int32x4x4_t transposed_0 = transpose_s32_4x4( + s32_a_01.val[0], s32_b_01.val[0], s32_c_01.val[0], s32_d_01.val[0]); + int32x4x4_t transposed_1 = transpose_s32_4x4( + s32_a_01.val[1], s32_b_01.val[1], s32_c_01.val[1], s32_d_01.val[1]); + + int32_t *ptr = int32_dst + i * channels + c; + const int step = channels; + vst1q_s32(ptr + step * 0, transposed_0.val[0]); + vst1q_s32(ptr + step * 1, transposed_0.val[1]); + vst1q_s32(ptr + step * 2, transposed_0.val[2]); + vst1q_s32(ptr + step * 3, transposed_0.val[3]); + vst1q_s32(ptr + step * 4, transposed_1.val[0]); + vst1q_s32(ptr + step * 5, transposed_1.val[1]); + vst1q_s32(ptr + step * 6, transposed_1.val[2]); + vst1q_s32(ptr + step * 7, transposed_1.val[3]); + } + } + + for (c = bathed_channels; c < channels; ++c) { + for (i = 0; i < blocked_size; i += SINGLE_BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + next_channel * c + i + 0), vld1q_f32(src + next_channel * c + i + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + next_channel * c + i + 8), vld1q_f32(src + next_channel * c + i + 12) } }; + + int32x4_t out_scattered_a_0 = cvt_for_int32(in_a_01.val[0]); + int32x4_t out_scattered_a_1 = cvt_for_int32(in_a_01.val[1]); + int32x4_t out_scattered_a_2 = cvt_for_int32(in_a_23.val[0]); + int32x4_t out_scattered_a_3 = cvt_for_int32(in_a_23.val[1]); + + int64_t out_low_a_0 = + vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_0), 0); + int64_t out_high_a_0 = + vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_0), 1); + int64_t out_low_a_1 = + 
vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_1), 0); + int64_t out_high_a_1 = + vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_1), 1); + int64_t out_low_a_2 = + vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_2), 0); + int64_t out_high_a_2 = + vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_2), 1); + int64_t out_low_a_3 = + vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_3), 0); + int64_t out_high_a_3 = + vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_3), 1); + + int32_t *ptr = int32_dst + i * channels + c; + const int step = channels; + ptr[step * 0] = (int32_t)((out_low_a_0 >> 0) & 0xffffffff); + ptr[step * 1] = (int32_t)((out_low_a_0 >> 32) & 0xffffffff); + ptr[step * 2] = (int32_t)((out_high_a_0 >> 0) & 0xffffffff); + ptr[step * 3] = (int32_t)((out_high_a_0 >> 32) & 0xffffffff); + ptr[step * 4] = (int32_t)((out_low_a_1 >> 0) & 0xffffffff); + ptr[step * 5] = (int32_t)((out_low_a_1 >> 32) & 0xffffffff); + ptr[step * 6] = (int32_t)((out_high_a_1 >> 0) & 0xffffffff); + ptr[step * 7] = (int32_t)((out_high_a_1 >> 32) & 0xffffffff); + ptr[step * 8] = (int32_t)((out_low_a_2 >> 0) & 0xffffffff); + ptr[step * 9] = (int32_t)((out_low_a_2 >> 32) & 0xffffffff); + ptr[step * 10] = (int32_t)((out_high_a_2 >> 0) & 0xffffffff); + ptr[step * 11] = (int32_t)((out_high_a_2 >> 32) & 0xffffffff); + ptr[step * 12] = (int32_t)((out_low_a_3 >> 0) & 0xffffffff); + ptr[step * 13] = (int32_t)((out_low_a_3 >> 32) & 0xffffffff); + ptr[step * 14] = (int32_t)((out_high_a_3 >> 0) & 0xffffffff); + ptr[step * 15] = (int32_t)((out_high_a_3 >> 32) & 0xffffffff); + } + } + + return blocked_size; +} + +void float2int16_zip_channels_neon(const float *src, int next_channel, + int channels, int16_t *int16_dst, + int nsamples) { + int processed = 0; + + switch (channels) { + case 1: + processed = float2int16_zip_1channels(src, int16_dst, nsamples); + break; + case 2: + processed = + float2int16_zip_2channels(src, next_channel, int16_dst, nsamples); + break; + case 3: + processed = + float2int16_zip_3channels(src, next_channel, int16_dst, nsamples); + break; + default: + processed = float2int16_zip_nchannels(src, next_channel, channels, + int16_dst, nsamples); + break; + } + + // Let C version handle the residuals + float2int16_zip_channels_c(src + processed, next_channel, channels, + int16_dst + processed * channels, + nsamples - processed); +} + +void float2int24_zip_channels_neon(const float *src, int next_channel, + int channels, uint8_t *int24_dst, + int nsamples) { + int processed = 0; + + switch (channels) { + case 1: + processed = float2int24_zip_1channels(src, int24_dst, nsamples); + break; + case 2: + processed = + float2int24_zip_2channels(src, next_channel, int24_dst, nsamples); + break; + default: + processed = float2int24_zip_nchannels(src, next_channel, channels, + int24_dst, nsamples); + break; + } + + // Let C version handle the residuals + float2int24_zip_channels_c(src + processed, next_channel, channels, + int24_dst + processed * channels * 3, + nsamples - processed); +} + +void float2int32_zip_channels_neon(const float *src, int next_channel, + int channels, int32_t *int32_dst, + int nsamples) { + int processed = 0; + + switch (channels) { + case 1: + processed = float2int32_zip_1channels(src, int32_dst, nsamples); + break; + case 2: + processed = + float2int32_zip_2channels(src, next_channel, int32_dst, nsamples); + break; + case 3: + processed = + float2int32_zip_3channels(src, next_channel, int32_dst, nsamples); + break; + default: + processed = 
float2int32_zip_nchannels(src, next_channel, channels, + int32_dst, nsamples); + break; + } + + // Let C version handle the residuals + float2int32_zip_channels_c(src + processed, next_channel, channels, + int32_dst + processed * channels, + nsamples - processed); +} + +#endif /* IAMF_ARCH_DETECTED_ARM */ diff --git a/code/src/iamf_dec/arch/arm/arm_zip_channels.h b/code/src/iamf_dec/arch/arm/arm_zip_channels.h new file mode 100644 index 00000000..b8ba7b27 --- /dev/null +++ b/code/src/iamf_dec/arch/arm/arm_zip_channels.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at www.aomedia.org/license/software-license/bsd-3-c-c. If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * www.aomedia.org/license/patent. + */ + +/** + * @file arm_zip_channels.h + * @brief Arm implementation for zipping channels. + * @version 0.1 + * @date Created 10/24/2024 + **/ + +#ifndef ARM_ZIP_CHANNELS_H_ +#define ARM_ZIP_CHANNELS_H_ + +#include "detect_arm.h" + +#if defined(IAMF_ARCH_DETECTED_ARM) + +#include + +void float2int16_zip_channels_neon(const float *src, int next_channel, + int channels, int16_t *int16_dst, + int nsamples); + +void float2int24_zip_channels_neon(const float *src, int next_channel, + int channels, uint8_t *int24_dst, + int nsamples); + +void float2int32_zip_channels_neon(const float *src, int next_channel, + int channels, int32_t *int32_dst, + int nsamples); + +#endif /* IAMF_ARCH_DETECTED_ARM */ +#endif /* ARM_ZIP_CHANNELS_H_ */ diff --git a/code/src/iamf_dec/arch/arm/override_arm.c b/code/src/iamf_dec/arch/arm/override_arm.c index 15621f4c..d4f16dc1 100644 --- a/code/src/iamf_dec/arch/arm/override_arm.c +++ b/code/src/iamf_dec/arch/arm/override_arm.c @@ -25,12 +25,17 @@ #include "../../arch.h" #include "arm_multiply_channels.h" +#include "arm_zip_channels.h" void arch_override(Arch *arch) { // Override functions with Arm implementations here arch->rendering.multiply_channels_by_matrix = &multiply_channels_by_matrix_neon; + + arch->output.float2int16_zip_channels = &float2int16_zip_channels_neon; + arch->output.float2int24_zip_channels = &float2int24_zip_channels_neon; + arch->output.float2int32_zip_channels = &float2int32_zip_channels_neon; } #endif diff --git a/code/win64/VS2022/iamf/iamf.vcxproj b/code/win64/VS2022/iamf/iamf.vcxproj index 661addae..ad9a4fb1 100755 --- a/code/win64/VS2022/iamf/iamf.vcxproj +++ b/code/win64/VS2022/iamf/iamf.vcxproj @@ -136,6 +136,7 @@ + @@ -176,6 +177,7 @@ + diff --git a/code/win64/VS2022/iamf/iamf.vcxproj.filters b/code/win64/VS2022/iamf/iamf.vcxproj.filters index 1a838eba..d6ecb6c1 100755 --- a/code/win64/VS2022/iamf/iamf.vcxproj.filters +++ b/code/win64/VS2022/iamf/iamf.vcxproj.filters @@ -81,6 +81,9 @@ Source Files + + Source Files + Source Files @@ -146,6 +149,9 @@ Header Files + + Header Files + Header Files
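
Reviewer note, not part of the patch: below is a minimal sketch of how the new zip-channels paths could be cross-checked against each other. Everything in it is an assumption for illustration — the include paths, the 6-channel/960-sample buffer shape and the main() harness are made up, and it presumes the patched decoder sources are compiled into the same target. It exercises only the 16-bit path; the 24- and 32-bit paths follow the same pattern with float2int24_zip_channels_* and float2int32_zip_channels_*.

/*
 * Reviewer-side consistency check (not part of the patch). Assumes the
 * patched decoder sources are built into the same binary and that the
 * include paths below resolve to code/src/iamf_dec/.
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "arch/arch_common.h"
#include "arch/arm/arm_zip_channels.h" /* self-guarded by detect_arm.h */

int main(void) {
  const int channels = 6;            /* illustrative 5.1 output layout */
  const int nsamples = 960;          /* illustrative frame size */
  const int next_channel = nsamples; /* stride between planar channel buffers */

  float *src = malloc(sizeof(float) * channels * nsamples);
  int16_t *ref = malloc(sizeof(int16_t) * channels * nsamples);
  int16_t *opt = malloc(sizeof(int16_t) * channels * nsamples);
  if (!src || !ref || !opt) return 1;

  /* Planar input: channel c occupies src[c * next_channel + 0 .. nsamples). */
  for (int c = 0; c < channels; ++c)
    for (int i = 0; i < nsamples; ++i)
      src[c * next_channel + i] = 0.9f * sinf(0.01f * (float)(i + 7 * c));

  /* C reference path added by this patch. */
  float2int16_zip_channels_c(src, next_channel, channels, ref, nsamples);

#if defined(IAMF_ARCH_DETECTED_ARM)
  /* Neon override. lrintf in the C path rounds ties to even while the Neon
   * conversion rounds ties away from zero, so exact .5 inputs may differ by
   * one LSB; allow a +/-1 tolerance. */
  float2int16_zip_channels_neon(src, next_channel, channels, opt, nsamples);
  int max_diff = 0;
  for (int n = 0; n < channels * nsamples; ++n) {
    int d = abs((int)ref[n] - (int)opt[n]);
    if (d > max_diff) max_diff = d;
  }
  printf("max |C - Neon| difference: %d (%s)\n", max_diff,
         max_diff <= 1 ? "ok" : "unexpected");
#else
  printf("Neon path not built for this target\n");
#endif

  free(src);
  free(ref);
  free(opt);
  return 0;
}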