From f0e417bb8be8b062dd8b407172508c0e8b0da60b Mon Sep 17 00:00:00 2001 From: Sandor Zsombor Vegh Date: Wed, 6 Nov 2024 12:59:02 +0100 Subject: [PATCH] Arm: Speed up iamf_decoder_plane2stride_out Extract zipping channels with conversion to int16, int24 and int32 outputs as separate functions, with C and Neon implementations. Reorder loops in C implementation to be more cache-friendly and autovectorizable. --- code/src/iamf_dec/IAMF_decoder.c | 81 +- code/src/iamf_dec/arch.h | 13 + code/src/iamf_dec/arch/arch_common.c | 70 ++ code/src/iamf_dec/arch/arch_common.h | 11 + code/src/iamf_dec/arch/arch_init.c | 4 + code/src/iamf_dec/arch/arm/arm_zip_channels.c | 811 ++++++++++++++++++ code/src/iamf_dec/arch/arm/arm_zip_channels.h | 42 + code/src/iamf_dec/arch/arm/override_arm.c | 5 + code/win64/VS2022/iamf/iamf.vcxproj | 2 + code/win64/VS2022/iamf/iamf.vcxproj.filters | 6 + 10 files changed, 981 insertions(+), 64 deletions(-) create mode 100644 code/src/iamf_dec/arch/arm/arm_zip_channels.c create mode 100644 code/src/iamf_dec/arch/arm/arm_zip_channels.h diff --git a/code/src/iamf_dec/IAMF_decoder.c b/code/src/iamf_dec/IAMF_decoder.c index 675cbd4b..51151b6d 100644 --- a/code/src/iamf_dec/IAMF_decoder.c +++ b/code/src/iamf_dec/IAMF_decoder.c @@ -22,7 +22,6 @@ #endif #include -#include #include "IAMF_OBU.h" #include "IAMF_debug.h" @@ -76,74 +75,28 @@ static int64_t time_transform(int64_t t1, int s1, int s2) { } /* ----------------------------- Internal methods ------------------ */ -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) -static int16_t FLOAT2INT16(float x) { - x = x * 32768.f; - x = MAX(x, -32768.f); - x = MIN(x, 32767.f); - return (int16_t)lrintf(x); -} - -static int32_t FLOAT2INT24(float x) { - x = x * 8388608.f; - x = MAX(x, -8388608.f); - x = MIN(x, 8388607.f); - return (int32_t)lrintf(x); -} -static int32_t FLOAT2INT32(float x) { - x = x * 2147483648.f; - if (x > -2147483648.f && x < 2147483647.f) - return (int32_t)lrintf(x); - else - return (x > 0.0f ? 
2147483647 : (-2147483647 - 1)); -} +static void iamf_decoder_plane2stride_out(const Arch *arch, void *dst, + const float *src, int frame_size, + int channels, uint32_t bit_depth) { + if (!src) { + if (bit_depth == 16 || bit_depth == 24 || bit_depth == 32) + memset(dst, 0x0, frame_size * channels * (bit_depth / 8)); + return; + } -static void iamf_decoder_plane2stride_out(void *dst, const float *src, - int frame_size, int channels, - uint32_t bit_depth) { if (bit_depth == 16) { - int16_t *int16_dst = (int16_t *)dst; - for (int c = 0; c < channels; ++c) { - for (int i = 0; i < frame_size; i++) { - if (src) { - int16_dst[i * channels + c] = FLOAT2INT16(src[frame_size * c + i]); - } else { - int16_dst[i * channels + c] = 0; - } - } - } + (*arch->output.float2int16_zip_channels)(src, frame_size, channels, + (int16_t *)dst, frame_size); } else if (bit_depth == 24) { - uint8_t *int24_dst = (uint8_t *)dst; - for (int c = 0; c < channels; ++c) { - for (int i = 0; i < frame_size; i++) { - if (src) { - int32_t tmp = FLOAT2INT24(src[frame_size * c + i]); - int24_dst[(i * channels + c) * 3] = tmp & 0xff; - int24_dst[(i * channels + c) * 3 + 1] = (tmp >> 8) & 0xff; - int24_dst[(i * channels + c) * 3 + 2] = - ((tmp >> 16) & 0x7f) | ((tmp >> 24) & 0x80); - } else { - int24_dst[(i * channels + c) * 3] = 0; - int24_dst[(i * channels + c) * 3 + 1] = 0; - int24_dst[(i * channels + c) * 3 + 2] = 0; - } - } - } + (*arch->output.float2int24_zip_channels)(src, frame_size, channels, + (uint8_t *)dst, frame_size); } else if (bit_depth == 32) { - int32_t *int32_dst = (int32_t *)dst; - for (int c = 0; c < channels; ++c) { - for (int i = 0; i < frame_size; i++) { - if (src) { - int32_dst[i * channels + c] = FLOAT2INT32(src[frame_size * c + i]); - } else { - int32_dst[i * channels + c] = 0; - } - } - } + (*arch->output.float2int32_zip_channels)(src, frame_size, channels, + (int32_t *)dst, frame_size); } } + static void ia_decoder_stride2plane_out_float(void *dst, const float *src, int frame_size, int channels) { float *float_dst = (float *)dst; @@ -3713,7 +3666,7 @@ static int iamf_delay_buffer_handle(IAMF_DecoderHandle handle, void *pcm) { audio_effect_peak_limiter_process_block(limiter, in, out, frame_size); } - iamf_decoder_plane2stride_out(pcm, out, frame_size, + iamf_decoder_plane2stride_out(handle->arch, pcm, out, frame_size, ctx->output_layout->channels, ctx->bit_depth); free(in); free(out); @@ -3921,7 +3874,7 @@ static int iamf_decoder_internal_decode(IAMF_DecoderHandle handle, swap((void **)&f->data, (void **)&out); } - iamf_decoder_plane2stride_out(pcm, f->data, real_frame_size, + iamf_decoder_plane2stride_out(handle->arch, pcm, f->data, real_frame_size, ctx->output_layout->channels, ctx->bit_depth); #if SR diff --git a/code/src/iamf_dec/arch.h b/code/src/iamf_dec/arch.h index ef2e6e48..e21411ed 100644 --- a/code/src/iamf_dec/arch.h +++ b/code/src/iamf_dec/arch.h @@ -20,6 +20,8 @@ #ifndef ARCH_H_ #define ARCH_H_ +#include + typedef struct ArchCallbacks { // Functions with possible architecture-specific optimizations struct { @@ -28,6 +30,17 @@ typedef struct ArchCallbacks { int out_next, float **in, float **out, int nsamples); } rendering; + struct { + void (*float2int16_zip_channels)(const float *src, int next_channel, + int channels, int16_t *int16_dst, + int nsamples); + void (*float2int24_zip_channels)(const float *src, int next_channel, + int channels, uint8_t *int24_dst, + int nsamples); + void (*float2int32_zip_channels)(const float *src, int next_channel, + int channels, int32_t *int32_dst, + int 
nsamples); + } output; } Arch; Arch *arch_create(); diff --git a/code/src/iamf_dec/arch/arch_common.c b/code/src/iamf_dec/arch/arch_common.c index 157c7a71..4dc50ab0 100644 --- a/code/src/iamf_dec/arch/arch_common.c +++ b/code/src/iamf_dec/arch/arch_common.c @@ -19,6 +19,8 @@ #include "arch_common.h" +#include + void multiply_channels_by_matrix_c(float *mat, int in_dim, int in_next, int *in_idx_map, int out_dim, int out_next, float **in, float **out, int nsamples) { @@ -41,3 +43,71 @@ void multiply_channels_by_matrix_c(float *mat, int in_dim, int in_next, } } } + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +static int16_t FLOAT2INT16(float x) { + x = x * (float)(1 << 15); + x = MAX(x, INT16_MIN); + x = MIN(x, INT16_MAX); + return (int16_t)lrintf(x); +} + +static int32_t FLOAT2INT24(float x) { + #define INT24_MAX (8388607) + #define INT24_MIN (-8388608) + + x = x * (float)(1 << 23); + x = MAX(x, (float)INT24_MIN); + x = MIN(x, (float)INT24_MAX); + return (int32_t)lrintf(x); +} + +static int32_t FLOAT2INT32(float x) { + // unary minus applied to maintain correct signedness + x = x * -(float)(1 << 31); + if (x > (float)INT32_MIN && x < (float)INT32_MAX) + return (int32_t)lrintf(x); + else + return (x > 0.0f ? INT32_MAX : INT32_MIN); +} + +void float2int16_zip_channels_c(const float *src, int next_channel, + int channels, int16_t *int16_dst, + int nsamples) { + int i, c; + + for (c = 0; c < channels; ++c) { + for (i = 0; i < nsamples; i++) { + int16_dst[i * channels + c] = FLOAT2INT16(src[next_channel * c + i]); + } + } +} + +void float2int24_zip_channels_c(const float *src, int next_channel, + int channels, uint8_t *int24_dst, + int nsamples) { + int i, c; + + for (c = 0; c < channels; ++c) { + for (i = 0; i < nsamples; i++) { + int32_t tmp = FLOAT2INT24(src[next_channel * c + i]); + int24_dst[(i * channels + c) * 3 + 0] = tmp & 0xff; + int24_dst[(i * channels + c) * 3 + 1] = (tmp >> 8) & 0xff; + int24_dst[(i * channels + c) * 3 + 2] = (tmp >> 16) & 0xff; + } + } +} + +void float2int32_zip_channels_c(const float *src, int next_channel, + int channels, int32_t *int32_dst, + int nsamples) { + int i, c; + + for (c = 0; c < channels; ++c) { + for (i = 0; i < nsamples; i++) { + int32_dst[i * channels + c] = FLOAT2INT32(src[next_channel * c + i]); + } + } +} diff --git a/code/src/iamf_dec/arch/arch_common.h b/code/src/iamf_dec/arch/arch_common.h index 8b92ba62..f3cc7efb 100644 --- a/code/src/iamf_dec/arch/arch_common.h +++ b/code/src/iamf_dec/arch/arch_common.h @@ -20,8 +20,19 @@ #ifndef ARCH_COMMON_H_ #define ARCH_COMMON_H_ +#include + void multiply_channels_by_matrix_c(float *mat, int in_dim, int in_next, int *in_idx_map, int out_dim, int out_next, float **in, float **out, int nsamples); +void float2int16_zip_channels_c(const float *src, int next_channel, + int channels, int16_t *int16_dst, int nsamples); + +void float2int24_zip_channels_c(const float *src, int next_channel, + int channels, uint8_t *int24_dst, int nsamples); + +void float2int32_zip_channels_c(const float *src, int next_channel, + int channels, int32_t *int32_dst, int nsamples); + #endif /* ARCH_COMMON_H_ */ diff --git a/code/src/iamf_dec/arch/arch_init.c b/code/src/iamf_dec/arch/arch_init.c index d4095e1a..6b6a7ea2 100644 --- a/code/src/iamf_dec/arch/arch_init.c +++ b/code/src/iamf_dec/arch/arch_init.c @@ -48,6 +48,10 @@ void arch_init(Arch* arch) { // Fill with reference implementations arch->rendering.multiply_channels_by_matrix = &multiply_channels_by_matrix_c; + 
arch->output.float2int16_zip_channels = &float2int16_zip_channels_c; + arch->output.float2int24_zip_channels = &float2int24_zip_channels_c; + arch->output.float2int32_zip_channels = &float2int32_zip_channels_c; + #if defined(HAS_ARCH_OVERRIDE) // Override with platform-specific functions, if available arch_override(arch); diff --git a/code/src/iamf_dec/arch/arm/arm_zip_channels.c b/code/src/iamf_dec/arch/arm/arm_zip_channels.c new file mode 100644 index 00000000..e2def168 --- /dev/null +++ b/code/src/iamf_dec/arch/arm/arm_zip_channels.c @@ -0,0 +1,811 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at www.aomedia.org/license/software-license/bsd-3-c-c. If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * www.aomedia.org/license/patent. + */ + +/** + * @file arm_zip_channels.c + * @brief Arm implementation for zipping channels. + * @version 0.1 + * @date Created 10/24/2024 + **/ + +#include "arm_zip_channels.h" + +#if defined(IAMF_ARCH_DETECTED_ARM) + +#include +#include + +#include "../arch_common.h" + +#define MUL_16BIT 32768.0f + +#define MUL_24BIT 8388608.f +#define RANGE_MIN_24BIT -8388608 +#define RANGE_MAX_24BIT 8388607 + +#define MUL_32BIT 2147483648.f + +static inline int32x4_t vroundf(float32x4_t x) +{ +#if defined(__ARM_ARCH) && __ARM_ARCH >= 8 + return vcvtaq_s32_f32(x); +#else + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), vdupq_n_u32(0x80000000)); + uint32x4_t bias = vdupq_n_u32(0x3F000000); + return vcvtq_s32_f32(vaddq_f32(x, vreinterpretq_f32_u32(vorrq_u32(bias, sign)))); +#endif +} + +static inline int16x4_t cvt_for_int16(float32x4_t vals) { + return vqmovn_s32(vroundf(vmulq_n_f32(vals, MUL_16BIT))); +} + +static inline int16x8x2_t cvt_for_int16_x2(float32x4x2_t vals0, + float32x4x2_t vals1) { + int16x8x2_t ret; + ret.val[0] = + vcombine_s16(cvt_for_int16(vals0.val[0]), cvt_for_int16(vals0.val[1])); + ret.val[1] = + vcombine_s16(cvt_for_int16(vals1.val[0]), cvt_for_int16(vals1.val[1])); + return ret; +} + +static inline int16x8x3_t cvt_for_int16_x3(float32x4x2_t vals0, + float32x4x2_t vals1, + float32x4x2_t vals2) { + int16x8x3_t ret; + ret.val[0] = + vcombine_s16(cvt_for_int16(vals0.val[0]), cvt_for_int16(vals0.val[1])); + ret.val[1] = + vcombine_s16(cvt_for_int16(vals1.val[0]), cvt_for_int16(vals1.val[1])); + ret.val[2] = + vcombine_s16(cvt_for_int16(vals2.val[0]), cvt_for_int16(vals2.val[1])); + return ret; +} + +static inline int32x4_t cvt_clamp_for_int24_s32(int32x4_t lower, + float32x4_t vals, + int32x4_t upper) { + return vmaxq_s32( + vminq_s32(vroundf(vmulq_n_f32(vals, MUL_24BIT)), upper), lower); +} + +static inline uint8x16_t cvt_clamp_for_int24_u8(int32x4_t lower, + float32x4_t vals, + int32x4_t upper) { + return vreinterpretq_u8_s32(vmaxq_s32( + vminq_s32(vroundf(vmulq_n_f32(vals, MUL_24BIT)), upper), lower)); +} + +static inline void write_consecutive_int24(uint8_t *ptr, uint8x8_t firstPart, + uint8_t secondPart) { + vst1_u8(ptr, firstPart); + ptr[8] = secondPart; +} + +static inline void write_pair_int24(uint8_t *ptr, int step, uint64_t val) { + ptr[0] = (uint8_t)((val >> 0) & 0xff); + ptr[1] = (uint8_t)((val >> 8) & 0xff); + ptr[2] = (uint8_t)((val >> 16) & 0xff); + + 
ptr[step + 0] = (uint8_t)((val >> 32) & 0xff); + ptr[step + 1] = (uint8_t)((val >> 40) & 0xff); + ptr[step + 2] = (uint8_t)((val >> 48) & 0xff); +} + +static inline int32x4_t cvt_for_int32(float32x4_t vals) { + return vroundf(vmulq_n_f32(vals, MUL_32BIT)); +} + +static inline int32x4x2_t cvt_for_int32_x2(float32x4_t vals0, + float32x4_t vals1) { + int32x4x2_t ret; + ret.val[0] = cvt_for_int32(vals0); + ret.val[1] = cvt_for_int32(vals1); + return ret; +} + +static inline int32x4x3_t cvt_for_int32_x3(float32x4_t vals0, float32x4_t vals1, + float32x4_t vals2) { + int32x4x3_t ret; + ret.val[0] = cvt_for_int32(vals0); + ret.val[1] = cvt_for_int32(vals1); + ret.val[2] = cvt_for_int32(vals2); + return ret; +} + +static inline int16x4x4_t transpose_s16_4x4(const int16x4_t a, const int16x4_t b, const int16x4_t c, + const int16x4_t d) { + int16x8_t aq = vcombine_s16(a, vdup_n_s16(0)); + int16x8_t bq = vcombine_s16(b, vdup_n_s16(0)); + int16x8_t cq = vcombine_s16(c, vdup_n_s16(0)); + int16x8_t dq = vcombine_s16(d, vdup_n_s16(0)); + + int16x8_t ac = vzipq_s16(aq, cq).val[0]; + int16x8_t bd = vzipq_s16(bq, dq).val[0]; + + int16x8x2_t abcd = vzipq_s16(ac, bd); + + int16x4x4_t ret = {{ + vget_low_s16(abcd.val[0]), + vget_high_s16(abcd.val[0]), + vget_low_s16(abcd.val[1]), + vget_high_s16(abcd.val[1]) + }}; + return ret; +} + +static inline int32x4x2_t vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { + int32x4x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vreinterpretq_s32_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s32_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else + b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); + b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); +#endif + return b0; +} + +static inline int32x4x4_t transpose_s32_4x4(int32x4_t a, int32x4_t b, int32x4_t c, + int32x4_t d) { + const int32x4x2_t trn_ab = vtrnq_s32(a, b); + const int32x4x2_t trn_cd = vtrnq_s32(c, d); + + const int32x4x2_t r0 = vtrnq_s64_to_s32(trn_ab.val[0], trn_cd.val[0]); + const int32x4x2_t r1 = vtrnq_s64_to_s32(trn_ab.val[1], trn_cd.val[1]); + + int32x4x4_t ret; + ret.val[0] = r0.val[0]; + ret.val[1] = r1.val[0]; + ret.val[2] = r0.val[1]; + ret.val[3] = r1.val[1]; + return ret; +} + +static int float2int16_zip_1channels(const float *src, int16_t *int16_dst, + int nsamples) { + const int BLOCK_SIZE = 32; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + int i; + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + i + 0), vld1q_f32(src + i + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + i + 8), vld1q_f32(src + i + 12) } }; + float32x4x2_t in_a_45 = { { vld1q_f32(src + i + 16), vld1q_f32(src + i + 20) } }; + float32x4x2_t in_a_67 = { { vld1q_f32(src + i + 24), vld1q_f32(src + i + 28) } }; + + int16x8x2_t out_01 = cvt_for_int16_x2(in_a_01, in_a_23); + int16x8x2_t out_23 = cvt_for_int16_x2(in_a_45, in_a_67); + + vst1q_s16(int16_dst + i + 0, out_01.val[0]); + vst1q_s16(int16_dst + i + 8, out_01.val[1]); + vst1q_s16(int16_dst + i + 16, out_23.val[0]); + vst1q_s16(int16_dst + i + 24, out_23.val[1]); + } + + return blocked_size; +} + +static int float2int16_zip_2channels(const float *src, int next_channel, + int16_t *int16_dst, int nsamples) { + const int BLOCK_SIZE = 16; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + int i; + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + i + 
0), vld1q_f32(src + i + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + i + 8), vld1q_f32(src + i + 12) } }; + float32x4x2_t in_b_01 = { { vld1q_f32(src + i + next_channel + 0), vld1q_f32(src + i + next_channel + 4) } }; + float32x4x2_t in_b_23 = { { vld1q_f32(src + i + next_channel + 8), vld1q_f32(src + i + next_channel + 12) } }; + + int16x8x2_t out_ab_01 = cvt_for_int16_x2(in_a_01, in_b_01); + int16x8x2_t out_ab_23 = cvt_for_int16_x2(in_a_23, in_b_23); + + vst2q_s16(int16_dst + i * 2 + 0, out_ab_01); + vst2q_s16(int16_dst + i * 2 + 16, out_ab_23); + } + + return blocked_size; +} + +static int float2int16_zip_3channels(const float *src, int next_channel, + int16_t *int16_dst, int nsamples) { + const int BLOCK_SIZE = 16; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + int i; + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + i + 0 * next_channel + 0), vld1q_f32(src + i + 0 * next_channel + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + i + 0 * next_channel + 8), vld1q_f32(src + i + 0 * next_channel + 12) } }; + float32x4x2_t in_b_01 = { { vld1q_f32(src + i + 1 * next_channel + 0), vld1q_f32(src + i + 1 * next_channel + 4) } }; + float32x4x2_t in_b_23 = { { vld1q_f32(src + i + 1 * next_channel + 8), vld1q_f32(src + i + 1 * next_channel + 12) } }; + float32x4x2_t in_c_01 = { { vld1q_f32(src + i + 2 * next_channel + 0), vld1q_f32(src + i + 2 * next_channel + 4) } }; + float32x4x2_t in_c_23 = { { vld1q_f32(src + i + 2 * next_channel + 8), vld1q_f32(src + i + 2 * next_channel + 12) } }; + + int16x8x3_t out_abc_01 = cvt_for_int16_x3(in_a_01, in_b_01, in_c_01); + int16x8x3_t out_abc_23 = cvt_for_int16_x3(in_a_23, in_b_23, in_c_23); + + vst3q_s16(int16_dst + i * 3 + 0, out_abc_01); + vst3q_s16(int16_dst + i * 3 + 24, out_abc_23); + } + + return blocked_size; +} + +static int float2int16_zip_nchannels(const float *src, int next_channel, + int channels, int16_t *int16_dst, + int nsamples) { + const int BATCH = 4; + const int BATCHED_BLOCK_SIZE = 8; + const int SINGLE_BLOCK_SIZE = 16; + const int bathed_channels = channels / BATCH * BATCH; + const int blocked_size = nsamples / BATCHED_BLOCK_SIZE * BATCHED_BLOCK_SIZE / + SINGLE_BLOCK_SIZE * SINGLE_BLOCK_SIZE; + + int i, c; + + for (c = 0; c < bathed_channels; c += BATCH) { + for (i = 0; i < blocked_size; i += BATCHED_BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + next_channel * (c + 0) + i), vld1q_f32(src + next_channel * (c + 0) + i + 4) } }; + float32x4x2_t in_b_01 = { { vld1q_f32(src + next_channel * (c + 1) + i), vld1q_f32(src + next_channel * (c + 1) + i + 4) } }; + float32x4x2_t in_c_01 = { { vld1q_f32(src + next_channel * (c + 2) + i), vld1q_f32(src + next_channel * (c + 2) + i + 4) } }; + float32x4x2_t in_d_01 = { { vld1q_f32(src + next_channel * (c + 3) + i), vld1q_f32(src + next_channel * (c + 3) + i + 4) } }; + + int16x4_t s32_a_0 = cvt_for_int16(in_a_01.val[0]); + int16x4_t s32_b_0 = cvt_for_int16(in_b_01.val[0]); + int16x4_t s32_c_0 = cvt_for_int16(in_c_01.val[0]); + int16x4_t s32_d_0 = cvt_for_int16(in_d_01.val[0]); + int16x4_t s32_a_1 = cvt_for_int16(in_a_01.val[1]); + int16x4_t s32_b_1 = cvt_for_int16(in_b_01.val[1]); + int16x4_t s32_c_1 = cvt_for_int16(in_c_01.val[1]); + int16x4_t s32_d_1 = cvt_for_int16(in_d_01.val[1]); + + int16x4x4_t transposed_abcd_0 = + transpose_s16_4x4(s32_a_0, s32_b_0, s32_c_0, s32_d_0); + int16x4x4_t transposed_abcd_1 = + transpose_s16_4x4(s32_a_1, s32_b_1, s32_c_1, s32_d_1); + + int16_t *ptr = int16_dst + i * channels 
+ c; + const int step = channels; + vst1_s16(ptr + step * 0, transposed_abcd_0.val[0]); + vst1_s16(ptr + step * 1, transposed_abcd_0.val[1]); + vst1_s16(ptr + step * 2, transposed_abcd_0.val[2]); + vst1_s16(ptr + step * 3, transposed_abcd_0.val[3]); + vst1_s16(ptr + step * 4, transposed_abcd_1.val[0]); + vst1_s16(ptr + step * 5, transposed_abcd_1.val[1]); + vst1_s16(ptr + step * 6, transposed_abcd_1.val[2]); + vst1_s16(ptr + step * 7, transposed_abcd_1.val[3]); + } + } + + for (c = bathed_channels; c < channels; ++c) { + for (i = 0; i < blocked_size; i += SINGLE_BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + next_channel * c + i + 0), vld1q_f32(src + next_channel * c + i + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + next_channel * c + i + 8), vld1q_f32(src + next_channel * c + i + 12) } }; + + uint64_t out_a_0 = + vget_lane_u64(vreinterpret_u64_s16(cvt_for_int16(in_a_01.val[0])), 0); + int64_t out_a_1 = + vget_lane_u64(vreinterpret_u64_s16(cvt_for_int16(in_a_01.val[1])), 0); + int64_t out_a_2 = + vget_lane_u64(vreinterpret_u64_s16(cvt_for_int16(in_a_23.val[0])), 0); + int64_t out_a_3 = + vget_lane_u64(vreinterpret_u64_s16(cvt_for_int16(in_a_23.val[1])), 0); + + int16_t *ptr = int16_dst + i * channels + c; + const int step = channels; + ptr[step * 0] = (int16_t)((out_a_0 >> 0) & 0xffff); + ptr[step * 1] = (int16_t)((out_a_0 >> 16) & 0xffff); + ptr[step * 2] = (int16_t)((out_a_0 >> 32) & 0xffff); + ptr[step * 3] = (int16_t)((out_a_0 >> 48) & 0xffff); + ptr[step * 4] = (int16_t)((out_a_1 >> 0) & 0xffff); + ptr[step * 5] = (int16_t)((out_a_1 >> 16) & 0xffff); + ptr[step * 6] = (int16_t)((out_a_1 >> 32) & 0xffff); + ptr[step * 7] = (int16_t)((out_a_1 >> 48) & 0xffff); + ptr[step * 8] = (int16_t)((out_a_2 >> 0) & 0xffff); + ptr[step * 9] = (int16_t)((out_a_2 >> 16) & 0xffff); + ptr[step * 10] = (int16_t)((out_a_2 >> 32) & 0xffff); + ptr[step * 11] = (int16_t)((out_a_2 >> 48) & 0xffff); + ptr[step * 12] = (int16_t)((out_a_3 >> 0) & 0xffff); + ptr[step * 13] = (int16_t)((out_a_3 >> 16) & 0xffff); + ptr[step * 14] = (int16_t)((out_a_3 >> 32) & 0xffff); + ptr[step * 15] = (int16_t)((out_a_3 >> 48) & 0xffff); + } + } + + return blocked_size; +} + +static inline uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) { +#if defined(__aarch64__) + uint8x16x2_t table = { { a, b } }; + return vqtbl2_u8(table, idx); +#else + uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b), + vget_high_u8(b) } }; + return vtbl4_u8(table, idx); +#endif +} + +static inline uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) { +#if defined(__aarch64__) + uint8x16x2_t table = { { a, b } }; + return vqtbl2q_u8(table, idx); +#else + uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b), + vget_high_u8(b) } }; + return vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)), + vtbl4_u8(table, vget_high_u8(idx))); +#endif +} + +static int float2int24_zip_1channels(const float *src, uint8_t *int24_dst, + int nsamples) { + const int BLOCK_SIZE = 8; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + + int i; + + static uint8_t MAP01[] = {0, 1, 2, 4, 5, 6, 8, 9, + 10, 12, 13, 14, 16, 17, 18, 20}; + static uint8_t MAP2[] = {21, 22, 24, 25, 26, 28, 29, 30, + 0, 0, 0, 0, 0, 0, 0, 0}; + uint8x16_t map01 = vld1q_u8(MAP01); + uint8x16_t map2 = vld1q_u8(MAP2); + int32x4_t min24 = vdupq_n_s32(RANGE_MIN_24BIT); + int32x4_t max24 = vdupq_n_s32(RANGE_MAX_24BIT); + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + i), 
vld1q_f32(src + i + 4) } }; + + uint8x16x2_t u8_a_01; + u8_a_01.val[0] = cvt_clamp_for_int24_u8(min24, in_a_01.val[0], max24); + u8_a_01.val[1] = cvt_clamp_for_int24_u8(min24, in_a_01.val[1], max24); + + uint8x16_t out_01 = tbl2q(u8_a_01.val[0], u8_a_01.val[1], map01); + uint8x8_t out_2 = vget_low_u8(tbl2q(u8_a_01.val[0], u8_a_01.val[1], map2)); + + vst1q_u8(int24_dst + i * 3 + 0, out_01); + vst1_u8(int24_dst + i * 3 + 16, out_2); + } + + return blocked_size; +} + +static int float2int24_zip_2channels(const float *src, int next_channel, + uint8_t *int24_dst, int nsamples) { + const int BLOCK_SIZE = 4; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + + int i; + + static uint8_t MAP01[] = {0, 1, 2, 16, 17, 18, 4, 5, + 6, 20, 21, 22, 8, 9, 10, 24}; + static uint8_t MAP2[] = {25, 26, 12, 13, 14, 28, 29, 30, + 0, 0, 0, 0, 0, 0, 0, 0}; + uint8x16_t map01 = vld1q_u8(MAP01); + uint8x16_t map2 = vld1q_u8(MAP2); + int32x4_t min24 = vdupq_n_s32(RANGE_MIN_24BIT); + int32x4_t max24 = vdupq_n_s32(RANGE_MAX_24BIT); + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4_t in_a_0 = vld1q_f32(src + i); + float32x4_t in_b_0 = vld1q_f32(src + next_channel + i); + + uint8x16x2_t u8_ab_0; + u8_ab_0.val[0] = cvt_clamp_for_int24_u8(min24, in_a_0, max24); + u8_ab_0.val[1] = cvt_clamp_for_int24_u8(min24, in_b_0, max24); + + uint8x16_t out_firstpart_ab_0 = tbl2q(u8_ab_0.val[0], u8_ab_0.val[1], map01); + uint8x8_t out_secondpart_ab_0 = vget_low_u8(tbl2q(u8_ab_0.val[0], u8_ab_0.val[1], map2)); + + vst1q_u8(int24_dst + (i * 2 + 0) * 3 + 0 + 0, out_firstpart_ab_0); + vst1_u8(int24_dst + (i * 2 + 0) * 3 + 15 + 1, out_secondpart_ab_0); + } + + return blocked_size; +} + +static inline uint8x16_t tbl1q(uint8x16_t a, uint8x16_t idx) { +#if defined(__aarch64__) + return vqtbl1q_u8(a, idx); +#else + uint8x8x2_t table = { { vget_low_u8(a), vget_high_u8(a) } }; + uint8x8_t lo = vtbl2_u8(table, vget_low_u8(idx)); + uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx)); + return vcombine_u8(lo, hi); +#endif +} + +static int float2int24_zip_nchannels(const float *src, int next_channel, + int channels, uint8_t *int24_dst, + int nsamples) { + const int BATCH = 3; + const int BATCHED_BLOCK_SIZE = 8; + const int SINGLE_BLOCK_SIZE = 8; + const int bathed_channels = channels / BATCH * BATCH; + const int blocked_size = nsamples / BATCHED_BLOCK_SIZE * BATCHED_BLOCK_SIZE / + SINGLE_BLOCK_SIZE * SINGLE_BLOCK_SIZE; + + int i, c; + + static uint8_t MAP01[] = {0, 1, 2, 4, 5, 6, 8, 9, + 10, 12, 13, 14, 16, 17, 18, 20}; + uint8x16_t map01 = vld1q_u8(MAP01); + int32x4_t min24 = vdupq_n_s32(RANGE_MIN_24BIT); + int32x4_t max24 = vdupq_n_s32(RANGE_MAX_24BIT); + + for (c = 0; c < bathed_channels; c += BATCH) { + for (i = 0; i < blocked_size; i += BATCHED_BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + next_channel * (c + 0) + i), vld1q_f32(src + next_channel * (c + 0) + i + 4) } }; + float32x4x2_t in_b_01 = { { vld1q_f32(src + next_channel * (c + 1) + i), vld1q_f32(src + next_channel * (c + 1) + i + 4) } }; + float32x4x2_t in_c_01 = { { vld1q_f32(src + next_channel * (c + 2) + i), vld1q_f32(src + next_channel * (c + 2) + i + 4) } }; + + int32x4_t s32_a_0 = cvt_clamp_for_int24_s32(min24, in_a_01.val[0], max24); + int32x4_t s32_b_0 = cvt_clamp_for_int24_s32(min24, in_b_01.val[0], max24); + int32x4_t s32_c_0 = cvt_clamp_for_int24_s32(min24, in_c_01.val[0], max24); + + int32x4_t s32_a_1 = cvt_clamp_for_int24_s32(min24, in_a_01.val[1], max24); + int32x4_t s32_b_1 = cvt_clamp_for_int24_s32(min24, in_b_01.val[1], max24); + 
int32x4_t s32_c_1 = cvt_clamp_for_int24_s32(min24, in_c_01.val[1], max24); + int32x4_t zeros = vdupq_n_s32(0); + + int32x4x4_t transposed_abcd_0 = + transpose_s32_4x4(s32_a_0, s32_b_0, s32_c_0, zeros); + uint8x16_t out_full_0 = vreinterpretq_u8_s32(transposed_abcd_0.val[0]); + uint8x16_t out_full_1 = vreinterpretq_u8_s32(transposed_abcd_0.val[1]); + uint8x16_t out_full_2 = vreinterpretq_u8_s32(transposed_abcd_0.val[2]); + uint8x16_t out_full_3 = vreinterpretq_u8_s32(transposed_abcd_0.val[3]); + + int32x4x4_t transposed_abcd_1 = + transpose_s32_4x4(s32_a_1, s32_b_1, s32_c_1, zeros); + uint8x16_t out_full_4 = vreinterpretq_u8_s32(transposed_abcd_1.val[0]); + uint8x16_t out_full_5 = vreinterpretq_u8_s32(transposed_abcd_1.val[1]); + uint8x16_t out_full_6 = vreinterpretq_u8_s32(transposed_abcd_1.val[2]); + uint8x16_t out_full_7 = vreinterpretq_u8_s32(transposed_abcd_1.val[3]); + + const uint8_t out_secondpart_0 = vgetq_lane_u8(out_full_0, 10); + const uint8_t out_secondpart_1 = vgetq_lane_u8(out_full_1, 10); + const uint8_t out_secondpart_2 = vgetq_lane_u8(out_full_2, 10); + const uint8_t out_secondpart_3 = vgetq_lane_u8(out_full_3, 10); + const uint8_t out_secondpart_4 = vgetq_lane_u8(out_full_4, 10); + const uint8_t out_secondpart_5 = vgetq_lane_u8(out_full_5, 10); + const uint8_t out_secondpart_6 = vgetq_lane_u8(out_full_6, 10); + const uint8_t out_secondpart_7 = vgetq_lane_u8(out_full_7, 10); + + uint8x8_t out_firstpart_0 = vget_low_u8(tbl1q(out_full_0, map01)); + uint8x8_t out_firstpart_1 = vget_low_u8(tbl1q(out_full_1, map01)); + uint8x8_t out_firstpart_2 = vget_low_u8(tbl1q(out_full_2, map01)); + uint8x8_t out_firstpart_3 = vget_low_u8(tbl1q(out_full_3, map01)); + uint8x8_t out_firstpart_4 = vget_low_u8(tbl1q(out_full_4, map01)); + uint8x8_t out_firstpart_5 = vget_low_u8(tbl1q(out_full_5, map01)); + uint8x8_t out_firstpart_6 = vget_low_u8(tbl1q(out_full_6, map01)); + uint8x8_t out_firstpart_7 = vget_low_u8(tbl1q(out_full_7, map01)); + + uint8_t *ptr = int24_dst + ((i + 0) * channels + c) * 3; + const int step = channels * 3; + write_consecutive_int24(ptr + step * 0, out_firstpart_0, + out_secondpart_0); + write_consecutive_int24(ptr + step * 1, out_firstpart_1, + out_secondpart_1); + write_consecutive_int24(ptr + step * 2, out_firstpart_2, + out_secondpart_2); + write_consecutive_int24(ptr + step * 3, out_firstpart_3, + out_secondpart_3); + write_consecutive_int24(ptr + step * 4, out_firstpart_4, + out_secondpart_4); + write_consecutive_int24(ptr + step * 5, out_firstpart_5, + out_secondpart_5); + write_consecutive_int24(ptr + step * 6, out_firstpart_6, + out_secondpart_6); + write_consecutive_int24(ptr + step * 7, out_firstpart_7, + out_secondpart_7); + } + } + + for (c = bathed_channels; c < channels; ++c) { + for (i = 0; i < blocked_size; i += SINGLE_BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + next_channel * c + i), vld1q_f32(src + next_channel * c + i + 4) } }; + + uint8x16_t out_full_0 = vreinterpretq_u8_s32( + cvt_clamp_for_int24_s32(min24, in_a_01.val[0], max24)); + uint8x16_t out_full_1 = vreinterpretq_u8_s32( + cvt_clamp_for_int24_s32(min24, in_a_01.val[1], max24)); + + uint64_t out_low_0 = vgetq_lane_u64(vreinterpretq_u64_u8(out_full_0), 0); + uint64_t out_high_0 = vgetq_lane_u64(vreinterpretq_u64_u8(out_full_0), 1); + uint64_t out_low_1 = vgetq_lane_u64(vreinterpretq_u64_u8(out_full_1), 0); + uint64_t out_high_1 = vgetq_lane_u64(vreinterpretq_u64_u8(out_full_1), 1); + + uint8_t *ptr = int24_dst + ((i + 0) * channels + c) * 3; + const int step = channels * 3; + 
write_pair_int24(ptr + step * 0, step, out_low_0); + write_pair_int24(ptr + step * 2, step, out_high_0); + write_pair_int24(ptr + step * 4, step, out_low_1); + write_pair_int24(ptr + step * 6, step, out_high_1); + } + } + + return blocked_size; +} + +static int float2int32_zip_1channels(const float *src, int32_t *int32_dst, + int nsamples) { + const int BLOCK_SIZE = 32; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + + int i; + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + i + 0), vld1q_f32(src + i + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + i + 8), vld1q_f32(src + i + 12) } }; + float32x4x2_t in_a_45 = { { vld1q_f32(src + i + 16), vld1q_f32(src + i + 20) } }; + float32x4x2_t in_a_67 = { { vld1q_f32(src + i + 24), vld1q_f32(src + i + 28) } }; + + int32x4x2_t out_01 = cvt_for_int32_x2(in_a_01.val[0], in_a_01.val[1]); + int32x4x2_t out_23 = cvt_for_int32_x2(in_a_23.val[0], in_a_23.val[1]); + int32x4x2_t out_45 = cvt_for_int32_x2(in_a_45.val[0], in_a_45.val[1]); + int32x4x2_t out_67 = cvt_for_int32_x2(in_a_67.val[0], in_a_67.val[1]); + + vst1q_s32(int32_dst + i + 0, out_01.val[0]); + vst1q_s32(int32_dst + i + 4, out_01.val[1]); + vst1q_s32(int32_dst + i + 8, out_23.val[0]); + vst1q_s32(int32_dst + i + 12, out_23.val[1]); + vst1q_s32(int32_dst + i + 16, out_45.val[0]); + vst1q_s32(int32_dst + i + 20, out_45.val[1]); + vst1q_s32(int32_dst + i + 24, out_67.val[0]); + vst1q_s32(int32_dst + i + 28, out_67.val[1]); + } + + return blocked_size; +} + +static int float2int32_zip_2channels(const float *src, int next_channel, + int32_t *int32_dst, int nsamples) { + const int BLOCK_SIZE = 16; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + + int i; + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + i + 0), vld1q_f32(src + i + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + i + 8), vld1q_f32(src + i + 12) } }; + float32x4x2_t in_b_01 = { { vld1q_f32(src + next_channel + i + 0), vld1q_f32(src + next_channel + i + 4) } }; + float32x4x2_t in_b_23 = { { vld1q_f32(src + next_channel + i + 8), vld1q_f32(src + next_channel + i + 12) } }; + + int32x4x2_t out_ab_0 = cvt_for_int32_x2(in_a_01.val[0], in_b_01.val[0]); + int32x4x2_t out_ab_1 = cvt_for_int32_x2(in_a_01.val[1], in_b_01.val[1]); + int32x4x2_t out_ab_2 = cvt_for_int32_x2(in_a_23.val[0], in_b_23.val[0]); + int32x4x2_t out_ab_3 = cvt_for_int32_x2(in_a_23.val[1], in_b_23.val[1]); + + vst2q_s32(int32_dst + i * 2 + 0, out_ab_0); + vst2q_s32(int32_dst + i * 2 + 8, out_ab_1); + vst2q_s32(int32_dst + i * 2 + 16, out_ab_2); + vst2q_s32(int32_dst + i * 2 + 24, out_ab_3); + } + + return blocked_size; +} + +static int float2int32_zip_3channels(const float *src, int next_channel, + int32_t *int32_dst, int nsamples) { + const int BLOCK_SIZE = 16; + const int blocked_size = nsamples / BLOCK_SIZE * BLOCK_SIZE; + + int i; + + for (i = 0; i < blocked_size; i += BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + i + 0), vld1q_f32(src + i + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + i + 8), vld1q_f32(src + i + 12) } }; + float32x4x2_t in_b_01 = { { vld1q_f32(src + next_channel + i + 0), vld1q_f32(src + next_channel + i + 4) } }; + float32x4x2_t in_b_23 = { { vld1q_f32(src + next_channel + i + 8), vld1q_f32(src + next_channel + i + 12) } }; + float32x4x2_t in_c_01 = { { vld1q_f32(src + next_channel * 2 + i + 0), vld1q_f32(src + next_channel * 2 + i + 4) } }; + float32x4x2_t in_c_23 = { { vld1q_f32(src + 
next_channel * 2 + i + 8), vld1q_f32(src + next_channel * 2 + i + 12) } }; + + int32x4x3_t out_abc_0 = + cvt_for_int32_x3(in_a_01.val[0], in_b_01.val[0], in_c_01.val[0]); + int32x4x3_t out_abc_1 = + cvt_for_int32_x3(in_a_01.val[1], in_b_01.val[1], in_c_01.val[1]); + int32x4x3_t out_abc_2 = + cvt_for_int32_x3(in_a_23.val[0], in_b_23.val[0], in_c_23.val[0]); + int32x4x3_t out_abc_3 = + cvt_for_int32_x3(in_a_23.val[1], in_b_23.val[1], in_c_23.val[1]); + + vst3q_s32(int32_dst + i * 3 + 0, out_abc_0); + vst3q_s32(int32_dst + i * 3 + 12, out_abc_1); + vst3q_s32(int32_dst + i * 3 + 24, out_abc_2); + vst3q_s32(int32_dst + i * 3 + 36, out_abc_3); + } + + return blocked_size; +} + +static int float2int32_zip_nchannels(const float *src, int next_channel, + int channels, int32_t *int32_dst, + int nsamples) { + const int BATCH = 4; + const int BATCHED_BLOCK_SIZE = 8; + const int SINGLE_BLOCK_SIZE = 16; + const int bathed_channels = channels / BATCH * BATCH; + const int blocked_size = nsamples / BATCHED_BLOCK_SIZE * BATCHED_BLOCK_SIZE / + SINGLE_BLOCK_SIZE * SINGLE_BLOCK_SIZE; + + int i, c; + + for (c = 0; c < bathed_channels; c += BATCH) { + for (i = 0; i < blocked_size; i += BATCHED_BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + next_channel * (c + 0) + i), vld1q_f32(src + next_channel * (c + 0) + i + 4) } }; + float32x4x2_t in_b_01 = { { vld1q_f32(src + next_channel * (c + 1) + i), vld1q_f32(src + next_channel * (c + 1) + i + 4) } }; + float32x4x2_t in_c_01 = { { vld1q_f32(src + next_channel * (c + 2) + i), vld1q_f32(src + next_channel * (c + 2) + i + 4) } }; + float32x4x2_t in_d_01 = { { vld1q_f32(src + next_channel * (c + 3) + i), vld1q_f32(src + next_channel * (c + 3) + i + 4) } }; + + int32x4x2_t s32_a_01 = cvt_for_int32_x2(in_a_01.val[0], in_a_01.val[1]); + int32x4x2_t s32_b_01 = cvt_for_int32_x2(in_b_01.val[0], in_b_01.val[1]); + int32x4x2_t s32_c_01 = cvt_for_int32_x2(in_c_01.val[0], in_c_01.val[1]); + int32x4x2_t s32_d_01 = cvt_for_int32_x2(in_d_01.val[0], in_d_01.val[1]); + + int32x4x4_t transposed_0 = transpose_s32_4x4( + s32_a_01.val[0], s32_b_01.val[0], s32_c_01.val[0], s32_d_01.val[0]); + int32x4x4_t transposed_1 = transpose_s32_4x4( + s32_a_01.val[1], s32_b_01.val[1], s32_c_01.val[1], s32_d_01.val[1]); + + int32_t *ptr = int32_dst + i * channels + c; + const int step = channels; + vst1q_s32(ptr + step * 0, transposed_0.val[0]); + vst1q_s32(ptr + step * 1, transposed_0.val[1]); + vst1q_s32(ptr + step * 2, transposed_0.val[2]); + vst1q_s32(ptr + step * 3, transposed_0.val[3]); + vst1q_s32(ptr + step * 4, transposed_1.val[0]); + vst1q_s32(ptr + step * 5, transposed_1.val[1]); + vst1q_s32(ptr + step * 6, transposed_1.val[2]); + vst1q_s32(ptr + step * 7, transposed_1.val[3]); + } + } + + for (c = bathed_channels; c < channels; ++c) { + for (i = 0; i < blocked_size; i += SINGLE_BLOCK_SIZE) { + float32x4x2_t in_a_01 = { { vld1q_f32(src + next_channel * c + i + 0), vld1q_f32(src + next_channel * c + i + 4) } }; + float32x4x2_t in_a_23 = { { vld1q_f32(src + next_channel * c + i + 8), vld1q_f32(src + next_channel * c + i + 12) } }; + + int32x4_t out_scattered_a_0 = cvt_for_int32(in_a_01.val[0]); + int32x4_t out_scattered_a_1 = cvt_for_int32(in_a_01.val[1]); + int32x4_t out_scattered_a_2 = cvt_for_int32(in_a_23.val[0]); + int32x4_t out_scattered_a_3 = cvt_for_int32(in_a_23.val[1]); + + int64_t out_low_a_0 = + vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_0), 0); + int64_t out_high_a_0 = + vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_0), 1); + int64_t out_low_a_1 = + 
vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_1), 0); + int64_t out_high_a_1 = + vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_1), 1); + int64_t out_low_a_2 = + vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_2), 0); + int64_t out_high_a_2 = + vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_2), 1); + int64_t out_low_a_3 = + vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_3), 0); + int64_t out_high_a_3 = + vgetq_lane_u64(vreinterpretq_u64_s32(out_scattered_a_3), 1); + + int32_t *ptr = int32_dst + i * channels + c; + const int step = channels; + ptr[step * 0] = (int32_t)((out_low_a_0 >> 0) & 0xffffffff); + ptr[step * 1] = (int32_t)((out_low_a_0 >> 32) & 0xffffffff); + ptr[step * 2] = (int32_t)((out_high_a_0 >> 0) & 0xffffffff); + ptr[step * 3] = (int32_t)((out_high_a_0 >> 32) & 0xffffffff); + ptr[step * 4] = (int32_t)((out_low_a_1 >> 0) & 0xffffffff); + ptr[step * 5] = (int32_t)((out_low_a_1 >> 32) & 0xffffffff); + ptr[step * 6] = (int32_t)((out_high_a_1 >> 0) & 0xffffffff); + ptr[step * 7] = (int32_t)((out_high_a_1 >> 32) & 0xffffffff); + ptr[step * 8] = (int32_t)((out_low_a_2 >> 0) & 0xffffffff); + ptr[step * 9] = (int32_t)((out_low_a_2 >> 32) & 0xffffffff); + ptr[step * 10] = (int32_t)((out_high_a_2 >> 0) & 0xffffffff); + ptr[step * 11] = (int32_t)((out_high_a_2 >> 32) & 0xffffffff); + ptr[step * 12] = (int32_t)((out_low_a_3 >> 0) & 0xffffffff); + ptr[step * 13] = (int32_t)((out_low_a_3 >> 32) & 0xffffffff); + ptr[step * 14] = (int32_t)((out_high_a_3 >> 0) & 0xffffffff); + ptr[step * 15] = (int32_t)((out_high_a_3 >> 32) & 0xffffffff); + } + } + + return blocked_size; +} + +void float2int16_zip_channels_neon(const float *src, int next_channel, + int channels, int16_t *int16_dst, + int nsamples) { + int processed = 0; + + switch (channels) { + case 1: + processed = float2int16_zip_1channels(src, int16_dst, nsamples); + break; + case 2: + processed = + float2int16_zip_2channels(src, next_channel, int16_dst, nsamples); + break; + case 3: + processed = + float2int16_zip_3channels(src, next_channel, int16_dst, nsamples); + break; + default: + processed = float2int16_zip_nchannels(src, next_channel, channels, + int16_dst, nsamples); + break; + } + + // Let C version handle the residuals + float2int16_zip_channels_c(src + processed, next_channel, channels, + int16_dst + processed * channels, + nsamples - processed); +} + +void float2int24_zip_channels_neon(const float *src, int next_channel, + int channels, uint8_t *int24_dst, + int nsamples) { + int processed = 0; + + switch (channels) { + case 1: + processed = float2int24_zip_1channels(src, int24_dst, nsamples); + break; + case 2: + processed = + float2int24_zip_2channels(src, next_channel, int24_dst, nsamples); + break; + default: + processed = float2int24_zip_nchannels(src, next_channel, channels, + int24_dst, nsamples); + break; + } + + // Let C version handle the residuals + float2int24_zip_channels_c(src + processed, next_channel, channels, + int24_dst + processed * channels * 3, + nsamples - processed); +} + +void float2int32_zip_channels_neon(const float *src, int next_channel, + int channels, int32_t *int32_dst, + int nsamples) { + int processed = 0; + + switch (channels) { + case 1: + processed = float2int32_zip_1channels(src, int32_dst, nsamples); + break; + case 2: + processed = + float2int32_zip_2channels(src, next_channel, int32_dst, nsamples); + break; + case 3: + processed = + float2int32_zip_3channels(src, next_channel, int32_dst, nsamples); + break; + default: + processed = 
float2int32_zip_nchannels(src, next_channel, channels, + int32_dst, nsamples); + break; + } + + // Let C version handle the residuals + float2int32_zip_channels_c(src + processed, next_channel, channels, + int32_dst + processed * channels, + nsamples - processed); +} + +#endif /* IAMF_ARCH_DETECTED_ARM */ diff --git a/code/src/iamf_dec/arch/arm/arm_zip_channels.h b/code/src/iamf_dec/arch/arm/arm_zip_channels.h new file mode 100644 index 00000000..b8ba7b27 --- /dev/null +++ b/code/src/iamf_dec/arch/arm/arm_zip_channels.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at www.aomedia.org/license/software-license/bsd-3-c-c. If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * www.aomedia.org/license/patent. + */ + +/** + * @file arm_zip_channels.h + * @brief Arm implementation for zipping channels. + * @version 0.1 + * @date Created 10/24/2024 + **/ + +#ifndef ARM_ZIP_CHANNELS_H_ +#define ARM_ZIP_CHANNELS_H_ + +#include "detect_arm.h" + +#if defined(IAMF_ARCH_DETECTED_ARM) + +#include + +void float2int16_zip_channels_neon(const float *src, int next_channel, + int channels, int16_t *int16_dst, + int nsamples); + +void float2int24_zip_channels_neon(const float *src, int next_channel, + int channels, uint8_t *int24_dst, + int nsamples); + +void float2int32_zip_channels_neon(const float *src, int next_channel, + int channels, int32_t *int32_dst, + int nsamples); + +#endif /* IAMF_ARCH_DETECTED_ARM */ +#endif /* ARM_ZIP_CHANNELS_H_ */ diff --git a/code/src/iamf_dec/arch/arm/override_arm.c b/code/src/iamf_dec/arch/arm/override_arm.c index 15621f4c..d4f16dc1 100644 --- a/code/src/iamf_dec/arch/arm/override_arm.c +++ b/code/src/iamf_dec/arch/arm/override_arm.c @@ -25,12 +25,17 @@ #include "../../arch.h" #include "arm_multiply_channels.h" +#include "arm_zip_channels.h" void arch_override(Arch *arch) { // Override functions with Arm implementations here arch->rendering.multiply_channels_by_matrix = &multiply_channels_by_matrix_neon; + + arch->output.float2int16_zip_channels = &float2int16_zip_channels_neon; + arch->output.float2int24_zip_channels = &float2int24_zip_channels_neon; + arch->output.float2int32_zip_channels = &float2int32_zip_channels_neon; } #endif diff --git a/code/win64/VS2022/iamf/iamf.vcxproj b/code/win64/VS2022/iamf/iamf.vcxproj index 661addae..ad9a4fb1 100755 --- a/code/win64/VS2022/iamf/iamf.vcxproj +++ b/code/win64/VS2022/iamf/iamf.vcxproj @@ -136,6 +136,7 @@ + @@ -176,6 +177,7 @@ + diff --git a/code/win64/VS2022/iamf/iamf.vcxproj.filters b/code/win64/VS2022/iamf/iamf.vcxproj.filters index 1a838eba..d6ecb6c1 100755 --- a/code/win64/VS2022/iamf/iamf.vcxproj.filters +++ b/code/win64/VS2022/iamf/iamf.vcxproj.filters @@ -81,6 +81,9 @@ Source Files + + Source Files + Source Files @@ -146,6 +149,9 @@ Header Files + + Header Files + Header Files
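
Reviewer note, not part of the patch: below is a minimal sketch of how the new zip-channels paths could be cross-checked against each other. Everything in it is an assumption for illustration — the include paths, the 6-channel/960-sample buffer shape and the main() harness are made up, and it presumes the patched decoder sources are compiled into the same target. It exercises only the 16-bit path; the 24- and 32-bit paths follow the same pattern with float2int24_zip_channels_* and float2int32_zip_channels_*.

/*
 * Reviewer-side consistency check (not part of the patch). Assumes the
 * patched decoder sources are built into the same binary and that the
 * include paths below resolve to code/src/iamf_dec/.
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "arch/arch_common.h"
#include "arch/arm/arm_zip_channels.h" /* self-guarded by detect_arm.h */

int main(void) {
  const int channels = 6;            /* illustrative 5.1 output layout */
  const int nsamples = 960;          /* illustrative frame size */
  const int next_channel = nsamples; /* stride between planar channel buffers */

  float *src = malloc(sizeof(float) * channels * nsamples);
  int16_t *ref = malloc(sizeof(int16_t) * channels * nsamples);
  int16_t *opt = malloc(sizeof(int16_t) * channels * nsamples);
  if (!src || !ref || !opt) return 1;

  /* Planar input: channel c occupies src[c * next_channel + 0 .. nsamples). */
  for (int c = 0; c < channels; ++c)
    for (int i = 0; i < nsamples; ++i)
      src[c * next_channel + i] = 0.9f * sinf(0.01f * (float)(i + 7 * c));

  /* C reference path added by this patch. */
  float2int16_zip_channels_c(src, next_channel, channels, ref, nsamples);

#if defined(IAMF_ARCH_DETECTED_ARM)
  /* Neon override. lrintf in the C path rounds ties to even while the Neon
   * conversion rounds ties away from zero, so exact .5 inputs may differ by
   * one LSB; allow a +/-1 tolerance. */
  float2int16_zip_channels_neon(src, next_channel, channels, opt, nsamples);
  int max_diff = 0;
  for (int n = 0; n < channels * nsamples; ++n) {
    int d = abs((int)ref[n] - (int)opt[n]);
    if (d > max_diff) max_diff = d;
  }
  printf("max |C - Neon| difference: %d (%s)\n", max_diff,
         max_diff <= 1 ? "ok" : "unexpected");
#else
  printf("Neon path not built for this target\n");
#endif

  free(src);
  free(ref);
  free(opt);
  return 0;
}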