1313#include <xtensa/tie/xt_hifi3.h>
1414
1515#define ONE_OVER_SQRT2_Q30 759250112 /* Q_CONVERT_FLOAT(0.70710678118654752f, 30); 1/sqrt(2) */
16- #define LOG10_FUNC_A5_Q26 75959200 /* Q_CONVERT_FLOAT(1.131880283355712890625f, 26) */
16+ #define LOG10_FUNC_A5_Q26 75959200 /* Q_CONVERT_FLOAT(1.131880283355712890625f, 26) */
1717#define LOG10_FUNC_A4_Q26 -285795039 /* Q_CONVERT_FLOAT(-4.258677959442138671875f, 26) */
1818#define LOG10_FUNC_A3_Q26 457435200 /* Q_CONVERT_FLOAT(6.81631565093994140625f, 26) */
1919#define LOG10_FUNC_A2_Q26 -410610303 /* Q_CONVERT_FLOAT(-6.1185703277587890625f, 26) */
3939#define INV_FUNC_A2_Q25 1126492160 /* Q_CONVERT_FLOAT(33.57208251953125f, 25) */
4040#define INV_FUNC_A1_Q25 -713042175 /* Q_CONVERT_FLOAT(-21.25031280517578125f, 25) */
4141#define INV_FUNC_A0_Q25 239989712 /* Q_CONVERT_FLOAT(7.152250766754150390625f, 25) */
42+ #define INV_FUNC_ONE_Q30 1073741824 /* Q_CONVERT_FLOAT(1, 30) */
4243#define SHIFT_IDX_QX30_QY30_QZ30 1 /* drc_get_lshift(30, 30, 30) */
4344#define SHIFT_IDX_QX26_QY30_QZ26 1 /* drc_get_lshift(26, 30, 26) */
4445#define SHIFT_IDX_QX25_QY30_QZ25 1 /* drc_get_lshift(25, 30, 25) */
4546#define SHIFT_IDX_QX29_QY26_QZ26 2 /* drc_get_lshift(29, 26, 26) */
4647#define SHIFT_IDX_QX25_QY26_QZ26 6 /* drc_get_lshift(25, 26, 26) */
4748#define DRC_TWENTY_Q26 1342177280 /* Q_CONVERT_FLOAT(20, 26) */
4849
50+ #define DRC_PACK_INT32X2_TO_UINT64 (a , b ) (((uint64_t)((int64_t)(a) & 0xffffffff) << 32) | \
51+ ((uint64_t)((int64_t)(b) & 0xffffffff)))
52+
53+ static const uint64_t drc_inv_func_coefficients [] = {
54+ DRC_PACK_INT32X2_TO_UINT64 (INV_FUNC_A2_Q25 , INV_FUNC_A5_Q25 ),
55+ DRC_PACK_INT32X2_TO_UINT64 (INV_FUNC_A1_Q25 , INV_FUNC_A4_Q25 ),
56+ DRC_PACK_INT32X2_TO_UINT64 (INV_FUNC_A0_Q25 , INV_FUNC_A3_Q25 )
57+ };
58+
4959/*
5060 * Input is Q6.26: max 32.0
5161 * Output range ~ (-inf, 1.505); regulated to Q6.26: (-32.0, 32.0)
@@ -220,17 +230,28 @@ inline int32_t drc_inv_fixed(int32_t x, int32_t precision_x, int32_t precision_y
220230 * fpminimax(1/x, 5, [|SG...|], [sqrt(2)/2;1], absolute);
221231 * max err ~= 1.00388e-6
222232 */
223- ae_f32 in ;
224- int32_t in_abs ;
225- int32_t bit = 31 - AE_NSAZ32_L (x );
226- int32_t e = bit - precision_x ;
227- int32_t precision_inv ;
228- int32_t sqrt2_extracted = 0 ;
229- ae_f32 acc ;
233+ __aligned (8 ) ae_f32 coef [2 ];
230234 ae_f64 tmp ;
235+ ae_f32x2 in ;
236+ ae_f32x2 p1p0 ;
237+ ae_f32 acc ;
238+ ae_f32x2 * coefp ;
239+ int32_t in_abs ;
240+ int shift_input ;
241+ int shift_output ;
242+ int sqrt2_extracted = 0 ;
231243
232- /*Output range [0.5, 1); regulated to Q2.30 */
233- in = AE_SRAA32 (x , bit - 30 );
244+ /* Output range [0.5, 1); regulated to Q2.30
245+ *
246+ * Note: input and output shift code was originally as more self-documenting
247+ * bit = 31 - AE_NSAZ32_L(x); input_shift = bit - 30;
248+ * e = bit - precision_x; precision_inv = e + 25;
249+ * output_shift = precision_y - precision_inv;
250+ */
251+
252+ shift_input = 1 - AE_NSAZ32_L (x );
253+ shift_output = precision_y + precision_x - shift_input - 55 ;
254+ in = AE_SRAA32 (x , shift_input );
234255 in_abs = AE_MOVAD32_L (AE_ABS32S (in ));
235256
236257 if (in_abs < ONE_OVER_SQRT2_Q30 ) {
@@ -240,35 +261,46 @@ inline int32_t drc_inv_fixed(int32_t x, int32_t precision_x, int32_t precision_y
240261 sqrt2_extracted = 1 ;
241262 }
242263
243- tmp = AE_MULF32R_LL (in , INV_FUNC_A5_Q25 );
244- tmp = AE_SLAI64S (tmp , SHIFT_IDX_QX25_QY30_QZ25 );
245- acc = AE_ROUND32F48SSYM (tmp );
246- acc = AE_ADD32 (acc , INV_FUNC_A4_Q25 );
247- tmp = AE_MULF32R_LL (in , acc );
248- tmp = AE_SLAI64S (tmp , SHIFT_IDX_QX25_QY30_QZ25 );
249- acc = AE_ROUND32F48SSYM (tmp );
250- acc = AE_ADD32 (acc , INV_FUNC_A3_Q25 );
251- tmp = AE_MULF32R_LL (in , acc );
252- tmp = AE_SLAI64S (tmp , SHIFT_IDX_QX25_QY30_QZ25 );
253- acc = AE_ROUND32F48SSYM (tmp );
254- acc = AE_ADD32 (acc , INV_FUNC_A2_Q25 );
255- tmp = AE_MULF32R_LL (in , acc );
256- tmp = AE_SLAI64S (tmp , SHIFT_IDX_QX25_QY30_QZ25 );
257- acc = AE_ROUND32F48SSYM (tmp );
258- acc = AE_ADD32 (acc , INV_FUNC_A1_Q25 );
259- tmp = AE_MULF32R_LL (in , acc );
260- tmp = AE_SLAI64S (tmp , SHIFT_IDX_QX25_QY30_QZ25 );
261- acc = AE_ROUND32F48SSYM (tmp );
262- acc = AE_ADD32 (acc , INV_FUNC_A0_Q25 );
264+ /* calculate p00(x) = a1 + a2 * x, p11(x) = a4 + a5 * x
265+ *
266+ * In Q25 coef * Q30 in 32 bit AE multiply the result is Q24 (25 + 30 + 1 - 32),
267+ * so every multiply is shifted left by one to keep results as Q25.
268+ */
269+ coefp = (ae_f32x2 * )drc_inv_func_coefficients ;
270+ p1p0 = AE_SLAI32S (AE_MULFP32X2RS (* coefp , in ), SHIFT_IDX_QX25_QY30_QZ25 );
271+ coefp ++ ;
272+ p1p0 = AE_ADD32S (p1p0 , * coefp );
273+
274+ /* calculate p0(x) = a0 + p00(x) * x, p1(x) = a3 + p11(x) * x */
275+ p1p0 = AE_SLAI32S (AE_MULFP32X2RS (p1p0 , in ), SHIFT_IDX_QX25_QY30_QZ25 );
276+ coefp ++ ;
277+ p1p0 = AE_ADD32S (p1p0 , * coefp );
278+
279+ /* calculate p1(x) * x^3
280+ *
281+ * Q30 * Q30 AE multiply gives Q61. shifting it low 32 bits as Q30 would
282+ * need right shift by 31. For Q17,47 AE round instead it is shifted 16 bits
283+ * less, so shift right by 15.
284+ */
285+ acc = AE_ROUND32F48SASYM (AE_SRAA64 (AE_MULF32S_HH (in , in ), 15 ));
286+ acc = AE_ROUND32F48SASYM (AE_SRAA64 (AE_MULF32S_HH (acc , in ), 15 ));
287+ coef [1 ] = INV_FUNC_ONE_Q30 ;
288+ coef [0 ] = acc ;
289+ p1p0 = AE_SLAI32S (AE_MULFP32X2RS (p1p0 , * (ae_int32x2 * )coef ), SHIFT_IDX_QX25_QY30_QZ25 );
290+
291+ /* calculate p(x) = p0(x) + p1(x) * x^3
292+ * p1p0.l holds p0(x) after multiply with one
293+ * p1p0.h holds p1(x) * x^3
294+ */
295+ acc = AE_MOVAD32_L (AE_ADD32_HL_LH (p1p0 , p1p0 ));
263296
264297 if (sqrt2_extracted ) {
265298 tmp = AE_MULF32R_LL (SQRT2_Q30 , acc );
266299 tmp = AE_SLAI64S (tmp , SHIFT_IDX_QX25_QY30_QZ25 );
267300 acc = AE_ROUND32F48SSYM (tmp );
268301 }
269302
270- precision_inv = e + 25 ;
271- acc = AE_SLAA32S (acc , precision_y - precision_inv );
303+ acc = AE_SLAA32S (acc , shift_output );
272304 return AE_MOVAD32_L (acc );
273305}
274306
0 commit comments