Skip to content

Commit c46e4e3

Browse files
committed
Audio: DRC: Calculate drc_inv_fixed() with dual multiply
This change improves calculation speed. The implementation is bit exact with previous but implemented with 2-way SIMD multiply. The Horner polynomial evaluation is changed to parallel Horner version for two multipliers. The input and output shifting code is also simplified. The code changes save 1.0 MCPS in sof-testbench4 simulated MTL platform. Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
1 parent 6117ceb commit c46e4e3

File tree

1 file changed

+64
-32
lines changed

1 file changed

+64
-32
lines changed

src/audio/drc/drc_math_hifi3.c

Lines changed: 64 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
#include <xtensa/tie/xt_hifi3.h>
1414

1515
#define ONE_OVER_SQRT2_Q30 759250112 /* Q_CONVERT_FLOAT(0.70710678118654752f, 30); 1/sqrt(2) */
16-
#define LOG10_FUNC_A5_Q26 75959200 /* Q_CONVERT_FLOAT(1.131880283355712890625f, 26) */
16+
#define LOG10_FUNC_A5_Q26 75959200 /* Q_CONVERT_FLOAT(1.131880283355712890625f, 26) */
1717
#define LOG10_FUNC_A4_Q26 -285795039 /* Q_CONVERT_FLOAT(-4.258677959442138671875f, 26) */
1818
#define LOG10_FUNC_A3_Q26 457435200 /* Q_CONVERT_FLOAT(6.81631565093994140625f, 26) */
1919
#define LOG10_FUNC_A2_Q26 -410610303 /* Q_CONVERT_FLOAT(-6.1185703277587890625f, 26) */
@@ -39,13 +39,23 @@
3939
#define INV_FUNC_A2_Q25 1126492160 /* Q_CONVERT_FLOAT(33.57208251953125f, 25) */
4040
#define INV_FUNC_A1_Q25 -713042175 /* Q_CONVERT_FLOAT(-21.25031280517578125f, 25) */
4141
#define INV_FUNC_A0_Q25 239989712 /* Q_CONVERT_FLOAT(7.152250766754150390625f, 25) */
42+
#define INV_FUNC_ONE_Q30 1073741824 /* Q_CONVERT_FLOAT(1, 30) */
4243
#define SHIFT_IDX_QX30_QY30_QZ30 1 /* drc_get_lshift(30, 30, 30) */
4344
#define SHIFT_IDX_QX26_QY30_QZ26 1 /* drc_get_lshift(26, 30, 26) */
4445
#define SHIFT_IDX_QX25_QY30_QZ25 1 /* drc_get_lshift(25, 30, 25) */
4546
#define SHIFT_IDX_QX29_QY26_QZ26 2 /* drc_get_lshift(29, 26, 26) */
4647
#define SHIFT_IDX_QX25_QY26_QZ26 6 /* drc_get_lshift(25, 26, 26) */
4748
#define DRC_TWENTY_Q26 1342177280 /* Q_CONVERT_FLOAT(20, 26) */
4849

50+
#define DRC_PACK_INT32X2_TO_UINT64(a, b) (((uint64_t)((int64_t)(a) & 0xffffffff) << 32) | \
51+
((uint64_t)((int64_t)(b) & 0xffffffff)))
52+
53+
static const uint64_t drc_inv_func_coefficients[] = {
54+
DRC_PACK_INT32X2_TO_UINT64(INV_FUNC_A2_Q25, INV_FUNC_A5_Q25),
55+
DRC_PACK_INT32X2_TO_UINT64(INV_FUNC_A1_Q25, INV_FUNC_A4_Q25),
56+
DRC_PACK_INT32X2_TO_UINT64(INV_FUNC_A0_Q25, INV_FUNC_A3_Q25)
57+
};
58+
4959
/*
5060
* Input is Q6.26: max 32.0
5161
* Output range ~ (-inf, 1.505); regulated to Q6.26: (-32.0, 32.0)
@@ -220,17 +230,28 @@ inline int32_t drc_inv_fixed(int32_t x, int32_t precision_x, int32_t precision_y
220230
* fpminimax(1/x, 5, [|SG...|], [sqrt(2)/2;1], absolute);
221231
* max err ~= 1.00388e-6
222232
*/
223-
ae_f32 in;
224-
int32_t in_abs;
225-
int32_t bit = 31 - AE_NSAZ32_L(x);
226-
int32_t e = bit - precision_x;
227-
int32_t precision_inv;
228-
int32_t sqrt2_extracted = 0;
229-
ae_f32 acc;
233+
__aligned(8) ae_f32 coef[2];
230234
ae_f64 tmp;
235+
ae_f32x2 in;
236+
ae_f32x2 p1p0;
237+
ae_f32 acc;
238+
ae_f32x2 *coefp;
239+
int32_t in_abs;
240+
int shift_input;
241+
int shift_output;
242+
int sqrt2_extracted = 0;
231243

232-
/*Output range [0.5, 1); regulated to Q2.30 */
233-
in = AE_SRAA32(x, bit - 30);
244+
/* Output range [0.5, 1); regulated to Q2.30
245+
*
246+
* Note: input and output shift code was originally as more self-documenting
247+
* bit = 31 - AE_NSAZ32_L(x); input_shift = bit - 30;
248+
* e = bit - precision_x; precision_inv = e + 25;
249+
* output_shift = precision_y - precision_inv;
250+
*/
251+
252+
shift_input = 1 - AE_NSAZ32_L(x);
253+
shift_output = precision_y + precision_x - shift_input - 55;
254+
in = AE_SRAA32(x, shift_input);
234255
in_abs = AE_MOVAD32_L(AE_ABS32S(in));
235256

236257
if (in_abs < ONE_OVER_SQRT2_Q30) {
@@ -240,35 +261,46 @@ inline int32_t drc_inv_fixed(int32_t x, int32_t precision_x, int32_t precision_y
240261
sqrt2_extracted = 1;
241262
}
242263

243-
tmp = AE_MULF32R_LL(in, INV_FUNC_A5_Q25);
244-
tmp = AE_SLAI64S(tmp, SHIFT_IDX_QX25_QY30_QZ25);
245-
acc = AE_ROUND32F48SSYM(tmp);
246-
acc = AE_ADD32(acc, INV_FUNC_A4_Q25);
247-
tmp = AE_MULF32R_LL(in, acc);
248-
tmp = AE_SLAI64S(tmp, SHIFT_IDX_QX25_QY30_QZ25);
249-
acc = AE_ROUND32F48SSYM(tmp);
250-
acc = AE_ADD32(acc, INV_FUNC_A3_Q25);
251-
tmp = AE_MULF32R_LL(in, acc);
252-
tmp = AE_SLAI64S(tmp, SHIFT_IDX_QX25_QY30_QZ25);
253-
acc = AE_ROUND32F48SSYM(tmp);
254-
acc = AE_ADD32(acc, INV_FUNC_A2_Q25);
255-
tmp = AE_MULF32R_LL(in, acc);
256-
tmp = AE_SLAI64S(tmp, SHIFT_IDX_QX25_QY30_QZ25);
257-
acc = AE_ROUND32F48SSYM(tmp);
258-
acc = AE_ADD32(acc, INV_FUNC_A1_Q25);
259-
tmp = AE_MULF32R_LL(in, acc);
260-
tmp = AE_SLAI64S(tmp, SHIFT_IDX_QX25_QY30_QZ25);
261-
acc = AE_ROUND32F48SSYM(tmp);
262-
acc = AE_ADD32(acc, INV_FUNC_A0_Q25);
264+
/* calculate p00(x) = a1 + a2 * x, p11(x) = a4 + a5 * x
265+
*
266+
* In Q25 coef * Q30 in 32 bit AE multiply the result is Q24 (25 + 30 + 1 - 32),
267+
* so every multiply is shifted left by one to keep results as Q25.
268+
*/
269+
coefp = (ae_f32x2 *)drc_inv_func_coefficients;
270+
p1p0 = AE_SLAI32S(AE_MULFP32X2RS(*coefp, in), SHIFT_IDX_QX25_QY30_QZ25);
271+
coefp++;
272+
p1p0 = AE_ADD32S(p1p0, *coefp);
273+
274+
/* calculate p0(x) = a0 + p00(x) * x, p1(x) = a3 + p11(x) * x */
275+
p1p0 = AE_SLAI32S(AE_MULFP32X2RS(p1p0, in), SHIFT_IDX_QX25_QY30_QZ25);
276+
coefp++;
277+
p1p0 = AE_ADD32S(p1p0, *coefp);
278+
279+
/* calculate p1(x) * x^3
280+
*
281+
* Q30 * Q30 AE multiply gives Q61. shifting it low 32 bits as Q30 would
282+
* need right shift by 31. For Q17,47 AE round instead it is shifted 16 bits
283+
* less, so shift right by 15.
284+
*/
285+
acc = AE_ROUND32F48SASYM(AE_SRAA64(AE_MULF32S_HH(in, in), 15));
286+
acc = AE_ROUND32F48SASYM(AE_SRAA64(AE_MULF32S_HH(acc, in), 15));
287+
coef[1] = INV_FUNC_ONE_Q30;
288+
coef[0] = acc;
289+
p1p0 = AE_SLAI32S(AE_MULFP32X2RS(p1p0, *(ae_int32x2 *)coef), SHIFT_IDX_QX25_QY30_QZ25);
290+
291+
/* calculate p(x) = p0(x) + p1(x) * x^3
292+
* p1p0.l holds p0(x) after multiply with one
293+
* p1p0.h holds p1(x) * x^3
294+
*/
295+
acc = AE_MOVAD32_L(AE_ADD32_HL_LH(p1p0, p1p0));
263296

264297
if (sqrt2_extracted) {
265298
tmp = AE_MULF32R_LL(SQRT2_Q30, acc);
266299
tmp = AE_SLAI64S(tmp, SHIFT_IDX_QX25_QY30_QZ25);
267300
acc = AE_ROUND32F48SSYM(tmp);
268301
}
269302

270-
precision_inv = e + 25;
271-
acc = AE_SLAA32S(acc, precision_y - precision_inv);
303+
acc = AE_SLAA32S(acc, shift_output);
272304
return AE_MOVAD32_L(acc);
273305
}
274306

0 commit comments

Comments
 (0)