Math: Optimize sofm_exp_fixed() HiFi version

singalsu · singalsu · commit bdd20f91641f · 2024-10-31T17:50:13.000+02:00
The unnecessary shift and multiply helper functions can be removed
by using the normal C left shift and by using the Xtensa multiply,
shift, and round intrinsics directly in the function.

This change saves 1.3 MCPS on the TGL HiFi3 platform in DRC
processing mode.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
diff --git a/src/math/exp_fcn_hifi.c b/src/math/exp_fcn_hifi.c
@@ -280,52 +280,6 @@ int32_t sofm_exp_int32(int32_t x)
 	return AE_MOVAD32_L(AE_MOVINT32X2_FROMINT64(ts));
 }
 
-/* Fractional multiplication with shift and round
- * Note that the parameters px and py must be cast to (int64_t) if other type.
- */
-static inline int exp_hifi_q_multsr_32x32(int a, int b, int c, int d, int e)
-{
-	ae_int64 res;
-	int xt_o;
-	int shift;
-
-	res = AE_MUL32_LL(a, b);
-	shift = XT_SUB(XT_ADD(c, d), XT_ADD(e, 1));
-	res = AE_SRAA64(res, shift);
-	res = AE_ADD64(res, 1);
-	res = AE_SRAI64(res, 1);
-	xt_o = AE_MOVINT32_FROMINT64(res);
-
-	return xt_o;
-}
-
-/* A macro for Q-shifts */
-static inline int exp_hifi_q_shift_rnd(int a, int b, int c)
-{
-	ae_int32 res;
-	int shift;
-
-	shift = XT_SUB(b, XT_ADD(c, 1));
-	res = AE_SRAA32(a, shift);
-	res = AE_ADD32(res, 1);
-	res = AE_SRAI32(res, 1);
-
-	return res;
-}
-
-/* Alternative version since compiler does not allow (x >> -1) */
-static inline int exp_hifi_q_shift_left(int a, int b, int c)
-{
-	ae_int32 xt_o;
-	int shift;
-
-	shift = XT_SUB(c, b);
-	xt_o = AE_SLAA32(a, shift);
-
-	return xt_o;
-}
-
-#define q_mult(a, b, qa, qb, qy) ((int32_t)exp_hifi_q_multsr_32x32((int64_t)(a), b, qa, qb, qy))
 /* Fixed point exponent function for approximate range -11.5 .. 7.6
  * that corresponds to decibels range -100 .. +66 dB.
  *
@@ -341,11 +295,12 @@ static inline int exp_hifi_q_shift_left(int a, int b, int c)
 
 int32_t sofm_exp_fixed(int32_t x)
 {
+	ae_f64 p;
+	ae_int32 y0;
+	ae_int32 y;
 	int32_t xs;
-	int32_t y;
-	int32_t y0;
+	int32_t n = 1;
 	int i;
-	int n = 0;
 
 	if (x < SOFM_EXP_FIXED_INPUT_MIN)
 		return 0;
@@ -357,20 +312,27 @@ int32_t sofm_exp_fixed(int32_t x)
 	xs = x;
 	while (xs >= SOFM_EXP_TWO_Q27 || xs <= SOFM_EXP_MINUS_TWO_Q27) {
 		xs >>= 1;
-		n++;
+		n <<= 1;
 	}
 
 	/* sofm_exp_int32() input is Q4.28, while x1 is Q5.27
 	 * sofm_exp_int32() output is Q9.23, while y0 is Q12.20
 	 */
-	y0 = exp_hifi_q_shift_rnd(sofm_exp_int32(exp_hifi_q_shift_left(xs, 27, 28)),
-				  23, 20);
+	y0 = AE_SRAI32R(sofm_exp_int32(xs << 1), 3);
 	y = SOFM_EXP_ONE_Q20;
-	for (i = 0; i < (1 << n); i++)
-		y = (int32_t)exp_hifi_q_multsr_32x32((int64_t)y, y0, 20, 20, 20);
 
-	return y;
+	/* AE multiply returns Q41 from Q20 * Q20. To get Q20 it needs to be
+	 * shifted right by 21. Since the rounding instruction used is aligned
+	 * to the high 32 bits, it is instead shifted left by 32 - 21 = 11:
+	 */
+	for (i = 0; i < n; i++) {
+		p = AE_SLAI64S(AE_MULF32S_LL(y, y0), 11);
+		y = AE_ROUND32F64SASYM(p);
+	}
+
+	return (int32_t)y;
 }
+
 EXPORT_SYMBOL(sofm_exp_fixed);
 
 #endif