From 2db021ab4fb7507e605737e9f627fb487cf44728 Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
Date: Thu, 31 Oct 2024 16:52:07 +0200
Subject: [PATCH 1/2] Test: Cmocka: Add unit test for sofm_exp_fixed() function

There was no unit test for the function, so it is added. The
pass criteria is tuned for current implementation to just
pass in the three defined regions. It is useful to verify
the next changes to optimize the function.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
---
 test/cmocka/src/math/arithmetic/exponential.c | 114 +++++++++++++++++-
 1 file changed, 111 insertions(+), 3 deletions(-)

diff --git a/test/cmocka/src/math/arithmetic/exponential.c b/test/cmocka/src/math/arithmetic/exponential.c
index 52e6e65d3a84..f35299fd3f1b 100644
--- a/test/cmocka/src/math/arithmetic/exponential.c
+++ b/test/cmocka/src/math/arithmetic/exponential.c
@@ -5,6 +5,7 @@
  * Author: Shriram Shastry <malladi.sastry@linux.intel.com>
  */
 
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <stdarg.h>
@@ -25,7 +26,17 @@
 #define ULP_SCALE 0.0000999999999749
 #define NUMTESTSAMPLES 256
 
-static void gen_exp_testvector(double a, double b, double *r, int32_t *b_i);
+#define NUMTESTSAMPLES_MIDDLE_TEST2 100
+#define NUMTESTSAMPLES_SMALL_TEST2 100
+#define NUMTESTSAMPLES_LARGE_TEST2 100
+#define ABS_DELTA_TOLERANCE_TEST1 1.2e-2
+#define REL_DELTA_TOLERANCE_TEST1 2.8e-4
+#define ABS_DELTA_TOLERANCE_TEST2 1.7e-6
+#define REL_DELTA_TOLERANCE_TEST2 3.7e-2
+#define ABS_DELTA_TOLERANCE_TEST3 1.2e-1
+#define REL_DELTA_TOLERANCE_TEST3 7.7e-5
+#define SOFM_EXP_FIXED_ARG_MIN -11.5
+#define SOFM_EXP_FIXED_ARG_MAX 7.6245
 
 /* testvector in Q4.28, -5 to +5 */
 /*
@@ -52,7 +63,7 @@ static void gen_exp_testvector(double a, double b, double *r, int32_t *b_i)
 		*b_i = INT32_MAX;
 }
 
-static void test_math_arithmetic_exponential_fixed(void **state)
+static void test_function_sofm_exp_int32(void **state)
 {
 	(void)state;
 
@@ -81,10 +92,107 @@ static void test_math_arithmetic_exponential_fixed(void **state)
 	}
 }
 
+static void gen_exp_testvector_linspace_q27(double a, double b, int step_count,
+					    int point, int32_t *value)
+{
+	double fstep = (b - a) / (step_count - 1);
+	double fvalue = a + fstep * point;
+
+	*value = (int32_t)round(fvalue * 134217728.0);
+}
+
+static double ref_exp(double value)
+{
+	if (value < SOFM_EXP_FIXED_ARG_MIN)
+		return 0.0;
+	else if (value > SOFM_EXP_FIXED_ARG_MAX)
+		return 2048.0;
+	else
+		return exp(value);
+}
+
+static void test_exp_with_input_value(int32_t ivalue, int32_t *iexp_value,
+				      double *abs_delta_max, double *rel_delta_max,
+				      double abs_delta_tolerance, double rel_delta_tolerance)
+{
+	double fvalue, fexp_value, ref_exp_value;
+	double rel_delta, abs_delta;
+
+	*iexp_value = sofm_exp_fixed(ivalue);
+	fvalue = (double)ivalue / (1 << 27); /* Q5.27 */
+	fexp_value = (double)*iexp_value / (1 << 20); /* Q12.20 */
+	/* printf("val = %10.6f, exp = %12.6f\n", fvalue, fexp_value); */
+
+	ref_exp_value = ref_exp(fvalue);
+	abs_delta = fabs(ref_exp_value - fexp_value);
+	rel_delta = abs_delta / ref_exp_value;
+	if (abs_delta > *abs_delta_max)
+		*abs_delta_max = abs_delta;
+
+	if (rel_delta > *rel_delta_max)
+		*rel_delta_max = rel_delta;
+
+	if (abs_delta > abs_delta_tolerance) {
+		printf("%s: Absolute error %g exceeds limit %g, input %g output %g.\n",
+		       __func__, abs_delta, abs_delta_tolerance, fvalue, fexp_value);
+		assert_true(false);
+	}
+
+	if (rel_delta > rel_delta_tolerance) {
+		printf("%s: Relative error %g exceeds limit %g, input %g output %g.\n",
+		       __func__, rel_delta, rel_delta_tolerance, fvalue, fexp_value);
+		assert_true(false);
+	}
+}
+
+static void test_function_sofm_exp_fixed(void **state)
+{
+	(void)state;
+
+	double rel_delta_max, abs_delta_max;
+	int32_t ivalue, iexp_value;
+	int i;
+
+	rel_delta_max = 0;
+	abs_delta_max = 0;
+	for (i = 0; i < NUMTESTSAMPLES_MIDDLE_TEST2; i++) {
+		gen_exp_testvector_linspace_q27(-5, 5, NUMTESTSAMPLES_MIDDLE_TEST2, i, &ivalue);
+		test_exp_with_input_value(ivalue, &iexp_value, &abs_delta_max, &rel_delta_max,
+					  ABS_DELTA_TOLERANCE_TEST1, REL_DELTA_TOLERANCE_TEST1);
+
+	}
+
+	printf("%s: Absolute max error was %.6e (middle).\n", __func__, abs_delta_max);
+	printf("%s: Relative max error was %.6e (middle).\n", __func__, rel_delta_max);
+
+	rel_delta_max = 0;
+	abs_delta_max = 0;
+	for (i = 0; i < NUMTESTSAMPLES_SMALL_TEST2; i++) {
+		gen_exp_testvector_linspace_q27(-16, -5, NUMTESTSAMPLES_SMALL_TEST2, i, &ivalue);
+		test_exp_with_input_value(ivalue, &iexp_value, &abs_delta_max, &rel_delta_max,
+					  ABS_DELTA_TOLERANCE_TEST2, REL_DELTA_TOLERANCE_TEST2);
+	}
+
+	printf("%s: Absolute max error was %.6e (small).\n", __func__, abs_delta_max);
+	printf("%s: Relative max error was %.6e (small).\n", __func__, rel_delta_max);
+
+	rel_delta_max = 0;
+	abs_delta_max = 0;
+	for (i = 0; i < NUMTESTSAMPLES_LARGE_TEST2; i++) {
+		gen_exp_testvector_linspace_q27(5, 15.9999, NUMTESTSAMPLES_LARGE_TEST2, i, &ivalue);
+		test_exp_with_input_value(ivalue, &iexp_value, &abs_delta_max, &rel_delta_max,
+					  ABS_DELTA_TOLERANCE_TEST3, REL_DELTA_TOLERANCE_TEST3);
+	}
+
+	printf("%s: Absolute max error was %.6e (large).\n", __func__, abs_delta_max);
+	printf("%s: Relative max error was %.6e (large).\n", __func__, rel_delta_max);
+}
+
 int main(void)
 {
 	const struct CMUnitTest tests[] = {
-		cmocka_unit_test(test_math_arithmetic_exponential_fixed)
+		cmocka_unit_test(test_function_sofm_exp_int32),
+		cmocka_unit_test(test_function_sofm_exp_fixed),
 	};
 
 	cmocka_set_message_output(CM_OUTPUT_TAP);

From 9b19183397b926514b8d2be3c2385c9e0186bd08 Mon Sep 17 00:00:00 2001
From: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
Date: Wed, 30 Oct 2024 18:27:20 +0200
Subject: [PATCH 2/2] Math: Optimize sofm_exp_fixed() HiFi version

The unnecessary shift and multiply functions can be removed
with use of normal C shift left and with use xtensa multiply,
shift, and round intrinsics directly in the function.

This change saves in TGL HiFi3 platform 1.3 MCPS in DRC
processing mode.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
---
 src/math/exp_fcn_hifi.c | 100 ++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 60 deletions(-)

diff --git a/src/math/exp_fcn_hifi.c b/src/math/exp_fcn_hifi.c
index 10775133e8a4..d41071335fd9 100644
--- a/src/math/exp_fcn_hifi.c
+++ b/src/math/exp_fcn_hifi.c
@@ -280,52 +280,6 @@ int32_t sofm_exp_int32(int32_t x)
 	return AE_MOVAD32_L(AE_MOVINT32X2_FROMINT64(ts));
 }
 
-/* Fractional multiplication with shift and round
- * Note that the parameters px and py must be cast to (int64_t) if other type.
- */
-static inline int exp_hifi_q_multsr_32x32(int a, int b, int c, int d, int e)
-{
-	ae_int64 res;
-	int xt_o;
-	int shift;
-
-	res = AE_MUL32_LL(a, b);
-	shift = XT_SUB(XT_ADD(c, d), XT_ADD(e, 1));
-	res = AE_SRAA64(res, shift);
-	res = AE_ADD64(res, 1);
-	res = AE_SRAI64(res, 1);
-	xt_o = AE_MOVINT32_FROMINT64(res);
-
-	return xt_o;
-}
-
-/* A macro for Q-shifts */
-static inline int exp_hifi_q_shift_rnd(int a, int b, int c)
-{
-	ae_int32 res;
-	int shift;
-
-	shift = XT_SUB(b, XT_ADD(c, 1));
-	res = AE_SRAA32(a, shift);
-	res = AE_ADD32(res, 1);
-	res = AE_SRAI32(res, 1);
-
-	return res;
-}
-
-/* Alternative version since compiler does not allow (x >> -1) */
-static inline int exp_hifi_q_shift_left(int a, int b, int c)
-{
-	ae_int32 xt_o;
-	int shift;
-
-	shift = XT_SUB(c, b);
-	xt_o = AE_SLAA32(a, shift);
-
-	return xt_o;
-}
-
-#define q_mult(a, b, qa, qb, qy) ((int32_t)exp_hifi_q_multsr_32x32((int64_t)(a), b, qa, qb, qy))
 /* Fixed point exponent function for approximate range -11.5 .. 7.6
  * that corresponds to decibels range -100 .. +66 dB.
  *
@@ -341,11 +295,13 @@ static inline int exp_hifi_q_shift_left(int a, int b, int c)
 
 int32_t sofm_exp_fixed(int32_t x)
 {
-	int32_t xs;
-	int32_t y;
-	int32_t y0;
+	ae_f64 p;
+	ae_int32 y0;
+	ae_int32 y;
+	ae_int32 xs;
+	int32_t n;
+	int shift;
 	int i;
-	int n = 0;
 
 	if (x < SOFM_EXP_FIXED_INPUT_MIN)
 		return 0;
@@ -353,21 +309,45 @@ int32_t sofm_exp_fixed(int32_t x)
 	if (x > SOFM_EXP_FIXED_INPUT_MAX)
 		return INT32_MAX;
 
-	/* x is Q5.27 */
-	xs = x;
-	while (xs >= SOFM_EXP_TWO_Q27 || xs <= SOFM_EXP_MINUS_TWO_Q27) {
-		xs >>= 1;
-		n++;
-	}
+	/* This returns number of right shifts needed to scale value x to |x| < 2.
+	 * The behavior differs slightly for positive and negative values but it
+	 * is not problem for sofm_exp_int32() function. E.g.
+	 *
+	 * x =  268435455 (1.9999999925), shift = 0
+	 * x =  268435456 (2.0000000000), shift = 1
+	 * x =  268435457 (2.0000000075), shift = 1
+	 *
+	 * x = -268435457 (-2.0000000075), shift = 1
+	 * x = -268435456 (-2.0000000000), shift = 0
+	 * x = -268435455 (-1.9999999925), shift = 0
+	 *
+	 * If the shift is zero, just return result from sofm_exp_int32() with
+	 * input Q format and output Q format adjusts.
+	 */
+	shift = (int)AE_MAX32(0, 3 - AE_NSAZ32_L(x));
+	if (!shift)
+		return AE_SRAI32R(sofm_exp_int32(AE_MOVAD32_L(AE_SLAI32S(x, 1))), 3);
+
+	/* Shifting x one less right to save an additional Q27 to Q28 conversion
+	 * shift for sofm_exp_int32()
+	 */
+	n = 1 << shift;
+	xs = AE_SRAA32RS(x, shift - 1);
 
 	/* sofm_exp_int32() input is Q4.28, while x1 is Q5.27
 	 * sofm_exp_int32() output is Q9.23, while y0 is Q12.20
 	 */
-	y0 = exp_hifi_q_shift_rnd(sofm_exp_int32(exp_hifi_q_shift_left(xs, 27, 28)),
-				  23, 20);
+	y0 = AE_SRAI32R(sofm_exp_int32(xs), 3);
 	y = SOFM_EXP_ONE_Q20;
-	for (i = 0; i < (1 << n); i++)
-		y = (int32_t)exp_hifi_q_multsr_32x32((int64_t)y, y0, 20, 20, 20);
+
+	/* AE multiply returns Q41 from Q20 * Q20. To get Q20 it need to be
+	 * shifted right by 21. Since the used round instruction is aligned
+	 * to the high 32 bits it is shifted instead left by 32 - 21 = 11:
+	 */
+	for (i = 0; i < n; i++) {
+		p = AE_SLAI64S(AE_MULF32S_LL(y, y0), 11);
+		y = AE_ROUND32F64SASYM(p);
+	}
 
 	return y;
 }