Skip to content

Commit fad27a2

Browse files
ipc4: mixin: Add "mix with gain" HiFi5 impl
Adds a HiFi5 implementation of the "mix with gain" functions.

Signed-off-by: Serhiy Katsyuba <serhiy.katsyuba@intel.com>
1 parent cad6f82 commit fad27a2

File tree

1 file changed

+290
-3
lines changed

1 file changed

+290
-3
lines changed

src/audio/mixin_mixout/mixin_mixout_hifi5.c

Lines changed: 290 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,107 @@ static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe
9898
}
9999
}
100100
}
101+
102+
/*
 * Mix S16 source samples, scaled by gain, into the sink circular buffer.
 *
 * Sink samples in [start_sample, mixed_samples) already hold data mixed in by
 * other sources: over that region the scaled source is saturating-added to the
 * existing sink samples. The remaining samples are written as scaled copies.
 *
 * @param sink		Sink circular buffer; writing starts at
 *			sink->ptr + start_sample (wrapped inside the loops).
 * @param start_sample	Offset in samples from sink->ptr where writing starts.
 * @param mixed_samples	Count of sink samples (from sink->ptr) that already
 *			contain mixed data.
 * @param source	Source circular buffer, read from source->ptr.
 * @param sample_count	Number of source samples to process.
 * @param gain		Gain in Q10 fixed point (left-shifted by 5 below to
 *			form a Q1.15 multiplier); must be below unity.
 */
static void mix_s16_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
			 const struct cir_buf_ptr *source,
			 int32_t sample_count, uint16_t gain)
{
	int samples_to_mix, samples_to_copy, left_samples;
	int n, nmax, i, m, left;
	ae_int16x4 in_sample, in_sample1;
	ae_int16x4 out_sample, out_sample1;
	ae_int16x8 *in;
	ae_int16x8 *out;
	ae_valignx2 inu;
	ae_valignx2 outu1;
	/* zero-initialized alignment register for the unaligned output stores */
	ae_valignx2 outu2 = AE_ZALIGN128();
	/* cir_buf_wrap() is required and is done below in a loop */
	ae_int16 *dst = (ae_int16 *)sink->ptr + start_sample;
	ae_int16 *src = source->ptr;
	ae_f16x4 gain_vec;

	/* this func does not support unity gain as 1 cannot be represented as Q1.15 value */
	assert(gain < IPC4_MIXIN_UNITY_GAIN);

	/* replicate gain into all four 16-bit lanes */
	gain_vec = AE_L16_I((ae_int16 *)&gain, 0);
	gain_vec = AE_SLAI16S(gain_vec, 5);	/* convert to Q1.15 */

	assert(mixed_samples >= start_sample);
	samples_to_mix = AE_MIN32(mixed_samples - start_sample, sample_count);
	samples_to_copy = sample_count - samples_to_mix;
	n = 0;	/* pointers must not be advanced on the first iteration */

	/* Pass 1: saturating-add scaled source into already mixed sink region */
	for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) {
		src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
		dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
		/* contiguous run length: limited by the remaining samples and
		 * by the wrap points of both circular buffers
		 */
		nmax = (ae_int16 *)source->buf_end - src;
		n = AE_MIN32(left_samples, nmax);
		nmax = (ae_int16 *)sink->buf_end - dst;
		n = AE_MIN32(n, nmax);
		in = (ae_int16x8 *)src;
		out = (ae_int16x8 *)dst;
		inu = AE_LA128_PP(in);
		outu1 = AE_LA128_PP(out);
		m = n >> 3;
		left = n & 0x07;
		/* process 8 samples per loop */
		for (i = 0; i < m; i++) {
			AE_LA16X4X2_IP(in_sample, in_sample1, inu, in);
			AE_LA16X4X2_IP(out_sample, out_sample1, outu1, out);
			/* the load above advanced 'out' past the 8 samples
			 * just read; step back so the store rewrites them
			 * in place
			 */
			out--;
			in_sample = AE_MULFP16X4RS(in_sample, gain_vec);
			in_sample1 = AE_MULFP16X4RS(in_sample1, gain_vec);
			out_sample = AE_ADD16S(in_sample, out_sample);
			out_sample1 = AE_ADD16S(in_sample1, out_sample1);
			AE_SA16X4X2_IP(out_sample, out_sample1, outu2, out);
		}
		/* flush bytes still pending in the unaligned store register */
		AE_SA128POS_FP(outu2, out);

		/* process the remaining (< 8) samples
		 * one by one to avoid memory access overrun
		 */
		for (i = 0; i < left; i++) {
			AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16));
			AE_L16_IP(out_sample, (ae_int16 *)out, 0);
			in_sample = AE_MULFP16X4RS(in_sample, gain_vec);
			out_sample = AE_ADD16S(in_sample, out_sample);
			AE_S16_0_IP(out_sample, (ae_int16 *)out, sizeof(ae_int16));
		}
	}

	/* Pass 2: plain scaled copy of source into the not-yet-mixed region */
	for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) {
		src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
		dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
		/* contiguous run length up to the nearest buffer wrap */
		nmax = (ae_int16 *)source->buf_end - src;
		n = AE_MIN32(left_samples, nmax);
		nmax = (ae_int16 *)sink->buf_end - dst;
		n = AE_MIN32(n, nmax);
		in = (ae_int16x8 *)src;
		out = (ae_int16x8 *)dst;
		inu = AE_LA128_PP(in);
		m = n >> 3;
		left = n & 0x07;
		/* process 8 samples per loop */
		for (i = 0; i < m; i++) {
			AE_LA16X4X2_IP(in_sample, in_sample1, inu, in);
			in_sample = AE_MULFP16X4RS(in_sample, gain_vec);
			in_sample1 = AE_MULFP16X4RS(in_sample1, gain_vec);
			AE_SA16X4X2_IP(in_sample, in_sample1, outu2, out);
		}
		AE_SA128POS_FP(outu2, out);

		/* process the remaining (< 8) samples
		 * one by one to avoid memory access overrun
		 */
		for (i = 0; i < left; i++) {
			AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16));
			in_sample = AE_MULFP16X4RS(in_sample, gain_vec);
			AE_S16_0_IP(in_sample, (ae_int16 *)out, sizeof(ae_int16));
		}
	}
}
101202
#endif /* CONFIG_FORMAT_S16LE */
102203

103204
#if CONFIG_FORMAT_S24LE
@@ -193,6 +294,102 @@ static void mix_s24(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe
193294
}
194295
}
195296

297+
/*
 * Mix S24 (in 32-bit containers) source samples, scaled by gain, into the
 * sink circular buffer.
 *
 * Sink samples in [start_sample, mixed_samples) already hold data mixed in by
 * other sources: over that region the scaled source is saturating-added
 * (24-bit saturation) to the existing sink samples. The remaining samples are
 * written as scaled copies.
 *
 * @param sink		Sink circular buffer; writing starts at
 *			sink->ptr + start_sample (wrapped inside the loops).
 * @param start_sample	Offset in samples from sink->ptr where writing starts.
 * @param mixed_samples	Count of sink samples (from sink->ptr) that already
 *			contain mixed data.
 * @param source	Source circular buffer, read from source->ptr.
 * @param sample_count	Number of source samples to process.
 * @param gain		Gain in Q10 fixed point (left-shifted by 13 below to
 *			form a Q1.23 multiplier); must be below unity.
 */
static void mix_s24_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
			 const struct cir_buf_ptr *source,
			 int32_t sample_count, uint16_t gain)
{
	int samples_to_mix, samples_to_copy, left_samples;
	int n, nmax, i, m, left;
	ae_int32x2 in_sample, in_sample1;
	ae_int32x2 out_sample, out_sample1;
	ae_int32x4 *in;
	ae_int32x4 *out;
	ae_valignx2 inu;
	ae_valignx2 outu1;
	/* zero-initialized alignment register for the unaligned output stores */
	ae_valignx2 outu2 = AE_ZALIGN128();
	/* cir_buf_wrap() is required and is done below in a loop */
	int32_t *dst = (int32_t *)sink->ptr + start_sample;
	int32_t *src = source->ptr;
	ae_f24x2 gain_vec;
	ae_int32 gain32 = (ae_int32)gain;

	/* this func does not support unity gain as 1 cannot be represented as Q1.23 value */
	assert(gain < IPC4_MIXIN_UNITY_GAIN);

	/* replicate gain into both 24-bit lanes */
	gain_vec = AE_MOVF24X2_FROMINT32X2(AE_L32_I(&gain32, 0));
	gain_vec = AE_SLAI24S(gain_vec, 13);	/* convert to Q1.23 */

	assert(mixed_samples >= start_sample);
	samples_to_mix = AE_MIN32(mixed_samples - start_sample, sample_count);
	samples_to_copy = sample_count - samples_to_mix;
	n = 0;	/* pointers must not be advanced on the first iteration */

	/* Pass 1: saturating-add scaled source into already mixed sink region */
	for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) {
		src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
		dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
		/* contiguous run length: limited by the remaining samples and
		 * by the wrap points of both circular buffers
		 */
		nmax = (int32_t *)source->buf_end - src;
		n = AE_MIN32(left_samples, nmax);
		nmax = (int32_t *)sink->buf_end - dst;
		n = AE_MIN32(n, nmax);
		in = (ae_int32x4 *)src;
		out = (ae_int32x4 *)dst;
		inu = AE_LA128_PP(in);
		outu1 = AE_LA128_PP(out);
		m = n >> 2;
		left = n & 3;
		/* process 4 samples per time */
		for (i = 0; i < m; i++) {
			AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
			AE_LA32X2X2_IP(out_sample, out_sample1, outu1, out);
			/* the load above advanced 'out' past the 4 samples
			 * just read; step back so the store rewrites them
			 * in place
			 */
			out--;
			in_sample = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample), gain_vec);
			in_sample1 = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample1), gain_vec);
			/* out samples are already sign extended by other mixin in a loop below */
			out_sample = AE_ADD24S(in_sample, out_sample);
			out_sample1 = AE_ADD24S(in_sample1, out_sample1);
			AE_SA32X2X2_IP(out_sample, out_sample1, outu2, out);
		}
		/* flush bytes still pending in the unaligned store register */
		AE_SA128POS_FP(outu2, out);

		/* process the left samples to avoid memory access overrun */
		for (i = 0; i < left; i++) {
			AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
			AE_L32_IP(out_sample, (ae_int32 *)out, 0);
			in_sample = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample), gain_vec);
			/* out samples are already sign extended by other mixin in a loop below */
			out_sample = AE_ADD24S(in_sample, out_sample);
			AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32));
		}
	}

	/* Pass 2: plain scaled copy of source into the not-yet-mixed region */
	for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) {
		src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
		dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
		/* contiguous run length up to the nearest buffer wrap */
		nmax = (int32_t *)source->buf_end - src;
		n = AE_MIN32(left_samples, nmax);
		nmax = (int32_t *)sink->buf_end - dst;
		n = AE_MIN32(n, nmax);
		in = (ae_int32x4 *)src;
		out = (ae_int32x4 *)dst;
		inu = AE_LA128_PP(in);
		m = n >> 2;
		left = n & 3;
		/* process 4 samples per time */
		for (i = 0; i < m; i++) {
			AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
			in_sample = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample), gain_vec);
			in_sample1 = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample1), gain_vec);
			AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out);
		}
		AE_SA128POS_FP(outu2, out);
		/* process the left samples to avoid memory access overrun */
		for (i = 0; i < left; i++) {
			AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
			in_sample = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample), gain_vec);
			AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32));
		}
	}
}
196393
#endif /* CONFIG_FORMAT_S24LE */
197394

198395
#if CONFIG_FORMAT_S32LE
@@ -277,18 +474,108 @@ static void mix_s32(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixe
277474
}
278475
}
279476

477+
/*
 * Mix S32 source samples, scaled by gain, into the sink circular buffer.
 *
 * Sink samples in [start_sample, mixed_samples) already hold data mixed in by
 * other sources: over that region the scaled source is accumulated into the
 * existing sink samples with rounding and saturation. The remaining samples
 * are written as scaled copies.
 *
 * @param sink		Sink circular buffer; writing starts at
 *			sink->ptr + start_sample (wrapped inside the loops).
 * @param start_sample	Offset in samples from sink->ptr where writing starts.
 * @param mixed_samples	Count of sink samples (from sink->ptr) that already
 *			contain mixed data.
 * @param source	Source circular buffer, read from source->ptr.
 * @param sample_count	Number of source samples to process.
 * @param gain		Gain in Q10 fixed point (left-shifted by 5 below to
 *			form a Q1.15 multiplier); must be below unity.
 */
static void mix_s32_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
			 const struct cir_buf_ptr *source,
			 int32_t sample_count, uint16_t gain)
{
	int samples_to_mix, samples_to_copy, left_samples;
	int n, nmax, i, m, left;
	ae_int32x2 in_sample, in_sample1;
	ae_int32x2 out_sample, out_sample1;
	ae_int32x4 *in;
	ae_int32x4 *out;
	ae_valignx2 inu;
	ae_valignx2 outu1;
	/* zero-initialized alignment register for the unaligned output stores */
	ae_valignx2 outu2 = AE_ZALIGN128();
	/* cir_buf_wrap() is required and is done below in a loop */
	int32_t *dst = (int32_t *)sink->ptr + start_sample;
	int32_t *src = source->ptr;
	ae_f16x4 gain_vec;

	/* this func does not support unity gain as 1 cannot be represented as Q1.15 value */
	assert(gain < IPC4_MIXIN_UNITY_GAIN);

	/* replicate gain into all four 16-bit lanes; the 32x16 multiplies
	 * below use the low half of gain_vec
	 */
	gain_vec = AE_L16_I((ae_int16 *)&gain, 0);
	gain_vec = AE_SLAI16S(gain_vec, 5);	/* convert to Q1.15 */

	assert(mixed_samples >= start_sample);
	samples_to_mix = AE_MIN32(mixed_samples - start_sample, sample_count);
	samples_to_copy = sample_count - samples_to_mix;
	n = 0;	/* pointers must not be advanced on the first iteration */

	/* Pass 1: accumulate scaled source into already mixed sink region */
	for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) {
		src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
		dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
		/* contiguous run length: limited by the remaining samples and
		 * by the wrap points of both circular buffers
		 */
		nmax = (int32_t *)source->buf_end - src;
		n = AE_MIN32(left_samples, nmax);
		nmax = (int32_t *)sink->buf_end - dst;
		n = AE_MIN32(n, nmax);
		in = (ae_int32x4 *)src;
		out = (ae_int32x4 *)dst;
		inu = AE_LA128_PP(in);
		outu1 = AE_LA128_PP(out);
		m = n >> 2;
		left = n & 3;
		/* process 4 samples per iteration */
		for (i = 0; i < m; i++) {
			AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
			AE_LA32X2X2_IP(out_sample, out_sample1, outu1, out);
			/* the load above advanced 'out' past the 4 samples
			 * just read; step back so the store rewrites them
			 * in place
			 */
			out--;
			/* out_sample += in_sample * gain, fractional 32x16
			 * multiply-accumulate with rounding and saturation
			 */
			AE_MULAFP32X16X2RS_L(out_sample, in_sample, gain_vec);
			AE_MULAFP32X16X2RS_L(out_sample1, in_sample1, gain_vec);
			AE_SA32X2X2_IP(out_sample, out_sample1, outu2, out);
		}
		/* flush bytes still pending in the unaligned store register */
		AE_SA128POS_FP(outu2, out);

		/* process the left samples to avoid memory access overrun */
		for (i = 0; i < left; i++) {
			AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
			AE_L32_IP(out_sample, (ae_int32 *)out, 0);
			AE_MULAFP32X16X2RS_L(out_sample, in_sample, gain_vec);
			AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32));
		}
	}

	/* Pass 2: plain scaled copy of source into the not-yet-mixed region */
	for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) {
		src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
		dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
		/* contiguous run length up to the nearest buffer wrap */
		nmax = (int32_t *)source->buf_end - src;
		n = AE_MIN32(left_samples, nmax);
		nmax = (int32_t *)sink->buf_end - dst;
		n = AE_MIN32(n, nmax);
		in = (ae_int32x4 *)src;
		out = (ae_int32x4 *)dst;
		inu = AE_LA128_PP(in);
		m = n >> 2;
		left = n & 3;
		/* process 4 samples per iteration */
		for (i = 0; i < m; i++) {
			AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
			in_sample = AE_MULFP32X16X2RS_L(in_sample, gain_vec);
			in_sample1 = AE_MULFP32X16X2RS_L(in_sample1, gain_vec);
			AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out);
		}
		AE_SA128POS_FP(outu2, out);
		/* process the left samples to avoid memory access overrun */
		for (i = 0; i < left; i++) {
			AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
			in_sample = AE_MULFP32X16X2RS_L(in_sample, gain_vec);
			AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32));
		}
	}
}
280567
#endif /* CONFIG_FORMAT_S32LE */
281568

282569
/* TODO: implement mixing functions with gain support!*/
283570
__cold_rodata const struct mix_func_map mix_func_map[] = {
284571
#if CONFIG_FORMAT_S16LE
285-
{ SOF_IPC_FRAME_S16_LE, mix_s16, mix_s16 },
572+
{ SOF_IPC_FRAME_S16_LE, mix_s16, mix_s16_gain },
286573
#endif
287574
#if CONFIG_FORMAT_S24LE
288-
{ SOF_IPC_FRAME_S24_4LE, mix_s24, mix_s24 },
575+
{ SOF_IPC_FRAME_S24_4LE, mix_s24, mix_s24_gain },
289576
#endif
290577
#if CONFIG_FORMAT_S32LE
291-
{ SOF_IPC_FRAME_S32_LE, mix_s32, mix_s32 }
578+
{ SOF_IPC_FRAME_S32_LE, mix_s32, mix_s32_gain }
292579
#endif
293580
};
294581

0 commit comments

Comments
 (0)