@@ -237,48 +237,7 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
237237 vfloat32m1_t acc1 = __riscv_vfadd_vv_f32m1 (__riscv_vget_v_f32m2_f32m1 (acc0, 0 ), __riscv_vget_v_f32m2_f32m1 (acc0, 1 ), vl);
238238 vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m1_f32m1 (acc1, __riscv_vfmv_v_f_f32m1 (0 .0f , 1 ), vl);
239239 sumf += __riscv_vfmv_f_s_f32m1_f32 (redsum);
240- #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfwma)
241- size_t vl = __riscv_vsetvlmax_e32m4 ();
242-
243- // initialize accumulators to all zeroes
244- vfloat32m4_t vsum0 = __riscv_vfmv_v_f_f32m4 (0 .0f , vl);
245- vfloat32m4_t vsum1 = __riscv_vfmv_v_f_f32m4 (0 .0f , vl);
246-
247- // calculate step size
248- const size_t epr = __riscv_vsetvlmax_e16m2 ();
249- const size_t step = epr * 2 ;
250- const int np = (n & ~(step - 1 ));
251240
252- // unroll by 2
253- for (; i < np; i += step) {
254- vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2 ((const __bf16 *)&x[i], epr);
255- vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2 ((const __bf16 *)&y[i], epr);
256- vsum0 = __riscv_vfwmaccbf16_vv_f32m4 (vsum0, ax0, ay0, epr);
257- __asm__ __volatile__ (" " ::: " memory" );
258-
259- vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2 ((const __bf16 *)&x[i + epr], epr);
260- vbfloat16m2_t ay1 = __riscv_vle16_v_bf16m2 ((const __bf16 *)&y[i + epr], epr);
261- vsum1 = __riscv_vfwmaccbf16_vv_f32m4 (vsum1, ax1, ay1, epr);
262- __asm__ __volatile__ (" " ::: " memory" );
263- }
264-
265- // accumulate in 1 register
266- vsum0 = __riscv_vfadd_vv_f32m4 (vsum0, vsum1, vl);
267-
268- // leftovers
269- for (i = np; i < n; i += vl) {
270- vl = __riscv_vsetvl_e16m2 (n - i);
271- vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2 ((const __bf16 *)&x[i], vl);
272- vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2 ((const __bf16 *)&y[i], vl);
273- vsum0 = __riscv_vfwmaccbf16_vv_f32m4 (vsum0, ax0, ay0, vl);
274- }
275-
276- // reduce
277- vl = __riscv_vsetvlmax_e32m2 ();
278- vfloat32m2_t acc0 = __riscv_vfadd_vv_f32m2 (__riscv_vget_v_f32m4_f32m2 (vsum0, 0 ), __riscv_vget_v_f32m4_f32m2 (vsum0, 1 ), vl);
279- vfloat32m1_t acc1 = __riscv_vfadd_vv_f32m1 (__riscv_vget_v_f32m2_f32m1 (acc0, 0 ), __riscv_vget_v_f32m2_f32m1 (acc0, 1 ), vl);
280- vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m1_f32m1 (acc1, __riscv_vfmv_v_f_f32m1 (0 .0f , 1 ), vl);
281- sumf += __riscv_vfmv_f_s_f32m1_f32 (redsum);
282241#endif
283242 for (; i < n; ++i) {
284243 sumf += (ggml_float)(GGML_BF16_TO_FP32 (x[i]) *
0 commit comments