ggml: fmt (drop the `__riscv_zvfbfwma` `#elif` branch from ggml_vec_dot_bf16)

taimur-10x · taimur-10x · commit be4fa9736cf8 · 2025-11-17T15:12:21.000+05:00
diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp
@@ -237,48 +237,7 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
     vfloat32m1_t acc1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0, 0), __riscv_vget_v_f32m2_f32m1(acc0, 1), vl);
     vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m1_f32m1(acc1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
     sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
-#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfwma)
-    size_t vl = __riscv_vsetvlmax_e32m4();
-
-    // initialize accumulators to all zeroes
-    vfloat32m4_t vsum0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
-    vfloat32m4_t vsum1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
-
-    // calculate step size
-    const size_t epr = __riscv_vsetvlmax_e16m2();
-    const size_t step = epr * 2;
-    const int np = (n & ~(step - 1));
 
-    // unroll by 2
-    for (; i < np; i += step) {
-        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], epr);
-        vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], epr);
-        vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, epr);
-        __asm__ __volatile__ ("" ::: "memory");
-
-        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i + epr], epr);
-        vbfloat16m2_t ay1 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i + epr], epr);
-        vsum1 = __riscv_vfwmaccbf16_vv_f32m4(vsum1, ax1, ay1, epr);
-        __asm__ __volatile__ ("" ::: "memory");
-    }
-
-    // accumulate in 1 register
-    vsum0 = __riscv_vfadd_vv_f32m4(vsum0, vsum1, vl);
-
-    // leftovers
-    for (i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m2(n - i);
-        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], vl);
-        vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], vl);
-        vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, vl);
-    }
-
-    // reduce
-    vl = __riscv_vsetvlmax_e32m2();
-    vfloat32m2_t acc0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0), __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
-    vfloat32m1_t acc1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0, 0), __riscv_vget_v_f32m2_f32m1(acc0, 1), vl);
-    vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m1_f32m1(acc1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
-    sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
 #endif
     for (; i < n; ++i) {
         sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *