
Commit ffbab18

ggml-cpu: change lmul, unrolling for kernels
1 parent a423cf3 commit ffbab18
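For context: in the RISC-V Vector extension, LMUL groups 1, 2, 4, or 8 architectural vector registers into a single operand, so a larger LMUL covers more elements per instruction but leaves fewer independent register groups for unrolling. This commit retunes that trade-off kernel by kernel: the fp16/bf16-to-fp32 conversions drop from e16m4 to e16m2, the unrolled dot product trades a 4-way unroll at e16m1 for a 2-way unroll at e16m2, and the mad/scale kernels rise from e16m2 to e16m4. A minimal sketch of how LMUL sets the per-iteration element count, assuming a toolchain targeting an RVV 1.0 machine (e.g. -march=rv64gcv):

#include <riscv_vector.h>
#include <stdio.h>

// vsetvlmax returns (VLEN / SEW) * LMUL: halving LMUL halves the
// elements covered per vector op, but doubles the number of register
// groups left over for independent accumulators.
int main(void) {
    printf("e16m1: %zu elems/op\n", __riscv_vsetvlmax_e16m1());
    printf("e16m2: %zu elems/op\n", __riscv_vsetvlmax_e16m2());
    printf("e16m4: %zu elems/op\n", __riscv_vsetvlmax_e16m4());
    return 0;
}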

File tree

2 files changed: +92 -106 lines changed


ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 24 additions & 22 deletions
@@ -3274,31 +3274,33 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
+
 #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
     // calculate step size
-    const int epr = __riscv_vsetvlmax_e16m4();
+    const int epr = __riscv_vsetvlmax_e16m2();
     const int step = epr * 2;
     const int np = (n & ~(step - 1));
 
     // unroll by 2
     for (; i < np; i += step) {
-        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
-        vfloat32m8_t ay0 = __riscv_vfwcvt_f_f_v_f32m8(ax0, epr);
-        __riscv_vse32_v_f32m8(y + i, ay0, epr);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
 
-        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
-        vfloat32m8_t ay1 = __riscv_vfwcvt_f_f_v_f32m8(ax1, epr);
-        __riscv_vse32_v_f32m8(y + i + epr, ay1, epr);
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvt_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
     }
 
     // leftovers
     int vl;
     for (i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m4(n - i);
-        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
-        vfloat32m8_t ay0 = __riscv_vfwcvt_f_f_v_f32m8(ax0, vl);
-        __riscv_vse32_v_f32m8(y + i, ay0, vl);
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
     }
+
 #endif
 
     for (; i < n; ++i) {
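
A widening convert produces 32-bit results from 16-bit inputs, so the destination register group is twice the source LMUL: loading at e16m2 pairs with an f32m4 result (previously e16m4 paired with f32m8, and an m8 group ties up 8 of the 32 vector registers per value). A standalone sketch of the same strip-mined pattern at the new LMUL, assuming Zvfhmin and using only the intrinsics that appear in the hunk (fp16_to_fp32_rvv is a hypothetical name):

#include <riscv_vector.h>

// Strip-mined fp16 -> fp32 conversion: vsetvl clamps the final pass
// to the remaining element count, so no scalar tail loop is needed.
static void fp16_to_fp32_rvv(const _Float16 *x, float *y, int n) {
    int vl;
    for (int i = 0; i < n; i += vl) {
        vl = __riscv_vsetvl_e16m2(n - i);                   // elements this pass
        vfloat16m2_t a = __riscv_vle16_v_f16m2(x + i, vl);  // load fp16
        vfloat32m4_t w = __riscv_vfwcvt_f_f_v_f32m4(a, vl); // widen to fp32
        __riscv_vse32_v_f32m4(y + i, w, vl);                // store fp32
    }
}
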
@@ -3345,28 +3347,28 @@ void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
     }
 #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfmin)
     // calculate step size
-    const int epr = __riscv_vsetvlmax_e16m4();
+    const int epr = __riscv_vsetvlmax_e16m2();
     const int step = epr * 2;
     const int np = (n & ~(step - 1));
 
     // unroll by 2
     for (; i < np; i += step) {
-        vbfloat16m4_t ax0 = __riscv_vle16_v_bf16m4((const __bf16*)x + i, epr);
-        vfloat32m8_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m8(ax0, epr);
-        __riscv_vse32_v_f32m8(y + i, ay0, epr);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
 
-        vbfloat16m4_t ax1 = __riscv_vle16_v_bf16m4((const __bf16*)x + i + epr, epr);
-        vfloat32m8_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m8(ax1, epr);
-        __riscv_vse32_v_f32m8(y + i + epr, ay1, epr);
+        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
     }
 
     // leftovers
     int vl;
     for (i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m4(n - i);
-        vbfloat16m4_t ax0 = __riscv_vle16_v_bf16m4((const __bf16*)x + i, vl);
-        vfloat32m8_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m8(ax0, vl);
-        __riscv_vse32_v_f32m8(y + i, ay0, vl);
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
     }
 #endif
     for (; i < n; i++) {
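
The bf16 path mirrors the fp16 one, with vfwcvtbf16 doing the widening. In both kernels the bulk/tail split relies on step being a power of two, since np = n & ~(step - 1) rounds n down to a multiple of step. A quick runnable check of that arithmetic, with an assumed VLEN of 256 bits:

#include <assert.h>

int main(void) {
    const int epr  = 32;               // assumed: VLEN = 256 at e16m2
    const int step = epr * 2;          // unroll by 2 -> 64 elements/iter
    const int n    = 1000;
    const int np   = n & ~(step - 1);  // 1000 rounded down to 960
    assert(np == 960 && n - np == 40); // bulk covers 960, vsetvl tail covers 40
    return 0;
}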

ggml/src/ggml-cpu/vec.h

Lines changed: 68 additions & 84 deletions
@@ -226,84 +226,66 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
 
 #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
-    size_t vl = __riscv_vsetvlmax_e32m2();
+    size_t vl = __riscv_vsetvlmax_e32m4();
 
     // initialize accumulators to all zeroes
-    vfloat32m2_t vsum0_0 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum0_1 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum0_2 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum0_3 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum1_0 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum1_1 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum1_2 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum1_3 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
+    vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
 
     // calculate step size
-    const size_t epr = __riscv_vsetvlmax_e16m1();
-    const size_t step = epr * 4;
+    const size_t epr = __riscv_vsetvlmax_e16m2();
+    const size_t step = epr * 2;
     const int np = (n & ~(step - 1));
 
-    // unroll by 4
+    // unroll by 2 along the row dimension
     for (int i = 0; i < np; i += step) {
-        vfloat16m1_t ay0 = __riscv_vle16_v_f16m1((const _Float16 *)(y + i), epr);
-        vfloat16m1_t ax0_0 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i), epr);
-        vfloat16m1_t ax1_0 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i), epr);
-        vsum0_0 = __riscv_vfwmacc_vv_f32m2(vsum0_0, ax0_0, ay0, epr);
-        vsum1_0 = __riscv_vfwmacc_vv_f32m2(vsum1_0, ax1_0, ay0, epr);
-        __asm__ __volatile__("" ::: "memory");
-
-        vfloat16m1_t ay1 = __riscv_vle16_v_f16m1((const _Float16 *)(y + i + epr), epr);
-        vfloat16m1_t ax0_1 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i + epr), epr);
-        vfloat16m1_t ax1_1 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i + epr), epr);
-        vsum0_1 = __riscv_vfwmacc_vv_f32m2(vsum0_1, ax0_1, ay1, epr);
-        vsum1_1 = __riscv_vfwmacc_vv_f32m2(vsum1_1, ax1_1, ay1, epr);
-        __asm__ __volatile__("" ::: "memory");
-
-        vfloat16m1_t ay2 = __riscv_vle16_v_f16m1((const _Float16 *)(y + i + 2 * epr), epr);
-        vfloat16m1_t ax0_2 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i + 2 * epr), epr);
-        vfloat16m1_t ax1_2 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i + 2 * epr), epr);
-        vsum0_2 = __riscv_vfwmacc_vv_f32m2(vsum0_2, ax0_2, ay2, epr);
-        vsum1_2 = __riscv_vfwmacc_vv_f32m2(vsum1_2, ax1_2, ay2, epr);
-        __asm__ __volatile__("" ::: "memory");
-
-        vfloat16m1_t ay3 = __riscv_vle16_v_f16m1((const _Float16 *)(y + i + 3 * epr), epr);
-        vfloat16m1_t ax0_3 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i + 3 * epr), epr);
-        vfloat16m1_t ax1_3 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i + 3 * epr), epr);
-        vsum0_3 = __riscv_vfwmacc_vv_f32m2(vsum0_3, ax0_3, ay3, epr);
-        vsum1_3 = __riscv_vfwmacc_vv_f32m2(vsum1_3, ax1_3, ay3, epr);
-        __asm__ __volatile__("" ::: "memory");
+        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
+        vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
+        vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
+        vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
+        vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
+
+        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
+        vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
+        vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
+        vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
+        vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
     }
 
-    vfloat32m2_t vsum0_01 = __riscv_vfadd_vv_f32m2(vsum0_0, vsum0_1, vl);
-    vfloat32m2_t vsum0_23 = __riscv_vfadd_vv_f32m2(vsum0_2, vsum0_3, vl);
-    vfloat32m2_t vsum0 = __riscv_vfadd_vv_f32m2(vsum0_01, vsum0_23, vl);
-
-    vfloat32m2_t vsum1_01 = __riscv_vfadd_vv_f32m2(vsum1_0, vsum1_1, vl);
-    vfloat32m2_t vsum1_23 = __riscv_vfadd_vv_f32m2(vsum1_2, vsum1_3, vl);
-    vfloat32m2_t vsum1 = __riscv_vfadd_vv_f32m2(vsum1_01, vsum1_23, vl);
+    vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
+    vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
 
     // leftovers
     for (int i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m1(n - i);
-        vfloat16m1_t ay = __riscv_vle16_v_f16m1((const _Float16 *)(y + i), vl);
-        vfloat16m1_t ax0 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i), vl);
-        vfloat16m1_t ax1 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i), vl);
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
 
-        vsum0 = __riscv_vfwmacc_vv_f32m2(vsum0, ax0, ay, vl);
-        vsum1 = __riscv_vfwmacc_vv_f32m2(vsum1, ax1, ay, vl);
+        vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
+        vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
     }
 
     // reduce
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
     vl = __riscv_vsetvlmax_e32m1();
-    vfloat32m1_t acc0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum0, 0),
-                                               __riscv_vget_v_f32m2_f32m1(vsum0, 1), vl);
+    vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
     vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
-        acc0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+        acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
 
-    vfloat32m1_t acc1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum1, 0),
-                                               __riscv_vget_v_f32m2_f32m1(vsum1, 1), vl);
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
     vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
-        acc1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+        acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
     sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
     sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
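
With the accumulators widened from f32m2 to f32m4, the final reduction gains one folding stage: vget splits each m4 group into two m2 halves, which are added, split again into m1 halves, and only then fed to the single vfredusum. A sketch of that fold in isolation, using the same intrinsics as the hunk (hsum_f32m4 is a hypothetical helper name):

#include <riscv_vector.h>

// Horizontal sum of an f32m4 accumulator: fold m4 -> m2 -> m1, then
// one unordered-sum reduction down to a scalar.
static float hsum_f32m4(vfloat32m4_t acc) {
    size_t vl = __riscv_vsetvlmax_e32m2();
    vfloat32m2_t h2 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(acc, 0),
                                             __riscv_vget_v_f32m4_f32m2(acc, 1), vl);
    vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t h1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(h2, 0),
                                             __riscv_vget_v_f32m2_f32m1(h2, 1), vl);
    vfloat32m1_t r = __riscv_vfredusum_vs_f32m1_f32m1(
        h1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
    return __riscv_vfmv_f_s_f32m1_f32(r);
}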

@@ -556,31 +538,33 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     const _Float16 scale = *(const _Float16*)(&s);
 
     // calculate step size
-    const int epr = __riscv_vsetvlmax_e16m2();
+    const int epr = __riscv_vsetvlmax_e16m4();
     const int step = epr * 2;
     const int np = (n & ~(step - 1));
 
     // unroll by 2
     for (int i = 0; i < np; i += step) {
-        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
-        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16*)y + i, epr);
-        ay0 = __riscv_vfmacc_vf_f16m2(ay0, scale, ax0, epr);
-        __riscv_vse16_v_f16m2((_Float16*)y + i, ay0, epr);
-
-        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
-        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16*)y + i + epr, epr);
-        ay1 = __riscv_vfmacc_vf_f16m2(ay1, scale, ax1, epr);
-        __riscv_vse16_v_f16m2((_Float16*)y + i + epr, ay1, epr);
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+        ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
     }
 
     // leftovers
     int vl;
     for (int i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m2(n - i);
-        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i , vl);
-        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16*)y + i, vl);
-        ay0 = __riscv_vfmacc_vf_f16m2(ay0, scale, ax0, vl);
-        __riscv_vse16_v_f16m2((_Float16*)y + i, ay0, vl);
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i , vl);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
     }
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
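
The empty __asm__ __volatile__ ("" ::: "memory") statements added here emit no instructions; they are compiler barriers, apparently to keep the compiler from merging or reordering the two unrolled load/fmacc/store groups across each other. A minimal illustration of the idiom (COMPILER_BARRIER is a name invented for this sketch):

// The barrier compiles to nothing, but the "memory" clobber forbids
// moving loads and stores across it at compile time.
#define COMPILER_BARRIER() __asm__ __volatile__ ("" ::: "memory")

void axpy_pair(float *y, const float *x, float s) {
    y[0] += s * x[0];
    COMPILER_BARRIER(); // the two updates stay distinct and ordered
    y[1] += s * x[1];
}
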
@@ -831,30 +815,30 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
     const _Float16 scale = *(const _Float16*)(&s);
 
     // calculate step size
-    const int epr = __riscv_vsetvlmax_e16m2();
+    const int epr = __riscv_vsetvlmax_e16m4();
     const int step = epr * 2;
     const int np = (n & ~(step - 1));
 
     // unroll by 2
     for (int i = 0; i < np; i += step) {
-        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16*)y + i, epr);
-        ay0 = __riscv_vfmul_vf_f16m2(ay0, scale, epr);
-        __riscv_vse16_v_f16m2((_Float16*)y + i, ay0, epr);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
         __asm__ __volatile__ ("" ::: "memory");
 
-        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16*)y + i + epr, epr);
-        ay1 = __riscv_vfmul_vf_f16m2(ay1, scale, epr);
-        __riscv_vse16_v_f16m2((_Float16*)y + i + epr, ay1, epr);
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+        ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
         __asm__ __volatile__ ("" ::: "memory");
     }
 
     // leftovers
     int vl;
     for (int i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m2(n - i);
-        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16*)y + i, vl);
-        ay0 = __riscv_vfmul_vf_f16m2(ay0, scale, vl);
-        __riscv_vse16_v_f16m2((_Float16*)y + i, ay0, vl);
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
     }
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
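
ggml_vec_scale_f16 gets the same treatment as ggml_vec_mad_f16: e16m2 up to e16m4, with compiler barriers between the unrolled halves. The pure strip-mined core at the new LMUL looks like this (scale_f16_rvv is a hypothetical name; assumes Zvfh for fp16 arithmetic):

#include <riscv_vector.h>

// In-place y[i] *= scale over n fp16 elements at LMUL = 4.
static void scale_f16_rvv(_Float16 *y, _Float16 scale, int n) {
    int vl;
    for (int i = 0; i < n; i += vl) {
        vl = __riscv_vsetvl_e16m4(n - i);
        vfloat16m4_t v = __riscv_vle16_v_f16m4(y + i, vl);
        v = __riscv_vfmul_vf_f16m4(v, scale, vl);
        __riscv_vse16_v_f16m4(y + i, v, vl);
    }
}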
