@@ -226,63 +226,69 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
226226 GGML_F16x_VEC_REDUCE (sumf [1 ], sum_10 , sum_11 , sum_12 , sum_13 );
227227
228228 #elif defined(__riscv_v_intrinsic ) && defined(__riscv_zvfh )
229- size_t vl = __riscv_vsetvlmax_e32m2 ();
229+ size_t vl = __riscv_vsetvlmax_e32m4 ();
230230
231231 // initialize accumulators to all zeroes
232- vfloat32m2_t vsum0_0 = __riscv_vfmv_v_f_f32m2 (0.0f , vl );
233- vfloat32m2_t vsum0_1 = __riscv_vfmv_v_f_f32m2 (0.0f , vl );
234- vfloat32m2_t vsum1_0 = __riscv_vfmv_v_f_f32m2 (0.0f , vl );
235- vfloat32m2_t vsum1_1 = __riscv_vfmv_v_f_f32m2 (0.0f , vl );
236-
232+ vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4 (0.0f , vl );
233+ vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4 (0.0f , vl );
234+ vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4 (0.0f , vl );
235+ vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4 (0.0f , vl );
237236
238237 // calculate step size
239- const size_t epr = __riscv_vsetvlmax_e16m1 ();
238+ const size_t epr = __riscv_vsetvlmax_e16m2 ();
240239 const size_t step = epr * 2 ;
241240 const int np = (n & ~(step - 1 ));
242241
243- // unroll by 2
242+ // unroll by 2 along the row dimension
244243 for (int i = 0 ; i < np ; i += step ) {
245- vfloat16m1_t ay0 = __riscv_vle16_v_f16m1 ((const _Float16 * )(y + i ), epr );
246- vfloat16m1_t ax0_0 = __riscv_vle16_v_f16m1 ((const _Float16 * )(x [0 ] + i ), epr );
247- vfloat16m1_t ax1_0 = __riscv_vle16_v_f16m1 ((const _Float16 * )(x [1 ] + i ), epr );
248- vsum0_0 = __riscv_vfwmacc_vv_f32m2 (vsum0_0 , ax0_0 , ay0 , epr );
249- vsum1_0 = __riscv_vfwmacc_vv_f32m2 (vsum1_0 , ax1_0 , ay0 , epr );
250-
251- vfloat16m1_t ay1 = __riscv_vle16_v_f16m1 ((const _Float16 * )(y + i + epr ), epr );
252- vfloat16m1_t ax0_1 = __riscv_vle16_v_f16m1 ((const _Float16 * )(x [0 ] + i + epr ), epr );
253- vfloat16m1_t ax1_1 = __riscv_vle16_v_f16m1 ((const _Float16 * )(x [1 ] + i + epr ), epr );
254- vsum0_1 = __riscv_vfwmacc_vv_f32m2 (vsum0_1 , ax0_1 , ay1 , epr );
255- vsum1_1 = __riscv_vfwmacc_vv_f32m2 (vsum1_1 , ax1_1 , ay1 , epr );
244+ vfloat16m2_t ay0 = __riscv_vle16_v_f16m2 ((const _Float16 * )(y + i ), epr );
245+ vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2 ((const _Float16 * )(x [0 ] + i ), epr );
246+ vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2 ((const _Float16 * )(x [1 ] + i ), epr );
247+ vsum0_0 = __riscv_vfwmacc_vv_f32m4 (vsum0_0 , ax0_0 , ay0 , epr );
248+ vsum1_0 = __riscv_vfwmacc_vv_f32m4 (vsum1_0 , ax1_0 , ay0 , epr );
249+
250+ vfloat16m2_t ay1 = __riscv_vle16_v_f16m2 ((const _Float16 * )(y + i + epr ), epr );
251+ vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2 ((const _Float16 * )(x [0 ] + i + epr ), epr );
252+ vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2 ((const _Float16 * )(x [1 ] + i + epr ), epr );
253+ vsum0_1 = __riscv_vfwmacc_vv_f32m4 (vsum0_1 , ax0_1 , ay1 , epr );
254+ vsum1_1 = __riscv_vfwmacc_vv_f32m4 (vsum1_1 , ax1_1 , ay1 , epr );
256255 }
257256
258- vfloat32m2_t vsum0 = __riscv_vfadd_vv_f32m2 (vsum0_0 , vsum0_1 , vl );
259-
260- vfloat32m2_t vsum1 = __riscv_vfadd_vv_f32m2 (vsum1_0 , vsum1_1 , vl );
257+ vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4 (vsum0_0 , vsum0_1 , vl );
258+ vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4 (vsum1_0 , vsum1_1 , vl );
261259
262260 // leftovers
263261 for (int i = np ; i < n ; i += vl ) {
264- vl = __riscv_vsetvl_e16m1 (n - i );
265- vfloat16m1_t ay = __riscv_vle16_v_f16m1 ((const _Float16 * )(y + i ), vl );
266- vfloat16m1_t ax0 = __riscv_vle16_v_f16m1 ((const _Float16 * )(x [0 ] + i ), vl );
267- vfloat16m1_t ax1 = __riscv_vle16_v_f16m1 ((const _Float16 * )(x [1 ] + i ), vl );
262+ vl = __riscv_vsetvl_e16m2 (n - i );
263+ vfloat16m2_t ay = __riscv_vle16_v_f16m2 ((const _Float16 * )(y + i ), vl );
264+ vfloat16m2_t ax0 = __riscv_vle16_v_f16m2 ((const _Float16 * )(x [0 ] + i ), vl );
265+ vfloat16m2_t ax1 = __riscv_vle16_v_f16m2 ((const _Float16 * )(x [1 ] + i ), vl );
268266
269- vsum0 = __riscv_vfwmacc_vv_f32m2 (vsum0 , ax0 , ay , vl );
270- vsum1 = __riscv_vfwmacc_vv_f32m2 (vsum1 , ax1 , ay , vl );
267+ vsum0 = __riscv_vfwmacc_vv_f32m4 (vsum0 , ax0 , ay , vl );
268+ vsum1 = __riscv_vfwmacc_vv_f32m4 (vsum1 , ax1 , ay , vl );
271269 }
272270
273271 // reduce
272+ vl = __riscv_vsetvlmax_e32m2 ();
273+ vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2 (__riscv_vget_v_f32m4_f32m2 (vsum0 , 0 ),
274+ __riscv_vget_v_f32m4_f32m2 (vsum0 , 1 ), vl );
274275 vl = __riscv_vsetvlmax_e32m1 ();
275- vfloat32m1_t acc0 = __riscv_vfadd_vv_f32m1 (__riscv_vget_v_f32m2_f32m1 (vsum0 , 0 ),
276- __riscv_vget_v_f32m2_f32m1 (vsum0 , 1 ), vl );
276+ vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1 (__riscv_vget_v_f32m2_f32m1 (acc0_0 , 0 ),
277+ __riscv_vget_v_f32m2_f32m1 (acc0_0 , 1 ), vl );
277278 vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1 (
278- acc0 , __riscv_vfmv_v_f_f32m1 (0.0f , 1 ), vl );
279+ acc0_1 , __riscv_vfmv_v_f_f32m1 (0.0f , 1 ), vl );
279280
280- vfloat32m1_t acc1 = __riscv_vfadd_vv_f32m1 (__riscv_vget_v_f32m2_f32m1 (vsum1 , 0 ),
281- __riscv_vget_v_f32m2_f32m1 (vsum1 , 1 ), vl );
281+ vl = __riscv_vsetvlmax_e32m2 ();
282+ vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2 (__riscv_vget_v_f32m4_f32m2 (vsum1 , 0 ),
283+ __riscv_vget_v_f32m4_f32m2 (vsum1 , 1 ), vl );
284+ vl = __riscv_vsetvlmax_e32m1 ();
285+ vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1 (__riscv_vget_v_f32m2_f32m1 (acc1_0 , 0 ),
286+ __riscv_vget_v_f32m2_f32m1 (acc1_0 , 1 ), vl );
282287 vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1 (
283- acc1 , __riscv_vfmv_v_f_f32m1 (0.0f , 1 ), vl );
288+ acc1_1 , __riscv_vfmv_v_f_f32m1 (0.0f , 1 ), vl );
284289 sumf [0 ] = __riscv_vfmv_f_s_f32m1_f32 (redsum0 );
285290 sumf [1 ] = __riscv_vfmv_f_s_f32m1_f32 (redsum1 );
291+
286292 #else
287293 const int np = (n & ~(GGML_F16_STEP - 1 ));
288294
@@ -554,7 +560,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
554560 // leftovers
555561 int vl ;
556562 for (int i = np ; i < n ; i += vl ) {
557- vl = __riscv_vsetvl_e16m2 (n - i );
563+ vl = __riscv_vsetvl_e16m4 (n - i );
558564 vfloat16m4_t ax0 = __riscv_vle16_v_f16m4 ((const _Float16 * )x + i , vl );
559565 vfloat16m4_t ay0 = __riscv_vle16_v_f16m4 ((const _Float16 * )y + i , vl );
560566 ay0 = __riscv_vfmacc_vf_f16m4 (ay0 , scale , ax0 , vl );
@@ -809,30 +815,30 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
809815 const _Float16 scale = * (const _Float16 * )(& s );
810816
811817 // calculate step size
812- const int epr = __riscv_vsetvlmax_e16m2 ();
818+ const int epr = __riscv_vsetvlmax_e16m4 ();
813819 const int step = epr * 2 ;
814820 const int np = (n & ~(step - 1 ));
815821
816822 // unroll by 2
817823 for (int i = 0 ; i < np ; i += step ) {
818- vfloat16m2_t ay0 = __riscv_vle16_v_f16m2 ((const _Float16 * )y + i , epr );
819- ay0 = __riscv_vfmul_vf_f16m2 (ay0 , scale , epr );
820- __riscv_vse16_v_f16m2 ((_Float16 * )y + i , ay0 , epr );
824+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4 ((const _Float16 * )y + i , epr );
825+ ay0 = __riscv_vfmul_vf_f16m4 (ay0 , scale , epr );
826+ __riscv_vse16_v_f16m4 ((_Float16 * )y + i , ay0 , epr );
821827 __asm__ __volatile__ ("" ::: "memory" );
822828
823- vfloat16m2_t ay1 = __riscv_vle16_v_f16m2 ((const _Float16 * )y + i + epr , epr );
824- ay1 = __riscv_vfmul_vf_f16m2 (ay1 , scale , epr );
825- __riscv_vse16_v_f16m2 ((_Float16 * )y + i + epr , ay1 , epr );
829+ vfloat16m4_t ay1 = __riscv_vle16_v_f16m4 ((const _Float16 * )y + i + epr , epr );
830+ ay1 = __riscv_vfmul_vf_f16m4 (ay1 , scale , epr );
831+ __riscv_vse16_v_f16m4 ((_Float16 * )y + i + epr , ay1 , epr );
826832 __asm__ __volatile__ ("" ::: "memory" );
827833 }
828834
829835 // leftovers
830836 int vl ;
831837 for (int i = np ; i < n ; i += vl ) {
832- vl = __riscv_vsetvl_e16m2 (n - i );
833- vfloat16m2_t ay0 = __riscv_vle16_v_f16m2 ((const _Float16 * )y + i , vl );
834- ay0 = __riscv_vfmul_vf_f16m2 (ay0 , scale , vl );
835- __riscv_vse16_v_f16m2 ((_Float16 * )y + i , ay0 , vl );
838+ vl = __riscv_vsetvl_e16m4 (n - i );
839+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4 ((const _Float16 * )y + i , vl );
840+ ay0 = __riscv_vfmul_vf_f16m4 (ay0 , scale , vl );
841+ __riscv_vse16_v_f16m4 ((_Float16 * )y + i , ay0 , vl );
836842 }
837843 #else
838844 const int np = (n & ~(GGML_F16_STEP - 1 ));
0 commit comments