@@ -226,84 +226,66 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
 
 #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
-    size_t vl = __riscv_vsetvlmax_e32m2();
+    size_t vl = __riscv_vsetvlmax_e32m4();
 
     // initialize accumulators to all zeroes
-    vfloat32m2_t vsum0_0 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum0_1 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum0_2 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum0_3 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum1_0 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum1_1 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum1_2 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum1_3 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
+    vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
 
     // calculate step size
-    const size_t epr = __riscv_vsetvlmax_e16m1();
-    const size_t step = epr * 4;
+    const size_t epr = __riscv_vsetvlmax_e16m2();
+    const size_t step = epr * 2;
     const int np = (n & ~(step - 1));
 
-    // unroll by 4
+    // unroll by 2 along the row dimension
     for (int i = 0; i < np; i += step) {
-        vfloat16m1_t ay0 = __riscv_vle16_v_f16m1((const _Float16 *)(y + i), epr);
-        vfloat16m1_t ax0_0 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i), epr);
-        vfloat16m1_t ax1_0 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i), epr);
-        vsum0_0 = __riscv_vfwmacc_vv_f32m2(vsum0_0, ax0_0, ay0, epr);
-        vsum1_0 = __riscv_vfwmacc_vv_f32m2(vsum1_0, ax1_0, ay0, epr);
-        __asm__ __volatile__("" ::: "memory");
-
-        vfloat16m1_t ay1 = __riscv_vle16_v_f16m1((const _Float16 *)(y + i + epr), epr);
-        vfloat16m1_t ax0_1 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i + epr), epr);
-        vfloat16m1_t ax1_1 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i + epr), epr);
-        vsum0_1 = __riscv_vfwmacc_vv_f32m2(vsum0_1, ax0_1, ay1, epr);
-        vsum1_1 = __riscv_vfwmacc_vv_f32m2(vsum1_1, ax1_1, ay1, epr);
-        __asm__ __volatile__("" ::: "memory");
-
-        vfloat16m1_t ay2 = __riscv_vle16_v_f16m1((const _Float16 *)(y + i + 2 * epr), epr);
-        vfloat16m1_t ax0_2 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i + 2 * epr), epr);
-        vfloat16m1_t ax1_2 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i + 2 * epr), epr);
-        vsum0_2 = __riscv_vfwmacc_vv_f32m2(vsum0_2, ax0_2, ay2, epr);
-        vsum1_2 = __riscv_vfwmacc_vv_f32m2(vsum1_2, ax1_2, ay2, epr);
-        __asm__ __volatile__("" ::: "memory");
-
-        vfloat16m1_t ay3 = __riscv_vle16_v_f16m1((const _Float16 *)(y + i + 3 * epr), epr);
-        vfloat16m1_t ax0_3 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i + 3 * epr), epr);
-        vfloat16m1_t ax1_3 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i + 3 * epr), epr);
-        vsum0_3 = __riscv_vfwmacc_vv_f32m2(vsum0_3, ax0_3, ay3, epr);
-        vsum1_3 = __riscv_vfwmacc_vv_f32m2(vsum1_3, ax1_3, ay3, epr);
-        __asm__ __volatile__("" ::: "memory");
+        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
+        vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
+        vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
+        vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
+        vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
+
+        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
+        vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
+        vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
+        vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
+        vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
     }
 
-    vfloat32m2_t vsum0_01 = __riscv_vfadd_vv_f32m2(vsum0_0, vsum0_1, vl);
-    vfloat32m2_t vsum0_23 = __riscv_vfadd_vv_f32m2(vsum0_2, vsum0_3, vl);
-    vfloat32m2_t vsum0 = __riscv_vfadd_vv_f32m2(vsum0_01, vsum0_23, vl);
-
-    vfloat32m2_t vsum1_01 = __riscv_vfadd_vv_f32m2(vsum1_0, vsum1_1, vl);
-    vfloat32m2_t vsum1_23 = __riscv_vfadd_vv_f32m2(vsum1_2, vsum1_3, vl);
-    vfloat32m2_t vsum1 = __riscv_vfadd_vv_f32m2(vsum1_01, vsum1_23, vl);
+    vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
+    vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
 
     // leftovers
     for (int i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m1(n - i);
-        vfloat16m1_t ay = __riscv_vle16_v_f16m1((const _Float16 *)(y + i), vl);
-        vfloat16m1_t ax0 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i), vl);
-        vfloat16m1_t ax1 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i), vl);
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
 
-        vsum0 = __riscv_vfwmacc_vv_f32m2(vsum0, ax0, ay, vl);
-        vsum1 = __riscv_vfwmacc_vv_f32m2(vsum1, ax1, ay, vl);
+        vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
+        vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
     }
 
     // reduce
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
     vl = __riscv_vsetvlmax_e32m1();
-    vfloat32m1_t acc0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum0, 0),
-                                               __riscv_vget_v_f32m2_f32m1(vsum0, 1), vl);
+    vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
     vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
-        acc0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+        acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
 
-    vfloat32m1_t acc1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum1, 0),
-                                               __riscv_vget_v_f32m2_f32m1(vsum1, 1), vl);
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
     vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
-        acc1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+        acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
     sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
     sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
 
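For orientation, a scalar sketch of what this hunk computes: the two unrolled rows x[0] and x[1] are dotted against the same y vector, each FP16 product widened and accumulated in float, which is what the vfwmacc loops above vectorize. The helper name below is hypothetical and not part of ggml; it assumes the same _Float16 support (__riscv_zvfh or an equivalent half-precision extension) that the intrinsic path already requires.

// Reference-only sketch (not from the commit): scalar equivalent of the
// two-row FP16 dot product vectorized above.
static void vec_dot_f16_x2_ref(const int n, float sumf[2],
                               const ggml_fp16_t * x0,
                               const ggml_fp16_t * x1,
                               const ggml_fp16_t * y) {
    float s0 = 0.0f;
    float s1 = 0.0f;
    for (int i = 0; i < n; ++i) {
        const float yi = (float)((const _Float16 *)y)[i];  // widen the y element once
        s0 += (float)((const _Float16 *)x0)[i] * yi;       // row 0 accumulator
        s1 += (float)((const _Float16 *)x1)[i] * yi;       // row 1 reuses the same y element
    }
    sumf[0] = s0;
    sumf[1] = s1;
}

The reduction at the end of the hunk folds each f32m4 accumulator into f32m2, then f32m1, before vfredusum produces the scalar written to sumf.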
@@ -556,31 +538,33 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     const _Float16 scale = *(const _Float16 *)(&s);
 
     // calculate step size
-    const int epr = __riscv_vsetvlmax_e16m2();
+    const int epr = __riscv_vsetvlmax_e16m4();
     const int step = epr * 2;
     const int np = (n & ~(step - 1));
 
     // unroll by 2
     for (int i = 0; i < np; i += step) {
-        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)x + i, epr);
-        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)y + i, epr);
-        ay0 = __riscv_vfmacc_vf_f16m2(ay0, scale, ax0, epr);
-        __riscv_vse16_v_f16m2((_Float16 *)y + i, ay0, epr);
-
-        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)x + i + epr, epr);
-        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)y + i + epr, epr);
-        ay1 = __riscv_vfmacc_vf_f16m2(ay1, scale, ax1, epr);
-        __riscv_vse16_v_f16m2((_Float16 *)y + i + epr, ay1, epr);
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16 *)x + i, epr);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16 *)y + i, epr);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16 *)x + i + epr, epr);
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16 *)y + i + epr, epr);
+        ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i + epr, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
     }
 
     // leftovers
     int vl;
     for (int i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m2(n - i);
-        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)x + i, vl);
-        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)y + i, vl);
-        ay0 = __riscv_vfmacc_vf_f16m2(ay0, scale, ax0, vl);
-        __riscv_vse16_v_f16m2((_Float16 *)y + i, ay0, vl);
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16 *)x + i, vl);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16 *)y + i, vl);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i, ay0, vl);
     }
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
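The ggml_vec_mad_f16 hunk above applies the same LMUL bump (m2 to m4) to an FP16 axpy. As a reference-only sketch (hypothetical helper name, same _Float16 assumption as before), the operation the vfmacc_vf loop performs per chunk is:

// Reference-only sketch (not from the commit): y[i] += scale * x[i] in half
// precision, matching the f16m4 vfmacc_vf path above.
static void vec_mad_f16_ref(const int n, ggml_fp16_t * y,
                            const ggml_fp16_t * x, const _Float16 scale) {
    for (int i = 0; i < n; ++i) {
        ((_Float16 *)y)[i] += scale * ((const _Float16 *)x)[i];
    }
}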
@@ -831,30 +815,30 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
     const _Float16 scale = *(const _Float16 *)(&s);
 
     // calculate step size
-    const int epr = __riscv_vsetvlmax_e16m2();
+    const int epr = __riscv_vsetvlmax_e16m4();
     const int step = epr * 2;
     const int np = (n & ~(step - 1));
 
     // unroll by 2
     for (int i = 0; i < np; i += step) {
-        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)y + i, epr);
-        ay0 = __riscv_vfmul_vf_f16m2(ay0, scale, epr);
-        __riscv_vse16_v_f16m2((_Float16 *)y + i, ay0, epr);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16 *)y + i, epr);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i, ay0, epr);
         __asm__ __volatile__ ("" ::: "memory");
 
-        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)y + i + epr, epr);
-        ay1 = __riscv_vfmul_vf_f16m2(ay1, scale, epr);
-        __riscv_vse16_v_f16m2((_Float16 *)y + i + epr, ay1, epr);
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16 *)y + i + epr, epr);
+        ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i + epr, ay1, epr);
         __asm__ __volatile__ ("" ::: "memory");
     }
 
     // leftovers
     int vl;
     for (int i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m2(n - i);
-        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)y + i, vl);
-        ay0 = __riscv_vfmul_vf_f16m2(ay0, scale, vl);
-        __riscv_vse16_v_f16m2((_Float16 *)y + i, ay0, vl);
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16 *)y + i, vl);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i, ay0, vl);
     }
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
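Likewise, ggml_vec_scale_f16 above is an in-place FP16 scaling. A reference-only scalar sketch of the operation (hypothetical helper name, same _Float16 assumption as before):

// Reference-only sketch (not from the commit): y[i] *= scale in half
// precision, matching the f16m4 vfmul_vf path above.
static void vec_scale_f16_ref(const int n, ggml_fp16_t * y, const _Float16 scale) {
    for (int i = 0; i < n; ++i) {
        ((_Float16 *)y)[i] *= scale;
    }
}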