@@ -224,13 +224,71 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     }
     GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
     GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
-#elif defined(__riscv_v_intrinsic)
-    // todo: RVV impl
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
-        }
-    }
+
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
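+    // RVV path (guarded on Zvfh so f16 vector arithmetic is available):
+    // vfwmacc widens each f16*f16 product and accumulates it in f32, using
+    // LMUL=2 f16 sources and LMUL=4 f32 accumulators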
+    size_t vl = __riscv_vsetvlmax_e32m4();
+
+    // initialize accumulators to all zeroes
+    vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+    // calculate step size
+    const size_t epr = __riscv_vsetvlmax_e16m2();
+    const size_t step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2 along the row dimension
+    for (int i = 0; i < np; i += step) {
+        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
+        vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
+        vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
+        vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
+        vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
+
+        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
+        vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
+        vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
+        vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
+        vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
+    }
+
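+    // combine the two unrolled partial sums into a single accumulator per row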
+    vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
+    vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
+
+    // leftovers
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
+
+        vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
+        vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
+    }
+
+    // reduce
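+    // fold each m4 accumulator: add its two m2 halves, then the two m1 halves,
+    // and finish with an unordered f32 sum reduction down to a scalar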
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
+    vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
+        acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
+    vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
+        acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+    sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
+    sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
+
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));

@@ -475,11 +533,38 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
         svst1_f16(pg, (__fp16 *)(y + np2), hy);
     }

-#elif defined(__riscv_v_intrinsic)
-    // todo: RVV impl
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
+    const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+    const _Float16 scale = *(const _Float16 *)(&s);
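+    // y[i] += x[i]*v in f16: convert the f32 scale to fp16 once and reinterpret
+    // its bit pattern as _Float16 for the vector-scalar fmacc below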
+
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m4();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (int i = 0; i < np; i += step) {
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16 *)x + i, epr);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16 *)y + i, epr);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i, ay0, epr);
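+        // empty asm with a "memory" clobber: a compiler-only barrier that keeps
+        // the two unrolled load/fmacc/store groups from being reordered or
+        // merged across this point (no instruction is emitted)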
+        __asm__ __volatile__("" ::: "memory");
+
+        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16 *)x + i + epr, epr);
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16 *)y + i + epr, epr);
+        ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i + epr, ay1, epr);
+        __asm__ __volatile__("" ::: "memory");
+    }
+
+    // leftovers
+    int vl;
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16 *)x + i, vl);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16 *)y + i, vl);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i, ay0, vl);
     }
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
@@ -725,11 +810,35 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
         svfloat16_t out = svmul_f16_m(pg, hy, vx);
         svst1_f16(pg, (__fp16 *)(y + np), out);
     }
-#elif defined(__riscv_v_intrinsic)
-    // todo: RVV impl
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
+    const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+    const _Float16 scale = *(const _Float16 *)(&s);
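+    // in-place y[i] *= v in f16, reusing the same fp16-bit reinterpretation of
+    // the scale as in ggml_vec_mad_f16 above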
+
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m4();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (int i = 0; i < np; i += step) {
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16 *)y + i, epr);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i, ay0, epr);
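+        // compiler-only memory barrier, as in ggml_vec_mad_f16 above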
+        __asm__ __volatile__("" ::: "memory");
+
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16 *)y + i + epr, epr);
+        ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i + epr, ay1, epr);
+        __asm__ __volatile__("" ::: "memory");
+    }
+
+    // leftovers
+    int vl;
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16 *)y + i, vl);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i, ay0, vl);
     }
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));