@@ -698,60 +698,61 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
698698}
699699
700700inline static void ggml_vec_scale_f16 (const int n , ggml_fp16_t * y , const float v ) {
701- #if defined(GGML_SIMD )
702- #if defined(__ARM_FEATURE_SVE )
703- const int sve_register_length = svcntb () * 8 ;
704- const int ggml_f16_epr = sve_register_length / 16 ;
705- const int ggml_f16_step = 2 * ggml_f16_epr ;
706-
707- GGML_F16x_VEC vx = GGML_F16x_VEC_SET1 (v );
708- const int np = (n & ~(ggml_f16_step - 1 ));
709- svfloat16_t ay1 , ay2 ;
710-
711- for (int i = 0 ; i < np ; i += ggml_f16_step ) {
712- ay1 = GGML_F16x_VEC_LOAD (y + i + 0 * ggml_f16_epr , 0 );
713- ay1 = GGML_F16x_VEC_MUL (ay1 , vx );
714- GGML_F16x_VEC_STORE (y + i + 0 * ggml_f16_epr , ay1 , 0 );
715-
716- ay2 = GGML_F16x_VEC_LOAD (y + i + 1 * ggml_f16_epr , 1 );
717- ay2 = GGML_F16x_VEC_MUL (ay2 , vx );
718- GGML_F16x_VEC_STORE (y + i + 1 * ggml_f16_epr , ay2 , 1 );
719- }
720- // leftovers
721- // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
722- if (np < n ) {
723- svbool_t pg = svwhilelt_b16 (np , n );
724- svfloat16_t hy = svld1_f16 (pg , (__fp16 * )(y + np ));
725- svfloat16_t out = svmul_f16_m (pg , hy , vx );
726- svst1_f16 (pg , (__fp16 * )(y + np ), out );
727- }
728- #elif defined(__riscv_v_intrinsic )
729- // todo: RVV impl
730- // scalar
731- for (int i = 0 ; i < n ; ++ i ) {
732- y [i ] = GGML_CPU_FP32_TO_FP16 (GGML_CPU_FP16_TO_FP32 (y [i ])* v );
733- }
734- #else
735- const int np = (n & ~(GGML_F16_STEP - 1 ));
701+ #if defined(GGML_SIMD ) && defined(__ARM_FEATURE_SVE )
702+ const int sve_register_length = svcntb () * 8 ;
703+ const int ggml_f16_epr = sve_register_length / 16 ;
704+ const int ggml_f16_step = 2 * ggml_f16_epr ;
705+
706+ GGML_F16x_VEC vx = GGML_F16x_VEC_SET1 (v );
707+ const int np = (n & ~(ggml_f16_step - 1 ));
708+ svfloat16_t ay1 , ay2 ;
709+
710+ for (int i = 0 ; i < np ; i += ggml_f16_step ) {
711+ ay1 = GGML_F16x_VEC_LOAD (y + i + 0 * ggml_f16_epr , 0 );
712+ ay1 = GGML_F16x_VEC_MUL (ay1 , vx );
713+ GGML_F16x_VEC_STORE (y + i + 0 * ggml_f16_epr , ay1 , 0 );
714+
715+ ay2 = GGML_F16x_VEC_LOAD (y + i + 1 * ggml_f16_epr , 1 );
716+ ay2 = GGML_F16x_VEC_MUL (ay2 , vx );
717+ GGML_F16x_VEC_STORE (y + i + 1 * ggml_f16_epr , ay2 , 1 );
718+ }
719+ // leftovers
 720+ // maximum number of leftover elements will be less than ggml_f16_epr. Apply predicated svmul on available elements only
721+ if (np < n ) {
722+ svbool_t pg = svwhilelt_b16 (np , n );
723+ svfloat16_t hy = svld1_f16 (pg , (__fp16 * )(y + np ));
724+ svfloat16_t out = svmul_f16_m (pg , hy , vx );
725+ svst1_f16 (pg , (__fp16 * )(y + np ), out );
726+ }
727+ #elif defined(__riscv_v_intrinsic ) && defined(__riscv_zvfh )
728+ for (int i = 0 , vl ; i < n ; i += vl ) {
729+ vl = __riscv_vsetvl_e16m2 (n - i );
730+ vfloat16m2_t vy = __riscv_vle16_v_f16m2 ((_Float16 * )& y [i ], vl );
731+ vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4 (vy , vl );
732+ vy32 = __riscv_vfmul_vf_f32m4 (vy32 , v , vl );
733+ vy = __riscv_vfncvt_f_f_w_f16m2 (vy32 , vl );
734+ __riscv_vse16_v_f16m2 ((_Float16 * )& y [i ], vy , vl );
735+ }
736+ #elif defined(GGML_SIMD )
737+ const int np = (n & ~(GGML_F16_STEP - 1 ));
736738
737- GGML_F16_VEC vx = GGML_F16_VEC_SET1 (v );
739+ GGML_F16_VEC vx = GGML_F16_VEC_SET1 (v );
738740
739- GGML_F16_VEC ay [GGML_F16_ARR ];
741+ GGML_F16_VEC ay [GGML_F16_ARR ];
740742
741- for (int i = 0 ; i < np ; i += GGML_F16_STEP ) {
742- for (int j = 0 ; j < GGML_F16_ARR ; j ++ ) {
743- ay [j ] = GGML_F16_VEC_LOAD (y + i + j * GGML_F16_EPR , j );
744- ay [j ] = GGML_F16_VEC_MUL (ay [j ], vx );
743+ for (int i = 0 ; i < np ; i += GGML_F16_STEP ) {
744+ for (int j = 0 ; j < GGML_F16_ARR ; j ++ ) {
745+ ay [j ] = GGML_F16_VEC_LOAD (y + i + j * GGML_F16_EPR , j );
746+ ay [j ] = GGML_F16_VEC_MUL (ay [j ], vx );
745747
746- GGML_F16_VEC_STORE (y + i + j * GGML_F16_EPR , ay , j );
747- }
748+ GGML_F16_VEC_STORE (y + i + j * GGML_F16_EPR , ay , j );
748749 }
750+ }
749751
750- // leftovers
751- for (int i = np ; i < n ; ++ i ) {
752- y [i ] = GGML_CPU_FP32_TO_FP16 (GGML_CPU_FP16_TO_FP32 (y [i ])* v );
753- }
754- #endif
752+ // leftovers
753+ for (int i = np ; i < n ; ++ i ) {
754+ y [i ] = GGML_CPU_FP32_TO_FP16 (GGML_CPU_FP16_TO_FP32 (y [i ])* v );
755+ }
755756#else
756757 // scalar
757758 for (int i = 0 ; i < n ; ++ i ) {
0 commit comments