
Commit 5be353e

ggml-cpu:add RISC-V RVV (Zvfh) optimization for FP16 vector scaling (#17314)
* ggml-cpu:add RISC-V RVV (Zvfh) optimization for FP16 vector scaling

Signed-off-by: Wang Yang <yangwang@iscas.ac.cn>

* fix comment

* fix comment 2

---------

Signed-off-by: Wang Yang <yangwang@iscas.ac.cn>
1 parent 7d77f07 commit 5be353e
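
For context: the change replaces the scalar RISC-V fallback in ggml_vec_scale_f16 with a strip-mined RVV loop that widens each FP16 lane to FP32, multiplies by the scalar v, and narrows back to FP16. Below is a minimal, self-contained sketch of that widen-multiply-narrow pattern. It is illustrative only, not part of the patch: it assumes a toolchain with the RVV C intrinsics and the Zvfh extension (e.g. compiled with -march=rv64gcv_zvfh), and the function name scale_f16_rvv plus the tiny test in main are made up for the example.

    // Sketch only: same widen -> multiply -> narrow pattern as the patched RVV path.
    // Assumes <riscv_vector.h> intrinsics and Zvfh, e.g. built with -march=rv64gcv_zvfh.
    #include <riscv_vector.h>
    #include <stdio.h>

    static void scale_f16_rvv(int n, _Float16 * y, float v) {
        for (int i = 0, vl; i < n; i += vl) {
            vl = __riscv_vsetvl_e16m2(n - i);                        // lanes this pass; clamps the tail
            vfloat16m2_t vy   = __riscv_vle16_v_f16m2(&y[i], vl);    // load FP16
            vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl);  // widen FP16 -> FP32 (m2 -> m4)
            vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl);              // scale in FP32
            vy   = __riscv_vfncvt_f_f_w_f16m2(vy32, vl);             // narrow back to FP16
            __riscv_vse16_v_f16m2(&y[i], vy, vl);                    // store FP16
        }
    }

    int main(void) {
        _Float16 y[7] = {1, 2, 3, 4, 5, 6, 7};
        scale_f16_rvv(7, y, 0.5f);
        for (int i = 0; i < 7; ++i) {
            printf("y[%d] = %.2f\n", i, (double)(float)y[i]);        // expect 0.50 1.00 ... 3.50
        }
        return 0;
    }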

File tree

1 file changed: +49 -48 lines changed

ggml/src/ggml-cpu/vec.h

Lines changed: 49 additions & 48 deletions
@@ -698,60 +698,61 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 }

 inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
-#if defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
-        const int sve_register_length = svcntb() * 8;
-        const int ggml_f16_epr = sve_register_length / 16;
-        const int ggml_f16_step = 2 * ggml_f16_epr;
-
-        GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
-        const int np = (n & ~(ggml_f16_step - 1));
-        svfloat16_t ay1, ay2;
-
-        for (int i = 0; i < np; i += ggml_f16_step) {
-            ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
-            ay1 = GGML_F16x_VEC_MUL(ay1, vx);
-            GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
-
-            ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
-            ay2 = GGML_F16x_VEC_MUL(ay2, vx);
-            GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
-        }
-        // leftovers
-        // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
-        if (np < n) {
-            svbool_t pg = svwhilelt_b16(np, n);
-            svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
-            svfloat16_t out = svmul_f16_m(pg, hy, vx);
-            svst1_f16(pg, (__fp16 *)(y + np), out);
-        }
-    #elif defined(__riscv_v_intrinsic)
-        // todo: RVV impl
-        // scalar
-        for (int i = 0; i < n; ++i) {
-            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
-        }
-    #else
-        const int np = (n & ~(GGML_F16_STEP - 1));
+#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
+    const int sve_register_length = svcntb() * 8;
+    const int ggml_f16_epr = sve_register_length / 16;
+    const int ggml_f16_step = 2 * ggml_f16_epr;
+
+    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
+    const int np = (n & ~(ggml_f16_step - 1));
+    svfloat16_t ay1, ay2;
+
+    for (int i = 0; i < np; i += ggml_f16_step) {
+        ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
+        ay1 = GGML_F16x_VEC_MUL(ay1, vx);
+        GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
+
+        ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
+        ay2 = GGML_F16x_VEC_MUL(ay2, vx);
+        GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
+    }
+    // leftovers
+    // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
+    if (np < n) {
+        svbool_t pg = svwhilelt_b16(np, n);
+        svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
+        svfloat16_t out = svmul_f16_m(pg, hy, vx);
+        svst1_f16(pg, (__fp16 *)(y + np), out);
+    }
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
+    for (int i = 0, vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t vy = __riscv_vle16_v_f16m2((_Float16 *)&y[i], vl);
+        vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl);
+        vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl);
+        vy = __riscv_vfncvt_f_f_w_f16m2(vy32, vl);
+        __riscv_vse16_v_f16m2((_Float16 *)&y[i], vy, vl);
+    }
+#elif defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));

-        GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

-        GGML_F16_VEC ay[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];

-        for (int i = 0; i < np; i += GGML_F16_STEP) {
-            for (int j = 0; j < GGML_F16_ARR; j++) {
-                ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-                ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);

-                GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
-            }
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
         }
+    }

-        // leftovers
-        for (int i = np; i < n; ++i) {
-            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
-        }
-    #endif
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+    }
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
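
A few notes on the new RVV path as it reads in the diff (my reading, not stated in the commit message): the guard checks both __riscv_v_intrinsic and __riscv_zvfh because the FP16 vector loads/stores and the FP16<->FP32 conversions need FP16 vector support on top of the base V extension. The multiply is carried out in FP32 after a widening convert (e16/m2 registers widen into e32/m4), which matches the rounding behaviour of the old scalar fallback (FP16 to FP32, multiply, back to FP16). And since __riscv_vsetvl_e16m2(n - i) clamps the vector length on the final iteration (for example, with VLEN = 128 bits, e16/m2 gives at most 16 lanes per pass, so n = 100 is processed as six passes of 16 plus one of 4), the strip-mined loop handles the tail itself and needs no scalar leftover loop, unlike the fixed-width GGML_F16_STEP path, which keeps its leftover loop.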
