
Commit 1bda8fb

taimur-10x and rehan-10xengineer authored and committed
ggml: add floating-point kernels
Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>
1 parent 96128a9 commit 1bda8fb

3 files changed: +193 -40 lines

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 24 additions & 22 deletions
@@ -3296,31 +3296,33 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
+
 #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
     // calculate step size
-    const int epr = __riscv_vsetvlmax_e16m4();
+    const int epr = __riscv_vsetvlmax_e16m2();
     const int step = epr * 2;
     const int np = (n & ~(step - 1));
 
     // unroll by 2
     for (; i < np; i += step) {
-        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
-        vfloat32m8_t ay0 = __riscv_vfwcvt_f_f_v_f32m8(ax0, epr);
-        __riscv_vse32_v_f32m8(y + i, ay0, epr);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
 
-        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
-        vfloat32m8_t ay1 = __riscv_vfwcvt_f_f_v_f32m8(ax1, epr);
-        __riscv_vse32_v_f32m8(y + i + epr, ay1, epr);
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvt_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
     }
 
     // leftovers
     int vl;
     for (i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m4(n - i);
-        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
-        vfloat32m8_t ay0 = __riscv_vfwcvt_f_f_v_f32m8(ax0, vl);
-        __riscv_vse32_v_f32m8(y + i, ay0, vl);
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
     }
+
 #endif
 
     for (; i < n; ++i) {
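For reference, the RVV block above (now using LMUL=2 loads widened into LMUL=4 stores) vectorizes the same half-to-single widening that the scalar tail performs one element at a time. A minimal standalone sketch of that reference behaviour, assuming only a compiler that provides the _Float16 type (which the intrinsics above already require):

#include <stddef.h>

// Scalar reference for the widening conversion: each fp16 element is
// promoted to fp32. The RVV path does the same thing `epr` lanes at a
// time, two vectors per loop iteration.
static void fp16_to_fp32_ref(const _Float16 * x, float * y, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        y[i] = (float) x[i];
    }
}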
@@ -3367,28 +3369,28 @@ void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
     }
 #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfmin)
     // calculate step size
-    const int epr = __riscv_vsetvlmax_e16m4();
+    const int epr = __riscv_vsetvlmax_e16m2();
     const int step = epr * 2;
     const int np = (n & ~(step - 1));
 
     // unroll by 2
     for (; i < np; i += step) {
-        vbfloat16m4_t ax0 = __riscv_vle16_v_bf16m4((const __bf16*)x + i, epr);
-        vfloat32m8_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m8(ax0, epr);
-        __riscv_vse32_v_f32m8(y + i, ay0, epr);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
 
-        vbfloat16m4_t ax1 = __riscv_vle16_v_bf16m4((const __bf16*)x + i + epr, epr);
-        vfloat32m8_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m8(ax1, epr);
-        __riscv_vse32_v_f32m8(y + i + epr, ay1, epr);
+        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
     }
 
     // leftovers
     int vl;
     for (i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m4(n - i);
-        vbfloat16m4_t ax0 = __riscv_vle16_v_bf16m4((const __bf16*)x + i, vl);
-        vfloat32m8_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m8(ax0, vl);
-        __riscv_vse32_v_f32m8(y + i, ay0, vl);
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
     }
 #endif
     for (; i < n; i++) {
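The bf16 hunk mirrors the fp16 one; the widening it vectorizes amounts to placing the 16 bf16 bits into the upper half of an fp32 word. A hedged standalone sketch of that conversion (helper names are illustrative, not ggml's):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// bf16 stores the upper 16 bits of an IEEE-754 binary32 value, so the
// widening conversion is a 16-bit left shift into an fp32 bit pattern.
static float bf16_bits_to_fp32(uint16_t h) {
    uint32_t bits = (uint32_t) h << 16;
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}

static void bf16_to_fp32_ref(const uint16_t * x, float * y, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        y[i] = bf16_bits_to_fp32(x[i]);
    }
}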

ggml/src/ggml-cpu/vec.cpp

Lines changed: 43 additions & 1 deletion
@@ -195,8 +195,50 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
     sumf += (ggml_float)_mm_cvtss_f32(g);
 
 #undef LOAD
-#endif
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfwma)
+    size_t vl = __riscv_vsetvlmax_e32m4();
+
+    // initialize accumulators to all zeroes
+    vfloat32m4_t vsum0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+    // calculate step size
+    const size_t epr = __riscv_vsetvlmax_e16m2();
+    const size_t step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], epr);
+        vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], epr);
+        vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i + epr], epr);
+        vbfloat16m2_t ay1 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i + epr], epr);
+        vsum1 = __riscv_vfwmaccbf16_vv_f32m4(vsum1, ax1, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+    }
 
+    // accumulate in 1 register
+    vsum0 = __riscv_vfadd_vv_f32m4(vsum0, vsum1, vl);
+
+    // leftovers
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], vl);
+        vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], vl);
+        vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, vl);
+    }
+
+    // reduce
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0), __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
+    vfloat32m1_t acc1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0, 0), __riscv_vget_v_f32m2_f32m1(acc0, 1), vl);
+    vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m1_f32m1(acc1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+    sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
+
+#endif
     for (; i < n; ++i) {
         sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
                              GGML_BF16_TO_FP32(y[i]));
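The new zvfbfwma block computes the same dot product as the scalar tail that follows it, using two widening-FMA accumulators (vsum0/vsum1) that are folded together and tree-reduced at the end. A scalar sketch of that accumulation pattern (standalone; the helper is illustrative, not ggml's GGML_BF16_TO_FP32):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static float bf16_to_f32(uint16_t h) {              // bf16 = high half of an fp32
    uint32_t bits = (uint32_t) h << 16;
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}

// Two partial sums mimic vsum0/vsum1 in the unrolled RVV loop; they are
// added together before the final reduction, as in the intrinsics above.
static float dot_bf16_ref(size_t n, const uint16_t * x, const uint16_t * y) {
    double sum0 = 0.0, sum1 = 0.0;
    size_t i = 0;
    for (; i + 1 < n; i += 2) {
        sum0 += (double) bf16_to_f32(x[i])     * bf16_to_f32(y[i]);
        sum1 += (double) bf16_to_f32(x[i + 1]) * bf16_to_f32(y[i + 1]);
    }
    for (; i < n; ++i) {                             // leftovers
        sum0 += (double) bf16_to_f32(x[i]) * bf16_to_f32(y[i]);
    }
    return (float) (sum0 + sum1);
}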

ggml/src/ggml-cpu/vec.h

Lines changed: 126 additions & 17 deletions
@@ -224,13 +224,71 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     }
     GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
     GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
-#elif defined(__riscv_v_intrinsic)
-    // todo: RVV impl
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
-        }
-    }
+
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
+    size_t vl = __riscv_vsetvlmax_e32m4();
+
+    // initialize accumulators to all zeroes
+    vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+    // calculate step size
+    const size_t epr = __riscv_vsetvlmax_e16m2();
+    const size_t step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2 along the row dimension
+    for (int i = 0; i < np; i += step) {
+        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
+        vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
+        vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
+        vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
+        vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
+
+        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
+        vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
+        vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
+        vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
+        vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
+    }
+
+    vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
+    vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
+
+    // leftovers
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
+
+        vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
+        vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
+    }
+
+    // reduce
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
+    vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
+        acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
+    vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
+        acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+    sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
+    sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
+
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
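The zvfh block above replaces the scalar TODO loop and hard-codes two rows: it accumulates x[0]·y and x[1]·y in separate widening-FMA register groups and reduces each into sumf[0] and sumf[1]. A standalone scalar sketch of that two-row dot product (assuming _Float16 support; names are illustrative):

#include <stddef.h>

// Two rows x0 and x1 share the same y vector, as in the RVV path above;
// each lane of y is loaded once and used for both accumulators.
static void dot_f16_two_rows_ref(size_t n,
                                 const _Float16 * x0, const _Float16 * x1,
                                 const _Float16 * y, float sumf[2]) {
    double s0 = 0.0, s1 = 0.0;
    for (size_t i = 0; i < n; ++i) {
        const float yi = (float) y[i];
        s0 += (double) ((float) x0[i] * yi);
        s1 += (double) ((float) x1[i] * yi);
    }
    sumf[0] = (float) s0;
    sumf[1] = (float) s1;
}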
@@ -475,11 +533,38 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
         svst1_f16(pg, (__fp16 *)(y + np2), hy);
     }
 
-#elif defined(__riscv_v_intrinsic)
-    // todo: RVV impl
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
+    const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+    const _Float16 scale = *(const _Float16*)(&s);
+
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m4();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (int i = 0; i < np; i += step) {
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+        ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+    }
+
+    // leftovers
+    int vl;
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i , vl);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
     }
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
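ggml_vec_mad_f16 is an axpy-style update, y[i] += x[i]*v. The new path converts v to fp16 once and keeps the multiply-accumulate in half precision via __riscv_vfmacc_vf_f16m4, whereas the removed fallback converted every element to fp32 and back. A standalone scalar sketch of the half-precision variant (assuming _Float16 support):

#include <stddef.h>

static void mad_f16_ref(size_t n, _Float16 * y, const _Float16 * x, float v) {
    const _Float16 scale = (_Float16) v;   // convert the scale once, as the RVV path does
    for (size_t i = 0; i < n; ++i) {
        y[i] = y[i] + x[i] * scale;        // fp16 multiply-accumulate
    }
}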
@@ -725,11 +810,35 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
         svfloat16_t out = svmul_f16_m(pg, hy, vx);
         svst1_f16(pg, (__fp16 *)(y + np), out);
     }
-#elif defined(__riscv_v_intrinsic)
-    // todo: RVV impl
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
+    const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+    const _Float16 scale = *(const _Float16*)(&s);
+
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m4();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (int i = 0; i < np; i += step) {
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+        ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+    }
+
+    // leftovers
+    int vl;
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
     }
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
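Likewise, ggml_vec_scale_f16 scales y in place, y[i] *= v, now with an fp16 vector-scalar multiply (__riscv_vfmul_vf_f16m4) instead of a per-element fp32 round trip. A matching scalar sketch (assuming _Float16 support):

#include <stddef.h>

static void scale_f16_ref(size_t n, _Float16 * y, float v) {
    const _Float16 scale = (_Float16) v;   // single fp32 -> fp16 conversion of the scale
    for (size_t i = 0; i < n; ++i) {
        y[i] = y[i] * scale;               // fp16 multiply, stored back in place
    }
}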
