
Commit ffbab18

ggml-cpu: change lmul, unrolling for kernels
1 parent a423cf3 commit ffbab18
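For context: in the RISC-V Vector extension, LMUL groups 1, 2, 4, or 8 architectural vector registers into a single operand, so a larger LMUL covers more elements per instruction but leaves fewer independent register groups for unrolling. This commit retunes that trade-off kernel by kernel: the fp16/bf16-to-fp32 conversions drop from e16m4 to e16m2, the unrolled dot product trades a 4-way unroll at e16m1 for a 2-way unroll at e16m2, and the mad/scale kernels rise from e16m2 to e16m4. A minimal sketch of how LMUL sets the per-iteration element count, assuming a toolchain targeting an RVV 1.0 machine (e.g. -march=rv64gcv):

#include <riscv_vector.h>
#include <stdio.h>

// vsetvlmax returns (VLEN / SEW) * LMUL: halving LMUL halves the
// elements covered per vector op, but doubles the number of register
// groups left over for independent accumulators.
int main(void) {
    printf("e16m1: %zu elems/op\n", __riscv_vsetvlmax_e16m1());
    printf("e16m2: %zu elems/op\n", __riscv_vsetvlmax_e16m2());
    printf("e16m4: %zu elems/op\n", __riscv_vsetvlmax_e16m4());
    return 0;
}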

File tree

2 files changed: +92 -106 lines changed


ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 24 additions & 22 deletions
@@ -3274,31 +3274,33 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
+
 #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
     // calculate step size
-    const int epr = __riscv_vsetvlmax_e16m4();
+    const int epr = __riscv_vsetvlmax_e16m2();
     const int step = epr * 2;
     const int np = (n & ~(step - 1));
 
     // unroll by 2
     for (; i < np; i += step) {
-        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
-        vfloat32m8_t ay0 = __riscv_vfwcvt_f_f_v_f32m8(ax0, epr);
-        __riscv_vse32_v_f32m8(y + i, ay0, epr);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
 
-        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
-        vfloat32m8_t ay1 = __riscv_vfwcvt_f_f_v_f32m8(ax1, epr);
-        __riscv_vse32_v_f32m8(y + i + epr, ay1, epr);
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvt_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
     }
 
     // leftovers
     int vl;
     for (i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m4(n - i);
-        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
-        vfloat32m8_t ay0 = __riscv_vfwcvt_f_f_v_f32m8(ax0, vl);
-        __riscv_vse32_v_f32m8(y + i, ay0, vl);
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
     }
+
 #endif
 
     for (; i < n; ++i) {
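
A widening convert produces 32-bit results from 16-bit inputs, so the destination register group is twice the source LMUL: loading at e16m2 pairs with an f32m4 result (previously e16m4 paired with f32m8, and an m8 group ties up 8 of the 32 vector registers per value). A standalone sketch of the same strip-mined pattern at the new LMUL, assuming Zvfhmin and using only the intrinsics that appear in the hunk (fp16_to_fp32_rvv is a hypothetical name):

#include <riscv_vector.h>

// Strip-mined fp16 -> fp32 conversion: vsetvl clamps the final pass
// to the remaining element count, so no scalar tail loop is needed.
static void fp16_to_fp32_rvv(const _Float16 *x, float *y, int n) {
    int vl;
    for (int i = 0; i < n; i += vl) {
        vl = __riscv_vsetvl_e16m2(n - i);                   // elements this pass
        vfloat16m2_t a = __riscv_vle16_v_f16m2(x + i, vl);  // load fp16
        vfloat32m4_t w = __riscv_vfwcvt_f_f_v_f32m4(a, vl); // widen to fp32
        __riscv_vse32_v_f32m4(y + i, w, vl);                // store fp32
    }
}
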
@@ -3345,28 +3347,28 @@ void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
     }
 #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfmin)
     // calculate step size
-    const int epr = __riscv_vsetvlmax_e16m4();
+    const int epr = __riscv_vsetvlmax_e16m2();
     const int step = epr * 2;
     const int np = (n & ~(step - 1));
 
     // unroll by 2
     for (; i < np; i += step) {
-        vbfloat16m4_t ax0 = __riscv_vle16_v_bf16m4((const __bf16*)x + i, epr);
-        vfloat32m8_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m8(ax0, epr);
-        __riscv_vse32_v_f32m8(y + i, ay0, epr);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
 
-        vbfloat16m4_t ax1 = __riscv_vle16_v_bf16m4((const __bf16*)x + i + epr, epr);
-        vfloat32m8_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m8(ax1, epr);
-        __riscv_vse32_v_f32m8(y + i + epr, ay1, epr);
+        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
     }
 
     // leftovers
     int vl;
     for (i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m4(n - i);
-        vbfloat16m4_t ax0 = __riscv_vle16_v_bf16m4((const __bf16*)x + i, vl);
-        vfloat32m8_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m8(ax0, vl);
-        __riscv_vse32_v_f32m8(y + i, ay0, vl);
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
     }
 #endif
     for (; i < n; i++) {
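
The bf16 path mirrors the fp16 one, with vfwcvtbf16 doing the widening. In both kernels the bulk/tail split relies on step being a power of two, since np = n & ~(step - 1) rounds n down to a multiple of step. A quick runnable check of that arithmetic, with an assumed VLEN of 256 bits:

#include <assert.h>

int main(void) {
    const int epr  = 32;               // assumed: VLEN = 256 at e16m2
    const int step = epr * 2;          // unroll by 2 -> 64 elements/iter
    const int n    = 1000;
    const int np   = n & ~(step - 1);  // 1000 rounded down to 960
    assert(np == 960 && n - np == 40); // bulk covers 960, vsetvl tail covers 40
    return 0;
}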

ggml/src/ggml-cpu/vec.h

Lines changed: 68 additions & 84 deletions
@@ -226,84 +226,66 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
 
 #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
-    size_t vl = __riscv_vsetvlmax_e32m2();
+    size_t vl = __riscv_vsetvlmax_e32m4();
 
     // initialize accumulators to all zeroes
-    vfloat32m2_t vsum0_0 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum0_1 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum0_2 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum0_3 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum1_0 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum1_1 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum1_2 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum1_3 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
+    vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
 
     // calculate step size
-    const size_t epr = __riscv_vsetvlmax_e16m1();
-    const size_t step = epr * 4;
+    const size_t epr = __riscv_vsetvlmax_e16m2();
+    const size_t step = epr * 2;
     const int np = (n & ~(step - 1));
 
-    // unroll by 4
+    // unroll by 2 along the row dimension
     for (int i = 0; i < np; i += step) {
-        vfloat16m1_t ay0 = __riscv_vle16_v_f16m1((const _Float16 *)(y + i), epr);
-        vfloat16m1_t ax0_0 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i), epr);
-        vfloat16m1_t ax1_0 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i), epr);
-        vsum0_0 = __riscv_vfwmacc_vv_f32m2(vsum0_0, ax0_0, ay0, epr);
-        vsum1_0 = __riscv_vfwmacc_vv_f32m2(vsum1_0, ax1_0, ay0, epr);
-        __asm__ __volatile__("" ::: "memory");
-
-        vfloat16m1_t ay1 = __riscv_vle16_v_f16m1((const _Float16 *)(y + i + epr), epr);
-        vfloat16m1_t ax0_1 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i + epr), epr);
-        vfloat16m1_t ax1_1 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i + epr), epr);
-        vsum0_1 = __riscv_vfwmacc_vv_f32m2(vsum0_1, ax0_1, ay1, epr);
-        vsum1_1 = __riscv_vfwmacc_vv_f32m2(vsum1_1, ax1_1, ay1, epr);
-        __asm__ __volatile__("" ::: "memory");
-
-        vfloat16m1_t ay2 = __riscv_vle16_v_f16m1((const _Float16 *)(y + i + 2 * epr), epr);
-        vfloat16m1_t ax0_2 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i + 2 * epr), epr);
-        vfloat16m1_t ax1_2 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i + 2 * epr), epr);
-        vsum0_2 = __riscv_vfwmacc_vv_f32m2(vsum0_2, ax0_2, ay2, epr);
-        vsum1_2 = __riscv_vfwmacc_vv_f32m2(vsum1_2, ax1_2, ay2, epr);
-        __asm__ __volatile__("" ::: "memory");
-
-        vfloat16m1_t ay3 = __riscv_vle16_v_f16m1((const _Float16 *)(y + i + 3 * epr), epr);
-        vfloat16m1_t ax0_3 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i + 3 * epr), epr);
-        vfloat16m1_t ax1_3 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i + 3 * epr), epr);
-        vsum0_3 = __riscv_vfwmacc_vv_f32m2(vsum0_3, ax0_3, ay3, epr);
-        vsum1_3 = __riscv_vfwmacc_vv_f32m2(vsum1_3, ax1_3, ay3, epr);
-        __asm__ __volatile__("" ::: "memory");
+        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
+        vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
+        vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
+        vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
+        vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
+
+        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
+        vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
+        vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
+        vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
+        vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
     }
 
-    vfloat32m2_t vsum0_01 = __riscv_vfadd_vv_f32m2(vsum0_0, vsum0_1, vl);
-    vfloat32m2_t vsum0_23 = __riscv_vfadd_vv_f32m2(vsum0_2, vsum0_3, vl);
-    vfloat32m2_t vsum0 = __riscv_vfadd_vv_f32m2(vsum0_01, vsum0_23, vl);
-
-    vfloat32m2_t vsum1_01 = __riscv_vfadd_vv_f32m2(vsum1_0, vsum1_1, vl);
-    vfloat32m2_t vsum1_23 = __riscv_vfadd_vv_f32m2(vsum1_2, vsum1_3, vl);
-    vfloat32m2_t vsum1 = __riscv_vfadd_vv_f32m2(vsum1_01, vsum1_23, vl);
+    vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
+    vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
 
     // leftovers
     for (int i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m1(n - i);
-        vfloat16m1_t ay = __riscv_vle16_v_f16m1((const _Float16 *)(y + i), vl);
-        vfloat16m1_t ax0 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i), vl);
-        vfloat16m1_t ax1 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i), vl);
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
 
-        vsum0 = __riscv_vfwmacc_vv_f32m2(vsum0, ax0, ay, vl);
-        vsum1 = __riscv_vfwmacc_vv_f32m2(vsum1, ax1, ay, vl);
+        vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
+        vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
     }
 
     // reduce
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
     vl = __riscv_vsetvlmax_e32m1();
-    vfloat32m1_t acc0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum0, 0),
-                                               __riscv_vget_v_f32m2_f32m1(vsum0, 1), vl);
+    vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
     vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
-        acc0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+        acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
 
-    vfloat32m1_t acc1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum1, 0),
-                                               __riscv_vget_v_f32m2_f32m1(vsum1, 1), vl);
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
     vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
-        acc1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+        acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
     sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
     sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
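
With the accumulators widened from f32m2 to f32m4, the final reduction gains one folding stage: vget splits each m4 group into two m2 halves, which are added, split again into m1 halves, and only then fed to the single vfredusum. A sketch of that fold in isolation, using the same intrinsics as the hunk (hsum_f32m4 is a hypothetical helper name):

#include <riscv_vector.h>

// Horizontal sum of an f32m4 accumulator: fold m4 -> m2 -> m1, then
// one unordered-sum reduction down to a scalar.
static float hsum_f32m4(vfloat32m4_t acc) {
    size_t vl = __riscv_vsetvlmax_e32m2();
    vfloat32m2_t h2 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(acc, 0),
                                             __riscv_vget_v_f32m4_f32m2(acc, 1), vl);
    vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t h1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(h2, 0),
                                             __riscv_vget_v_f32m2_f32m1(h2, 1), vl);
    vfloat32m1_t r = __riscv_vfredusum_vs_f32m1_f32m1(
        h1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
    return __riscv_vfmv_f_s_f32m1_f32(r);
}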

@@ -556,31 +538,33 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     const _Float16 scale = *(const _Float16*)(&s);
 
     // calculate step size
-    const int epr = __riscv_vsetvlmax_e16m2();
+    const int epr = __riscv_vsetvlmax_e16m4();
     const int step = epr * 2;
     const int np = (n & ~(step - 1));
 
     // unroll by 2
     for (int i = 0; i < np; i += step) {
-        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
-        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16*)y + i, epr);
-        ay0 = __riscv_vfmacc_vf_f16m2(ay0, scale, ax0, epr);
-        __riscv_vse16_v_f16m2((_Float16*)y + i, ay0, epr);
-
-        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
-        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16*)y + i + epr, epr);
-        ay1 = __riscv_vfmacc_vf_f16m2(ay1, scale, ax1, epr);
-        __riscv_vse16_v_f16m2((_Float16*)y + i + epr, ay1, epr);
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+        ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
     }
 
     // leftovers
     int vl;
     for (int i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m2(n - i);
-        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i , vl);
-        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16*)y + i, vl);
-        ay0 = __riscv_vfmacc_vf_f16m2(ay0, scale, ax0, vl);
-        __riscv_vse16_v_f16m2((_Float16*)y + i, ay0, vl);
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i , vl);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
     }
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
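
The empty __asm__ __volatile__ ("" ::: "memory") statements added here emit no instructions; they are compiler barriers, apparently to keep the compiler from merging or reordering the two unrolled load/fmacc/store groups across each other. A minimal illustration of the idiom (COMPILER_BARRIER is a name invented for this sketch):

// The barrier compiles to nothing, but the "memory" clobber forbids
// moving loads and stores across it at compile time.
#define COMPILER_BARRIER() __asm__ __volatile__ ("" ::: "memory")

void axpy_pair(float *y, const float *x, float s) {
    y[0] += s * x[0];
    COMPILER_BARRIER(); // the two updates stay distinct and ordered
    y[1] += s * x[1];
}
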
@@ -831,30 +815,30 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
     const _Float16 scale = *(const _Float16*)(&s);
 
     // calculate step size
-    const int epr = __riscv_vsetvlmax_e16m2();
+    const int epr = __riscv_vsetvlmax_e16m4();
     const int step = epr * 2;
     const int np = (n & ~(step - 1));
 
     // unroll by 2
     for (int i = 0; i < np; i += step) {
-        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16*)y + i, epr);
-        ay0 = __riscv_vfmul_vf_f16m2(ay0, scale, epr);
-        __riscv_vse16_v_f16m2((_Float16*)y + i, ay0, epr);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
         __asm__ __volatile__ ("" ::: "memory");
 
-        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16*)y + i + epr, epr);
-        ay1 = __riscv_vfmul_vf_f16m2(ay1, scale, epr);
-        __riscv_vse16_v_f16m2((_Float16*)y + i + epr, ay1, epr);
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+        ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
         __asm__ __volatile__ ("" ::: "memory");
     }
 
     // leftovers
     int vl;
     for (int i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m2(n - i);
-        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16*)y + i, vl);
-        ay0 = __riscv_vfmul_vf_f16m2(ay0, scale, vl);
-        __riscv_vse16_v_f16m2((_Float16*)y + i, ay0, vl);
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
     }
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
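
ggml_vec_scale_f16 gets the same treatment as ggml_vec_mad_f16: e16m2 up to e16m4, with compiler barriers between the unrolled halves. The pure strip-mined core at the new LMUL looks like this (scale_f16_rvv is a hypothetical name; assumes Zvfh for fp16 arithmetic):

#include <riscv_vector.h>

// In-place y[i] *= scale over n fp16 elements at LMUL = 4.
static void scale_f16_rvv(_Float16 *y, _Float16 scale, int n) {
    int vl;
    for (int i = 0; i < n; i += vl) {
        vl = __riscv_vsetvl_e16m4(n - i);
        vfloat16m4_t v = __riscv_vle16_v_f16m4(y + i, vl);
        v = __riscv_vfmul_vf_f16m4(v, scale, vl);
        __riscv_vse16_v_f16m4(y + i, v, vl);
    }
}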
