Commit 7a27886

ggml-cpu: change lmul for kernels
1 parent cd44eee commit 7a27886
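
In the RVV intrinsics, LMUL (the register-group multiplier baked into type and intrinsic names such as f16m2 or f32m4) controls how many vector registers each instruction operates on. Raising it lets every load, multiply-accumulate, and store cover proportionally more elements, so the strip-mined loops below run fewer iterations per row, at the cost of consuming larger register groups. As a rough illustration (not part of this commit, and assuming an RVV 1.0 toolchain and hardware), the maximum vector length scales directly with LMUL:

    #include <stdio.h>
    #include <riscv_vector.h>

    // On a VLEN=128 core this prints 8, 16, 32: each doubling of LMUL
    // doubles the number of fp16 elements one instruction can process.
    int main(void) {
        printf("e16m1: %zu\n", __riscv_vsetvlmax_e16m1());
        printf("e16m2: %zu\n", __riscv_vsetvlmax_e16m2());
        printf("e16m4: %zu\n", __riscv_vsetvlmax_e16m4());
        return 0;
    }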

File tree

2 files changed, +52 -48 lines changed


ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 0 additions & 2 deletions
@@ -3285,12 +3285,10 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
         vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, epr);
         __riscv_vse32_v_f32m4(y + i, ay0, epr);
-        __asm__ __volatile__ ("" ::: "memory");

         vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
         vfloat32m4_t ay1 = __riscv_vfwcvt_f_f_v_f32m4(ax1, epr);
         __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
-        __asm__ __volatile__ ("" ::: "memory");
     }

     // leftovers
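
The two compiler barriers dropped above (an empty asm with a "memory" clobber) emit no instructions; they only stop the compiler from moving memory accesses across that point. The commit removes them here, presumably to let the compiler schedule the two widen-and-store pairs freely. A minimal sketch of the idiom, for reference only (not code from this commit):

    // Compiler-level fence: generates no machine code, but forbids the
    // compiler from reordering loads/stores across the statement.
    static inline void compiler_barrier(void) {
        __asm__ __volatile__ ("" ::: "memory");
    }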

ggml/src/ggml-cpu/vec.h

Lines changed: 52 additions & 46 deletions
@@ -226,63 +226,69 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);

 #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
-    size_t vl = __riscv_vsetvlmax_e32m2();
+    size_t vl = __riscv_vsetvlmax_e32m4();

     // initialize accumulators to all zeroes
-    vfloat32m2_t vsum0_0 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum0_1 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum1_0 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-    vfloat32m2_t vsum1_1 = __riscv_vfmv_v_f_f32m2(0.0f, vl);
-
+    vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);

     // calculate step size
-    const size_t epr = __riscv_vsetvlmax_e16m1();
+    const size_t epr = __riscv_vsetvlmax_e16m2();
     const size_t step = epr * 2;
     const int np = (n & ~(step - 1));

-    // unroll by 2
+    // unroll by 2 along the row dimension
     for (int i = 0; i < np; i += step) {
-        vfloat16m1_t ay0 = __riscv_vle16_v_f16m1((const _Float16 *)(y + i), epr);
-        vfloat16m1_t ax0_0 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i), epr);
-        vfloat16m1_t ax1_0 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i), epr);
-        vsum0_0 = __riscv_vfwmacc_vv_f32m2(vsum0_0, ax0_0, ay0, epr);
-        vsum1_0 = __riscv_vfwmacc_vv_f32m2(vsum1_0, ax1_0, ay0, epr);
-
-        vfloat16m1_t ay1 = __riscv_vle16_v_f16m1((const _Float16 *)(y + i + epr), epr);
-        vfloat16m1_t ax0_1 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i + epr), epr);
-        vfloat16m1_t ax1_1 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i + epr), epr);
-        vsum0_1 = __riscv_vfwmacc_vv_f32m2(vsum0_1, ax0_1, ay1, epr);
-        vsum1_1 = __riscv_vfwmacc_vv_f32m2(vsum1_1, ax1_1, ay1, epr);
+        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
+        vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
+        vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
+        vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
+        vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
+
+        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
+        vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
+        vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
+        vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
+        vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
     }

-    vfloat32m2_t vsum0 = __riscv_vfadd_vv_f32m2(vsum0_0, vsum0_1, vl);
-
-    vfloat32m2_t vsum1 = __riscv_vfadd_vv_f32m2(vsum1_0, vsum1_1, vl);
+    vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
+    vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);

     // leftovers
     for (int i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m1(n - i);
-        vfloat16m1_t ay = __riscv_vle16_v_f16m1((const _Float16 *)(y + i), vl);
-        vfloat16m1_t ax0 = __riscv_vle16_v_f16m1((const _Float16 *)(x[0] + i), vl);
-        vfloat16m1_t ax1 = __riscv_vle16_v_f16m1((const _Float16 *)(x[1] + i), vl);
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);

-        vsum0 = __riscv_vfwmacc_vv_f32m2(vsum0, ax0, ay, vl);
-        vsum1 = __riscv_vfwmacc_vv_f32m2(vsum1, ax1, ay, vl);
+        vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
+        vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
     }

     // reduce
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
     vl = __riscv_vsetvlmax_e32m1();
-    vfloat32m1_t acc0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum0, 0),
-                                               __riscv_vget_v_f32m2_f32m1(vsum0, 1), vl);
+    vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
     vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
-        acc0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+        acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);

-    vfloat32m1_t acc1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum1, 0),
-                                               __riscv_vget_v_f32m2_f32m1(vsum1, 1), vl);
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
     vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
-        acc1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+        acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
     sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
     sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
+
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
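
Because the accumulators are now f32m4 rather than f32m2, the horizontal reduction gains one stage: the m4 group is split into two m2 halves with vget and added, the m2 result is split into m1 halves and added, and a single vfredusum then folds the m1 vector into a scalar. A self-contained sketch of that ladder, assuming an RVV 1.0 toolchain (the helper name is illustrative, not from vec.h):

    #include <riscv_vector.h>

    // Reduce an f32m4 accumulator to one float: m4 -> m2 -> m1 -> scalar.
    static inline float reduce_f32m4_to_f32(vfloat32m4_t v) {
        size_t vl2 = __riscv_vsetvlmax_e32m2();
        vfloat32m2_t h2 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(v, 0),
                                                 __riscv_vget_v_f32m4_f32m2(v, 1), vl2);
        size_t vl1 = __riscv_vsetvlmax_e32m1();
        vfloat32m1_t h1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(h2, 0),
                                                 __riscv_vget_v_f32m2_f32m1(h2, 1), vl1);
        // Unordered sum reduction, seeded with a zero scalar.
        vfloat32m1_t r = __riscv_vfredusum_vs_f32m1_f32m1(
            h1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl1);
        return __riscv_vfmv_f_s_f32m1_f32(r);
    }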

@@ -554,7 +560,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     // leftovers
     int vl;
     for (int i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m2(n - i);
+        vl = __riscv_vsetvl_e16m4(n - i);
         vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i , vl);
         vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
         ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
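
This one-line change aligns the tail loop's vsetvl LMUL with the f16m4 operations it feeds. The old e16m2 setting appears functionally safe (the returned vl can never exceed what the m4 ops accept) but caps each pass at half the lanes an m4 instruction can cover. A hedged sketch of the corrected pattern (the helper name is illustrative, not vec.h itself):

    #include <riscv_vector.h>

    // vl-driven tail: y[i..n) += scale * x[i..n), LMUL=4 throughout.
    static inline void mad_tail_f16(_Float16 *restrict y, const _Float16 *restrict x,
                                    _Float16 scale, int i, int n) {
        size_t vl;
        for (; i < n; i += (int) vl) {
            vl = __riscv_vsetvl_e16m4((size_t)(n - i));       // matches the m4 ops below
            vfloat16m4_t ax = __riscv_vle16_v_f16m4(x + i, vl);
            vfloat16m4_t ay = __riscv_vle16_v_f16m4(y + i, vl);
            ay = __riscv_vfmacc_vf_f16m4(ay, scale, ax, vl);  // ay += scale * ax
            __riscv_vse16_v_f16m4(y + i, ay, vl);
        }
    }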
@@ -809,30 +815,30 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
     const _Float16 scale = *(const _Float16*)(&s);

     // calculate step size
-    const int epr = __riscv_vsetvlmax_e16m2();
+    const int epr = __riscv_vsetvlmax_e16m4();
     const int step = epr * 2;
     const int np = (n & ~(step - 1));

     // unroll by 2
     for (int i = 0; i < np; i += step) {
-        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16*)y + i, epr);
-        ay0 = __riscv_vfmul_vf_f16m2(ay0, scale, epr);
-        __riscv_vse16_v_f16m2((_Float16*)y + i, ay0, epr);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
         __asm__ __volatile__ ("" ::: "memory");

-        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16*)y + i + epr, epr);
-        ay1 = __riscv_vfmul_vf_f16m2(ay1, scale, epr);
-        __riscv_vse16_v_f16m2((_Float16*)y + i + epr, ay1, epr);
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+        ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
         __asm__ __volatile__ ("" ::: "memory");
     }

     // leftovers
     int vl;
     for (int i = np; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m2(n - i);
-        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16*)y + i, vl);
-        ay0 = __riscv_vfmul_vf_f16m2(ay0, scale, vl);
-        __riscv_vse16_v_f16m2((_Float16*)y + i, ay0, vl);
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
     }
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
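
With LMUL raised to 4 in ggml_vec_scale_f16, each unrolled pass now covers twice as many halfwords. As a hypothetical worked example (VLEN is implementation-defined; 128 is assumed here purely for illustration):

    //   epr  = __riscv_vsetvlmax_e16m4();  // 128/16 * 4 = 32 fp16 lanes
    //   step = epr * 2;                    // 64 elements per unrolled pass
    //   np   = n & ~(step - 1);            // n = 1000 -> np = 960
    // The remaining n - np = 40 elements fall through to the tail loop.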
