From ab75281b711929f3debcc0e4b5316e8eff49583f Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Mon, 10 Nov 2025 23:10:54 +0800 Subject: [PATCH 01/23] refactor: use hvx_vec_exp_fp32_guard_inf for overflow handling in hvx_exp_f32 --- ggml/src/ggml-hexagon/htp/hvx-exp.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c index 19f6795083c1d..f9127251899b5 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-exp.c +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c @@ -16,6 +16,19 @@ #include "hvx-utils.h" #include "ops-utils.h" +static inline HVX_Vector hvx_vec_exp_fp32_guard_inf(HVX_Vector in_vec) { + static const float kInf = INFINITY; + static const float kMaxExp = 88.02f; // log(INF) + + const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp)); + const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf)); + const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp); + + HVX_Vector out = hvx_vec_exp_fp32(in_vec); + + return Q6_V_vmux_QVV(pred0, inf, out); +} + void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) { int left_over = num_elems & (VLEN_FP32 - 1); int num_elems_whole = num_elems - left_over; @@ -38,25 +51,25 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int HVX_Vector * p_vec_in1 = (HVX_Vector *) src; HVX_Vector * p_vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++); - *p_vec_out++ = hvx_vec_exp_fp32(neg_vec_in); + *p_vec_out++ = hvx_vec_exp_fp32_guard_inf(neg_vec_in); } else { - *p_vec_out++ = hvx_vec_exp_fp32(*p_vec_in1++); + *p_vec_out++ = hvx_vec_exp_fp32_guard_inf(*p_vec_in1++); } } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(neg_vec_in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard_inf(neg_vec_in); } else { - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard_inf(in); } } } @@ -70,9 +83,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - vec_out = hvx_vec_exp_fp32(neg_vec_in); + vec_out = hvx_vec_exp_fp32_guard_inf(neg_vec_in); } else { - vec_out = hvx_vec_exp_fp32(in); + vec_out = hvx_vec_exp_fp32_guard_inf(in); } hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out); From 5aa4a8328d7635dc0e36987c88d74106bc13153c Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Tue, 11 Nov 2025 00:10:29 +0800 Subject: [PATCH 02/23] feat: add fast sigmoid function with overflow guard for fp32 --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index b2ca8e88f464e..686c55fa99b64 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -934,6 +934,17 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) { return Q6_Vsf_equals_Vqf32(temp); } +static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard_inf(HVX_Vector v) { + static const float kMaxExp = 88.02f; // log(INF) + + const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp)); + const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(v, max_exp); + + HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); + + return Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out); +} + static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { int step_of_1 = num_elems >> 5; int remaining = num_elems - step_of_1 * VLEN_FP32; @@ -945,7 +956,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { - v_dst[i] = hvx_vec_fast_sigmoid_fp32(v_src[i]); + v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard_inf(v_src[i]); } } From a64154ce57198e87152abd44cb7fbdf8178cc533 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Tue, 11 Nov 2025 22:04:15 +0800 Subject: [PATCH 03/23] refactor: replace hvx_vec_inverse_fp32 with hvx_vec_inverse_fp32_guard_inf for improved overflow handling --- ggml/src/ggml-hexagon/htp/hvx-inverse.c | 10 ++++---- ggml/src/ggml-hexagon/htp/hvx-utils.h | 31 +++++++++++++++++-------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.c b/ggml/src/ggml-hexagon/htp/hvx-inverse.c index 4cf588a8781f1..25dda0b729fbd 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-inverse.c +++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c @@ -36,15 +36,15 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const HVX_Vector * p_vec_in = (HVX_Vector *) src; HVX_Vector * p_vec_out = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - *p_vec_out++ = hvx_vec_inverse_fp32(*p_vec_in++); + *p_vec_out++ = hvx_vec_inverse_fp32_guard_inf(*p_vec_in++); } } else { - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard_inf(in); } } @@ -53,7 +53,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const float * dstf = (float *) dst + num_elems_whole; HVX_Vector in = *(HVX_UVector *) srcf; - HVX_Vector out = hvx_vec_inverse_fp32(in); + HVX_Vector out = hvx_vec_inverse_fp32_guard_inf(in); hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); } diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 686c55fa99b64..cc1de34e252e7 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -80,7 +80,7 @@ static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -104,7 +104,7 @@ static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -128,7 +128,7 @@ static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -153,7 +153,7 @@ static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -177,7 +177,7 @@ static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -201,7 +201,7 @@ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -226,7 +226,7 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t uint32_t i = 0; - #pragma unroll(4) +#pragma unroll(4) for (; i < nvec; i++) { vdst[i] = velem; } @@ -411,8 +411,8 @@ static inline HVX_Vector hvx_vec_fp32_reduce_sum_n(HVX_Vector in, unsigned int n HVX_Vector sum = in, sum_t; while (width < total) { - sum_t = Q6_V_vror_VR(sum, width); // rotate right - sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum + sum_t = Q6_V_vror_VR(sum, width); // rotate right + sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum width = width << 1; } return sum; @@ -720,6 +720,17 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { return Q6_Vsf_equals_Vqf32(r_qf); } +static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) { + static const float kInf = INFINITY; + + const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf)); + const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf); + + HVX_Vector out = hvx_vec_inverse_fp32(v_sf); + + return Q6_V_vmux_QVV(pred0, out, Q6_V_vzero()); +} + #define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 #define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777 #define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267 @@ -954,7 +965,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * const HVX_Vector * restrict v_src = (HVX_Vector *) src; HVX_Vector * restrict v_dst = (HVX_Vector *) dst; - #pragma unroll(4) +#pragma unroll(4) for (int i = 0; i < step_of_1; i++) { v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard_inf(v_src[i]); } From a8cdbcf0d2fa2077bd545ef901250d4ff60a0913 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Tue, 11 Nov 2025 23:46:40 +0800 Subject: [PATCH 04/23] feat: enhance hvx_add_scalar_f32 with overflow handling using infinity guard --- ggml/src/ggml-hexagon/htp/hvx-utils.c | 27 +++++++++++++++++++++------ ggml/src/ggml-hexagon/htp/hvx-utils.h | 4 ++-- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index d3599bc9c1276..4a1233ef5a9f1 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -401,6 +401,10 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } + static const float kInf = INFINITY; + + const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf)); + HVX_Vector val_vec = hvx_vec_splat_fp32(val); if (0 == unaligned_loop) { @@ -409,17 +413,24 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, val_vec); - *vec_out++ = Q6_Vsf_equals_Vqf32(v); + HVX_Vector in = *vec_in1++; + const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + v = Q6_Vsf_equals_Vqf32(v); + v = Q6_V_vmux_QVV(pred0, inf, v); + *vec_out++ = v; } } else { #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + out = Q6_Vsf_equals_Vqf32(out); + out = Q6_V_vmux_QVV(pred0, inf, out); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = out; } } @@ -429,8 +440,12 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector in = *(HVX_UVector *) srcf; - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); - hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out)); + const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + out = Q6_Vsf_equals_Vqf32(out); + out = Q6_V_vmux_QVV(pred0, inf, out); + + hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); } } diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index cc1de34e252e7..bb8bef4bcb603 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -724,11 +724,11 @@ static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) { static const float kInf = INFINITY; const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf)); - const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf); + const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, v_sf); HVX_Vector out = hvx_vec_inverse_fp32(v_sf); - return Q6_V_vmux_QVV(pred0, out, Q6_V_vzero()); + return Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out); } #define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 From ae42fb63304a3abb1798d5ed8e8a56e266fb9a63 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 14 Nov 2025 01:01:55 +0800 Subject: [PATCH 05/23] wip --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index bb8bef4bcb603..3fd8e5481feb6 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -946,10 +946,10 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) { } static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard_inf(HVX_Vector v) { - static const float kMaxExp = 88.02f; // log(INF) + static const float kMaxExp = -88.02f; // log(INF) const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp)); - const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(v, max_exp); + const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(max_exp, v); HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); From 39445ab0d1464092ec1c1005fb447fce6aaca09d Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 14 Nov 2025 11:29:48 +0800 Subject: [PATCH 06/23] add HVX_Vector_Alias wip --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 3fd8e5481feb6..8244fd55dae92 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -12,6 +12,15 @@ #define VLEN_FP32 (VLEN / SIZEOF_FP32) #define VLEN_FP16 (VLEN / SIZEOF_FP16) +typedef union { + HVX_Vector v; + uint8_t b[VLEN]; + uint16_t h[VLEN_FP16]; + uint32_t w[VLEN_FP32]; + __fp16 fp16[VLEN_FP16]; + float fp32[VLEN_FP32]; +} HVX_VectorAlias; + static inline HVX_Vector hvx_vec_splat_fp32(float i) { union { float f; @@ -243,19 +252,16 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3 } static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) { - union { - HVX_Vector v; - __fp16 d[64]; - } u = { .v = v }; + HVX_VectorAlias u = { .v = v }; const uint32_t n0 = n / 16; const uint32_t n1 = n % 16; int i = 0; for (; i < n0; i++) { - htp_dump_fp16_line(pref, u.d + (16 * i), 16); + htp_dump_fp16_line(pref, u.fp16 + (16 * i), 16); } if (n1) { - htp_dump_fp16_line(pref, u.d + (16 * i), n1); + htp_dump_fp16_line(pref, u.fp16 + (16 * i), n1); } } From a589b611110d2968dc0af780e2b55c071fdfedb1 Mon Sep 17 00:00:00 2001 From: chraac Date: Sat, 15 Nov 2025 10:25:48 +0800 Subject: [PATCH 07/23] wip --- ggml/src/ggml-hexagon/htp/act-ops.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 16044975d9253..1580107d87eda 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -106,20 +106,16 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, t1 = HAP_perf_get_qtimer_count(); int is_aligned = 1; - int opt_path = 0; if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) { is_aligned = 0; FARF(HIGH, "swiglu-f32: unaligned addresses in elementwise op, possibly slower execution\n"); } - if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { - opt_path = 1; - } const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; uint8_t * restrict data_dst = (uint8_t *) dst->data; - bool src1_valid = src1->ne[0]; + const bool src1_valid = src1->ne[0]; if (!src1_valid) { data_src1 = data_src0; src1_row_size = src0_row_size; @@ -129,10 +125,9 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size); uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); - const int32_t swapped = op_params[1]; - - const int nc = (src1_valid) ? ne0 : ne0 / 2; - + const int32_t swapped = op_params[1]; + const bool opt_path = ((1 == is_aligned) && !(nb01 & (VLEN - 1))); + const int nc = (src1_valid) ? ne0 : ne0 / 2; for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size)); @@ -147,7 +142,7 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, src1 += swapped ? 0 : nc; } - if (1 == opt_path) { + if (opt_path) { hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, nc); hvx_mul_mul_f32_opt((const uint8_t *) src0, (const uint8_t *) src0_spad_data, (const uint8_t *) src1, (uint8_t *) dst, nc); From 6f57b9e2ab463d5e9b1ce5e258d068bb50e3c5f9 Mon Sep 17 00:00:00 2001 From: chraac Date: Sat, 15 Nov 2025 13:02:59 +0800 Subject: [PATCH 08/23] fix: improve handling of src1 tensor in glu_swiglu_fp32_per_thread function --- ggml/src/ggml-hexagon/htp/act-ops.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 1580107d87eda..6ed791bc7b89e 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -116,18 +116,22 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, uint8_t * restrict data_dst = (uint8_t *) dst->data; const bool src1_valid = src1->ne[0]; + const int nc = (src1_valid) ? ne0 : ne0 / 2; if (!src1_valid) { - data_src1 = data_src0; - src1_row_size = src0_row_size; + const int32_t swapped = op_params[1]; + data_src1 = data_src0; + src1_row_size = src0_row_size; + + const size_t nc_in_bytes = nc * SIZEOF_FP32; + data_src0 += swapped ? nc_in_bytes : 0; + data_src1 += swapped ? 0 : nc_in_bytes; } uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size); uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size); uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); - const int32_t swapped = op_params[1]; - const bool opt_path = ((1 == is_aligned) && !(nb01 & (VLEN - 1))); - const int nc = (src1_valid) ? ne0 : ne0 / 2; + const bool opt_path = ((1 == is_aligned) && !(nb01 & (VLEN - 1))); for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size)); @@ -137,11 +141,6 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size); } - if (!src1_valid) { - src0 += swapped ? nc : 0; - src1 += swapped ? 0 : nc; - } - if (opt_path) { hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, nc); hvx_mul_mul_f32_opt((const uint8_t *) src0, (const uint8_t *) src0_spad_data, (const uint8_t *) src1, From db9e9308bb4d99e093288137a7a3ad723f882498 Mon Sep 17 00:00:00 2001 From: chraac Date: Sat, 15 Nov 2025 14:34:51 +0800 Subject: [PATCH 09/23] fix nc --- ggml/src/ggml-hexagon/htp/act-ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 6ed791bc7b89e..01b3d8fec0489 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -116,7 +116,7 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0, uint8_t * restrict data_dst = (uint8_t *) dst->data; const bool src1_valid = src1->ne[0]; - const int nc = (src1_valid) ? ne0 : ne0 / 2; + const int nc = (src1_valid) ? ne00 : ne00 / 2; if (!src1_valid) { const int32_t swapped = op_params[1]; data_src1 = data_src0; From ce48af57b0ecdc38fb0c6102f10b4cf88181ce0a Mon Sep 17 00:00:00 2001 From: chraac Date: Sun, 16 Nov 2025 10:40:02 +0800 Subject: [PATCH 10/23] wip --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 8244fd55dae92..fa996a484941c 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -955,11 +955,11 @@ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard_inf(HVX_Vector v) { static const float kMaxExp = -88.02f; // log(INF) const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp)); - const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(max_exp, v); + const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(v, max_exp); HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); - return Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out); + return Q6_V_vmux_QVV(pred0, out, Q6_V_vzero()); } static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { From fc5f31fd23b43d911deb89462038b5a12d78a76c Mon Sep 17 00:00:00 2001 From: chraac Date: Sun, 16 Nov 2025 11:14:37 +0800 Subject: [PATCH 11/23] wip --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index fa996a484941c..33728fd615a39 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -19,7 +19,7 @@ typedef union { uint32_t w[VLEN_FP32]; __fp16 fp16[VLEN_FP16]; float fp32[VLEN_FP32]; -} HVX_VectorAlias; +} __attribute__((packed)) __attribute__((aligned(VLEN))) HVX_VectorAlias; static inline HVX_Vector hvx_vec_splat_fp32(float i) { union { From 5707384152b8dc853fa799c4e7f71e911046e80a Mon Sep 17 00:00:00 2001 From: chraac Date: Sun, 16 Nov 2025 11:24:28 +0800 Subject: [PATCH 12/23] handle nan at inverse --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 33728fd615a39..573377821c82b 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -728,13 +728,15 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) { static const float kInf = INFINITY; + static const float kNan = NAN; - const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf)); - const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, v_sf); + const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf)); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, v_sf); + const HVX_VectorPred pred_nan = Q6_Q_vcmp_eq_VwVw(Q6_V_vzero(), v_sf); HVX_Vector out = hvx_vec_inverse_fp32(v_sf); - - return Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out); + out = Q6_V_vmux_QVV(pred_nan, inf, out); + return Q6_V_vmux_QVV(pred_inf, Q6_V_vzero(), out); } #define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 From 54235e39453efb409ae16ffde9c9d8a1117756cb Mon Sep 17 00:00:00 2001 From: chraac Date: Sun, 16 Nov 2025 23:07:41 +0800 Subject: [PATCH 13/23] wip --- ggml/src/ggml-hexagon/htp/act-ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 01b3d8fec0489..87b09cca3afef 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -212,7 +212,7 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0, const float alpha = ((const float *) (op_params))[2]; const float limit = ((const float *) (op_params))[3]; - const int nc = (src1_valid) ? ne0 : ne0 / 2; + const int nc = (src1_valid) ? ne00 : ne00 / 2; for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size)); From 38594657ad6c99cad0786ba2bacd2171132029e9 Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 17 Nov 2025 00:36:53 +0800 Subject: [PATCH 14/23] fix neg --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 573377821c82b..b88f721460791 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -19,7 +19,7 @@ typedef union { uint32_t w[VLEN_FP32]; __fp16 fp16[VLEN_FP16]; float fp32[VLEN_FP32]; -} __attribute__((packed)) __attribute__((aligned(VLEN))) HVX_VectorAlias; +} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias; static inline HVX_Vector hvx_vec_splat_fp32(float i) { union { @@ -497,7 +497,7 @@ static inline HVX_Vector hvx_vec_abs_fp16(HVX_Vector v) { static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) { // neg by setting the fp16 sign bit HVX_Vector mask = Q6_Vh_vsplat_R(0x8000); - return Q6_V_vor_VV(v, mask); + return Q6_V_vxor_VV(v, mask); } static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) { @@ -512,7 +512,7 @@ static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) { #else // neg by setting the fp32 sign bit HVX_Vector mask = Q6_V_vsplat_R(0x80000000); - return Q6_V_vor_VV(v, mask); + return Q6_V_vxor_VV(v, mask); #endif // __HTP_ARCH__ > 75 } From 014ad77333d2b6773dfeb5552b5294041e14ed60 Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 17 Nov 2025 01:09:14 +0800 Subject: [PATCH 15/23] wip --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index b88f721460791..db103c368ea79 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -731,11 +731,12 @@ static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) { static const float kNan = NAN; const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf)); + const HVX_Vector nan = Q6_V_vsplat_R(*((uint32_t *) &kNan)); const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, v_sf); const HVX_VectorPred pred_nan = Q6_Q_vcmp_eq_VwVw(Q6_V_vzero(), v_sf); HVX_Vector out = hvx_vec_inverse_fp32(v_sf); - out = Q6_V_vmux_QVV(pred_nan, inf, out); + out = Q6_V_vmux_QVV(pred_nan, nan, out); return Q6_V_vmux_QVV(pred_inf, Q6_V_vzero(), out); } From 8c374577d8ebd4a080f8085555c9617d513c06b3 Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 17 Nov 2025 18:16:13 +0800 Subject: [PATCH 16/23] rename --- ggml/src/ggml-hexagon/htp/hvx-utils.c | 36 +++++++++++++-------------- ggml/src/ggml-hexagon/htp/hvx-utils.h | 4 +-- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index 4a1233ef5a9f1..5492a52f71f91 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -401,11 +401,9 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n"); } - static const float kInf = INFINITY; - - const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf)); - - HVX_Vector val_vec = hvx_vec_splat_fp32(val); + static const float kInf = INFINITY; + const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf)); + HVX_Vector val_vec = hvx_vec_splat_fp32(val); if (0 == unaligned_loop) { HVX_Vector * restrict vec_in1 = (HVX_Vector *) src; @@ -413,22 +411,22 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - HVX_Vector in = *vec_in1++; - const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, in); - HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(in, val_vec); - v = Q6_Vsf_equals_Vqf32(v); - v = Q6_V_vmux_QVV(pred0, inf, v); - *vec_out++ = v; + HVX_Vector in = *vec_in1++; + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + v = Q6_Vsf_equals_Vqf32(v); + v = Q6_V_vmux_QVV(pred_inf, inf, v); + *vec_out++ = v; } } else { #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, in); - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); - out = Q6_Vsf_equals_Vqf32(out); - out = Q6_V_vmux_QVV(pred0, inf, out); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + out = Q6_Vsf_equals_Vqf32(out); + out = Q6_V_vmux_QVV(pred_inf, inf, out); *(HVX_UVector *) (dst + i * SIZEOF_FP32) = out; } @@ -440,10 +438,10 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * HVX_Vector in = *(HVX_UVector *) srcf; - const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, in); - HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); - out = Q6_Vsf_equals_Vqf32(out); - out = Q6_V_vmux_QVV(pred0, inf, out); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in); + HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec); + out = Q6_Vsf_equals_Vqf32(out); + out = Q6_V_vmux_QVV(pred_inf, inf, out); hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); } diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index db103c368ea79..e900b35f439a1 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -957,8 +957,8 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) { static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard_inf(HVX_Vector v) { static const float kMaxExp = -88.02f; // log(INF) - const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp)); - const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(v, max_exp); + const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp)); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp); HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); From 33a050e76360e23edd5680007e84704a538e9730 Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 17 Nov 2025 18:17:34 +0800 Subject: [PATCH 17/23] fix hvx_vec_inverse_fp32_guard_inf to handle infinity and NaN cases correctly --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index e900b35f439a1..9338b383cb39c 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -728,16 +728,13 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) { static const float kInf = INFINITY; - static const float kNan = NAN; const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf)); - const HVX_Vector nan = Q6_V_vsplat_R(*((uint32_t *) &kNan)); - const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, v_sf); + const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf); const HVX_VectorPred pred_nan = Q6_Q_vcmp_eq_VwVw(Q6_V_vzero(), v_sf); HVX_Vector out = hvx_vec_inverse_fp32(v_sf); - out = Q6_V_vmux_QVV(pred_nan, nan, out); - return Q6_V_vmux_QVV(pred_inf, Q6_V_vzero(), out); + return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero()); } #define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 @@ -961,8 +958,8 @@ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard_inf(HVX_Vector v) { const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp); HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); + return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero()); - return Q6_V_vmux_QVV(pred0, out, Q6_V_vzero()); } static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { From 83884a5c2fb04280ab8f8e01da09e9c95a6e4c6b Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 17 Nov 2025 20:07:34 +0800 Subject: [PATCH 18/23] wip --- ggml/src/ggml-hexagon/htp/hvx-exp.c | 4 ++-- ggml/src/ggml-hexagon/htp/hvx-utils.c | 2 +- ggml/src/ggml-hexagon/htp/hvx-utils.h | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c index f9127251899b5..27d2fff084b68 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-exp.c +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c @@ -20,8 +20,8 @@ static inline HVX_Vector hvx_vec_exp_fp32_guard_inf(HVX_Vector in_vec) { static const float kInf = INFINITY; static const float kMaxExp = 88.02f; // log(INF) - const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp)); - const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf)); + const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); + const HVX_Vector inf = hvx_vec_splat_fp32(kInf); const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp); HVX_Vector out = hvx_vec_exp_fp32(in_vec); diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c index 5492a52f71f91..e02b1d9099629 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.c +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c @@ -402,7 +402,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * } static const float kInf = INFINITY; - const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf)); + const HVX_Vector inf = hvx_vec_splat_fp32(kInf); HVX_Vector val_vec = hvx_vec_splat_fp32(val); if (0 == unaligned_loop) { diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 9338b383cb39c..17900f92f5cec 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -729,9 +729,8 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) { static const float kInf = INFINITY; - const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf)); + const HVX_Vector inf = hvx_vec_splat_fp32(kInf); const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf); - const HVX_VectorPred pred_nan = Q6_Q_vcmp_eq_VwVw(Q6_V_vzero(), v_sf); HVX_Vector out = hvx_vec_inverse_fp32(v_sf); return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero()); From f7662f3b3cd60f457d8451ce8b071d2b472b001c Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 17 Nov 2025 23:49:12 +0800 Subject: [PATCH 19/23] fix hvx_vec_inverse_fp32_guard_inf to handle NaN cases correctly --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 17900f92f5cec..241fb53506f1a 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -727,13 +727,22 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { } static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) { - static const float kInf = INFINITY; + static const float kInf = INFINITY; + static const uint32_t kNanMask = 0x7fffffff; + static const uint32_t kNanMin = 0x7f800000; const HVX_Vector inf = hvx_vec_splat_fp32(kInf); const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf); HVX_Vector out = hvx_vec_inverse_fp32(v_sf); - return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero()); + + const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask); + const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin); + out = Q6_V_vand_VV(out, nan_mask); + + const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, out); + + return Q6_V_vmux_QVV(pred, out, Q6_V_vzero()); } #define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 From f6d7f3c5f2528114853b25e418f55a45d38d19d3 Mon Sep 17 00:00:00 2001 From: chraac Date: Tue, 18 Nov 2025 00:03:47 +0800 Subject: [PATCH 20/23] wip --- ggml/src/ggml-hexagon/htp/hvx-inverse.c | 6 +++--- ggml/src/ggml-hexagon/htp/hvx-utils.h | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.c b/ggml/src/ggml-hexagon/htp/hvx-inverse.c index 25dda0b729fbd..f4df866641c77 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-inverse.c +++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c @@ -38,13 +38,13 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - *p_vec_out++ = hvx_vec_inverse_fp32_guard_inf(*p_vec_in++); + *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++); } } else { #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard_inf(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in); } } @@ -53,7 +53,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const float * dstf = (float *) dst + num_elems_whole; HVX_Vector in = *(HVX_UVector *) srcf; - HVX_Vector out = hvx_vec_inverse_fp32_guard_inf(in); + HVX_Vector out = hvx_vec_inverse_fp32_guard(in); hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); } diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 241fb53506f1a..5429204ee45b1 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -726,7 +726,7 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { return Q6_Vsf_equals_Vqf32(r_qf); } -static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) { +static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) { static const float kInf = INFINITY; static const uint32_t kNanMask = 0x7fffffff; static const uint32_t kNanMin = 0x7f800000; @@ -959,7 +959,7 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) { return Q6_Vsf_equals_Vqf32(temp); } -static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard_inf(HVX_Vector v) { +static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) { static const float kMaxExp = -88.02f; // log(INF) const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp)); @@ -967,7 +967,6 @@ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard_inf(HVX_Vector v) { HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero()); - } static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { @@ -981,7 +980,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { - v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard_inf(v_src[i]); + v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]); } } From 37e9a1d197dcf85b7f41627c593c546dfdfe23ca Mon Sep 17 00:00:00 2001 From: chraac Date: Tue, 18 Nov 2025 10:25:04 +0800 Subject: [PATCH 21/23] wip --- ggml/src/ggml-hexagon/htp/hvx-exp.c | 4 ++-- ggml/src/ggml-hexagon/htp/hvx-inverse.c | 4 ++-- ggml/src/ggml-hexagon/htp/hvx-utils.h | 16 ++++++++-------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c index 27d2fff084b68..21552c8c5df6e 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-exp.c +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c @@ -51,7 +51,7 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int HVX_Vector * p_vec_in1 = (HVX_Vector *) src; HVX_Vector * p_vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++); @@ -61,7 +61,7 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int } } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.c b/ggml/src/ggml-hexagon/htp/hvx-inverse.c index f4df866641c77..953d3e6c16709 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-inverse.c +++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c @@ -36,12 +36,12 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const HVX_Vector * p_vec_in = (HVX_Vector *) src; HVX_Vector * p_vec_out = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++); } } else { -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in); diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 5429204ee45b1..80526ff28fb13 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -89,7 +89,7 @@ static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -113,7 +113,7 @@ static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -137,7 +137,7 @@ static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -162,7 +162,7 @@ static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -186,7 +186,7 @@ static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -210,7 +210,7 @@ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * rest uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { HVX_Vector v = vsrc[i]; vdst[i] = v; @@ -235,7 +235,7 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t uint32_t i = 0; -#pragma unroll(4) + #pragma unroll(4) for (; i < nvec; i++) { vdst[i] = velem; } @@ -978,7 +978,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * const HVX_Vector * restrict v_src = (HVX_Vector *) src; HVX_Vector * restrict v_dst = (HVX_Vector *) dst; -#pragma unroll(4) + #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]); } From 185dc20357407d367499cca631b695b2fe68d31b Mon Sep 17 00:00:00 2001 From: chraac Date: Tue, 18 Nov 2025 11:13:17 +0800 Subject: [PATCH 22/23] wip --- ggml/src/ggml-hexagon/htp/hvx-exp.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c index 21552c8c5df6e..d0735e9325e1c 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-exp.c +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c @@ -16,7 +16,7 @@ #include "hvx-utils.h" #include "ops-utils.h" -static inline HVX_Vector hvx_vec_exp_fp32_guard_inf(HVX_Vector in_vec) { +static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) { static const float kInf = INFINITY; static const float kMaxExp = 88.02f; // log(INF) @@ -55,9 +55,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++); - *p_vec_out++ = hvx_vec_exp_fp32_guard_inf(neg_vec_in); + *p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in); } else { - *p_vec_out++ = hvx_vec_exp_fp32_guard_inf(*p_vec_in1++); + *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++); } } } else { @@ -67,9 +67,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard_inf(neg_vec_in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in); } else { - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard_inf(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in); } } } @@ -83,9 +83,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - vec_out = hvx_vec_exp_fp32_guard_inf(neg_vec_in); + vec_out = hvx_vec_exp_fp32_guard(neg_vec_in); } else { - vec_out = hvx_vec_exp_fp32_guard_inf(in); + vec_out = hvx_vec_exp_fp32_guard(in); } hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out); From 6d88789ef851dd0c4127d2dcc3547b22f23ff4a1 Mon Sep 17 00:00:00 2001 From: chraac Date: Tue, 18 Nov 2025 11:41:36 +0800 Subject: [PATCH 23/23] fix output sign --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 80526ff28fb13..5f94645cde3b1 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -736,11 +736,10 @@ static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) { HVX_Vector out = hvx_vec_inverse_fp32(v_sf); - const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask); - const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin); - out = Q6_V_vand_VV(out, nan_mask); - - const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, out); + const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask); + const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin); + HVX_Vector masked_out = Q6_V_vand_VV(out, nan_mask); + const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out); return Q6_V_vmux_QVV(pred, out, Q6_V_vzero()); }