From ab75281b711929f3debcc0e4b5316e8eff49583f Mon Sep 17 00:00:00 2001
From: Hongrui Chen <chraac@gmail.com>
Date: Mon, 10 Nov 2025 23:10:54 +0800
Subject: [PATCH 01/23] refactor: use hvx_vec_exp_fp32_guard_inf for overflow
 handling in hvx_exp_f32

---
 ggml/src/ggml-hexagon/htp/hvx-exp.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c
index 19f6795083c1d..f9127251899b5 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-exp.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c
@@ -16,6 +16,19 @@
 #include "hvx-utils.h"
 #include "ops-utils.h"
 
+static inline HVX_Vector hvx_vec_exp_fp32_guard_inf(HVX_Vector in_vec) {
+    static const float kInf    = INFINITY;
+    static const float kMaxExp = 88.02f;  // log(INF)
+
+    const HVX_Vector     max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
+    const HVX_Vector     inf     = Q6_V_vsplat_R(*((uint32_t *) &kInf));
+    const HVX_VectorPred pred0   = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
+
+    HVX_Vector out = hvx_vec_exp_fp32(in_vec);
+
+    return Q6_V_vmux_QVV(pred0, inf, out);
+}
+
 void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) {
     int left_over       = num_elems & (VLEN_FP32 - 1);
     int num_elems_whole = num_elems - left_over;
@@ -38,25 +51,25 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
         HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
         HVX_Vector * p_vec_out = (HVX_Vector *) dst;
 
-        #pragma unroll(4)
+#pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             if (true == negate) {
                 HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
-                *p_vec_out++          = hvx_vec_exp_fp32(neg_vec_in);
+                *p_vec_out++          = hvx_vec_exp_fp32_guard_inf(neg_vec_in);
             } else {
-                *p_vec_out++ = hvx_vec_exp_fp32(*p_vec_in1++);
+                *p_vec_out++ = hvx_vec_exp_fp32_guard_inf(*p_vec_in1++);
             }
         }
     } else {
-        #pragma unroll(4)
+#pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
 
             if (true == negate) {
                 HVX_Vector neg_vec_in                    = hvx_vec_neg_fp32(in);
-                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(neg_vec_in);
+                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard_inf(neg_vec_in);
             } else {
-                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(in);
+                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard_inf(in);
             }
         }
     }
@@ -70,9 +83,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
         if (true == negate) {
             HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
 
-            vec_out = hvx_vec_exp_fp32(neg_vec_in);
+            vec_out = hvx_vec_exp_fp32_guard_inf(neg_vec_in);
         } else {
-            vec_out = hvx_vec_exp_fp32(in);
+            vec_out = hvx_vec_exp_fp32_guard_inf(in);
         }
 
         hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);

From 5aa4a8328d7635dc0e36987c88d74106bc13153c Mon Sep 17 00:00:00 2001
From: Hongrui Chen <chraac@gmail.com>
Date: Tue, 11 Nov 2025 00:10:29 +0800
Subject: [PATCH 02/23] feat: add fast sigmoid function with overflow guard for
 fp32

---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index b2ca8e88f464e..686c55fa99b64 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -934,6 +934,17 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
     return Q6_Vsf_equals_Vqf32(temp);
 }
 
+static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard_inf(HVX_Vector v) {
+    static const float kMaxExp = 88.02f;  // log(INF)
+
+    const HVX_Vector     max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
+    const HVX_VectorPred pred0   = Q6_Q_vcmp_gt_VsfVsf(v, max_exp);
+
+    HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
+
+    return Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out);
+}
+
 static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
     int step_of_1 = num_elems >> 5;
     int remaining = num_elems - step_of_1 * VLEN_FP32;
@@ -945,7 +956,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
 
     #pragma unroll(4)
     for (int i = 0; i < step_of_1; i++) {
-        v_dst[i] = hvx_vec_fast_sigmoid_fp32(v_src[i]);
+        v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard_inf(v_src[i]);
     }
 }
 

From a64154ce57198e87152abd44cb7fbdf8178cc533 Mon Sep 17 00:00:00 2001
From: Hongrui Chen <chraac@gmail.com>
Date: Tue, 11 Nov 2025 22:04:15 +0800
Subject: [PATCH 03/23] refactor: replace hvx_vec_inverse_fp32 with
 hvx_vec_inverse_fp32_guard_inf for improved overflow handling

---
 ggml/src/ggml-hexagon/htp/hvx-inverse.c | 10 ++++----
 ggml/src/ggml-hexagon/htp/hvx-utils.h   | 31 +++++++++++++++++--------
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.c b/ggml/src/ggml-hexagon/htp/hvx-inverse.c
index 4cf588a8781f1..25dda0b729fbd 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-inverse.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c
@@ -36,15 +36,15 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
         HVX_Vector * p_vec_in  = (HVX_Vector *) src;
         HVX_Vector * p_vec_out = (HVX_Vector *) dst;
 
-        #pragma unroll(4)
+#pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-            *p_vec_out++ = hvx_vec_inverse_fp32(*p_vec_in++);
+            *p_vec_out++ = hvx_vec_inverse_fp32_guard_inf(*p_vec_in++);
         }
     } else {
-        #pragma unroll(4)
+#pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in                            = *(HVX_UVector *) (src + i * SIZEOF_FP32);
-            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32(in);
+            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard_inf(in);
         }
     }
 
@@ -53,7 +53,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
         float *       dstf = (float *) dst + num_elems_whole;
 
         HVX_Vector in  = *(HVX_UVector *) srcf;
-        HVX_Vector out = hvx_vec_inverse_fp32(in);
+        HVX_Vector out = hvx_vec_inverse_fp32_guard_inf(in);
 
         hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
     }
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 686c55fa99b64..cc1de34e252e7 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -80,7 +80,7 @@ static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
-    #pragma unroll(4)
+#pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i]      = v;
@@ -104,7 +104,7 @@ static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
-    #pragma unroll(4)
+#pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i]      = v;
@@ -128,7 +128,7 @@ static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
-    #pragma unroll(4)
+#pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i]      = v;
@@ -153,7 +153,7 @@ static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
-    #pragma unroll(4)
+#pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i]      = v;
@@ -177,7 +177,7 @@ static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
-    #pragma unroll(4)
+#pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i]      = v;
@@ -201,7 +201,7 @@ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
-    #pragma unroll(4)
+#pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i]      = v;
@@ -226,7 +226,7 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t
 
     uint32_t i = 0;
 
-    #pragma unroll(4)
+#pragma unroll(4)
     for (; i < nvec; i++) {
         vdst[i] = velem;
     }
@@ -411,8 +411,8 @@ static inline HVX_Vector hvx_vec_fp32_reduce_sum_n(HVX_Vector in, unsigned int n
 
     HVX_Vector sum = in, sum_t;
     while (width < total) {
-        sum_t = Q6_V_vror_VR(sum, width);       // rotate right
-        sum   = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum
+        sum_t = Q6_V_vror_VR(sum, width);                               // rotate right
+        sum   = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t));  // elementwise sum
         width = width << 1;
     }
     return sum;
@@ -720,6 +720,17 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
     return Q6_Vsf_equals_Vqf32(r_qf);
 }
 
+static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) {
+    static const float kInf = INFINITY;
+
+    const HVX_Vector     inf   = Q6_V_vsplat_R(*((uint32_t *) &kInf));
+    const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);
+
+    HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
+
+    return Q6_V_vmux_QVV(pred0, out, Q6_V_vzero());
+}
+
 #define FAST_SIGMOID_LOG2F (0x3fb8aa3b)  // 1.442695022
 #define FAST_SIGMOID_C1    (0x3d009076)  // 0.03138777
 #define FAST_SIGMOID_C2    (0x3e8d74bd)  // 0.276281267
@@ -954,7 +965,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
     const HVX_Vector * restrict v_src = (HVX_Vector *) src;
     HVX_Vector * restrict v_dst       = (HVX_Vector *) dst;
 
-    #pragma unroll(4)
+#pragma unroll(4)
     for (int i = 0; i < step_of_1; i++) {
         v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard_inf(v_src[i]);
     }

From a8cdbcf0d2fa2077bd545ef901250d4ff60a0913 Mon Sep 17 00:00:00 2001
From: Hongrui Chen <chraac@gmail.com>
Date: Tue, 11 Nov 2025 23:46:40 +0800
Subject: [PATCH 04/23] feat: enhance hvx_add_scalar_f32 with overflow handling
 using infinity guard

---
 ggml/src/ggml-hexagon/htp/hvx-utils.c | 27 +++++++++++++++++++++------
 ggml/src/ggml-hexagon/htp/hvx-utils.h |  4 ++--
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c
index d3599bc9c1276..4a1233ef5a9f1 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c
@@ -401,6 +401,10 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
         FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
     }
 
+    static const float kInf = INFINITY;
+
+    const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf));
+
     HVX_Vector val_vec = hvx_vec_splat_fp32(val);
 
     if (0 == unaligned_loop) {
@@ -409,17 +413,24 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
 
         #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-            HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, val_vec);
-            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
+            HVX_Vector           in    = *vec_in1++;
+            const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, in);
+            HVX_Vector           v     = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
+            v                          = Q6_Vsf_equals_Vqf32(v);
+            v                          = Q6_V_vmux_QVV(pred0, inf, v);
+            *vec_out++                 = v;
         }
     } else {
         #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
 
-            HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
+            const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, in);
+            HVX_Vector           out   = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
+            out                        = Q6_Vsf_equals_Vqf32(out);
+            out                        = Q6_V_vmux_QVV(pred0, inf, out);
 
-            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
+            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = out;
         }
     }
 
@@ -429,8 +440,12 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
 
         HVX_Vector in = *(HVX_UVector *) srcf;
 
-        HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
-        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
+        const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, in);
+        HVX_Vector           out   = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
+        out                        = Q6_Vsf_equals_Vqf32(out);
+        out                        = Q6_V_vmux_QVV(pred0, inf, out);
+
+        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
     }
 }
 
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index cc1de34e252e7..bb8bef4bcb603 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -724,11 +724,11 @@ static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) {
     static const float kInf = INFINITY;
 
     const HVX_Vector     inf   = Q6_V_vsplat_R(*((uint32_t *) &kInf));
-    const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);
+    const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, v_sf);
 
     HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
 
-    return Q6_V_vmux_QVV(pred0, out, Q6_V_vzero());
+    return Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out);
 }
 
 #define FAST_SIGMOID_LOG2F (0x3fb8aa3b)  // 1.442695022

From ae42fb63304a3abb1798d5ed8e8a56e266fb9a63 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Fri, 14 Nov 2025 01:01:55 +0800
Subject: [PATCH 05/23] fix: guard fast sigmoid against large negative inputs
 (v < -88.02) instead of large positive ones

---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index bb8bef4bcb603..3fd8e5481feb6 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -946,10 +946,10 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
 }
 
 static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard_inf(HVX_Vector v) {
-    static const float kMaxExp = 88.02f;  // log(INF)
+    static const float kMaxExp = -88.02f;  // log(INF)
 
     const HVX_Vector     max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
-    const HVX_VectorPred pred0   = Q6_Q_vcmp_gt_VsfVsf(v, max_exp);
+    const HVX_VectorPred pred0   = Q6_Q_vcmp_gt_VsfVsf(max_exp, v);
 
     HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
 

From 39445ab0d1464092ec1c1005fb447fce6aaca09d Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Fri, 14 Nov 2025 11:29:48 +0800
Subject: [PATCH 06/23] add HVX_VectorAlias union

wip
---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 3fd8e5481feb6..8244fd55dae92 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -12,6 +12,15 @@
 #define VLEN_FP32   (VLEN / SIZEOF_FP32)
 #define VLEN_FP16   (VLEN / SIZEOF_FP16)
 
+typedef union {
+    HVX_Vector v;
+    uint8_t    b[VLEN];
+    uint16_t   h[VLEN_FP16];
+    uint32_t   w[VLEN_FP32];
+    __fp16     fp16[VLEN_FP16];
+    float      fp32[VLEN_FP32];
+} HVX_VectorAlias;
+
 static inline HVX_Vector hvx_vec_splat_fp32(float i) {
     union {
         float   f;
@@ -243,19 +252,16 @@ static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint3
 }
 
 static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
-    union {
-        HVX_Vector v;
-        __fp16 d[64];
-    } u = { .v = v };
+    HVX_VectorAlias u = { .v = v };
 
     const uint32_t n0 = n / 16;
     const uint32_t n1 = n % 16;
     int            i  = 0;
     for (; i < n0; i++) {
-        htp_dump_fp16_line(pref, u.d + (16 * i), 16);
+        htp_dump_fp16_line(pref, u.fp16 + (16 * i), 16);
     }
     if (n1) {
-        htp_dump_fp16_line(pref, u.d + (16 * i), n1);
+        htp_dump_fp16_line(pref, u.fp16 + (16 * i), n1);
     }
 }
 

From a589b611110d2968dc0af780e2b55c071fdfedb1 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sat, 15 Nov 2025 10:25:48 +0800
Subject: [PATCH 07/23] refactor: make src1_valid and opt_path const in
 glu_swiglu_fp32_per_thread

---
 ggml/src/ggml-hexagon/htp/act-ops.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 16044975d9253..1580107d87eda 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -106,20 +106,16 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
     t1 = HAP_perf_get_qtimer_count();
 
     int is_aligned = 1;
-    int opt_path   = 0;
     if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
         is_aligned = 0;
         FARF(HIGH, "swiglu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
     }
-    if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
-        opt_path = 1;
-    }
 
     const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
     const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
     uint8_t * restrict data_dst        = (uint8_t *) dst->data;
 
-    bool src1_valid = src1->ne[0];
+    const bool src1_valid = src1->ne[0];
     if (!src1_valid) {
         data_src1     = data_src0;
         src1_row_size = src0_row_size;
@@ -129,10 +125,9 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
     uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size);
     uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_row_size);
 
-    const int32_t swapped = op_params[1];
-
-    const int nc = (src1_valid) ? ne0 : ne0 / 2;
-
+    const int32_t swapped  = op_params[1];
+    const bool    opt_path = ((1 == is_aligned) && !(nb01 & (VLEN - 1)));
+    const int     nc       = (src1_valid) ? ne0 : ne0 / 2;
     for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
         const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
         const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size));
@@ -147,7 +142,7 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
             src1 += swapped ? 0 : nc;
         }
 
-        if (1 == opt_path) {
+        if (opt_path) {
             hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, nc);
             hvx_mul_mul_f32_opt((const uint8_t *) src0, (const uint8_t *) src0_spad_data, (const uint8_t *) src1,
                                 (uint8_t *) dst, nc);

From 6f57b9e2ab463d5e9b1ce5e258d068bb50e3c5f9 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sat, 15 Nov 2025 13:02:59 +0800
Subject: [PATCH 08/23] fix: improve handling of src1 tensor in
 glu_swiglu_fp32_per_thread function

---
 ggml/src/ggml-hexagon/htp/act-ops.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 1580107d87eda..6ed791bc7b89e 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -116,18 +116,22 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
     uint8_t * restrict data_dst        = (uint8_t *) dst->data;
 
     const bool src1_valid = src1->ne[0];
+    const int  nc         = (src1_valid) ? ne0 : ne0 / 2;
     if (!src1_valid) {
-        data_src1     = data_src0;
-        src1_row_size = src0_row_size;
+        const int32_t swapped = op_params[1];
+        data_src1             = data_src0;
+        src1_row_size         = src0_row_size;
+
+        const size_t nc_in_bytes = nc * SIZEOF_FP32;
+        data_src0 += swapped ? nc_in_bytes : 0;
+        data_src1 += swapped ? 0 : nc_in_bytes;
     }
 
     uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
     uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size);
     uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_row_size);
 
-    const int32_t swapped  = op_params[1];
-    const bool    opt_path = ((1 == is_aligned) && !(nb01 & (VLEN - 1)));
-    const int     nc       = (src1_valid) ? ne0 : ne0 / 2;
+    const bool opt_path = ((1 == is_aligned) && !(nb01 & (VLEN - 1)));
     for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
         const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
         const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size));
@@ -137,11 +141,6 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
             htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size);
         }
 
-        if (!src1_valid) {
-            src0 += swapped ? nc : 0;
-            src1 += swapped ? 0 : nc;
-        }
-
         if (opt_path) {
             hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, nc);
             hvx_mul_mul_f32_opt((const uint8_t *) src0, (const uint8_t *) src0_spad_data, (const uint8_t *) src1,

From db9e9308bb4d99e093288137a7a3ad723f882498 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sat, 15 Nov 2025 14:34:51 +0800
Subject: [PATCH 09/23] fix: compute nc from ne00 instead of ne0 in
 glu_swiglu_fp32_per_thread

---
 ggml/src/ggml-hexagon/htp/act-ops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 6ed791bc7b89e..01b3d8fec0489 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -116,7 +116,7 @@ static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
     uint8_t * restrict data_dst        = (uint8_t *) dst->data;
 
     const bool src1_valid = src1->ne[0];
-    const int  nc         = (src1_valid) ? ne0 : ne0 / 2;
+    const int  nc         = (src1_valid) ? ne00 : ne00 / 2;
     if (!src1_valid) {
         const int32_t swapped = op_params[1];
         data_src1             = data_src0;

From ce48af57b0ecdc38fb0c6102f10b4cf88181ce0a Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sun, 16 Nov 2025 10:40:02 +0800
Subject: [PATCH 10/23] refactor: flip predicate and mux operands in
 hvx_vec_fast_sigmoid_fp32_guard_inf

---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 8244fd55dae92..fa996a484941c 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -955,11 +955,11 @@ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard_inf(HVX_Vector v) {
     static const float kMaxExp = -88.02f;  // log(INF)
 
     const HVX_Vector     max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
-    const HVX_VectorPred pred0   = Q6_Q_vcmp_gt_VsfVsf(max_exp, v);
+    const HVX_VectorPred pred0   = Q6_Q_vcmp_gt_VsfVsf(v, max_exp);
 
     HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
 
-    return Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out);
+    return Q6_V_vmux_QVV(pred0, out, Q6_V_vzero());
 }
 
 static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {

From fc5f31fd23b43d911deb89462038b5a12d78a76c Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sun, 16 Nov 2025 11:14:37 +0800
Subject: [PATCH 11/23] mark HVX_VectorAlias as packed and aligned to VLEN

---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index fa996a484941c..33728fd615a39 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -19,7 +19,7 @@ typedef union {
     uint32_t   w[VLEN_FP32];
     __fp16     fp16[VLEN_FP16];
     float      fp32[VLEN_FP32];
-} HVX_VectorAlias;
+} __attribute__((packed)) __attribute__((aligned(VLEN))) HVX_VectorAlias;
 
 static inline HVX_Vector hvx_vec_splat_fp32(float i) {
     union {

From 5707384152b8dc853fa799c4e7f71e911046e80a Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sun, 16 Nov 2025 11:24:28 +0800
Subject: [PATCH 12/23] handle nan at inverse

---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 33728fd615a39..573377821c82b 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -728,13 +728,15 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
 
 static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) {
     static const float kInf = INFINITY;
+    static const float kNan = NAN;
 
-    const HVX_Vector     inf   = Q6_V_vsplat_R(*((uint32_t *) &kInf));
-    const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, v_sf);
+    const HVX_Vector     inf      = Q6_V_vsplat_R(*((uint32_t *) &kInf));
+    const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, v_sf);
+    const HVX_VectorPred pred_nan = Q6_Q_vcmp_eq_VwVw(Q6_V_vzero(), v_sf);
 
     HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
-
-    return Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out);
+    out            = Q6_V_vmux_QVV(pred_nan, inf, out);
+    return Q6_V_vmux_QVV(pred_inf, Q6_V_vzero(), out);
 }
 
 #define FAST_SIGMOID_LOG2F (0x3fb8aa3b)  // 1.442695022

From 54235e39453efb409ae16ffde9c9d8a1117756cb Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sun, 16 Nov 2025 23:07:41 +0800
Subject: [PATCH 13/23] fix: compute nc from ne00 instead of ne0 in
 glu_swiglu_oai_fp32_per_thread

---
 ggml/src/ggml-hexagon/htp/act-ops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 01b3d8fec0489..87b09cca3afef 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -212,7 +212,7 @@ static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
     const float   alpha   = ((const float *) (op_params))[2];
     const float   limit   = ((const float *) (op_params))[3];
 
-    const int nc = (src1_valid) ? ne0 : ne0 / 2;
+    const int nc = (src1_valid) ? ne00 : ne00 / 2;
 
     for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
         const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));

From 38594657ad6c99cad0786ba2bacd2171132029e9 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Mon, 17 Nov 2025 00:36:53 +0800
Subject: [PATCH 14/23] fix: negate fp16/fp32 vectors by flipping the sign bit
 (xor instead of or)

---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 573377821c82b..b88f721460791 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -19,7 +19,7 @@ typedef union {
     uint32_t   w[VLEN_FP32];
     __fp16     fp16[VLEN_FP16];
     float      fp32[VLEN_FP32];
-} __attribute__((packed)) __attribute__((aligned(VLEN))) HVX_VectorAlias;
+} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias;
 
 static inline HVX_Vector hvx_vec_splat_fp32(float i) {
     union {
@@ -497,7 +497,7 @@ static inline HVX_Vector hvx_vec_abs_fp16(HVX_Vector v) {
 static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) {
     // neg by setting the fp16 sign bit
     HVX_Vector mask = Q6_Vh_vsplat_R(0x8000);
-    return Q6_V_vor_VV(v, mask);
+    return Q6_V_vxor_VV(v, mask);
 }
 
 static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) {
@@ -512,7 +512,7 @@ static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) {
 #else
     // neg by setting the fp32 sign bit
     HVX_Vector mask = Q6_V_vsplat_R(0x80000000);
-    return Q6_V_vor_VV(v, mask);
+    return Q6_V_vxor_VV(v, mask);
 #endif  // __HTP_ARCH__ > 75
 }
 

From 014ad77333d2b6773dfeb5552b5294041e14ed60 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Mon, 17 Nov 2025 01:09:14 +0800
Subject: [PATCH 15/23] fix: return NaN instead of inf for zero input in
 hvx_vec_inverse_fp32_guard_inf

---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index b88f721460791..db103c368ea79 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -731,11 +731,12 @@ static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) {
     static const float kNan = NAN;
 
     const HVX_Vector     inf      = Q6_V_vsplat_R(*((uint32_t *) &kInf));
+    const HVX_Vector     nan      = Q6_V_vsplat_R(*((uint32_t *) &kNan));
     const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, v_sf);
     const HVX_VectorPred pred_nan = Q6_Q_vcmp_eq_VwVw(Q6_V_vzero(), v_sf);
 
     HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
-    out            = Q6_V_vmux_QVV(pred_nan, inf, out);
+    out            = Q6_V_vmux_QVV(pred_nan, nan, out);
     return Q6_V_vmux_QVV(pred_inf, Q6_V_vzero(), out);
 }
 

From 8c374577d8ebd4a080f8085555c9617d513c06b3 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Mon, 17 Nov 2025 18:16:13 +0800
Subject: [PATCH 16/23] rename

---
 ggml/src/ggml-hexagon/htp/hvx-utils.c | 36 +++++++++++++--------------
 ggml/src/ggml-hexagon/htp/hvx-utils.h |  4 +--
 2 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c
index 4a1233ef5a9f1..5492a52f71f91 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c
@@ -401,11 +401,9 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
         FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
     }
 
-    static const float kInf = INFINITY;
-
-    const HVX_Vector inf = Q6_V_vsplat_R(*((uint32_t *) &kInf));
-
-    HVX_Vector val_vec = hvx_vec_splat_fp32(val);
+    static const float kInf    = INFINITY;
+    const HVX_Vector   inf     = Q6_V_vsplat_R(*((uint32_t *) &kInf));
+    HVX_Vector         val_vec = hvx_vec_splat_fp32(val);
 
     if (0 == unaligned_loop) {
         HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
@@ -413,22 +411,22 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
 
         #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-            HVX_Vector           in    = *vec_in1++;
-            const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, in);
-            HVX_Vector           v     = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
-            v                          = Q6_Vsf_equals_Vqf32(v);
-            v                          = Q6_V_vmux_QVV(pred0, inf, v);
-            *vec_out++                 = v;
+            HVX_Vector           in       = *vec_in1++;
+            const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in);
+            HVX_Vector           v        = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
+            v                             = Q6_Vsf_equals_Vqf32(v);
+            v                             = Q6_V_vmux_QVV(pred_inf, inf, v);
+            *vec_out++                    = v;
         }
     } else {
         #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
 
-            const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, in);
-            HVX_Vector           out   = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
-            out                        = Q6_Vsf_equals_Vqf32(out);
-            out                        = Q6_V_vmux_QVV(pred0, inf, out);
+            const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in);
+            HVX_Vector           out      = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
+            out                           = Q6_Vsf_equals_Vqf32(out);
+            out                           = Q6_V_vmux_QVV(pred_inf, inf, out);
 
             *(HVX_UVector *) (dst + i * SIZEOF_FP32) = out;
         }
@@ -440,10 +438,10 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
 
         HVX_Vector in = *(HVX_UVector *) srcf;
 
-        const HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(inf, in);
-        HVX_Vector           out   = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
-        out                        = Q6_Vsf_equals_Vqf32(out);
-        out                        = Q6_V_vmux_QVV(pred0, inf, out);
+        const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, in);
+        HVX_Vector           out      = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
+        out                           = Q6_Vsf_equals_Vqf32(out);
+        out                           = Q6_V_vmux_QVV(pred_inf, inf, out);
 
         hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
     }
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index db103c368ea79..e900b35f439a1 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -957,8 +957,8 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
 static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard_inf(HVX_Vector v) {
     static const float kMaxExp = -88.02f;  // log(INF)
 
-    const HVX_Vector     max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
-    const HVX_VectorPred pred0   = Q6_Q_vcmp_gt_VsfVsf(v, max_exp);
+    const HVX_Vector     max_exp  = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
+    const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp);
 
     HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
 

From 33a050e76360e23edd5680007e84704a538e9730 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Mon, 17 Nov 2025 18:17:34 +0800
Subject: [PATCH 17/23] fix hvx_vec_inverse_fp32_guard_inf to handle infinity
 and NaN cases correctly

---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index e900b35f439a1..9338b383cb39c 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -728,16 +728,13 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
 
 static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) {
     static const float kInf = INFINITY;
-    static const float kNan = NAN;
 
     const HVX_Vector     inf      = Q6_V_vsplat_R(*((uint32_t *) &kInf));
-    const HVX_Vector     nan      = Q6_V_vsplat_R(*((uint32_t *) &kNan));
-    const HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(inf, v_sf);
+    const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);
     const HVX_VectorPred pred_nan = Q6_Q_vcmp_eq_VwVw(Q6_V_vzero(), v_sf);
 
     HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
-    out            = Q6_V_vmux_QVV(pred_nan, nan, out);
-    return Q6_V_vmux_QVV(pred_inf, Q6_V_vzero(), out);
+    return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero());
 }
 
 #define FAST_SIGMOID_LOG2F (0x3fb8aa3b)  // 1.442695022
@@ -961,8 +958,8 @@ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard_inf(HVX_Vector v) {
     const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp);
 
     HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
+    return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero());
 
-    return Q6_V_vmux_QVV(pred0, out, Q6_V_vzero());
 }
 
 static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {

From 83884a5c2fb04280ab8f8e01da09e9c95a6e4c6b Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Mon, 17 Nov 2025 20:07:34 +0800
Subject: [PATCH 18/23] use hvx_vec_splat_fp32 for float constants, drop
 unused pred_nan

---
 ggml/src/ggml-hexagon/htp/hvx-exp.c   | 4 ++--
 ggml/src/ggml-hexagon/htp/hvx-utils.c | 2 +-
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 3 +--
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c
index f9127251899b5..27d2fff084b68 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-exp.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c
@@ -20,8 +20,8 @@ static inline HVX_Vector hvx_vec_exp_fp32_guard_inf(HVX_Vector in_vec) {
     static const float kInf    = INFINITY;
     static const float kMaxExp = 88.02f;  // log(INF)
 
-    const HVX_Vector     max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
-    const HVX_Vector     inf     = Q6_V_vsplat_R(*((uint32_t *) &kInf));
+    const HVX_Vector     max_exp = hvx_vec_splat_fp32(kMaxExp);
+    const HVX_Vector     inf     = hvx_vec_splat_fp32(kInf);
     const HVX_VectorPred pred0   = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
 
     HVX_Vector out = hvx_vec_exp_fp32(in_vec);
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.c b/ggml/src/ggml-hexagon/htp/hvx-utils.c
index 5492a52f71f91..e02b1d9099629 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.c
@@ -402,7 +402,7 @@ void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t *
     }
 
     static const float kInf    = INFINITY;
-    const HVX_Vector   inf     = Q6_V_vsplat_R(*((uint32_t *) &kInf));
+    const HVX_Vector   inf     = hvx_vec_splat_fp32(kInf);
     HVX_Vector         val_vec = hvx_vec_splat_fp32(val);
 
     if (0 == unaligned_loop) {
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 9338b383cb39c..17900f92f5cec 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -729,9 +729,8 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
 static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) {
     static const float kInf = INFINITY;
 
-    const HVX_Vector     inf      = Q6_V_vsplat_R(*((uint32_t *) &kInf));
+    const HVX_Vector     inf      = hvx_vec_splat_fp32(kInf);
     const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);
-    const HVX_VectorPred pred_nan = Q6_Q_vcmp_eq_VwVw(Q6_V_vzero(), v_sf);
 
     HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
     return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero());

From f7662f3b3cd60f457d8451ce8b071d2b472b001c Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Mon, 17 Nov 2025 23:49:12 +0800
Subject: [PATCH 19/23] fix hvx_vec_inverse_fp32_guard_inf to handle NaN cases
 correctly

---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 17900f92f5cec..241fb53506f1a 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -727,13 +727,22 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
 }
 
 static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) {
-    static const float kInf = INFINITY;
+    static const float    kInf     = INFINITY;
+    static const uint32_t kNanMask = 0x7fffffff;
+    static const uint32_t kNanMin  = 0x7f800000;
 
     const HVX_Vector     inf      = hvx_vec_splat_fp32(kInf);
     const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);
 
     HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
-    return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero());
+
+    const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask);
+    const HVX_Vector nan_min  = Q6_V_vsplat_R(kNanMin);
+    out                       = Q6_V_vand_VV(out, nan_mask);
+
+    const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, out);
+
+    return Q6_V_vmux_QVV(pred, out, Q6_V_vzero());
 }
 
 #define FAST_SIGMOID_LOG2F (0x3fb8aa3b)  // 1.442695022

From f6d7f3c5f2528114853b25e418f55a45d38d19d3 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Tue, 18 Nov 2025 00:03:47 +0800
Subject: [PATCH 20/23] drop _inf suffix from inverse and fast sigmoid guard
 helpers

---
 ggml/src/ggml-hexagon/htp/hvx-inverse.c | 6 +++---
 ggml/src/ggml-hexagon/htp/hvx-utils.h   | 7 +++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.c b/ggml/src/ggml-hexagon/htp/hvx-inverse.c
index 25dda0b729fbd..f4df866641c77 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-inverse.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c
@@ -38,13 +38,13 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
 
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-            *p_vec_out++ = hvx_vec_inverse_fp32_guard_inf(*p_vec_in++);
+            *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++);
         }
     } else {
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in                            = *(HVX_UVector *) (src + i * SIZEOF_FP32);
-            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard_inf(in);
+            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in);
         }
     }
 
@@ -53,7 +53,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
         float *       dstf = (float *) dst + num_elems_whole;
 
         HVX_Vector in  = *(HVX_UVector *) srcf;
-        HVX_Vector out = hvx_vec_inverse_fp32_guard_inf(in);
+        HVX_Vector out = hvx_vec_inverse_fp32_guard(in);
 
         hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
     }
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 241fb53506f1a..5429204ee45b1 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -726,7 +726,7 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
     return Q6_Vsf_equals_Vqf32(r_qf);
 }
 
-static inline HVX_Vector hvx_vec_inverse_fp32_guard_inf(HVX_Vector v_sf) {
+static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) {
     static const float    kInf     = INFINITY;
     static const uint32_t kNanMask = 0x7fffffff;
     static const uint32_t kNanMin  = 0x7f800000;
@@ -959,7 +959,7 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
     return Q6_Vsf_equals_Vqf32(temp);
 }
 
-static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard_inf(HVX_Vector v) {
+static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) {
     static const float kMaxExp = -88.02f;  // log(INF)
 
     const HVX_Vector     max_exp  = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
@@ -967,7 +967,6 @@ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard_inf(HVX_Vector v) {
 
     HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
     return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero());
-
 }
 
 static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
@@ -981,7 +980,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
 
 #pragma unroll(4)
     for (int i = 0; i < step_of_1; i++) {
-        v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard_inf(v_src[i]);
+        v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]);
     }
 }
 

From 37e9a1d197dcf85b7f41627c593c546dfdfe23ca Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Tue, 18 Nov 2025 10:25:04 +0800
Subject: [PATCH 21/23] indent #pragma unroll(4) to match surrounding code

---
 ggml/src/ggml-hexagon/htp/hvx-exp.c     |  4 ++--
 ggml/src/ggml-hexagon/htp/hvx-inverse.c |  4 ++--
 ggml/src/ggml-hexagon/htp/hvx-utils.h   | 16 ++++++++--------
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c
index 27d2fff084b68..21552c8c5df6e 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-exp.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c
@@ -51,7 +51,7 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
         HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
         HVX_Vector * p_vec_out = (HVX_Vector *) dst;
 
-#pragma unroll(4)
+        #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             if (true == negate) {
                 HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
@@ -61,7 +61,7 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
             }
         }
     } else {
-#pragma unroll(4)
+        #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
 
diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.c b/ggml/src/ggml-hexagon/htp/hvx-inverse.c
index f4df866641c77..953d3e6c16709 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-inverse.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c
@@ -36,12 +36,12 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
         HVX_Vector * p_vec_in  = (HVX_Vector *) src;
         HVX_Vector * p_vec_out = (HVX_Vector *) dst;
 
-#pragma unroll(4)
+        #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++);
         }
     } else {
-#pragma unroll(4)
+        #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in                            = *(HVX_UVector *) (src + i * SIZEOF_FP32);
             *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in);
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 5429204ee45b1..80526ff28fb13 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -89,7 +89,7 @@ static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
-#pragma unroll(4)
+    #pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i]      = v;
@@ -113,7 +113,7 @@ static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
-#pragma unroll(4)
+    #pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i]      = v;
@@ -137,7 +137,7 @@ static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
-#pragma unroll(4)
+    #pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i]      = v;
@@ -162,7 +162,7 @@ static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
-#pragma unroll(4)
+    #pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i]      = v;
@@ -186,7 +186,7 @@ static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
-#pragma unroll(4)
+    #pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i]      = v;
@@ -210,7 +210,7 @@ static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * rest
 
     uint32_t i = 0;
 
-#pragma unroll(4)
+    #pragma unroll(4)
     for (; i < nvec; i++) {
         HVX_Vector v = vsrc[i];
         vdst[i]      = v;
@@ -235,7 +235,7 @@ static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t
 
     uint32_t i = 0;
 
-#pragma unroll(4)
+    #pragma unroll(4)
     for (; i < nvec; i++) {
         vdst[i] = velem;
     }
@@ -978,7 +978,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
     const HVX_Vector * restrict v_src = (HVX_Vector *) src;
     HVX_Vector * restrict v_dst       = (HVX_Vector *) dst;
 
-#pragma unroll(4)
+    #pragma unroll(4)
     for (int i = 0; i < step_of_1; i++) {
         v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]);
     }

From 185dc20357407d367499cca631b695b2fe68d31b Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Tue, 18 Nov 2025 11:13:17 +0800
Subject: [PATCH 22/23] drop _inf suffix from hvx_vec_exp_fp32_guard_inf

---
 ggml/src/ggml-hexagon/htp/hvx-exp.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c
index 21552c8c5df6e..d0735e9325e1c 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-exp.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c
@@ -16,7 +16,7 @@
 #include "hvx-utils.h"
 #include "ops-utils.h"
 
-static inline HVX_Vector hvx_vec_exp_fp32_guard_inf(HVX_Vector in_vec) {
+static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) {
     static const float kInf    = INFINITY;
     static const float kMaxExp = 88.02f;  // log(INF)
 
@@ -55,9 +55,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             if (true == negate) {
                 HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
-                *p_vec_out++          = hvx_vec_exp_fp32_guard_inf(neg_vec_in);
+                *p_vec_out++          = hvx_vec_exp_fp32_guard(neg_vec_in);
             } else {
-                *p_vec_out++ = hvx_vec_exp_fp32_guard_inf(*p_vec_in1++);
+                *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++);
             }
         }
     } else {
@@ -67,9 +67,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
 
             if (true == negate) {
                 HVX_Vector neg_vec_in                    = hvx_vec_neg_fp32(in);
-                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard_inf(neg_vec_in);
+                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in);
             } else {
-                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard_inf(in);
+                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in);
             }
         }
     }
@@ -83,9 +83,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
         if (true == negate) {
             HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
 
-            vec_out = hvx_vec_exp_fp32_guard_inf(neg_vec_in);
+            vec_out = hvx_vec_exp_fp32_guard(neg_vec_in);
         } else {
-            vec_out = hvx_vec_exp_fp32_guard_inf(in);
+            vec_out = hvx_vec_exp_fp32_guard(in);
         }
 
         hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);

From 6d88789ef851dd0c4127d2dcc3547b22f23ff4a1 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Tue, 18 Nov 2025 11:41:36 +0800
Subject: [PATCH 23/23] fix output sign

---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 80526ff28fb13..5f94645cde3b1 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -736,11 +736,10 @@ static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) {
 
     HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
 
-    const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask);
-    const HVX_Vector nan_min  = Q6_V_vsplat_R(kNanMin);
-    out                       = Q6_V_vand_VV(out, nan_mask);
-
-    const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, out);
+    const HVX_Vector     nan_mask   = Q6_V_vsplat_R(kNanMask);
+    const HVX_Vector     nan_min    = Q6_V_vsplat_R(kNanMin);
+    HVX_Vector           masked_out = Q6_V_vand_VV(out, nan_mask);
+    const HVX_VectorPred pred       = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out);
 
     return Q6_V_vmux_QVV(pred, out, Q6_V_vzero());
 }