diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index c99b6a0d18e..62a2a210971 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -1092,8 +1092,7 @@ static void matmul(struct htp_matmul_type * mt,
     uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith;
     uint8_t * restrict src1_data = src1_spad->data;
 
-    volatile uint64_t t1, t2;
-    t1 = HAP_perf_get_qtimer_count();
+    PROFILER_START(matmul);
 
     const uint8_t * restrict src0_row = (const uint8_t *) src0->data;
 
@@ -1144,12 +1143,9 @@ static void matmul(struct htp_matmul_type * mt,
         }
     }
 
-    t2 = HAP_perf_get_qtimer_count();
-
-    FARF(HIGH, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth,
-         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1],
-         src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
-         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    PROFILER_END(matmul, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith,
+                 nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
+                 src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
 }
 
 // q8x4x2 src1 tensor is already in VTCM spad
@@ -1190,8 +1186,7 @@ static void matvec(struct htp_matmul_type * mt,
     uint8_t * spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith;
     uint8_t * src1_data = src1_spad->data;
 
-    uint64_t t1, t2;
-    t1 = HAP_perf_get_qtimer_count();
+    PROFILER_START(matvec);
 
     float * tmp = (float *) spad_dst;
 
@@ -1236,12 +1231,9 @@ static void matvec(struct htp_matmul_type * mt,
 
     hvx_copy_fp32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row);
 
-    t2 = HAP_perf_get_qtimer_count();
-
-    FARF(HIGH, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth,
-         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1],
-         src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
-         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    PROFILER_END(matvec, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith,
+                 nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
+                 src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
 }
 
 #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ids->ne[0] * ids->ne[1] + (i1)]
 
@@ -1267,8 +1259,7 @@ static void matmul_id(struct htp_matmul_type * mt,
                       dma_queue * dma_queue) {
     htp_matmul_preamble;
 
-    uint64_t t1, t2;
-    t1 = HAP_perf_get_qtimer_count();
+    PROFILER_START(matmul_id);
 
     const uint32_t src0_nrows = ne01;  // src0 rows per expert
     const uint32_t src1_nrows = ne11;
 
@@ -1373,12 +1364,11 @@ static void matmul_id(struct htp_matmul_type * mt,
         }
     }
 
-    t2 = HAP_perf_get_qtimer_count();
-
-    FARF(HIGH, "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type,
-         ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
-         src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1],
-         dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    PROFILER_END(matmul_id,
+                 "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n",
+                 mt->type, ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row,
+                 src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3],
+                 dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
 }
 
 // q8x4 src1 tensor is already in VTCM spad
@@ -1397,8 +1387,7 @@ static void matvec_id(struct htp_matmul_type * mt,
                       dma_queue * dma_queue) {
     htp_matmul_preamble;
 
-    uint64_t t1, t2;
-    t1 = HAP_perf_get_qtimer_count();
+    PROFILER_START(matvec_id);
 
     const uint32_t src0_nrows = ne01;  // src0 rows per expert
 
@@ -1473,12 +1462,11 @@ static void matvec_id(struct htp_matmul_type * mt,
         }
     }
 
-    t2 = HAP_perf_get_qtimer_count();
-
-    FARF(HIGH, "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type,
-         ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0],
-         src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0],
-         dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    PROFILER_END(matvec_id,
+                 "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n",
+                 mt->type, ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row,
+                 src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3],
+                 dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
 }
 
 // *** matmul in fp16
 
@@ -1495,8 +1483,7 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
                            dma_queue * dma_queue) {
     htp_matmul_preamble;
 
-    uint64_t t1, t2;
-    t1 = HAP_perf_get_qtimer_count();
+    PROFILER_START(matmul_f16_f32);
 
     const size_t src0_row_size = sizeof(__fp16) * ne00;
     const size_t src1_row_size = sizeof(float) * ne10;
 
@@ -1575,12 +1562,10 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0,
         }
     }
 
-    t2 = HAP_perf_get_qtimer_count();
-
-    FARF(HIGH, "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
-         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end, src1->ne[0],
-         src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
-         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    PROFILER_END(matmul_f16_f32,
+                 "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
+                 src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end,
+                 src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
 }
 
 // *** dynamic quant
 
@@ -1662,7 +1647,7 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src,
                                  uint32_t nth,
                                  uint32_t ith,
                                  uint32_t nrows_per_thread) {
-    uint64_t t1 = HAP_perf_get_qtimer_count();
+    PROFILER_START(quantize_fp32_q8x4);
 
     const uint32_t ne0 = src->ne[0];
     const uint32_t ne1 = src->ne[1];
 
@@ -1694,10 +1679,8 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src,
         src_data += src_row_size;
     }
 
-    uint64_t t2 = HAP_perf_get_qtimer_count();
-
-    FARF(HIGH, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first,
-         ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    PROFILER_END(quantize_fp32_q8x4, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith,
+                 nth, nrows, ir_first, ir_last, src_row_size, dst_row_size);
 }
 
 static void htp_quantize_fp32_q8x4x2(unsigned int n, unsigned int i, void * data) {
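All the call sites in matmul-ops.c follow the same shape. Roughly, the repeated hand-rolled timing block collapses into the macro pair — a sketch of the before/after pattern, not literal code from either version:

```c
// Before: each op carried its own qtimer bookkeeping.
uint64_t t1 = HAP_perf_get_qtimer_count();
// ... op body ...
uint64_t t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "op ... usec %u\n", /* op args, */ (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));

// After: the macro pair hides the timestamps; the elapsed-usec value is
// appended automatically as the last vararg, which is why every call site
// keeps the trailing "usec %u" in its format string.
PROFILER_START(op);
// ... op body ...
PROFILER_END(op, "op ... usec %u\n" /* , op args */);
```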
diff --git a/ggml/src/ggml-hexagon/htp/ops-utils.h b/ggml/src/ggml-hexagon/htp/ops-utils.h
index af9c3305f61..5e5c48afda3 100644
--- a/ggml/src/ggml-hexagon/htp/ops-utils.h
+++ b/ggml/src/ggml-hexagon/htp/ops-utils.h
@@ -146,4 +146,11 @@ static inline void htp_dump_f16(char * pref, const __fp16 * x, uint32_t n) {
     }
 }
 
+#define PROFILER_START(name) const uint64_t name##_start_cycles = HAP_perf_get_qtimer_count()
+#define PROFILER_END(name, ...) \
+    do { \
+        const uint64_t name##_end_cycles = HAP_perf_get_qtimer_count(); \
+        FARF(HIGH, __VA_ARGS__, (unsigned) HAP_perf_qtimer_count_to_us(name##_end_cycles - name##_start_cycles)); \
+    } while (0)
+
 #endif /* OPS_UTILS_H */
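For reference, here is roughly what a `PROFILER_START`/`PROFILER_END` pair expands to at a call site, hand-expanded from the macros above (the `matvec` name and placeholder format string are just examples):

```c
// PROFILER_START(matvec);
const uint64_t matvec_start_cycles = HAP_perf_get_qtimer_count();

// ... op body ...

// PROFILER_END(matvec, "matvec ... usec %u\n" /* , args */);
do {
    const uint64_t matvec_end_cycles = HAP_perf_get_qtimer_count();
    // The elapsed microseconds become the final FARF argument, matching the
    // trailing %u that each format string retains.
    FARF(HIGH, "matvec ... usec %u\n", /* args, */
         (unsigned) HAP_perf_qtimer_count_to_us(matvec_end_cycles - matvec_start_cycles));
} while (0);
```

The token-pasted `name` keeps the start/end locals unique per region, so two instrumented regions can coexist in one scope, and the `do { ... } while (0)` keeps the expansion a single statement that is safe inside unbraced `if`/`else` bodies.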
diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c
index 00419bcba6b..a48cbf43f23 100644
--- a/ggml/src/ggml-hexagon/htp/rope-ops.c
+++ b/ggml/src/ggml-hexagon/htp/rope-ops.c
@@ -73,15 +73,15 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
     return (1 - MIN(1, MAX(0, y)));
 }
 
-static void rope_cache_init(const float theta_base,
-                            float freq_scale,
-                            const float * freq_factors,
-                            float * corr_dims,
-                            uint32_t ne0,
-                            float ext_factor,
-                            float mscale,
-                            float * cache,
-                            float theta_scale) {
+static void rope_cache_init(const float    theta_base,
+                            const float    freq_scale,
+                            const float *  freq_factors,
+                            float *        corr_dims,
+                            const uint32_t ne0,
+                            const float    ext_factor,
+                            const float    mscale,
+                            float *        cache,
+                            const float    theta_scale) {
     // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
     float theta = theta_base;
 
@@ -92,18 +92,19 @@ static void rope_cache_init(const float theta_base,
         // Get n-d rotational scaling corrected for extrapolation
         float theta_interp = freq_scale * theta_extrap;
-        float theta2 = theta_interp;
+        float theta_final  = theta_interp;
+        float mscale_final = mscale;
 
         if (ext_factor != 0.0f) {
             float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-            theta2 = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+            theta_final    = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
 
             // Get n-d magnitude scaling corrected for interpolation
-            mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+            mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale);
         }
 
-        cache[i0 + 0] = cosf(theta2) * mscale;
-        cache[i0 + 1] = sinf(theta2) * mscale;
+        cache[i0 + 0] = cosf(theta_final) * mscale_final;
+        cache[i0 + 1] = sinf(theta_final) * mscale_final;
 
         theta *= theta_scale;
     }
@@ -151,9 +152,9 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context
 }
 
 static void hvx_calc_rope_neox_f32(const float * restrict src0,
-                                   float * restrict dst,
-                                   const int num_elems,
-                                   const float * restrict theta_cache) {
+                                   float * restrict       dst,
+                                   const int              num_elems,
+                                   const float * restrict theta_cache) {
     // for (int i = 0; i < num_elems; i += 2) {
     //const float cos_theta = theta_cache[i + 0];
     //const float sin_theta = theta_cache[i + 1];
@@ -192,7 +193,7 @@ static void hvx_calc_rope_neox_f32(const float * restrict src0,
         HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
         HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
 
-        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
+        *(HVX_Vector *) dst_curr               = Q6_Vsf_equals_Vqf32(v4);
         *(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5);
 
         src0_curr += VLEN;
@@ -259,7 +260,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                          const uint32_t ir1,
                          int nth,
                          int ith,
-                         int opt_path) {
+                         const int opt_path) {
     struct htp_ops_context * octx = rope_ctx->octx;
 
     const struct htp_tensor * src0 = &octx->src0;
@@ -267,8 +268,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
     const struct htp_tensor * src2 = &octx->src2;
     struct htp_tensor *       dst  = &octx->dst;
 
-    const int32_t mode = rope_ctx->mode;
-    const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;
+    const int32_t mode    = rope_ctx->mode;
+    const bool    is_neox = mode & HTP_ROPE_TYPE_NEOX;
 
     htp_rope_preamble;
 
@@ -281,8 +282,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
         freq_factors = (const float *) src2->data;
     }
 
-    int ir = 0;
-
+    const uint32_t i1_end       = MIN(ir1, ne1);
+    const int32_t  half_dims    = rope_ctx->n_dims / 2;
+    const size_t   remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float);
     for (uint32_t i3 = 0; i3 < ne3; i3++) {      // batch
         for (uint32_t i2 = 0; i2 < ne2; i2++) {  // seq-len
             const int32_t p = pos[i2];
 
@@ -290,14 +292,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
             rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor,
                             rope_ctx->attn_factor, wp0, rope_ctx->theta_scale);
 
-            for (uint32_t i1 = 0; i1 < ne1; i1++) {  // attn-heads
-                if (ir++ < ir0) {
-                    continue;
-                }
-                if (ir > ir1) {
-                    break;
-                }
-
+            for (uint32_t i1 = ir0; i1 < i1_end; i1++) {  // attn-heads
                 const float * src      = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01);
                 float *       dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1);
 
@@ -310,6 +305,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                     } else {
                         hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
                     }
+
+                    src_loc      += rope_ctx->n_dims;
+                    dst_data_loc += rope_ctx->n_dims;
                 } else {
                     for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
                         const float cos_theta = wp0[i0 + 0];
@@ -317,10 +315,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
 
                         if (is_neox) {
                             const float x0 = src_loc[0];
-                            const float x1 = src_loc[rope_ctx->n_dims/2];
+                            const float x1 = src_loc[half_dims];
 
-                            dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
-                            dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta;
+                            dst_data_loc[0]         = x0 * cos_theta - x1 * sin_theta;
+                            dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta;
 
                             src_loc += 1;
                             dst_data_loc += 1;
@@ -335,15 +333,13 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                             dst_data_loc += 2;
                         }
                     }
-                }
 
-                for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) {
-                    dst_data_loc[0] = src_loc[0];
-                    dst_data_loc[1] = src_loc[1];
-
-                    src_loc += 2;
-                    dst_data_loc += 2;
+                    src_loc += (is_neox ? half_dims : 0);
+                    dst_data_loc += (is_neox ? half_dims : 0);
                 }
+
+                // TODO: use simd to speed up the remaining elements copy
+                memcpy(dst_data_loc, src_loc, remain_bytes);
             }
         }
     }
@@ -369,8 +365,7 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int
         return;
     }
 
-    uint64_t t1, t2;
-    t1 = HAP_perf_get_qtimer_count();
+    PROFILER_START(rope_job_f32);
 
     int is_aligned = 1;
     int opt_path   = 0;
@@ -385,10 +380,8 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int
 
     rope_hex_f32(rope_ctx, src0_start_row, src0_end_row, nth, ith, opt_path);
 
-    t2 = HAP_perf_get_qtimer_count();
-
-    FARF(HIGH, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, src0_end_row,
-         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
+    PROFILER_END(rope_job_f32, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row,
+                 src0_end_row);
 }
 
 static void rope_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) {
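The `rope_hex_f32` rewrite does two things: it replaces the shared `ir` row counter (skip/`break` inside the head loop) with direct iteration over the thread's assigned row range, and it replaces the trailing two-floats-at-a-time copy loop with a single `memcpy` of the pass-through tail per row. A minimal host-side sketch of the new loop shape, with hypothetical names (`rope_rows_sketch`, `rotate_pairs`) and assuming contiguous rows:

```c
#include <stdint.h>
#include <string.h>

// Placeholder for the per-row rotation of the first n_dims elements; the real
// code dispatches between HVX and scalar paths and on is_neox.
static void rotate_pairs(const float * src, float * dst, uint32_t n_dims) {
    for (uint32_t i0 = 0; i0 < n_dims; i0++) {
        dst[i0] = src[i0];  // stands in for the cos/sin rotation
    }
}

static void rope_rows_sketch(const float * src, float * dst,
                             uint32_t ne0, uint32_t n_dims,
                             uint32_t ir0, uint32_t ir1, uint32_t ne1) {
    // Iterate the assigned range [ir0, MIN(ir1, ne1)) directly instead of
    // walking all ne1 rows and skipping with a shared counter.
    const uint32_t i1_end       = (ir1 < ne1) ? ir1 : ne1;
    const size_t   remain_bytes = (ne0 - n_dims) * sizeof(float);

    for (uint32_t i1 = ir0; i1 < i1_end; i1++) {
        const float * src_row = src + (size_t) i1 * ne0;
        float *       dst_row = dst + (size_t) i1 * ne0;

        rotate_pairs(src_row, dst_row, n_dims);

        // Pass-through tail: one memcpy over [n_dims, ne0) replaces the old
        // element-pair copy loop.
        memcpy(dst_row + n_dims, src_row + n_dims, remain_bytes);
    }
}
```

Hoisting `i1_end`, `half_dims`, and `remain_bytes` out of the loops also keeps the divisions and bound checks off the hot path, which is the same motivation as the `half_dims` substitution in the scalar neox branch.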