From 407b408981abc4180f64b99f89b003168bf7b962 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 27 Nov 2025 12:54:46 +0800 Subject: [PATCH 1/9] fix test failure --- ggml/src/ggml-hexagon/htp/rope-ops.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 00419bcba6b..ba0ff2dbc61 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -151,9 +151,9 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context } static void hvx_calc_rope_neox_f32(const float * restrict src0, - float * restrict dst, - const int num_elems, - const float * restrict theta_cache) { + float * restrict dst, + const int num_elems, + const float * restrict theta_cache) { // for (int i = 0; i < num_elems; i += 2) { //const float cos_theta = theta_cache[i + 0]; //const float sin_theta = theta_cache[i + 1]; @@ -192,7 +192,7 @@ static void hvx_calc_rope_neox_f32(const float * restrict src0, HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s); HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c); - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4); + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4); *(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5); src0_curr += VLEN; @@ -259,7 +259,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, const uint32_t ir1, int nth, int ith, - int opt_path) { + const int opt_path) { struct htp_ops_context * octx = rope_ctx->octx; const struct htp_tensor * src0 = &octx->src0; @@ -267,8 +267,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, const struct htp_tensor * src2 = &octx->src2; struct htp_tensor * dst = &octx->dst; - const int32_t mode = rope_ctx->mode; - const bool is_neox = mode & HTP_ROPE_TYPE_NEOX; + const int32_t mode = rope_ctx->mode; + const bool is_neox = mode & HTP_ROPE_TYPE_NEOX; htp_rope_preamble; @@ -317,10 +317,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, if (is_neox) { const float x0 = src_loc[0]; - const float x1 = src_loc[rope_ctx->n_dims/2]; + const float x1 = src_loc[rope_ctx->n_dims / 2]; - dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; - dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta; + dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; + dst_data_loc[rope_ctx->n_dims / 2] = x0 * sin_theta + x1 * cos_theta; src_loc += 1; dst_data_loc += 1; @@ -337,6 +337,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, } } + src_loc += is_neox ? (rope_ctx->n_dims / 2) : 0; + dst_data_loc += is_neox ? (rope_ctx->n_dims / 2) : 0; for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) { dst_data_loc[0] = src_loc[0]; dst_data_loc[1] = src_loc[1]; From 4ddb8a449cbe057d67b684b7c34192e69f743d30 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 01:15:25 +0800 Subject: [PATCH 2/9] fix: correct scaling calculations in rope_cache_init --- ggml/src/ggml-hexagon/htp/rope-ops.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index ba0ff2dbc61..719efc09b7a 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -93,17 +93,18 @@ static void rope_cache_init(const float theta_base, // Get n-d rotational scaling corrected for extrapolation float theta_interp = freq_scale * theta_extrap; float theta2 = theta_interp; + float mscale2 = mscale; if (ext_factor != 0.0f) { float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; theta2 = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; // Get n-d magnitude scaling corrected for interpolation - mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + mscale2 *= 1.0f + 0.1f * logf(1.0f / freq_scale); } - cache[i0 + 0] = cosf(theta2) * mscale; - cache[i0 + 1] = sinf(theta2) * mscale; + cache[i0 + 0] = cosf(theta2) * mscale2; + cache[i0 + 1] = sinf(theta2) * mscale2; theta *= theta_scale; } @@ -337,8 +338,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, } } - src_loc += is_neox ? (rope_ctx->n_dims / 2) : 0; - dst_data_loc += is_neox ? (rope_ctx->n_dims / 2) : 0; + src_loc += (is_neox ? (rope_ctx->n_dims / 2) : 0); + dst_data_loc += (is_neox ? (rope_ctx->n_dims / 2) : 0); for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) { dst_data_loc[0] = src_loc[0]; dst_data_loc[1] = src_loc[1]; From cfca78b0d01b76aad6b1a89bc08045c9eaaa23d0 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 01:17:41 +0800 Subject: [PATCH 3/9] wip --- ggml/src/ggml-hexagon/htp/rope-ops.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 719efc09b7a..ef24c80f82b 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -73,15 +73,15 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) { return (1 - MIN(1, MAX(0, y))); } -static void rope_cache_init(const float theta_base, - float freq_scale, - const float * freq_factors, - float * corr_dims, - uint32_t ne0, - float ext_factor, - float mscale, - float * cache, - float theta_scale) { +static void rope_cache_init(const float theta_base, + const float freq_scale, + const float * freq_factors, + float * corr_dims, + const uint32_t ne0, + const float ext_factor, + const float mscale, + float * cache, + const float theta_scale) { // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py float theta = theta_base; From e9a02fdba1246b747096cb9bbcbd67c7236ed2de Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 10:33:43 +0800 Subject: [PATCH 4/9] wip --- ggml/src/ggml-hexagon/htp/rope-ops.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index ef24c80f82b..7519505ab0d 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -282,8 +282,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, freq_factors = (const float *) src2->data; } - int ir = 0; - + int ir = 0; + const int32_t half_dims = rope_ctx->n_dims / 2; for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len const int32_t p = pos[i2]; @@ -311,6 +311,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, } else { hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0); } + + src_loc += rope_ctx->n_dims; + dst_data_loc += rope_ctx->n_dims; } else { for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) { const float cos_theta = wp0[i0 + 0]; @@ -318,10 +321,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, if (is_neox) { const float x0 = src_loc[0]; - const float x1 = src_loc[rope_ctx->n_dims / 2]; + const float x1 = src_loc[half_dims]; - dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; - dst_data_loc[rope_ctx->n_dims / 2] = x0 * sin_theta + x1 * cos_theta; + dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; + dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta; src_loc += 1; dst_data_loc += 1; @@ -336,10 +339,11 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, dst_data_loc += 2; } } + + src_loc += (is_neox ? half_dims : 0); + dst_data_loc += (is_neox ? half_dims : 0); } - src_loc += (is_neox ? (rope_ctx->n_dims / 2) : 0); - dst_data_loc += (is_neox ? (rope_ctx->n_dims / 2) : 0); for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) { dst_data_loc[0] = src_loc[0]; dst_data_loc[1] = src_loc[1]; From e324bb0bd50898537e41810fcd41f82ce4a78e7c Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 10:57:27 +0800 Subject: [PATCH 5/9] fix: optimize element copying in rope_hex_f32 using memcpy --- ggml/src/ggml-hexagon/htp/rope-ops.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 7519505ab0d..0e71dcfae8e 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -344,13 +344,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, dst_data_loc += (is_neox ? half_dims : 0); } - for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) { - dst_data_loc[0] = src_loc[0]; - dst_data_loc[1] = src_loc[1]; - - src_loc += 2; - dst_data_loc += 2; - } + // TODO: use simd to speed up the remaining elements copy + memcpy(dst_data_loc, src_loc, (ne0 - rope_ctx->n_dims) * sizeof(float)); } } } From 0121291d5358ee27bf4e8bd96106020f4b56857c Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 11:12:12 +0800 Subject: [PATCH 6/9] fix: optimize loop boundaries in rope_hex_f32 for better performance --- ggml/src/ggml-hexagon/htp/rope-ops.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 0e71dcfae8e..ddce8971204 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -282,8 +282,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, freq_factors = (const float *) src2->data; } - int ir = 0; - const int32_t half_dims = rope_ctx->n_dims / 2; + const uint32_t i0_end = MIN(ir1, ne1); + const int32_t half_dims = rope_ctx->n_dims / 2; for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len const int32_t p = pos[i2]; @@ -291,14 +291,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor, rope_ctx->attn_factor, wp0, rope_ctx->theta_scale); - for (uint32_t i1 = 0; i1 < ne1; i1++) { // attn-heads - if (ir++ < ir0) { - continue; - } - if (ir > ir1) { - break; - } - + for (uint32_t i1 = ir0; i1 < i0_end; i1++) { // attn-heads const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01); float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1); From 010039a15e0239de0685d70f0a144587efb3f4ee Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 11:22:22 +0800 Subject: [PATCH 7/9] rename --- ggml/src/ggml-hexagon/htp/rope-ops.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index ddce8971204..dbb4df58b56 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -92,19 +92,19 @@ static void rope_cache_init(const float theta_base, // Get n-d rotational scaling corrected for extrapolation float theta_interp = freq_scale * theta_extrap; - float theta2 = theta_interp; - float mscale2 = mscale; + float theta_final = theta_interp; + float mscale_final = mscale; if (ext_factor != 0.0f) { float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; - theta2 = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + theta_final = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; // Get n-d magnitude scaling corrected for interpolation - mscale2 *= 1.0f + 0.1f * logf(1.0f / freq_scale); + mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale); } - cache[i0 + 0] = cosf(theta2) * mscale2; - cache[i0 + 1] = sinf(theta2) * mscale2; + cache[i0 + 0] = cosf(theta_final) * mscale_final; + cache[i0 + 1] = sinf(theta_final) * mscale_final; theta *= theta_scale; } @@ -282,7 +282,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, freq_factors = (const float *) src2->data; } - const uint32_t i0_end = MIN(ir1, ne1); + const uint32_t i1_end = MIN(ir1, ne1); const int32_t half_dims = rope_ctx->n_dims / 2; for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len @@ -291,7 +291,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor, rope_ctx->attn_factor, wp0, rope_ctx->theta_scale); - for (uint32_t i1 = ir0; i1 < i0_end; i1++) { // attn-heads + for (uint32_t i1 = ir0; i1 < i1_end; i1++) { // attn-heads const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01); float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1); From a6ef41f404da2e581a618d1efbb9a9669dfeea6e Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 12:10:56 +0800 Subject: [PATCH 8/9] wip --- ggml/src/ggml-hexagon/htp/rope-ops.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index dbb4df58b56..a4399704fcb 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -282,8 +282,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, freq_factors = (const float *) src2->data; } - const uint32_t i1_end = MIN(ir1, ne1); - const int32_t half_dims = rope_ctx->n_dims / 2; + const uint32_t i1_end = MIN(ir1, ne1); + const int32_t half_dims = rope_ctx->n_dims / 2; + const size_t remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float); for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len const int32_t p = pos[i2]; @@ -338,7 +339,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, } // TODO: use simd to speed up the remaining elements copy - memcpy(dst_data_loc, src_loc, (ne0 - rope_ctx->n_dims) * sizeof(float)); + memcpy(dst_data_loc, src_loc, remain_bytes); } } } From b567413ac97e6152b027714659cf8da847d041bf Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 1 Dec 2025 00:13:12 +0800 Subject: [PATCH 9/9] feat: add profiling macros for performance measurement in operations --- ggml/src/ggml-hexagon/htp/ops-utils.h | 7 +++++++ ggml/src/ggml-hexagon/htp/rope-ops.c | 9 +++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/ops-utils.h b/ggml/src/ggml-hexagon/htp/ops-utils.h index af9c3305f61..5e5c48afda3 100644 --- a/ggml/src/ggml-hexagon/htp/ops-utils.h +++ b/ggml/src/ggml-hexagon/htp/ops-utils.h @@ -146,4 +146,11 @@ static inline void htp_dump_f16(char * pref, const __fp16 * x, uint32_t n) { } } +#define PROFILER_START(name) const uint64_t name##_start_cycles = HAP_perf_get_qtimer_count() +#define PROFILER_END(name, ...) \ + do { \ + const uint64_t name##_end_cycles = HAP_perf_get_qtimer_count(); \ + FARF(HIGH, __VA_ARGS__, (unsigned) HAP_perf_qtimer_count_to_us(name##_end_cycles - name##_start_cycles)); \ + } while (0) + #endif /* OPS_UTILS_H */ diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index a4399704fcb..a48cbf43f23 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -365,8 +365,7 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int return; } - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(rope_job_f32); int is_aligned = 1; int opt_path = 0; @@ -381,10 +380,8 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int rope_hex_f32(rope_ctx, src0_start_row, src0_end_row, nth, ith, opt_path); - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, src0_end_row, - (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(rope_job_f32, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, + src0_end_row); } static void rope_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) {