From dd1588a8bb21a18592b75fc3c1c540f070aa6551 Mon Sep 17 00:00:00 2001
From: noemotiovon <757486878@qq.com>
Date: Thu, 27 Nov 2025 08:54:29 +0000
Subject: [PATCH 1/2] cann: add support for partial RoPE and Vision mode

Add support for two important RoPE variants: partial rotation
(rope_dims < ne0) and Vision mode rotation.

1. Support for partial RoPE (rope_dims < ne0):
   - Split tensor into head (first rope_dims dimensions) and tail portions
   - Apply rotation only to the head portion using the RotaryPositionEmbedding operator
   - Copy the unrotated tail portion directly from source to destination
   - Handle both contiguous and non-contiguous tensor layouts

2. Support for Vision mode (GGML_ROPE_TYPE_VISION):
   - Set rope_dims = ne0 for Vision mode to rotate the entire tensor
   - Vision mode pairs dimension i with dimension i+n_dims (where n_dims = ne0/2)
   - No tail handling needed since the entire tensor is rotated

Implementation details:
- Use a has_tail flag to determine the execution path: head/tail splitting when
  rope_dims < ne0, or full tensor rotation when rope_dims == ne0
- Support both F32 and F16 data types with intermediate F32 conversion
- Copy non-contiguous tensors to contiguous buffers before calling the
  RotaryPositionEmbedding operator for compatibility
- Improve cache invalidation logic to include the rope_dims and indep_sects
  parameters

These enhancements enable the CANN backend to handle various RoPE
configurations used in modern vision-language models and models with
partial rotation.
---
 ggml/src/ggml-cann/aclnn_ops.cpp | 263 ++++++++++++++++++++++++-------
 ggml/src/ggml-cann/common.h      |  15 +-
 ggml/src/ggml-cann/ggml-cann.cpp |  14 +-
 3 files changed, 222 insertions(+), 70 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 48f4b7db691..8fa0e495a6f 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2251,18 +2251,18 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
                                   int sections[4],
                                   bool mrope_used,
                                   bool is_imrope,
-                                  bool indep_sects) {
-    ggml_tensor * src0 = dst->src[0];  // input
+                                  bool indep_sects,
+                                  int64_t rope_dims) {
     ggml_tensor * src1 = dst->src[1];  // position
     ggml_tensor * src2 = dst->src[2];  // freq_factors

-    int64_t theta_scale_length = src0->ne[0] / 2;
+    int64_t theta_scale_length = rope_dims / 2;
     int64_t position_length    = dst->ne[2];

     // TODO: check theta_scale_length and position_length.
     if (src2 == nullptr && ctx.rope_cache.cached &&
         ctx.rope_cache.equal(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor,
-                             is_neox, indep_sects, mrope_used, is_imrope, sections)) {
+                             is_neox, indep_sects, mrope_used, is_imrope, sections, rope_dims)) {
         // use cache.
         return;
     }
@@ -2294,7 +2294,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
     acl_tensor_ptr acl_theta_scale_tensor;
     bool           theta_scale_updated = false;
     if (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.theta_scale != theta_scale ||
-        ctx.rope_cache.indep_sects != indep_sects) {
+        ctx.rope_cache.indep_sects != indep_sects || ctx.rope_cache.rope_dims != rope_dims) {
         theta_scale_updated = true;
         if (ctx.rope_cache.theta_scale_exp_host != nullptr) {
             free(ctx.rope_cache.theta_scale_exp_host);
         }
@@ -2341,8 +2341,8 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
     ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
     acl_tensor_ptr       acl_yarn_ramp_tensor;
     if (ext_factor != 0 &&
-        // TODO: check more parameter.
- (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.freq_scale != freq_scale)) { + (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.freq_scale != freq_scale || + ctx.rope_cache.rope_dims != rope_dims || ctx.rope_cache.indep_sects != indep_sects)) { yarn_ramp_tensor_updated = true; // -rope_yarn_ramp @@ -2590,7 +2590,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx, aclnn_muls(ctx, acl_cos_tensor.get(), attn_factor, nullptr, true); } - int64_t sin_reshape_ne[4] = { src0->ne[0], 1, dst->ne[2], 1 }; + int64_t sin_reshape_ne[4] = { rope_dims, 1, dst->ne[2], 1 }; size_t sin_reshape_nb[GGML_MAX_DIMS]; sin_reshape_nb[0] = sizeof(float); for (int i = 1; i < GGML_MAX_DIMS; i++) { @@ -2619,7 +2619,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx, // Update cached value. ctx.rope_cache.cached = true; ctx.rope_cache.set(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, - indep_sects, mrope_used, is_imrope, sections); + indep_sects, mrope_used, is_imrope, sections, rope_dims); } #ifdef __cplusplus @@ -2645,7 +2645,7 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { // param float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - int sections[4]; + int sections[4]; // const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; @@ -2654,26 +2654,26 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { GGML_TENSOR_UNARY_OP_LOCALS - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - memcpy(§ions, (int32_t *) dst->op_params + 11, sizeof(int)*4); + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + memcpy(§ions, (int32_t *) dst->op_params + 11, sizeof(int) * 4); - // TODO: n_dims <= ne0 - GGML_ASSERT(n_dims == ne0); GGML_ASSERT(n_dims % 2 == 0); + GGML_ASSERT(n_dims <= ne00); const float theta_scale = powf(freq_base, -2.0f / n_dims); float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope - const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, note: also true for vision (24 & 8 == true) and for imrope + bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope + const bool mrope_used = + mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, note: also true for vision (24 & 8 == true) and for imrope const bool is_vision = mode == GGML_ROPE_TYPE_VISION; if (mrope_used) { @@ -2681,17 +2681,26 @@ void 
ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { } if (is_vision) { - GGML_ASSERT(n_dims == ne0/2); + GGML_ASSERT(n_dims == ne0 / 2); } if (is_imrope || mrope_used) { is_neox = true; } + int64_t rope_dims = n_dims; + if (is_vision) { + rope_dims = src0->ne[0]; + } + int64_t tail_dims = ne00 - rope_dims; + bool has_tail = tail_dims > 0; + // init ctx.rope_cos/rope_sin cache - aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections, mrope_used, is_imrope, is_vision); + aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections, + mrope_used, is_imrope, is_vision, rope_dims); - int64_t sin_reshape_ne[4] = { ne00, 1, ne02, 1 }; + // Cache is generated with ne00 dimensions, so we use ne00 for reshape + int64_t sin_reshape_ne[4] = { rope_dims, 1, ne02, 1 }; size_t sin_reshape_nb[GGML_MAX_DIMS]; sin_reshape_nb[0] = sizeof(float); for (int i = 1; i < GGML_MAX_DIMS; i++) { @@ -2704,7 +2713,6 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0); acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst); - #ifdef ASCEND_310P // Special ROPE operation for 310P @@ -2844,46 +2852,187 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { } return; #endif - int64_t acl_mode = is_neox ? 0 : 1; - - switch (src0->type) { - case GGML_TYPE_F32: - { - GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(), - acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get()); - break; + if (has_tail) { + // Create head views for RotaryPositionEmbedding (only first rope_dims dimensions) + int64_t head_ne[GGML_MAX_DIMS] = { rope_dims, ne01, ne02, ne03 }; + size_t head_nb_src[GGML_MAX_DIMS] = { nb00, nb01, nb02, nb03 }; + size_t head_nb_dst[GGML_MAX_DIMS] = { nb0, nb1, nb2, nb3 }; + acl_tensor_ptr acl_src_head = + ggml_cann_create_tensor((char *) src0->data, ggml_cann_type_mapping(src0->type), ggml_element_size(src0), + head_ne, head_nb_src, GGML_MAX_DIMS); + acl_tensor_ptr acl_dst_head = ggml_cann_create_tensor((char *) dst->data, ggml_cann_type_mapping(dst->type), + ggml_element_size(dst), head_ne, head_nb_dst, GGML_MAX_DIMS); + int64_t tail_ne[GGML_MAX_DIMS] = { tail_dims, ne01, ne02, ne03 }; + size_t tail_nb_src[GGML_MAX_DIMS] = { nb00, nb01, nb02, nb03 }; + size_t tail_nb_dst[GGML_MAX_DIMS] = { nb0, nb1, nb2, nb3 }; + size_t src_tail_offset = rope_dims * nb00; + size_t dst_tail_offset = rope_dims * nb0; + + auto copy_tail_device = [&](void * src_ptr, void * dst_ptr, aclDataType dtype, size_t elem_size, + size_t * nb_src_arr, size_t * nb_dst_arr) { + if (!has_tail) { + return; } - case GGML_TYPE_F16: - { - ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float)); - void * src_trans_buffer = src_trans_allocator.get(); - ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float)); - void * dst_trans_buffer = dst_trans_allocator.get(); + acl_tensor_ptr acl_src_tail = + ggml_cann_create_tensor(src_ptr, dtype, elem_size, tail_ne, nb_src_arr, GGML_MAX_DIMS); + acl_tensor_ptr acl_dst_tail = + ggml_cann_create_tensor(dst_ptr, dtype, elem_size, tail_ne, nb_dst_arr, GGML_MAX_DIMS); + cann_copy(ctx, acl_src_tail.get(), acl_dst_tail.get()); + }; + + switch (src0->type) { + case GGML_TYPE_F32: + { + // Copy head views to contiguous buffers for RotaryPositionEmbedding + // (RotaryPositionEmbedding may not 
support non-contiguous tensors) + int64_t head_elements = rope_dims * ne01 * ne02 * ne03; + ggml_cann_pool_alloc src_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); + void * src_head_contiguous_buffer = src_head_contiguous_allocator.get(); + ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); + void * dst_head_contiguous_buffer = dst_head_contiguous_allocator.get(); + + size_t head_contiguous_nb[GGML_MAX_DIMS]; + head_contiguous_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1]; + } - size_t src_trans_nb[GGML_MAX_DIMS]; - src_trans_nb[0] = sizeof(float); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + acl_tensor_ptr acl_src_head_contiguous = + ggml_cann_create_tensor(src_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, + head_contiguous_nb, GGML_MAX_DIMS); + acl_tensor_ptr acl_dst_head_contiguous = + ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, + head_contiguous_nb, GGML_MAX_DIMS); + + // Copy from non-contiguous head view to contiguous buffer + cann_copy(ctx, acl_src_head.get(), acl_src_head_contiguous.get()); + + // Only rotate the first rope_dims dimensions using contiguous buffers + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head_contiguous.get(), + acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode, + acl_dst_head_contiguous.get()); + + // Copy result back from contiguous buffer to non-contiguous head view + cann_copy(ctx, acl_dst_head_contiguous.get(), acl_dst_head.get()); + + // Copy the unrotated tail portion from source to destination + copy_tail_device((char *) src0->data + src_tail_offset, (char *) dst->data + dst_tail_offset, + ggml_cann_type_mapping(dst->type), ggml_element_size(dst), tail_nb_src, + tail_nb_dst); + break; } + case GGML_TYPE_F16: + { + ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float)); + void * src_trans_buffer = src_trans_allocator.get(); + ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float)); + void * dst_trans_buffer = dst_trans_allocator.get(); + + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + } - acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor( - src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS); - acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor( - dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS); + acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor( + src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS); + acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor( + dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS); + + aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT); + + cann_copy(ctx, acl_src_trans_tensor.get(), acl_dst_trans_tensor.get()); + + // Create head views for FP32 tensors + size_t head_trans_nb[GGML_MAX_DIMS] = { src_trans_nb[0], src_trans_nb[1], src_trans_nb[2], + src_trans_nb[3] }; + acl_tensor_ptr acl_src_trans_head = ggml_cann_create_tensor( + src_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, head_trans_nb, GGML_MAX_DIMS); + acl_tensor_ptr 
acl_dst_trans_head = ggml_cann_create_tensor( + dst_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, head_trans_nb, GGML_MAX_DIMS); + + // Copy head views to contiguous buffers for RotaryPositionEmbedding + // (RotaryPositionEmbedding may not support non-contiguous tensors) + int64_t head_elements = rope_dims * ne01 * ne02 * ne03; + ggml_cann_pool_alloc src_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); + void * src_head_contiguous_buffer = src_head_contiguous_allocator.get(); + ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); + void * dst_head_contiguous_buffer = dst_head_contiguous_allocator.get(); + + size_t head_contiguous_nb[GGML_MAX_DIMS]; + head_contiguous_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1]; + } + acl_tensor_ptr acl_src_head_contiguous = + ggml_cann_create_tensor(src_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, + head_contiguous_nb, GGML_MAX_DIMS); + acl_tensor_ptr acl_dst_head_contiguous = + ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, + head_contiguous_nb, GGML_MAX_DIMS); + + // Copy from head view to contiguous buffer + cann_copy(ctx, acl_src_trans_head.get(), acl_src_head_contiguous.get()); + + // Only rotate the first rope_dims dimensions using contiguous buffers + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head_contiguous.get(), + acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode, + acl_dst_head_contiguous.get()); + + // Copy result back from contiguous buffer to head view + cann_copy(ctx, acl_dst_head_contiguous.get(), acl_dst_trans_head.get()); + // Copy the unrotated tail portion from source to destination + size_t tail_offset_trans = rope_dims * src_trans_nb[0]; + copy_tail_device((char *) src_trans_buffer + tail_offset_trans, + (char *) dst_trans_buffer + tail_offset_trans, ACL_FLOAT, sizeof(float), + src_trans_nb, src_trans_nb); + aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16); + break; + } + default: + GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE"); + break; + } + } else { + switch (src0->type) { + case GGML_TYPE_F32: + { + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(), + acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get()); + break; + } + case GGML_TYPE_F16: + { + ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float)); + void * src_trans_buffer = src_trans_allocator.get(); + ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float)); + void * dst_trans_buffer = dst_trans_allocator.get(); + + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + } + + acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor( + src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS); + acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor( + dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS); - aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT); + aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT); - GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), - acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode, 
- acl_dst_trans_tensor.get()); + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), + acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode, + acl_dst_trans_tensor.get()); - aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16); + aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16); + break; + } + default: + GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE"); break; - } - default: - GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE"); - break; + } } } diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index b17445bb9a0..536478fe0c6 100644 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -315,7 +315,7 @@ struct ggml_cann_rope_cache { if (theta_scale_exp_host) { free(theta_scale_exp_host); } - if(position_select_index_host) { + if (position_select_index_host) { free(position_select_index_host); } } @@ -330,17 +330,19 @@ struct ggml_cann_rope_cache { bool indep_sects, bool mrope_used, bool is_imrope, - int sections[4]) { + int sections[4], + int64_t rope_dims) { return this->theta_scale_length == theta_scale_length && this->position_length == position_length && this->ext_factor == ext_factor && this->theta_scale == theta_scale && this->freq_scale == freq_scale && this->attn_factor == attn_factor && this->is_neox == is_neox && this->indep_sects == indep_sects && this->mrope_used == mrope_used && this->is_imrope == is_imrope && this->sections[0] == sections[0] && - this->sections[1] == sections[1] && this->sections[2] == sections[2] && this->sections[3] == sections[3]; + this->sections[1] == sections[1] && this->sections[2] == sections[2] && + this->sections[3] == sections[3] && this->rope_dims == rope_dims; } void set(int64_t theta_scale_length, int64_t position_length, - float ext_factor, + float ext_factor, float theta_scale, float freq_scale, float attn_factor, @@ -348,7 +350,8 @@ struct ggml_cann_rope_cache { bool indep_sects, bool mrope_used, bool is_imrope, - int sections[4]) { + int sections[4], + int64_t rope_dims) { this->theta_scale_length = theta_scale_length; this->position_length = position_length; this->ext_factor = ext_factor; @@ -363,6 +366,7 @@ struct ggml_cann_rope_cache { this->sections[1] = sections[1]; this->sections[2] = sections[2]; this->sections[3] = sections[3]; + this->rope_dims = rope_dims; } // memory cache, prepare before inferencing. 
@@ -386,6 +390,7 @@ struct ggml_cann_rope_cache { bool mrope_used = false; int sections[4] = { 0, 0, 0, 0 }; bool is_imrope = false; + int64_t rope_dims = 0; }; struct ggml_cann_tensor_cache { diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index df28d67fb0b..cd6e3b90269 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2308,7 +2308,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, bool cann_graph_update_required = false; #ifdef USE_ACL_GRAPH - bool use_cann_graph = true; + bool use_cann_graph = true; static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or("")); if (!prefill_use_graph) { @@ -2338,7 +2338,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, } } #else - bool use_cann_graph = false; + bool use_cann_graph = false; #endif // USE_ACL_GRAPH evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, cann_graph_update_required); @@ -2474,16 +2474,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten } case GGML_OP_ROPE: { - // TODO: with ops-test v == 1 - // TODO: n_dims <= ne0 - if (op->src[0]->ne[0] != op->op_params[1]) { - return false; - } - if (op->src[0]->ne[0] > 896) { return false; } #ifdef ASCEND_310P + // TODO: Support rope_dim < ne00(dim) + if (op->src[0]->ne[0] != op->op_params[1]) { + return false; + } if (!ggml_is_contiguous(op->src[0])) { return false; } From e0d679c9429a1cfeb5a444132e60da1d1d53c391 Mon Sep 17 00:00:00 2001 From: noemotiovon <757486878@qq.com> Date: Sat, 29 Nov 2025 03:29:38 +0000 Subject: [PATCH 2/2] cann: fix review comment --- ggml/src/ggml-cann/aclnn_ops.cpp | 303 +++++++++++++------------------ ggml/src/ggml-cann/common.h | 11 +- 2 files changed, 126 insertions(+), 188 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 8fa0e495a6f..835b53f6592 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2262,7 +2262,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx, // TODO: check theta_scale_length and position_length. if (src2 == nullptr && ctx.rope_cache.cached && ctx.rope_cache.equal(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor, - is_neox, indep_sects, mrope_used, is_imrope, sections, rope_dims)) { + is_neox, indep_sects, mrope_used, is_imrope, sections)) { // use cache. 
return; } @@ -2294,7 +2294,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx, acl_tensor_ptr acl_theta_scale_tensor; bool theta_scale_updated = false; if (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.theta_scale != theta_scale || - ctx.rope_cache.indep_sects != indep_sects || ctx.rope_cache.rope_dims != rope_dims) { + ctx.rope_cache.indep_sects != indep_sects) { theta_scale_updated = true; if (ctx.rope_cache.theta_scale_exp_host != nullptr) { free(ctx.rope_cache.theta_scale_exp_host); @@ -2331,18 +2331,17 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx, ACL_CHECK(aclrtMemcpyAsync(ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float), ctx.rope_cache.theta_scale_exp_host, theta_scale_length * sizeof(float), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); - - acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float), - theta_scale_ne, theta_scale_nb, 1); } + acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float), + theta_scale_ne, theta_scale_nb, 1); // Step1.2: prepare rope_yarn_ramp, if this part updated, should update theta_scale_tensor. + // TODO: acl_yarn_ramp_tensor use rope cache. bool yarn_ramp_tensor_updated = false; ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool()); acl_tensor_ptr acl_yarn_ramp_tensor; - if (ext_factor != 0 && - (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.freq_scale != freq_scale || - ctx.rope_cache.rope_dims != rope_dims || ctx.rope_cache.indep_sects != indep_sects)) { + if (ext_factor != 0 && (theta_scale_updated || ctx.rope_cache.theta_scale_length != theta_scale_length || + ctx.rope_cache.freq_scale != freq_scale)) { yarn_ramp_tensor_updated = true; // -rope_yarn_ramp @@ -2619,7 +2618,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx, // Update cached value. ctx.rope_cache.cached = true; ctx.rope_cache.set(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, - indep_sects, mrope_used, is_imrope, sections, rope_dims); + indep_sects, mrope_used, is_imrope, sections); } #ifdef __cplusplus @@ -2670,11 +2669,13 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope - const bool mrope_used = - mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, note: also true for vision (24 & 8 == true) and for imrope - const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope + // mrope_used means the GGML_ROPE_TYPE_MROPE bit is set. + // Note: this bit is also set for imrope and some vision modes, + // so mrope_used does NOT exclusively indicate pure mrope. 
+ const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE; + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; if (mrope_used) { GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0); @@ -2689,6 +2690,11 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { } int64_t rope_dims = n_dims; + + //Our current RotaryPositionEmbedding does not support the VISION mode, + //but essentially it only modifies theta_base in mrope, + //then repeats it at the end in the same way as is_neox. + //In fact, RoPE is still applied across all dimensions. if (is_vision) { rope_dims = src0->ne[0]; } @@ -2853,27 +2859,98 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { return; #endif int64_t acl_mode = is_neox ? 0 : 1; + + // Pre-define head and tail dimensions for reuse + int64_t head_ne[GGML_MAX_DIMS] = { rope_dims, ne01, ne02, ne03 }; + int64_t tail_ne[GGML_MAX_DIMS] = { tail_dims, ne01, ne02, ne03 }; + + // Step 1: Prepare trans tensors for F16 type conversion to F32 if needed + bool src_dst_need_trans = false; + ggml_cann_pool_alloc src_trans_allocator(ctx.pool()); + ggml_cann_pool_alloc dst_trans_allocator(ctx.pool()); + acl_tensor_ptr acl_src_trans_tensor; + acl_tensor_ptr acl_dst_trans_tensor; + void * src_trans_buffer = nullptr; + void * dst_trans_buffer = nullptr; + size_t src_dst_trans_nb[GGML_MAX_DIMS]; + if (src0->type == GGML_TYPE_F16) { + src_dst_need_trans = true; + src_trans_buffer = src_trans_allocator.alloc(ggml_nelements(src0) * sizeof(float)); + dst_trans_buffer = dst_trans_allocator.alloc(ggml_nelements(dst) * sizeof(float)); + + src_dst_trans_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_dst_trans_nb[i] = src_dst_trans_nb[i - 1] * src0->ne[i - 1]; + } + acl_src_trans_tensor = ggml_cann_create_tensor(src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, + src_dst_trans_nb, GGML_MAX_DIMS); + acl_dst_trans_tensor = ggml_cann_create_tensor(dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, + src_dst_trans_nb, GGML_MAX_DIMS); + aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT); + } + + // Step 2: Prepare head tensors for tail splitting if needed + acl_tensor_ptr acl_src_head; + acl_tensor_ptr acl_dst_head; if (has_tail) { // Create head views for RotaryPositionEmbedding (only first rope_dims dimensions) - int64_t head_ne[GGML_MAX_DIMS] = { rope_dims, ne01, ne02, ne03 }; - size_t head_nb_src[GGML_MAX_DIMS] = { nb00, nb01, nb02, nb03 }; - size_t head_nb_dst[GGML_MAX_DIMS] = { nb0, nb1, nb2, nb3 }; - acl_tensor_ptr acl_src_head = - ggml_cann_create_tensor((char *) src0->data, ggml_cann_type_mapping(src0->type), ggml_element_size(src0), - head_ne, head_nb_src, GGML_MAX_DIMS); - acl_tensor_ptr acl_dst_head = ggml_cann_create_tensor((char *) dst->data, ggml_cann_type_mapping(dst->type), - ggml_element_size(dst), head_ne, head_nb_dst, GGML_MAX_DIMS); - int64_t tail_ne[GGML_MAX_DIMS] = { tail_dims, ne01, ne02, ne03 }; - size_t tail_nb_src[GGML_MAX_DIMS] = { nb00, nb01, nb02, nb03 }; - size_t tail_nb_dst[GGML_MAX_DIMS] = { nb0, nb1, nb2, nb3 }; - size_t src_tail_offset = rope_dims * nb00; - size_t dst_tail_offset = rope_dims * nb0; + // RotaryPositionEmbedding requires contiguous dst tensor, so we use a temporary buffer + if (src_dst_need_trans) { + // Use F32 trans tensor strides + acl_src_head = ggml_cann_create_tensor((char *) src_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, + src_dst_trans_nb, GGML_MAX_DIMS); + } else { + // Use original F32 tensor strides + acl_src_head = 
ggml_cann_create_tensor((char *) src0->data, ACL_FLOAT, sizeof(float), head_ne, src0->nb, + GGML_MAX_DIMS); + } + + int64_t head_elements = rope_dims * ne01 * ne02 * ne03; + ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); + void * dst_head_contiguous_buffer = dst_head_contiguous_allocator.get(); + + size_t head_contiguous_nb[GGML_MAX_DIMS]; + head_contiguous_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1]; + } + acl_dst_head = ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, + head_contiguous_nb, GGML_MAX_DIMS); + } + + // Step 3: Execute RotaryPositionEmbedding + if (has_tail) { + // Rotate only the head portion (first rope_dims dimensions) + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head.get(), acl_cos_reshape_tensor.get(), + acl_sin_reshape_tensor.get(), acl_mode, acl_dst_head.get()); + + // Copy head result from contiguous buffer back to destination tensor + if (src_dst_need_trans) { + acl_tensor_ptr acl_dst_head_target = ggml_cann_create_tensor( + (char *) dst_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, src_dst_trans_nb, GGML_MAX_DIMS); + cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get()); + } else { + acl_tensor_ptr acl_dst_head_target = + ggml_cann_create_tensor((char *) dst->data, ACL_FLOAT, sizeof(float), head_ne, dst->nb, GGML_MAX_DIMS); + cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get()); + } + } else if (src_dst_need_trans) { + // Rotate full tensor (no tail), using trans tensors + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), acl_cos_reshape_tensor.get(), + acl_sin_reshape_tensor.get(), acl_mode, acl_dst_trans_tensor.get()); + } else { + // Rotate full tensor (no tail), using original tensors + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(), + acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get()); + } + + // Step 4: Copy unrotated tail portion from source to destination + if (has_tail) { + size_t src_tail_offset; + size_t dst_tail_offset; auto copy_tail_device = [&](void * src_ptr, void * dst_ptr, aclDataType dtype, size_t elem_size, size_t * nb_src_arr, size_t * nb_dst_arr) { - if (!has_tail) { - return; - } acl_tensor_ptr acl_src_tail = ggml_cann_create_tensor(src_ptr, dtype, elem_size, tail_ne, nb_src_arr, GGML_MAX_DIMS); acl_tensor_ptr acl_dst_tail = @@ -2881,158 +2958,24 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { cann_copy(ctx, acl_src_tail.get(), acl_dst_tail.get()); }; - switch (src0->type) { - case GGML_TYPE_F32: - { - // Copy head views to contiguous buffers for RotaryPositionEmbedding - // (RotaryPositionEmbedding may not support non-contiguous tensors) - int64_t head_elements = rope_dims * ne01 * ne02 * ne03; - ggml_cann_pool_alloc src_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); - void * src_head_contiguous_buffer = src_head_contiguous_allocator.get(); - ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); - void * dst_head_contiguous_buffer = dst_head_contiguous_allocator.get(); - - size_t head_contiguous_nb[GGML_MAX_DIMS]; - head_contiguous_nb[0] = sizeof(float); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1]; - } - - acl_tensor_ptr acl_src_head_contiguous = - 
ggml_cann_create_tensor(src_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, - head_contiguous_nb, GGML_MAX_DIMS); - acl_tensor_ptr acl_dst_head_contiguous = - ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, - head_contiguous_nb, GGML_MAX_DIMS); - - // Copy from non-contiguous head view to contiguous buffer - cann_copy(ctx, acl_src_head.get(), acl_src_head_contiguous.get()); - - // Only rotate the first rope_dims dimensions using contiguous buffers - GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head_contiguous.get(), - acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode, - acl_dst_head_contiguous.get()); - - // Copy result back from contiguous buffer to non-contiguous head view - cann_copy(ctx, acl_dst_head_contiguous.get(), acl_dst_head.get()); - - // Copy the unrotated tail portion from source to destination - copy_tail_device((char *) src0->data + src_tail_offset, (char *) dst->data + dst_tail_offset, - ggml_cann_type_mapping(dst->type), ggml_element_size(dst), tail_nb_src, - tail_nb_dst); - break; - } - case GGML_TYPE_F16: - { - ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float)); - void * src_trans_buffer = src_trans_allocator.get(); - ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float)); - void * dst_trans_buffer = dst_trans_allocator.get(); - - size_t src_trans_nb[GGML_MAX_DIMS]; - src_trans_nb[0] = sizeof(float); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; - } - - acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor( - src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS); - acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor( - dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS); - - aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT); - - cann_copy(ctx, acl_src_trans_tensor.get(), acl_dst_trans_tensor.get()); - - // Create head views for FP32 tensors - size_t head_trans_nb[GGML_MAX_DIMS] = { src_trans_nb[0], src_trans_nb[1], src_trans_nb[2], - src_trans_nb[3] }; - acl_tensor_ptr acl_src_trans_head = ggml_cann_create_tensor( - src_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, head_trans_nb, GGML_MAX_DIMS); - acl_tensor_ptr acl_dst_trans_head = ggml_cann_create_tensor( - dst_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, head_trans_nb, GGML_MAX_DIMS); - - // Copy head views to contiguous buffers for RotaryPositionEmbedding - // (RotaryPositionEmbedding may not support non-contiguous tensors) - int64_t head_elements = rope_dims * ne01 * ne02 * ne03; - ggml_cann_pool_alloc src_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); - void * src_head_contiguous_buffer = src_head_contiguous_allocator.get(); - ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); - void * dst_head_contiguous_buffer = dst_head_contiguous_allocator.get(); - - size_t head_contiguous_nb[GGML_MAX_DIMS]; - head_contiguous_nb[0] = sizeof(float); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1]; - } - acl_tensor_ptr acl_src_head_contiguous = - ggml_cann_create_tensor(src_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, - head_contiguous_nb, GGML_MAX_DIMS); - acl_tensor_ptr acl_dst_head_contiguous = - ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, 
sizeof(float), head_ne, - head_contiguous_nb, GGML_MAX_DIMS); - - // Copy from head view to contiguous buffer - cann_copy(ctx, acl_src_trans_head.get(), acl_src_head_contiguous.get()); - - // Only rotate the first rope_dims dimensions using contiguous buffers - GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head_contiguous.get(), - acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode, - acl_dst_head_contiguous.get()); - - // Copy result back from contiguous buffer to head view - cann_copy(ctx, acl_dst_head_contiguous.get(), acl_dst_trans_head.get()); - // Copy the unrotated tail portion from source to destination - size_t tail_offset_trans = rope_dims * src_trans_nb[0]; - copy_tail_device((char *) src_trans_buffer + tail_offset_trans, - (char *) dst_trans_buffer + tail_offset_trans, ACL_FLOAT, sizeof(float), - src_trans_nb, src_trans_nb); - aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16); - break; - } - default: - GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE"); - break; + if (src_dst_need_trans) { + // Use F32 trans tensor strides and offsets + src_tail_offset = rope_dims * src_dst_trans_nb[0]; + dst_tail_offset = rope_dims * src_dst_trans_nb[0]; + copy_tail_device((char *) src_trans_buffer + src_tail_offset, (char *) dst_trans_buffer + dst_tail_offset, + ACL_FLOAT, sizeof(float), src_dst_trans_nb, src_dst_trans_nb); + } else { + // Use original tensor strides and offsets + src_tail_offset = rope_dims * nb00; + dst_tail_offset = rope_dims * nb0; + copy_tail_device((char *) src0->data + src_tail_offset, (char *) dst->data + dst_tail_offset, + ggml_cann_type_mapping(dst->type), ggml_element_size(dst), src0->nb, dst->nb); } - } else { - switch (src0->type) { - case GGML_TYPE_F32: - { - GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(), - acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get()); - break; - } - case GGML_TYPE_F16: - { - ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float)); - void * src_trans_buffer = src_trans_allocator.get(); - ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float)); - void * dst_trans_buffer = dst_trans_allocator.get(); - - size_t src_trans_nb[GGML_MAX_DIMS]; - src_trans_nb[0] = sizeof(float); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; - } - - acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor( - src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS); - acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor( - dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS); - - aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT); - - GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), - acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode, - acl_dst_trans_tensor.get()); + } - aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16); - break; - } - default: - GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE"); - break; - } + // Step 5: Cast back to F16 if needed + if (src_dst_need_trans) { + aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16); } } diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index 536478fe0c6..45c7294e682 100644 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -330,14 +330,12 @@ struct 
ggml_cann_rope_cache {
                bool indep_sects,
                bool mrope_used,
                bool is_imrope,
-               int sections[4],
-               int64_t rope_dims) {
+               int sections[4]) {
         return this->theta_scale_length == theta_scale_length && this->position_length == position_length &&
                this->ext_factor == ext_factor && this->theta_scale == theta_scale && this->freq_scale == freq_scale &&
                this->attn_factor == attn_factor && this->is_neox == is_neox && this->indep_sects == indep_sects &&
                this->mrope_used == mrope_used && this->is_imrope == is_imrope && this->sections[0] == sections[0] &&
-               this->sections[1] == sections[1] && this->sections[2] == sections[2] &&
-               this->sections[3] == sections[3] && this->rope_dims == rope_dims;
+               this->sections[1] == sections[1] && this->sections[2] == sections[2] && this->sections[3] == sections[3];
     }

     void set(int64_t theta_scale_length,
@@ -350,8 +348,7 @@ struct ggml_cann_rope_cache {
              bool indep_sects,
              bool mrope_used,
              bool is_imrope,
-             int sections[4],
-             int64_t rope_dims) {
+             int sections[4]) {
         this->theta_scale_length = theta_scale_length;
         this->position_length    = position_length;
         this->ext_factor         = ext_factor;
@@ -366,7 +363,6 @@ struct ggml_cann_rope_cache {
         this->sections[1] = sections[1];
         this->sections[2] = sections[2];
         this->sections[3] = sections[3];
-        this->rope_dims   = rope_dims;
     }

     // memory cache, prepare before inferencing.
@@ -390,7 +386,6 @@ struct ggml_cann_rope_cache {
     bool    mrope_used = false;
     int     sections[4] = { 0, 0, 0, 0 };
     bool    is_imrope = false;
-    int64_t rope_dims = 0;
 };

 struct ggml_cann_tensor_cache {
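
For reference, the head/tail split that the has_tail path implements can be sketched as a small host-side routine. This is an illustrative, simplified sketch only (it ignores freq_scale, ext_factor/YaRN and freq_factors, and rope_row_ref is a hypothetical name, not part of this patch): the first rope_dims elements of a row are rotated in NEOX-style pairs (i, i + rope_dims/2), and the remaining ne0 - rope_dims elements are copied through unchanged.

    // Illustrative reference for one row of partial NEOX-style RoPE (simplified:
    // no freq_scale, no YaRN/ext_factor, no freq_factors). Hypothetical helper,
    // not part of the patch.
    #include <cmath>
    #include <cstdint>

    void rope_row_ref(const float * src, float * dst, int64_t ne0, int64_t rope_dims,
                      float pos, float freq_base) {
        const int64_t half = rope_dims / 2;
        for (int64_t i = 0; i < half; ++i) {
            // theta_i = pos * freq_base^(-2*i/rope_dims), as in standard RoPE
            const float theta = pos * std::pow(freq_base, -2.0f * (float) i / (float) rope_dims);
            const float c  = std::cos(theta);
            const float s  = std::sin(theta);
            const float x0 = src[i];         // NEOX layout pairs (i, i + half)
            const float x1 = src[i + half];
            dst[i]        = x0 * c - x1 * s;
            dst[i + half] = x0 * s + x1 * c;
        }
        // tail: dimensions beyond rope_dims are copied through unrotated
        for (int64_t i = rope_dims; i < ne0; ++i) {
            dst[i] = src[i];
        }
    }

With rope_dims == ne0 (the Vision case, where the patch sets rope_dims = src0->ne[0]) the tail loop is empty and the whole row is rotated, which is why no tail handling is needed there.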