From dd1588a8bb21a18592b75fc3c1c540f070aa6551 Mon Sep 17 00:00:00 2001
From: noemotiovon <757486878@qq.com>
Date: Thu, 27 Nov 2025 08:54:29 +0000
Subject: [PATCH 1/2] cann: add support for partial RoPE and Vision mode

Add support for two important RoPE variants: partial rotation
(rope_dims < ne0) and Vision mode rotation.

1. Support for partial RoPE (rope_dims < ne0):
   - Split tensor into head (first rope_dims dimensions) and tail portions
   - Apply rotation only to the head portion using the RotaryPositionEmbedding operator
   - Copy the unrotated tail portion directly from source to destination
   - Handle both contiguous and non-contiguous tensor layouts

2. Support for Vision mode (GGML_ROPE_TYPE_VISION):
   - Set rope_dims = ne0 for Vision mode to rotate the entire tensor
   - Vision mode pairs dimension i with dimension i+n_dims (where n_dims = ne0/2)
   - No tail handling needed since the entire tensor is rotated

Implementation details:
- Use a has_tail flag to determine the execution path: head/tail splitting when
  rope_dims < ne0, or full tensor rotation when rope_dims == ne0
- Support both F32 and F16 data types with intermediate F32 conversion
- Copy non-contiguous tensors to contiguous buffers before calling the
  RotaryPositionEmbedding operator for compatibility
- Improve cache invalidation logic to include the rope_dims and indep_sects
  parameters

These enhancements enable the CANN backend to handle various RoPE
configurations used in modern vision-language models and models with
partial rotation.
---
 ggml/src/ggml-cann/aclnn_ops.cpp | 263 ++++++++++++++++++++++++-------
 ggml/src/ggml-cann/common.h      |  15 +-
 ggml/src/ggml-cann/ggml-cann.cpp |  14 +-
 3 files changed, 222 insertions(+), 70 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 48f4b7db691..8fa0e495a6f 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2251,18 +2251,18 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
                                   int sections[4],
                                   bool mrope_used,
                                   bool is_imrope,
-                                  bool indep_sects) {
-    ggml_tensor * src0 = dst->src[0];  // input
+                                  bool indep_sects,
+                                  int64_t rope_dims) {
     ggml_tensor * src1 = dst->src[1];  // position
     ggml_tensor * src2 = dst->src[2];  // freq_factors

-    int64_t theta_scale_length = src0->ne[0] / 2;
+    int64_t theta_scale_length = rope_dims / 2;
     int64_t position_length    = dst->ne[2];

     // TODO: check theta_scale_length and position_length.
     if (src2 == nullptr && ctx.rope_cache.cached &&
         ctx.rope_cache.equal(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor,
-                             is_neox, indep_sects, mrope_used, is_imrope, sections)) {
+                             is_neox, indep_sects, mrope_used, is_imrope, sections, rope_dims)) {
         // use cache.
         return;
     }
@@ -2294,7 +2294,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
     acl_tensor_ptr acl_theta_scale_tensor;
     bool           theta_scale_updated = false;
     if (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.theta_scale != theta_scale ||
-        ctx.rope_cache.indep_sects != indep_sects) {
+        ctx.rope_cache.indep_sects != indep_sects || ctx.rope_cache.rope_dims != rope_dims) {
         theta_scale_updated = true;
         if (ctx.rope_cache.theta_scale_exp_host != nullptr) {
             free(ctx.rope_cache.theta_scale_exp_host);
         }
@@ -2341,8 +2341,8 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
     ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
     acl_tensor_ptr       acl_yarn_ramp_tensor;
     if (ext_factor != 0 &&
-        // TODO: check more parameter.
- (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.freq_scale != freq_scale)) { + (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.freq_scale != freq_scale || + ctx.rope_cache.rope_dims != rope_dims || ctx.rope_cache.indep_sects != indep_sects)) { yarn_ramp_tensor_updated = true; // -rope_yarn_ramp @@ -2590,7 +2590,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx, aclnn_muls(ctx, acl_cos_tensor.get(), attn_factor, nullptr, true); } - int64_t sin_reshape_ne[4] = { src0->ne[0], 1, dst->ne[2], 1 }; + int64_t sin_reshape_ne[4] = { rope_dims, 1, dst->ne[2], 1 }; size_t sin_reshape_nb[GGML_MAX_DIMS]; sin_reshape_nb[0] = sizeof(float); for (int i = 1; i < GGML_MAX_DIMS; i++) { @@ -2619,7 +2619,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx, // Update cached value. ctx.rope_cache.cached = true; ctx.rope_cache.set(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, - indep_sects, mrope_used, is_imrope, sections); + indep_sects, mrope_used, is_imrope, sections, rope_dims); } #ifdef __cplusplus @@ -2645,7 +2645,7 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { // param float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - int sections[4]; + int sections[4]; // const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; @@ -2654,26 +2654,26 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { GGML_TENSOR_UNARY_OP_LOCALS - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - memcpy(§ions, (int32_t *) dst->op_params + 11, sizeof(int)*4); + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + memcpy(§ions, (int32_t *) dst->op_params + 11, sizeof(int) * 4); - // TODO: n_dims <= ne0 - GGML_ASSERT(n_dims == ne0); GGML_ASSERT(n_dims % 2 == 0); + GGML_ASSERT(n_dims <= ne00); const float theta_scale = powf(freq_base, -2.0f / n_dims); float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope - const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, note: also true for vision (24 & 8 == true) and for imrope + bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope + const bool mrope_used = + mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, note: also true for vision (24 & 8 == true) and for imrope const bool is_vision = mode == GGML_ROPE_TYPE_VISION; if (mrope_used) { @@ -2681,17 +2681,26 @@ void 
ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { } if (is_vision) { - GGML_ASSERT(n_dims == ne0/2); + GGML_ASSERT(n_dims == ne0 / 2); } if (is_imrope || mrope_used) { is_neox = true; } + int64_t rope_dims = n_dims; + if (is_vision) { + rope_dims = src0->ne[0]; + } + int64_t tail_dims = ne00 - rope_dims; + bool has_tail = tail_dims > 0; + // init ctx.rope_cos/rope_sin cache - aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections, mrope_used, is_imrope, is_vision); + aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections, + mrope_used, is_imrope, is_vision, rope_dims); - int64_t sin_reshape_ne[4] = { ne00, 1, ne02, 1 }; + // Cache is generated with ne00 dimensions, so we use ne00 for reshape + int64_t sin_reshape_ne[4] = { rope_dims, 1, ne02, 1 }; size_t sin_reshape_nb[GGML_MAX_DIMS]; sin_reshape_nb[0] = sizeof(float); for (int i = 1; i < GGML_MAX_DIMS; i++) { @@ -2704,7 +2713,6 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0); acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst); - #ifdef ASCEND_310P // Special ROPE operation for 310P @@ -2844,46 +2852,187 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { } return; #endif - int64_t acl_mode = is_neox ? 0 : 1; - - switch (src0->type) { - case GGML_TYPE_F32: - { - GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(), - acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get()); - break; + if (has_tail) { + // Create head views for RotaryPositionEmbedding (only first rope_dims dimensions) + int64_t head_ne[GGML_MAX_DIMS] = { rope_dims, ne01, ne02, ne03 }; + size_t head_nb_src[GGML_MAX_DIMS] = { nb00, nb01, nb02, nb03 }; + size_t head_nb_dst[GGML_MAX_DIMS] = { nb0, nb1, nb2, nb3 }; + acl_tensor_ptr acl_src_head = + ggml_cann_create_tensor((char *) src0->data, ggml_cann_type_mapping(src0->type), ggml_element_size(src0), + head_ne, head_nb_src, GGML_MAX_DIMS); + acl_tensor_ptr acl_dst_head = ggml_cann_create_tensor((char *) dst->data, ggml_cann_type_mapping(dst->type), + ggml_element_size(dst), head_ne, head_nb_dst, GGML_MAX_DIMS); + int64_t tail_ne[GGML_MAX_DIMS] = { tail_dims, ne01, ne02, ne03 }; + size_t tail_nb_src[GGML_MAX_DIMS] = { nb00, nb01, nb02, nb03 }; + size_t tail_nb_dst[GGML_MAX_DIMS] = { nb0, nb1, nb2, nb3 }; + size_t src_tail_offset = rope_dims * nb00; + size_t dst_tail_offset = rope_dims * nb0; + + auto copy_tail_device = [&](void * src_ptr, void * dst_ptr, aclDataType dtype, size_t elem_size, + size_t * nb_src_arr, size_t * nb_dst_arr) { + if (!has_tail) { + return; } - case GGML_TYPE_F16: - { - ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float)); - void * src_trans_buffer = src_trans_allocator.get(); - ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float)); - void * dst_trans_buffer = dst_trans_allocator.get(); + acl_tensor_ptr acl_src_tail = + ggml_cann_create_tensor(src_ptr, dtype, elem_size, tail_ne, nb_src_arr, GGML_MAX_DIMS); + acl_tensor_ptr acl_dst_tail = + ggml_cann_create_tensor(dst_ptr, dtype, elem_size, tail_ne, nb_dst_arr, GGML_MAX_DIMS); + cann_copy(ctx, acl_src_tail.get(), acl_dst_tail.get()); + }; + + switch (src0->type) { + case GGML_TYPE_F32: + { + // Copy head views to contiguous buffers for RotaryPositionEmbedding + // (RotaryPositionEmbedding may not 
support non-contiguous tensors) + int64_t head_elements = rope_dims * ne01 * ne02 * ne03; + ggml_cann_pool_alloc src_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); + void * src_head_contiguous_buffer = src_head_contiguous_allocator.get(); + ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); + void * dst_head_contiguous_buffer = dst_head_contiguous_allocator.get(); + + size_t head_contiguous_nb[GGML_MAX_DIMS]; + head_contiguous_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1]; + } - size_t src_trans_nb[GGML_MAX_DIMS]; - src_trans_nb[0] = sizeof(float); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + acl_tensor_ptr acl_src_head_contiguous = + ggml_cann_create_tensor(src_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, + head_contiguous_nb, GGML_MAX_DIMS); + acl_tensor_ptr acl_dst_head_contiguous = + ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, + head_contiguous_nb, GGML_MAX_DIMS); + + // Copy from non-contiguous head view to contiguous buffer + cann_copy(ctx, acl_src_head.get(), acl_src_head_contiguous.get()); + + // Only rotate the first rope_dims dimensions using contiguous buffers + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head_contiguous.get(), + acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode, + acl_dst_head_contiguous.get()); + + // Copy result back from contiguous buffer to non-contiguous head view + cann_copy(ctx, acl_dst_head_contiguous.get(), acl_dst_head.get()); + + // Copy the unrotated tail portion from source to destination + copy_tail_device((char *) src0->data + src_tail_offset, (char *) dst->data + dst_tail_offset, + ggml_cann_type_mapping(dst->type), ggml_element_size(dst), tail_nb_src, + tail_nb_dst); + break; } + case GGML_TYPE_F16: + { + ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float)); + void * src_trans_buffer = src_trans_allocator.get(); + ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float)); + void * dst_trans_buffer = dst_trans_allocator.get(); + + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + } - acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor( - src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS); - acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor( - dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS); + acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor( + src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS); + acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor( + dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS); + + aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT); + + cann_copy(ctx, acl_src_trans_tensor.get(), acl_dst_trans_tensor.get()); + + // Create head views for FP32 tensors + size_t head_trans_nb[GGML_MAX_DIMS] = { src_trans_nb[0], src_trans_nb[1], src_trans_nb[2], + src_trans_nb[3] }; + acl_tensor_ptr acl_src_trans_head = ggml_cann_create_tensor( + src_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, head_trans_nb, GGML_MAX_DIMS); + acl_tensor_ptr 
acl_dst_trans_head = ggml_cann_create_tensor( + dst_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, head_trans_nb, GGML_MAX_DIMS); + + // Copy head views to contiguous buffers for RotaryPositionEmbedding + // (RotaryPositionEmbedding may not support non-contiguous tensors) + int64_t head_elements = rope_dims * ne01 * ne02 * ne03; + ggml_cann_pool_alloc src_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); + void * src_head_contiguous_buffer = src_head_contiguous_allocator.get(); + ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); + void * dst_head_contiguous_buffer = dst_head_contiguous_allocator.get(); + + size_t head_contiguous_nb[GGML_MAX_DIMS]; + head_contiguous_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1]; + } + acl_tensor_ptr acl_src_head_contiguous = + ggml_cann_create_tensor(src_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, + head_contiguous_nb, GGML_MAX_DIMS); + acl_tensor_ptr acl_dst_head_contiguous = + ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, + head_contiguous_nb, GGML_MAX_DIMS); + + // Copy from head view to contiguous buffer + cann_copy(ctx, acl_src_trans_head.get(), acl_src_head_contiguous.get()); + + // Only rotate the first rope_dims dimensions using contiguous buffers + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head_contiguous.get(), + acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode, + acl_dst_head_contiguous.get()); + + // Copy result back from contiguous buffer to head view + cann_copy(ctx, acl_dst_head_contiguous.get(), acl_dst_trans_head.get()); + // Copy the unrotated tail portion from source to destination + size_t tail_offset_trans = rope_dims * src_trans_nb[0]; + copy_tail_device((char *) src_trans_buffer + tail_offset_trans, + (char *) dst_trans_buffer + tail_offset_trans, ACL_FLOAT, sizeof(float), + src_trans_nb, src_trans_nb); + aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16); + break; + } + default: + GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE"); + break; + } + } else { + switch (src0->type) { + case GGML_TYPE_F32: + { + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(), + acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get()); + break; + } + case GGML_TYPE_F16: + { + ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float)); + void * src_trans_buffer = src_trans_allocator.get(); + ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float)); + void * dst_trans_buffer = dst_trans_allocator.get(); + + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + } + + acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor( + src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS); + acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor( + dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS); - aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT); + aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT); - GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), - acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode, 
- acl_dst_trans_tensor.get()); + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), + acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode, + acl_dst_trans_tensor.get()); - aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16); + aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16); + break; + } + default: + GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE"); break; - } - default: - GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE"); - break; + } } } diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index b17445bb9a0..536478fe0c6 100644 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -315,7 +315,7 @@ struct ggml_cann_rope_cache { if (theta_scale_exp_host) { free(theta_scale_exp_host); } - if(position_select_index_host) { + if (position_select_index_host) { free(position_select_index_host); } } @@ -330,17 +330,19 @@ struct ggml_cann_rope_cache { bool indep_sects, bool mrope_used, bool is_imrope, - int sections[4]) { + int sections[4], + int64_t rope_dims) { return this->theta_scale_length == theta_scale_length && this->position_length == position_length && this->ext_factor == ext_factor && this->theta_scale == theta_scale && this->freq_scale == freq_scale && this->attn_factor == attn_factor && this->is_neox == is_neox && this->indep_sects == indep_sects && this->mrope_used == mrope_used && this->is_imrope == is_imrope && this->sections[0] == sections[0] && - this->sections[1] == sections[1] && this->sections[2] == sections[2] && this->sections[3] == sections[3]; + this->sections[1] == sections[1] && this->sections[2] == sections[2] && + this->sections[3] == sections[3] && this->rope_dims == rope_dims; } void set(int64_t theta_scale_length, int64_t position_length, - float ext_factor, + float ext_factor, float theta_scale, float freq_scale, float attn_factor, @@ -348,7 +350,8 @@ struct ggml_cann_rope_cache { bool indep_sects, bool mrope_used, bool is_imrope, - int sections[4]) { + int sections[4], + int64_t rope_dims) { this->theta_scale_length = theta_scale_length; this->position_length = position_length; this->ext_factor = ext_factor; @@ -363,6 +366,7 @@ struct ggml_cann_rope_cache { this->sections[1] = sections[1]; this->sections[2] = sections[2]; this->sections[3] = sections[3]; + this->rope_dims = rope_dims; } // memory cache, prepare before inferencing. 
@@ -386,6 +390,7 @@ struct ggml_cann_rope_cache { bool mrope_used = false; int sections[4] = { 0, 0, 0, 0 }; bool is_imrope = false; + int64_t rope_dims = 0; }; struct ggml_cann_tensor_cache { diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index df28d67fb0b..cd6e3b90269 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2308,7 +2308,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, bool cann_graph_update_required = false; #ifdef USE_ACL_GRAPH - bool use_cann_graph = true; + bool use_cann_graph = true; static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or("")); if (!prefill_use_graph) { @@ -2338,7 +2338,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, } } #else - bool use_cann_graph = false; + bool use_cann_graph = false; #endif // USE_ACL_GRAPH evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, cann_graph_update_required); @@ -2474,16 +2474,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten } case GGML_OP_ROPE: { - // TODO: with ops-test v == 1 - // TODO: n_dims <= ne0 - if (op->src[0]->ne[0] != op->op_params[1]) { - return false; - } - if (op->src[0]->ne[0] > 896) { return false; } #ifdef ASCEND_310P + // TODO: Support rope_dim < ne00(dim) + if (op->src[0]->ne[0] != op->op_params[1]) { + return false; + } if (!ggml_is_contiguous(op->src[0])) { return false; } From e0d679c9429a1cfeb5a444132e60da1d1d53c391 Mon Sep 17 00:00:00 2001 From: noemotiovon <757486878@qq.com> Date: Sat, 29 Nov 2025 03:29:38 +0000 Subject: [PATCH 2/2] cann: fix review comment --- ggml/src/ggml-cann/aclnn_ops.cpp | 303 +++++++++++++------------------ ggml/src/ggml-cann/common.h | 11 +- 2 files changed, 126 insertions(+), 188 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 8fa0e495a6f..835b53f6592 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2262,7 +2262,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx, // TODO: check theta_scale_length and position_length. if (src2 == nullptr && ctx.rope_cache.cached && ctx.rope_cache.equal(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor, - is_neox, indep_sects, mrope_used, is_imrope, sections, rope_dims)) { + is_neox, indep_sects, mrope_used, is_imrope, sections)) { // use cache. 
return; } @@ -2294,7 +2294,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx, acl_tensor_ptr acl_theta_scale_tensor; bool theta_scale_updated = false; if (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.theta_scale != theta_scale || - ctx.rope_cache.indep_sects != indep_sects || ctx.rope_cache.rope_dims != rope_dims) { + ctx.rope_cache.indep_sects != indep_sects) { theta_scale_updated = true; if (ctx.rope_cache.theta_scale_exp_host != nullptr) { free(ctx.rope_cache.theta_scale_exp_host); @@ -2331,18 +2331,17 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx, ACL_CHECK(aclrtMemcpyAsync(ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float), ctx.rope_cache.theta_scale_exp_host, theta_scale_length * sizeof(float), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); - - acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float), - theta_scale_ne, theta_scale_nb, 1); } + acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float), + theta_scale_ne, theta_scale_nb, 1); // Step1.2: prepare rope_yarn_ramp, if this part updated, should update theta_scale_tensor. + // TODO: acl_yarn_ramp_tensor use rope cache. bool yarn_ramp_tensor_updated = false; ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool()); acl_tensor_ptr acl_yarn_ramp_tensor; - if (ext_factor != 0 && - (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.freq_scale != freq_scale || - ctx.rope_cache.rope_dims != rope_dims || ctx.rope_cache.indep_sects != indep_sects)) { + if (ext_factor != 0 && (theta_scale_updated || ctx.rope_cache.theta_scale_length != theta_scale_length || + ctx.rope_cache.freq_scale != freq_scale)) { yarn_ramp_tensor_updated = true; // -rope_yarn_ramp @@ -2619,7 +2618,7 @@ static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx, // Update cached value. ctx.rope_cache.cached = true; ctx.rope_cache.set(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, - indep_sects, mrope_used, is_imrope, sections, rope_dims); + indep_sects, mrope_used, is_imrope, sections); } #ifdef __cplusplus @@ -2670,11 +2669,13 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope - const bool mrope_used = - mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, note: also true for vision (24 & 8 == true) and for imrope - const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope + // mrope_used means the GGML_ROPE_TYPE_MROPE bit is set. + // Note: this bit is also set for imrope and some vision modes, + // so mrope_used does NOT exclusively indicate pure mrope. 
+ const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE; + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; if (mrope_used) { GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0); @@ -2689,6 +2690,11 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { } int64_t rope_dims = n_dims; + + //Our current RotaryPositionEmbedding does not support the VISION mode, + //but essentially it only modifies theta_base in mrope, + //then repeats it at the end in the same way as is_neox. + //In fact, RoPE is still applied across all dimensions. if (is_vision) { rope_dims = src0->ne[0]; } @@ -2853,27 +2859,98 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { return; #endif int64_t acl_mode = is_neox ? 0 : 1; + + // Pre-define head and tail dimensions for reuse + int64_t head_ne[GGML_MAX_DIMS] = { rope_dims, ne01, ne02, ne03 }; + int64_t tail_ne[GGML_MAX_DIMS] = { tail_dims, ne01, ne02, ne03 }; + + // Step 1: Prepare trans tensors for F16 type conversion to F32 if needed + bool src_dst_need_trans = false; + ggml_cann_pool_alloc src_trans_allocator(ctx.pool()); + ggml_cann_pool_alloc dst_trans_allocator(ctx.pool()); + acl_tensor_ptr acl_src_trans_tensor; + acl_tensor_ptr acl_dst_trans_tensor; + void * src_trans_buffer = nullptr; + void * dst_trans_buffer = nullptr; + size_t src_dst_trans_nb[GGML_MAX_DIMS]; + if (src0->type == GGML_TYPE_F16) { + src_dst_need_trans = true; + src_trans_buffer = src_trans_allocator.alloc(ggml_nelements(src0) * sizeof(float)); + dst_trans_buffer = dst_trans_allocator.alloc(ggml_nelements(dst) * sizeof(float)); + + src_dst_trans_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_dst_trans_nb[i] = src_dst_trans_nb[i - 1] * src0->ne[i - 1]; + } + acl_src_trans_tensor = ggml_cann_create_tensor(src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, + src_dst_trans_nb, GGML_MAX_DIMS); + acl_dst_trans_tensor = ggml_cann_create_tensor(dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, + src_dst_trans_nb, GGML_MAX_DIMS); + aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT); + } + + // Step 2: Prepare head tensors for tail splitting if needed + acl_tensor_ptr acl_src_head; + acl_tensor_ptr acl_dst_head; if (has_tail) { // Create head views for RotaryPositionEmbedding (only first rope_dims dimensions) - int64_t head_ne[GGML_MAX_DIMS] = { rope_dims, ne01, ne02, ne03 }; - size_t head_nb_src[GGML_MAX_DIMS] = { nb00, nb01, nb02, nb03 }; - size_t head_nb_dst[GGML_MAX_DIMS] = { nb0, nb1, nb2, nb3 }; - acl_tensor_ptr acl_src_head = - ggml_cann_create_tensor((char *) src0->data, ggml_cann_type_mapping(src0->type), ggml_element_size(src0), - head_ne, head_nb_src, GGML_MAX_DIMS); - acl_tensor_ptr acl_dst_head = ggml_cann_create_tensor((char *) dst->data, ggml_cann_type_mapping(dst->type), - ggml_element_size(dst), head_ne, head_nb_dst, GGML_MAX_DIMS); - int64_t tail_ne[GGML_MAX_DIMS] = { tail_dims, ne01, ne02, ne03 }; - size_t tail_nb_src[GGML_MAX_DIMS] = { nb00, nb01, nb02, nb03 }; - size_t tail_nb_dst[GGML_MAX_DIMS] = { nb0, nb1, nb2, nb3 }; - size_t src_tail_offset = rope_dims * nb00; - size_t dst_tail_offset = rope_dims * nb0; + // RotaryPositionEmbedding requires contiguous dst tensor, so we use a temporary buffer + if (src_dst_need_trans) { + // Use F32 trans tensor strides + acl_src_head = ggml_cann_create_tensor((char *) src_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, + src_dst_trans_nb, GGML_MAX_DIMS); + } else { + // Use original F32 tensor strides + acl_src_head = 
ggml_cann_create_tensor((char *) src0->data, ACL_FLOAT, sizeof(float), head_ne, src0->nb, + GGML_MAX_DIMS); + } + + int64_t head_elements = rope_dims * ne01 * ne02 * ne03; + ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); + void * dst_head_contiguous_buffer = dst_head_contiguous_allocator.get(); + + size_t head_contiguous_nb[GGML_MAX_DIMS]; + head_contiguous_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1]; + } + acl_dst_head = ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, + head_contiguous_nb, GGML_MAX_DIMS); + } + + // Step 3: Execute RotaryPositionEmbedding + if (has_tail) { + // Rotate only the head portion (first rope_dims dimensions) + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head.get(), acl_cos_reshape_tensor.get(), + acl_sin_reshape_tensor.get(), acl_mode, acl_dst_head.get()); + + // Copy head result from contiguous buffer back to destination tensor + if (src_dst_need_trans) { + acl_tensor_ptr acl_dst_head_target = ggml_cann_create_tensor( + (char *) dst_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, src_dst_trans_nb, GGML_MAX_DIMS); + cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get()); + } else { + acl_tensor_ptr acl_dst_head_target = + ggml_cann_create_tensor((char *) dst->data, ACL_FLOAT, sizeof(float), head_ne, dst->nb, GGML_MAX_DIMS); + cann_copy(ctx, acl_dst_head.get(), acl_dst_head_target.get()); + } + } else if (src_dst_need_trans) { + // Rotate full tensor (no tail), using trans tensors + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), acl_cos_reshape_tensor.get(), + acl_sin_reshape_tensor.get(), acl_mode, acl_dst_trans_tensor.get()); + } else { + // Rotate full tensor (no tail), using original tensors + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(), + acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get()); + } + + // Step 4: Copy unrotated tail portion from source to destination + if (has_tail) { + size_t src_tail_offset; + size_t dst_tail_offset; auto copy_tail_device = [&](void * src_ptr, void * dst_ptr, aclDataType dtype, size_t elem_size, size_t * nb_src_arr, size_t * nb_dst_arr) { - if (!has_tail) { - return; - } acl_tensor_ptr acl_src_tail = ggml_cann_create_tensor(src_ptr, dtype, elem_size, tail_ne, nb_src_arr, GGML_MAX_DIMS); acl_tensor_ptr acl_dst_tail = @@ -2881,158 +2958,24 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { cann_copy(ctx, acl_src_tail.get(), acl_dst_tail.get()); }; - switch (src0->type) { - case GGML_TYPE_F32: - { - // Copy head views to contiguous buffers for RotaryPositionEmbedding - // (RotaryPositionEmbedding may not support non-contiguous tensors) - int64_t head_elements = rope_dims * ne01 * ne02 * ne03; - ggml_cann_pool_alloc src_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); - void * src_head_contiguous_buffer = src_head_contiguous_allocator.get(); - ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); - void * dst_head_contiguous_buffer = dst_head_contiguous_allocator.get(); - - size_t head_contiguous_nb[GGML_MAX_DIMS]; - head_contiguous_nb[0] = sizeof(float); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1]; - } - - acl_tensor_ptr acl_src_head_contiguous = - 
ggml_cann_create_tensor(src_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, - head_contiguous_nb, GGML_MAX_DIMS); - acl_tensor_ptr acl_dst_head_contiguous = - ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, - head_contiguous_nb, GGML_MAX_DIMS); - - // Copy from non-contiguous head view to contiguous buffer - cann_copy(ctx, acl_src_head.get(), acl_src_head_contiguous.get()); - - // Only rotate the first rope_dims dimensions using contiguous buffers - GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head_contiguous.get(), - acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode, - acl_dst_head_contiguous.get()); - - // Copy result back from contiguous buffer to non-contiguous head view - cann_copy(ctx, acl_dst_head_contiguous.get(), acl_dst_head.get()); - - // Copy the unrotated tail portion from source to destination - copy_tail_device((char *) src0->data + src_tail_offset, (char *) dst->data + dst_tail_offset, - ggml_cann_type_mapping(dst->type), ggml_element_size(dst), tail_nb_src, - tail_nb_dst); - break; - } - case GGML_TYPE_F16: - { - ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float)); - void * src_trans_buffer = src_trans_allocator.get(); - ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float)); - void * dst_trans_buffer = dst_trans_allocator.get(); - - size_t src_trans_nb[GGML_MAX_DIMS]; - src_trans_nb[0] = sizeof(float); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; - } - - acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor( - src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS); - acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor( - dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS); - - aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT); - - cann_copy(ctx, acl_src_trans_tensor.get(), acl_dst_trans_tensor.get()); - - // Create head views for FP32 tensors - size_t head_trans_nb[GGML_MAX_DIMS] = { src_trans_nb[0], src_trans_nb[1], src_trans_nb[2], - src_trans_nb[3] }; - acl_tensor_ptr acl_src_trans_head = ggml_cann_create_tensor( - src_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, head_trans_nb, GGML_MAX_DIMS); - acl_tensor_ptr acl_dst_trans_head = ggml_cann_create_tensor( - dst_trans_buffer, ACL_FLOAT, sizeof(float), head_ne, head_trans_nb, GGML_MAX_DIMS); - - // Copy head views to contiguous buffers for RotaryPositionEmbedding - // (RotaryPositionEmbedding may not support non-contiguous tensors) - int64_t head_elements = rope_dims * ne01 * ne02 * ne03; - ggml_cann_pool_alloc src_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); - void * src_head_contiguous_buffer = src_head_contiguous_allocator.get(); - ggml_cann_pool_alloc dst_head_contiguous_allocator(ctx.pool(), head_elements * sizeof(float)); - void * dst_head_contiguous_buffer = dst_head_contiguous_allocator.get(); - - size_t head_contiguous_nb[GGML_MAX_DIMS]; - head_contiguous_nb[0] = sizeof(float); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - head_contiguous_nb[i] = head_contiguous_nb[i - 1] * head_ne[i - 1]; - } - acl_tensor_ptr acl_src_head_contiguous = - ggml_cann_create_tensor(src_head_contiguous_buffer, ACL_FLOAT, sizeof(float), head_ne, - head_contiguous_nb, GGML_MAX_DIMS); - acl_tensor_ptr acl_dst_head_contiguous = - ggml_cann_create_tensor(dst_head_contiguous_buffer, ACL_FLOAT, 
sizeof(float), head_ne, - head_contiguous_nb, GGML_MAX_DIMS); - - // Copy from head view to contiguous buffer - cann_copy(ctx, acl_src_trans_head.get(), acl_src_head_contiguous.get()); - - // Only rotate the first rope_dims dimensions using contiguous buffers - GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_head_contiguous.get(), - acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode, - acl_dst_head_contiguous.get()); - - // Copy result back from contiguous buffer to head view - cann_copy(ctx, acl_dst_head_contiguous.get(), acl_dst_trans_head.get()); - // Copy the unrotated tail portion from source to destination - size_t tail_offset_trans = rope_dims * src_trans_nb[0]; - copy_tail_device((char *) src_trans_buffer + tail_offset_trans, - (char *) dst_trans_buffer + tail_offset_trans, ACL_FLOAT, sizeof(float), - src_trans_nb, src_trans_nb); - aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16); - break; - } - default: - GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE"); - break; + if (src_dst_need_trans) { + // Use F32 trans tensor strides and offsets + src_tail_offset = rope_dims * src_dst_trans_nb[0]; + dst_tail_offset = rope_dims * src_dst_trans_nb[0]; + copy_tail_device((char *) src_trans_buffer + src_tail_offset, (char *) dst_trans_buffer + dst_tail_offset, + ACL_FLOAT, sizeof(float), src_dst_trans_nb, src_dst_trans_nb); + } else { + // Use original tensor strides and offsets + src_tail_offset = rope_dims * nb00; + dst_tail_offset = rope_dims * nb0; + copy_tail_device((char *) src0->data + src_tail_offset, (char *) dst->data + dst_tail_offset, + ggml_cann_type_mapping(dst->type), ggml_element_size(dst), src0->nb, dst->nb); } - } else { - switch (src0->type) { - case GGML_TYPE_F32: - { - GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(), - acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get()); - break; - } - case GGML_TYPE_F16: - { - ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float)); - void * src_trans_buffer = src_trans_allocator.get(); - ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float)); - void * dst_trans_buffer = dst_trans_allocator.get(); - - size_t src_trans_nb[GGML_MAX_DIMS]; - src_trans_nb[0] = sizeof(float); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; - } - - acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor( - src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS); - acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor( - dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS); - - aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT); - - GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), - acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode, - acl_dst_trans_tensor.get()); + } - aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16); - break; - } - default: - GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE"); - break; - } + // Step 5: Cast back to F16 if needed + if (src_dst_need_trans) { + aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16); } } diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index 536478fe0c6..45c7294e682 100644 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -330,14 +330,12 @@ struct 
ggml_cann_rope_cache {
                bool indep_sects,
                bool mrope_used,
                bool is_imrope,
-               int sections[4],
-               int64_t rope_dims) {
+               int sections[4]) {
         return this->theta_scale_length == theta_scale_length && this->position_length == position_length &&
                this->ext_factor == ext_factor && this->theta_scale == theta_scale && this->freq_scale == freq_scale &&
                this->attn_factor == attn_factor && this->is_neox == is_neox && this->indep_sects == indep_sects &&
                this->mrope_used == mrope_used && this->is_imrope == is_imrope && this->sections[0] == sections[0] &&
-               this->sections[1] == sections[1] && this->sections[2] == sections[2] &&
-               this->sections[3] == sections[3] && this->rope_dims == rope_dims;
+               this->sections[1] == sections[1] && this->sections[2] == sections[2] && this->sections[3] == sections[3];
     }

     void set(int64_t theta_scale_length,
@@ -350,8 +348,7 @@ struct ggml_cann_rope_cache {
              bool indep_sects,
              bool mrope_used,
              bool is_imrope,
-             int sections[4],
-             int64_t rope_dims) {
+             int sections[4]) {
         this->theta_scale_length = theta_scale_length;
         this->position_length    = position_length;
         this->ext_factor         = ext_factor;
@@ -366,7 +363,6 @@ struct ggml_cann_rope_cache {
         this->sections[1] = sections[1];
         this->sections[2] = sections[2];
         this->sections[3] = sections[3];
-        this->rope_dims   = rope_dims;
     }

     // memory cache, prepare before inferencing.
@@ -390,7 +386,6 @@ struct ggml_cann_rope_cache {
     bool    mrope_used = false;
     int     sections[4] = { 0, 0, 0, 0 };
     bool    is_imrope = false;
-    int64_t rope_dims = 0;
 };

 struct ggml_cann_tensor_cache {
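
For reference, the head/tail split that the has_tail path implements can be sketched as a small host-side routine. This is an illustrative, simplified sketch only (it ignores freq_scale, ext_factor/YaRN and freq_factors, and rope_row_ref is a hypothetical name, not part of this patch): the first rope_dims elements of a row are rotated in NEOX-style pairs (i, i + rope_dims/2), and the remaining ne0 - rope_dims elements are copied through unchanged.

    // Illustrative reference for one row of partial NEOX-style RoPE (simplified:
    // no freq_scale, no YaRN/ext_factor, no freq_factors). Hypothetical helper,
    // not part of the patch.
    #include <cmath>
    #include <cstdint>

    void rope_row_ref(const float * src, float * dst, int64_t ne0, int64_t rope_dims,
                      float pos, float freq_base) {
        const int64_t half = rope_dims / 2;
        for (int64_t i = 0; i < half; ++i) {
            // theta_i = pos * freq_base^(-2*i/rope_dims), as in standard RoPE
            const float theta = pos * std::pow(freq_base, -2.0f * (float) i / (float) rope_dims);
            const float c  = std::cos(theta);
            const float s  = std::sin(theta);
            const float x0 = src[i];         // NEOX layout pairs (i, i + half)
            const float x1 = src[i + half];
            dst[i]        = x0 * c - x1 * s;
            dst[i + half] = x0 * s + x1 * c;
        }
        // tail: dimensions beyond rope_dims are copied through unrotated
        for (int64_t i = rope_dims; i < ne0; ++i) {
            dst[i] = src[i];
        }
    }

With rope_dims == ne0 (the Vision case, where the patch sets rope_dims = src0->ne[0]) the tail loop is empty and the whole row is rotated, which is why no tail handling is needed there.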