From c2bfbbf6424800007888d016f13c0b501b200c68 Mon Sep 17 00:00:00 2001
From: Piotr Wilkin <piotr.wilkin@syndatis.com>
Date: Mon, 17 Nov 2025 22:54:28 +0100
Subject: [PATCH 1/5] Fix overly relaxed check on CUDA "fast copy"
 (can_be_transposed) condition

---
 ggml/src/ggml-cuda/cpy.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 50612237c8a23..6d9fe5145880e 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -384,7 +384,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     char * src1_ddc = (char *) src1->data;
 
     const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
-    const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) && src0->ne[3] == 1;
+    const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) && 
+        src0->ne[3] == 1 && nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0);
 
     if (src0->type == src1->type && contiguous_srcs) {
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));

From 5e7c26f297a57d4feac9811016ac72d94e1fb8a9 Mon Sep 17 00:00:00 2001
From: Piotr Wilkin <piotr.wilkin@syndatis.com>
Date: Mon, 17 Nov 2025 22:59:28 +0100
Subject: [PATCH 2/5] Remove trailing whitespace in can_be_transposed condition

---
 ggml/src/ggml-cuda/cpy.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 6d9fe5145880e..c1afde9627f09 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -384,7 +384,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     char * src1_ddc = (char *) src1->data;
 
     const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
-    const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) && 
+    const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) &&
         src0->ne[3] == 1 && nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0);
 
     if (src0->type == src1->type && contiguous_srcs) {

From d51f719607bbdbc4d19b28f2b87e5c776f218300 Mon Sep 17 00:00:00 2001
From: Piotr Wilkin <piotr.wilkin@syndatis.com>
Date: Mon, 17 Nov 2025 23:29:20 +0100
Subject: [PATCH 3/5] Add test_irregular_cont test for CONT on view slices (making CISC happy ;)

---
 tests/test-backend-ops.cpp | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 267bead8c4ab7..173bc1a04f12a 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2800,6 +2800,33 @@ struct test_cont : public test_case {
     }
 };
 
+struct test_irregular_cont : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_irregular_cont(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {1, 4, 2, 1})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(src);
+        ggml_set_name(src, "src");
+
+        ggml_tensor * view = ggml_view_4d(ctx, src, src->ne[0], 1, src->ne[2], src->ne[3],
+                                src->nb[1], src->nb[2], src->nb[3], src->nb[0] * (src->ne[1] - 1));
+
+        ggml_tensor * out = ggml_cont(ctx, view);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
 // GGML_OP_ADD
 // GGML_OP_SUB
 // GGML_OP_MUL
@@ -6956,6 +6983,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5}));
     test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}));
 
+    test_cases.emplace_back(new test_irregular_cont());
+    test_cases.emplace_back(new test_irregular_cont(GGML_TYPE_F32, {1, 8, 17, 1}));
+    test_cases.emplace_back(new test_irregular_cont(GGML_TYPE_BF16, {1, 4, 2, 1}));
+    test_cases.emplace_back(new test_irregular_cont(GGML_TYPE_BF16, {1, 8, 17, 1}));
+
     auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
         for (auto op : {ggml_add, ggml_sub, ggml_mul, ggml_div}) {
             test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr));

From f378da999b17f9aa024164b97a3b1077b09cbedf Mon Sep 17 00:00:00 2001
From: Piotr Wilkin <piotr.wilkin@syndatis.com>
Date: Tue, 18 Nov 2025 13:40:16 +0100
Subject: [PATCH 4/5] Integrate CONT tests

---
 tests/test-backend-ops.cpp | 67 ++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 36 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 173bc1a04f12a..7f01ac959f714 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2776,51 +2776,34 @@ struct test_cpy : public test_case {
 struct test_cont : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
+    bool use_view_slice;
 
     std::string vars() override {
-        return VARS_TO_STR2(type, ne);
+        return VARS_TO_STR3(type, ne, use_view_slice);
     }
 
     test_cont(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1})
-        : type(type), ne(ne) {}
+            std::array<int64_t, 4> ne = {10, 10, 10, 1},
+            bool use_view_slice = false)
+        : type(type), ne(ne), use_view_slice(use_view_slice) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
         ggml_set_param(src);
         ggml_set_name(src, "src");
 
-        src = ggml_transpose(ctx, src);
-        ggml_set_name(src, "src_transposed");
 
-        ggml_tensor * out = ggml_cont(ctx, src);
-        ggml_set_name(out, "out");
-
-        return out;
-    }
-};
-
-struct test_irregular_cont : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
-    }
-
-    test_irregular_cont(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {1, 4, 2, 1})
-        : type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_set_param(src);
-        ggml_set_name(src, "src");
-
-        ggml_tensor * view = ggml_view_4d(ctx, src, src->ne[0], 1, src->ne[2], src->ne[3],
-                                src->nb[1], src->nb[2], src->nb[3], src->nb[0] * (src->ne[1] - 1));
+        ggml_tensor * dst;
+        if (use_view_slice) {
+            dst = ggml_view_4d(ctx, src, src->ne[0], 1, src->ne[2], src->ne[3],
+                src->nb[1], src->nb[2], src->nb[3], src->nb[0] * (src->ne[1] - 1));
+            ggml_set_name(dst, "src_view_slice");
+        } else {
+            dst = ggml_transpose(ctx, src);
+            ggml_set_name(dst, "src_transposed");
+        }
 
-        ggml_tensor * out = ggml_cont(ctx, view);
+        ggml_tensor * out = ggml_cont(ctx, dst);
         ggml_set_name(out, "out");
 
         return out;
@@ -6983,10 +6966,22 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5}));
     test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}));
 
-    test_cases.emplace_back(new test_irregular_cont());
-    test_cases.emplace_back(new test_irregular_cont(GGML_TYPE_F32, {1, 8, 17, 1}));
-    test_cases.emplace_back(new test_irregular_cont(GGML_TYPE_BF16, {1, 4, 2, 1}));
-    test_cases.emplace_back(new test_irregular_cont(GGML_TYPE_BF16, {1, 8, 17, 1}));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}, true));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5}, true));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7}, true));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1}, true));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5}, true));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7}, true));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1}, true));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5}, true));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}, true));
+
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {1, 4, 2, 1}, true));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {1, 8, 17, 1}, true));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {1, 4, 2, 1}, true));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {1, 8, 17, 1}, true));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {1, 4, 2, 1}, true));
+    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {1, 8, 17, 1}, true));
 
     auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
         for (auto op : {ggml_add, ggml_sub, ggml_mul, ggml_div}) {

From ccd0dfaaab5249e5f89cc756b82790dccad35056 Mon Sep 17 00:00:00 2001
From: Piotr Wilkin <piotr.wilkin@syndatis.com>
Date: Tue, 18 Nov 2025 14:02:55 +0100
Subject: [PATCH 5/5] Generate test_cont cases with nested loops over type, use_view_slice and shape

---
 tests/test-backend-ops.cpp | 35 ++++++++---------------------------
 1 file changed, 8 insertions(+), 27 deletions(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 7f01ac959f714..d58f1e6889dbe 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6955,33 +6955,14 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
     test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
 
-    test_cases.emplace_back(new test_cont());
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5}));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7}));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1}));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5}));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7}));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1}));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5}));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}));
-
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}, true));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5}, true));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7}, true));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1}, true));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5}, true));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7}, true));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1}, true));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5}, true));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}, true));
-
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {1, 4, 2, 1}, true));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {1, 8, 17, 1}, true));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {1, 4, 2, 1}, true));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {1, 8, 17, 1}, true));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {1, 4, 2, 1}, true));
-    test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {1, 8, 17, 1}, true));
+    for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16 }) {
+        for (bool use_view_slice : { true, false }) {
+            for (std::array<int64_t, 4> ne : std::initializer_list<std::array<int64_t, 4>>{ {2, 1, 1, 1}, {2, 1, 3, 5},
+                {2, 3, 5, 7}, {1, 4, 2, 1}, {1, 8, 17, 1}, {10, 10, 10, 1} }) {
+                test_cases.emplace_back(new test_cont(type_dst, ne, use_view_slice));
+            }
+        }
+    }
 
     auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
         for (auto op : {ggml_add, ggml_sub, ggml_mul, ggml_div}) {