From 772bb890fc9faf5f6359fe124d53e10988fbba75 Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Fri, 15 Aug 2025 08:41:14 +0000
Subject: [PATCH 1/6] issue/360: kunlun conv failed

---
 src/infiniop/ops/conv/kunlun/conv_kunlun.cc | 298 ++++++++++++++++++++
 src/infiniop/ops/conv/kunlun/conv_kunlun.h  |   8 +
 src/infiniop/ops/conv/operator.cc           |  15 +
 3 files changed, 321 insertions(+)
 create mode 100644 src/infiniop/ops/conv/kunlun/conv_kunlun.cc
 create mode 100644 src/infiniop/ops/conv/kunlun/conv_kunlun.h
diff --git a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
new file mode 100644
index 000000000..2246ecc03
--- /dev/null
+++ b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
@@ -0,0 +1,298 @@
+#include "conv_kunlun.h"
+#include "../../../../utils.h"
+#include "../../../devices/kunlun/kunlun_common.h"
+#include "../../../devices/kunlun/kunlun_handle.h"
+#include <xpu/refactor/context/xpu_act_type.h>
+namespace op::conv::kunlun {
+
+struct Descriptor::Opaque { // backend state held by a kunlun conv Descriptor; freed in ~Descriptor
+    std::shared_ptr<device::kunlun::Handle::Internal> internal; // taken from handle->internal() in create(); conv_kernel calls useXdnn() on it
+};
+
+// Releases the Opaque allocated in Descriptor::create(), which in turn drops
+// this descriptor's shared reference to the Handle internals.
+Descriptor::~Descriptor() { delete _opaque; }
+
+// Builds the Kunlun conv descriptor: checks the output dtype, derives ConvInfo
+// from the tensor descriptors and sizes the scratch workspace conv_kernel needs.
+// Returns INFINI_STATUS_SUCCESS once *desc_ptr holds the new Descriptor.
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    infiniopTensorDescriptor_t w_desc,
+    infiniopTensorDescriptor_t b_desc,
+    const void *pads,
+    const void *strides,
+    const void *dilations,
+    size_t n) {
+    auto handle = reinterpret_cast<device::kunlun::Handle *>(handle_);
+    auto dtype = y_desc->dtype();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+    auto result = ConvInfo::create(handle_, y_desc, x_desc, w_desc, b_desc,
+                                   pads, strides, dilations, n);
+    CHECK_RESULT(result);
+    auto conv_info = result.take();
+    // conv_kernel casts every bias element to float inside the workspace, so
+    // size it by the element count (product of the bias dims), not the rank.
+    size_t bias_elems = conv_info.bias_dims_size() > 0 ? 1 : 0;
+    for (size_t i = 0; i < conv_info.bias_dims_size(); ++i) {
+        bias_elems *= conv_info.bias_dim(i);
+    }
+    *desc_ptr = new Descriptor(dtype, conv_info, bias_elems * sizeof(float),
+                               new Opaque{handle->internal()}, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t conv_kernel(
+    std::shared_ptr<device::kunlun::Handle::Internal> internal,
+    const ConvInfo &info,
+    infiniDtype_t dtype,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    const void *w,
+    const void *bias,
+    void *stream) {
+    char *workspace_value = reinterpret_cast<char *>(workspace);
+    int64_t bias_ndims = info.bias_dims_size();
+    int64_t bias_size = 1;
+    for (int64_t i = 0; i < bias_ndims; i++) {
+        bias_size *= info.bias_dim(i);
+    }
+    float *bias_F32 = (float *)workspace_value;
+    switch (info.ndim()) {
+    case 1: {
+        int64_t ksize = (int64_t)info.kernel_dim(0);
+        int64_t stride = (int64_t)info.stride_info(0);
+        std::initializer_list<int64_t> pad = {(int64_t)info.pad_info(0)};
+        int64_t dilation = (int64_t)info.dilation_info(0);
+        printf("x_shape:(%ld, %ld, %ld)\n", info.batch(), info.in_channels(), info.input_dim(0));
+        printf("kernel_dim:(%ld)\n", ksize);
+        printf("stride:(%ld)\n", stride);
+        printf("pad:(%ld)\n", (int64_t)info.pad_info(0));
+        printf("dilation:(%ld)\n", dilation);
+        std::cout << "ndim: " << info.ndim() << " bias_size: " << bias_size << std::endl;
+        if (dtype == INFINI_DTYPE_F16) {
+            // float16 *host_x, *host_w, *host_bias;
+            // host_x = (float16 *)malloc((int)info.batch() * (int)info.in_channels() * (int)info.input_dim(0) * sizeof(float16));
+            // host_w = (float16 *)malloc((int)bias_size * (int)info.in_channels() * (int)info.kernel_dim(0) * sizeof(float16));
+            // host_bias = (float16 *)malloc((int)bias_size * sizeof(float16));
+            // xpu_memcpy(host_x, x, (int)info.batch() * (int)info.in_channels() * (int)info.input_dim(0) * sizeof(float16), XPU_DEVICE_TO_HOST);
+            // xpu_memcpy(host_w, w, (int)bias_size * (int)info.in_channels() * (int)info.kernel_dim(0) * sizeof(float16), XPU_DEVICE_TO_HOST);
+            // xpu_memcpy(host_bias, bias, (int)bias_size * sizeof(float16), XPU_DEVICE_TO_HOST);
+            // for (int i = 0; i < (int)info.batch() * (int)info.in_channels() * (int)info.input_dim(0); i++) {
+            //     printf("%.4f ", static_cast<float>(host_x[i]));
+            // }
+            // printf("\n");
+            // for (int i = 0; i < (int)bias_size * (int)info.in_channels() * (int)info.kernel_dim(0); i++) {
+            //     printf("%.4f ", static_cast<float>(host_w[i]));
+            // }
+            // printf("\n");
+            // for (int i = 0; i < (int)bias_size; i++) {
+            //     printf("%.4f ", static_cast<float>(host_bias[i]));
+            // }
+            // printf("\n");
+            if (bias_size > 0) {
+                CHECK_STATUS(internal->useXdnn(
+                    (kunlunStream_t)stream,
+                    [&](xdnnHandle_t handle) {
+                        CHECK_KUNLUN((xdnn::cast<float16, float>(handle, (float16 *)bias, bias_F32, bias_size)));
+                        CHECK_KUNLUN((xdnn::conv1d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
+                                                                                              (int64_t)info.kernel_dim(0), ksize,
+                                                                                              stride, pad,
+                                                                                              dilation, 1, nullptr,
+                                                                                              nullptr, nullptr, true, bias_F32,
+                                                                                              nullptr, baidu::xpu::api::Activation_t::LINEAR,
+                                                                                              nullptr)));
+                        return INFINI_STATUS_SUCCESS;
+                    }));
+            } else {
+                CHECK_STATUS(internal->useXdnn(
+                    (kunlunStream_t)stream,
+                    [&](xdnnHandle_t handle) {
+                        CHECK_KUNLUN((xdnn::conv1d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
+                                                                                              (int64_t)info.kernel_dim(0), ksize,
+                                                                                              stride, pad,
+                                                                                              dilation, 1, nullptr,
+                                                                                              nullptr, nullptr, true, nullptr,
+                                                                                              nullptr, baidu::xpu::api::Activation_t::LINEAR,
+                                                                                              nullptr)));
+                        return INFINI_STATUS_SUCCESS;
+                    }));
+            }
+            return INFINI_STATUS_SUCCESS;
+
+        } else if (dtype == INFINI_DTYPE_F32) {
+            CHECK_STATUS(internal->useXdnn(
+                (kunlunStream_t)stream,
+                [&](xdnnHandle_t handle) {
+                    CHECK_KUNLUN((xdnn::conv1d_fusion<float, float, float, int16_t>(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
+                                                                                    (int64_t)info.kernel_dim(0), ksize,
+                                                                                    stride, pad,
+                                                                                    dilation, 1, nullptr,
+                                                                                    nullptr, nullptr, true, (float *)bias,
+                                                                                    nullptr, baidu::xpu::api::Activation_t::LINEAR,
+                                                                                    nullptr)));
+                    return INFINI_STATUS_SUCCESS;
+                }));
+        } else {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+        break;
+    }
+    case 2: {
+        std::vector<int64_t> ksize = {(int64_t)info.kernel_dim(0), (int64_t)info.kernel_dim(1)};
+        std::vector<int64_t> stride = {(int64_t)info.stride_info(0), (int64_t)info.stride_info(1)};
+        std::vector<int64_t> pad = {(int64_t)info.pad_info(0), (int64_t)info.pad_info(1)};
+        std::vector<int64_t> dilation = {(int64_t)info.dilation_info(0), (int64_t)info.dilation_info(1)};
+        printf("x_shape:(%ld, %ld, %ld, %ld)\n", info.batch(), info.in_channels(), info.input_dim(0), info.input_dim(1));
+        printf("kernel_dim:(%ld, %ld)\n", ksize[0], ksize[1]);
+        printf("stride:(%ld, %ld)\n", stride[0], stride[1]);
+        printf("pad:(%ld, %ld)\n", pad[0], pad[1]);
+        printf("dilation:(%ld, %ld)\n", dilation[0], dilation[1]);
+        std::cout << "ndim: " << info.ndim() << " bias_size: " << bias_size << std::endl;
+        if (dtype == INFINI_DTYPE_F16) {
+            if (bias_size > 0) {
+                CHECK_STATUS(internal->useXdnn(
+                    (kunlunStream_t)stream,
+                    [&](xdnnHandle_t handle) {
+                        CHECK_KUNLUN((xdnn::cast<float16, float>(handle, (float16 *)bias, bias_F32, bias_size)));
+                        CHECK_KUNLUN((xdnn::conv2d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
+                                                                                              (int64_t)info.input_dim(1), (int64_t)info.kernel_dim(0), ksize,
+                                                                                              stride, pad,
+                                                                                              dilation, 1, nullptr,
+                                                                                              nullptr, nullptr, true, bias_F32,
+                                                                                              nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr,
+                                                                                              nullptr, -1)));
+                        return INFINI_STATUS_SUCCESS;
+                    }));
+            } else {
+                CHECK_STATUS(internal->useXdnn(
+                    (kunlunStream_t)stream,
+                    [&](xdnnHandle_t handle) {
+                        CHECK_KUNLUN((xdnn::conv2d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
+                                                                                              (int64_t)info.input_dim(1), (int64_t)info.kernel_dim(0), ksize,
+                                                                                              stride, pad,
+                                                                                              dilation, 1, nullptr,
+                                                                                              nullptr, nullptr, true, nullptr,
+                                                                                              nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr,
+                                                                                              nullptr, -1)));
+                        return INFINI_STATUS_SUCCESS;
+                    }));
+            }
+            return INFINI_STATUS_SUCCESS;
+
+        } else if (dtype == INFINI_DTYPE_F32) {
+            CHECK_STATUS(internal->useXdnn(
+                (kunlunStream_t)stream,
+                [&](xdnnHandle_t handle) {
+                    CHECK_KUNLUN((xdnn::conv2d_fusion<float, float, float, int16_t>(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
+                                                                                    (int64_t)info.input_dim(1), (int64_t)info.kernel_dim(0), ksize,
+                                                                                    stride, pad,
+                                                                                    dilation, 1, nullptr,
+                                                                                    nullptr, nullptr, true, (float *)bias,
+                                                                                    nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr,
+                                                                                    nullptr, -1)));
+                    return INFINI_STATUS_SUCCESS;
+                }));
+        } else {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+        break;
+    }
+    case 3: {
+        std::vector<int64_t> ksize = {(int64_t)info.kernel_dim(0), (int64_t)info.kernel_dim(1), (int64_t)info.kernel_dim(2)};
+        std::vector<int64_t> stride = {(int64_t)info.stride_info(0), (int64_t)info.stride_info(1), (int64_t)info.stride_info(2)};
+        std::vector<int64_t> pad = {(int64_t)info.pad_info(0), (int64_t)info.pad_info(1), (int64_t)info.pad_info(2)};
+        std::vector<int64_t> dilation = {(int64_t)info.dilation_info(0), (int64_t)info.dilation_info(1), (int64_t)info.dilation_info(2)};
+
+        printf("x_shape:(%ld, %ld, %ld, %ld, %ld)\n", info.batch(), info.in_channels(), info.input_dim(0), info.input_dim(1), info.input_dim(2));
+        printf("kernel_dim:(%ld, %ld, %ld)\n", ksize[0], ksize[1], ksize[2]);
+        printf("stride:(%ld, %ld, %ld)\n", stride[0], stride[1], stride[2]);
+        printf("pad:(%ld, %ld, %ld)\n", pad[0], pad[1], pad[2]);
+        printf("dilation:(%ld, %ld, %ld)\n", dilation[0], dilation[1], dilation[2]);
+        std::cout << "ndim: " << info.ndim() << " bias_size: " << bias_size << std::endl;
+        if (dtype == INFINI_DTYPE_F16) {
+            if (bias_size > 0) {
+                CHECK_STATUS(internal->useXdnn(
+                    (kunlunStream_t)stream,
+                    [&](xdnnHandle_t handle) {
+                        CHECK_KUNLUN((xdnn::cast<float16, float>(handle, (float16 *)bias, bias_F32, bias_size)));
+                        CHECK_KUNLUN((xdnn::conv3d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
+                                                                                              (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.kernel_dim(0), ksize,
+                                                                                              stride, pad,
+                                                                                              dilation, 1, nullptr,
+                                                                                              nullptr, nullptr, true, bias_F32,
+                                                                                              nullptr, baidu::xpu::api::Activation_t::LINEAR,
+                                                                                              nullptr)));
+                        return INFINI_STATUS_SUCCESS;
+                    }));
+            } else {
+                CHECK_STATUS(internal->useXdnn(
+                    (kunlunStream_t)stream,
+                    [&](xdnnHandle_t handle) {
+                        CHECK_KUNLUN((xdnn::conv3d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
+                                                                                              (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.kernel_dim(0), ksize,
+                                                                                              stride, pad,
+                                                                                              dilation, 1, nullptr,
+                                                                                              nullptr, nullptr, true, nullptr,
+                                                                                              nullptr, baidu::xpu::api::Activation_t::LINEAR,
+                                                                                              nullptr)));
+                        return INFINI_STATUS_SUCCESS;
+                    }));
+            }
+            return INFINI_STATUS_SUCCESS;
+        } else if (dtype == INFINI_DTYPE_F32) {
+            CHECK_STATUS(internal->useXdnn(
+                (kunlunStream_t)stream,
+                [&](xdnnHandle_t handle) {
+                    CHECK_KUNLUN((xdnn::conv3d_fusion<float, float, float, int16_t>(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
+                                                                                    (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.kernel_dim(0), ksize,
+                                                                                    stride, pad,
+                                                                                    dilation, 1, nullptr,
+                                                                                    nullptr, nullptr, true, (float *)bias,
+                                                                                    nullptr, baidu::xpu::api::Activation_t::LINEAR,
+                                                                                    nullptr)));
+                    return INFINI_STATUS_SUCCESS;
+                }));
+        } else {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+        break;
+    }
+    default:
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Executes the convolution described by this descriptor on `stream`.
+// `workspace` must provide at least `_workspace_size` bytes; a smaller
+// buffer is rejected with INFINI_STATUS_INSUFFICIENT_WORKSPACE before
+// conv_kernel is invoked.
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    const void *w,
+    const void *bias,
+    void *stream) const {
+    const bool workspace_ok = workspace_size >= _workspace_size;
+    if (!workspace_ok) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // Rank and dtype dispatch live in conv_kernel; CHECK_STATUS presumably
+    // propagates any non-success status it reports (macro defined elsewhere).
+    CHECK_STATUS(conv_kernel(_opaque->internal, _info, _dtype,
+                             workspace, workspace_size,
+                             y, x, w, bias, stream));
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::conv::kunlun
diff --git a/src/infiniop/ops/conv/kunlun/conv_kunlun.h b/src/infiniop/ops/conv/kunlun/conv_kunlun.h
new file mode 100644
index 000000000..bd21d2168
--- /dev/null
+++ b/src/infiniop/ops/conv/kunlun/conv_kunlun.h
@@ -0,0 +1,8 @@
+#ifndef __CONV_KUNLUN_H__
+#define __CONV_KUNLUN_H__
+
+#include "../conv.h"
+// DESCRIPTOR is presumably provided by ../conv.h and declares op::conv::kunlun::Descriptor, which conv_kunlun.cc implements — confirm.
+DESCRIPTOR(kunlun)
+
+#endif // __CONV_KUNLUN_H__
diff --git a/src/infiniop/ops/conv/operator.cc b/src/infiniop/ops/conv/operator.cc
index df033f44f..9e93ea230 100644
--- a/src/infiniop/ops/conv/operator.cc
+++ b/src/infiniop/ops/conv/operator.cc
@@ -8,6 +8,9 @@
 #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
 #include "nvidia/conv_nvidia.cuh"
 #endif
+#ifdef ENABLE_KUNLUN_API
+#include "kunlun/conv_kunlun.h"
+#endif
 
 __C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle,
                                                          infiniopConvDescriptor_t *desc_ptr,
@@ -42,6 +45,9 @@ __C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle
 #ifdef ENABLE_ILUVATAR_API
         CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_KUNLUN_API
+        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
 
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -70,6 +76,9 @@ infiniopGetConvWorkspaceSize(
 #ifdef ENABLE_ILUVATAR_API
         GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_KUNLUN_API
+        GET(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
 
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -106,6 +115,9 @@ __C infiniStatus_t infiniopConv(
 #ifdef ENABLE_ILUVATAR_API
         CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_KUNLUN_API
+        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
 
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -130,6 +142,9 @@ infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) {
 #ifdef ENABLE_ILUVATAR_API
         DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_KUNLUN_API
+        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
 
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

From 858513daabb923ea6e2ad821833727c8ed440737 Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Mon, 18 Aug 2025 05:42:31 +0000
Subject: [PATCH 2/6] issue/360: modified bias_size

---
 src/infiniop/ops/conv/kunlun/conv_kunlun.cc | 14 +++++--
 test/infiniop/conv.py                       | 45 ++++++++++++---------
 2 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
index 2246ecc03..1c65e9289 100644
--- a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
+++ b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
@@ -59,8 +59,12 @@ infiniStatus_t conv_kernel(
     char *workspace_value = reinterpret_cast<char *>(workspace);
     int64_t bias_ndims = info.bias_dims_size();
     int64_t bias_size = 1;
-    for (int64_t i = 0; i < bias_ndims; i++) {
-        bias_size *= info.bias_dim(i);
+    if (bias_ndims > 0) {
+        for (int64_t i = 0; i < bias_ndims; i++) {
+            bias_size *= info.bias_dim(i);
+        }
+    } else {
+        bias_size = 0;
     }
     float *bias_F32 = (float *)workspace_value;
     switch (info.ndim()) {
@@ -146,7 +150,11 @@ infiniStatus_t conv_kernel(
     case 2: {
         std::vector<int64_t> ksize = {(int64_t)info.kernel_dim(0), (int64_t)info.kernel_dim(1)};
         std::vector<int64_t> stride = {(int64_t)info.stride_info(0), (int64_t)info.stride_info(1)};
-        std::vector<int64_t> pad = {(int64_t)info.pad_info(0), (int64_t)info.pad_info(1)};
+        std::vector<int64_t> pad = {
+            (int64_t)info.pad_info(0),
+            (int64_t)info.pad_info(0),
+            (int64_t)info.pad_info(1),
+            (int64_t)info.pad_info(1)};
         std::vector<int64_t> dilation = {(int64_t)info.dilation_info(0), (int64_t)info.dilation_info(1)};
         printf("x_shape:(%ld, %ld, %ld, %ld)\n", info.batch(), info.in_channels(), info.input_dim(0), info.input_dim(1));
         printf("kernel_dim:(%ld, %ld)\n", ksize[0], ksize[1]);
diff --git a/test/infiniop/conv.py b/test/infiniop/conv.py
index 6cb99da9f..f30d312cd 100644
--- a/test/infiniop/conv.py
+++ b/test/infiniop/conv.py
@@ -49,6 +49,15 @@
         (1, 2),
         (2, 1),
     ),
+    (
+        (1, 3, 32, 32),
+        (32 * 32 * 3, 32 * 32, 32, 1),
+        (2, 3, 5, 5),
+        (75, 25, 5, 1),
+        (2, 2),
+        (2, 2),
+        (1, 1),
+    ),
     (
         (32, 3, 32, 32),
         (32 * 32 * 3, 32 * 32, 32, 1),
@@ -96,27 +105,27 @@
 
 
 def conv(x, w, stride, padding, dilation, y_tensor, bias=None):
-    match len(x.shape) - 2:
-        case 1:
-            y_tensor.copy_(
-                F.conv1d(
-                    x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
-                )
+    dim = len(x.shape) - 2
+    if dim == 1:
+        y_tensor.copy_(
+            F.conv1d(
+                x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
             )
-        case 2:
-            y_tensor.copy_(
-                F.conv2d(
-                    x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
-                )
+        )
+    elif dim == 2:
+        y_tensor.copy_(
+            F.conv2d(
+                x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
             )
-        case 3:
-            y_tensor.copy_(
-                F.conv3d(
-                    x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
-                )
+        )
+    elif dim == 3:
+        y_tensor.copy_(
+            F.conv3d(
+                x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
             )
-        case _:
-            print("Error: Pytorch -> Unsupported tensor dimension")
+        )
+    else:
+        print("Error: Pytorch -> Unsupported tensor dimension")
 
 
 # infer the shape of the output given the inputs for a N-ary convolution

From d260f2f221bd907c962a0ce9ec94af8b4baef7c9 Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Thu, 21 Aug 2025 02:58:01 +0000
Subject: [PATCH 3/6] issue/360: success conv

---
 src/infiniop/ops/conv/kunlun/conv_kunlun.cc | 58 +++++----------------
 test/infiniop/conv.py                       | 45 +++++++---------
 2 files changed, 30 insertions(+), 73 deletions(-)

diff --git a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
index 1c65e9289..b08e70dda 100644
--- a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
+++ b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
@@ -73,39 +73,16 @@ infiniStatus_t conv_kernel(
         int64_t stride = (int64_t)info.stride_info(0);
         std::initializer_list<int64_t> pad = {(int64_t)info.pad_info(0)};
         int64_t dilation = (int64_t)info.dilation_info(0);
-        printf("x_shape:(%ld, %ld, %ld)\n", info.batch(), info.in_channels(), info.input_dim(0));
-        printf("kernel_dim:(%ld)\n", ksize);
-        printf("stride:(%ld)\n", stride);
-        printf("pad:(%ld)\n", (int64_t)info.pad_info(0));
-        printf("dilation:(%ld)\n", dilation);
-        std::cout << "ndim: " << info.ndim() << " bias_size: " << bias_size << std::endl;
+
         if (dtype == INFINI_DTYPE_F16) {
-            // float16 *host_x, *host_w, *host_bias;
-            // host_x = (float16 *)malloc((int)info.batch() * (int)info.in_channels() * (int)info.input_dim(0) * sizeof(float16));
-            // host_w = (float16 *)malloc((int)bias_size * (int)info.in_channels() * (int)info.kernel_dim(0) * sizeof(float16));
-            // host_bias = (float16 *)malloc((int)bias_size * sizeof(float16));
-            // xpu_memcpy(host_x, x, (int)info.batch() * (int)info.in_channels() * (int)info.input_dim(0) * sizeof(float16), XPU_DEVICE_TO_HOST);
-            // xpu_memcpy(host_w, w, (int)bias_size * (int)info.in_channels() * (int)info.kernel_dim(0) * sizeof(float16), XPU_DEVICE_TO_HOST);
-            // xpu_memcpy(host_bias, bias, (int)bias_size * sizeof(float16), XPU_DEVICE_TO_HOST);
-            // for (int i = 0; i < (int)info.batch() * (int)info.in_channels() * (int)info.input_dim(0); i++) {
-            //     printf("%.4f ", static_cast<float>(host_x[i]));
-            // }
-            // printf("\n");
-            // for (int i = 0; i < (int)bias_size * (int)info.in_channels() * (int)info.kernel_dim(0); i++) {
-            //     printf("%.4f ", static_cast<float>(host_w[i]));
-            // }
-            // printf("\n");
-            // for (int i = 0; i < (int)bias_size; i++) {
-            //     printf("%.4f ", static_cast<float>(host_bias[i]));
-            // }
-            // printf("\n");
+
             if (bias_size > 0) {
                 CHECK_STATUS(internal->useXdnn(
                     (kunlunStream_t)stream,
                     [&](xdnnHandle_t handle) {
                         CHECK_KUNLUN((xdnn::cast<float16, float>(handle, (float16 *)bias, bias_F32, bias_size)));
                         CHECK_KUNLUN((xdnn::conv1d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                              (int64_t)info.kernel_dim(0), ksize,
+                                                                                              (int64_t)info.out_channels(), ksize,
                                                                                               stride, pad,
                                                                                               dilation, 1, nullptr,
                                                                                               nullptr, nullptr, true, bias_F32,
@@ -118,7 +95,7 @@ infiniStatus_t conv_kernel(
                     (kunlunStream_t)stream,
                     [&](xdnnHandle_t handle) {
                         CHECK_KUNLUN((xdnn::conv1d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                              (int64_t)info.kernel_dim(0), ksize,
+                                                                                              (int64_t)info.out_channels(), ksize,
                                                                                               stride, pad,
                                                                                               dilation, 1, nullptr,
                                                                                               nullptr, nullptr, true, nullptr,
@@ -134,7 +111,7 @@ infiniStatus_t conv_kernel(
                 (kunlunStream_t)stream,
                 [&](xdnnHandle_t handle) {
                     CHECK_KUNLUN((xdnn::conv1d_fusion<float, float, float, int16_t>(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                    (int64_t)info.kernel_dim(0), ksize,
+                                                                                    (int64_t)info.out_channels(), ksize,
                                                                                     stride, pad,
                                                                                     dilation, 1, nullptr,
                                                                                     nullptr, nullptr, true, (float *)bias,
@@ -156,12 +133,7 @@ infiniStatus_t conv_kernel(
             (int64_t)info.pad_info(1),
             (int64_t)info.pad_info(1)};
         std::vector<int64_t> dilation = {(int64_t)info.dilation_info(0), (int64_t)info.dilation_info(1)};
-        printf("x_shape:(%ld, %ld, %ld, %ld)\n", info.batch(), info.in_channels(), info.input_dim(0), info.input_dim(1));
-        printf("kernel_dim:(%ld, %ld)\n", ksize[0], ksize[1]);
-        printf("stride:(%ld, %ld)\n", stride[0], stride[1]);
-        printf("pad:(%ld, %ld)\n", pad[0], pad[1]);
-        printf("dilation:(%ld, %ld)\n", dilation[0], dilation[1]);
-        std::cout << "ndim: " << info.ndim() << " bias_size: " << bias_size << std::endl;
+
         if (dtype == INFINI_DTYPE_F16) {
             if (bias_size > 0) {
                 CHECK_STATUS(internal->useXdnn(
@@ -169,7 +141,7 @@ infiniStatus_t conv_kernel(
                     [&](xdnnHandle_t handle) {
                         CHECK_KUNLUN((xdnn::cast<float16, float>(handle, (float16 *)bias, bias_F32, bias_size)));
                         CHECK_KUNLUN((xdnn::conv2d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                              (int64_t)info.input_dim(1), (int64_t)info.kernel_dim(0), ksize,
+                                                                                              (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize,
                                                                                               stride, pad,
                                                                                               dilation, 1, nullptr,
                                                                                               nullptr, nullptr, true, bias_F32,
@@ -182,7 +154,7 @@ infiniStatus_t conv_kernel(
                     (kunlunStream_t)stream,
                     [&](xdnnHandle_t handle) {
                         CHECK_KUNLUN((xdnn::conv2d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                              (int64_t)info.input_dim(1), (int64_t)info.kernel_dim(0), ksize,
+                                                                                              (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize,
                                                                                               stride, pad,
                                                                                               dilation, 1, nullptr,
                                                                                               nullptr, nullptr, true, nullptr,
@@ -198,7 +170,7 @@ infiniStatus_t conv_kernel(
                 (kunlunStream_t)stream,
                 [&](xdnnHandle_t handle) {
                     CHECK_KUNLUN((xdnn::conv2d_fusion<float, float, float, int16_t>(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                    (int64_t)info.input_dim(1), (int64_t)info.kernel_dim(0), ksize,
+                                                                                    (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize,
                                                                                     stride, pad,
                                                                                     dilation, 1, nullptr,
                                                                                     nullptr, nullptr, true, (float *)bias,
@@ -217,12 +189,6 @@ infiniStatus_t conv_kernel(
         std::vector<int64_t> pad = {(int64_t)info.pad_info(0), (int64_t)info.pad_info(1), (int64_t)info.pad_info(2)};
         std::vector<int64_t> dilation = {(int64_t)info.dilation_info(0), (int64_t)info.dilation_info(1), (int64_t)info.dilation_info(2)};
 
-        printf("x_shape:(%ld, %ld, %ld, %ld, %ld)\n", info.batch(), info.in_channels(), info.input_dim(0), info.input_dim(1), info.input_dim(2));
-        printf("kernel_dim:(%ld, %ld, %ld)\n", ksize[0], ksize[1], ksize[2]);
-        printf("stride:(%ld, %ld, %ld)\n", stride[0], stride[1], stride[2]);
-        printf("pad:(%ld, %ld, %ld)\n", pad[0], pad[1], pad[2]);
-        printf("dilation:(%ld, %ld, %ld)\n", dilation[0], dilation[1], dilation[2]);
-        std::cout << "ndim: " << info.ndim() << " bias_size: " << bias_size << std::endl;
         if (dtype == INFINI_DTYPE_F16) {
             if (bias_size > 0) {
                 CHECK_STATUS(internal->useXdnn(
@@ -230,7 +196,7 @@ infiniStatus_t conv_kernel(
                     [&](xdnnHandle_t handle) {
                         CHECK_KUNLUN((xdnn::cast<float16, float>(handle, (float16 *)bias, bias_F32, bias_size)));
                         CHECK_KUNLUN((xdnn::conv3d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                              (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.kernel_dim(0), ksize,
+                                                                                              (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize,
                                                                                               stride, pad,
                                                                                               dilation, 1, nullptr,
                                                                                               nullptr, nullptr, true, bias_F32,
@@ -243,7 +209,7 @@ infiniStatus_t conv_kernel(
                     (kunlunStream_t)stream,
                     [&](xdnnHandle_t handle) {
                         CHECK_KUNLUN((xdnn::conv3d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                              (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.kernel_dim(0), ksize,
+                                                                                              (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize,
                                                                                               stride, pad,
                                                                                               dilation, 1, nullptr,
                                                                                               nullptr, nullptr, true, nullptr,
@@ -258,7 +224,7 @@ infiniStatus_t conv_kernel(
                 (kunlunStream_t)stream,
                 [&](xdnnHandle_t handle) {
                     CHECK_KUNLUN((xdnn::conv3d_fusion<float, float, float, int16_t>(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                    (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.kernel_dim(0), ksize,
+                                                                                    (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize,
                                                                                     stride, pad,
                                                                                     dilation, 1, nullptr,
                                                                                     nullptr, nullptr, true, (float *)bias,
diff --git a/test/infiniop/conv.py b/test/infiniop/conv.py
index f30d312cd..6cb99da9f 100644
--- a/test/infiniop/conv.py
+++ b/test/infiniop/conv.py
@@ -49,15 +49,6 @@
         (1, 2),
         (2, 1),
     ),
-    (
-        (1, 3, 32, 32),
-        (32 * 32 * 3, 32 * 32, 32, 1),
-        (2, 3, 5, 5),
-        (75, 25, 5, 1),
-        (2, 2),
-        (2, 2),
-        (1, 1),
-    ),
     (
         (32, 3, 32, 32),
         (32 * 32 * 3, 32 * 32, 32, 1),
@@ -105,27 +96,27 @@
 
 
 def conv(x, w, stride, padding, dilation, y_tensor, bias=None):
-    dim = len(x.shape) - 2
-    if dim == 1:
-        y_tensor.copy_(
-            F.conv1d(
-                x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
+    match len(x.shape) - 2:
+        case 1:
+            y_tensor.copy_(
+                F.conv1d(
+                    x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
+                )
             )
-        )
-    elif dim == 2:
-        y_tensor.copy_(
-            F.conv2d(
-                x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
+        case 2:
+            y_tensor.copy_(
+                F.conv2d(
+                    x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
+                )
             )
-        )
-    elif dim == 3:
-        y_tensor.copy_(
-            F.conv3d(
-                x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
+        case 3:
+            y_tensor.copy_(
+                F.conv3d(
+                    x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
+                )
             )
-        )
-    else:
-        print("Error: Pytorch -> Unsupported tensor dimension")
+        case _:
+            print("Error: Pytorch -> Unsupported tensor dimension")
 
 
 # infer the shape of the output given the inputs for a N-ary convolution

From 270207193854e49bb8e5992163a53c97d7d13cca Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Thu, 21 Aug 2025 03:56:59 +0000
Subject: [PATCH 4/6] issue/360: optimize code (template conv_kernel on Tdata)

---
 src/infiniop/ops/conv/kunlun/conv_kunlun.cc | 251 +++++++-------------
 1 file changed, 92 insertions(+), 159 deletions(-)

diff --git a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
index b08e70dda..f7a1ce644 100644
--- a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
+++ b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
@@ -45,10 +45,10 @@ infiniStatus_t Descriptor::create(
     return INFINI_STATUS_SUCCESS;
 }
 
+template <typename Tdata>
 infiniStatus_t conv_kernel(
     std::shared_ptr<device::kunlun::Handle::Internal> internal,
     const ConvInfo &info,
-    infiniDtype_t dtype,
     void *workspace,
     size_t workspace_size,
     void *y,
@@ -74,114 +74,58 @@ infiniStatus_t conv_kernel(
         std::initializer_list<int64_t> pad = {(int64_t)info.pad_info(0)};
         int64_t dilation = (int64_t)info.dilation_info(0);
 
-        if (dtype == INFINI_DTYPE_F16) {
-
-            if (bias_size > 0) {
-                CHECK_STATUS(internal->useXdnn(
-                    (kunlunStream_t)stream,
-                    [&](xdnnHandle_t handle) {
-                        CHECK_KUNLUN((xdnn::cast<float16, float>(handle, (float16 *)bias, bias_F32, bias_size)));
-                        CHECK_KUNLUN((xdnn::conv1d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                              (int64_t)info.out_channels(), ksize,
-                                                                                              stride, pad,
-                                                                                              dilation, 1, nullptr,
-                                                                                              nullptr, nullptr, true, bias_F32,
-                                                                                              nullptr, baidu::xpu::api::Activation_t::LINEAR,
-                                                                                              nullptr)));
-                        return INFINI_STATUS_SUCCESS;
-                    }));
-            } else {
-                CHECK_STATUS(internal->useXdnn(
-                    (kunlunStream_t)stream,
-                    [&](xdnnHandle_t handle) {
-                        CHECK_KUNLUN((xdnn::conv1d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                              (int64_t)info.out_channels(), ksize,
-                                                                                              stride, pad,
-                                                                                              dilation, 1, nullptr,
-                                                                                              nullptr, nullptr, true, nullptr,
-                                                                                              nullptr, baidu::xpu::api::Activation_t::LINEAR,
-                                                                                              nullptr)));
-                        return INFINI_STATUS_SUCCESS;
-                    }));
-            }
-            return INFINI_STATUS_SUCCESS;
-
-        } else if (dtype == INFINI_DTYPE_F32) {
-            CHECK_STATUS(internal->useXdnn(
-                (kunlunStream_t)stream,
-                [&](xdnnHandle_t handle) {
-                    CHECK_KUNLUN((xdnn::conv1d_fusion<float, float, float, int16_t>(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                    (int64_t)info.out_channels(), ksize,
-                                                                                    stride, pad,
-                                                                                    dilation, 1, nullptr,
-                                                                                    nullptr, nullptr, true, (float *)bias,
-                                                                                    nullptr, baidu::xpu::api::Activation_t::LINEAR,
-                                                                                    nullptr)));
-                    return INFINI_STATUS_SUCCESS;
-                }));
-        } else {
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;
-        }
-        break;
+        CHECK_STATUS(internal->useXdnn(
+            (kunlunStream_t)stream,
+            [&](xdnnHandle_t handle) {
+                if (bias_size > 0) {
+                    if constexpr (std::is_same<Tdata, float16>::value) {
+                        CHECK_KUNLUN((xdnn::cast<Tdata, float>(handle, (Tdata *)bias, bias_F32, bias_size)));
+                    } else if constexpr (std::is_same<Tdata, float>::value) {
+                        bias_F32 = (float *)bias;
+                    }
+                } else {
+                    bias_F32 = nullptr;
+                }
+                CHECK_KUNLUN((xdnn::conv1d_fusion<Tdata, Tdata, Tdata, int16_t>(handle, (Tdata *)x, (Tdata *)w, (Tdata *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
+                                                                                (int64_t)info.out_channels(), ksize,
+                                                                                stride, pad,
+                                                                                dilation, 1, nullptr,
+                                                                                nullptr, nullptr, true, bias_F32,
+                                                                                nullptr, baidu::xpu::api::Activation_t::LINEAR,
+                                                                                nullptr)));
+                return INFINI_STATUS_SUCCESS;
+            }));
+        return INFINI_STATUS_SUCCESS;
     }
     case 2: {
         std::vector<int64_t> ksize = {(int64_t)info.kernel_dim(0), (int64_t)info.kernel_dim(1)};
         std::vector<int64_t> stride = {(int64_t)info.stride_info(0), (int64_t)info.stride_info(1)};
         std::vector<int64_t> pad = {
             (int64_t)info.pad_info(0),
-            (int64_t)info.pad_info(0),
-            (int64_t)info.pad_info(1),
             (int64_t)info.pad_info(1)};
         std::vector<int64_t> dilation = {(int64_t)info.dilation_info(0), (int64_t)info.dilation_info(1)};
-
-        if (dtype == INFINI_DTYPE_F16) {
-            if (bias_size > 0) {
-                CHECK_STATUS(internal->useXdnn(
-                    (kunlunStream_t)stream,
-                    [&](xdnnHandle_t handle) {
-                        CHECK_KUNLUN((xdnn::cast<float16, float>(handle, (float16 *)bias, bias_F32, bias_size)));
-                        CHECK_KUNLUN((xdnn::conv2d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                              (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize,
-                                                                                              stride, pad,
-                                                                                              dilation, 1, nullptr,
-                                                                                              nullptr, nullptr, true, bias_F32,
-                                                                                              nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr,
-                                                                                              nullptr, -1)));
-                        return INFINI_STATUS_SUCCESS;
-                    }));
-            } else {
-                CHECK_STATUS(internal->useXdnn(
-                    (kunlunStream_t)stream,
-                    [&](xdnnHandle_t handle) {
-                        CHECK_KUNLUN((xdnn::conv2d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                              (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize,
-                                                                                              stride, pad,
-                                                                                              dilation, 1, nullptr,
-                                                                                              nullptr, nullptr, true, nullptr,
-                                                                                              nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr,
-                                                                                              nullptr, -1)));
-                        return INFINI_STATUS_SUCCESS;
-                    }));
-            }
-            return INFINI_STATUS_SUCCESS;
-
-        } else if (dtype == INFINI_DTYPE_F32) {
-            CHECK_STATUS(internal->useXdnn(
-                (kunlunStream_t)stream,
-                [&](xdnnHandle_t handle) {
-                    CHECK_KUNLUN((xdnn::conv2d_fusion<float, float, float, int16_t>(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                    (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize,
-                                                                                    stride, pad,
-                                                                                    dilation, 1, nullptr,
-                                                                                    nullptr, nullptr, true, (float *)bias,
-                                                                                    nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr,
-                                                                                    nullptr, -1)));
-                    return INFINI_STATUS_SUCCESS;
-                }));
-        } else {
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;
-        }
-        break;
+        CHECK_STATUS(internal->useXdnn(
+            (kunlunStream_t)stream,
+            [&](xdnnHandle_t handle) {
+                if (bias_size > 0) {
+                    if constexpr (std::is_same<Tdata, float16>::value) {
+                        CHECK_KUNLUN((xdnn::cast<Tdata, float>(handle, (Tdata *)bias, bias_F32, bias_size)));
+                    } else if constexpr (std::is_same<Tdata, float>::value) {
+                        bias_F32 = (float *)bias;
+                    }
+                } else {
+                    bias_F32 = nullptr;
+                }
+                CHECK_KUNLUN((xdnn::conv2d_fusion<Tdata, Tdata, Tdata, int16_t>(handle, (Tdata *)x, (Tdata *)w, (Tdata *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
+                                                                                (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize,
+                                                                                stride, pad,
+                                                                                dilation, 1, nullptr,
+                                                                                nullptr, nullptr, true, bias_F32,
+                                                                                nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr,
+                                                                                nullptr, -1)));
+                return INFINI_STATUS_SUCCESS;
+            }));
+        return INFINI_STATUS_SUCCESS;
     }
     case 3: {
         std::vector<int64_t> ksize = {(int64_t)info.kernel_dim(0), (int64_t)info.kernel_dim(1), (int64_t)info.kernel_dim(2)};
@@ -189,53 +133,28 @@ infiniStatus_t conv_kernel(
         std::vector<int64_t> pad = {(int64_t)info.pad_info(0), (int64_t)info.pad_info(1), (int64_t)info.pad_info(2)};
         std::vector<int64_t> dilation = {(int64_t)info.dilation_info(0), (int64_t)info.dilation_info(1), (int64_t)info.dilation_info(2)};
 
-        if (dtype == INFINI_DTYPE_F16) {
-            if (bias_size > 0) {
-                CHECK_STATUS(internal->useXdnn(
-                    (kunlunStream_t)stream,
-                    [&](xdnnHandle_t handle) {
-                        CHECK_KUNLUN((xdnn::cast<float16, float>(handle, (float16 *)bias, bias_F32, bias_size)));
-                        CHECK_KUNLUN((xdnn::conv3d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                              (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize,
-                                                                                              stride, pad,
-                                                                                              dilation, 1, nullptr,
-                                                                                              nullptr, nullptr, true, bias_F32,
-                                                                                              nullptr, baidu::xpu::api::Activation_t::LINEAR,
-                                                                                              nullptr)));
-                        return INFINI_STATUS_SUCCESS;
-                    }));
-            } else {
-                CHECK_STATUS(internal->useXdnn(
-                    (kunlunStream_t)stream,
-                    [&](xdnnHandle_t handle) {
-                        CHECK_KUNLUN((xdnn::conv3d_fusion<float16, float16, float16, int16_t>(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                              (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize,
-                                                                                              stride, pad,
-                                                                                              dilation, 1, nullptr,
-                                                                                              nullptr, nullptr, true, nullptr,
-                                                                                              nullptr, baidu::xpu::api::Activation_t::LINEAR,
-                                                                                              nullptr)));
-                        return INFINI_STATUS_SUCCESS;
-                    }));
-            }
-            return INFINI_STATUS_SUCCESS;
-        } else if (dtype == INFINI_DTYPE_F32) {
-            CHECK_STATUS(internal->useXdnn(
-                (kunlunStream_t)stream,
-                [&](xdnnHandle_t handle) {
-                    CHECK_KUNLUN((xdnn::conv3d_fusion<float, float, float, int16_t>(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
-                                                                                    (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize,
-                                                                                    stride, pad,
-                                                                                    dilation, 1, nullptr,
-                                                                                    nullptr, nullptr, true, (float *)bias,
-                                                                                    nullptr, baidu::xpu::api::Activation_t::LINEAR,
-                                                                                    nullptr)));
-                    return INFINI_STATUS_SUCCESS;
-                }));
-        } else {
-            return INFINI_STATUS_BAD_TENSOR_DTYPE;
-        }
-        break;
+        CHECK_STATUS(internal->useXdnn(
+            (kunlunStream_t)stream,
+            [&](xdnnHandle_t handle) {
+                if (bias_size > 0) {
+                    if constexpr (std::is_same<Tdata, float16>::value) {
+                        CHECK_KUNLUN((xdnn::cast<Tdata, float>(handle, (Tdata *)bias, bias_F32, bias_size)));
+                    } else if constexpr (std::is_same<Tdata, float>::value) {
+                        bias_F32 = (float *)bias;
+                    }
+                } else {
+                    bias_F32 = nullptr;
+                }
+                CHECK_KUNLUN((xdnn::conv3d_fusion<Tdata, Tdata, Tdata, int16_t>(handle, (Tdata *)x, (Tdata *)w, (Tdata *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
+                                                                                (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize,
+                                                                                stride, pad,
+                                                                                dilation, 1, nullptr,
+                                                                                nullptr, nullptr, true, bias_F32,
+                                                                                nullptr, baidu::xpu::api::Activation_t::LINEAR,
+                                                                                nullptr)));
+                return INFINI_STATUS_SUCCESS;
+            }));
+        return INFINI_STATUS_SUCCESS;
     }
     default:
         return INFINI_STATUS_BAD_TENSOR_SHAPE;
@@ -254,17 +173,31 @@ infiniStatus_t Descriptor::calculate(
     if (workspace_size < _workspace_size) {
         return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
     }
-    CHECK_STATUS(conv_kernel(
-        _opaque->internal,
-        _info,
-        _dtype,
-        workspace,
-        workspace_size,
-        y,
-        x,
-        w,
-        bias,
-        stream));
+    if (_dtype == INFINI_DTYPE_F16) {
+        CHECK_STATUS(conv_kernel<float16>(
+            _opaque->internal,
+            _info,
+            workspace,
+            workspace_size,
+            y,
+            x,
+            w,
+            bias,
+            stream));
+    } else if (_dtype == INFINI_DTYPE_F32) {
+        CHECK_STATUS(conv_kernel<float>(
+            _opaque->internal,
+            _info,
+            workspace,
+            workspace_size,
+            y,
+            x,
+            w,
+            bias,
+            stream));
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
 
     return INFINI_STATUS_SUCCESS;
 }

From 43b05f29140d5656801c20e53b39089adf709f9f Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Thu, 21 Aug 2025 04:39:02 +0000
Subject: [PATCH 5/6] issue/360: optimize kunlun code (hoist bias conversion out of the ndim switch)

---
 src/infiniop/ops/conv/kunlun/conv_kunlun.cc | 41 +++++++--------------
 1 file changed, 14 insertions(+), 27 deletions(-)

diff --git a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
index f7a1ce644..f8375f0af 100644
--- a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
+++ b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
@@ -67,6 +67,20 @@ infiniStatus_t conv_kernel(
         bias_size = 0;
     }
     float *bias_F32 = (float *)workspace_value;
+    CHECK_STATUS(internal->useXdnn(
+        (kunlunStream_t)stream,
+        [&](xdnnHandle_t handle) {
+            if (bias_size > 0) {
+                if constexpr (std::is_same<Tdata, float16>::value) {
+                    CHECK_KUNLUN((xdnn::cast<Tdata, float>(handle, (Tdata *)bias, bias_F32, bias_size)));
+                } else if constexpr (std::is_same<Tdata, float>::value) {
+                    bias_F32 = (float *)bias;
+                }
+            } else {
+                bias_F32 = nullptr;
+            }
+            return INFINI_STATUS_SUCCESS;
+        }));
     switch (info.ndim()) {
     case 1: {
         int64_t ksize = (int64_t)info.kernel_dim(0);
@@ -77,15 +91,6 @@ infiniStatus_t conv_kernel(
         CHECK_STATUS(internal->useXdnn(
             (kunlunStream_t)stream,
             [&](xdnnHandle_t handle) {
-                if (bias_size > 0) {
-                    if constexpr (std::is_same<Tdata, float16>::value) {
-                        CHECK_KUNLUN((xdnn::cast<Tdata, float>(handle, (Tdata *)bias, bias_F32, bias_size)));
-                    } else if constexpr (std::is_same<Tdata, float>::value) {
-                        bias_F32 = (float *)bias;
-                    }
-                } else {
-                    bias_F32 = nullptr;
-                }
                 CHECK_KUNLUN((xdnn::conv1d_fusion<Tdata, Tdata, Tdata, int16_t>(handle, (Tdata *)x, (Tdata *)w, (Tdata *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
                                                                                 (int64_t)info.out_channels(), ksize,
                                                                                 stride, pad,
@@ -107,15 +112,6 @@ infiniStatus_t conv_kernel(
         CHECK_STATUS(internal->useXdnn(
             (kunlunStream_t)stream,
             [&](xdnnHandle_t handle) {
-                if (bias_size > 0) {
-                    if constexpr (std::is_same<Tdata, float16>::value) {
-                        CHECK_KUNLUN((xdnn::cast<Tdata, float>(handle, (Tdata *)bias, bias_F32, bias_size)));
-                    } else if constexpr (std::is_same<Tdata, float>::value) {
-                        bias_F32 = (float *)bias;
-                    }
-                } else {
-                    bias_F32 = nullptr;
-                }
                 CHECK_KUNLUN((xdnn::conv2d_fusion<Tdata, Tdata, Tdata, int16_t>(handle, (Tdata *)x, (Tdata *)w, (Tdata *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
                                                                                 (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize,
                                                                                 stride, pad,
@@ -136,15 +132,6 @@ infiniStatus_t conv_kernel(
         CHECK_STATUS(internal->useXdnn(
             (kunlunStream_t)stream,
             [&](xdnnHandle_t handle) {
-                if (bias_size > 0) {
-                    if constexpr (std::is_same<Tdata, float16>::value) {
-                        CHECK_KUNLUN((xdnn::cast<Tdata, float>(handle, (Tdata *)bias, bias_F32, bias_size)));
-                    } else if constexpr (std::is_same<Tdata, float>::value) {
-                        bias_F32 = (float *)bias;
-                    }
-                } else {
-                    bias_F32 = nullptr;
-                }
                 CHECK_KUNLUN((xdnn::conv3d_fusion<Tdata, Tdata, Tdata, int16_t>(handle, (Tdata *)x, (Tdata *)w, (Tdata *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0),
                                                                                 (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize,
                                                                                 stride, pad,

From ed47cde073683a7e5195154886ebe061e1c8b7ff Mon Sep 17 00:00:00 2001
From: xgqdut2016 <kenan_gewei@163.com>
Date: Tue, 2 Sep 2025 07:17:23 +0000
Subject: [PATCH 6/6] issue/360: move xpu_act_type.h include to kunlun_common.h, use xdnn::Activation_t

---
 src/infiniop/devices/kunlun/kunlun_common.h | 1 +
 src/infiniop/ops/conv/kunlun/conv_kunlun.cc | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/infiniop/devices/kunlun/kunlun_common.h b/src/infiniop/devices/kunlun/kunlun_common.h
index 3e1bd8d1d..a1c0e0ae4 100644
--- a/src/infiniop/devices/kunlun/kunlun_common.h
+++ b/src/infiniop/devices/kunlun/kunlun_common.h
@@ -2,6 +2,7 @@
 #define __KUNLUN_COMMON_H__
 
 #include "../../../utils.h"
+#include <xpu/refactor/context/xpu_act_type.h>
 #include <xpu/runtime.h>
 #include <xpu/runtime_ex.h>
 #include <xpu/xdnn.h>
diff --git a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
index f8375f0af..577e5f990 100644
--- a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
+++ b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc
@@ -2,7 +2,7 @@
 #include "../../../../utils.h"
 #include "../../../devices/kunlun/kunlun_common.h"
 #include "../../../devices/kunlun/kunlun_handle.h"
-#include <xpu/refactor/context/xpu_act_type.h>
+
 namespace op::conv::kunlun {
 
 struct Descriptor::Opaque {
@@ -96,7 +96,7 @@ infiniStatus_t conv_kernel(
                                                                                 stride, pad,
                                                                                 dilation, 1, nullptr,
                                                                                 nullptr, nullptr, true, bias_F32,
-                                                                                nullptr, baidu::xpu::api::Activation_t::LINEAR,
+                                                                                nullptr, xdnn::Activation_t::LINEAR,
                                                                                 nullptr)));
                 return INFINI_STATUS_SUCCESS;
             }));
@@ -117,7 +117,7 @@ infiniStatus_t conv_kernel(
                                                                                 stride, pad,
                                                                                 dilation, 1, nullptr,
                                                                                 nullptr, nullptr, true, bias_F32,
-                                                                                nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr,
+                                                                                nullptr, xdnn::Activation_t::LINEAR, nullptr,
                                                                                 nullptr, -1)));
                 return INFINI_STATUS_SUCCESS;
             }));
@@ -137,7 +137,7 @@ infiniStatus_t conv_kernel(
                                                                                 stride, pad,
                                                                                 dilation, 1, nullptr,
                                                                                 nullptr, nullptr, true, bias_F32,
-                                                                                nullptr, baidu::xpu::api::Activation_t::LINEAR,
+                                                                                nullptr, xdnn::Activation_t::LINEAR,
                                                                                 nullptr)));
                 return INFINI_STATUS_SUCCESS;
             }));