From 772bb890fc9faf5f6359fe124d53e10988fbba75 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Fri, 15 Aug 2025 08:41:14 +0000 Subject: [PATCH 1/6] issue/360: kunlun conv failed --- src/infiniop/ops/conv/kunlun/conv_kunlun.cc | 298 ++++++++++++++++++++ src/infiniop/ops/conv/kunlun/conv_kunlun.h | 8 + src/infiniop/ops/conv/operator.cc | 15 + 3 files changed, 321 insertions(+) create mode 100644 src/infiniop/ops/conv/kunlun/conv_kunlun.cc create mode 100644 src/infiniop/ops/conv/kunlun/conv_kunlun.h diff --git a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc new file mode 100644 index 000000000..2246ecc03 --- /dev/null +++ b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc @@ -0,0 +1,298 @@ +#include "conv_kunlun.h" +#include "../../../../utils.h" +#include "../../../devices/kunlun/kunlun_common.h" +#include "../../../devices/kunlun/kunlun_handle.h" +#include +namespace op::conv::kunlun { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + infiniopTensorDescriptor_t b_desc, + const void *pads, + const void *strides, + const void *dilations, + size_t n) { + auto handle = reinterpret_cast(handle_); + auto dtype = y_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = ConvInfo::create(handle_, y_desc, x_desc, w_desc, b_desc, + pads, strides, dilations, n); + + CHECK_RESULT(result); + auto conv_info = result.take(); + size_t min_workspace_size = conv_info.bias_dims_size() * sizeof(float); + *desc_ptr = new Descriptor( + dtype, + conv_info, + min_workspace_size, + new Opaque{handle->internal()}, + handle->device, + handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t conv_kernel( + std::shared_ptr internal, + const ConvInfo &info, + infiniDtype_t dtype, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *w, + const void *bias, + void *stream) { + char *workspace_value = reinterpret_cast(workspace); + int64_t bias_ndims = info.bias_dims_size(); + int64_t bias_size = 1; + for (int64_t i = 0; i < bias_ndims; i++) { + bias_size *= info.bias_dim(i); + } + float *bias_F32 = (float *)workspace_value; + switch (info.ndim()) { + case 1: { + int64_t ksize = (int64_t)info.kernel_dim(0); + int64_t stride = (int64_t)info.stride_info(0); + std::initializer_list pad = {(int64_t)info.pad_info(0)}; + int64_t dilation = (int64_t)info.dilation_info(0); + printf("x_shape:(%ld, %ld, %ld)\n", info.batch(), info.in_channels(), info.input_dim(0)); + printf("kernel_dim:(%ld)\n", ksize); + printf("stride:(%ld)\n", stride); + printf("pad:(%ld)\n", (int64_t)info.pad_info(0)); + printf("dilation:(%ld)\n", dilation); + std::cout << "ndim: " << info.ndim() << " bias_size: " << bias_size << std::endl; + if (dtype == INFINI_DTYPE_F16) { + // float16 *host_x, *host_w, *host_bias; + // host_x = (float16 *)malloc((int)info.batch() * (int)info.in_channels() * (int)info.input_dim(0) * sizeof(float16)); + // host_w = (float16 *)malloc((int)bias_size * (int)info.in_channels() * (int)info.kernel_dim(0) * sizeof(float16)); + // host_bias = (float16 *)malloc((int)bias_size * sizeof(float16)); + // xpu_memcpy(host_x, x, (int)info.batch() * (int)info.in_channels() * (int)info.input_dim(0) * sizeof(float16), XPU_DEVICE_TO_HOST); + // xpu_memcpy(host_w, w, (int)bias_size * (int)info.in_channels() * (int)info.kernel_dim(0) * sizeof(float16), XPU_DEVICE_TO_HOST); + // xpu_memcpy(host_bias, bias, (int)bias_size * sizeof(float16), XPU_DEVICE_TO_HOST); + // for (int i = 0; i < (int)info.batch() * (int)info.in_channels() * (int)info.input_dim(0); i++) { + // printf("%.4f ", static_cast(host_x[i])); + // } + // printf("\n"); + // for (int i = 0; i < (int)bias_size * (int)info.in_channels() * (int)info.kernel_dim(0); i++) { + // printf("%.4f ", static_cast(host_w[i])); + // } + // printf("\n"); + // for (int i = 0; i < (int)bias_size; i++) { + // printf("%.4f ", static_cast(host_bias[i])); + // } + // printf("\n"); + if (bias_size > 0) { + CHECK_STATUS(internal->useXdnn( + (kunlunStream_t)stream, + [&](xdnnHandle_t handle) { + CHECK_KUNLUN((xdnn::cast(handle, (float16 *)bias, bias_F32, bias_size))); + CHECK_KUNLUN((xdnn::conv1d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), + (int64_t)info.kernel_dim(0), ksize, + stride, pad, + dilation, 1, nullptr, + nullptr, nullptr, true, bias_F32, + nullptr, baidu::xpu::api::Activation_t::LINEAR, + nullptr))); + return INFINI_STATUS_SUCCESS; + })); + } else { + CHECK_STATUS(internal->useXdnn( + (kunlunStream_t)stream, + [&](xdnnHandle_t handle) { + CHECK_KUNLUN((xdnn::conv1d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), + (int64_t)info.kernel_dim(0), ksize, + stride, pad, + dilation, 1, nullptr, + nullptr, nullptr, true, nullptr, + nullptr, baidu::xpu::api::Activation_t::LINEAR, + nullptr))); + return INFINI_STATUS_SUCCESS; + })); + } + return INFINI_STATUS_SUCCESS; + + } else if (dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(internal->useXdnn( + (kunlunStream_t)stream, + [&](xdnnHandle_t handle) { + CHECK_KUNLUN((xdnn::conv1d_fusion(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), + (int64_t)info.kernel_dim(0), ksize, + stride, pad, + dilation, 1, nullptr, + nullptr, nullptr, true, (float *)bias, + nullptr, baidu::xpu::api::Activation_t::LINEAR, + nullptr))); + return INFINI_STATUS_SUCCESS; + })); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + } + case 2: { + std::vector ksize = {(int64_t)info.kernel_dim(0), (int64_t)info.kernel_dim(1)}; + std::vector stride = {(int64_t)info.stride_info(0), (int64_t)info.stride_info(1)}; + std::vector pad = {(int64_t)info.pad_info(0), (int64_t)info.pad_info(1)}; + std::vector dilation = {(int64_t)info.dilation_info(0), (int64_t)info.dilation_info(1)}; + printf("x_shape:(%ld, %ld, %ld, %ld)\n", info.batch(), info.in_channels(), info.input_dim(0), info.input_dim(1)); + printf("kernel_dim:(%ld, %ld)\n", ksize[0], ksize[1]); + printf("stride:(%ld, %ld)\n", stride[0], stride[1]); + printf("pad:(%ld, %ld)\n", pad[0], pad[1]); + printf("dilation:(%ld, %ld)\n", dilation[0], dilation[1]); + std::cout << "ndim: " << info.ndim() << " bias_size: " << bias_size << std::endl; + if (dtype == INFINI_DTYPE_F16) { + if (bias_size > 0) { + CHECK_STATUS(internal->useXdnn( + (kunlunStream_t)stream, + [&](xdnnHandle_t handle) { + CHECK_KUNLUN((xdnn::cast(handle, (float16 *)bias, bias_F32, bias_size))); + CHECK_KUNLUN((xdnn::conv2d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), + (int64_t)info.input_dim(1), (int64_t)info.kernel_dim(0), ksize, + stride, pad, + dilation, 1, nullptr, + nullptr, nullptr, true, bias_F32, + nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr, + nullptr, -1))); + return INFINI_STATUS_SUCCESS; + })); + } else { + CHECK_STATUS(internal->useXdnn( + (kunlunStream_t)stream, + [&](xdnnHandle_t handle) { + CHECK_KUNLUN((xdnn::conv2d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), + (int64_t)info.input_dim(1), (int64_t)info.kernel_dim(0), ksize, + stride, pad, + dilation, 1, nullptr, + nullptr, nullptr, true, nullptr, + nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr, + nullptr, -1))); + return INFINI_STATUS_SUCCESS; + })); + } + return INFINI_STATUS_SUCCESS; + + } else if (dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(internal->useXdnn( + (kunlunStream_t)stream, + [&](xdnnHandle_t handle) { + CHECK_KUNLUN((xdnn::conv2d_fusion(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), + (int64_t)info.input_dim(1), (int64_t)info.kernel_dim(0), ksize, + stride, pad, + dilation, 1, nullptr, + nullptr, nullptr, true, (float *)bias, + nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr, + nullptr, -1))); + return INFINI_STATUS_SUCCESS; + })); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + } + case 3: { + std::vector ksize = {(int64_t)info.kernel_dim(0), (int64_t)info.kernel_dim(1), (int64_t)info.kernel_dim(2)}; + std::vector stride = {(int64_t)info.stride_info(0), (int64_t)info.stride_info(1), (int64_t)info.stride_info(2)}; + std::vector pad = {(int64_t)info.pad_info(0), (int64_t)info.pad_info(1), (int64_t)info.pad_info(2)}; + std::vector dilation = {(int64_t)info.dilation_info(0), (int64_t)info.dilation_info(1), (int64_t)info.dilation_info(2)}; + + printf("x_shape:(%ld, %ld, %ld, %ld, %ld)\n", info.batch(), info.in_channels(), info.input_dim(0), info.input_dim(1), info.input_dim(2)); + printf("kernel_dim:(%ld, %ld, %ld)\n", ksize[0], ksize[1], ksize[2]); + printf("stride:(%ld, %ld, %ld)\n", stride[0], stride[1], stride[2]); + printf("pad:(%ld, %ld, %ld)\n", pad[0], pad[1], pad[2]); + printf("dilation:(%ld, %ld, %ld)\n", dilation[0], dilation[1], dilation[2]); + std::cout << "ndim: " << info.ndim() << " bias_size: " << bias_size << std::endl; + if (dtype == INFINI_DTYPE_F16) { + if (bias_size > 0) { + CHECK_STATUS(internal->useXdnn( + (kunlunStream_t)stream, + [&](xdnnHandle_t handle) { + CHECK_KUNLUN((xdnn::cast(handle, (float16 *)bias, bias_F32, bias_size))); + CHECK_KUNLUN((xdnn::conv3d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), + (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.kernel_dim(0), ksize, + stride, pad, + dilation, 1, nullptr, + nullptr, nullptr, true, bias_F32, + nullptr, baidu::xpu::api::Activation_t::LINEAR, + nullptr))); + return INFINI_STATUS_SUCCESS; + })); + } else { + CHECK_STATUS(internal->useXdnn( + (kunlunStream_t)stream, + [&](xdnnHandle_t handle) { + CHECK_KUNLUN((xdnn::conv3d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), + (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.kernel_dim(0), ksize, + stride, pad, + dilation, 1, nullptr, + nullptr, nullptr, true, nullptr, + nullptr, baidu::xpu::api::Activation_t::LINEAR, + nullptr))); + return INFINI_STATUS_SUCCESS; + })); + } + return INFINI_STATUS_SUCCESS; + } else if (dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(internal->useXdnn( + (kunlunStream_t)stream, + [&](xdnnHandle_t handle) { + CHECK_KUNLUN((xdnn::conv3d_fusion(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), + (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.kernel_dim(0), ksize, + stride, pad, + dilation, 1, nullptr, + nullptr, nullptr, true, (float *)bias, + nullptr, baidu::xpu::api::Activation_t::LINEAR, + nullptr))); + return INFINI_STATUS_SUCCESS; + })); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *w, + const void *bias, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + CHECK_STATUS(conv_kernel( + _opaque->internal, + _info, + _dtype, + workspace, + workspace_size, + y, + x, + w, + bias, + stream)); + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::conv::kunlun diff --git a/src/infiniop/ops/conv/kunlun/conv_kunlun.h b/src/infiniop/ops/conv/kunlun/conv_kunlun.h new file mode 100644 index 000000000..bd21d2168 --- /dev/null +++ b/src/infiniop/ops/conv/kunlun/conv_kunlun.h @@ -0,0 +1,8 @@ +#ifndef __CONV_KUNLUN_H__ +#define __CONV_KUNLUN_H__ + +#include "../conv.h" + +DESCRIPTOR(kunlun) + +#endif // __CONV_KUNLUN_H__ diff --git a/src/infiniop/ops/conv/operator.cc b/src/infiniop/ops/conv/operator.cc index df033f44f..9e93ea230 100644 --- a/src/infiniop/ops/conv/operator.cc +++ b/src/infiniop/ops/conv/operator.cc @@ -8,6 +8,9 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) #include "nvidia/conv_nvidia.cuh" #endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/conv_kunlun.h" +#endif __C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle, infiniopConvDescriptor_t *desc_ptr, @@ -42,6 +45,9 @@ __C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle #ifdef ENABLE_ILUVATAR_API CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -70,6 +76,9 @@ infiniopGetConvWorkspaceSize( #ifdef ENABLE_ILUVATAR_API GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -106,6 +115,9 @@ __C infiniStatus_t infiniopConv( #ifdef ENABLE_ILUVATAR_API CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -130,6 +142,9 @@ infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) { #ifdef ENABLE_ILUVATAR_API DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; From 858513daabb923ea6e2ad821833727c8ed440737 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Mon, 18 Aug 2025 05:42:31 +0000 Subject: [PATCH 2/6] issue/360: modified bias_size --- src/infiniop/ops/conv/kunlun/conv_kunlun.cc | 14 +++++-- test/infiniop/conv.py | 45 ++++++++++++--------- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc index 2246ecc03..1c65e9289 100644 --- a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc +++ b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc @@ -59,8 +59,12 @@ infiniStatus_t conv_kernel( char *workspace_value = reinterpret_cast(workspace); int64_t bias_ndims = info.bias_dims_size(); int64_t bias_size = 1; - for (int64_t i = 0; i < bias_ndims; i++) { - bias_size *= info.bias_dim(i); + if (bias_ndims > 0) { + for (int64_t i = 0; i < bias_ndims; i++) { + bias_size *= info.bias_dim(i); + } + } else { + bias_size = 0; } float *bias_F32 = (float *)workspace_value; switch (info.ndim()) { @@ -146,7 +150,11 @@ infiniStatus_t conv_kernel( case 2: { std::vector ksize = {(int64_t)info.kernel_dim(0), (int64_t)info.kernel_dim(1)}; std::vector stride = {(int64_t)info.stride_info(0), (int64_t)info.stride_info(1)}; - std::vector pad = {(int64_t)info.pad_info(0), (int64_t)info.pad_info(1)}; + std::vector pad = { + (int64_t)info.pad_info(0), + (int64_t)info.pad_info(0), + (int64_t)info.pad_info(1), + (int64_t)info.pad_info(1)}; std::vector dilation = {(int64_t)info.dilation_info(0), (int64_t)info.dilation_info(1)}; printf("x_shape:(%ld, %ld, %ld, %ld)\n", info.batch(), info.in_channels(), info.input_dim(0), info.input_dim(1)); printf("kernel_dim:(%ld, %ld)\n", ksize[0], ksize[1]); diff --git a/test/infiniop/conv.py b/test/infiniop/conv.py index 6cb99da9f..f30d312cd 100644 --- a/test/infiniop/conv.py +++ b/test/infiniop/conv.py @@ -49,6 +49,15 @@ (1, 2), (2, 1), ), + ( + (1, 3, 32, 32), + (32 * 32 * 3, 32 * 32, 32, 1), + (2, 3, 5, 5), + (75, 25, 5, 1), + (2, 2), + (2, 2), + (1, 1), + ), ( (32, 3, 32, 32), (32 * 32 * 3, 32 * 32, 32, 1), @@ -96,27 +105,27 @@ def conv(x, w, stride, padding, dilation, y_tensor, bias=None): - match len(x.shape) - 2: - case 1: - y_tensor.copy_( - F.conv1d( - x, w, bias=bias, stride=stride, padding=padding, dilation=dilation - ) + dim = len(x.shape) - 2 + if dim == 1: + y_tensor.copy_( + F.conv1d( + x, w, bias=bias, stride=stride, padding=padding, dilation=dilation ) - case 2: - y_tensor.copy_( - F.conv2d( - x, w, bias=bias, stride=stride, padding=padding, dilation=dilation - ) + ) + elif dim == 2: + y_tensor.copy_( + F.conv2d( + x, w, bias=bias, stride=stride, padding=padding, dilation=dilation ) - case 3: - y_tensor.copy_( - F.conv3d( - x, w, bias=bias, stride=stride, padding=padding, dilation=dilation - ) + ) + elif dim == 3: + y_tensor.copy_( + F.conv3d( + x, w, bias=bias, stride=stride, padding=padding, dilation=dilation ) - case _: - print("Error: Pytorch -> Unsupported tensor dimension") + ) + else: + print("Error: Pytorch -> Unsupported tensor dimension") # infer the shape of the output given the inputs for a N-ary convolution From d260f2f221bd907c962a0ce9ec94af8b4baef7c9 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Thu, 21 Aug 2025 02:58:01 +0000 Subject: [PATCH 3/6] issue/360: success conv --- src/infiniop/ops/conv/kunlun/conv_kunlun.cc | 58 +++++---------------- test/infiniop/conv.py | 45 +++++++--------- 2 files changed, 30 insertions(+), 73 deletions(-) diff --git a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc index 1c65e9289..b08e70dda 100644 --- a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc +++ b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc @@ -73,39 +73,16 @@ infiniStatus_t conv_kernel( int64_t stride = (int64_t)info.stride_info(0); std::initializer_list pad = {(int64_t)info.pad_info(0)}; int64_t dilation = (int64_t)info.dilation_info(0); - printf("x_shape:(%ld, %ld, %ld)\n", info.batch(), info.in_channels(), info.input_dim(0)); - printf("kernel_dim:(%ld)\n", ksize); - printf("stride:(%ld)\n", stride); - printf("pad:(%ld)\n", (int64_t)info.pad_info(0)); - printf("dilation:(%ld)\n", dilation); - std::cout << "ndim: " << info.ndim() << " bias_size: " << bias_size << std::endl; + if (dtype == INFINI_DTYPE_F16) { - // float16 *host_x, *host_w, *host_bias; - // host_x = (float16 *)malloc((int)info.batch() * (int)info.in_channels() * (int)info.input_dim(0) * sizeof(float16)); - // host_w = (float16 *)malloc((int)bias_size * (int)info.in_channels() * (int)info.kernel_dim(0) * sizeof(float16)); - // host_bias = (float16 *)malloc((int)bias_size * sizeof(float16)); - // xpu_memcpy(host_x, x, (int)info.batch() * (int)info.in_channels() * (int)info.input_dim(0) * sizeof(float16), XPU_DEVICE_TO_HOST); - // xpu_memcpy(host_w, w, (int)bias_size * (int)info.in_channels() * (int)info.kernel_dim(0) * sizeof(float16), XPU_DEVICE_TO_HOST); - // xpu_memcpy(host_bias, bias, (int)bias_size * sizeof(float16), XPU_DEVICE_TO_HOST); - // for (int i = 0; i < (int)info.batch() * (int)info.in_channels() * (int)info.input_dim(0); i++) { - // printf("%.4f ", static_cast(host_x[i])); - // } - // printf("\n"); - // for (int i = 0; i < (int)bias_size * (int)info.in_channels() * (int)info.kernel_dim(0); i++) { - // printf("%.4f ", static_cast(host_w[i])); - // } - // printf("\n"); - // for (int i = 0; i < (int)bias_size; i++) { - // printf("%.4f ", static_cast(host_bias[i])); - // } - // printf("\n"); + if (bias_size > 0) { CHECK_STATUS(internal->useXdnn( (kunlunStream_t)stream, [&](xdnnHandle_t handle) { CHECK_KUNLUN((xdnn::cast(handle, (float16 *)bias, bias_F32, bias_size))); CHECK_KUNLUN((xdnn::conv1d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.kernel_dim(0), ksize, + (int64_t)info.out_channels(), ksize, stride, pad, dilation, 1, nullptr, nullptr, nullptr, true, bias_F32, @@ -118,7 +95,7 @@ infiniStatus_t conv_kernel( (kunlunStream_t)stream, [&](xdnnHandle_t handle) { CHECK_KUNLUN((xdnn::conv1d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.kernel_dim(0), ksize, + (int64_t)info.out_channels(), ksize, stride, pad, dilation, 1, nullptr, nullptr, nullptr, true, nullptr, @@ -134,7 +111,7 @@ infiniStatus_t conv_kernel( (kunlunStream_t)stream, [&](xdnnHandle_t handle) { CHECK_KUNLUN((xdnn::conv1d_fusion(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.kernel_dim(0), ksize, + (int64_t)info.out_channels(), ksize, stride, pad, dilation, 1, nullptr, nullptr, nullptr, true, (float *)bias, @@ -156,12 +133,7 @@ infiniStatus_t conv_kernel( (int64_t)info.pad_info(1), (int64_t)info.pad_info(1)}; std::vector dilation = {(int64_t)info.dilation_info(0), (int64_t)info.dilation_info(1)}; - printf("x_shape:(%ld, %ld, %ld, %ld)\n", info.batch(), info.in_channels(), info.input_dim(0), info.input_dim(1)); - printf("kernel_dim:(%ld, %ld)\n", ksize[0], ksize[1]); - printf("stride:(%ld, %ld)\n", stride[0], stride[1]); - printf("pad:(%ld, %ld)\n", pad[0], pad[1]); - printf("dilation:(%ld, %ld)\n", dilation[0], dilation[1]); - std::cout << "ndim: " << info.ndim() << " bias_size: " << bias_size << std::endl; + if (dtype == INFINI_DTYPE_F16) { if (bias_size > 0) { CHECK_STATUS(internal->useXdnn( @@ -169,7 +141,7 @@ infiniStatus_t conv_kernel( [&](xdnnHandle_t handle) { CHECK_KUNLUN((xdnn::cast(handle, (float16 *)bias, bias_F32, bias_size))); CHECK_KUNLUN((xdnn::conv2d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.input_dim(1), (int64_t)info.kernel_dim(0), ksize, + (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize, stride, pad, dilation, 1, nullptr, nullptr, nullptr, true, bias_F32, @@ -182,7 +154,7 @@ infiniStatus_t conv_kernel( (kunlunStream_t)stream, [&](xdnnHandle_t handle) { CHECK_KUNLUN((xdnn::conv2d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.input_dim(1), (int64_t)info.kernel_dim(0), ksize, + (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize, stride, pad, dilation, 1, nullptr, nullptr, nullptr, true, nullptr, @@ -198,7 +170,7 @@ infiniStatus_t conv_kernel( (kunlunStream_t)stream, [&](xdnnHandle_t handle) { CHECK_KUNLUN((xdnn::conv2d_fusion(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.input_dim(1), (int64_t)info.kernel_dim(0), ksize, + (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize, stride, pad, dilation, 1, nullptr, nullptr, nullptr, true, (float *)bias, @@ -217,12 +189,6 @@ infiniStatus_t conv_kernel( std::vector pad = {(int64_t)info.pad_info(0), (int64_t)info.pad_info(1), (int64_t)info.pad_info(2)}; std::vector dilation = {(int64_t)info.dilation_info(0), (int64_t)info.dilation_info(1), (int64_t)info.dilation_info(2)}; - printf("x_shape:(%ld, %ld, %ld, %ld, %ld)\n", info.batch(), info.in_channels(), info.input_dim(0), info.input_dim(1), info.input_dim(2)); - printf("kernel_dim:(%ld, %ld, %ld)\n", ksize[0], ksize[1], ksize[2]); - printf("stride:(%ld, %ld, %ld)\n", stride[0], stride[1], stride[2]); - printf("pad:(%ld, %ld, %ld)\n", pad[0], pad[1], pad[2]); - printf("dilation:(%ld, %ld, %ld)\n", dilation[0], dilation[1], dilation[2]); - std::cout << "ndim: " << info.ndim() << " bias_size: " << bias_size << std::endl; if (dtype == INFINI_DTYPE_F16) { if (bias_size > 0) { CHECK_STATUS(internal->useXdnn( @@ -230,7 +196,7 @@ infiniStatus_t conv_kernel( [&](xdnnHandle_t handle) { CHECK_KUNLUN((xdnn::cast(handle, (float16 *)bias, bias_F32, bias_size))); CHECK_KUNLUN((xdnn::conv3d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.kernel_dim(0), ksize, + (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize, stride, pad, dilation, 1, nullptr, nullptr, nullptr, true, bias_F32, @@ -243,7 +209,7 @@ infiniStatus_t conv_kernel( (kunlunStream_t)stream, [&](xdnnHandle_t handle) { CHECK_KUNLUN((xdnn::conv3d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.kernel_dim(0), ksize, + (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize, stride, pad, dilation, 1, nullptr, nullptr, nullptr, true, nullptr, @@ -258,7 +224,7 @@ infiniStatus_t conv_kernel( (kunlunStream_t)stream, [&](xdnnHandle_t handle) { CHECK_KUNLUN((xdnn::conv3d_fusion(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.kernel_dim(0), ksize, + (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize, stride, pad, dilation, 1, nullptr, nullptr, nullptr, true, (float *)bias, diff --git a/test/infiniop/conv.py b/test/infiniop/conv.py index f30d312cd..6cb99da9f 100644 --- a/test/infiniop/conv.py +++ b/test/infiniop/conv.py @@ -49,15 +49,6 @@ (1, 2), (2, 1), ), - ( - (1, 3, 32, 32), - (32 * 32 * 3, 32 * 32, 32, 1), - (2, 3, 5, 5), - (75, 25, 5, 1), - (2, 2), - (2, 2), - (1, 1), - ), ( (32, 3, 32, 32), (32 * 32 * 3, 32 * 32, 32, 1), @@ -105,27 +96,27 @@ def conv(x, w, stride, padding, dilation, y_tensor, bias=None): - dim = len(x.shape) - 2 - if dim == 1: - y_tensor.copy_( - F.conv1d( - x, w, bias=bias, stride=stride, padding=padding, dilation=dilation + match len(x.shape) - 2: + case 1: + y_tensor.copy_( + F.conv1d( + x, w, bias=bias, stride=stride, padding=padding, dilation=dilation + ) ) - ) - elif dim == 2: - y_tensor.copy_( - F.conv2d( - x, w, bias=bias, stride=stride, padding=padding, dilation=dilation + case 2: + y_tensor.copy_( + F.conv2d( + x, w, bias=bias, stride=stride, padding=padding, dilation=dilation + ) ) - ) - elif dim == 3: - y_tensor.copy_( - F.conv3d( - x, w, bias=bias, stride=stride, padding=padding, dilation=dilation + case 3: + y_tensor.copy_( + F.conv3d( + x, w, bias=bias, stride=stride, padding=padding, dilation=dilation + ) ) - ) - else: - print("Error: Pytorch -> Unsupported tensor dimension") + case _: + print("Error: Pytorch -> Unsupported tensor dimension") # infer the shape of the output given the inputs for a N-ary convolution From 270207193854e49bb8e5992163a53c97d7d13cca Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Thu, 21 Aug 2025 03:56:59 +0000 Subject: [PATCH 4/6] issue/360: optimize code --- src/infiniop/ops/conv/kunlun/conv_kunlun.cc | 251 +++++++------------- 1 file changed, 92 insertions(+), 159 deletions(-) diff --git a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc index b08e70dda..f7a1ce644 100644 --- a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc +++ b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc @@ -45,10 +45,10 @@ infiniStatus_t Descriptor::create( return INFINI_STATUS_SUCCESS; } +template infiniStatus_t conv_kernel( std::shared_ptr internal, const ConvInfo &info, - infiniDtype_t dtype, void *workspace, size_t workspace_size, void *y, @@ -74,114 +74,58 @@ infiniStatus_t conv_kernel( std::initializer_list pad = {(int64_t)info.pad_info(0)}; int64_t dilation = (int64_t)info.dilation_info(0); - if (dtype == INFINI_DTYPE_F16) { - - if (bias_size > 0) { - CHECK_STATUS(internal->useXdnn( - (kunlunStream_t)stream, - [&](xdnnHandle_t handle) { - CHECK_KUNLUN((xdnn::cast(handle, (float16 *)bias, bias_F32, bias_size))); - CHECK_KUNLUN((xdnn::conv1d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.out_channels(), ksize, - stride, pad, - dilation, 1, nullptr, - nullptr, nullptr, true, bias_F32, - nullptr, baidu::xpu::api::Activation_t::LINEAR, - nullptr))); - return INFINI_STATUS_SUCCESS; - })); - } else { - CHECK_STATUS(internal->useXdnn( - (kunlunStream_t)stream, - [&](xdnnHandle_t handle) { - CHECK_KUNLUN((xdnn::conv1d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.out_channels(), ksize, - stride, pad, - dilation, 1, nullptr, - nullptr, nullptr, true, nullptr, - nullptr, baidu::xpu::api::Activation_t::LINEAR, - nullptr))); - return INFINI_STATUS_SUCCESS; - })); - } - return INFINI_STATUS_SUCCESS; - - } else if (dtype == INFINI_DTYPE_F32) { - CHECK_STATUS(internal->useXdnn( - (kunlunStream_t)stream, - [&](xdnnHandle_t handle) { - CHECK_KUNLUN((xdnn::conv1d_fusion(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.out_channels(), ksize, - stride, pad, - dilation, 1, nullptr, - nullptr, nullptr, true, (float *)bias, - nullptr, baidu::xpu::api::Activation_t::LINEAR, - nullptr))); - return INFINI_STATUS_SUCCESS; - })); - } else { - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - break; + CHECK_STATUS(internal->useXdnn( + (kunlunStream_t)stream, + [&](xdnnHandle_t handle) { + if (bias_size > 0) { + if constexpr (std::is_same::value) { + CHECK_KUNLUN((xdnn::cast(handle, (Tdata *)bias, bias_F32, bias_size))); + } else if constexpr (std::is_same::value) { + bias_F32 = (float *)bias; + } + } else { + bias_F32 = nullptr; + } + CHECK_KUNLUN((xdnn::conv1d_fusion(handle, (Tdata *)x, (Tdata *)w, (Tdata *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), + (int64_t)info.out_channels(), ksize, + stride, pad, + dilation, 1, nullptr, + nullptr, nullptr, true, bias_F32, + nullptr, baidu::xpu::api::Activation_t::LINEAR, + nullptr))); + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; } case 2: { std::vector ksize = {(int64_t)info.kernel_dim(0), (int64_t)info.kernel_dim(1)}; std::vector stride = {(int64_t)info.stride_info(0), (int64_t)info.stride_info(1)}; std::vector pad = { (int64_t)info.pad_info(0), - (int64_t)info.pad_info(0), - (int64_t)info.pad_info(1), (int64_t)info.pad_info(1)}; std::vector dilation = {(int64_t)info.dilation_info(0), (int64_t)info.dilation_info(1)}; - - if (dtype == INFINI_DTYPE_F16) { - if (bias_size > 0) { - CHECK_STATUS(internal->useXdnn( - (kunlunStream_t)stream, - [&](xdnnHandle_t handle) { - CHECK_KUNLUN((xdnn::cast(handle, (float16 *)bias, bias_F32, bias_size))); - CHECK_KUNLUN((xdnn::conv2d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize, - stride, pad, - dilation, 1, nullptr, - nullptr, nullptr, true, bias_F32, - nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr, - nullptr, -1))); - return INFINI_STATUS_SUCCESS; - })); - } else { - CHECK_STATUS(internal->useXdnn( - (kunlunStream_t)stream, - [&](xdnnHandle_t handle) { - CHECK_KUNLUN((xdnn::conv2d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize, - stride, pad, - dilation, 1, nullptr, - nullptr, nullptr, true, nullptr, - nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr, - nullptr, -1))); - return INFINI_STATUS_SUCCESS; - })); - } - return INFINI_STATUS_SUCCESS; - - } else if (dtype == INFINI_DTYPE_F32) { - CHECK_STATUS(internal->useXdnn( - (kunlunStream_t)stream, - [&](xdnnHandle_t handle) { - CHECK_KUNLUN((xdnn::conv2d_fusion(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize, - stride, pad, - dilation, 1, nullptr, - nullptr, nullptr, true, (float *)bias, - nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr, - nullptr, -1))); - return INFINI_STATUS_SUCCESS; - })); - } else { - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - break; + CHECK_STATUS(internal->useXdnn( + (kunlunStream_t)stream, + [&](xdnnHandle_t handle) { + if (bias_size > 0) { + if constexpr (std::is_same::value) { + CHECK_KUNLUN((xdnn::cast(handle, (Tdata *)bias, bias_F32, bias_size))); + } else if constexpr (std::is_same::value) { + bias_F32 = (float *)bias; + } + } else { + bias_F32 = nullptr; + } + CHECK_KUNLUN((xdnn::conv2d_fusion(handle, (Tdata *)x, (Tdata *)w, (Tdata *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), + (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize, + stride, pad, + dilation, 1, nullptr, + nullptr, nullptr, true, bias_F32, + nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr, + nullptr, -1))); + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; } case 3: { std::vector ksize = {(int64_t)info.kernel_dim(0), (int64_t)info.kernel_dim(1), (int64_t)info.kernel_dim(2)}; @@ -189,53 +133,28 @@ infiniStatus_t conv_kernel( std::vector pad = {(int64_t)info.pad_info(0), (int64_t)info.pad_info(1), (int64_t)info.pad_info(2)}; std::vector dilation = {(int64_t)info.dilation_info(0), (int64_t)info.dilation_info(1), (int64_t)info.dilation_info(2)}; - if (dtype == INFINI_DTYPE_F16) { - if (bias_size > 0) { - CHECK_STATUS(internal->useXdnn( - (kunlunStream_t)stream, - [&](xdnnHandle_t handle) { - CHECK_KUNLUN((xdnn::cast(handle, (float16 *)bias, bias_F32, bias_size))); - CHECK_KUNLUN((xdnn::conv3d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize, - stride, pad, - dilation, 1, nullptr, - nullptr, nullptr, true, bias_F32, - nullptr, baidu::xpu::api::Activation_t::LINEAR, - nullptr))); - return INFINI_STATUS_SUCCESS; - })); - } else { - CHECK_STATUS(internal->useXdnn( - (kunlunStream_t)stream, - [&](xdnnHandle_t handle) { - CHECK_KUNLUN((xdnn::conv3d_fusion(handle, (float16 *)x, (float16 *)w, (float16 *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize, - stride, pad, - dilation, 1, nullptr, - nullptr, nullptr, true, nullptr, - nullptr, baidu::xpu::api::Activation_t::LINEAR, - nullptr))); - return INFINI_STATUS_SUCCESS; - })); - } - return INFINI_STATUS_SUCCESS; - } else if (dtype == INFINI_DTYPE_F32) { - CHECK_STATUS(internal->useXdnn( - (kunlunStream_t)stream, - [&](xdnnHandle_t handle) { - CHECK_KUNLUN((xdnn::conv3d_fusion(handle, (float *)x, (float *)w, (float *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), - (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize, - stride, pad, - dilation, 1, nullptr, - nullptr, nullptr, true, (float *)bias, - nullptr, baidu::xpu::api::Activation_t::LINEAR, - nullptr))); - return INFINI_STATUS_SUCCESS; - })); - } else { - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - break; + CHECK_STATUS(internal->useXdnn( + (kunlunStream_t)stream, + [&](xdnnHandle_t handle) { + if (bias_size > 0) { + if constexpr (std::is_same::value) { + CHECK_KUNLUN((xdnn::cast(handle, (Tdata *)bias, bias_F32, bias_size))); + } else if constexpr (std::is_same::value) { + bias_F32 = (float *)bias; + } + } else { + bias_F32 = nullptr; + } + CHECK_KUNLUN((xdnn::conv3d_fusion(handle, (Tdata *)x, (Tdata *)w, (Tdata *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), + (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize, + stride, pad, + dilation, 1, nullptr, + nullptr, nullptr, true, bias_F32, + nullptr, baidu::xpu::api::Activation_t::LINEAR, + nullptr))); + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; } default: return INFINI_STATUS_BAD_TENSOR_SHAPE; @@ -254,17 +173,31 @@ infiniStatus_t Descriptor::calculate( if (workspace_size < _workspace_size) { return INFINI_STATUS_INSUFFICIENT_WORKSPACE; } - CHECK_STATUS(conv_kernel( - _opaque->internal, - _info, - _dtype, - workspace, - workspace_size, - y, - x, - w, - bias, - stream)); + if (_dtype == INFINI_DTYPE_F16) { + CHECK_STATUS(conv_kernel( + _opaque->internal, + _info, + workspace, + workspace_size, + y, + x, + w, + bias, + stream)); + } else if (_dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(conv_kernel( + _opaque->internal, + _info, + workspace, + workspace_size, + y, + x, + w, + bias, + stream)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } return INFINI_STATUS_SUCCESS; } From 43b05f29140d5656801c20e53b39089adf709f9f Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Thu, 21 Aug 2025 04:39:02 +0000 Subject: [PATCH 5/6] issue/360: optim kunlun code --- src/infiniop/ops/conv/kunlun/conv_kunlun.cc | 41 +++++++-------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc index f7a1ce644..f8375f0af 100644 --- a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc +++ b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc @@ -67,6 +67,20 @@ infiniStatus_t conv_kernel( bias_size = 0; } float *bias_F32 = (float *)workspace_value; + CHECK_STATUS(internal->useXdnn( + (kunlunStream_t)stream, + [&](xdnnHandle_t handle) { + if (bias_size > 0) { + if constexpr (std::is_same::value) { + CHECK_KUNLUN((xdnn::cast(handle, (Tdata *)bias, bias_F32, bias_size))); + } else if constexpr (std::is_same::value) { + bias_F32 = (float *)bias; + } + } else { + bias_F32 = nullptr; + } + return INFINI_STATUS_SUCCESS; + })); switch (info.ndim()) { case 1: { int64_t ksize = (int64_t)info.kernel_dim(0); @@ -77,15 +91,6 @@ infiniStatus_t conv_kernel( CHECK_STATUS(internal->useXdnn( (kunlunStream_t)stream, [&](xdnnHandle_t handle) { - if (bias_size > 0) { - if constexpr (std::is_same::value) { - CHECK_KUNLUN((xdnn::cast(handle, (Tdata *)bias, bias_F32, bias_size))); - } else if constexpr (std::is_same::value) { - bias_F32 = (float *)bias; - } - } else { - bias_F32 = nullptr; - } CHECK_KUNLUN((xdnn::conv1d_fusion(handle, (Tdata *)x, (Tdata *)w, (Tdata *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), (int64_t)info.out_channels(), ksize, stride, pad, @@ -107,15 +112,6 @@ infiniStatus_t conv_kernel( CHECK_STATUS(internal->useXdnn( (kunlunStream_t)stream, [&](xdnnHandle_t handle) { - if (bias_size > 0) { - if constexpr (std::is_same::value) { - CHECK_KUNLUN((xdnn::cast(handle, (Tdata *)bias, bias_F32, bias_size))); - } else if constexpr (std::is_same::value) { - bias_F32 = (float *)bias; - } - } else { - bias_F32 = nullptr; - } CHECK_KUNLUN((xdnn::conv2d_fusion(handle, (Tdata *)x, (Tdata *)w, (Tdata *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), (int64_t)info.input_dim(1), (int64_t)info.out_channels(), ksize, stride, pad, @@ -136,15 +132,6 @@ infiniStatus_t conv_kernel( CHECK_STATUS(internal->useXdnn( (kunlunStream_t)stream, [&](xdnnHandle_t handle) { - if (bias_size > 0) { - if constexpr (std::is_same::value) { - CHECK_KUNLUN((xdnn::cast(handle, (Tdata *)bias, bias_F32, bias_size))); - } else if constexpr (std::is_same::value) { - bias_F32 = (float *)bias; - } - } else { - bias_F32 = nullptr; - } CHECK_KUNLUN((xdnn::conv3d_fusion(handle, (Tdata *)x, (Tdata *)w, (Tdata *)y, (int64_t)info.batch(), (int64_t)info.in_channels(), (int64_t)info.input_dim(0), (int64_t)info.input_dim(1), (int64_t)info.input_dim(2), (int64_t)info.out_channels(), ksize, stride, pad, From ed47cde073683a7e5195154886ebe061e1c8b7ff Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Tue, 2 Sep 2025 07:17:23 +0000 Subject: [PATCH 6/6] issue/360: modified kunlun_common.h --- src/infiniop/devices/kunlun/kunlun_common.h | 1 + src/infiniop/ops/conv/kunlun/conv_kunlun.cc | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/infiniop/devices/kunlun/kunlun_common.h b/src/infiniop/devices/kunlun/kunlun_common.h index 3e1bd8d1d..a1c0e0ae4 100644 --- a/src/infiniop/devices/kunlun/kunlun_common.h +++ b/src/infiniop/devices/kunlun/kunlun_common.h @@ -2,6 +2,7 @@ #define __KUNLUN_COMMON_H__ #include "../../../utils.h" +#include #include #include #include diff --git a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc index f8375f0af..577e5f990 100644 --- a/src/infiniop/ops/conv/kunlun/conv_kunlun.cc +++ b/src/infiniop/ops/conv/kunlun/conv_kunlun.cc @@ -2,7 +2,7 @@ #include "../../../../utils.h" #include "../../../devices/kunlun/kunlun_common.h" #include "../../../devices/kunlun/kunlun_handle.h" -#include + namespace op::conv::kunlun { struct Descriptor::Opaque { @@ -96,7 +96,7 @@ infiniStatus_t conv_kernel( stride, pad, dilation, 1, nullptr, nullptr, nullptr, true, bias_F32, - nullptr, baidu::xpu::api::Activation_t::LINEAR, + nullptr, xdnn::Activation_t::LINEAR, nullptr))); return INFINI_STATUS_SUCCESS; })); @@ -117,7 +117,7 @@ infiniStatus_t conv_kernel( stride, pad, dilation, 1, nullptr, nullptr, nullptr, true, bias_F32, - nullptr, baidu::xpu::api::Activation_t::LINEAR, nullptr, + nullptr, xdnn::Activation_t::LINEAR, nullptr, nullptr, -1))); return INFINI_STATUS_SUCCESS; })); @@ -137,7 +137,7 @@ infiniStatus_t conv_kernel( stride, pad, dilation, 1, nullptr, nullptr, nullptr, true, bias_F32, - nullptr, baidu::xpu::api::Activation_t::LINEAR, + nullptr, xdnn::Activation_t::LINEAR, nullptr))); return INFINI_STATUS_SUCCESS; }));