From b96af842619c9966f1c158ffb2918a6290b944df Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Fri, 5 Dec 2025 14:56:29 +0800 Subject: [PATCH 1/7] issue/664: success linear --- include/infiniop/ops/linear.h | 40 +++ include/infiniop/ops/quant.h | 28 ++ src/infiniop/ops/linear/cuda/kernel.cuh | 40 +++ src/infiniop/ops/linear/info.h | 79 +++++ src/infiniop/ops/linear/linear.h | 54 ++++ .../ops/linear/nvidia/linear_nvidia.cu | 193 ++++++++++++ .../ops/linear/nvidia/linear_nvidia.cuh | 7 + src/infiniop/ops/linear/operator.cc | 117 ++++++++ src/infiniop/ops/quant/cuda/kernel.cuh | 277 ++++++++++++++++++ src/infiniop/ops/quant/info.h | 60 ++++ src/infiniop/ops/quant/nvidia/quant_nvidia.cu | 118 ++++++++ .../ops/quant/nvidia/quant_nvidia.cuh | 7 + src/infiniop/ops/quant/operator.cc | 98 +++++++ src/infiniop/ops/quant/quant.h | 40 +++ test/infiniop/libinfiniop/op_register.py | 83 ++++++ test/infiniop/linear.py | 266 +++++++++++++++++ test/infiniop/quant.py | 211 +++++++++++++ 17 files changed, 1718 insertions(+) create mode 100644 include/infiniop/ops/linear.h create mode 100644 include/infiniop/ops/quant.h create mode 100644 src/infiniop/ops/linear/cuda/kernel.cuh create mode 100644 src/infiniop/ops/linear/info.h create mode 100644 src/infiniop/ops/linear/linear.h create mode 100644 src/infiniop/ops/linear/nvidia/linear_nvidia.cu create mode 100644 src/infiniop/ops/linear/nvidia/linear_nvidia.cuh create mode 100644 src/infiniop/ops/linear/operator.cc create mode 100644 src/infiniop/ops/quant/cuda/kernel.cuh create mode 100644 src/infiniop/ops/quant/info.h create mode 100644 src/infiniop/ops/quant/nvidia/quant_nvidia.cu create mode 100644 src/infiniop/ops/quant/nvidia/quant_nvidia.cuh create mode 100644 src/infiniop/ops/quant/operator.cc create mode 100644 src/infiniop/ops/quant/quant.h create mode 100644 test/infiniop/linear.py create mode 100644 test/infiniop/quant.py diff --git a/include/infiniop/ops/linear.h b/include/infiniop/ops/linear.h new file mode 100644 index 000000000..06f599aea --- /dev/null +++ b/include/infiniop/ops/linear.h @@ -0,0 +1,40 @@ +#ifndef __INFINIOP_LINEAR_API_H__ +#define __INFINIOP_LINEAR_API_H__ + +#include "../operator_descriptor.h" + +typedef InfiniopDescriptor *infiniopLinearDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLinearDescriptor(infiniopHandle_t handle, + infiniopLinearDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t d_desc, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t bias_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t weights_desc, + infiniopTensorDescriptor_t weights_scale_desc, + infiniopTensorDescriptor_t weights_zero_desc, + float alpha, + float beta); + +__C __export infiniStatus_t infiniopGetLinearWorkspaceSize(infiniopLinearDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLinear(infiniopLinearDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *d, + const void *c, + const void *bias, + const void *x, + const void *x_scale, + const void *x_zero, + const void *weights, + const void *weights_scale, + const void *weights_zero, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLinearDescriptor(infiniopLinearDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/quant.h b/include/infiniop/ops/quant.h new file mode 100644 index 000000000..90027c04a --- /dev/null +++ b/include/infiniop/ops/quant.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_QUANT_API_H__ 
+#define __INFINIOP_QUANT_API_H__ + +#include "../operator_descriptor.h" + +typedef InfiniopDescriptor *infiniopQuantDescriptor_t; + +__C __export infiniStatus_t infiniopCreateQuantDescriptor(infiniopHandle_t handle, + infiniopQuantDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_packed_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t x_desc); + +__C __export infiniStatus_t infiniopGetQuantWorkspaceSize(infiniopQuantDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopQuant(infiniopQuantDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *x_packed, + void *x_scale, + void *x_zero, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyQuantDescriptor(infiniopQuantDescriptor_t desc); + +#endif diff --git a/src/infiniop/ops/linear/cuda/kernel.cuh b/src/infiniop/ops/linear/cuda/kernel.cuh new file mode 100644 index 000000000..da9a7c41d --- /dev/null +++ b/src/infiniop/ops/linear/cuda/kernel.cuh @@ -0,0 +1,40 @@ +#ifndef __LINEAR_KERNEL_CUH__ +#define __LINEAR_KERNEL_CUH__ + +template +__device__ void postKernel(Tdata *y, int32_t *y_packed, const Tdata *c, const Tdata *bias, const int8_t *x_packed, const Tdata *x_scale, const Tdata *x_zero, const int8_t *w_packed, const Tdata *w_scale, const Tdata *w_zero, int M, int K, int N, float alpha, float beta) { + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + if (row >= M || col >= N) { + return; + } + int idx = row * N + col; + float output1 = ((float)x_scale[row] * (float)w_scale[col] * ((float)y_packed[idx] + K * (float)x_zero[row] * (float)w_zero[col])); + float output2 = 0.0f; + float output3 = 0.0f; + float tmp2 = (float)x_scale[row] * (float)w_scale[col] * (float)w_zero[col]; + float tmp3 = (float)x_scale[row] * (float)x_zero[row] * (float)w_scale[col]; + for (int ind = 0; ind < K; ind++) { + output2 += tmp2 * (float)x_packed[row * K + ind]; + output3 += tmp3 * (float)w_packed[ind * N + col]; + } + float output = alpha * (output1 - output2 - output3) + beta * (float)c[idx] + (float)bias[col]; + + y[idx] = static_cast(output); +} + +template +__device__ void postSymKernel(Tdata *y, int32_t *y_packed, const Tdata *c, const Tdata *bias, const int8_t *x_packed, const Tdata *x_scale, const int8_t *w_packed, const Tdata *w_scale, int M, int K, int N, float alpha, float beta) { + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + if (row >= M || col >= N) { + return; + } + int idx = row * N + col; + float output1 = (float)x_scale[row] * (float)w_scale[col] * ((float)y_packed[idx]); + + float output = alpha * output1 + beta * (float)c[idx] + (float)bias[col]; + + y[idx] = static_cast(output); +} +#endif // __LINEAR_KERNEL_CUH__ diff --git a/src/infiniop/ops/linear/info.h b/src/infiniop/ops/linear/info.h new file mode 100644 index 000000000..866125d86 --- /dev/null +++ b/src/infiniop/ops/linear/info.h @@ -0,0 +1,79 @@ +#ifndef __LINEAR_INFO_H__ +#define __LINEAR_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::linear { + +class LinearInfo { +private: + LinearInfo() = default; + +public: + infiniDtype_t dtype, packed_type; + size_t M, K, N; + float alpha, beta; + + static utils::Result createLinearInfo( + infiniopTensorDescriptor_t d_desc, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t bias_desc, + infiniopTensorDescriptor_t 
x_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t weights_desc, + infiniopTensorDescriptor_t weights_scale_desc, + infiniopTensorDescriptor_t weights_zero_desc, + float alpha, + float beta) { + + CHECK_OR_RETURN( + d_desc != nullptr && c_desc != nullptr && bias_desc != nullptr && x_desc != nullptr && x_scale_desc != nullptr && weights_desc != nullptr && weights_scale_desc != nullptr, + INFINI_STATUS_NULL_POINTER); + + const infiniDtype_t dtype = d_desc->dtype(); + const infiniDtype_t packed_type = x_desc->dtype(); + CHECK_OR_RETURN(dtype == c_desc->dtype() && dtype == bias_desc->dtype() && dtype == x_scale_desc->dtype() && dtype == weights_scale_desc->dtype(), + INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(packed_type == weights_desc->dtype(), + INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + CHECK_DTYPE(packed_type, INFINI_DTYPE_I8); + CHECK_OR_RETURN(bias_desc->ndim() == 1, + INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(d_desc->ndim() == 2 + && c_desc->ndim() == 2 + && x_desc->ndim() == 2 + && x_scale_desc->ndim() == 2 + && weights_desc->ndim() == 2 + && weights_scale_desc->ndim() == 2, + INFINI_STATUS_BAD_TENSOR_SHAPE); + + size_t M = d_desc->dim(0); + size_t N = d_desc->dim(1); + size_t K = x_desc->dim(1); + CHECK_OR_RETURN(N == bias_desc->dim(0), + INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_OR_RETURN(M == x_desc->dim(0) + || M == x_scale_desc->dim(0) + || 1 == x_scale_desc->dim(1) + || 1 == weights_scale_desc->dim(0) + || N == weights_scale_desc->dim(1), + INFINI_STATUS_BAD_TENSOR_SHAPE); + + return utils::Result(LinearInfo{ + dtype, + packed_type, + M, + K, + N, + alpha, + beta}); + } +}; + +} // namespace op::linear + +#endif // __LINEAR_INFO_H__ diff --git a/src/infiniop/ops/linear/linear.h b/src/infiniop/ops/linear/linear.h new file mode 100644 index 000000000..1c0ac51a4 --- /dev/null +++ b/src/infiniop/ops/linear/linear.h @@ -0,0 +1,54 @@ +#ifndef __LINEAR_H__ +#define __LINEAR_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::linear::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + LinearInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, LinearInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), _info(info), _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t minWorkspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t d_desc, \ + infiniopTensorDescriptor_t c_desc, \ + infiniopTensorDescriptor_t bias_desc, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t x_scale_desc, \ + infiniopTensorDescriptor_t x_zero_desc, \ + infiniopTensorDescriptor_t weights_desc, \ + infiniopTensorDescriptor_t weights_scale_desc, \ + infiniopTensorDescriptor_t weights_zero_desc, \ + float alpha, \ + float beta); \ + template \ + infiniStatus_t launchKernel(const LinearInfo &info, Tdata *y, \ + const Tdata *c, const Tdata *bias, const int8_t *x_packed, \ + const Tdata *x_scale, const Tdata *x_zero, const int8_t *w_packed, \ + const Tdata *w_scale, const Tdata *w_zero, void *stream, void *workspace) const; \ + \ + infiniStatus_t calculate( \ + void 
*workspace, size_t workspace_size, \ + void *d, const void *c, const void *bias, const void *x, \ + const void *x_scale, const void *x_zero, const void *weights, \ + const void *weights_scale, const void *weights_zero, void *stream) const; \ + }; \ + } + +#endif // __LINEAR_H__ \ No newline at end of file diff --git a/src/infiniop/ops/linear/nvidia/linear_nvidia.cu b/src/infiniop/ops/linear/nvidia/linear_nvidia.cu new file mode 100644 index 000000000..d46da86be --- /dev/null +++ b/src/infiniop/ops/linear/nvidia/linear_nvidia.cu @@ -0,0 +1,193 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "linear_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../../../reduce/cuda/reduce.cuh" +#include +#include + +#include "../cuda/kernel.cuh" + +#if defined ENABLE_NVIDIA_API +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +void int8Gemm( + const int8_t *x_packed, const int8_t *w_packed, int32_t *y_packed, + int M, int N, int K, cudaStream_t stream) { + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int32_t; + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + + // Use SIMT opclass to avoid tensor-op interleaved layout requirements + using Gemm = cutlass::gemm::device::Gemm< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementC, // accumulator type + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75>; + + Gemm gemm_op; + + cutlass::gemm::GemmCoord problem_size(M, N, K); + + typename Gemm::Arguments args{ + problem_size, + {x_packed, K}, + {w_packed, N}, + {y_packed, N}, + {y_packed, N}, + {1, 0}}; + + cutlass::Status status = gemm_op.initialize(args, nullptr, stream); + if (status != cutlass::Status::kSuccess) { + printf("[CUTLASS SIMT] initialize failed: %d\n", int(status)); + return; + } + status = gemm_op(); + if (status != cutlass::Status::kSuccess) { + printf("[CUTLASS SIMT] run failed: %d\n", int(status)); + return; + } +} +#endif + +template +INFINIOP_CUDA_KERNEL post( + Tdata *y, int32_t *y_packed, const Tdata *c, const Tdata *bias, const int8_t *x_packed, const Tdata *x_scale, const Tdata *x_zero, const int8_t *w_packed, const Tdata *w_scale, const Tdata *w_zero, int M, int K, int N, float alpha, float beta) { + postKernel(y, y_packed, c, bias, x_packed, x_scale, x_zero, w_packed, w_scale, w_zero, M, K, N, alpha, beta); +} + +template +INFINIOP_CUDA_KERNEL postSym( + Tdata *y, int32_t *y_packed, const Tdata *c, const Tdata *bias, const int8_t *x_packed, const Tdata *x_scale, const int8_t *w_packed, const Tdata *w_scale, int M, int K, int N, float alpha, float beta) { + postSymKernel(y, y_packed, c, bias, x_packed, x_scale, w_packed, w_scale, M, K, N, alpha, beta); +} + +namespace op::linear::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t d_desc, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t bias_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t weights_desc, + infiniopTensorDescriptor_t weights_scale_desc, + infiniopTensorDescriptor_t 
weights_zero_desc, + float alpha, + float beta) { + auto handle = reinterpret_cast(handle_); + auto info = LinearInfo::createLinearInfo(d_desc, c_desc, bias_desc, x_desc, x_scale_desc, x_zero_desc, weights_desc, weights_scale_desc, weights_zero_desc, alpha, beta); + CHECK_RESULT(info); + size_t workspace_size = c_desc->dim(0) * c_desc->dim(1) * sizeof(int32_t); + *desc_ptr = new Descriptor( + new Opaque{handle->internal()}, + info.take(), workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t Descriptor::launchKernel(const LinearInfo &info, Tdata *y, const Tdata *c, const Tdata *bias, const int8_t *x_packed, const Tdata *x_scale, const Tdata *x_zero, const int8_t *w_packed, const Tdata *w_scale, const Tdata *w_zero, void *stream_, void *workspace) const { + cudaStream_t stream = (cudaStream_t)stream_; + int M = (int)info.M; + int K = (int)info.K; + int N = (int)info.N; + float alpha = info.alpha; + float beta = info.beta; + char *workspace_ptr = reinterpret_cast(workspace); + int32_t *y_packed = reinterpret_cast(workspace_ptr); +#if defined ENABLE_NVIDIA_API + int8Gemm(x_packed, w_packed, y_packed, M, N, K, stream); +#elif defined ENABLE_QY_API + const int32_t alpha_I = 1; + const int32_t beta_I = 0; + CHECK_STATUS(this->_opaque->internal->useCublas( + stream, + [&](cublasHandle_t handle) { + CHECK_CUBLAS(cublasGemmEx( + handle, + CUBLAS_OP_N, // A = w_packed, column-major view + CUBLAS_OP_N, // B = x_packed, column-major view + N, // m = N + M, // n = M + K, // k = K + &alpha_I, + w_packed, CUDA_R_8I, N, // lda = m = N + x_packed, CUDA_R_8I, K, // ldb = k = K + &beta_I, + y_packed, CUDA_R_32I, N, // ldc = m = N + CUBLAS_COMPUTE_32I, + CUBLAS_GEMM_DEFAULT)); + return INFINI_STATUS_SUCCESS; + })); +#endif + constexpr unsigned int BLOCK_SIZE_x = 32; + constexpr unsigned int BLOCK_SIZE_y = 32; + + int num_block_x = (N + BLOCK_SIZE_x - 1) / BLOCK_SIZE_x; + int num_block_y = (M + BLOCK_SIZE_y - 1) / BLOCK_SIZE_y; + dim3 block_dim(BLOCK_SIZE_x, BLOCK_SIZE_y, 1); + dim3 grid_dim(num_block_x, num_block_y, 1); + if (x_zero == nullptr && w_zero == nullptr) { + postSym<<>>(y, y_packed, c, bias, x_packed, x_scale, w_packed, w_scale, M, K, N, alpha, beta); + } else { + post<<>>(y, y_packed, c, bias, x_packed, x_scale, x_zero, w_packed, w_scale, w_zero, M, K, N, alpha, beta); + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *d, + const void *c, + const void *bias, + const void *x, + const void *x_scale, + const void *x_zero, + const void *weights, + const void *weights_scale, + const void *weights_zero, + void *stream) const { +#define CALCULATE_LINEAR(BLOCK_SIZE, TDATA) \ + launchKernel(_info, (TDATA *)d, (const TDATA *)c, (const TDATA *)bias, (const int8_t *)x, (const TDATA *)x_scale, (const TDATA *)x_zero, (const int8_t *)weights, (const TDATA *)weights_scale, (const TDATA *)weights_zero, stream, workspace) +#define CALCULATE_LINEAR_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_LINEAR(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_LINEAR(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_LINEAR(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + } else if 
(_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::linear::nvidia diff --git a/src/infiniop/ops/linear/nvidia/linear_nvidia.cuh b/src/infiniop/ops/linear/nvidia/linear_nvidia.cuh new file mode 100644 index 000000000..fdc3ddf64 --- /dev/null +++ b/src/infiniop/ops/linear/nvidia/linear_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __LINEAR_NVIDIA_API_H__ +#define __LINEAR_NVIDIA_API_H__ +#include "../linear.h" + +DESCRIPTOR(nvidia) + +#endif // __LINEAR_NVIDIA_API_H__ diff --git a/src/infiniop/ops/linear/operator.cc b/src/infiniop/ops/linear/operator.cc new file mode 100644 index 000000000..c069c3bd5 --- /dev/null +++ b/src/infiniop/ops/linear/operator.cc @@ -0,0 +1,117 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/linear.h" + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) +#include "nvidia/linear_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateLinearDescriptor(infiniopHandle_t handle, + infiniopLinearDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t d_desc, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t bias_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t x_scale_desc, + infiniopTensorDescriptor_t x_zero_desc, + infiniopTensorDescriptor_t weights_desc, + infiniopTensorDescriptor_t weights_scale_desc, + infiniopTensorDescriptor_t weights_zero_desc, + float alpha, + float beta) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::linear::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + d_desc, \ + c_desc, \ + bias_desc, \ + x_desc, \ + x_scale_desc, \ + x_zero_desc, \ + weights_desc, \ + weights_scale_desc, \ + weights_zero_desc, \ + alpha, \ + beta); + switch (handle->device) { +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetLinearWorkspaceSize(infiniopLinearDescriptor_t desc, size_t *size) { + switch (desc->device_type) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->minWorkspaceSize(); \ + return INFINI_STATUS_SUCCESS; +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopLinear(infiniopLinearDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *d, + const void *c, + const void *bias, + const void *x, + const void *x_scale, + const void *x_zero, + const void *weights, + const void *weights_scale, + const void *weights_zero, + void *stream) { +#define CACULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, d, c, bias, x, x_scale, x_zero, weights, weights_scale, weights_zero, stream); + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + CACULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + CACULATE(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef 
CACULATE +} + +__C infiniStatus_t infiniopDestroyLinearDescriptor(infiniopLinearDescriptor_t desc) { +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_QY_API + DESTROY(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DESTROY +} diff --git a/src/infiniop/ops/quant/cuda/kernel.cuh b/src/infiniop/ops/quant/cuda/kernel.cuh new file mode 100644 index 000000000..e59ba4d0c --- /dev/null +++ b/src/infiniop/ops/quant/cuda/kernel.cuh @@ -0,0 +1,277 @@ +#ifndef __QUANT_KERNEL_CUH__ +#define __QUANT_KERNEL_CUH__ + +#include +__device__ inline int round_half_away_from_zero(float x) { + float ax = fabsf(x); + float r = floorf(ax + 0.5f); + return (x >= 0.0f) ? (int)r : -(int)r; +} + +template +__device__ void blockQuantKernel( + int8_t *x_packed, Tdata *x_scale, Tdata *x_zero, const Tdata *x, + int M, int K) { + int row = blockIdx.x; + int tid = row * K; + + // ---- 1. reduce max ---- + float local_max = op::common_cuda::reduce_op::max( + x + tid, K); + + __shared__ float global_max_f; + if (threadIdx.x == 0) { + global_max_f = local_max; + } + __syncthreads(); + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + // ---- 2. reduce min ---- + float thread_min = __FLT_MAX__; + for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) { + thread_min = fminf(thread_min, (float)x[tid + ind]); + } + float local_min = BlockReduce(temp_storage).Reduce(thread_min, cub::Min()); + + __shared__ float global_min_f; + if (threadIdx.x == 0) { + global_min_f = local_min; + } + __syncthreads(); + + // ---- 3. 使用 float(匹配 python)计算 scale/zero ---- + float global_max = global_max_f; + float global_min = global_min_f; + + float scale = (global_max - global_min) / 255.0f; + if (scale < 1e-8f) { + scale = 1e-8f; + } + + float inv_scale = 1.0f / scale; + float zero = -global_min * inv_scale - 128.0f; + + // 写回 scale, zero + x_scale[row] = (Tdata)scale; + x_zero[row] = (Tdata)zero; + + // ---- 4. 使用 float + half-away-from-zero(与 Python 完全一致)---- + for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) { + + float v = (float)x[tid + ind]; + float qf = v * inv_scale + zero; + + int q = round_half_away_from_zero(qf); + + if (q > 127) { + q = 127; + } + if (q < -128) { + q = -128; + } + + x_packed[tid + ind] = (int8_t)q; + } +} + +template +__device__ void blockQuantSymKernel( + int8_t *x_packed, Tdata *x_scale, const Tdata *x, + int M, int K) { + int row = blockIdx.x; + int tid = row * K; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + // ---- 2. reduce min ---- + float thread_max = -__FLT_MAX__; + for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) { + thread_max = fmaxf(thread_max, fabs((float)x[tid + ind])); + } + float local_max = BlockReduce(temp_storage).Reduce(thread_max, cub::Max()); + + __shared__ float global_max_f; + if (threadIdx.x == 0) { + global_max_f = local_max; + } + __syncthreads(); + + // ---- 3. 使用 float(匹配 python)计算 scale/zero ---- + float global_max = global_max_f; + + float scale = global_max / 127.0f; + if (scale < 1e-8f) { + scale = 1e-8f; + } + + float inv_scale = 1.0f / scale; + + // 写回 scale, zero + x_scale[row] = (Tdata)scale; + + // ---- 4. 
quantize with float + half-away-from-zero rounding (matches the Python reference exactly) ----
+    for (int ind = threadIdx.x; ind < K; ind += BLOCK_SIZE) {
+
+        float v = (float)x[tid + ind];
+        float qf = v * inv_scale;
+
+        int q = round_half_away_from_zero(qf);
+
+        if (q > 127) {
+            q = 127;
+        }
+        if (q < -128) {
+            q = -128;
+        }
+
+        x_packed[tid + ind] = (int8_t)q;
+    }
+}
+
+template <typename T>
+struct MaxOp {
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        return max(a, b);
+    }
+};
+template <typename T>
+struct MinOp {
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        return min(a, b);
+    }
+};
+template