Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions include/infiniop/ops/linear.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#ifndef __INFINIOP_LINEAR_API_H__
#define __INFINIOP_LINEAR_API_H__

#include "../operator_descriptor.h"

/* Opaque handle for a quantized Linear (int8 GEMM + dequantize) operator. */
typedef InfiniopDescriptor *infiniopLinearDescriptor_t;

/* Creates a Linear descriptor computing, per the CUDA kernels of this op:
 *     d = alpha * dequant(x_packed @ weights_packed) + beta * c [+ bias]
 * where x/weights are int8 with per-row (x) / per-column (weights) scales
 * and optional zero-points (asymmetric quantization).
 *
 * handle             - runtime handle for the target device.
 * desc_ptr           - receives the newly created descriptor.
 * d_desc             - output tensor, shape (M, N).
 * c_desc             - additive input tensor, shape (M, N).
 * bias_desc          - optional per-column bias; NOTE(review): presumably may
 *                      be NULL for the bias-free variant — confirm with impl.
 * x_desc             - quantized int8 activations, shape (M, K).
 * x_scale_desc       - activation scales, shape (M, 1).
 * x_zero_desc        - activation zero-points (asymmetric mode).
 * weights_desc       - quantized int8 weights, shape (K, N).
 * weights_scale_desc - weight scales, shape (1, N).
 * weights_zero_desc  - weight zero-points (asymmetric mode).
 * alpha, beta        - GEMM-style scaling factors applied as above.
 */
__C __export infiniStatus_t infiniopCreateLinearDescriptor(infiniopHandle_t handle,
infiniopLinearDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t d_desc,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t bias_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t weights_desc,
infiniopTensorDescriptor_t weights_scale_desc,
infiniopTensorDescriptor_t weights_zero_desc,
float alpha,
float beta);

/* Queries the scratch-workspace size (in bytes) required by infiniopLinear. */
__C __export infiniStatus_t infiniopGetLinearWorkspaceSize(infiniopLinearDescriptor_t desc, size_t *size);

/* Executes the Linear op described by desc.
 * workspace/workspace_size - device scratch buffer of at least the queried size.
 * d                        - output buffer (M, N).
 * c, bias, x, x_scale, x_zero, weights, weights_scale, weights_zero
 *                          - device pointers matching the descriptors above.
 * stream                   - backend stream/queue; semantics are backend-defined.
 */
__C __export infiniStatus_t infiniopLinear(infiniopLinearDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *d,
const void *c,
const void *bias,
const void *x,
const void *x_scale,
const void *x_zero,
const void *weights,
const void *weights_scale,
const void *weights_zero,
void *stream);

/* Releases the descriptor and any backend resources it owns. */
__C __export infiniStatus_t infiniopDestroyLinearDescriptor(infiniopLinearDescriptor_t desc);

#endif
28 changes: 28 additions & 0 deletions include/infiniop/ops/quant.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef __INFINIOP_QUANT_API_H__
#define __INFINIOP_QUANT_API_H__

#include "../operator_descriptor.h"

/* Opaque handle for a Quant operator that converts a floating-point tensor
 * into its packed integer representation plus quantization parameters. */
typedef InfiniopDescriptor *infiniopQuantDescriptor_t;

/* Creates a Quant descriptor.
 * handle        - runtime handle for the target device.
 * desc_ptr      - receives the newly created descriptor.
 * x_packed_desc - output: quantized integer tensor (int8 per the Linear op
 *                 that consumes it — NOTE(review): confirm with the impl).
 * x_scale_desc  - output: quantization scales.
 * x_zero_desc   - output: quantization zero-points.
 * x_desc        - input: floating-point tensor to quantize.
 */
__C __export infiniStatus_t infiniopCreateQuantDescriptor(infiniopHandle_t handle,
infiniopQuantDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t x_desc);

/* Queries the scratch-workspace size (in bytes) required by infiniopQuant. */
__C __export infiniStatus_t infiniopGetQuantWorkspaceSize(infiniopQuantDescriptor_t desc, size_t *size);

/* Executes quantization: reads x, writes x_packed, x_scale, x_zero.
 * workspace/workspace_size - device scratch buffer of at least the queried size.
 * stream                   - backend stream/queue; semantics are backend-defined.
 */
__C __export infiniStatus_t infiniopQuant(infiniopQuantDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *x_packed,
void *x_scale,
void *x_zero,
const void *x,
void *stream);

/* Releases the descriptor and any backend resources it owns. */
__C __export infiniStatus_t infiniopDestroyQuantDescriptor(infiniopQuantDescriptor_t desc);

#endif
76 changes: 76 additions & 0 deletions src/infiniop/ops/linear/cuda/kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#ifndef __LINEAR_KERNEL_CUH__
#define __LINEAR_KERNEL_CUH__

// Dequantizing epilogue for an asymmetric (zero-point) int8 GEMM, with bias.
//
// Given the raw int32 accumulator P = y_packed[row,col] = sum_k xq[row,k]*wq[k,col],
// with x = sx*(xq - zx) and w = sw*(wq - zw), the true product expands to
//   sx*sw * (P - zw*sum_k xq[row,k] - zx*sum_k wq[k,col] + K*zx*zw)
// and the final output is  y = alpha * product + beta * c + bias[col].
//
// Expected launch: 2D grid/block with row = blockIdx.y*blockDim.y+threadIdx.y
// and col = blockIdx.x*blockDim.x+threadIdx.x; out-of-range threads exit early.
// x_packed is (M,K) row-major, w_packed is (K,N) row-major, y/c are (M,N).
// NOTE: the w_packed column walk (ind*N + col) is coalesced across the warp;
// the x_packed row walk is strided per thread.
template <typename Tdata>
__device__ void postKernel(Tdata *y, int32_t *y_packed, const Tdata *c, const Tdata *bias, const int8_t *x_packed, const Tdata *x_scale, const Tdata *x_zero, const int8_t *w_packed, const Tdata *w_scale, const Tdata *w_zero, int M, int K, int N, float alpha, float beta) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= M || col >= N) {
        return;
    }
    int idx = row * N + col;
    float sx = (float)x_scale[row];
    float sw = (float)w_scale[col];
    float zx = (float)x_zero[row];
    float zw = (float)w_zero[col];
    // Accumulate the int8 row/column sums exactly in int32 (|sum| <= 127*K),
    // then apply the float correction factors once. This replaces the original
    // per-iteration float multiplies: fewer FLOPs and no accumulated rounding.
    int32_t sum_x = 0;
    int32_t sum_w = 0;
    for (int ind = 0; ind < K; ind++) {
        sum_x += (int32_t)x_packed[row * K + ind];
        sum_w += (int32_t)w_packed[ind * N + col];
    }
    float product = sx * sw * ((float)y_packed[idx] + (float)K * zx * zw - zw * (float)sum_x - zx * (float)sum_w);
    float output = alpha * product + beta * (float)c[idx] + (float)bias[col];

    y[idx] = static_cast<Tdata>(output);
}

// Dequantizing epilogue for an asymmetric (zero-point) int8 GEMM, bias-free
// overload. Identical math to the bias variant minus the "+ bias[col]" term:
//   y = alpha * sx*sw*(P - zw*sum_x - zx*sum_w + K*zx*zw) + beta * c
// where P = y_packed[row,col] and sum_x/sum_w are the int8 row/column sums.
//
// Expected launch: 2D grid/block; out-of-range (row,col) threads exit early.
// x_packed is (M,K) row-major, w_packed is (K,N) row-major, y/c are (M,N).
template <typename Tdata>
__device__ void postKernel(Tdata *y, int32_t *y_packed, const Tdata *c, const int8_t *x_packed, const Tdata *x_scale, const Tdata *x_zero, const int8_t *w_packed, const Tdata *w_scale, const Tdata *w_zero, int M, int K, int N, float alpha, float beta) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= M || col >= N) {
        return;
    }
    int idx = row * N + col;
    float sx = (float)x_scale[row];
    float sw = (float)w_scale[col];
    float zx = (float)x_zero[row];
    float zw = (float)w_zero[col];
    // Exact int32 accumulation of the int8 sums (|sum| <= 127*K), correction
    // factors applied once — cheaper and more precise than multiplying each
    // element by a float factor inside the loop as the original did.
    int32_t sum_x = 0;
    int32_t sum_w = 0;
    for (int ind = 0; ind < K; ind++) {
        sum_x += (int32_t)x_packed[row * K + ind];
        sum_w += (int32_t)w_packed[ind * N + col];
    }
    float product = sx * sw * ((float)y_packed[idx] + (float)K * zx * zw - zw * (float)sum_x - zx * (float)sum_w);
    float output = alpha * product + beta * (float)c[idx];

    y[idx] = static_cast<Tdata>(output);
}

// Dequantizing epilogue for a symmetric (zero-point-free) int8 GEMM, with
// bias:  y = alpha * sx[row]*sw[col]*P + beta * c + bias[col], where P is the
// raw int32 accumulator y_packed[row,col]. x_packed, w_packed and K are
// unused here (no zero-point correction) and kept only so the signature
// mirrors the asymmetric postKernel overload.
// Expected launch: 2D grid/block; row from the y dimension, col from x.
template <typename Tdata>
__device__ void postSymKernel(Tdata *y, int32_t *y_packed, const Tdata *c, const Tdata *bias, const int8_t *x_packed, const Tdata *x_scale, const int8_t *w_packed, const Tdata *w_scale, int M, int K, int N, float alpha, float beta) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int n = blockIdx.x * blockDim.x + threadIdx.x;
    if (r < M && n < N) {
        const int offset = r * N + n;
        const float dequant = (float)x_scale[r] * (float)w_scale[n] * ((float)y_packed[offset]);
        const float result = alpha * dequant + beta * (float)c[offset] + (float)bias[n];
        y[offset] = static_cast<Tdata>(result);
    }
}
// Dequantizing epilogue for a symmetric (zero-point-free) int8 GEMM, bias-free
// overload:  y = alpha * sx[row]*sw[col]*P + beta * c, with P the raw int32
// accumulator y_packed[row,col]. x_packed, w_packed and K are unused (no
// zero-point correction); they exist for signature symmetry with postKernel.
// Expected launch: 2D grid/block; row from the y dimension, col from x.
template <typename Tdata>
__device__ void postSymKernel(Tdata *y, int32_t *y_packed, const Tdata *c, const int8_t *x_packed, const Tdata *x_scale, const int8_t *w_packed, const Tdata *w_scale, int M, int K, int N, float alpha, float beta) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int n = blockIdx.x * blockDim.x + threadIdx.x;
    if (r < M && n < N) {
        const int offset = r * N + n;
        const float dequant = (float)x_scale[r] * (float)w_scale[n] * ((float)y_packed[offset]);
        y[offset] = static_cast<Tdata>(alpha * dequant + beta * (float)c[offset]);
    }
}
#endif // __LINEAR_KERNEL_CUH__
77 changes: 77 additions & 0 deletions src/infiniop/ops/linear/info.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#ifndef __LINEAR_INFO_H__
#define __LINEAR_INFO_H__

#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"

namespace op::linear {

// Validated shape/dtype/scalar metadata for the quantized Linear op.
// Construct only through createLinearInfo(), which performs all checks.
class LinearInfo {
private:
    LinearInfo() = default;

public:
    // dtype: element type of d/c/scales (F16/BF16/F32);
    // packed_type: element type of the quantized x/weights tensors (I8).
    infiniDtype_t dtype, packed_type;
    // GEMM dimensions: d and c are (M, N); x is (M, K); weights is (K, N).
    size_t M, K, N;
    float alpha, beta;

    // Validates the tensor descriptors for d = alpha*(x@w) + beta*c [+ bias]
    // and packs the result. Returns INFINI_STATUS_NULL_POINTER,
    // INFINI_STATUS_BAD_TENSOR_DTYPE or INFINI_STATUS_BAD_TENSOR_SHAPE on
    // invalid input. bias/zero descriptors are not validated here —
    // NOTE(review): presumably optional (symmetric / bias-free variants).
    static utils::Result<LinearInfo> createLinearInfo(
        infiniopTensorDescriptor_t d_desc,
        infiniopTensorDescriptor_t c_desc,
        infiniopTensorDescriptor_t bias_desc,
        infiniopTensorDescriptor_t x_desc,
        infiniopTensorDescriptor_t x_scale_desc,
        infiniopTensorDescriptor_t x_zero_desc,
        infiniopTensorDescriptor_t weights_desc,
        infiniopTensorDescriptor_t weights_scale_desc,
        infiniopTensorDescriptor_t weights_zero_desc,
        float alpha,
        float beta) {

        CHECK_OR_RETURN(
            d_desc != nullptr && c_desc != nullptr && x_desc != nullptr && x_scale_desc != nullptr && weights_desc != nullptr && weights_scale_desc != nullptr,
            INFINI_STATUS_NULL_POINTER);

        const infiniDtype_t dtype = d_desc->dtype();
        const infiniDtype_t packed_type = x_desc->dtype();
        CHECK_OR_RETURN(dtype == c_desc->dtype() && dtype == x_scale_desc->dtype() && dtype == weights_scale_desc->dtype(),
                        INFINI_STATUS_BAD_TENSOR_DTYPE);
        CHECK_OR_RETURN(packed_type == weights_desc->dtype(),
                        INFINI_STATUS_BAD_TENSOR_DTYPE);
        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
        CHECK_DTYPE(packed_type, INFINI_DTYPE_I8);

        CHECK_OR_RETURN(d_desc->ndim() == 2
                            && c_desc->ndim() == 2
                            && x_desc->ndim() == 2
                            && x_scale_desc->ndim() == 2
                            && weights_desc->ndim() == 2
                            && weights_scale_desc->ndim() == 2,
                        INFINI_STATUS_BAD_TENSOR_SHAPE);

        size_t M = d_desc->dim(0);
        size_t N = d_desc->dim(1);
        size_t K = x_desc->dim(1);

        // BUGFIX: these conditions were chained with ||, so a single matching
        // dimension let arbitrarily mismatched tensors through. ALL must hold.
        // Also check c (M, N) and weights (K, N), which were not checked at
        // all; the kernels index c[row*N+col] and w_packed[ind*N+col].
        CHECK_OR_RETURN(M == x_desc->dim(0)
                            && M == x_scale_desc->dim(0)
                            && 1 == x_scale_desc->dim(1)
                            && 1 == weights_scale_desc->dim(0)
                            && N == weights_scale_desc->dim(1)
                            && M == c_desc->dim(0)
                            && N == c_desc->dim(1)
                            && K == weights_desc->dim(0)
                            && N == weights_desc->dim(1),
                        INFINI_STATUS_BAD_TENSOR_SHAPE);

        return utils::Result<LinearInfo>(LinearInfo{
            dtype,
            packed_type,
            M,
            K,
            N,
            alpha,
            beta});
    }
};

} // namespace op::linear

#endif // __LINEAR_INFO_H__
54 changes: 54 additions & 0 deletions src/infiniop/ops/linear/linear.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#ifndef __LINEAR_H__
#define __LINEAR_H__

#include "../../operator.h"
#include "info.h"

/*
 * DESCRIPTOR(NAMESPACE) stamps out the backend-specific Descriptor class for
 * the Linear op inside namespace op::linear::NAMESPACE (e.g. one per device
 * backend). The generated class:
 *
 *   - holds a backend-private Opaque* (defined in the backend's .cc/.cu),
 *     the validated LinearInfo, and the required workspace byte count;
 *   - create(...)        validates the tensor descriptors and allocates a
 *                        Descriptor (defined per backend);
 *   - minWorkspaceSize() reports the scratch bytes calculate() needs;
 *   - launchKernel<BLOCK_SIZE, Tdata>(...) is the typed kernel dispatch used
 *     by the backend implementation;
 *   - calculate(...)     runs d = f(x, weights, c, bias, scales, zeros) on
 *                        the given stream, using the caller's workspace.
 *
 * NOTE: documentation is kept outside the macro on purpose — a // comment
 * before a line-continuation backslash would swallow the lines that follow.
 */
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::linear::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
LinearInfo _info; \
size_t _workspace_size; \
\
Descriptor(Opaque *opaque, LinearInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), _info(info), _workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t minWorkspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, Descriptor **desc_ptr, \
infiniopTensorDescriptor_t d_desc, \
infiniopTensorDescriptor_t c_desc, \
infiniopTensorDescriptor_t bias_desc, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t x_scale_desc, \
infiniopTensorDescriptor_t x_zero_desc, \
infiniopTensorDescriptor_t weights_desc, \
infiniopTensorDescriptor_t weights_scale_desc, \
infiniopTensorDescriptor_t weights_zero_desc, \
float alpha, \
float beta); \
template <unsigned int BLOCK_SIZE, typename Tdata> \
infiniStatus_t launchKernel(const LinearInfo &info, Tdata *y, \
const Tdata *c, const Tdata *bias, const int8_t *x_packed, \
const Tdata *x_scale, const Tdata *x_zero, const int8_t *w_packed, \
const Tdata *w_scale, const Tdata *w_zero, void *stream, void *workspace) const; \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *d, const void *c, const void *bias, const void *x, \
const void *x_scale, const void *x_zero, const void *weights, \
const void *weights_scale, const void *weights_zero, void *stream) const; \
}; \
}

#endif // __LINEAR_H__
Loading