Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions include/infiniop/ops/linear.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#ifndef __INFINIOP_LINEAR_API_H__
#define __INFINIOP_LINEAR_API_H__

#include "../operator_descriptor.h"

/* Opaque handle for a quantized Linear (int8 GEMM + dequantize) operator. */
typedef InfiniopDescriptor *infiniopLinearDescriptor_t;

/* Creates a Linear descriptor computing, per the CUDA kernels of this op:
 *     d = alpha * dequant(x_packed @ weights_packed) + beta * c [+ bias]
 * where x/weights are int8 with per-row (x) / per-column (weights) scales
 * and optional zero-points (asymmetric quantization).
 *
 * handle             - runtime handle for the target device.
 * desc_ptr           - receives the newly created descriptor.
 * d_desc             - output tensor, shape (M, N).
 * c_desc             - additive input tensor, shape (M, N).
 * bias_desc          - optional per-column bias; NOTE(review): presumably may
 *                      be NULL for the bias-free variant — confirm with impl.
 * x_desc             - quantized int8 activations, shape (M, K).
 * x_scale_desc       - activation scales, shape (M, 1).
 * x_zero_desc        - activation zero-points (asymmetric mode).
 * weights_desc       - quantized int8 weights, shape (K, N).
 * weights_scale_desc - weight scales, shape (1, N).
 * weights_zero_desc  - weight zero-points (asymmetric mode).
 * alpha, beta        - GEMM-style scaling factors applied as above.
 */
__C __export infiniStatus_t infiniopCreateLinearDescriptor(infiniopHandle_t handle,
infiniopLinearDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t d_desc,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t bias_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t weights_desc,
infiniopTensorDescriptor_t weights_scale_desc,
infiniopTensorDescriptor_t weights_zero_desc,
float alpha,
float beta);

/* Queries the scratch-workspace size (in bytes) required by infiniopLinear. */
__C __export infiniStatus_t infiniopGetLinearWorkspaceSize(infiniopLinearDescriptor_t desc, size_t *size);

/* Executes the Linear op described by desc.
 * workspace/workspace_size - device scratch buffer of at least the queried size.
 * d                        - output buffer (M, N).
 * c, bias, x, x_scale, x_zero, weights, weights_scale, weights_zero
 *                          - device pointers matching the descriptors above.
 * stream                   - backend stream/queue; semantics are backend-defined.
 */
__C __export infiniStatus_t infiniopLinear(infiniopLinearDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *d,
const void *c,
const void *bias,
const void *x,
const void *x_scale,
const void *x_zero,
const void *weights,
const void *weights_scale,
const void *weights_zero,
void *stream);

/* Releases the descriptor and any backend resources it owns. */
__C __export infiniStatus_t infiniopDestroyLinearDescriptor(infiniopLinearDescriptor_t desc);

#endif
28 changes: 28 additions & 0 deletions include/infiniop/ops/quant.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef __INFINIOP_QUANT_API_H__
#define __INFINIOP_QUANT_API_H__

#include "../operator_descriptor.h"

/* Opaque handle for a Quant operator that converts a floating-point tensor
 * into its packed integer representation plus quantization parameters. */
typedef InfiniopDescriptor *infiniopQuantDescriptor_t;

/* Creates a Quant descriptor.
 * handle        - runtime handle for the target device.
 * desc_ptr      - receives the newly created descriptor.
 * x_packed_desc - output: quantized integer tensor (int8 per the Linear op
 *                 that consumes it — NOTE(review): confirm with the impl).
 * x_scale_desc  - output: quantization scales.
 * x_zero_desc   - output: quantization zero-points.
 * x_desc        - input: floating-point tensor to quantize.
 */
__C __export infiniStatus_t infiniopCreateQuantDescriptor(infiniopHandle_t handle,
infiniopQuantDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t x_desc);

/* Queries the scratch-workspace size (in bytes) required by infiniopQuant. */
__C __export infiniStatus_t infiniopGetQuantWorkspaceSize(infiniopQuantDescriptor_t desc, size_t *size);

/* Executes quantization: reads x, writes x_packed, x_scale, x_zero.
 * workspace/workspace_size - device scratch buffer of at least the queried size.
 * stream                   - backend stream/queue; semantics are backend-defined.
 */
__C __export infiniStatus_t infiniopQuant(infiniopQuantDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *x_packed,
void *x_scale,
void *x_zero,
const void *x,
void *stream);

/* Releases the descriptor and any backend resources it owns. */
__C __export infiniStatus_t infiniopDestroyQuantDescriptor(infiniopQuantDescriptor_t desc);

#endif
76 changes: 76 additions & 0 deletions src/infiniop/ops/linear/cuda/kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#ifndef __LINEAR_KERNEL_CUH__
#define __LINEAR_KERNEL_CUH__

// Dequantizing epilogue for an asymmetric (zero-point) int8 GEMM, with bias.
//
// Given the raw int32 accumulator P = y_packed[row,col] = sum_k xq[row,k]*wq[k,col],
// with x = sx*(xq - zx) and w = sw*(wq - zw), the true product expands to
//   sx*sw * (P - zw*sum_k xq[row,k] - zx*sum_k wq[k,col] + K*zx*zw)
// and the final output is  y = alpha * product + beta * c + bias[col].
//
// Expected launch: 2D grid/block with row = blockIdx.y*blockDim.y+threadIdx.y
// and col = blockIdx.x*blockDim.x+threadIdx.x; out-of-range threads exit early.
// x_packed is (M,K) row-major, w_packed is (K,N) row-major, y/c are (M,N).
// NOTE: the w_packed column walk (ind*N + col) is coalesced across the warp;
// the x_packed row walk is strided per thread.
template <typename Tdata>
__device__ void postKernel(Tdata *y, int32_t *y_packed, const Tdata *c, const Tdata *bias, const int8_t *x_packed, const Tdata *x_scale, const Tdata *x_zero, const int8_t *w_packed, const Tdata *w_scale, const Tdata *w_zero, int M, int K, int N, float alpha, float beta) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= M || col >= N) {
        return;
    }
    int idx = row * N + col;
    float sx = (float)x_scale[row];
    float sw = (float)w_scale[col];
    float zx = (float)x_zero[row];
    float zw = (float)w_zero[col];
    // Accumulate the int8 row/column sums exactly in int32 (|sum| <= 127*K),
    // then apply the float correction factors once. This replaces the original
    // per-iteration float multiplies: fewer FLOPs and no accumulated rounding.
    int32_t sum_x = 0;
    int32_t sum_w = 0;
    for (int ind = 0; ind < K; ind++) {
        sum_x += (int32_t)x_packed[row * K + ind];
        sum_w += (int32_t)w_packed[ind * N + col];
    }
    float product = sx * sw * ((float)y_packed[idx] + (float)K * zx * zw - zw * (float)sum_x - zx * (float)sum_w);
    float output = alpha * product + beta * (float)c[idx] + (float)bias[col];

    y[idx] = static_cast<Tdata>(output);
}

// Dequantizing epilogue for an asymmetric (zero-point) int8 GEMM, bias-free
// overload. Identical math to the bias variant minus the "+ bias[col]" term:
//   y = alpha * sx*sw*(P - zw*sum_x - zx*sum_w + K*zx*zw) + beta * c
// where P = y_packed[row,col] and sum_x/sum_w are the int8 row/column sums.
//
// Expected launch: 2D grid/block; out-of-range (row,col) threads exit early.
// x_packed is (M,K) row-major, w_packed is (K,N) row-major, y/c are (M,N).
template <typename Tdata>
__device__ void postKernel(Tdata *y, int32_t *y_packed, const Tdata *c, const int8_t *x_packed, const Tdata *x_scale, const Tdata *x_zero, const int8_t *w_packed, const Tdata *w_scale, const Tdata *w_zero, int M, int K, int N, float alpha, float beta) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= M || col >= N) {
        return;
    }
    int idx = row * N + col;
    float sx = (float)x_scale[row];
    float sw = (float)w_scale[col];
    float zx = (float)x_zero[row];
    float zw = (float)w_zero[col];
    // Exact int32 accumulation of the int8 sums (|sum| <= 127*K), correction
    // factors applied once — cheaper and more precise than multiplying each
    // element by a float factor inside the loop as the original did.
    int32_t sum_x = 0;
    int32_t sum_w = 0;
    for (int ind = 0; ind < K; ind++) {
        sum_x += (int32_t)x_packed[row * K + ind];
        sum_w += (int32_t)w_packed[ind * N + col];
    }
    float product = sx * sw * ((float)y_packed[idx] + (float)K * zx * zw - zw * (float)sum_x - zx * (float)sum_w);
    float output = alpha * product + beta * (float)c[idx];

    y[idx] = static_cast<Tdata>(output);
}

// Dequantizing epilogue for a symmetric (zero-point-free) int8 GEMM, with
// bias:  y = alpha * sx[row]*sw[col]*P + beta * c + bias[col], where P is the
// raw int32 accumulator y_packed[row,col]. x_packed, w_packed and K are
// unused here (no zero-point correction) and kept only so the signature
// mirrors the asymmetric postKernel overload.
// Expected launch: 2D grid/block; row from the y dimension, col from x.
template <typename Tdata>
__device__ void postSymKernel(Tdata *y, int32_t *y_packed, const Tdata *c, const Tdata *bias, const int8_t *x_packed, const Tdata *x_scale, const int8_t *w_packed, const Tdata *w_scale, int M, int K, int N, float alpha, float beta) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int n = blockIdx.x * blockDim.x + threadIdx.x;
    if (r < M && n < N) {
        const int offset = r * N + n;
        const float dequant = (float)x_scale[r] * (float)w_scale[n] * ((float)y_packed[offset]);
        const float result = alpha * dequant + beta * (float)c[offset] + (float)bias[n];
        y[offset] = static_cast<Tdata>(result);
    }
}
// Dequantizing epilogue for a symmetric (zero-point-free) int8 GEMM, bias-free
// overload:  y = alpha * sx[row]*sw[col]*P + beta * c, with P the raw int32
// accumulator y_packed[row,col]. x_packed, w_packed and K are unused (no
// zero-point correction); they exist for signature symmetry with postKernel.
// Expected launch: 2D grid/block; row from the y dimension, col from x.
template <typename Tdata>
__device__ void postSymKernel(Tdata *y, int32_t *y_packed, const Tdata *c, const int8_t *x_packed, const Tdata *x_scale, const int8_t *w_packed, const Tdata *w_scale, int M, int K, int N, float alpha, float beta) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int n = blockIdx.x * blockDim.x + threadIdx.x;
    if (r < M && n < N) {
        const int offset = r * N + n;
        const float dequant = (float)x_scale[r] * (float)w_scale[n] * ((float)y_packed[offset]);
        y[offset] = static_cast<Tdata>(alpha * dequant + beta * (float)c[offset]);
    }
}
#endif // __LINEAR_KERNEL_CUH__
77 changes: 77 additions & 0 deletions src/infiniop/ops/linear/info.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#ifndef __LINEAR_INFO_H__
#define __LINEAR_INFO_H__

#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"

namespace op::linear {

// Validated shape/dtype/scalar metadata for the quantized Linear op.
// Construct only through createLinearInfo(), which performs all checks.
class LinearInfo {
private:
    LinearInfo() = default;

public:
    // dtype: element type of d/c/scales (F16/BF16/F32);
    // packed_type: element type of the quantized x/weights tensors (I8).
    infiniDtype_t dtype, packed_type;
    // GEMM dimensions: d and c are (M, N); x is (M, K); weights is (K, N).
    size_t M, K, N;
    float alpha, beta;

    // Validates the tensor descriptors for d = alpha*(x@w) + beta*c [+ bias]
    // and packs the result. Returns INFINI_STATUS_NULL_POINTER,
    // INFINI_STATUS_BAD_TENSOR_DTYPE or INFINI_STATUS_BAD_TENSOR_SHAPE on
    // invalid input. bias/zero descriptors are not validated here —
    // NOTE(review): presumably optional (symmetric / bias-free variants).
    static utils::Result<LinearInfo> createLinearInfo(
        infiniopTensorDescriptor_t d_desc,
        infiniopTensorDescriptor_t c_desc,
        infiniopTensorDescriptor_t bias_desc,
        infiniopTensorDescriptor_t x_desc,
        infiniopTensorDescriptor_t x_scale_desc,
        infiniopTensorDescriptor_t x_zero_desc,
        infiniopTensorDescriptor_t weights_desc,
        infiniopTensorDescriptor_t weights_scale_desc,
        infiniopTensorDescriptor_t weights_zero_desc,
        float alpha,
        float beta) {

        CHECK_OR_RETURN(
            d_desc != nullptr && c_desc != nullptr && x_desc != nullptr && x_scale_desc != nullptr && weights_desc != nullptr && weights_scale_desc != nullptr,
            INFINI_STATUS_NULL_POINTER);

        const infiniDtype_t dtype = d_desc->dtype();
        const infiniDtype_t packed_type = x_desc->dtype();
        CHECK_OR_RETURN(dtype == c_desc->dtype() && dtype == x_scale_desc->dtype() && dtype == weights_scale_desc->dtype(),
                        INFINI_STATUS_BAD_TENSOR_DTYPE);
        CHECK_OR_RETURN(packed_type == weights_desc->dtype(),
                        INFINI_STATUS_BAD_TENSOR_DTYPE);
        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
        CHECK_DTYPE(packed_type, INFINI_DTYPE_I8);

        CHECK_OR_RETURN(d_desc->ndim() == 2
                            && c_desc->ndim() == 2
                            && x_desc->ndim() == 2
                            && x_scale_desc->ndim() == 2
                            && weights_desc->ndim() == 2
                            && weights_scale_desc->ndim() == 2,
                        INFINI_STATUS_BAD_TENSOR_SHAPE);

        size_t M = d_desc->dim(0);
        size_t N = d_desc->dim(1);
        size_t K = x_desc->dim(1);

        // BUGFIX: these conditions were chained with ||, so a single matching
        // dimension let arbitrarily mismatched tensors through. ALL must hold.
        // Also check c (M, N) and weights (K, N), which were not checked at
        // all; the kernels index c[row*N+col] and w_packed[ind*N+col].
        CHECK_OR_RETURN(M == x_desc->dim(0)
                            && M == x_scale_desc->dim(0)
                            && 1 == x_scale_desc->dim(1)
                            && 1 == weights_scale_desc->dim(0)
                            && N == weights_scale_desc->dim(1)
                            && M == c_desc->dim(0)
                            && N == c_desc->dim(1)
                            && K == weights_desc->dim(0)
                            && N == weights_desc->dim(1),
                        INFINI_STATUS_BAD_TENSOR_SHAPE);

        return utils::Result<LinearInfo>(LinearInfo{
            dtype,
            packed_type,
            M,
            K,
            N,
            alpha,
            beta});
    }
};

} // namespace op::linear

#endif // __LINEAR_INFO_H__
54 changes: 54 additions & 0 deletions src/infiniop/ops/linear/linear.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#ifndef __LINEAR_H__
#define __LINEAR_H__

#include "../../operator.h"
#include "info.h"

/*
 * DESCRIPTOR(NAMESPACE) stamps out the backend-specific Descriptor class for
 * the Linear op inside namespace op::linear::NAMESPACE (e.g. one per device
 * backend). The generated class:
 *
 *   - holds a backend-private Opaque* (defined in the backend's .cc/.cu),
 *     the validated LinearInfo, and the required workspace byte count;
 *   - create(...)        validates the tensor descriptors and allocates a
 *                        Descriptor (defined per backend);
 *   - minWorkspaceSize() reports the scratch bytes calculate() needs;
 *   - launchKernel<BLOCK_SIZE, Tdata>(...) is the typed kernel dispatch used
 *     by the backend implementation;
 *   - calculate(...)     runs d = f(x, weights, c, bias, scales, zeros) on
 *                        the given stream, using the caller's workspace.
 *
 * NOTE: documentation is kept outside the macro on purpose — a // comment
 * before a line-continuation backslash would swallow the lines that follow.
 */
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::linear::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
LinearInfo _info; \
size_t _workspace_size; \
\
Descriptor(Opaque *opaque, LinearInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), _info(info), _workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t minWorkspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, Descriptor **desc_ptr, \
infiniopTensorDescriptor_t d_desc, \
infiniopTensorDescriptor_t c_desc, \
infiniopTensorDescriptor_t bias_desc, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t x_scale_desc, \
infiniopTensorDescriptor_t x_zero_desc, \
infiniopTensorDescriptor_t weights_desc, \
infiniopTensorDescriptor_t weights_scale_desc, \
infiniopTensorDescriptor_t weights_zero_desc, \
float alpha, \
float beta); \
template <unsigned int BLOCK_SIZE, typename Tdata> \
infiniStatus_t launchKernel(const LinearInfo &info, Tdata *y, \
const Tdata *c, const Tdata *bias, const int8_t *x_packed, \
const Tdata *x_scale, const Tdata *x_zero, const int8_t *w_packed, \
const Tdata *w_scale, const Tdata *w_zero, void *stream, void *workspace) const; \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *d, const void *c, const void *bias, const void *x, \
const void *x_scale, const void *x_zero, const void *weights, \
const void *weights_scale, const void *weights_zero, void *stream) const; \
}; \
}

#endif // __LINEAR_H__
Loading