diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp index 0937a4821..0a672d334 100644 --- a/include/infinicore/ops.hpp +++ b/include/infinicore/ops.hpp @@ -4,6 +4,7 @@ #include "ops/attention.hpp" #include "ops/causal_softmax.hpp" #include "ops/matmul.hpp" +#include "ops/maximum.hpp" #include "ops/ones.hpp" #include "ops/rearrange.hpp" #include "ops/rms_norm.hpp" diff --git a/include/infinicore/ops/maximum.hpp b/include/infinicore/ops/maximum.hpp new file mode 100644 index 000000000..9fd16bf03 --- /dev/null +++ b/include/infinicore/ops/maximum.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include "../device.hpp" +#include "common/op.hpp" + +namespace infinicore::op { +class Maximum { +public: + using schema = void (*)(Tensor, Tensor, Tensor); + static void execute(Tensor c, Tensor a, Tensor b); + static common::OpDispatcher &dispatcher(); +}; + +Tensor maximum(Tensor a, Tensor b); +void maximum_(Tensor c, Tensor a, Tensor b); +} // namespace infinicore::op diff --git a/include/infiniop.h b/include/infiniop.h index 92e6f5963..b02df1f23 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -13,6 +13,7 @@ #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" +#include "infiniop/ops/maximum.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/random_sample.h" diff --git a/include/infiniop/ops/maximum.h b/include/infiniop/ops/maximum.h new file mode 100644 index 000000000..882cc4afc --- /dev/null +++ b/include/infiniop/ops/maximum.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_MAXIMUM_API_H__ +#define __INFINIOP_MAXIMUM_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMaximumDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMaximumDescriptor(infiniopHandle_t handle, + infiniopMaximumDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export 
infiniStatus_t infiniopGetMaximumWorkspaceSize(infiniopMaximumDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMaximum(infiniopMaximumDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMaximumDescriptor(infiniopMaximumDescriptor_t desc); + +#endif diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 5c541ec3c..4806468c1 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -42,6 +42,7 @@ from infinicore.ops.add import add from infinicore.ops.attention import attention from infinicore.ops.matmul import matmul +from infinicore.ops.maximum import maximum from infinicore.ops.mul import mul from infinicore.ops.narrow import narrow from infinicore.ops.rearrange import rearrange @@ -102,6 +103,7 @@ "add", "attention", "matmul", + "maximum", "mul", "narrow", "rearrange", diff --git a/python/infinicore/ops/maximum.py b/python/infinicore/ops/maximum.py new file mode 100644 index 000000000..2c3afdc6a --- /dev/null +++ b/python/infinicore/ops/maximum.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def maximum(input, other, *, out=None): + if out is None: + return Tensor(_infinicore.maximum(input._underlying, other._underlying)) + + _infinicore.maximum_(out._underlying, input._underlying, other._underlying) + + return out diff --git a/src/infinicore/ops/maximum/maximum.cc b/src/infinicore/ops/maximum/maximum.cc new file mode 100644 index 000000000..a7832b2d7 --- /dev/null +++ b/src/infinicore/ops/maximum/maximum.cc @@ -0,0 +1,28 @@ +#include "infinicore/ops/maximum.hpp" + +#include "../../utils.hpp" + +namespace infinicore::op { + +common::OpDispatcher &Maximum::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +void Maximum::execute(Tensor c, Tensor a, Tensor b) { + 
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b); + infinicore::context::setDevice(c->device()); + dispatcher().lookup(c->device().getType())(c, a, b); +} + +Tensor maximum(Tensor a, Tensor b) { + auto c = Tensor::empty(a->shape(), a->dtype(), a->device()); + maximum_(c, a, b); + return c; +} + +void maximum_(Tensor c, Tensor a, Tensor b) { + Maximum::execute(c, a, b); +} + +} // namespace infinicore::op diff --git a/src/infinicore/ops/maximum/maximum_infiniop.cc b/src/infinicore/ops/maximum/maximum_infiniop.cc new file mode 100644 index 000000000..7fdbc3a18 --- /dev/null +++ b/src/infinicore/ops/maximum/maximum_infiniop.cc @@ -0,0 +1,52 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include "infinicore/ops/maximum.hpp" +#include + +namespace infinicore::op::maximum_impl::infiniop { + +thread_local common::OpCache caches( + 100, // capacity + [](infiniopMaximumDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyMaximumDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor c, Tensor a, Tensor b) { + size_t seed = hash_combine(c, b, a); + + auto device_type = context::getDevice().getType(); + auto device_index = context::getDevice().getIndex(); + + auto &cache = caches.getCache(device_type, device_index); + + auto desc_opt = cache.get(seed); + infiniopMaximumDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateMaximumDescriptor( + context::getInfiniopHandle(c->device()), &desc, + c->desc(), a->desc(), b->desc())); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetMaximumWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopMaximum( + desc, workspace->data(), workspace_size, + c->data(), a->data(), b->data(), context::getStream())); +} + +static bool 
registered = []() { + Maximum::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::maximum_impl::infiniop diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp index 978defa17..bac9c31ce 100644 --- a/src/infinicore/pybind11/ops.hpp +++ b/src/infinicore/pybind11/ops.hpp @@ -8,6 +8,7 @@ #include "ops/embedding.hpp" #include "ops/linear.hpp" #include "ops/matmul.hpp" +#include "ops/maximum.hpp" #include "ops/mul.hpp" #include "ops/random_sample.hpp" #include "ops/rearrange.hpp" @@ -27,6 +28,7 @@ inline void bind(py::module &m) { bind_random_sample(m); bind_linear(m); bind_matmul(m); + bind_maximum(m); bind_mul(m); bind_rearrange(m); bind_rms_norm(m); diff --git a/src/infinicore/pybind11/ops/maximum.hpp b/src/infinicore/pybind11/ops/maximum.hpp new file mode 100644 index 000000000..bec2ed0b2 --- /dev/null +++ b/src/infinicore/pybind11/ops/maximum.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include "infinicore/ops/maximum.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_maximum(py::module &m) { + m.def("maximum", + &op::maximum, + py::arg("a"), + py::arg("b"), + R"doc(Element-wise maximum of two tensors.)doc"); + + m.def("maximum_", + &op::maximum_, + py::arg("c"), + py::arg("a"), + py::arg("b"), + R"doc(In-place element-wise tensor maximum.)doc"); +} + +} // namespace infinicore::ops diff --git a/src/infiniop/ops/maximum/bang/maximum_bang.h b/src/infiniop/ops/maximum/bang/maximum_bang.h new file mode 100644 index 000000000..c058d6e9b --- /dev/null +++ b/src/infiniop/ops/maximum/bang/maximum_bang.h @@ -0,0 +1,8 @@ +#ifndef __MAXIMUM_BANG_API_H__ +#define __MAXIMUM_BANG_API_H__ + +#include "../../../elementwise/bang/elementwise_bang.h" + +ELEMENTWISE_DESCRIPTOR(maximum, bang) + +#endif // __MAXIMUM_BANG_API_H__ diff --git a/src/infiniop/ops/maximum/bang/maximum_bang.mlu b/src/infiniop/ops/maximum/bang/maximum_bang.mlu new file mode 100644 index 
000000000..b083dd4d0 --- /dev/null +++ b/src/infiniop/ops/maximum/bang/maximum_bang.mlu @@ -0,0 +1,53 @@ +#include "maximum_bang.h" +// Operator Interface Declaration +LAUNCH_ELEMENTWISE_KERNEL(Maximum) +namespace op::maximum::bang { +typedef struct MaximumOp { + static constexpr size_t num_inputs = 2; + template + static infiniStatus_t launch(Args... args) { + launchMaximumKernel(args...); + return INFINI_STATUS_SUCCESS; + } +} MaximumOp; +Descriptor::~Descriptor() = default; +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + // create Bang elementwise descriptor + CREATE_ELEMENTWISE_BANG_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + return INFINI_STATUS_SUCCESS; +} +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *queue) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, workspace, output, inputs, queue); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, workspace, output, inputs, queue); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, workspace, output, inputs, queue); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::maximum::bang \ No newline at end of file diff --git a/src/infiniop/ops/maximum/bang/maximum_bang_internal.mlu 
b/src/infiniop/ops/maximum/bang/maximum_bang_internal.mlu new file mode 100644 index 000000000..d7bce0fbc --- /dev/null +++ b/src/infiniop/ops/maximum/bang/maximum_bang_internal.mlu @@ -0,0 +1,22 @@ +#ifndef __MAXIMUM_BANG_INTERNAL_H__ +#define __MAXIMUM_BANG_INTERNAL_H__ +#include "../../../elementwise/bang/elementwise_bang_kernel.h" +typedef struct MaximumOp { +public: + static constexpr size_t num_inputs = 2; + template + __mlu_device__ void operator()(T *out, const T *a, const T *b, size_t num_elements) const { + if constexpr (std::is_same_v || std::is_same_v || std::is_same_v) { + __bang_maximum(out, a, b, num_elements); + } else { + for (size_t i = 0; i < num_elements; ++i) { + out[i] = a[i] > b[i] ? a[i] : b[i]; + } + } + } +} MaximumOp; +LAUNCH_ELEMENTWISE_KERNEL_IMPL(Maximum, MaximumOp) +LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(Maximum, half) +LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(Maximum, bfloat16_t) +LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(Maximum, float) +#endif // __MAXIMUM_BANG_INTERNAL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/maximum/cpu/maximum_cpu.cc b/src/infiniop/ops/maximum/cpu/maximum_cpu.cc new file mode 100644 index 000000000..0f91d7f64 --- /dev/null +++ b/src/infiniop/ops/maximum/cpu/maximum_cpu.cc @@ -0,0 +1,58 @@ +#include "maximum_cpu.h" + +namespace op::maximum::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64); + + CHECK_SAME_SHAPE(c_shape, 
a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::maximum::cpu diff --git a/src/infiniop/ops/maximum/cpu/maximum_cpu.h b/src/infiniop/ops/maximum/cpu/maximum_cpu.h new file mode 100644 index 000000000..82cc50ef3 --- /dev/null +++ b/src/infiniop/ops/maximum/cpu/maximum_cpu.h @@ -0,0 +1,19 @@ +#ifndef __MAXIMUM_CPU_H__ +#define __MAXIMUM_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(maximum, cpu) + +namespace op::maximum::cpu { +typedef struct MaximumOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return a > b ? 
a : b; + } +} MaximumOp; +} // namespace op::maximum::cpu + +#endif // __MAXIMUM_CPU_H__ diff --git a/src/infiniop/ops/maximum/operator.cc b/src/infiniop/ops/maximum/operator.cc new file mode 100644 index 000000000..162e88ffa --- /dev/null +++ b/src/infiniop/ops/maximum/operator.cc @@ -0,0 +1,201 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/maximum.h" + +#ifdef ENABLE_CPU_API +#include "cpu/maximum_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/maximum_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/maximum_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/maximum_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/maximum_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/maximum_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMaximumDescriptor( + infiniopHandle_t handle, + infiniopMaximumDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::maximum::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C 
infiniStatus_t infiniopGetMaximumWorkspaceSize(infiniopMaximumDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopMaximum( + infiniopMaximumDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t 
+infiniopDestroyMaximumDescriptor(infiniopMaximumDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infinicore/ops/maximum.py b/test/infinicore/ops/maximum.py index 0e3fc19c6..e89b8f179 100644 --- a/test/infinicore/ops/maximum.py +++ b/test/infinicore/ops/maximum.py @@ -98,9 +98,9 @@ def get_test_cases(self): def torch_operator(self, *args, **kwargs): return torch.maximum(*args, **kwargs) - # def infinicore_operator(self, *args, **kwargs): - # """InfiniCore implementation (operator not yet available).""" - # return infinicore.maximum(*args, **kwargs) + def infinicore_operator(self, *args, **kwargs): + """InfiniCore implementation of element-wise maximum.""" + return infinicore.maximum(*args, **kwargs) def main():