1 change: 1 addition & 0 deletions include/infinicore/ops.hpp
@@ -4,6 +4,7 @@
#include "ops/attention.hpp"
#include "ops/causal_softmax.hpp"
#include "ops/matmul.hpp"
#include "ops/maximum.hpp"
#include "ops/ones.hpp"
#include "ops/rearrange.hpp"
#include "ops/rms_norm.hpp"
16 changes: 16 additions & 0 deletions include/infinicore/ops/maximum.hpp
@@ -0,0 +1,16 @@
#pragma once

#include "../device.hpp"
#include "common/op.hpp"

namespace infinicore::op {
class Maximum {
public:
using schema = void (*)(Tensor, Tensor, Tensor);
static void execute(Tensor c, Tensor a, Tensor b);
static common::OpDispatcher<schema> &dispatcher();
};

Tensor maximum(Tensor a, Tensor b);
void maximum_(Tensor c, Tensor a, Tensor b);
} // namespace infinicore::op
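For reference, a minimal call-site sketch of the two entry points declared above. Tensor construction is outside this change: a and b are assumed to be existing infinicore::Tensor objects with identical shape and dtype on the same device, which is exactly what the implementation asserts and the backends check.

#include "infinicore/ops.hpp"

void example(infinicore::Tensor a, infinicore::Tensor b) {
    // Out-of-place form: allocates the result with a's shape, dtype and device.
    infinicore::Tensor c = infinicore::op::maximum(a, b);

    // Out-parameter form: writes the element-wise maximum into the pre-allocated c.
    infinicore::op::maximum_(c, a, b);
}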
1 change: 1 addition & 0 deletions include/infiniop.h
@@ -13,6 +13,7 @@
#include "infiniop/ops/layer_norm.h"
#include "infiniop/ops/logsoftmax.h"
#include "infiniop/ops/lp_norm.h"
#include "infiniop/ops/maximum.h"
#include "infiniop/ops/mul.h"
#include "infiniop/ops/ones.h"
#include "infiniop/ops/random_sample.h"
26 changes: 26 additions & 0 deletions include/infiniop/ops/maximum.h
@@ -0,0 +1,26 @@
#ifndef __INFINIOP_MAXIMUM_API_H__
#define __INFINIOP_MAXIMUM_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopMaximumDescriptor_t;

__C __export infiniStatus_t infiniopCreateMaximumDescriptor(infiniopHandle_t handle,
infiniopMaximumDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c,
infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b);

__C __export infiniStatus_t infiniopGetMaximumWorkspaceSize(infiniopMaximumDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopMaximum(infiniopMaximumDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream);

__C __export infiniStatus_t infiniopDestroyMaximumDescriptor(infiniopMaximumDescriptor_t desc);

#endif
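As with the other infiniop operators, the C API follows a create / query-workspace / launch / destroy lifecycle; the infinicore adapter later in this diff drives it in exactly that order. A minimal sketch of the sequence is below; the handle, tensor descriptors, device buffers, workspace and stream are placeholders assumed to be set up elsewhere, and the returned infiniStatus_t values should be checked in real code.

infiniopMaximumDescriptor_t desc = nullptr;
infiniopCreateMaximumDescriptor(handle, &desc, c_desc, a_desc, b_desc);

size_t workspace_size = 0;
infiniopGetMaximumWorkspaceSize(desc, &workspace_size);

// workspace, c, a, b are device pointers provided by the caller; a and b are read-only.
infiniopMaximum(desc, workspace, workspace_size, c, a, b, stream);

infiniopDestroyMaximumDescriptor(desc);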
2 changes: 2 additions & 0 deletions python/infinicore/__init__.py
@@ -42,6 +42,7 @@
from infinicore.ops.add import add
from infinicore.ops.attention import attention
from infinicore.ops.matmul import matmul
from infinicore.ops.maximum import maximum
from infinicore.ops.mul import mul
from infinicore.ops.narrow import narrow
from infinicore.ops.rearrange import rearrange
@@ -102,6 +103,7 @@
"add",
"attention",
"matmul",
"maximum",
"mul",
"narrow",
"rearrange",
11 changes: 11 additions & 0 deletions python/infinicore/ops/maximum.py
@@ -0,0 +1,11 @@
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def maximum(input, other, *, out=None):
if out is None:
return Tensor(_infinicore.maximum(input._underlying, other._underlying))

_infinicore.maximum_(out._underlying, input._underlying, other._underlying)

return out
28 changes: 28 additions & 0 deletions src/infinicore/ops/maximum/maximum.cc
@@ -0,0 +1,28 @@
#include "infinicore/ops/maximum.hpp"

#include "../../utils.hpp"

namespace infinicore::op {

common::OpDispatcher<Maximum::schema> &Maximum::dispatcher() {
static common::OpDispatcher<Maximum::schema> dispatcher_;
return dispatcher_;
}

void Maximum::execute(Tensor c, Tensor a, Tensor b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
infinicore::context::setDevice(c->device());
dispatcher().lookup(c->device().getType())(c, a, b);
}

Tensor maximum(Tensor a, Tensor b) {
auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
maximum_(c, a, b);
return c;
}

void maximum_(Tensor c, Tensor a, Tensor b) {
Maximum::execute(c, a, b);
}

} // namespace infinicore::op
52 changes: 52 additions & 0 deletions src/infinicore/ops/maximum/maximum_infiniop.cc
@@ -0,0 +1,52 @@
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/maximum.hpp"
#include <infiniop.h>

namespace infinicore::op::maximum_impl::infiniop {

thread_local common::OpCache<size_t, infiniopMaximumDescriptor_t> caches(
100, // capacity
[](infiniopMaximumDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyMaximumDescriptor(desc));
desc = nullptr;
}
});

void calculate(Tensor c, Tensor a, Tensor b) {
size_t seed = hash_combine(c, b, a);

auto device_type = context::getDevice().getType();
auto device_index = context::getDevice().getIndex();

auto &cache = caches.getCache(device_type, device_index);

auto desc_opt = cache.get(seed);
infiniopMaximumDescriptor_t desc = nullptr;

if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateMaximumDescriptor(
context::getInfiniopHandle(c->device()), &desc,
c->desc(), a->desc(), b->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}

size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetMaximumWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);

INFINICORE_CHECK_ERROR(infiniopMaximum(
desc, workspace->data(), workspace_size,
c->data(), a->data(), b->data(), context::getStream()));
}

static bool registered = []() {
Maximum::dispatcher().registerAll(&calculate, false);
return true;
}();

} // namespace infinicore::op::maximum_impl::infiniop
2 changes: 2 additions & 0 deletions src/infinicore/pybind11/ops.hpp
@@ -8,6 +8,7 @@
#include "ops/embedding.hpp"
#include "ops/linear.hpp"
#include "ops/matmul.hpp"
#include "ops/maximum.hpp"
#include "ops/mul.hpp"
#include "ops/random_sample.hpp"
#include "ops/rearrange.hpp"
@@ -27,6 +28,7 @@ inline void bind(py::module &m) {
bind_random_sample(m);
bind_linear(m);
bind_matmul(m);
bind_maximum(m);
bind_mul(m);
bind_rearrange(m);
bind_rms_norm(m);
26 changes: 26 additions & 0 deletions src/infinicore/pybind11/ops/maximum.hpp
@@ -0,0 +1,26 @@
#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/maximum.hpp"

namespace py = pybind11;

namespace infinicore::ops {

inline void bind_maximum(py::module &m) {
m.def("maximum",
&op::maximum,
py::arg("a"),
py::arg("b"),
R"doc(Element-wise maximum of two tensors.)doc");

m.def("maximum_",
&op::maximum_,
py::arg("c"),
py::arg("a"),
py::arg("b"),
R"doc(In-place element-wise tensor maximum.)doc");
}

} // namespace infinicore::ops
8 changes: 8 additions & 0 deletions src/infiniop/ops/maximum/bang/maximum_bang.h
@@ -0,0 +1,8 @@
#ifndef __MAXIMUM_BANG_API_H__
#define __MAXIMUM_BANG_API_H__

#include "../../../elementwise/bang/elementwise_bang.h"

ELEMENTWISE_DESCRIPTOR(maximum, bang)

#endif // __MAXIMUM_BANG_API_H__
53 changes: 53 additions & 0 deletions src/infiniop/ops/maximum/bang/maximum_bang.mlu
@@ -0,0 +1,53 @@
#include "maximum_bang.h"
// Operator Interface Declaration
LAUNCH_ELEMENTWISE_KERNEL(Maximum)
namespace op::maximum::bang {
typedef struct MaximumOp {
static constexpr size_t num_inputs = 2;
template <typename Tdata, typename... Args>
static infiniStatus_t launch(Args... args) {
launchMaximumKernel<Tdata>(args...);
return INFINI_STATUS_SUCCESS;
}
} MaximumOp;
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::bang::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
// create Bang elementwise descriptor
CREATE_ELEMENTWISE_BANG_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *queue) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<MaximumOp, half>(_info, workspace, output, inputs, queue);
case INFINI_DTYPE_BF16:
return _device_info->calculate<MaximumOp, bfloat16_t>(_info, workspace, output, inputs, queue);
case INFINI_DTYPE_F32:
return _device_info->calculate<MaximumOp, float>(_info, workspace, output, inputs, queue);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::maximum::bang
22 changes: 22 additions & 0 deletions src/infiniop/ops/maximum/bang/maximum_bang_internal.mlu
@@ -0,0 +1,22 @@
#ifndef __MAXIMUM_BANG_INTERNAL_H__
#define __MAXIMUM_BANG_INTERNAL_H__
#include "../../../elementwise/bang/elementwise_bang_kernel.h"
typedef struct MaximumOp {
public:
static constexpr size_t num_inputs = 2;
template <typename T>
__mlu_device__ void operator()(T *out, const T *a, const T *b, size_t num_elements) const {
if constexpr (std::is_same_v<T, half> || std::is_same_v<T, bfloat16_t> || std::is_same_v<T, float>) {
__bang_maximum(out, a, b, num_elements);
} else {
for (size_t i = 0; i < num_elements; ++i) {
out[i] = a[i] > b[i] ? a[i] : b[i];
}
}
}
} MaximumOp;
LAUNCH_ELEMENTWISE_KERNEL_IMPL(Maximum, MaximumOp)
LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(Maximum, half)
LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(Maximum, bfloat16_t)
LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(Maximum, float)
#endif // __MAXIMUM_BANG_INTERNAL_H__
58 changes: 58 additions & 0 deletions src/infiniop/ops/maximum/cpu/maximum_cpu.cc
@@ -0,0 +1,58 @@
#include "maximum_cpu.h"

namespace op::maximum::cpu {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();

const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();

CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);

CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

// create CPU elementwise descriptor
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {

switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<MaximumOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<MaximumOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<MaximumOp, double>(_info, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<MaximumOp, bf16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_I32:
return _device_info->calculate<MaximumOp, int32_t>(_info, output, inputs, stream);
case INFINI_DTYPE_I64:
return _device_info->calculate<MaximumOp, int64_t>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

return INFINI_STATUS_SUCCESS;
}
} // namespace op::maximum::cpu
19 changes: 19 additions & 0 deletions src/infiniop/ops/maximum/cpu/maximum_cpu.h
@@ -0,0 +1,19 @@
#ifndef __MAXIMUM_CPU_H__
#define __MAXIMUM_CPU_H__

#include "../../../elementwise/cpu/elementwise_cpu.h"

ELEMENTWISE_DESCRIPTOR(maximum, cpu)

namespace op::maximum::cpu {
typedef struct MaximumOp {
public:
static constexpr size_t num_inputs = 2;
template <typename T>
T operator()(const T &a, const T &b) const {
return a > b ? a : b;
}
} MaximumOp;
} // namespace op::maximum::cpu

#endif // __MAXIMUM_CPU_H__
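One note on the scalar functor: a > b ? a : b is not symmetric for floating-point NaN inputs, because any comparison involving NaN is false. The snippet below only illustrates that property of the expression used above; it is an observation, not behavior added by this change.

#include <cmath>

float ternary_max(float a, float b) { return a > b ? a : b; } // same expression as MaximumOp

// ternary_max(NAN, 1.0f) == 1.0f  (NAN > 1.0f is false, so b is returned)
// ternary_max(1.0f, NAN) is NaN   (1.0f > NAN is false, so b == NaN is returned)
// std::fmax returns 1.0f in both cases.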