Skip to content

Commit 84bed58

Browse files
committed
Merge remote-tracking branch 'origin/main' into issue/252
2 parents 86f64e0 + 2f20af7 commit 84bed58

33 files changed

+2317
-31
lines changed

src/infiniccl/infiniccl.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include "./ascend/infiniccl_ascend.h"
44
#include "./cuda/infiniccl_cuda.h"
5+
#include "./maca/infiniccl_maca.h"
56

67
__C infiniStatus_t infinicclCommInitAll(
78
infiniDevice_t device_type,
@@ -16,6 +17,7 @@ __C infiniStatus_t infinicclCommInitAll(
1617
switch (device_type) {
1718
COMM_INIT_ALL(INFINI_DEVICE_NVIDIA, cuda)
1819
COMM_INIT_ALL(INFINI_DEVICE_ASCEND, ascend)
20+
COMM_INIT_ALL(INFINI_DEVICE_METAX, maca)
1921
default:
2022
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
2123
}
@@ -35,6 +37,8 @@ __C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
3537
switch (comm->device_type) {
3638
COMM_DESTROY(INFINI_DEVICE_NVIDIA, cuda)
3739
COMM_DESTROY(INFINI_DEVICE_ASCEND, ascend)
40+
COMM_DESTROY(INFINI_DEVICE_METAX, maca)
41+
3842
default:
3943
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
4044
}
@@ -61,6 +65,8 @@ __C infiniStatus_t infinicclAllReduce(
6165
switch (comm->device_type) {
6266
ALL_REDUCE(INFINI_DEVICE_NVIDIA, cuda)
6367
ALL_REDUCE(INFINI_DEVICE_ASCEND, ascend)
68+
ALL_REDUCE(INFINI_DEVICE_METAX, maca)
69+
6470
default:
6571
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
6672
}
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
#include "infiniccl_maca.h"
2+
3+
#include "../../utils.h"
4+
5+
#include <hccl.h>
6+
#include <hcr/hc_runtime_api.h>
7+
8+
#include <iostream>
9+
#include <vector>
10+
11+
// Checks the result of an HCCL call against hcclSuccess by delegating to
// CHECK_INTERNAL. NOTE(review): CHECK_INTERNAL is defined outside this file
// (presumably in ../../utils.h); it appears to return from the enclosing
// function on mismatch - confirm before relying on that.
#define CHECK_HCCL(API__) CHECK_INTERNAL(API__, hcclSuccess)
12+
13+
// Converts an infinirt stream handle into the MACA runtime stream type.
// A null handle is passed through as a null stream.
inline hcStream_t getMacaStream(infinirtStream_t stream) {
    if (stream != nullptr) {
        return static_cast<hcStream_t>(stream);
    }
    return nullptr;
}
19+
20+
// Translates an infini dtype into the matching HCCL data type.
// Only F32 and F16 are mapped; any other dtype aborts the process, so callers
// must validate the dtype before calling.
inline hcclDataType_t getHcclDtype(infiniDtype_t datatype) {
    if (datatype == INFINI_DTYPE_F32) {
        return hcclFloat;
    }
    if (datatype == INFINI_DTYPE_F16) {
        return hcclHalf;
    }
    std::abort();
    // Not reached; keeps compilers that do not model abort() as noreturn quiet.
    return hcclHalf;
}
31+
32+
// Translates an infiniccl reduction operator into the matching HCCL operator.
// SUM, PROD, MAX, MIN and AVG are mapped; any other value aborts the process.
inline hcclRedOp_t getHcclRedOp(infinicclReduceOp_t op) {
    hcclRedOp_t mapped = hcclSum;
    switch (op) {
    case INFINICCL_SUM:
        mapped = hcclSum;
        break;
    case INFINICCL_PROD:
        mapped = hcclProd;
        break;
    case INFINICCL_MAX:
        mapped = hcclMax;
        break;
    case INFINICCL_MIN:
        mapped = hcclMin;
        break;
    case INFINICCL_AVG:
        mapped = hcclAvg;
        break;
    default:
        std::abort();
    }
    return mapped;
}
49+
50+
// Extracts the native HCCL communicator stored in the opaque `comm` field of
// an infiniccl communicator. `comm` must be non-null (it is dereferenced).
inline hcclComm_t getHcclComm(infinicclComm_t comm) {
    auto raw_handle = comm->comm;
    return static_cast<hcclComm_t>(raw_handle);
}
53+
54+
namespace infiniccl::maca {
55+
56+
// Creates one HCCL communicator per requested device and wraps each in a
// heap-allocated InfinicclComm tagged INFINI_DEVICE_METAX.
//
// comms      - output array with room for `ndevice` handles.
// ndevice    - number of devices; must be positive.
// device_ids - array of `ndevice` device ids.
//
// Returns INFINI_STATUS_BAD_PARAM for a null array or a non-positive device
// count, INFINI_STATUS_SUCCESS once every handle has been written, or whatever
// CHECK_HCCL yields when hcclCommInitAll fails. Each returned handle is
// released by commDestroy.
infiniStatus_t commInitAll(
    infinicclComm_t *comms,
    int ndevice,
    const int *device_ids) {

    // Reject bad arguments up front: a negative count would make the vector
    // constructor throw, and both arrays are dereferenced below.
    if (comms == nullptr || device_ids == nullptr || ndevice <= 0) {
        return INFINI_STATUS_BAD_PARAM;
    }

    std::vector<hcclComm_t> hccl_comms(ndevice);
    CHECK_HCCL(hcclCommInitAll(hccl_comms.data(), ndevice, device_ids));

    for (int i = 0; i < ndevice; i++) {
        comms[i] = new InfinicclComm{INFINI_DEVICE_METAX, device_ids[i], (void *)(hccl_comms[i])};
    }

    return INFINI_STATUS_SUCCESS;
}
70+
71+
// Destroys the native HCCL communicator held by `comm`, then frees the
// wrapper itself. The wrapper is only deleted after hcclCommDestroy has been
// checked by CHECK_HCCL.
infiniStatus_t commDestroy(infinicclComm_t comm) {
    hcclComm_t native_comm = getHcclComm(comm);
    CHECK_HCCL(hcclCommDestroy(native_comm));

    delete comm;
    return INFINI_STATUS_SUCCESS;
}
76+
77+
// Performs an all-reduce across the devices of `comm` using hcclAllReduce.
//
// sendbuf / recvbuf - source and destination buffers, forwarded unchanged.
// count             - element count, forwarded unchanged.
// datatype          - must be INFINI_DTYPE_F32 or INFINI_DTYPE_F16.
// op                - must be one of INFINICCL_SUM/PROD/MAX/MIN/AVG.
// comm              - communicator created by commInitAll.
// stream            - stream to enqueue on; null maps to the null stream.
//
// Returns INFINI_STATUS_BAD_PARAM for an unsupported dtype or reduce op,
// INFINI_STATUS_SUCCESS after the call has been issued, or whatever CHECK_HCCL
// yields when hcclAllReduce fails.
infiniStatus_t allReduce(
    void *sendbuf,
    void *recvbuf,
    size_t count,
    infiniDtype_t datatype,
    infinicclReduceOp_t op,
    infinicclComm_t comm,
    infinirtStream_t stream) {

    if (datatype != INFINI_DTYPE_F32 && datatype != INFINI_DTYPE_F16) {
        return INFINI_STATUS_BAD_PARAM;
    }

    // getHcclRedOp aborts the process on an unknown operator; reject it here
    // so a bad argument is reported the same way as a bad dtype.
    switch (op) {
    case INFINICCL_SUM:
    case INFINICCL_PROD:
    case INFINICCL_MAX:
    case INFINICCL_MIN:
    case INFINICCL_AVG:
        break;
    default:
        return INFINI_STATUS_BAD_PARAM;
    }

    CHECK_HCCL(hcclAllReduce(sendbuf, recvbuf, count, getHcclDtype(datatype),
                             getHcclRedOp(op), getHcclComm(comm), getMacaStream(stream)));

    return INFINI_STATUS_SUCCESS;
}
95+
} // namespace infiniccl::maca
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// Declares the MetaX (MACA) backend of infiniccl.
#ifndef INFINICCL_MACA_H_
#define INFINICCL_MACA_H_

#include "../infiniccl_impl.h"

// The real backend is selected only when both ENABLE_METAX_API and ENABLE_CCL
// are defined; otherwise the NOOP variant is used.
// NOTE(review): both macros come from infiniccl_impl.h; presumably IMPL
// declares the infiniccl::maca entry points and NOOP provides stubs - confirm.
#if !defined(ENABLE_METAX_API) || !defined(ENABLE_CCL)
INFINICCL_DEVICE_API_NOOP(maca)
#else
INFINICCL_DEVICE_API_IMPL(maca)
#endif

#endif /* INFINICCL_MACA_H_ */

src/infiniop-test/include/ops.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
*/
88
DECLARE_INFINIOP_TEST(gemm)
99
DECLARE_INFINIOP_TEST(random_sample)
10+
DECLARE_INFINIOP_TEST(rms_norm)
1011
DECLARE_INFINIOP_TEST(mul)
12+
DECLARE_INFINIOP_TEST(rope)
1113
DECLARE_INFINIOP_TEST(clip)
1214
DECLARE_INFINIOP_TEST(swiglu)
1315
DECLARE_INFINIOP_TEST(add)
@@ -33,6 +35,8 @@ DECLARE_INFINIOP_TEST(add)
3335
REGISTER_INFINIOP_TEST(mul) \
3436
REGISTER_INFINIOP_TEST(clip) \
3537
REGISTER_INFINIOP_TEST(swiglu) \
38+
REGISTER_INFINIOP_TEST(rope) \
39+
REGISTER_INFINIOP_TEST(rms_norm) \
3640
}
3741

3842
namespace infiniop_test {

src/infiniop-test/src/ops/random_sample.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ std::vector<std::string> Test::tensor_names() {
110110
}
111111

112112
std::vector<std::string> Test::output_names() {
113-
return {"result"};
113+
return {};
114114
}
115115

116116
std::string Test::toString() const {
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
#include "ops.hpp"
2+
#include "utils.hpp"
3+
#include <infinirt.h>
4+
#include <iomanip>
5+
#include <iostream>
6+
7+
namespace infiniop_test::rms_norm {
8+
// Inputs and expected output of a single RMSNorm test case.
struct Test::Attributes {
    // Epsilon passed to the RMSNorm descriptor.
    float epsilon;
    // Input tensor.
    std::shared_ptr<Tensor> x;
    // Weight tensor.
    std::shared_ptr<Tensor> w;
    // Reference result that `y` is compared against.
    std::shared_ptr<Tensor> ans;
    // Output tensor written by the operator.
    std::shared_ptr<Tensor> y;
};
15+
16+
std::shared_ptr<Test> Test::build(
17+
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
18+
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
19+
double rtol, double atol) {
20+
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
21+
test->_attributes = new Attributes();
22+
23+
if (attributes.find("epsilon") == attributes.end()
24+
|| tensors.find("x") == tensors.end()
25+
|| tensors.find("w") == tensors.end()
26+
|| tensors.find("ans") == tensors.end()
27+
|| tensors.find("y") == tensors.end()) {
28+
throw std::runtime_error("Invalid Test: Missing attributes or tensors");
29+
}
30+
31+
test->_attributes->epsilon = *reinterpret_cast<float *>(attributes["epsilon"].data());
32+
33+
test->_attributes->x = tensors["x"];
34+
test->_attributes->w = tensors["w"];
35+
test->_attributes->ans = tensors["ans"];
36+
test->_attributes->y = tensors["y"];
37+
38+
return test;
39+
}
40+
41+
// Runs RMSNorm once and checks the result against `ans`, then benchmarks it.
//
// handle            - infiniop handle used to create the descriptor.
// device, device_id - target device the tensors are moved to.
// warm_ups          - forwarded to benchmark().
// iterations        - forwarded to benchmark().
//
// Returns TEST_FAILED with OP_CREATION_FAILED, OP_EXECUTION_FAILED or
// RESULT_INCORRECT depending on the failing step, otherwise TEST_PASSED with
// the value returned by benchmark(). The device workspace is released on
// every exit path.
// NOTE(review): op_desc is never destroyed here; confirm whether the RMSNorm
// descriptor needs an explicit destroy call.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id,
    size_t warm_ups, size_t iterations) {

    // Owns the workspace allocation so early returns cannot leak it.
    struct WorkspaceGuard {
        void *ptr = nullptr;
        WorkspaceGuard() = default;
        WorkspaceGuard(const WorkspaceGuard &) = delete;
        WorkspaceGuard &operator=(const WorkspaceGuard &) = delete;
        ~WorkspaceGuard() {
            if (ptr != nullptr) {
                infinirtFree(ptr);
            }
        }
    };

    infiniopRMSNormDescriptor_t op_desc;
    CHECK_OR(infiniopCreateRMSNormDescriptor(handle, &op_desc,
                                             _attributes->y->desc(),
                                             _attributes->x->desc(),
                                             _attributes->w->desc(),
                                             _attributes->epsilon),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create RMSNorm descriptor"));

    auto x = _attributes->x->to(device, device_id);
    auto w = _attributes->w->to(device, device_id);
    auto y = _attributes->y->to(device, device_id);

    size_t workspace_size = 0;
    CHECK_OR(infiniopGetRMSNormWorkspaceSize(op_desc, &workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size"));

    WorkspaceGuard workspace_guard;
    if (workspace_size > 0) {
        CHECK_OR(infinirtMalloc(&workspace_guard.ptr, workspace_size),
                 return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace"));
    }
    // Plain pointer copy so the benchmark lambda can capture it by value.
    void *workspace = workspace_guard.ptr;

    CHECK_OR(infiniopRMSNorm(op_desc,
                             workspace, workspace_size,
                             y->data(),
                             x->data(),
                             w->data(),
                             nullptr),
             return TEST_FAILED(OP_EXECUTION_FAILED, "RMSNorm execution failed"));

    try {
        allClose(y, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }

    const double elapsed_time = benchmark(
        [=]() {
            infiniopRMSNorm(op_desc,
                            workspace, workspace_size,
                            y->data(),
                            x->data(),
                            w->data(),
                            nullptr);
        },
        warm_ups, iterations);

    return TEST_PASSED(elapsed_time);
}
99+
100+
// Names of the attributes a test case must provide.
std::vector<std::string> Test::attribute_names() {
    std::vector<std::string> names{"epsilon"};
    return names;
}
103+
104+
// Names of the tensors a test case must provide.
std::vector<std::string> Test::tensor_names() {
    std::vector<std::string> names{"x", "w", "ans", "y"};
    return names;
}
107+
108+
// Names of the tensors reported as outputs of the operator.
std::vector<std::string> Test::output_names() {
    std::vector<std::string> names{"y"};
    return names;
}
111+
112+
// Renders a multi-line description of the test case: operator name, epsilon,
// info for x, w and y, and the tolerances in scientific notation with two
// digits of precision.
std::string Test::toString() const {
    std::ostringstream out;
    out << op_name() << '\n'
        << "- epsilon=" << _attributes->epsilon << '\n'
        << "- x: " << _attributes->x->info() << '\n'
        << "- w: " << _attributes->w->info() << '\n'
        << "- y: " << _attributes->y->info() << '\n';
    // Formatting flags apply to the tolerance line only.
    out << std::scientific << std::setprecision(2);
    out << "- rtol=" << _rtol << ", atol=" << _atol << '\n';
    return out.str();
}
123+
124+
// Releases the Attributes object allocated in Test::build.
Test::~Test() {
    delete _attributes;
}
127+
128+
} // namespace infiniop_test::rms_norm

0 commit comments

Comments
 (0)