37 changes: 28 additions & 9 deletions backend/compiler.py
@@ -114,6 +114,8 @@ def __init__(self, target:str) -> None:
assert isinstance(self.capability, int)
self.binary_ext = "cnbin"
elif self.driver.target == 'maca':
self.capability = target.arch
self.capability = 80  # NOTE: hardcoded override of target.arch
self.binary_ext = "mcfatbin"
elif self.driver.target == 'ascend':
self.binary_ext = "npubin"
@@ -173,15 +175,21 @@ def add_stages(self, stages, options):
# from triton.backends.dicp_triton.mlu import ttir_to_cnfatbin, get_architecture_descriptor
# stages["cnbin"] = lambda src, metadata: ttir_to_cnfatbin(src, metadata, get_architecture_descriptor(self.driver, options), False, True)
elif self.driver.target == 'maca':
from triton.backends.dicp_triton.maca import ttir_to_ttgir, optimize_ttgir, ttgir_to_llir, llir_to_mcfatbin, get_architecture_descriptor
arch = get_architecture_descriptor()
extern_libs = dict()
stages["ttgir"] = lambda src, metadata: optimize_ttgir(ttir_to_ttgir(src, 4), options.num_stages, arch)
stages["llir"] = lambda src, metadata: ttgir_to_llir(src, arch)
mxcc_arch = os.environ.get('MACA_PATH') + "/mxgpu_llvm/bin/mxcc"
if mxcc_arch is None:
raise RuntimeError('mxcc_arch is None (not specified)')
stages["mcfatbin"] = lambda src, metadata: llir_to_mcfatbin(src, mxcc_arch, os.environ.get('MACA_PATH'))
from triton.backends.dicp_triton.maca import make_ttir, make_ttgir, make_mlir, make_llir, make_mcfatbin
stages["ttir"] = lambda src, metadata: make_ttir(src, metadata, options)
stages["ttgir"] = lambda src, metadata: make_ttgir(src, metadata, options, self.capability)
stages["mlir"] = lambda src, metadata: make_mlir(src, metadata, options, self.capability)
stages["llir"] = lambda src, metadata: make_llir(src, metadata, options, self.capability)
stages["mcfatbin"] = lambda src, metadata: make_mcfatbin(src, metadata, options, self.capability)
elif self.driver.target == 'ascend':
from triton.backends.dicp_triton.npu import make_ttir, ttir_to_linalg, linalg_to_bin_enable_npu_compile
stages["ttir"] = lambda src, metadata: make_ttir(src, metadata, options)
@@ -235,6 +243,17 @@ def parse_options(self, options: dict) -> Any:
args["enable_mlu_bound_check"] = os.getenv("TRITON_ENABLE_MLU_BOUND_CHECK",
"0") == "1"
return MLUOptions(**args)
elif self.target.backend == 'maca':
from triton.backends.dicp_triton.maca import MACAOptions
args = {k: options[k] for k in MACAOptions.__dataclass_fields__.keys() if k in options}
# USE_MACA: support allow_fp8e4nv (i.e. float8_e4m3fn)
args["allow_fp8e4nv"] = True
args["allow_fp8e4b15"] = False
args["max_num_imprecise_acc_default"] = 2**30 if self.capability == 90 else 0
return MACAOptions(**args)
else:
args = {'arch': self.target}
args.update({k: options[k] for k in DICPOptions.__dataclass_fields__.keys() if k in options})
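The `stages` dict registered in `add_stages` is an ordered mapping from IR name to a `lambda src, metadata` transform; for MACA the pipeline lowers ttir → ttgir → mlir → llir → mcfatbin. A minimal sketch of how such a dict is typically consumed, assuming a hypothetical `run_stages` driver (the real loop lives in Triton's compiler core):

# Hypothetical driver loop; the make_* callables are the ones registered
# above from triton.backends.dicp_triton.maca.
def run_stages(stages, src, metadata):
    artifacts = {}
    # dicts preserve insertion order, so stages run in registration order:
    # "ttir" -> "ttgir" -> "mlir" -> "llir" -> "mcfatbin" for MACA.
    for name, lower in stages.items():
        src = lower(src, metadata)
        artifacts[name] = src  # keep each IR for caching and debugging
    return src, artifacts  # final src is the mcfatbin blob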
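`parse_options` filters the incoming dict down to the fields `MACAOptions` actually declares before forcing the fp8 flags, mirroring the capability-90 special case for `max_num_imprecise_acc_default`. A self-contained sketch of that filtering pattern, with an illustrative stand-in for `MACAOptions` (the real field set lives in triton.backends.dicp_triton.maca):

from dataclasses import dataclass

@dataclass(frozen=True)
class MACAOptions:  # illustrative subset, not the real field list
    num_warps: int = 4
    num_stages: int = 2
    allow_fp8e4nv: bool = False
    allow_fp8e4b15: bool = False
    max_num_imprecise_acc_default: int = 0

def parse_maca_options(options: dict, capability: int) -> MACAOptions:
    # Keep only declared fields so callers may pass a superset of keys.
    args = {k: options[k] for k in MACAOptions.__dataclass_fields__ if k in options}
    args["allow_fp8e4nv"] = True   # float8_e4m3fn is supported on MACA
    args["allow_fp8e4b15"] = False
    args["max_num_imprecise_acc_default"] = 2**30 if capability == 90 else 0
    return MACAOptions(**args)

opts = parse_maca_options({"num_warps": 8, "not_a_field": 1}, capability=80)
assert opts.num_warps == 8 and opts.allow_fp8e4nv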
4 changes: 4 additions & 0 deletions backend/driver.py
@@ -161,8 +161,12 @@ def test_npucompiler():
reset = "\x1b[0m"
warnings.warn(red + str(e_npucompiler) + reset)
return False
elif self.target == "muxi":
import torch
return True
except Exception as e:
import torch
return True
try:
if torch.mlu:
return True
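The driver.py hunk makes the availability probe accept the `muxi` target as soon as `import torch` succeeds. A hedged sketch of that probe pattern, with the hypothetical name `backend_is_available`:

def backend_is_available(target: str) -> bool:
    # Hypothetical probe mirroring the hunk above: for muxi, a successful
    # torch import is treated as sufficient evidence the stack is usable.
    if target == "muxi":
        try:
            import torch  # noqa: F401
            return True
        except ImportError:
            return False
    return False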
251 changes: 176 additions & 75 deletions backend/maca.c
@@ -1,95 +1,196 @@
#include <mcr/mc_runtime.h>
#include <dlfcn.h>
#include <stdbool.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <stdio.h>
#include <stdlib.h>
static inline void gpuAssert(mcError_t code, const char *file, int line)
{
if (code != mcSuccess)
{
const char* prefix = "Triton Error [MACA]: ";
const char* str = mcGetErrorString(code);
char err[1024] = {0};
strcat(err, prefix);
strcat(err, str);
PyErr_SetString(PyExc_RuntimeError, err);
}
#include <stdatomic.h>

// Raises a Python exception and returns false if code is not MC_SUCCESS.
static bool gpuAssert(mcError_t code, const char *file, int line) {
if (code == mcSuccess)
return true;

const char *prefix = "Triton Error [MACA]: ";
const char *str = mcGetErrorString(code);
char err[1024] = {0};
strcat(err, prefix);
strcat(err, str);
PyGILState_STATE gil_state;
gil_state = PyGILState_Ensure();
PyErr_SetString(PyExc_RuntimeError, err);
PyGILState_Release(gil_state);
return false;
}

#define MACA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); if(PyErr_Occurred()) return NULL; }

static PyObject* getDeviceProperties(PyObject* self, PyObject* args){
int device_id;
if(!PyArg_ParseTuple(args, "i", &device_id))
return NULL;
// Get device handle
MCdevice device;
mcDeviceGet(&device, device_id);

// create a struct to hold device properties
int max_shared_mem;
int multiprocessor_count;
int sm_clock_rate;
int mem_clock_rate;
int mem_bus_width;
MACA_CHECK(mcDeviceGetAttribute(&max_shared_mem, mcDeviceAttributeMaxSharedMemoryPerBlock, device));
MACA_CHECK(mcDeviceGetAttribute(&multiprocessor_count, mcDeviceAttributeMultiProcessorCount, device));
MACA_CHECK(mcDeviceGetAttribute(&sm_clock_rate, mcDeviceAttributeClockRate, device));
MACA_CHECK(mcDeviceGetAttribute(&mem_clock_rate, mcDeviceAttributeMemoryClockRate, device));
MACA_CHECK(mcDeviceGetAttribute(&mem_bus_width, mcDeviceAttributeMemoryBusWidth, device));


return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i}", "max_shared_mem", max_shared_mem,
"multiprocessor_count", multiprocessor_count,
"sm_clock_rate", sm_clock_rate,
"mem_clock_rate", mem_clock_rate,
"mem_bus_width", mem_bus_width);
// To be used only *outside* a Py_{BEGIN,END}_ALLOW_THREADS block.
#define MACA_CHECK_AND_RETURN_NULL(ans) \
do { \
if (!gpuAssert((ans), __FILE__, __LINE__)) \
return NULL; \
} while (0)

// To be used inside a Py_{BEGIN,END}_ALLOW_THREADS block.
#define MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(ans) \
do { \
if (!gpuAssert((ans), __FILE__, __LINE__)) { \
PyEval_RestoreThread(_save); \
return NULL; \
} \
} while (0)

// Used to check if functions exist in old driver versions.
#define INITIALIZE_FUNCTION_POINTER_IF_NULL(funcPointer, initializerFunction) \
do { \
if ((funcPointer) == NULL) { \
(funcPointer) = (initializerFunction)(); \
if ((funcPointer) == NULL) { \
return NULL; \
} \
} \
} while (0)

static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
int device_id;
if (!PyArg_ParseTuple(args, "i", &device_id))
return NULL;
// Get device handle
MCdevice device;
MACA_CHECK_AND_RETURN_NULL(mcDeviceGet(&device, device_id));

// create a struct to hold device properties
int max_shared_mem = 64 * 1024; // 64KB, no CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN
int max_num_regs;
int multiprocessor_count;
int warp_size = 64;
int sm_clock_rate;
int mem_clock_rate;
int mem_bus_width;
// NOTE: query the per-block register limit, not the shared-memory attribute;
// assumes mcDeviceAttributeMaxRegistersPerBlock mirrors the CUDA/HIP attribute.
MACA_CHECK_AND_RETURN_NULL(mcDeviceGetAttribute(
&max_num_regs, mcDeviceAttributeMaxRegistersPerBlock, device));
MACA_CHECK_AND_RETURN_NULL(mcDeviceGetAttribute(
&multiprocessor_count, mcDeviceAttributeMultiProcessorCount, device));
MACA_CHECK_AND_RETURN_NULL(mcDeviceGetAttribute(
&sm_clock_rate, mcDeviceAttributeClockRate, device));
MACA_CHECK_AND_RETURN_NULL(mcDeviceGetAttribute(
&mem_clock_rate, mcDeviceAttributeMemoryClockRate, device));
MACA_CHECK_AND_RETURN_NULL(mcDeviceGetAttribute(
&mem_bus_width, mcDeviceAttributeMemoryBusWidth, device));

return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i, s:i, s:i}", "max_shared_mem",
max_shared_mem, "max_num_regs", max_num_regs,
"multiprocessor_count", multiprocessor_count, "warpSize",
warp_size, "sm_clock_rate", sm_clock_rate,
"mem_clock_rate", mem_clock_rate, "mem_bus_width",
mem_bus_width);
}

static PyObject* loadBinary(PyObject* self, PyObject* args) {
const char* name;
const char* data;
Py_ssize_t data_size;
int shared;
int device;
if(!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared, &device)) {
return NULL;
}
mcFunction_t fun;
mcModule_t mod;
// create driver handles
MACA_CHECK(mcModuleLoadData(&mod, data));
MACA_CHECK(mcModuleGetFunction(&fun, mod, name));

// get allocated registers and spilled registers from the function
int n_regs = 0;
int n_spills = 0;

if(PyErr_Occurred()) {
return NULL;
}
return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs, n_spills);
static PyObject *loadBinary(PyObject *self, PyObject *args) {
const char *name;
const char *data;
Py_ssize_t data_size;
int shared;
int device;
if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared,
&device)) {
return NULL;
}
mcFunction_t fun;
mcModule_t mod;
int32_t n_regs = 0;
int32_t n_spills = 0;
// create driver handles
MCcontext pctx = 0;

Py_BEGIN_ALLOW_THREADS;
// TODO: MCcontext implementation not found
MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcCtxGetCurrent(&pctx));
if (!pctx) {
MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
mcDevicePrimaryCtxRetain(&pctx, device));
MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcCtxSetCurrent(pctx));
}
MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcModuleLoadData(&mod, data));
MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcModuleGetFunction(&fun, mod, name));
// get allocated registers and spilled registers from the function
MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
mcFuncGetAttribute(&n_regs, MC_FUNC_ATTRIBUTE_NUM_REGS, fun));
MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
mcFuncGetAttribute(&n_spills, MC_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
n_spills /= 4;
Py_END_ALLOW_THREADS;

if (PyErr_Occurred()) {
return NULL;
}
return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs, n_spills);
}

static PyObject *setPrintfFifoSize(PyObject *self, PyObject *args) {
long size;
if (!PyArg_ParseTuple(args, "l", &size)) {
return NULL;
}
if (size < 0) {
PyErr_SetString(PyExc_ValueError, "fifo size must be non-negative");
return NULL;
}

Py_BEGIN_ALLOW_THREADS;

// Ensure we have an active context.
// MCcontext ctx = NULL;
// TODO: no MACA equivalent of CU_LIMIT_PRINTF_FIFO_SIZE found
// MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcCtxGetCurrent(&ctx));
// if (!ctx) {
// MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
// mcDevicePrimaryCtxRetain(&ctx, /*device=*/0));
// MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcCtxSetCurrent(ctx));
// }

// // We can't set the fifo size after running a kernel that calls printf. This
// // is true even if the set() call is a nop and the new size is the same as the
// // old size.
// //
// // This is unfriendly, so check if the old size matches the new size, and skip
// // the set() call if so.
// size_t oldSize = 0;
// MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
// mcCtxGetLimit(&oldSize, CU_LIMIT_PRINTF_FIFO_SIZE));
// if (oldSize != size) {
// MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
// mcCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, size));
// }

Py_END_ALLOW_THREADS;
Py_RETURN_NONE; // returning Py_None without incrementing its refcount is a bug
}

static PyMethodDef ModuleMethods[] = {
{"load_binary", loadBinary, METH_VARARGS, "Load provided mcfatbin into MACA driver"},
{"get_device_properties", getDeviceProperties, METH_VARARGS, "Get the properties for a given device"},
{NULL, NULL, 0, NULL} // sentinel
{"load_binary", loadBinary, METH_VARARGS,
"Load provided cubin into CUDA driver"},
{"get_device_properties", getDeviceProperties, METH_VARARGS,
"Get the properties for a given device"},
{"set_printf_fifo_size", setPrintfFifoSize, METH_VARARGS,
"Python interface for cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, x), which "
"controls how many bytes can be streamed from kernels before data starts "
"being dropped. This inherits all the limitations of this call; in "
"particular it's an error to change this value after launching any kernel "
"that calls printf()."},
{NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
PyModuleDef_HEAD_INIT,
"maca_utils",
NULL, //documentation
-1, //size
ModuleMethods
};
static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "maca_utils",
NULL, // documentation
-1, // size
ModuleMethods};

PyMODINIT_FUNC PyInit_maca_utils(void) {
PyObject *m = PyModule_Create(&ModuleDef);
if(m == NULL) {
if (m == NULL) {
return NULL;
}

return m;
}
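Built, maca.c becomes a `maca_utils` extension module (see `PyInit_maca_utils`). A hedged usage sketch from the Python side; the returned shapes follow the C code above, while the file name and kernel name are illustrative:

import maca_utils

props = maca_utils.get_device_properties(0)
print(props["max_shared_mem"], props["multiprocessor_count"], props["warpSize"])

# load_binary takes (name, data, shared, device) and returns
# (module, function, n_regs, n_spills); n_spills is reported in 32-bit
# registers (LOCAL_SIZE_BYTES / 4 in the C code).
with open("kernel.mcfatbin", "rb") as f:  # illustrative path
    mod, fun, n_regs, n_spills = maca_utils.load_binary("my_kernel", f.read(), 0, 0)

maca_utils.set_printf_fifo_size(1 << 20)  # currently a no-op on MACA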
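`gpuAssert` turns every non-`mcSuccess` return into a `RuntimeError` prefixed with `Triton Error [MACA]: `, acquiring the GIL first because `loadBinary` releases it around driver calls. From Python that surfaces as an ordinary exception; a sketch, assuming an out-of-range device id makes a checked call fail:

import maca_utils

try:
    maca_utils.get_device_properties(9999)  # assumed-invalid device id
except RuntimeError as e:
    # The message body comes from mcGetErrorString and may vary by driver.
    assert str(e).startswith("Triton Error [MACA]: ")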