37 changes: 28 additions & 9 deletions backend/compiler.py
@@ -114,6 +114,8 @@ def __init__(self, target:str) -> None:
assert isinstance(self.capability, int)
self.binary_ext = "cnbin"
elif self.driver.target == 'maca':
self.capability = target.arch
self.capability = 80  # NOTE: hardcoded override of target.arch
self.binary_ext = "mcfatbin"
elif self.driver.target == 'ascend':
self.binary_ext = "npubin"
@@ -173,15 +175,21 @@ def add_stages(self, stages, options):
# from triton.backends.dicp_triton.mlu import ttir_to_cnfatbin, get_architecture_descriptor
# stages["cnbin"] = lambda src, metadata: ttir_to_cnfatbin(src, metadata, get_architecture_descriptor(self.driver, options), False, True)
elif self.driver.target == 'maca':
from triton.backends.dicp_triton.maca import ttir_to_ttgir, optimize_ttgir, ttgir_to_llir, llir_to_mcfatbin, get_architecture_descriptor
arch = get_architecture_descriptor()
extern_libs = dict()
stages["ttgir"] = lambda src, metadata: optimize_ttgir(ttir_to_ttgir(src, 4), options.num_stages, arch)
stages["llir"] = lambda src, metadata: ttgir_to_llir(src, arch)
mxcc_arch = os.environ.get('MACA_PATH') + "/mxgpu_llvm/bin/mxcc"
if mxcc_arch is None:
raise RuntimeError('mxcc_arch is None (not specified)')
stages["mcfatbin"] = lambda src, metadata: llir_to_mcfatbin(src, mxcc_arch, os.environ.get('MACA_PATH'))
from triton.backends.dicp_triton.maca import make_ttir, make_ttgir, make_mlir, make_llir, make_mcfatbin
stages["ttir"] = lambda src, metadata: make_ttir(src, metadata, options)
stages["ttgir"] = lambda src, metadata: make_ttgir(src, metadata, options, self.capability)
stages["mlir"] = lambda src, metadata: make_mlir(src, metadata, options, self.capability)
stages["llir"] = lambda src, metadata: make_llir(src, metadata, options, self.capability)
stages["mcfatbin"] = lambda src, metadata: make_mcfatbin(src, metadata, options, self.capability)
elif self.driver.target == 'ascend':
from triton.backends.dicp_triton.npu import make_ttir, ttir_to_linalg, linalg_to_bin_enable_npu_compile
stages["ttir"] = lambda src, metadata: make_ttir(src, metadata, options)
@@ -235,6 +243,17 @@ def parse_options(self, options: dict) -> Any:
args["enable_mlu_bound_check"] = os.getenv("TRITON_ENABLE_MLU_BOUND_CHECK",
"0") == "1"
return MLUOptions(**args)
elif self.target.backend == 'maca':
from triton.backends.dicp_triton.maca import MACAOptions
args = {k: options[k] for k in MACAOptions.__dataclass_fields__.keys() if k in options}
# USE_MACA: support allow_fp8e4nv (i.e. float8_e4m3fn)
args["allow_fp8e4nv"] = True
args["allow_fp8e4b15"] = False
args["max_num_imprecise_acc_default"] = 2**30 if self.capability == 90 else 0
return MACAOptions(**args)
else:
args = {'arch': self.target}
args.update({k: options[k] for k in DICPOptions.__dataclass_fields__.keys() if k in options})
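The `stages` dict registered in `add_stages` is an ordered mapping from IR name to a `lambda src, metadata` transform; for MACA the pipeline lowers ttir → ttgir → mlir → llir → mcfatbin. A minimal sketch of how such a dict is typically consumed, assuming a hypothetical `run_stages` driver (the real loop lives in Triton's compiler core):

# Hypothetical driver loop; the make_* callables are the ones registered
# above from triton.backends.dicp_triton.maca.
def run_stages(stages, src, metadata):
    artifacts = {}
    # dicts preserve insertion order, so stages run in registration order:
    # "ttir" -> "ttgir" -> "mlir" -> "llir" -> "mcfatbin" for MACA.
    for name, lower in stages.items():
        src = lower(src, metadata)
        artifacts[name] = src  # keep each IR for caching and debugging
    return src, artifacts  # final src is the mcfatbin blob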
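`parse_options` filters the incoming dict down to the fields `MACAOptions` actually declares before forcing the fp8 flags, mirroring the capability-90 special case for `max_num_imprecise_acc_default`. A self-contained sketch of that filtering pattern, with an illustrative stand-in for `MACAOptions` (the real field set lives in triton.backends.dicp_triton.maca):

from dataclasses import dataclass

@dataclass(frozen=True)
class MACAOptions:  # illustrative subset, not the real field list
    num_warps: int = 4
    num_stages: int = 2
    allow_fp8e4nv: bool = False
    allow_fp8e4b15: bool = False
    max_num_imprecise_acc_default: int = 0

def parse_maca_options(options: dict, capability: int) -> MACAOptions:
    # Keep only declared fields so callers may pass a superset of keys.
    args = {k: options[k] for k in MACAOptions.__dataclass_fields__ if k in options}
    args["allow_fp8e4nv"] = True   # float8_e4m3fn is supported on MACA
    args["allow_fp8e4b15"] = False
    args["max_num_imprecise_acc_default"] = 2**30 if capability == 90 else 0
    return MACAOptions(**args)

opts = parse_maca_options({"num_warps": 8, "not_a_field": 1}, capability=80)
assert opts.num_warps == 8 and opts.allow_fp8e4nv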
4 changes: 4 additions & 0 deletions backend/driver.py
@@ -161,8 +161,12 @@ def test_npucompiler():
reset = "\x1b[0m"
warnings.warn(red + str(e_npucompiler) + reset)
return False
elif self.target == "muxi":
import torch
return True
except Exception as e:
import torch
return True
try:
if torch.mlu:
return True
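The driver.py hunk makes the availability probe accept the `muxi` target as soon as `import torch` succeeds. A hedged sketch of that probe pattern, with the hypothetical name `backend_is_available`:

def backend_is_available(target: str) -> bool:
    # Hypothetical probe mirroring the hunk above: for muxi, a successful
    # torch import is treated as sufficient evidence the stack is usable.
    if target == "muxi":
        try:
            import torch  # noqa: F401
            return True
        except ImportError:
            return False
    return False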
251 changes: 176 additions & 75 deletions backend/maca.c
@@ -1,95 +1,196 @@
#include <mcr/mc_runtime.h>
#include <dlfcn.h>
#include <stdbool.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <stdio.h>
#include <stdlib.h>
static inline void gpuAssert(mcError_t code, const char *file, int line)
{
if (code != mcSuccess)
{
const char* prefix = "Triton Error [MACA]: ";
const char* str = mcGetErrorString(code);
char err[1024] = {0};
strcat(err, prefix);
strcat(err, str);
PyErr_SetString(PyExc_RuntimeError, err);
}
#include <stdatomic.h>

// Raises a Python exception and returns false if code is not MC_SUCCESS.
static bool gpuAssert(mcError_t code, const char *file, int line) {
if (code == mcSuccess)
return true;

const char *prefix = "Triton Error [MACA]: ";
const char *str = mcGetErrorString(code);
char err[1024] = {0};
strcat(err, prefix);
strcat(err, str);
PyGILState_STATE gil_state;
gil_state = PyGILState_Ensure();
PyErr_SetString(PyExc_RuntimeError, err);
PyGILState_Release(gil_state);
return false;
}

#define MACA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); if(PyErr_Occurred()) return NULL; }

static PyObject* getDeviceProperties(PyObject* self, PyObject* args){
int device_id;
if(!PyArg_ParseTuple(args, "i", &device_id))
return NULL;
// Get device handle
MCdevice device;
mcDeviceGet(&device, device_id);

// create a struct to hold device properties
int max_shared_mem;
int multiprocessor_count;
int sm_clock_rate;
int mem_clock_rate;
int mem_bus_width;
MACA_CHECK(mcDeviceGetAttribute(&max_shared_mem, mcDeviceAttributeMaxSharedMemoryPerBlock, device));
MACA_CHECK(mcDeviceGetAttribute(&multiprocessor_count, mcDeviceAttributeMultiProcessorCount, device));
MACA_CHECK(mcDeviceGetAttribute(&sm_clock_rate, mcDeviceAttributeClockRate, device));
MACA_CHECK(mcDeviceGetAttribute(&mem_clock_rate, mcDeviceAttributeMemoryClockRate, device));
MACA_CHECK(mcDeviceGetAttribute(&mem_bus_width, mcDeviceAttributeMemoryBusWidth, device));


return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i}", "max_shared_mem", max_shared_mem,
"multiprocessor_count", multiprocessor_count,
"sm_clock_rate", sm_clock_rate,
"mem_clock_rate", mem_clock_rate,
"mem_bus_width", mem_bus_width);
// To be used only *outside* a Py_{BEGIN,END}_ALLOW_THREADS block.
#define MACA_CHECK_AND_RETURN_NULL(ans) \
do { \
if (!gpuAssert((ans), __FILE__, __LINE__)) \
return NULL; \
} while (0)

// To be used inside a Py_{BEGIN,END}_ALLOW_THREADS block.
#define MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(ans) \
do { \
if (!gpuAssert((ans), __FILE__, __LINE__)) { \
PyEval_RestoreThread(_save); \
return NULL; \
} \
} while (0)

// Used to check if functions exist in old driver versions.
#define INITIALIZE_FUNCTION_POINTER_IF_NULL(funcPointer, initializerFunction) \
do { \
if ((funcPointer) == NULL) { \
(funcPointer) = (initializerFunction)(); \
if ((funcPointer) == NULL) { \
return NULL; \
} \
} \
} while (0)

static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
int device_id;
if (!PyArg_ParseTuple(args, "i", &device_id))
return NULL;
// Get device handle
MCdevice device;
MACA_CHECK_AND_RETURN_NULL(mcDeviceGet(&device, device_id));

// create a struct to hold device properties
int max_shared_mem = 64 * 1024; // 64KB, no CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN
int max_num_regs;
int multiprocessor_count;
int warp_size = 64;
int sm_clock_rate;
int mem_clock_rate;
int mem_bus_width;
// NOTE: query the per-block register limit, not the shared-memory attribute;
// assumes mcDeviceAttributeMaxRegistersPerBlock mirrors the CUDA/HIP attribute.
MACA_CHECK_AND_RETURN_NULL(mcDeviceGetAttribute(
&max_num_regs, mcDeviceAttributeMaxRegistersPerBlock, device));
MACA_CHECK_AND_RETURN_NULL(mcDeviceGetAttribute(
&multiprocessor_count, mcDeviceAttributeMultiProcessorCount, device));
MACA_CHECK_AND_RETURN_NULL(mcDeviceGetAttribute(
&sm_clock_rate, mcDeviceAttributeClockRate, device));
MACA_CHECK_AND_RETURN_NULL(mcDeviceGetAttribute(
&mem_clock_rate, mcDeviceAttributeMemoryClockRate, device));
MACA_CHECK_AND_RETURN_NULL(mcDeviceGetAttribute(
&mem_bus_width, mcDeviceAttributeMemoryBusWidth, device));

return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i, s:i, s:i}", "max_shared_mem",
max_shared_mem, "max_num_regs", max_num_regs,
"multiprocessor_count", multiprocessor_count, "warpSize",
warp_size, "sm_clock_rate", sm_clock_rate,
"mem_clock_rate", mem_clock_rate, "mem_bus_width",
mem_bus_width);
}

static PyObject* loadBinary(PyObject* self, PyObject* args) {
const char* name;
const char* data;
Py_ssize_t data_size;
int shared;
int device;
if(!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared, &device)) {
return NULL;
}
mcFunction_t fun;
mcModule_t mod;
// create driver handles
MACA_CHECK(mcModuleLoadData(&mod, data));
MACA_CHECK(mcModuleGetFunction(&fun, mod, name));

// get allocated registers and spilled registers from the function
int n_regs = 0;
int n_spills = 0;

if(PyErr_Occurred()) {
return NULL;
}
return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs, n_spills);
static PyObject *loadBinary(PyObject *self, PyObject *args) {
const char *name;
const char *data;
Py_ssize_t data_size;
int shared;
int device;
if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared,
&device)) {
return NULL;
}
mcFunction_t fun;
mcModule_t mod;
int32_t n_regs = 0;
int32_t n_spills = 0;
// create driver handles
MCcontext pctx = 0;

Py_BEGIN_ALLOW_THREADS;
// TODO: MCcontext implementation not found
MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcCtxGetCurrent(&pctx));
if (!pctx) {
MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
mcDevicePrimaryCtxRetain(&pctx, device));
MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcCtxSetCurrent(pctx));
}
MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcModuleLoadData(&mod, data));
MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcModuleGetFunction(&fun, mod, name));
// get allocated registers and spilled registers from the function
MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
mcFuncGetAttribute(&n_regs, MC_FUNC_ATTRIBUTE_NUM_REGS, fun));
MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
mcFuncGetAttribute(&n_spills, MC_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
n_spills /= 4;
Py_END_ALLOW_THREADS;

if (PyErr_Occurred()) {
return NULL;
}
return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs, n_spills);
}

static PyObject *setPrintfFifoSize(PyObject *self, PyObject *args) {
long size;
if (!PyArg_ParseTuple(args, "l", &size)) {
return NULL;
}
if (size < 0) {
PyErr_SetString(PyExc_ValueError, "fifo size must be non-negative");
return NULL;
}

Py_BEGIN_ALLOW_THREADS;

// Ensure we have an active context.
// MCcontext ctx = NULL;
// TODO: no MACA equivalent of CU_LIMIT_PRINTF_FIFO_SIZE found
// MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcCtxGetCurrent(&ctx));
// if (!ctx) {
// MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
// mcDevicePrimaryCtxRetain(&ctx, /*device=*/0));
// MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcCtxSetCurrent(ctx));
// }

// // We can't set the fifo size after running a kernel that calls printf. This
// // is true even if the set() call is a nop and the new size is the same as the
// // old size.
// //
// // This is unfriendly, so check if the old size matches the new size, and skip
// // the set() call if so.
// size_t oldSize = 0;
// MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
// mcCtxGetLimit(&oldSize, CU_LIMIT_PRINTF_FIFO_SIZE));
// if (oldSize != size) {
// MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
// mcCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, size));
// }

Py_END_ALLOW_THREADS;
Py_RETURN_NONE; // returning Py_None without incrementing its refcount is a bug
}

static PyMethodDef ModuleMethods[] = {
{"load_binary", loadBinary, METH_VARARGS, "Load provided mcfatbin into MACA driver"},
{"get_device_properties", getDeviceProperties, METH_VARARGS, "Get the properties for a given device"},
{NULL, NULL, 0, NULL} // sentinel
{"load_binary", loadBinary, METH_VARARGS,
"Load provided cubin into CUDA driver"},
{"get_device_properties", getDeviceProperties, METH_VARARGS,
"Get the properties for a given device"},
{"set_printf_fifo_size", setPrintfFifoSize, METH_VARARGS,
"Python interface for cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, x), which "
"controls how many bytes can be streamed from kernels before data starts "
"being dropped. This inherits all the limitations of this call; in "
"particular it's an error to change this value after launching any kernel "
"that calls printf()."},
{NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
PyModuleDef_HEAD_INIT,
"maca_utils",
NULL, //documentation
-1, //size
ModuleMethods
};
static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "maca_utils",
NULL, // documentation
-1, // size
ModuleMethods};

PyMODINIT_FUNC PyInit_maca_utils(void) {
PyObject *m = PyModule_Create(&ModuleDef);
if(m == NULL) {
if (m == NULL) {
return NULL;
}

return m;
}
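Built, maca.c becomes a `maca_utils` extension module (see `PyInit_maca_utils`). A hedged usage sketch from the Python side; the returned shapes follow the C code above, while the file name and kernel name are illustrative:

import maca_utils

props = maca_utils.get_device_properties(0)
print(props["max_shared_mem"], props["multiprocessor_count"], props["warpSize"])

# load_binary takes (name, data, shared, device) and returns
# (module, function, n_regs, n_spills); n_spills is reported in 32-bit
# registers (LOCAL_SIZE_BYTES / 4 in the C code).
with open("kernel.mcfatbin", "rb") as f:  # illustrative path
    mod, fun, n_regs, n_spills = maca_utils.load_binary("my_kernel", f.read(), 0, 0)

maca_utils.set_printf_fifo_size(1 << 20)  # currently a no-op on MACA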
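`gpuAssert` turns every non-`mcSuccess` return into a `RuntimeError` prefixed with `Triton Error [MACA]: `, acquiring the GIL first because `loadBinary` releases it around driver calls. From Python that surfaces as an ordinary exception; a sketch, assuming an out-of-range device id makes a checked call fail:

import maca_utils

try:
    maca_utils.get_device_properties(9999)  # assumed-invalid device id
except RuntimeError as e:
    # The message body comes from mcGetErrorString and may vary by driver.
    assert str(e).startswith("Triton Error [MACA]: ")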