Expose ObjectCode as public API + prune unnecessary input arguments (#435)

ksimpson-work · web-flow · commit 1ac7d2c6de5b · 2025-02-19T12:28:00.000-08:00
diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py
@@ -7,6 +7,7 @@
 from cuda.core.experimental._event import EventOptions
 from cuda.core.experimental._launcher import LaunchConfig, launch
 from cuda.core.experimental._linker import Linker, LinkerOptions
+from cuda.core.experimental._module import ObjectCode
 from cuda.core.experimental._program import Program, ProgramOptions
 from cuda.core.experimental._stream import Stream, StreamOptions
 from cuda.core.experimental._system import System
diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py
@@ -65,9 +65,7 @@ def close(self):
     __slots__ = ("__weakref__", "_mnff", "_timing_disabled", "_busy_waited")
 
     def __init__(self):
-        raise NotImplementedError(
-            "directly creating an Event object can be ambiguous. Please call call Stream.record()."
-        )
+        raise NotImplementedError("directly creating an Event object can be ambiguous. Please call Stream.record().")
 
     @staticmethod
     def _init(options: Optional[EventOptions] = None):
diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
@@ -439,7 +439,7 @@ def link(self, target_type) -> ObjectCode:
                 addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle))
                 code = (ctypes.c_char * size).from_address(addr)
 
-        return ObjectCode(bytes(code), target_type)
+        return ObjectCode._init(bytes(code), target_type)
 
     def get_error_log(self) -> str:
         """Get the error log generated by the linker.
diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-
+from typing import Optional, Union
 from warnings import warn
 
 from cuda.core.experimental._utils import driver, get_binding_version, handle_return, precondition
@@ -213,47 +213,42 @@ def attributes(self):
 
 
 class ObjectCode:
-    """Represent a compiled program that was loaded onto the device.
+    """Represent a compiled program to be loaded onto the device.
 
     This object provides a unified interface for different types of
-    compiled programs that are loaded onto the device.
+    compiled programs that will be loaded onto the device.
 
-    Loads the module library with specified module code and JIT options.
+    Note
+    ----
+    This class has no default constructor. If you already have a cubin that you would
+    like to load, use the :meth:`from_cubin` alternative constructor. For all other
+    possible code types (e.g. "ptx"), only :class:`~cuda.core.experimental.Program`
+    accepts them and returns an :class:`ObjectCode` instance with its
+    :meth:`~cuda.core.experimental.Program.compile` method.
 
     Note
     ----
     Usage under CUDA 11.x will only load to the current device
     context.
-
-    Parameters
-    ----------
-    module : Union[bytes, str]
-        Either a bytes object containing the module to load, or
-        a file path string containing that module for loading.
-    code_type : Any
-        String of the compiled type.
-        Supported options are "ptx", "cubin", "ltoir" and "fatbin".
-    jit_options : Optional
-        Mapping of JIT options to use during module loading.
-        (Default to no options)
-    symbol_mapping : Optional
-        Keyword argument dictionary specifying how symbol names
-        should be mapped before trying to retrieve them.
-        (Default to no mappings)
-
     """
 
-    __slots__ = ("_handle", "_backend_version", "_jit_options", "_code_type", "_module", "_loader", "_sym_map")
+    __slots__ = ("_handle", "_backend_version", "_code_type", "_module", "_loader", "_sym_map")
     _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin")
 
-    def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None):
-        if code_type not in self._supported_code_type:
-            raise ValueError
+    def __init__(self):
+        raise NotImplementedError(
+            "directly creating an ObjectCode object can be ambiguous. Please either call Program.compile() "
+            "or one of the ObjectCode.from_*() constructors"
+        )
+
+    @staticmethod
+    def _init(module, code_type, *, symbol_mapping: Optional[dict] = None):
+        self = ObjectCode.__new__(ObjectCode)
+        assert code_type in self._supported_code_type, f"{code_type=} is not supported"
         _lazy_init()
 
         # handle is assigned during _lazy_load
         self._handle = None
-        self._jit_options = jit_options
 
         self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old"
         self._loader = _backend[self._backend_version]
@@ -262,42 +257,41 @@ def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None):
         self._module = module
         self._sym_map = {} if symbol_mapping is None else symbol_mapping
 
+        return self
+
+    @staticmethod
+    def from_cubin(module: Union[bytes, str], *, symbol_mapping: Optional[dict] = None) -> "ObjectCode":
+        """Create an :class:`ObjectCode` instance from an existing cubin.
+
+        Parameters
+        ----------
+        module : Union[bytes, str]
+            Either a bytes object containing the in-memory cubin to load, or
+            a file path string pointing to the on-disk cubin to load.
+        symbol_mapping : Optional[dict]
+            A dictionary specifying how the unmangled symbol names (as keys)
+            should be mapped to the mangled names before trying to retrieve
+            them (defaults to no mappings).
+        """
+        return ObjectCode._init(module, "cubin", symbol_mapping=symbol_mapping)
+
     # TODO: do we want to unload in a finalizer? Probably not..
 
     def _lazy_load_module(self, *args, **kwargs):
         if self._handle is not None:
             return
-        jit_options = self._jit_options
         module = self._module
         if isinstance(module, str):
-            # TODO: this option is only taken by the new library APIs, but we have
-            # a bug that we can't easily support it just yet (NVIDIA/cuda-python#73).
-            if jit_options is not None:
-                raise ValueError
-            self._handle = handle_return(self._loader["file"](module))
+            if self._backend_version == "new":
+                self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0))
+            else:  # "old" backend
+                self._handle = handle_return(self._loader["file"](module.encode()))
         else:
             assert isinstance(module, bytes)
-            if jit_options is None:
-                jit_options = {}
             if self._backend_version == "new":
-                args = (
-                    module,
-                    list(jit_options.keys()),
-                    list(jit_options.values()),
-                    len(jit_options),
-                    # TODO: support library options
-                    [],
-                    [],
-                    0,
-                )
+                self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0))
             else:  # "old" backend
-                args = (
-                    module,
-                    len(jit_options),
-                    list(jit_options.keys()),
-                    list(jit_options.values()),
-                )
-            self._handle = handle_return(self._loader["data"](*args))
+                self._handle = handle_return(self._loader["data"](module, 0, [], []))
 
     @precondition(_lazy_load_module)
     def get_kernel(self, name):
@@ -314,12 +308,12 @@ def get_kernel(self, name):
             Newly created kernel object.
 
         """
+        if self._code_type not in ("cubin", "ptx", "fatbin"):
+            raise RuntimeError(f"get_kernel() is not supported for {self._code_type}")
         try:
             name = self._sym_map[name]
         except KeyError:
             name = name.encode()
 
         data = handle_return(self._loader["kernel"](self._handle, name))
         return Kernel._from_obj(data, self)
-
-    # TODO: implement from_handle()
diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
@@ -386,7 +386,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             if not isinstance(code, str):
                 raise TypeError("ptx Program expects code argument to be a string")
             self._linker = Linker(
-                ObjectCode(code.encode(), code_type), options=self._translate_program_options(options)
+                ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
             )
             self._backend = "linker"
         else:
@@ -472,7 +472,7 @@ def compile(self, target_type, name_expressions=(), logs=None):
                     handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle)
                     logs.write(log.decode())
 
-            return ObjectCode(data, target_type, symbol_mapping=symbol_mapping)
+            return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping)
 
         if self._backend == "linker":
             return self._linker.link(target_type)
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
@@ -32,6 +32,7 @@ CUDA compilation toolchain
 
    Program
    Linker
+   ObjectCode
 
    :template: dataclass.rst
 
diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst
@@ -3,7 +3,7 @@
 ``cuda.core`` 0.2.0 Release Notes
 =================================
 
-Released on <TODO>, 2024
+Released on <TODO>, 2025
 
 Highlights
 ----------
@@ -12,14 +12,19 @@ Highlights
 - Add :class:`~DeviceProperties` to provide pythonic access to device properties.
 - Add kernel attributes to :class:`~Kernel`
 
-Limitations
------------
-
-- <TODO>
-
 Breaking Changes
 ----------------
 
 - Change ``__cuda_stream__`` from attribute to method
 - The :meth:`~Program.compile` method no longer accepts the `options` argument. Instead, you can optionally pass an instance of :class:`~ProgramOptions` to the constructor of :obj:`~Program`.
-- :meth: `~Device.properties` now provides an instance of :class:`~DeviceProperties` instead of a dictionary. 
+- :meth:`~Device.properties` now provides an instance of :class:`~DeviceProperties` instead of a dictionary.
+
+New features
+------------
+
+- Expose :class:`ObjectCode` as a public API, which allows loading cubins from memory or disk. To load other code types, please continue using :class:`Program`.
+
+Limitations
+-----------
+
+- <TODO>
diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py
@@ -34,14 +34,8 @@
 }
 """
 arch0 = "".join(f"{i}" for i in dev0.compute_capability)
-prog_add = Program(code_add, code_type="c++")
-mod_add = prog_add.compile(
-    "cubin",
-    options=(
-        "-std=c++17",
-        "-arch=sm_" + arch0,
-    ),
-)
+prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{arch0}"})
+mod_add = prog_add.compile("cubin")
 ker_add = mod_add.get_kernel("vector_add")
 
 # Set GPU 1
@@ -63,14 +57,8 @@
 }
 """
 arch1 = "".join(f"{i}" for i in dev1.compute_capability)
-prog_sub = Program(code_sub, code_type="c++")
-mod_sub = prog_sub.compile(
-    "cubin",
-    options=(
-        "-std=c++17",
-        "-arch=sm_" + arch1,
-    ),
-)
+prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{arch1}"})
+mod_sub = prog_sub.compile("cubin")
 ker_sub = mod_sub.get_kernel("vector_sub")
 
 
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
@@ -10,7 +10,7 @@
 import pytest
 from conftest import can_load_generated_ptx
 
-from cuda.core.experimental import Program, ProgramOptions, system
+from cuda.core.experimental import ObjectCode, Program, ProgramOptions, system
 
 
 @pytest.fixture(scope="function")
@@ -37,7 +37,7 @@ def get_saxpy_kernel(init_cuda):
     )
 
     # run in single precision
-    return mod.get_kernel("saxpy<float>")
+    return mod.get_kernel("saxpy<float>"), mod
 
 
 @pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new")
@@ -72,7 +72,7 @@ def test_get_kernel(init_cuda):
     ],
 )
 def test_read_only_kernel_attributes(get_saxpy_kernel, attr, expected_type):
-    kernel = get_saxpy_kernel
+    kernel, _ = get_saxpy_kernel
     method = getattr(kernel.attributes, attr)
     # get the value without providing a device ordinal
     value = method()
@@ -82,3 +82,23 @@ def test_read_only_kernel_attributes(get_saxpy_kernel, attr, expected_type):
     for device in system.devices:
         value = method(device.device_id)
     assert isinstance(value, expected_type), f"Expected {attr} to be of type {expected_type}, but got {type(value)}"
+
+
+def test_object_code_load_cubin(get_saxpy_kernel):
+    _, mod = get_saxpy_kernel
+    cubin = mod._module
+    sym_map = mod._sym_map
+    assert isinstance(cubin, bytes)
+    mod = ObjectCode.from_cubin(cubin, symbol_mapping=sym_map)
+    mod.get_kernel("saxpy<double>")  # force loading
+
+
+def test_object_code_load_cubin_from_file(get_saxpy_kernel, tmp_path):
+    _, mod = get_saxpy_kernel
+    cubin = mod._module
+    sym_map = mod._sym_map
+    assert isinstance(cubin, bytes)
+    cubin_file = tmp_path / "test.cubin"
+    cubin_file.write_bytes(cubin)
+    mod = ObjectCode.from_cubin(str(cubin_file), symbol_mapping=sym_map)
+    mod.get_kernel("saxpy<double>")  # force loading