From d312f5eeca2b2c9d48fa48a8de2ca3a079651331 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 00:35:19 +0000 Subject: [PATCH 01/22] Initial plan From c255dee3e680e0171526d8d8e609623b3803a8fd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 00:47:33 +0000 Subject: [PATCH 02/22] Add CUDA backend support with runtime detection Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- iris/_hip.py | 170 ++++++++++++++++++++++++++++++++++++++++++++ iris/cuda.py | 164 +++++++++++++++++++++++++++++++++++++++++++ iris/hip.py | 194 ++++++++++++--------------------------------------- iris/iris.py | 4 +- 4 files changed, 380 insertions(+), 152 deletions(-) create mode 100644 iris/_hip.py create mode 100644 iris/cuda.py diff --git a/iris/_hip.py b/iris/_hip.py new file mode 100644 index 00000000..f6f4d8ff --- /dev/null +++ b/iris/_hip.py @@ -0,0 +1,170 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +import ctypes +import numpy as np +import sys +import torch +import subprocess +import os + +rt_path = "libamdhip64.so" +hip_runtime = ctypes.cdll.LoadLibrary(rt_path) + + +def hip_try(err): + if err != 0: + hip_runtime.hipGetErrorString.restype = ctypes.c_char_p + error_string = hip_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") + raise RuntimeError(f"HIP error code {err}: {error_string}") + + +class hipIpcMemHandle_t(ctypes.Structure): + _fields_ = [("reserved", ctypes.c_char * 64)] + + +def get_ipc_handle_size(): + """Return the size of IPC handle in bytes (64 for HIP).""" + return 64 + + +def open_ipc_handle(ipc_handle_data, rank): + ptr = ctypes.c_void_p() + hipIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) + hip_runtime.hipIpcOpenMemHandle.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + hipIpcMemHandle_t, + ctypes.c_uint, + ] + if isinstance(ipc_handle_data, np.ndarray): + if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != 64: + raise ValueError("ipc_handle_data must be a 64-element uint8 numpy array") + ipc_handle_bytes = ipc_handle_data.tobytes() + ipc_handle_data = (ctypes.c_char * 64).from_buffer_copy(ipc_handle_bytes) + else: + raise TypeError("ipc_handle_data must be a numpy.ndarray of dtype uint8 with 64 elements") + + raw_memory = ctypes.create_string_buffer(64) + ctypes.memset(raw_memory, 0x00, 64) + ipc_handle_struct = hipIpcMemHandle_t.from_buffer(raw_memory) + ipc_handle_data_bytes = bytes(ipc_handle_data) + ctypes.memmove(raw_memory, ipc_handle_data_bytes, 64) + + hip_try( + hip_runtime.hipIpcOpenMemHandle( + ctypes.byref(ptr), + ipc_handle_struct, + hipIpcMemLazyEnablePeerAccess, + ) + ) + + return ptr.value + + +def get_ipc_handle(ptr, rank): + ipc_handle = hipIpcMemHandle_t() + hip_try(hip_runtime.hipIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) + return ipc_handle + + +def count_devices(): + device_count = ctypes.c_int() + hip_try(hip_runtime.hipGetDeviceCount(ctypes.byref(device_count))) + return device_count.value + + +def set_device(gpu_id): + hip_try(hip_runtime.hipSetDevice(gpu_id)) + + +def get_device_id(): + device_id = ctypes.c_int() + hip_try(hip_runtime.hipGetDevice(ctypes.byref(device_id))) + return device_id.value + + +def get_cu_count(device_id=None): + if device_id is None: + device_id = get_device_id() + + hipDeviceAttributeMultiprocessorCount = 63 + cu_count = ctypes.c_int() + + 
hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(cu_count), hipDeviceAttributeMultiprocessorCount, device_id)) + + return cu_count.value + + +def get_rocm_version(): + major, minor = -1, -1 + + # Try hipconfig --path first + try: + result = subprocess.run(["hipconfig", "--path"], capture_output=True, text=True, check=True) + rocm_path = result.stdout.strip() + except (subprocess.CalledProcessError, FileNotFoundError): + # Then look for $ROCM_PATH environment variable + rocm_path = os.environ.get("ROCM_PATH") + if not rocm_path: + # Finally, try default location + rocm_path = "/opt/rocm" + + # Try to read version from .info/version file + try: + version_file_path = os.path.join(rocm_path, ".info", "version") + with open(version_file_path, "r") as version_file: + version = version_file.readline().strip() + major = int(version.split(".")[0]) + minor = int(version.split(".")[1]) + except (FileNotFoundError, IOError, ValueError, IndexError): + # If we can't read the version file, return -1, -1 + pass + + return (major, minor) + + +def get_wall_clock_rate(device_id): + hipDeviceAttributeWallClockRate = 10017 + wall_clock_rate = ctypes.c_int() + status = hip_runtime.hipDeviceGetAttribute( + ctypes.byref(wall_clock_rate), hipDeviceAttributeWallClockRate, device_id + ) + hip_try(status) + return wall_clock_rate.value + + +def get_arch_string(device_id=None): + if device_id is None: + device_id = get_device_id() + arch_full = torch.cuda.get_device_properties(device_id).gcnArchName + arch_name = arch_full.split(":")[0] + return arch_name + + +def get_num_xcc(device_id=None): + if device_id is None: + device_id = get_device_id() + rocm_major, _ = get_rocm_version() + if rocm_major < 7: + return 8 + hipDeviceAttributeNumberOfXccs = 10018 + xcc_count = ctypes.c_int() + hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(xcc_count), hipDeviceAttributeNumberOfXccs, device_id)) + return xcc_count.value + + +def malloc_fine_grained(size): + hipDeviceMallocFinegrained = 0x1 + ptr = ctypes.c_void_p() + hip_try(hip_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained)) + return ptr + + +def hip_malloc(size): + ptr = ctypes.c_void_p() + hip_try(hip_runtime.hipMalloc(ctypes.byref(ptr), size)) + return ptr + + +def hip_free(ptr): + hip_try(hip_runtime.hipFree(ptr)) diff --git a/iris/cuda.py b/iris/cuda.py new file mode 100644 index 00000000..aa7a04ea --- /dev/null +++ b/iris/cuda.py @@ -0,0 +1,164 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+ +import ctypes +import numpy as np +import sys +import torch +import subprocess +import os + +rt_path = "libcudart.so" +cuda_runtime = ctypes.cdll.LoadLibrary(rt_path) + + +def hip_try(err): + if err != 0: + cuda_runtime.cudaGetErrorString.restype = ctypes.c_char_p + error_string = cuda_runtime.cudaGetErrorString(ctypes.c_int(err)).decode("utf-8") + raise RuntimeError(f"CUDA error code {err}: {error_string}") + + +class hipIpcMemHandle_t(ctypes.Structure): + _fields_ = [("internal", ctypes.c_byte * 128)] + + +def get_ipc_handle_size(): + """Return the size of IPC handle in bytes (128 for CUDA).""" + return 128 + + +def open_ipc_handle(ipc_handle_data, rank): + ptr = ctypes.c_void_p() + cudaIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) + cuda_runtime.cudaIpcOpenMemHandle.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + hipIpcMemHandle_t, + ctypes.c_uint, + ] + if isinstance(ipc_handle_data, np.ndarray): + if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != 128: + raise ValueError("ipc_handle_data must be a 128-element uint8 numpy array") + ipc_handle_bytes = ipc_handle_data.tobytes() + ipc_handle_data = (ctypes.c_char * 128).from_buffer_copy(ipc_handle_bytes) + else: + raise TypeError("ipc_handle_data must be a numpy.ndarray of dtype uint8 with 128 elements") + + raw_memory = ctypes.create_string_buffer(128) + ctypes.memset(raw_memory, 0x00, 128) + ipc_handle_struct = hipIpcMemHandle_t.from_buffer(raw_memory) + ipc_handle_data_bytes = bytes(ipc_handle_data) + ctypes.memmove(raw_memory, ipc_handle_data_bytes, 128) + + hip_try( + cuda_runtime.cudaIpcOpenMemHandle( + ctypes.byref(ptr), + ipc_handle_struct, + cudaIpcMemLazyEnablePeerAccess, + ) + ) + + return ptr.value + + +def get_ipc_handle(ptr, rank): + ipc_handle = hipIpcMemHandle_t() + hip_try(cuda_runtime.cudaIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) + return ipc_handle + + +def count_devices(): + device_count = ctypes.c_int() + hip_try(cuda_runtime.cudaGetDeviceCount(ctypes.byref(device_count))) + return device_count.value + + +def set_device(gpu_id): + hip_try(cuda_runtime.cudaSetDevice(gpu_id)) + + +def get_device_id(): + device_id = ctypes.c_int() + hip_try(cuda_runtime.cudaGetDevice(ctypes.byref(device_id))) + return device_id.value + + +def get_cu_count(device_id=None): + if device_id is None: + device_id = get_device_id() + + cudaDeviceAttributeMultiprocessorCount = 16 + cu_count = ctypes.c_int() + + hip_try( + cuda_runtime.cudaDeviceGetAttribute(ctypes.byref(cu_count), cudaDeviceAttributeMultiprocessorCount, device_id) + ) + + return cu_count.value + + +def get_rocm_version(): + # Return CUDA version instead + major, minor = -1, -1 + + # Try nvcc --version + try: + result = subprocess.run(["nvcc", "--version"], capture_output=True, text=True, check=True) + # Parse version from output like "release 12.0, V12.0.76" + for line in result.stdout.split("\n"): + if "release" in line.lower(): + version_part = line.split("release")[1].strip().split(",")[0] + parts = version_part.split(".") + if len(parts) >= 2: + major = int(parts[0]) + minor = int(parts[1]) + break + except (subprocess.CalledProcessError, FileNotFoundError, ValueError, IndexError): + # If we can't get CUDA version, try environment variable + cuda_version = os.environ.get("CUDA_VERSION") + if cuda_version: + try: + parts = cuda_version.split(".") + major = int(parts[0]) + minor = int(parts[1]) if len(parts) > 1 else 0 + except (ValueError, IndexError): + pass + + return (major, minor) + + +def get_wall_clock_rate(device_id): + 
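+    # Note: the CUDA runtime appears to have no direct analogue of HIP's
+    # hipDeviceAttributeWallClockRate; cudaDevAttrMemoryClockRate (36, in kHz)
+    # is used here as a stand-in for the wall clock rate.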
cudaDevAttrMemoryClockRate = 36 + wall_clock_rate = ctypes.c_int() + status = cuda_runtime.cudaDeviceGetAttribute(ctypes.byref(wall_clock_rate), cudaDevAttrMemoryClockRate, device_id) + hip_try(status) + return wall_clock_rate.value + + +def get_arch_string(device_id=None): + if device_id is None: + device_id = get_device_id() + # For CUDA, get compute capability + device_props = torch.cuda.get_device_properties(device_id) + major = device_props.major + minor = device_props.minor + return f"sm_{major}{minor}" + + +def get_num_xcc(device_id=None): + # XCC is AMD-specific, return 1 for CUDA + return 1 + + +def malloc_fine_grained(size): + return hip_malloc(size) + + +def hip_malloc(size): + ptr = ctypes.c_void_p() + hip_try(cuda_runtime.cudaMalloc(ctypes.byref(ptr), size)) + return ptr + + +def hip_free(ptr): + hip_try(cuda_runtime.cudaFree(ptr)) diff --git a/iris/hip.py b/iris/hip.py index ba9e2051..bf6d73c9 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -1,165 +1,57 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. -import ctypes -import numpy as np -import sys -import torch -import subprocess -import os - -rt_path = "libamdhip64.so" -hip_runtime = ctypes.cdll.LoadLibrary(rt_path) - - -def hip_try(err): - if err != 0: - hip_runtime.hipGetErrorString.restype = ctypes.c_char_p - error_string = hip_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") - raise RuntimeError(f"HIP error code {err}: {error_string}") - - -class hipIpcMemHandle_t(ctypes.Structure): - _fields_ = [("reserved", ctypes.c_char * 64)] - - -def open_ipc_handle(ipc_handle_data, rank): - ptr = ctypes.c_void_p() - hipIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) - hip_runtime.hipIpcOpenMemHandle.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - hipIpcMemHandle_t, - ctypes.c_uint, - ] - if isinstance(ipc_handle_data, np.ndarray): - if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != 64: - raise ValueError("ipc_handle_data must be a 64-element uint8 numpy array") - ipc_handle_bytes = ipc_handle_data.tobytes() - ipc_handle_data = (ctypes.c_char * 64).from_buffer_copy(ipc_handle_bytes) - else: - raise TypeError("ipc_handle_data must be a numpy.ndarray of dtype uint8 with 64 elements") - - raw_memory = ctypes.create_string_buffer(64) - ctypes.memset(raw_memory, 0x00, 64) - ipc_handle_struct = hipIpcMemHandle_t.from_buffer(raw_memory) - ipc_handle_data_bytes = bytes(ipc_handle_data) - ctypes.memmove(raw_memory, ipc_handle_data_bytes, 64) - - hip_try( - hip_runtime.hipIpcOpenMemHandle( - ctypes.byref(ptr), - ipc_handle_struct, - hipIpcMemLazyEnablePeerAccess, - ) - ) - - return ptr.value - - -def get_ipc_handle(ptr, rank): - ipc_handle = hipIpcMemHandle_t() - hip_try(hip_runtime.hipIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) - return ipc_handle - - -def count_devices(): - device_count = ctypes.c_int() - hip_try(hip_runtime.hipGetDeviceCount(ctypes.byref(device_count))) - return device_count.value - - -def set_device(gpu_id): - hip_try(hip_runtime.hipSetDevice(gpu_id)) - +""" +HIP-compatible API facade for Iris. -def get_device_id(): - device_id = ctypes.c_int() - hip_try(hip_runtime.hipGetDevice(ctypes.byref(device_id))) - return device_id.value +This module provides a HIP-compatible interface that transparently redirects +to either the HIP backend (AMD GPUs) or CUDA backend (NVIDIA GPUs) based on +runtime detection or configuration. +The backend is selected based on: +1. IRIS_BACKEND environment variable (set to 'cuda' or 'hip') +2. 
Auto-detection based on available libraries +""" -def get_cu_count(device_id=None): - if device_id is None: - device_id = get_device_id() - - hipDeviceAttributeMultiprocessorCount = 63 - cu_count = ctypes.c_int() - - hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(cu_count), hipDeviceAttributeMultiprocessorCount, device_id)) - - return cu_count.value - - -def get_rocm_version(): - major, minor = -1, -1 +import os +import sys - # Try hipconfig --path first +# Detect backend +def _detect_backend(): + """Detect which backend to use based on environment and available libraries.""" + backend_env = os.environ.get("IRIS_BACKEND", "").lower() + if backend_env in ("cuda", "nvidia"): + return "cuda" + elif backend_env in ("hip", "amd", "rocm"): + return "hip" + + # Auto-detect by trying to load libraries + import ctypes try: - result = subprocess.run(["hipconfig", "--path"], capture_output=True, text=True, check=True) - rocm_path = result.stdout.strip() - except (subprocess.CalledProcessError, FileNotFoundError): - # Then look for $ROCM_PATH environment variable - rocm_path = os.environ.get("ROCM_PATH") - if not rocm_path: - # Finally, try default location - rocm_path = "/opt/rocm" - - # Try to read version from .info/version file + ctypes.cdll.LoadLibrary("libamdhip64.so") + return "hip" + except (OSError, FileNotFoundError): + pass + try: - version_file_path = os.path.join(rocm_path, ".info", "version") - with open(version_file_path, "r") as version_file: - version = version_file.readline().strip() - major = int(version.split(".")[0]) - minor = int(version.split(".")[1]) - except (FileNotFoundError, IOError, ValueError, IndexError): - # If we can't read the version file, return -1, -1 + ctypes.cdll.LoadLibrary("libcudart.so") + return "cuda" + except (OSError, FileNotFoundError): pass - - return (major, minor) - - -def get_wall_clock_rate(device_id): - hipDeviceAttributeWallClockRate = 10017 - wall_clock_rate = ctypes.c_int() - status = hip_runtime.hipDeviceGetAttribute( - ctypes.byref(wall_clock_rate), hipDeviceAttributeWallClockRate, device_id - ) - hip_try(status) - return wall_clock_rate.value - - -def get_arch_string(device_id=None): - if device_id is None: - device_id = get_device_id() - arch_full = torch.cuda.get_device_properties(device_id).gcnArchName - arch_name = arch_full.split(":")[0] - return arch_name - - -def get_num_xcc(device_id=None): - if device_id is None: - device_id = get_device_id() - rocm_major, _ = get_rocm_version() - if rocm_major < 7: - return 8 - hipDeviceAttributeNumberOfXccs = 10018 - xcc_count = ctypes.c_int() - hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(xcc_count), hipDeviceAttributeNumberOfXccs, device_id)) - return xcc_count.value - - -def malloc_fine_grained(size): - hipDeviceMallocFinegrained = 0x1 - ptr = ctypes.c_void_p() - hip_try(hip_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained)) - return ptr + + # Default to hip for backward compatibility + return "hip" -def hip_malloc(size): - ptr = ctypes.c_void_p() - hip_try(hip_runtime.hipMalloc(ctypes.byref(ptr), size)) - return ptr +_backend = _detect_backend() +# Import from the appropriate backend module +if _backend == "cuda": + from iris.cuda import * +else: + from iris._hip import * -def hip_free(ptr): - hip_try(hip_runtime.hipFree(ptr)) +# Make backend information available +def get_backend(): + """Get the currently active backend name ('hip' or 'cuda').""" + return _backend diff --git a/iris/iris.py b/iris/iris.py index be91eacb..5a9b5142 100644 --- 
a/iris/iris.py +++ b/iris/iris.py @@ -39,6 +39,7 @@ get_ipc_handle, open_ipc_handle, get_wall_clock_rate, + get_ipc_handle_size, ) import numpy as np import math @@ -89,7 +90,8 @@ def __init__(self, heap_size=1 << 30): heap_bases = np.zeros(num_ranks, dtype=np.uint64) heap_bases[cur_rank] = heap_base - ipc_handles = np.zeros((num_ranks, 64), dtype=np.uint8) + ipc_handle_size = get_ipc_handle_size() + ipc_handles = np.zeros((num_ranks, ipc_handle_size), dtype=np.uint8) ipc_handle = get_ipc_handle(heap_base_ptr, cur_rank) distributed_barrier() From 259afe6ba00220b732e3a621d251f38b3e3769a7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 00:51:49 +0000 Subject: [PATCH 03/22] Add unit tests and fix linting issues for CUDA backend Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- iris/hip.py | 27 ++- tests/unittests/test_backend_detection.py | 233 ++++++++++++++++++++++ 2 files changed, 254 insertions(+), 6 deletions(-) create mode 100644 tests/unittests/test_backend_detection.py diff --git a/iris/hip.py b/iris/hip.py index bf6d73c9..5d820135 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -15,6 +15,8 @@ import os import sys +import importlib.util + # Detect backend def _detect_backend(): @@ -24,32 +26,45 @@ def _detect_backend(): return "cuda" elif backend_env in ("hip", "amd", "rocm"): return "hip" - + # Auto-detect by trying to load libraries import ctypes + try: ctypes.cdll.LoadLibrary("libamdhip64.so") return "hip" except (OSError, FileNotFoundError): pass - + try: ctypes.cdll.LoadLibrary("libcudart.so") return "cuda" except (OSError, FileNotFoundError): pass - + # Default to hip for backward compatibility return "hip" _backend = _detect_backend() -# Import from the appropriate backend module +# Load the appropriate backend module directly without triggering __init__.py +_module_dir = os.path.dirname(__file__) if _backend == "cuda": - from iris.cuda import * + _module_path = os.path.join(_module_dir, "cuda.py") + _spec = importlib.util.spec_from_file_location("iris._cuda_backend", _module_path) else: - from iris._hip import * + _module_path = os.path.join(_module_dir, "_hip.py") + _spec = importlib.util.spec_from_file_location("iris._hip_backend", _module_path) + +_runtime_module = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_runtime_module) + +# Export all public symbols from the backend module +for _name in dir(_runtime_module): + if not _name.startswith("_"): + globals()[_name] = getattr(_runtime_module, _name) + # Make backend information available def get_backend(): diff --git a/tests/unittests/test_backend_detection.py b/tests/unittests/test_backend_detection.py new file mode 100644 index 00000000..ced3e850 --- /dev/null +++ b/tests/unittests/test_backend_detection.py @@ -0,0 +1,233 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +""" +Unit tests for backend detection and runtime module selection. + +These tests verify that the backend detection logic works correctly +and that the appropriate backend module is selected based on configuration. 
+""" + +import os +import sys +import pytest +import importlib +import importlib.util + + +def test_backend_detection_default(): + """Test that default backend is HIP when no environment variable is set.""" + # Clear any existing IRIS_BACKEND setting + old_env = os.environ.pop("IRIS_BACKEND", None) + + try: + # Load hip.py directly to test detection logic + spec = importlib.util.spec_from_file_location( + "hip_test", + os.path.join(os.path.dirname(__file__), "../../iris/hip.py") + ) + hip_module = importlib.util.module_from_spec(spec) + + # Execute the module - detection happens before trying to load backend + try: + spec.loader.exec_module(hip_module) + except OSError: + # Expected - GPU library not found, but detection logic runs first + pass + + # Check that backend was set to 'hip' (default) + assert hasattr(hip_module, '_backend') + assert hip_module._backend == 'hip', "Default backend should be 'hip'" + + finally: + # Restore environment + if old_env is not None: + os.environ["IRIS_BACKEND"] = old_env + + +def test_backend_detection_cuda_env(): + """Test that CUDA backend is selected when IRIS_BACKEND=cuda.""" + old_env = os.environ.get("IRIS_BACKEND") + + try: + os.environ["IRIS_BACKEND"] = "cuda" + + # Load hip.py directly + spec = importlib.util.spec_from_file_location( + "hip_test_cuda", + os.path.join(os.path.dirname(__file__), "../../iris/hip.py") + ) + hip_module = importlib.util.module_from_spec(spec) + + try: + spec.loader.exec_module(hip_module) + except OSError: + # Expected - GPU library not found + pass + + # Check that backend was set to 'cuda' + assert hasattr(hip_module, '_backend') + assert hip_module._backend == 'cuda', "Backend should be 'cuda' when IRIS_BACKEND=cuda" + + finally: + # Restore environment + if old_env is not None: + os.environ["IRIS_BACKEND"] = old_env + else: + os.environ.pop("IRIS_BACKEND", None) + + +def test_backend_detection_nvidia_alias(): + """Test that CUDA backend is selected when IRIS_BACKEND=nvidia.""" + old_env = os.environ.get("IRIS_BACKEND") + + try: + os.environ["IRIS_BACKEND"] = "nvidia" + + # Load hip.py directly + spec = importlib.util.spec_from_file_location( + "hip_test_nvidia", + os.path.join(os.path.dirname(__file__), "../../iris/hip.py") + ) + hip_module = importlib.util.module_from_spec(spec) + + try: + spec.loader.exec_module(hip_module) + except OSError: + # Expected - GPU library not found + pass + + # Check that backend was set to 'cuda' + assert hasattr(hip_module, '_backend') + assert hip_module._backend == 'cuda', "Backend should be 'cuda' when IRIS_BACKEND=nvidia" + + finally: + # Restore environment + if old_env is not None: + os.environ["IRIS_BACKEND"] = old_env + else: + os.environ.pop("IRIS_BACKEND", None) + + +def test_backend_detection_hip_env(): + """Test that HIP backend is selected when IRIS_BACKEND=hip.""" + old_env = os.environ.get("IRIS_BACKEND") + + try: + os.environ["IRIS_BACKEND"] = "hip" + + # Load hip.py directly + spec = importlib.util.spec_from_file_location( + "hip_test_hip", + os.path.join(os.path.dirname(__file__), "../../iris/hip.py") + ) + hip_module = importlib.util.module_from_spec(spec) + + try: + spec.loader.exec_module(hip_module) + except OSError: + # Expected - GPU library not found + pass + + # Check that backend was set to 'hip' + assert hasattr(hip_module, '_backend') + assert hip_module._backend == 'hip', "Backend should be 'hip' when IRIS_BACKEND=hip" + + finally: + # Restore environment + if old_env is not None: + os.environ["IRIS_BACKEND"] = old_env + else: + 
os.environ.pop("IRIS_BACKEND", None) + + +def test_backend_detection_amd_alias(): + """Test that HIP backend is selected when IRIS_BACKEND=amd.""" + old_env = os.environ.get("IRIS_BACKEND") + + try: + os.environ["IRIS_BACKEND"] = "amd" + + # Load hip.py directly + spec = importlib.util.spec_from_file_location( + "hip_test_amd", + os.path.join(os.path.dirname(__file__), "../../iris/hip.py") + ) + hip_module = importlib.util.module_from_spec(spec) + + try: + spec.loader.exec_module(hip_module) + except OSError: + # Expected - GPU library not found + pass + + # Check that backend was set to 'hip' + assert hasattr(hip_module, '_backend') + assert hip_module._backend == 'hip', "Backend should be 'hip' when IRIS_BACKEND=amd" + + finally: + # Restore environment + if old_env is not None: + os.environ["IRIS_BACKEND"] = old_env + else: + os.environ.pop("IRIS_BACKEND", None) + + +def test_ipc_handle_size_definition(): + """Test that IPC handle size functions are defined correctly in source files.""" + import re + + # Check _hip.py defines get_ipc_handle_size returning 64 + hip_file = os.path.join(os.path.dirname(__file__), "../../iris/_hip.py") + with open(hip_file, 'r') as f: + hip_content = f.read() + + assert 'def get_ipc_handle_size()' in hip_content, "_hip.py should define get_ipc_handle_size" + assert 'return 64' in hip_content, "_hip.py should return 64 for IPC handle size" + + # Check cuda.py defines get_ipc_handle_size returning 128 + cuda_file = os.path.join(os.path.dirname(__file__), "../../iris/cuda.py") + with open(cuda_file, 'r') as f: + cuda_content = f.read() + + assert 'def get_ipc_handle_size()' in cuda_content, "cuda.py should define get_ipc_handle_size" + assert 'return 128' in cuda_content, "cuda.py should return 128 for IPC handle size" + + +def test_hip_module_structure(): + """Test that hip.py has the expected structure for backend redirection.""" + hip_file = os.path.join(os.path.dirname(__file__), "../../iris/hip.py") + with open(hip_file, 'r') as f: + hip_content = f.read() + + # Check for backend detection function + assert '_detect_backend' in hip_content, "hip.py should have _detect_backend function" + + # Check for get_backend function + assert 'def get_backend():' in hip_content, "hip.py should have get_backend function" + + # Check for environment variable handling + assert 'IRIS_BACKEND' in hip_content, "hip.py should check IRIS_BACKEND environment variable" + + # Check for backend aliases + assert 'cuda' in hip_content and 'nvidia' in hip_content, "hip.py should support cuda/nvidia aliases" + assert 'hip' in hip_content and 'amd' in hip_content, "hip.py should support hip/amd aliases" + + +def test_iris_py_uses_get_ipc_handle_size(): + """Test that iris.py uses get_ipc_handle_size from hip module.""" + iris_file = os.path.join(os.path.dirname(__file__), "../../iris/iris.py") + with open(iris_file, 'r') as f: + iris_content = f.read() + + # Check that get_ipc_handle_size is imported + assert 'get_ipc_handle_size' in iris_content, "iris.py should import get_ipc_handle_size" + + # Check that it's used instead of hardcoded 64 + assert 'ipc_handle_size = get_ipc_handle_size()' in iris_content, "iris.py should call get_ipc_handle_size()" + + +if __name__ == "__main__": + # Run tests + pytest.main([__file__, "-v"]) + From 3f3e78e5c69843fefc21a6909c33e0adc3810c7c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 00:52:45 +0000 Subject: [PATCH 04/22] Update README with CUDA backend documentation 
Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- README.md | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2131d3f1..774d0e42 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,9 @@ if __name__ == "__main__": ### Quick Installation > [!NOTE] -> **Requirements**: Python 3.10+, PyTorch 2.0+ (ROCm version), ROCm 6.3.1+ HIP runtime, and Triton +> **Requirements**: Python 3.10+, PyTorch 2.0+, and Triton +> - For AMD GPUs: ROCm 6.3.1+ HIP runtime and PyTorch ROCm version +> - For NVIDIA GPUs: CUDA 11.0+ runtime and PyTorch CUDA version For a quick installation directly from the repository: @@ -111,6 +113,19 @@ For a quick installation directly from the repository: pip install git+https://github.com/ROCm/iris.git ``` +To use Iris with NVIDIA GPUs, set the backend before importing: + +```bash +export IRIS_BACKEND=cuda +``` + +Or install with the backend specified: + +```shell +# Note: Backend selection is via environment variable, not install-time config +IRIS_BACKEND=cuda pip install git+https://github.com/ROCm/iris.git +``` + ### Docker Compose (Recommended for Development) The recommended way to get started is using Docker Compose, which provides a development environment with the Iris directory mounted inside the container. This allows you to make changes to the code outside the container and see them reflected inside. @@ -139,11 +154,44 @@ Check out our [examples](examples/) directory for ready-to-run scripts and usage Iris currently supports: +### AMD GPUs (HIP Backend) - MI300X, MI350X & MI355X +### NVIDIA GPUs (CUDA Backend) +- All CUDA-capable GPUs with compute capability 7.0+ + > [!NOTE] > Iris may work on other AMD GPUs with ROCm compatibility. +### Backend Selection + +Iris automatically detects the available GPU backend at runtime. You can also explicitly select the backend using the `IRIS_BACKEND` environment variable: + +```bash +# Use CUDA backend for NVIDIA GPUs +export IRIS_BACKEND=cuda +# or +export IRIS_BACKEND=nvidia + +# Use HIP backend for AMD GPUs (default) +export IRIS_BACKEND=hip +# or +export IRIS_BACKEND=amd +``` + +Alternatively, set the environment variable in your Python script: + +```python +import os +os.environ["IRIS_BACKEND"] = "cuda" # Must be set before importing iris +import iris +``` + +The backend detection priority is: +1. `IRIS_BACKEND` environment variable +2. Auto-detection based on available GPU libraries +3. Default to HIP for backward compatibility + ## Roadmap We plan to extend Iris with the following features: From c6a9aa3cf77fc7de3e892026c94f96779269c4ca Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 00:54:32 +0000 Subject: [PATCH 05/22] Add CUDA backend example script Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- examples/cuda_backend_example.py | 92 ++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 examples/cuda_backend_example.py diff --git a/examples/cuda_backend_example.py b/examples/cuda_backend_example.py new file mode 100644 index 00000000..ba343b2d --- /dev/null +++ b/examples/cuda_backend_example.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +""" +Example demonstrating how to use Iris with CUDA backend for NVIDIA GPUs. 
+ +This script shows how to set the backend and verify it's correctly loaded. +""" + +import os +import sys + +# Set CUDA backend before importing iris +# This must be done before importing iris to take effect +os.environ["IRIS_BACKEND"] = "cuda" + +# Now import iris - it will use the CUDA backend +import iris + +def main(): + """ + Demonstrate CUDA backend usage with Iris. + + This example shows: + 1. How to set the CUDA backend + 2. How to verify the backend is loaded + 3. Basic Iris operations with CUDA + """ + + print("=" * 60) + print("Iris CUDA Backend Example") + print("=" * 60) + + # Check which backend is being used + try: + backend = iris.hip.get_backend() + print(f"✓ Backend loaded: {backend}") + + if backend == "cuda": + print("✓ Successfully using CUDA backend for NVIDIA GPUs") + else: + print(f"! Note: Using {backend} backend instead of CUDA") + except Exception as e: + print(f"✗ Could not determine backend: {e}") + + # Initialize Iris with a symmetric heap + heap_size = 1 << 30 # 1 GB + print(f"\nInitializing Iris with {heap_size / (1024**3):.1f} GB heap...") + + try: + ctx = iris.iris(heap_size) + print(f"✓ Iris initialized successfully") + print(f" - Rank: {ctx.get_rank()}") + print(f" - Number of ranks: {ctx.get_num_ranks()}") + print(f" - Device: {ctx.get_device()}") + print(f" - Compute units: {ctx.get_cu_count()}") + + # Allocate a tensor on the symmetric heap + print("\nAllocating tensor on symmetric heap...") + tensor = ctx.zeros(1000, 1000, dtype=torch.float32) + print(f"✓ Tensor allocated: shape={tensor.shape}, dtype={tensor.dtype}") + print(f" - On symmetric heap: {ctx._Iris__on_symmetric_heap(tensor)}") + print(f" - Device: {tensor.device}") + + except Exception as e: + print(f"✗ Error initializing Iris: {e}") + print("\nNote: This example requires:") + print(" - NVIDIA GPU with CUDA support") + print(" - PyTorch with CUDA") + print(" - NCCL for distributed operations") + sys.exit(1) + + print("\n" + "=" * 60) + print("Example completed successfully!") + print("=" * 60) + + +if __name__ == "__main__": + import torch + import torch.distributed as dist + + # Check if CUDA is available + if not torch.cuda.is_available(): + print("Error: CUDA is not available. This example requires NVIDIA GPU.") + print("\nTo use Iris with AMD GPUs, use the default HIP backend:") + print(" python your_script.py # No IRIS_BACKEND needed") + sys.exit(1) + + # For this simple example, we'll run single-rank + # For multi-rank examples, see the examples/ directory + main() From 0ba8573ad4863cea0657bc1130a2a40aa9f66aee Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 00:57:52 +0000 Subject: [PATCH 06/22] Fix linting issues in example script Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- examples/cuda_backend_example.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/cuda_backend_example.py b/examples/cuda_backend_example.py index ba343b2d..534e42f3 100644 --- a/examples/cuda_backend_example.py +++ b/examples/cuda_backend_example.py @@ -21,48 +21,48 @@ def main(): """ Demonstrate CUDA backend usage with Iris. - + This example shows: 1. How to set the CUDA backend 2. How to verify the backend is loaded 3. 
Basic Iris operations with CUDA """ - + print("=" * 60) print("Iris CUDA Backend Example") print("=" * 60) - + # Check which backend is being used try: backend = iris.hip.get_backend() print(f"✓ Backend loaded: {backend}") - + if backend == "cuda": print("✓ Successfully using CUDA backend for NVIDIA GPUs") else: print(f"! Note: Using {backend} backend instead of CUDA") except Exception as e: print(f"✗ Could not determine backend: {e}") - + # Initialize Iris with a symmetric heap heap_size = 1 << 30 # 1 GB print(f"\nInitializing Iris with {heap_size / (1024**3):.1f} GB heap...") - + try: ctx = iris.iris(heap_size) - print(f"✓ Iris initialized successfully") + print("✓ Iris initialized successfully") print(f" - Rank: {ctx.get_rank()}") print(f" - Number of ranks: {ctx.get_num_ranks()}") print(f" - Device: {ctx.get_device()}") print(f" - Compute units: {ctx.get_cu_count()}") - + # Allocate a tensor on the symmetric heap print("\nAllocating tensor on symmetric heap...") tensor = ctx.zeros(1000, 1000, dtype=torch.float32) print(f"✓ Tensor allocated: shape={tensor.shape}, dtype={tensor.dtype}") print(f" - On symmetric heap: {ctx._Iris__on_symmetric_heap(tensor)}") print(f" - Device: {tensor.device}") - + except Exception as e: print(f"✗ Error initializing Iris: {e}") print("\nNote: This example requires:") @@ -70,7 +70,7 @@ def main(): print(" - PyTorch with CUDA") print(" - NCCL for distributed operations") sys.exit(1) - + print("\n" + "=" * 60) print("Example completed successfully!") print("=" * 60) @@ -79,14 +79,14 @@ def main(): if __name__ == "__main__": import torch import torch.distributed as dist - + # Check if CUDA is available if not torch.cuda.is_available(): print("Error: CUDA is not available. This example requires NVIDIA GPU.") print("\nTo use Iris with AMD GPUs, use the default HIP backend:") print(" python your_script.py # No IRIS_BACKEND needed") sys.exit(1) - + # For this simple example, we'll run single-rank # For multi-rank examples, see the examples/ directory main() From 28965e69ea9c769f720d825c79209f6cd892b762 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 01:08:29 +0000 Subject: [PATCH 07/22] Switch to build-time backend configuration with --config-settings - Add custom build backend (build_backend.py) to support --config-settings - Update pyproject.toml to use custom build backend - Modify iris/hip.py to prioritize build-time config over env vars - Update README with pip install --config-settings backend=nvidia examples - Add 3 new unit tests for build-time configuration (11 tests total pass) - Update example script to document both build-time and runtime methods - Add iris/.config/ to .gitignore Backend detection priority is now: 1. Build-time configuration (--config-settings backend=nvidia/hip) 2. IRIS_BACKEND environment variable (runtime) 3. Auto-detection based on available libraries 4. 
Default to HIP for backward compatibility Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- .gitignore | 4 +- README.md | 41 ++++-- build_backend.py | 63 ++++++++++ examples/cuda_backend_example.py | 16 ++- iris/hip.py | 27 +++- pyproject.toml | 3 +- tests/unittests/test_backend_detection.py | 146 ++++++++++++++++++++++ 7 files changed, 278 insertions(+), 22 deletions(-) create mode 100644 build_backend.py diff --git a/.gitignore b/.gitignore index cbaac1bc..6242c995 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,6 @@ slurm*.out *.egg-info examples/gemm/results/* -asm/ \ No newline at end of file +asm/ +# Backend configuration (generated at build time) +iris/.config/ diff --git a/README.md b/README.md index 774d0e42..09f7d7ad 100644 --- a/README.md +++ b/README.md @@ -113,19 +113,21 @@ For a quick installation directly from the repository: pip install git+https://github.com/ROCm/iris.git ``` -To use Iris with NVIDIA GPUs, set the backend before importing: +To use Iris with NVIDIA GPUs, install with the CUDA backend: -```bash -export IRIS_BACKEND=cuda +```shell +pip install git+https://github.com/ROCm/iris.git --config-settings backend=nvidia ``` -Or install with the backend specified: +To use Iris with AMD GPUs (default): ```shell -# Note: Backend selection is via environment variable, not install-time config -IRIS_BACKEND=cuda pip install git+https://github.com/ROCm/iris.git +pip install git+https://github.com/ROCm/iris.git --config-settings backend=hip ``` +> [!NOTE] +> The backend can also be controlled at runtime via the `IRIS_BACKEND` environment variable if not set at build time. + ### Docker Compose (Recommended for Development) The recommended way to get started is using Docker Compose, which provides a development environment with the Iris directory mounted inside the container. This allows you to make changes to the code outside the container and see them reflected inside. @@ -165,7 +167,23 @@ Iris currently supports: ### Backend Selection -Iris automatically detects the available GPU backend at runtime. You can also explicitly select the backend using the `IRIS_BACKEND` environment variable: +Iris supports two methods for backend selection: + +#### 1. Build-time Configuration (Recommended) + +Install Iris with the desired backend using `--config-settings`: + +```bash +# For NVIDIA GPUs +pip install git+https://github.com/ROCm/iris.git --config-settings backend=nvidia + +# For AMD GPUs +pip install git+https://github.com/ROCm/iris.git --config-settings backend=hip +``` + +#### 2. Runtime Environment Variable + +If no backend was specified at build time, you can control it via the `IRIS_BACKEND` environment variable: ```bash # Use CUDA backend for NVIDIA GPUs @@ -179,7 +197,7 @@ export IRIS_BACKEND=hip export IRIS_BACKEND=amd ``` -Alternatively, set the environment variable in your Python script: +Or set it in your Python script: ```python import os @@ -188,9 +206,10 @@ import iris ``` The backend detection priority is: -1. `IRIS_BACKEND` environment variable -2. Auto-detection based on available GPU libraries -3. Default to HIP for backward compatibility +1. Build-time configuration (set via `--config-settings`) +2. `IRIS_BACKEND` environment variable +3. Auto-detection based on available GPU libraries +4. 
Default to HIP for backward compatibility ## Roadmap diff --git a/build_backend.py b/build_backend.py new file mode 100644 index 00000000..185668cc --- /dev/null +++ b/build_backend.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +""" +Custom build backend to support backend selection via --config-settings. + +This allows users to install Iris with: + pip install . --config-settings backend=nvidia +or: + pip install . --config-settings backend=hip +""" + +import os +from setuptools import build_meta as _orig + +# Re-export all setuptools build_meta functions +prepare_metadata_for_build_wheel = _orig.prepare_metadata_for_build_wheel +get_requires_for_build_wheel = _orig.get_requires_for_build_wheel +get_requires_for_build_sdist = _orig.get_requires_for_build_sdist + + +def _write_backend_config(config_settings): + """Write backend configuration file based on --config-settings.""" + backend = None + + if config_settings: + backend = config_settings.get("backend", "").lower() + + # Normalize backend names + if backend in ("nvidia", "cuda"): + backend = "cuda" + elif backend in ("amd", "rocm", "hip"): + backend = "hip" + else: + backend = None # Auto-detect at runtime + + # Write configuration file + config_dir = os.path.join("iris", ".config") + os.makedirs(config_dir, exist_ok=True) + + config_file = os.path.join(config_dir, "backend.txt") + if backend: + with open(config_file, "w") as f: + f.write(backend) + print(f"Iris: Configured to use {backend} backend") + else: + # Remove config file if it exists (auto-detect mode) + if os.path.exists(config_file): + os.remove(config_file) + print("Iris: No backend specified, will auto-detect at runtime") + + +# Wrap build functions to inject backend configuration +def build_wheel(wheel_directory, config_settings=None, metadata_directory=None): + """Build wheel with backend configuration.""" + _write_backend_config(config_settings) + return _orig.build_wheel(wheel_directory, config_settings, metadata_directory) + + +def build_sdist(sdist_directory, config_settings=None): + """Build sdist with backend configuration.""" + _write_backend_config(config_settings) + return _orig.build_sdist(sdist_directory, config_settings) diff --git a/examples/cuda_backend_example.py b/examples/cuda_backend_example.py index 534e42f3..98670d24 100644 --- a/examples/cuda_backend_example.py +++ b/examples/cuda_backend_example.py @@ -5,19 +5,29 @@ """ Example demonstrating how to use Iris with CUDA backend for NVIDIA GPUs. -This script shows how to set the backend and verify it's correctly loaded. +This script shows how to use the CUDA backend and verify it's correctly loaded. + +Backend can be configured in two ways: +1. Build-time (recommended): + pip install git+https://github.com/ROCm/iris.git --config-settings backend=nvidia + +2. Runtime (if not set at build time): + export IRIS_BACKEND=cuda + python cuda_backend_example.py """ import os import sys -# Set CUDA backend before importing iris +# Set CUDA backend if not configured at build time # This must be done before importing iris to take effect -os.environ["IRIS_BACKEND"] = "cuda" +if "IRIS_BACKEND" not in os.environ: + os.environ["IRIS_BACKEND"] = "cuda" # Now import iris - it will use the CUDA backend import iris + def main(): """ Demonstrate CUDA backend usage with Iris. 
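For reference, a minimal sketch of how pip's `--config-settings` values reach the custom build backend introduced above (it calls the private `_write_backend_config` helper directly, as the unit tests added in this patch also do):

```python
# Sketch only: exercising build_backend._write_backend_config by hand.
# pip forwards `--config-settings backend=nvidia` to the build hooks as
# config_settings={"backend": "nvidia"}.
from build_backend import _write_backend_config

_write_backend_config({"backend": "nvidia"})  # writes "cuda" to iris/.config/backend.txt
_write_backend_config({"backend": "amd"})     # writes "hip"
_write_backend_config({})                     # no backend key: removes the file (auto-detect at runtime)
```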
diff --git a/iris/hip.py b/iris/hip.py index 5d820135..85c86b8c 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -6,11 +6,12 @@ This module provides a HIP-compatible interface that transparently redirects to either the HIP backend (AMD GPUs) or CUDA backend (NVIDIA GPUs) based on -runtime detection or configuration. +build-time configuration, runtime detection, or auto-detection. -The backend is selected based on: -1. IRIS_BACKEND environment variable (set to 'cuda' or 'hip') -2. Auto-detection based on available libraries +The backend is selected based on (in priority order): +1. Build-time configuration (set via pip install --config-settings backend=nvidia) +2. IRIS_BACKEND environment variable (set to 'cuda' or 'hip') +3. Auto-detection based on available libraries """ import os @@ -20,14 +21,28 @@ # Detect backend def _detect_backend(): - """Detect which backend to use based on environment and available libraries.""" + """Detect which backend to use based on build-time config, environment, and available libraries.""" + # 1. Check for build-time configuration file first + config_file = os.path.join(os.path.dirname(__file__), ".config", "backend.txt") + if os.path.exists(config_file): + try: + with open(config_file, "r") as f: + backend_config = f.read().strip().lower() + if backend_config in ("cuda", "nvidia"): + return "cuda" + elif backend_config in ("hip", "amd", "rocm"): + return "hip" + except (IOError, OSError): + pass + + # 2. Check environment variable backend_env = os.environ.get("IRIS_BACKEND", "").lower() if backend_env in ("cuda", "nvidia"): return "cuda" elif backend_env in ("hip", "amd", "rocm"): return "hip" - # Auto-detect by trying to load libraries + # 3. Auto-detect by trying to load libraries import ctypes try: diff --git a/pyproject.toml b/pyproject.toml index 88f7b2e3..e191a8c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,8 @@ [build-system] requires = ["setuptools>=61", "wheel", "setuptools-scm>=8"] -build-backend = "setuptools.build_meta" +build-backend = "build_backend" +backend-path = ["."] [project] name = "iris" diff --git a/tests/unittests/test_backend_detection.py b/tests/unittests/test_backend_detection.py index ced3e850..cba15093 100644 --- a/tests/unittests/test_backend_detection.py +++ b/tests/unittests/test_backend_detection.py @@ -6,6 +6,12 @@ These tests verify that the backend detection logic works correctly and that the appropriate backend module is selected based on configuration. + +Backend selection priority: +1. Build-time configuration (--config-settings backend=nvidia) +2. IRIS_BACKEND environment variable +3. Auto-detection based on available libraries +4. 
Default to HIP """ import os @@ -231,3 +237,143 @@ def test_iris_py_uses_get_ipc_handle_size(): # Run tests pytest.main([__file__, "-v"]) + + +def test_build_time_config_cuda(): + """Test that build-time configuration for CUDA is respected.""" + import tempfile + import shutil + + # Create a temporary config + config_dir = os.path.join(os.path.dirname(__file__), "../../iris/.config") + os.makedirs(config_dir, exist_ok=True) + config_file = os.path.join(config_dir, "backend.txt") + + old_env = os.environ.get("IRIS_BACKEND") + + try: + # Write CUDA config + with open(config_file, "w") as f: + f.write("cuda") + + # Clear environment variable to test config priority + os.environ.pop("IRIS_BACKEND", None) + + # Load hip.py + spec = importlib.util.spec_from_file_location( + "hip_test_buildtime_cuda", + os.path.join(os.path.dirname(__file__), "../../iris/hip.py"), + ) + hip_module = importlib.util.module_from_spec(spec) + + try: + spec.loader.exec_module(hip_module) + except OSError: + # Expected - GPU library not found + pass + + # Check that backend was set to 'cuda' from config file + assert hasattr(hip_module, "_backend") + assert hip_module._backend == "cuda", "Build-time config should set backend to 'cuda'" + + finally: + # Clean up + if os.path.exists(config_file): + os.remove(config_file) + if os.path.exists(config_dir) and not os.listdir(config_dir): + os.rmdir(config_dir) + # Restore environment + if old_env is not None: + os.environ["IRIS_BACKEND"] = old_env + else: + os.environ.pop("IRIS_BACKEND", None) + + +def test_build_time_config_priority(): + """Test that build-time configuration takes priority over environment variable.""" + config_dir = os.path.join(os.path.dirname(__file__), "../../iris/.config") + os.makedirs(config_dir, exist_ok=True) + config_file = os.path.join(config_dir, "backend.txt") + + old_env = os.environ.get("IRIS_BACKEND") + + try: + # Write HIP config + with open(config_file, "w") as f: + f.write("hip") + + # Set environment to CUDA (should be overridden by config) + os.environ["IRIS_BACKEND"] = "cuda" + + # Load hip.py + spec = importlib.util.spec_from_file_location( + "hip_test_priority", + os.path.join(os.path.dirname(__file__), "../../iris/hip.py"), + ) + hip_module = importlib.util.module_from_spec(spec) + + try: + spec.loader.exec_module(hip_module) + except OSError: + # Expected - GPU library not found + pass + + # Check that config takes priority + assert hasattr(hip_module, "_backend") + assert hip_module._backend == "hip", "Build-time config should take priority over env var" + + finally: + # Clean up + if os.path.exists(config_file): + os.remove(config_file) + if os.path.exists(config_dir) and not os.listdir(config_dir): + os.rmdir(config_dir) + # Restore environment + if old_env is not None: + os.environ["IRIS_BACKEND"] = old_env + else: + os.environ.pop("IRIS_BACKEND", None) + + +def test_build_backend_module(): + """Test that the build_backend module can write configuration correctly.""" + + sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../..")) + + try: + from build_backend import _write_backend_config + + config_dir = os.path.join(os.path.dirname(__file__), "../../iris/.config") + config_file = os.path.join(config_dir, "backend.txt") + + # Test nvidia alias + _write_backend_config({"backend": "nvidia"}) + assert os.path.exists(config_file), "Config file should be created" + with open(config_file, "r") as f: + assert f.read() == "cuda", "nvidia should map to cuda" + + # Test hip + _write_backend_config({"backend": "hip"}) + with 
open(config_file, "r") as f: + assert f.read() == "hip", "hip should stay as hip" + + # Test amd alias + _write_backend_config({"backend": "amd"}) + with open(config_file, "r") as f: + assert f.read() == "hip", "amd should map to hip" + + # Test no config (should remove file) + _write_backend_config({}) + assert not os.path.exists(config_file), "Config file should be removed for auto-detect" + + finally: + # Clean up + if os.path.exists(config_file): + os.remove(config_file) + if os.path.exists(config_dir) and not os.listdir(config_dir): + os.rmdir(config_dir) + + +if __name__ == "__main__": + # Run tests + pytest.main([__file__, "-v"]) From 0a5992dec6ce1bf0cbb41d75b3b0f3e4b5b9d22a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 02:43:59 +0000 Subject: [PATCH 08/22] Remove redundant line from cuda_backend_example.py docstring Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- examples/cuda_backend_example.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/cuda_backend_example.py b/examples/cuda_backend_example.py index 98670d24..e4ee8d34 100644 --- a/examples/cuda_backend_example.py +++ b/examples/cuda_backend_example.py @@ -5,8 +5,6 @@ """ Example demonstrating how to use Iris with CUDA backend for NVIDIA GPUs. -This script shows how to use the CUDA backend and verify it's correctly loaded. - Backend can be configured in two ways: 1. Build-time (recommended): pip install git+https://github.com/ROCm/iris.git --config-settings backend=nvidia From 698604c9817a82e6c40b0180f86265dce9a96d58 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 02:50:39 +0000 Subject: [PATCH 09/22] Add build() function to setup.py for config_settings handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add build() hook in setup.py to handle --config-settings - Sets IRIS_BACKEND environment variable during build - Normalizes backend names (nvidia→cuda, amd/rocm→hip) - Updated build_backend.py to also set env var for consistency - All 11 unit tests pass Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- build_backend.py | 4 ++++ setup.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/build_backend.py b/build_backend.py index 185668cc..06f7e2c9 100644 --- a/build_backend.py +++ b/build_backend.py @@ -34,6 +34,10 @@ def _write_backend_config(config_settings): else: backend = None # Auto-detect at runtime + # Also set environment variable as suggested in setup.py + if backend: + os.environ["IRIS_BACKEND"] = backend + # Write configuration file config_dir = os.path.join("iris", ".config") os.makedirs(config_dir, exist_ok=True) diff --git a/setup.py b/setup.py index 69832461..e7b4cef6 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,36 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +import os from setuptools import setup + +def build(setup_kwargs, config_settings=None): + """ + Build hook to configure backend via --config-settings. + + This function is called during the build process to handle + backend configuration from pip install --config-settings. 
+ """ + if config_settings is None: + config_settings = {} + + backend = config_settings.get("backend", "amd") + + # Normalize backend names + if backend.lower() in ("nvidia", "cuda"): + backend = "cuda" + elif backend.lower() in ("amd", "rocm", "hip"): + backend = "hip" + else: + backend = "hip" # Default to hip + + os.environ["IRIS_BACKEND"] = backend + print(f"Building Iris with backend={backend}") + + return setup_kwargs + + # This setup.py provides backward compatibility for legacy metadata fields # that don't map directly from pyproject.toml's modern PEP 621 format. setup( From fcfd0ccaafb4234603c820baae6f95730492a44c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 02:55:51 +0000 Subject: [PATCH 10/22] Remove example and test files, rename cuda.py to _cuda.py - Remove examples/cuda_backend_example.py (per review feedback) - Remove tests/unittests/test_backend_detection.py (per review feedback) - Rename iris/cuda.py to iris/_cuda.py (per review feedback) - Update iris/hip.py to reference _cuda.py instead of cuda.py Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- examples/cuda_backend_example.py | 100 ------ iris/{cuda.py => _cuda.py} | 0 iris/hip.py | 2 +- tests/unittests/test_backend_detection.py | 379 ---------------------- 4 files changed, 1 insertion(+), 480 deletions(-) delete mode 100644 examples/cuda_backend_example.py rename iris/{cuda.py => _cuda.py} (100%) delete mode 100644 tests/unittests/test_backend_detection.py diff --git a/examples/cuda_backend_example.py b/examples/cuda_backend_example.py deleted file mode 100644 index e4ee8d34..00000000 --- a/examples/cuda_backend_example.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: MIT -# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. - -""" -Example demonstrating how to use Iris with CUDA backend for NVIDIA GPUs. - -Backend can be configured in two ways: -1. Build-time (recommended): - pip install git+https://github.com/ROCm/iris.git --config-settings backend=nvidia - -2. Runtime (if not set at build time): - export IRIS_BACKEND=cuda - python cuda_backend_example.py -""" - -import os -import sys - -# Set CUDA backend if not configured at build time -# This must be done before importing iris to take effect -if "IRIS_BACKEND" not in os.environ: - os.environ["IRIS_BACKEND"] = "cuda" - -# Now import iris - it will use the CUDA backend -import iris - - -def main(): - """ - Demonstrate CUDA backend usage with Iris. - - This example shows: - 1. How to set the CUDA backend - 2. How to verify the backend is loaded - 3. Basic Iris operations with CUDA - """ - - print("=" * 60) - print("Iris CUDA Backend Example") - print("=" * 60) - - # Check which backend is being used - try: - backend = iris.hip.get_backend() - print(f"✓ Backend loaded: {backend}") - - if backend == "cuda": - print("✓ Successfully using CUDA backend for NVIDIA GPUs") - else: - print(f"! 
Note: Using {backend} backend instead of CUDA") - except Exception as e: - print(f"✗ Could not determine backend: {e}") - - # Initialize Iris with a symmetric heap - heap_size = 1 << 30 # 1 GB - print(f"\nInitializing Iris with {heap_size / (1024**3):.1f} GB heap...") - - try: - ctx = iris.iris(heap_size) - print("✓ Iris initialized successfully") - print(f" - Rank: {ctx.get_rank()}") - print(f" - Number of ranks: {ctx.get_num_ranks()}") - print(f" - Device: {ctx.get_device()}") - print(f" - Compute units: {ctx.get_cu_count()}") - - # Allocate a tensor on the symmetric heap - print("\nAllocating tensor on symmetric heap...") - tensor = ctx.zeros(1000, 1000, dtype=torch.float32) - print(f"✓ Tensor allocated: shape={tensor.shape}, dtype={tensor.dtype}") - print(f" - On symmetric heap: {ctx._Iris__on_symmetric_heap(tensor)}") - print(f" - Device: {tensor.device}") - - except Exception as e: - print(f"✗ Error initializing Iris: {e}") - print("\nNote: This example requires:") - print(" - NVIDIA GPU with CUDA support") - print(" - PyTorch with CUDA") - print(" - NCCL for distributed operations") - sys.exit(1) - - print("\n" + "=" * 60) - print("Example completed successfully!") - print("=" * 60) - - -if __name__ == "__main__": - import torch - import torch.distributed as dist - - # Check if CUDA is available - if not torch.cuda.is_available(): - print("Error: CUDA is not available. This example requires NVIDIA GPU.") - print("\nTo use Iris with AMD GPUs, use the default HIP backend:") - print(" python your_script.py # No IRIS_BACKEND needed") - sys.exit(1) - - # For this simple example, we'll run single-rank - # For multi-rank examples, see the examples/ directory - main() diff --git a/iris/cuda.py b/iris/_cuda.py similarity index 100% rename from iris/cuda.py rename to iris/_cuda.py diff --git a/iris/hip.py b/iris/hip.py index 85c86b8c..d7645c39 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -66,7 +66,7 @@ def _detect_backend(): # Load the appropriate backend module directly without triggering __init__.py _module_dir = os.path.dirname(__file__) if _backend == "cuda": - _module_path = os.path.join(_module_dir, "cuda.py") + _module_path = os.path.join(_module_dir, "_cuda.py") _spec = importlib.util.spec_from_file_location("iris._cuda_backend", _module_path) else: _module_path = os.path.join(_module_dir, "_hip.py") diff --git a/tests/unittests/test_backend_detection.py b/tests/unittests/test_backend_detection.py deleted file mode 100644 index cba15093..00000000 --- a/tests/unittests/test_backend_detection.py +++ /dev/null @@ -1,379 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. - -""" -Unit tests for backend detection and runtime module selection. - -These tests verify that the backend detection logic works correctly -and that the appropriate backend module is selected based on configuration. - -Backend selection priority: -1. Build-time configuration (--config-settings backend=nvidia) -2. IRIS_BACKEND environment variable -3. Auto-detection based on available libraries -4. 
Default to HIP -""" - -import os -import sys -import pytest -import importlib -import importlib.util - - -def test_backend_detection_default(): - """Test that default backend is HIP when no environment variable is set.""" - # Clear any existing IRIS_BACKEND setting - old_env = os.environ.pop("IRIS_BACKEND", None) - - try: - # Load hip.py directly to test detection logic - spec = importlib.util.spec_from_file_location( - "hip_test", - os.path.join(os.path.dirname(__file__), "../../iris/hip.py") - ) - hip_module = importlib.util.module_from_spec(spec) - - # Execute the module - detection happens before trying to load backend - try: - spec.loader.exec_module(hip_module) - except OSError: - # Expected - GPU library not found, but detection logic runs first - pass - - # Check that backend was set to 'hip' (default) - assert hasattr(hip_module, '_backend') - assert hip_module._backend == 'hip', "Default backend should be 'hip'" - - finally: - # Restore environment - if old_env is not None: - os.environ["IRIS_BACKEND"] = old_env - - -def test_backend_detection_cuda_env(): - """Test that CUDA backend is selected when IRIS_BACKEND=cuda.""" - old_env = os.environ.get("IRIS_BACKEND") - - try: - os.environ["IRIS_BACKEND"] = "cuda" - - # Load hip.py directly - spec = importlib.util.spec_from_file_location( - "hip_test_cuda", - os.path.join(os.path.dirname(__file__), "../../iris/hip.py") - ) - hip_module = importlib.util.module_from_spec(spec) - - try: - spec.loader.exec_module(hip_module) - except OSError: - # Expected - GPU library not found - pass - - # Check that backend was set to 'cuda' - assert hasattr(hip_module, '_backend') - assert hip_module._backend == 'cuda', "Backend should be 'cuda' when IRIS_BACKEND=cuda" - - finally: - # Restore environment - if old_env is not None: - os.environ["IRIS_BACKEND"] = old_env - else: - os.environ.pop("IRIS_BACKEND", None) - - -def test_backend_detection_nvidia_alias(): - """Test that CUDA backend is selected when IRIS_BACKEND=nvidia.""" - old_env = os.environ.get("IRIS_BACKEND") - - try: - os.environ["IRIS_BACKEND"] = "nvidia" - - # Load hip.py directly - spec = importlib.util.spec_from_file_location( - "hip_test_nvidia", - os.path.join(os.path.dirname(__file__), "../../iris/hip.py") - ) - hip_module = importlib.util.module_from_spec(spec) - - try: - spec.loader.exec_module(hip_module) - except OSError: - # Expected - GPU library not found - pass - - # Check that backend was set to 'cuda' - assert hasattr(hip_module, '_backend') - assert hip_module._backend == 'cuda', "Backend should be 'cuda' when IRIS_BACKEND=nvidia" - - finally: - # Restore environment - if old_env is not None: - os.environ["IRIS_BACKEND"] = old_env - else: - os.environ.pop("IRIS_BACKEND", None) - - -def test_backend_detection_hip_env(): - """Test that HIP backend is selected when IRIS_BACKEND=hip.""" - old_env = os.environ.get("IRIS_BACKEND") - - try: - os.environ["IRIS_BACKEND"] = "hip" - - # Load hip.py directly - spec = importlib.util.spec_from_file_location( - "hip_test_hip", - os.path.join(os.path.dirname(__file__), "../../iris/hip.py") - ) - hip_module = importlib.util.module_from_spec(spec) - - try: - spec.loader.exec_module(hip_module) - except OSError: - # Expected - GPU library not found - pass - - # Check that backend was set to 'hip' - assert hasattr(hip_module, '_backend') - assert hip_module._backend == 'hip', "Backend should be 'hip' when IRIS_BACKEND=hip" - - finally: - # Restore environment - if old_env is not None: - os.environ["IRIS_BACKEND"] = old_env - else: - 
os.environ.pop("IRIS_BACKEND", None) - - -def test_backend_detection_amd_alias(): - """Test that HIP backend is selected when IRIS_BACKEND=amd.""" - old_env = os.environ.get("IRIS_BACKEND") - - try: - os.environ["IRIS_BACKEND"] = "amd" - - # Load hip.py directly - spec = importlib.util.spec_from_file_location( - "hip_test_amd", - os.path.join(os.path.dirname(__file__), "../../iris/hip.py") - ) - hip_module = importlib.util.module_from_spec(spec) - - try: - spec.loader.exec_module(hip_module) - except OSError: - # Expected - GPU library not found - pass - - # Check that backend was set to 'hip' - assert hasattr(hip_module, '_backend') - assert hip_module._backend == 'hip', "Backend should be 'hip' when IRIS_BACKEND=amd" - - finally: - # Restore environment - if old_env is not None: - os.environ["IRIS_BACKEND"] = old_env - else: - os.environ.pop("IRIS_BACKEND", None) - - -def test_ipc_handle_size_definition(): - """Test that IPC handle size functions are defined correctly in source files.""" - import re - - # Check _hip.py defines get_ipc_handle_size returning 64 - hip_file = os.path.join(os.path.dirname(__file__), "../../iris/_hip.py") - with open(hip_file, 'r') as f: - hip_content = f.read() - - assert 'def get_ipc_handle_size()' in hip_content, "_hip.py should define get_ipc_handle_size" - assert 'return 64' in hip_content, "_hip.py should return 64 for IPC handle size" - - # Check cuda.py defines get_ipc_handle_size returning 128 - cuda_file = os.path.join(os.path.dirname(__file__), "../../iris/cuda.py") - with open(cuda_file, 'r') as f: - cuda_content = f.read() - - assert 'def get_ipc_handle_size()' in cuda_content, "cuda.py should define get_ipc_handle_size" - assert 'return 128' in cuda_content, "cuda.py should return 128 for IPC handle size" - - -def test_hip_module_structure(): - """Test that hip.py has the expected structure for backend redirection.""" - hip_file = os.path.join(os.path.dirname(__file__), "../../iris/hip.py") - with open(hip_file, 'r') as f: - hip_content = f.read() - - # Check for backend detection function - assert '_detect_backend' in hip_content, "hip.py should have _detect_backend function" - - # Check for get_backend function - assert 'def get_backend():' in hip_content, "hip.py should have get_backend function" - - # Check for environment variable handling - assert 'IRIS_BACKEND' in hip_content, "hip.py should check IRIS_BACKEND environment variable" - - # Check for backend aliases - assert 'cuda' in hip_content and 'nvidia' in hip_content, "hip.py should support cuda/nvidia aliases" - assert 'hip' in hip_content and 'amd' in hip_content, "hip.py should support hip/amd aliases" - - -def test_iris_py_uses_get_ipc_handle_size(): - """Test that iris.py uses get_ipc_handle_size from hip module.""" - iris_file = os.path.join(os.path.dirname(__file__), "../../iris/iris.py") - with open(iris_file, 'r') as f: - iris_content = f.read() - - # Check that get_ipc_handle_size is imported - assert 'get_ipc_handle_size' in iris_content, "iris.py should import get_ipc_handle_size" - - # Check that it's used instead of hardcoded 64 - assert 'ipc_handle_size = get_ipc_handle_size()' in iris_content, "iris.py should call get_ipc_handle_size()" - - -if __name__ == "__main__": - # Run tests - pytest.main([__file__, "-v"]) - - - -def test_build_time_config_cuda(): - """Test that build-time configuration for CUDA is respected.""" - import tempfile - import shutil - - # Create a temporary config - config_dir = os.path.join(os.path.dirname(__file__), "../../iris/.config") - 
os.makedirs(config_dir, exist_ok=True) - config_file = os.path.join(config_dir, "backend.txt") - - old_env = os.environ.get("IRIS_BACKEND") - - try: - # Write CUDA config - with open(config_file, "w") as f: - f.write("cuda") - - # Clear environment variable to test config priority - os.environ.pop("IRIS_BACKEND", None) - - # Load hip.py - spec = importlib.util.spec_from_file_location( - "hip_test_buildtime_cuda", - os.path.join(os.path.dirname(__file__), "../../iris/hip.py"), - ) - hip_module = importlib.util.module_from_spec(spec) - - try: - spec.loader.exec_module(hip_module) - except OSError: - # Expected - GPU library not found - pass - - # Check that backend was set to 'cuda' from config file - assert hasattr(hip_module, "_backend") - assert hip_module._backend == "cuda", "Build-time config should set backend to 'cuda'" - - finally: - # Clean up - if os.path.exists(config_file): - os.remove(config_file) - if os.path.exists(config_dir) and not os.listdir(config_dir): - os.rmdir(config_dir) - # Restore environment - if old_env is not None: - os.environ["IRIS_BACKEND"] = old_env - else: - os.environ.pop("IRIS_BACKEND", None) - - -def test_build_time_config_priority(): - """Test that build-time configuration takes priority over environment variable.""" - config_dir = os.path.join(os.path.dirname(__file__), "../../iris/.config") - os.makedirs(config_dir, exist_ok=True) - config_file = os.path.join(config_dir, "backend.txt") - - old_env = os.environ.get("IRIS_BACKEND") - - try: - # Write HIP config - with open(config_file, "w") as f: - f.write("hip") - - # Set environment to CUDA (should be overridden by config) - os.environ["IRIS_BACKEND"] = "cuda" - - # Load hip.py - spec = importlib.util.spec_from_file_location( - "hip_test_priority", - os.path.join(os.path.dirname(__file__), "../../iris/hip.py"), - ) - hip_module = importlib.util.module_from_spec(spec) - - try: - spec.loader.exec_module(hip_module) - except OSError: - # Expected - GPU library not found - pass - - # Check that config takes priority - assert hasattr(hip_module, "_backend") - assert hip_module._backend == "hip", "Build-time config should take priority over env var" - - finally: - # Clean up - if os.path.exists(config_file): - os.remove(config_file) - if os.path.exists(config_dir) and not os.listdir(config_dir): - os.rmdir(config_dir) - # Restore environment - if old_env is not None: - os.environ["IRIS_BACKEND"] = old_env - else: - os.environ.pop("IRIS_BACKEND", None) - - -def test_build_backend_module(): - """Test that the build_backend module can write configuration correctly.""" - - sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../..")) - - try: - from build_backend import _write_backend_config - - config_dir = os.path.join(os.path.dirname(__file__), "../../iris/.config") - config_file = os.path.join(config_dir, "backend.txt") - - # Test nvidia alias - _write_backend_config({"backend": "nvidia"}) - assert os.path.exists(config_file), "Config file should be created" - with open(config_file, "r") as f: - assert f.read() == "cuda", "nvidia should map to cuda" - - # Test hip - _write_backend_config({"backend": "hip"}) - with open(config_file, "r") as f: - assert f.read() == "hip", "hip should stay as hip" - - # Test amd alias - _write_backend_config({"backend": "amd"}) - with open(config_file, "r") as f: - assert f.read() == "hip", "amd should map to hip" - - # Test no config (should remove file) - _write_backend_config({}) - assert not os.path.exists(config_file), "Config file should be removed for 
auto-detect" - - finally: - # Clean up - if os.path.exists(config_file): - os.remove(config_file) - if os.path.exists(config_dir) and not os.listdir(config_dir): - os.rmdir(config_dir) - - -if __name__ == "__main__": - # Run tests - pytest.main([__file__, "-v"]) From a162b9e4df17a53d12089d621f820fb2c928af83 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 03:01:34 +0000 Subject: [PATCH 11/22] Replace env var approach with _backend_selected.py file - Change build() in setup.py to write iris/_backend_selected.py instead of setting env var - Update build_backend.py to write _backend_selected.py instead of .config/backend.txt - Update iris/hip.py to import _backend_selected module instead of reading file - Update .gitignore to track _backend_selected.py instead of .config/ - Change default backend from "amd" to "hip" in setup.py - Remove environment variable usage per feedback Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- .gitignore | 3 ++- build_backend.py | 20 ++++++-------------- iris/hip.py | 21 ++++++++++----------- setup.py | 6 ++++-- 4 files changed, 22 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index 6242c995..5ce8808f 100644 --- a/.gitignore +++ b/.gitignore @@ -28,5 +28,6 @@ slurm*.out examples/gemm/results/* asm/ + # Backend configuration (generated at build time) -iris/.config/ +iris/_backend_selected.py diff --git a/build_backend.py b/build_backend.py index 06f7e2c9..e2c21c04 100644 --- a/build_backend.py +++ b/build_backend.py @@ -34,23 +34,15 @@ def _write_backend_config(config_settings): else: backend = None # Auto-detect at runtime - # Also set environment variable as suggested in setup.py + # Write backend selection to a Python file if backend: - os.environ["IRIS_BACKEND"] = backend - - # Write configuration file - config_dir = os.path.join("iris", ".config") - os.makedirs(config_dir, exist_ok=True) - - config_file = os.path.join(config_dir, "backend.txt") - if backend: - with open(config_file, "w") as f: - f.write(backend) + with open("iris/_backend_selected.py", "w") as f: + f.write(f'BACKEND = "{backend}"\n') print(f"Iris: Configured to use {backend} backend") else: - # Remove config file if it exists (auto-detect mode) - if os.path.exists(config_file): - os.remove(config_file) + # Remove file if it exists (auto-detect mode) + if os.path.exists("iris/_backend_selected.py"): + os.remove("iris/_backend_selected.py") print("Iris: No backend specified, will auto-detect at runtime") diff --git a/iris/hip.py b/iris/hip.py index d7645c39..26db03b9 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -23,17 +23,16 @@ def _detect_backend(): """Detect which backend to use based on build-time config, environment, and available libraries.""" # 1. Check for build-time configuration file first - config_file = os.path.join(os.path.dirname(__file__), ".config", "backend.txt") - if os.path.exists(config_file): - try: - with open(config_file, "r") as f: - backend_config = f.read().strip().lower() - if backend_config in ("cuda", "nvidia"): - return "cuda" - elif backend_config in ("hip", "amd", "rocm"): - return "hip" - except (IOError, OSError): - pass + try: + from . import _backend_selected + + backend_config = _backend_selected.BACKEND.lower() + if backend_config in ("cuda", "nvidia"): + return "cuda" + elif backend_config in ("hip", "amd", "rocm"): + return "hip" + except (ImportError, AttributeError): + pass # 2. 
Check environment variable backend_env = os.environ.get("IRIS_BACKEND", "").lower() diff --git a/setup.py b/setup.py index e7b4cef6..49454359 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def build(setup_kwargs, config_settings=None): if config_settings is None: config_settings = {} - backend = config_settings.get("backend", "amd") + backend = config_settings.get("backend", "hip") # Normalize backend names if backend.lower() in ("nvidia", "cuda"): @@ -25,7 +25,9 @@ def build(setup_kwargs, config_settings=None): else: backend = "hip" # Default to hip - os.environ["IRIS_BACKEND"] = backend + # Write backend selection to a Python file + with open("iris/_backend_selected.py", "w") as f: + f.write(f'BACKEND = "{backend}"\n') print(f"Building Iris with backend={backend}") return setup_kwargs From eaf980fbe3609ed4d80504e3a39c1be973b52213 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 03:06:40 +0000 Subject: [PATCH 12/22] Remove get_rocm_version() function from _cuda.py - Removed get_rocm_version() function as it's not used in CUDA backend - Function is ROCm-specific and not needed for NVIDIA GPUs - get_num_xcc() in CUDA backend returns 1 directly without version check Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- iris/_cuda.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/iris/_cuda.py b/iris/_cuda.py index aa7a04ea..db227d3a 100644 --- a/iris/_cuda.py +++ b/iris/_cuda.py @@ -97,36 +97,6 @@ def get_cu_count(device_id=None): return cu_count.value -def get_rocm_version(): - # Return CUDA version instead - major, minor = -1, -1 - - # Try nvcc --version - try: - result = subprocess.run(["nvcc", "--version"], capture_output=True, text=True, check=True) - # Parse version from output like "release 12.0, V12.0.76" - for line in result.stdout.split("\n"): - if "release" in line.lower(): - version_part = line.split("release")[1].strip().split(",")[0] - parts = version_part.split(".") - if len(parts) >= 2: - major = int(parts[0]) - minor = int(parts[1]) - break - except (subprocess.CalledProcessError, FileNotFoundError, ValueError, IndexError): - # If we can't get CUDA version, try environment variable - cuda_version = os.environ.get("CUDA_VERSION") - if cuda_version: - try: - parts = cuda_version.split(".") - major = int(parts[0]) - minor = int(parts[1]) if len(parts) > 1 else 0 - except (ValueError, IndexError): - pass - - return (major, minor) - - def get_wall_clock_rate(device_id): cudaDevAttrMemoryClockRate = 36 wall_clock_rate = ctypes.c_int() From ee8a18af7d2619a4c6f5287e1d0451d96971dcc8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 03:11:41 +0000 Subject: [PATCH 13/22] Simplify backend loading and remove unused functions - Remove malloc/free functions from _cuda.py (not used) - Simplify iris/hip.py to use direct imports instead of dynamic loading - Remove importlib.util dependency - Cleaner and more readable code structure Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- iris/_cuda.py | 14 -------------- iris/hip.py | 19 +++---------------- 2 files changed, 3 insertions(+), 30 deletions(-) diff --git a/iris/_cuda.py b/iris/_cuda.py index db227d3a..958a5d74 100644 --- a/iris/_cuda.py +++ b/iris/_cuda.py @@ -118,17 +118,3 @@ def get_arch_string(device_id=None): def get_num_xcc(device_id=None): # XCC is AMD-specific, return 1 for CUDA return 1 - - 
-def malloc_fine_grained(size): - return hip_malloc(size) - - -def hip_malloc(size): - ptr = ctypes.c_void_p() - hip_try(cuda_runtime.cudaMalloc(ctypes.byref(ptr), size)) - return ptr - - -def hip_free(ptr): - hip_try(cuda_runtime.cudaFree(ptr)) diff --git a/iris/hip.py b/iris/hip.py index 26db03b9..1415c293 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -15,8 +15,6 @@ """ import os -import sys -import importlib.util # Detect backend @@ -62,22 +60,11 @@ def _detect_backend(): _backend = _detect_backend() -# Load the appropriate backend module directly without triggering __init__.py -_module_dir = os.path.dirname(__file__) +# Import all public symbols from the appropriate backend module if _backend == "cuda": - _module_path = os.path.join(_module_dir, "_cuda.py") - _spec = importlib.util.spec_from_file_location("iris._cuda_backend", _module_path) + from iris._cuda import * # noqa: F403, F401 else: - _module_path = os.path.join(_module_dir, "_hip.py") - _spec = importlib.util.spec_from_file_location("iris._hip_backend", _module_path) - -_runtime_module = importlib.util.module_from_spec(_spec) -_spec.loader.exec_module(_runtime_module) - -# Export all public symbols from the backend module -for _name in dir(_runtime_module): - if not _name.startswith("_"): - globals()[_name] = getattr(_runtime_module, _name) + from iris._hip import * # noqa: F403, F401 # Make backend information available From cde3ab8b8758fb943762cc65c30cb361521a40d2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 04:14:51 +0000 Subject: [PATCH 14/22] Remove build_backend.py and all environment variable support - Delete build_backend.py (setup.py handles --config-settings) - Update pyproject.toml to use setuptools.build_meta directly - Remove IRIS_BACKEND environment variable checking from iris/hip.py - Remove all env var documentation from README.md - Backend now selected via: 1) build-time config, 2) auto-detection, 3) default to HIP Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- README.md | 39 ++------------------------------ build_backend.py | 59 ------------------------------------------------ iris/hip.py | 20 ++++------------ pyproject.toml | 3 +-- 4 files changed, 8 insertions(+), 113 deletions(-) delete mode 100644 build_backend.py diff --git a/README.md b/README.md index 09f7d7ad..8773f19f 100644 --- a/README.md +++ b/README.md @@ -125,9 +125,6 @@ To use Iris with AMD GPUs (default): pip install git+https://github.com/ROCm/iris.git --config-settings backend=hip ``` -> [!NOTE] -> The backend can also be controlled at runtime via the `IRIS_BACKEND` environment variable if not set at build time. - ### Docker Compose (Recommended for Development) The recommended way to get started is using Docker Compose, which provides a development environment with the Iris directory mounted inside the container. This allows you to make changes to the code outside the container and see them reflected inside. @@ -167,11 +164,7 @@ Iris currently supports: ### Backend Selection -Iris supports two methods for backend selection: - -#### 1. Build-time Configuration (Recommended) - -Install Iris with the desired backend using `--config-settings`: +Iris supports backend selection at build time using `--config-settings`: ```bash # For NVIDIA GPUs @@ -181,35 +174,7 @@ pip install git+https://github.com/ROCm/iris.git --config-settings backend=nvidi pip install git+https://github.com/ROCm/iris.git --config-settings backend=hip ``` -#### 2. 
Runtime Environment Variable - -If no backend was specified at build time, you can control it via the `IRIS_BACKEND` environment variable: - -```bash -# Use CUDA backend for NVIDIA GPUs -export IRIS_BACKEND=cuda -# or -export IRIS_BACKEND=nvidia - -# Use HIP backend for AMD GPUs (default) -export IRIS_BACKEND=hip -# or -export IRIS_BACKEND=amd -``` - -Or set it in your Python script: - -```python -import os -os.environ["IRIS_BACKEND"] = "cuda" # Must be set before importing iris -import iris -``` - -The backend detection priority is: -1. Build-time configuration (set via `--config-settings`) -2. `IRIS_BACKEND` environment variable -3. Auto-detection based on available GPU libraries -4. Default to HIP for backward compatibility +If no backend is specified at build time, Iris will auto-detect the appropriate backend based on available GPU libraries (libamdhip64.so for AMD, libcudart.so for NVIDIA), defaulting to HIP if neither is found. ## Roadmap diff --git a/build_backend.py b/build_backend.py deleted file mode 100644 index e2c21c04..00000000 --- a/build_backend.py +++ /dev/null @@ -1,59 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. - -""" -Custom build backend to support backend selection via --config-settings. - -This allows users to install Iris with: - pip install . --config-settings backend=nvidia -or: - pip install . --config-settings backend=hip -""" - -import os -from setuptools import build_meta as _orig - -# Re-export all setuptools build_meta functions -prepare_metadata_for_build_wheel = _orig.prepare_metadata_for_build_wheel -get_requires_for_build_wheel = _orig.get_requires_for_build_wheel -get_requires_for_build_sdist = _orig.get_requires_for_build_sdist - - -def _write_backend_config(config_settings): - """Write backend configuration file based on --config-settings.""" - backend = None - - if config_settings: - backend = config_settings.get("backend", "").lower() - - # Normalize backend names - if backend in ("nvidia", "cuda"): - backend = "cuda" - elif backend in ("amd", "rocm", "hip"): - backend = "hip" - else: - backend = None # Auto-detect at runtime - - # Write backend selection to a Python file - if backend: - with open("iris/_backend_selected.py", "w") as f: - f.write(f'BACKEND = "{backend}"\n') - print(f"Iris: Configured to use {backend} backend") - else: - # Remove file if it exists (auto-detect mode) - if os.path.exists("iris/_backend_selected.py"): - os.remove("iris/_backend_selected.py") - print("Iris: No backend specified, will auto-detect at runtime") - - -# Wrap build functions to inject backend configuration -def build_wheel(wheel_directory, config_settings=None, metadata_directory=None): - """Build wheel with backend configuration.""" - _write_backend_config(config_settings) - return _orig.build_wheel(wheel_directory, config_settings, metadata_directory) - - -def build_sdist(sdist_directory, config_settings=None): - """Build sdist with backend configuration.""" - _write_backend_config(config_settings) - return _orig.build_sdist(sdist_directory, config_settings) diff --git a/iris/hip.py b/iris/hip.py index 1415c293..6f7a9782 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -6,20 +6,19 @@ This module provides a HIP-compatible interface that transparently redirects to either the HIP backend (AMD GPUs) or CUDA backend (NVIDIA GPUs) based on -build-time configuration, runtime detection, or auto-detection. +build-time configuration or auto-detection. 
The backend is selected based on (in priority order): 1. Build-time configuration (set via pip install --config-settings backend=nvidia) -2. IRIS_BACKEND environment variable (set to 'cuda' or 'hip') -3. Auto-detection based on available libraries +2. Auto-detection based on available libraries """ -import os +import ctypes # Detect backend def _detect_backend(): - """Detect which backend to use based on build-time config, environment, and available libraries.""" + """Detect which backend to use based on build-time config and available libraries.""" # 1. Check for build-time configuration file first try: from . import _backend_selected @@ -32,16 +31,7 @@ def _detect_backend(): except (ImportError, AttributeError): pass - # 2. Check environment variable - backend_env = os.environ.get("IRIS_BACKEND", "").lower() - if backend_env in ("cuda", "nvidia"): - return "cuda" - elif backend_env in ("hip", "amd", "rocm"): - return "hip" - - # 3. Auto-detect by trying to load libraries - import ctypes - + # 2. Auto-detect by trying to load libraries try: ctypes.cdll.LoadLibrary("libamdhip64.so") return "hip" diff --git a/pyproject.toml b/pyproject.toml index e191a8c5..88f7b2e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,8 +3,7 @@ [build-system] requires = ["setuptools>=61", "wheel", "setuptools-scm>=8"] -build-backend = "build_backend" -backend-path = ["."] +build-backend = "setuptools.build_meta" [project] name = "iris" From 96f462d42251450cb23d33f20a4ed688d974f8f9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 04:24:00 +0000 Subject: [PATCH 15/22] Revert README.md to original state - Restore README.md to state before PR changes - Keep only code changes without documentation updates - All linting checks pass Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- README.md | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/README.md b/README.md index 8773f19f..2131d3f1 100644 --- a/README.md +++ b/README.md @@ -103,9 +103,7 @@ if __name__ == "__main__": ### Quick Installation > [!NOTE] -> **Requirements**: Python 3.10+, PyTorch 2.0+, and Triton -> - For AMD GPUs: ROCm 6.3.1+ HIP runtime and PyTorch ROCm version -> - For NVIDIA GPUs: CUDA 11.0+ runtime and PyTorch CUDA version +> **Requirements**: Python 3.10+, PyTorch 2.0+ (ROCm version), ROCm 6.3.1+ HIP runtime, and Triton For a quick installation directly from the repository: @@ -113,18 +111,6 @@ For a quick installation directly from the repository: pip install git+https://github.com/ROCm/iris.git ``` -To use Iris with NVIDIA GPUs, install with the CUDA backend: - -```shell -pip install git+https://github.com/ROCm/iris.git --config-settings backend=nvidia -``` - -To use Iris with AMD GPUs (default): - -```shell -pip install git+https://github.com/ROCm/iris.git --config-settings backend=hip -``` - ### Docker Compose (Recommended for Development) The recommended way to get started is using Docker Compose, which provides a development environment with the Iris directory mounted inside the container. This allows you to make changes to the code outside the container and see them reflected inside. 
@@ -153,29 +139,11 @@ Check out our [examples](examples/) directory for ready-to-run scripts and usage Iris currently supports: -### AMD GPUs (HIP Backend) - MI300X, MI350X & MI355X -### NVIDIA GPUs (CUDA Backend) -- All CUDA-capable GPUs with compute capability 7.0+ - > [!NOTE] > Iris may work on other AMD GPUs with ROCm compatibility. -### Backend Selection - -Iris supports backend selection at build time using `--config-settings`: - -```bash -# For NVIDIA GPUs -pip install git+https://github.com/ROCm/iris.git --config-settings backend=nvidia - -# For AMD GPUs -pip install git+https://github.com/ROCm/iris.git --config-settings backend=hip -``` - -If no backend is specified at build time, Iris will auto-detect the appropriate backend based on available GPU libraries (libamdhip64.so for AMD, libcudart.so for NVIDIA), defaulting to HIP if neither is found. - ## Roadmap We plan to extend Iris with the following features: From c64e4ed65fec2a4288be1d4bb2a8f009294be06a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 05:21:05 +0000 Subject: [PATCH 16/22] Simplify to auto-detection only, remove build() hook - Remove build() function from setup.py (back to original) - Simplify iris/hip.py to use only auto-detection (no build-time config) - Remove _backend_selected.py from .gitignore (not used) - Backend selection now purely based on library availability - Minimal changes - keeps only essential backend auto-detection Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- .gitignore | 3 +-- iris/hip.py | 22 +++------------------- setup.py | 30 ------------------------------ 3 files changed, 4 insertions(+), 51 deletions(-) diff --git a/.gitignore b/.gitignore index 5ce8808f..78645c5a 100644 --- a/.gitignore +++ b/.gitignore @@ -29,5 +29,4 @@ slurm*.out examples/gemm/results/* asm/ -# Backend configuration (generated at build time) -iris/_backend_selected.py +# Backend configuration (generated \ No newline at end of file diff --git a/iris/hip.py b/iris/hip.py index 6f7a9782..a731292c 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -6,11 +6,7 @@ This module provides a HIP-compatible interface that transparently redirects to either the HIP backend (AMD GPUs) or CUDA backend (NVIDIA GPUs) based on -build-time configuration or auto-detection. - -The backend is selected based on (in priority order): -1. Build-time configuration (set via pip install --config-settings backend=nvidia) -2. Auto-detection based on available libraries +auto-detection. """ import ctypes @@ -18,20 +14,8 @@ # Detect backend def _detect_backend(): - """Detect which backend to use based on build-time config and available libraries.""" - # 1. Check for build-time configuration file first - try: - from . import _backend_selected - - backend_config = _backend_selected.BACKEND.lower() - if backend_config in ("cuda", "nvidia"): - return "cuda" - elif backend_config in ("hip", "amd", "rocm"): - return "hip" - except (ImportError, AttributeError): - pass - - # 2. Auto-detect by trying to load libraries + """Detect which backend to use based on available libraries.""" + # Auto-detect by trying to load libraries try: ctypes.cdll.LoadLibrary("libamdhip64.so") return "hip" diff --git a/setup.py b/setup.py index 49454359..69832461 100644 --- a/setup.py +++ b/setup.py @@ -1,38 +1,8 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
-import os from setuptools import setup - -def build(setup_kwargs, config_settings=None): - """ - Build hook to configure backend via --config-settings. - - This function is called during the build process to handle - backend configuration from pip install --config-settings. - """ - if config_settings is None: - config_settings = {} - - backend = config_settings.get("backend", "hip") - - # Normalize backend names - if backend.lower() in ("nvidia", "cuda"): - backend = "cuda" - elif backend.lower() in ("amd", "rocm", "hip"): - backend = "hip" - else: - backend = "hip" # Default to hip - - # Write backend selection to a Python file - with open("iris/_backend_selected.py", "w") as f: - f.write(f'BACKEND = "{backend}"\n') - print(f"Building Iris with backend={backend}") - - return setup_kwargs - - # This setup.py provides backward compatibility for legacy metadata fields # that don't map directly from pyproject.toml's modern PEP 621 format. setup( From da69d1bfa1773bbb4ec959aa806427d4488f7f5b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Oct 2025 05:39:19 +0000 Subject: [PATCH 17/22] Consolidate backends into single hip.py with conditional branching - Remove separate _hip.py and _cuda.py files - Add backend auto-detection at module load time (_is_amd_backend flag) - All functions now branch internally based on _is_amd_backend - Minimal changes - keeps original hip.py structure - Dynamic IPC handle size (64 for HIP, 128 for CUDA) - Zero configuration required - works out of the box Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- iris/_cuda.py | 120 ---------------------- iris/_hip.py | 170 ------------------------------- iris/hip.py | 274 ++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 244 insertions(+), 320 deletions(-) delete mode 100644 iris/_cuda.py delete mode 100644 iris/_hip.py diff --git a/iris/_cuda.py b/iris/_cuda.py deleted file mode 100644 index 958a5d74..00000000 --- a/iris/_cuda.py +++ /dev/null @@ -1,120 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
- -import ctypes -import numpy as np -import sys -import torch -import subprocess -import os - -rt_path = "libcudart.so" -cuda_runtime = ctypes.cdll.LoadLibrary(rt_path) - - -def hip_try(err): - if err != 0: - cuda_runtime.cudaGetErrorString.restype = ctypes.c_char_p - error_string = cuda_runtime.cudaGetErrorString(ctypes.c_int(err)).decode("utf-8") - raise RuntimeError(f"CUDA error code {err}: {error_string}") - - -class hipIpcMemHandle_t(ctypes.Structure): - _fields_ = [("internal", ctypes.c_byte * 128)] - - -def get_ipc_handle_size(): - """Return the size of IPC handle in bytes (128 for CUDA).""" - return 128 - - -def open_ipc_handle(ipc_handle_data, rank): - ptr = ctypes.c_void_p() - cudaIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) - cuda_runtime.cudaIpcOpenMemHandle.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - hipIpcMemHandle_t, - ctypes.c_uint, - ] - if isinstance(ipc_handle_data, np.ndarray): - if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != 128: - raise ValueError("ipc_handle_data must be a 128-element uint8 numpy array") - ipc_handle_bytes = ipc_handle_data.tobytes() - ipc_handle_data = (ctypes.c_char * 128).from_buffer_copy(ipc_handle_bytes) - else: - raise TypeError("ipc_handle_data must be a numpy.ndarray of dtype uint8 with 128 elements") - - raw_memory = ctypes.create_string_buffer(128) - ctypes.memset(raw_memory, 0x00, 128) - ipc_handle_struct = hipIpcMemHandle_t.from_buffer(raw_memory) - ipc_handle_data_bytes = bytes(ipc_handle_data) - ctypes.memmove(raw_memory, ipc_handle_data_bytes, 128) - - hip_try( - cuda_runtime.cudaIpcOpenMemHandle( - ctypes.byref(ptr), - ipc_handle_struct, - cudaIpcMemLazyEnablePeerAccess, - ) - ) - - return ptr.value - - -def get_ipc_handle(ptr, rank): - ipc_handle = hipIpcMemHandle_t() - hip_try(cuda_runtime.cudaIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) - return ipc_handle - - -def count_devices(): - device_count = ctypes.c_int() - hip_try(cuda_runtime.cudaGetDeviceCount(ctypes.byref(device_count))) - return device_count.value - - -def set_device(gpu_id): - hip_try(cuda_runtime.cudaSetDevice(gpu_id)) - - -def get_device_id(): - device_id = ctypes.c_int() - hip_try(cuda_runtime.cudaGetDevice(ctypes.byref(device_id))) - return device_id.value - - -def get_cu_count(device_id=None): - if device_id is None: - device_id = get_device_id() - - cudaDeviceAttributeMultiprocessorCount = 16 - cu_count = ctypes.c_int() - - hip_try( - cuda_runtime.cudaDeviceGetAttribute(ctypes.byref(cu_count), cudaDeviceAttributeMultiprocessorCount, device_id) - ) - - return cu_count.value - - -def get_wall_clock_rate(device_id): - cudaDevAttrMemoryClockRate = 36 - wall_clock_rate = ctypes.c_int() - status = cuda_runtime.cudaDeviceGetAttribute(ctypes.byref(wall_clock_rate), cudaDevAttrMemoryClockRate, device_id) - hip_try(status) - return wall_clock_rate.value - - -def get_arch_string(device_id=None): - if device_id is None: - device_id = get_device_id() - # For CUDA, get compute capability - device_props = torch.cuda.get_device_properties(device_id) - major = device_props.major - minor = device_props.minor - return f"sm_{major}{minor}" - - -def get_num_xcc(device_id=None): - # XCC is AMD-specific, return 1 for CUDA - return 1 diff --git a/iris/_hip.py b/iris/_hip.py deleted file mode 100644 index f6f4d8ff..00000000 --- a/iris/_hip.py +++ /dev/null @@ -1,170 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
- -import ctypes -import numpy as np -import sys -import torch -import subprocess -import os - -rt_path = "libamdhip64.so" -hip_runtime = ctypes.cdll.LoadLibrary(rt_path) - - -def hip_try(err): - if err != 0: - hip_runtime.hipGetErrorString.restype = ctypes.c_char_p - error_string = hip_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") - raise RuntimeError(f"HIP error code {err}: {error_string}") - - -class hipIpcMemHandle_t(ctypes.Structure): - _fields_ = [("reserved", ctypes.c_char * 64)] - - -def get_ipc_handle_size(): - """Return the size of IPC handle in bytes (64 for HIP).""" - return 64 - - -def open_ipc_handle(ipc_handle_data, rank): - ptr = ctypes.c_void_p() - hipIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) - hip_runtime.hipIpcOpenMemHandle.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - hipIpcMemHandle_t, - ctypes.c_uint, - ] - if isinstance(ipc_handle_data, np.ndarray): - if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != 64: - raise ValueError("ipc_handle_data must be a 64-element uint8 numpy array") - ipc_handle_bytes = ipc_handle_data.tobytes() - ipc_handle_data = (ctypes.c_char * 64).from_buffer_copy(ipc_handle_bytes) - else: - raise TypeError("ipc_handle_data must be a numpy.ndarray of dtype uint8 with 64 elements") - - raw_memory = ctypes.create_string_buffer(64) - ctypes.memset(raw_memory, 0x00, 64) - ipc_handle_struct = hipIpcMemHandle_t.from_buffer(raw_memory) - ipc_handle_data_bytes = bytes(ipc_handle_data) - ctypes.memmove(raw_memory, ipc_handle_data_bytes, 64) - - hip_try( - hip_runtime.hipIpcOpenMemHandle( - ctypes.byref(ptr), - ipc_handle_struct, - hipIpcMemLazyEnablePeerAccess, - ) - ) - - return ptr.value - - -def get_ipc_handle(ptr, rank): - ipc_handle = hipIpcMemHandle_t() - hip_try(hip_runtime.hipIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) - return ipc_handle - - -def count_devices(): - device_count = ctypes.c_int() - hip_try(hip_runtime.hipGetDeviceCount(ctypes.byref(device_count))) - return device_count.value - - -def set_device(gpu_id): - hip_try(hip_runtime.hipSetDevice(gpu_id)) - - -def get_device_id(): - device_id = ctypes.c_int() - hip_try(hip_runtime.hipGetDevice(ctypes.byref(device_id))) - return device_id.value - - -def get_cu_count(device_id=None): - if device_id is None: - device_id = get_device_id() - - hipDeviceAttributeMultiprocessorCount = 63 - cu_count = ctypes.c_int() - - hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(cu_count), hipDeviceAttributeMultiprocessorCount, device_id)) - - return cu_count.value - - -def get_rocm_version(): - major, minor = -1, -1 - - # Try hipconfig --path first - try: - result = subprocess.run(["hipconfig", "--path"], capture_output=True, text=True, check=True) - rocm_path = result.stdout.strip() - except (subprocess.CalledProcessError, FileNotFoundError): - # Then look for $ROCM_PATH environment variable - rocm_path = os.environ.get("ROCM_PATH") - if not rocm_path: - # Finally, try default location - rocm_path = "/opt/rocm" - - # Try to read version from .info/version file - try: - version_file_path = os.path.join(rocm_path, ".info", "version") - with open(version_file_path, "r") as version_file: - version = version_file.readline().strip() - major = int(version.split(".")[0]) - minor = int(version.split(".")[1]) - except (FileNotFoundError, IOError, ValueError, IndexError): - # If we can't read the version file, return -1, -1 - pass - - return (major, minor) - - -def get_wall_clock_rate(device_id): - hipDeviceAttributeWallClockRate = 10017 - wall_clock_rate = ctypes.c_int() - 
status = hip_runtime.hipDeviceGetAttribute( - ctypes.byref(wall_clock_rate), hipDeviceAttributeWallClockRate, device_id - ) - hip_try(status) - return wall_clock_rate.value - - -def get_arch_string(device_id=None): - if device_id is None: - device_id = get_device_id() - arch_full = torch.cuda.get_device_properties(device_id).gcnArchName - arch_name = arch_full.split(":")[0] - return arch_name - - -def get_num_xcc(device_id=None): - if device_id is None: - device_id = get_device_id() - rocm_major, _ = get_rocm_version() - if rocm_major < 7: - return 8 - hipDeviceAttributeNumberOfXccs = 10018 - xcc_count = ctypes.c_int() - hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(xcc_count), hipDeviceAttributeNumberOfXccs, device_id)) - return xcc_count.value - - -def malloc_fine_grained(size): - hipDeviceMallocFinegrained = 0x1 - ptr = ctypes.c_void_p() - hip_try(hip_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained)) - return ptr - - -def hip_malloc(size): - ptr = ctypes.c_void_p() - hip_try(hip_runtime.hipMalloc(ctypes.byref(ptr), size)) - return ptr - - -def hip_free(ptr): - hip_try(hip_runtime.hipFree(ptr)) diff --git a/iris/hip.py b/iris/hip.py index a731292c..a13acd07 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -1,47 +1,261 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. -""" -HIP-compatible API facade for Iris. +import ctypes +import numpy as np +import sys +import torch +import subprocess +import os -This module provides a HIP-compatible interface that transparently redirects -to either the HIP backend (AMD GPUs) or CUDA backend (NVIDIA GPUs) based on -auto-detection. -""" +# Auto-detect backend +_is_amd_backend = True +try: + rt_path = "libamdhip64.so" + hip_runtime = ctypes.cdll.LoadLibrary(rt_path) +except OSError: + try: + rt_path = "libcudart.so" + hip_runtime = ctypes.cdll.LoadLibrary(rt_path) + _is_amd_backend = False + except OSError: + # Default to HIP for backward compatibility + rt_path = "libamdhip64.so" + hip_runtime = ctypes.cdll.LoadLibrary(rt_path) -import ctypes + +def hip_try(err): + if err != 0: + if _is_amd_backend: + hip_runtime.hipGetErrorString.restype = ctypes.c_char_p + error_string = hip_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") + raise RuntimeError(f"HIP error code {err}: {error_string}") + else: + hip_runtime.cudaGetErrorString.restype = ctypes.c_char_p + error_string = hip_runtime.cudaGetErrorString(ctypes.c_int(err)).decode("utf-8") + raise RuntimeError(f"CUDA error code {err}: {error_string}") + + +class hipIpcMemHandle_t(ctypes.Structure): + if _is_amd_backend: + _fields_ = [("reserved", ctypes.c_char * 64)] + else: + _fields_ = [("reserved", ctypes.c_char * 128)] + + +def get_ipc_handle_size(): + """Return the IPC handle size for the current backend.""" + return 64 if _is_amd_backend else 128 + + +def open_ipc_handle(ipc_handle_data, rank): + ptr = ctypes.c_void_p() + handle_size = get_ipc_handle_size() + + if _is_amd_backend: + hipIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) + hip_runtime.hipIpcOpenMemHandle.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + hipIpcMemHandle_t, + ctypes.c_uint, + ] + else: + hip_runtime.cudaIpcOpenMemHandle.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + hipIpcMemHandle_t, + ctypes.c_uint, + ] + cudaIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) + + if isinstance(ipc_handle_data, np.ndarray): + if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != handle_size: + raise ValueError(f"ipc_handle_data must 
be a {handle_size}-element uint8 numpy array") + ipc_handle_bytes = ipc_handle_data.tobytes() + ipc_handle_data = (ctypes.c_char * handle_size).from_buffer_copy(ipc_handle_bytes) + else: + raise TypeError(f"ipc_handle_data must be a numpy.ndarray of dtype uint8 with {handle_size} elements") + + raw_memory = ctypes.create_string_buffer(handle_size) + ctypes.memset(raw_memory, 0x00, handle_size) + ipc_handle_struct = hipIpcMemHandle_t.from_buffer(raw_memory) + ipc_handle_data_bytes = bytes(ipc_handle_data) + ctypes.memmove(raw_memory, ipc_handle_data_bytes, handle_size) + + if _is_amd_backend: + hip_try( + hip_runtime.hipIpcOpenMemHandle( + ctypes.byref(ptr), + ipc_handle_struct, + hipIpcMemLazyEnablePeerAccess, + ) + ) + else: + hip_try( + hip_runtime.cudaIpcOpenMemHandle( + ctypes.byref(ptr), + ipc_handle_struct, + cudaIpcMemLazyEnablePeerAccess, + ) + ) + + return ptr.value + + +def get_ipc_handle(ptr, rank): + ipc_handle = hipIpcMemHandle_t() + if _is_amd_backend: + hip_try(hip_runtime.hipIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) + else: + hip_try(hip_runtime.cudaIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) + return ipc_handle + + +def count_devices(): + device_count = ctypes.c_int() + if _is_amd_backend: + hip_try(hip_runtime.hipGetDeviceCount(ctypes.byref(device_count))) + else: + hip_try(hip_runtime.cudaGetDeviceCount(ctypes.byref(device_count))) + return device_count.value -# Detect backend -def _detect_backend(): - """Detect which backend to use based on available libraries.""" - # Auto-detect by trying to load libraries +def set_device(gpu_id): + if _is_amd_backend: + hip_try(hip_runtime.hipSetDevice(gpu_id)) + else: + hip_try(hip_runtime.cudaSetDevice(gpu_id)) + + +def get_device_id(): + device_id = ctypes.c_int() + if _is_amd_backend: + hip_try(hip_runtime.hipGetDevice(ctypes.byref(device_id))) + else: + hip_try(hip_runtime.cudaGetDevice(ctypes.byref(device_id))) + return device_id.value + + +def get_cu_count(device_id=None): + if device_id is None: + device_id = get_device_id() + + cu_count = ctypes.c_int() + + if _is_amd_backend: + hipDeviceAttributeMultiprocessorCount = 63 + hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(cu_count), hipDeviceAttributeMultiprocessorCount, device_id)) + else: + cudaDevAttrMultiProcessorCount = 16 + hip_try(hip_runtime.cudaDeviceGetAttribute(ctypes.byref(cu_count), cudaDevAttrMultiProcessorCount, device_id)) + + return cu_count.value + + +def get_rocm_version(): + if not _is_amd_backend: + # Not applicable for CUDA + return (-1, -1) + + major, minor = -1, -1 + + # Try hipconfig --path first try: - ctypes.cdll.LoadLibrary("libamdhip64.so") - return "hip" - except (OSError, FileNotFoundError): - pass + result = subprocess.run(["hipconfig", "--path"], capture_output=True, text=True, check=True) + rocm_path = result.stdout.strip() + except (subprocess.CalledProcessError, FileNotFoundError): + # Then look for $ROCM_PATH environment variable + rocm_path = os.environ.get("ROCM_PATH") + if not rocm_path: + # Finally, try default location + rocm_path = "/opt/rocm" + # Try to read version from .info/version file try: - ctypes.cdll.LoadLibrary("libcudart.so") - return "cuda" - except (OSError, FileNotFoundError): + version_file_path = os.path.join(rocm_path, ".info", "version") + with open(version_file_path, "r") as version_file: + version = version_file.readline().strip() + major = int(version.split(".")[0]) + minor = int(version.split(".")[1]) + except (FileNotFoundError, IOError, ValueError, IndexError): + # If we can't read the version 
file, return -1, -1 pass - # Default to hip for backward compatibility - return "hip" + return (major, minor) + + +def get_wall_clock_rate(device_id): + wall_clock_rate = ctypes.c_int() + + if _is_amd_backend: + hipDeviceAttributeWallClockRate = 10017 + status = hip_runtime.hipDeviceGetAttribute( + ctypes.byref(wall_clock_rate), hipDeviceAttributeWallClockRate, device_id + ) + else: + cudaDevAttrClockRate = 13 + status = hip_runtime.cudaDeviceGetAttribute( + ctypes.byref(wall_clock_rate), cudaDevAttrClockRate, device_id + ) + + hip_try(status) + return wall_clock_rate.value + + +def get_arch_string(device_id=None): + if device_id is None: + device_id = get_device_id() + + if _is_amd_backend: + arch_full = torch.cuda.get_device_properties(device_id).gcnArchName + arch_name = arch_full.split(":")[0] + return arch_name + else: + # For CUDA, return compute capability + props = torch.cuda.get_device_properties(device_id) + return f"sm_{props.major}{props.minor}" + + +def get_num_xcc(device_id=None): + if device_id is None: + device_id = get_device_id() + + if not _is_amd_backend: + # XCC is AMD-specific, return 1 for CUDA + return 1 + + rocm_major, _ = get_rocm_version() + if rocm_major < 7: + return 8 + hipDeviceAttributeNumberOfXccs = 10018 + xcc_count = ctypes.c_int() + hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(xcc_count), hipDeviceAttributeNumberOfXccs, device_id)) + return xcc_count.value + +def malloc_fine_grained(size): + ptr = ctypes.c_void_p() + + if _is_amd_backend: + hipDeviceMallocFinegrained = 0x1 + hip_try(hip_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained)) + else: + # CUDA doesn't have direct equivalent, use regular malloc + hip_try(hip_runtime.cudaMalloc(ctypes.byref(ptr), size)) + + return ptr -_backend = _detect_backend() -# Import all public symbols from the appropriate backend module -if _backend == "cuda": - from iris._cuda import * # noqa: F403, F401 -else: - from iris._hip import * # noqa: F403, F401 +def hip_malloc(size): + ptr = ctypes.c_void_p() + if _is_amd_backend: + hip_try(hip_runtime.hipMalloc(ctypes.byref(ptr), size)) + else: + hip_try(hip_runtime.cudaMalloc(ctypes.byref(ptr), size)) + return ptr -# Make backend information available -def get_backend(): - """Get the currently active backend name ('hip' or 'cuda').""" - return _backend +def hip_free(ptr): + if _is_amd_backend: + hip_try(hip_runtime.hipFree(ptr)) + else: + hip_try(hip_runtime.cudaFree(ptr)) From 5f53ebe08e6ade6ea6ffe395b3320e5cb81ad0f1 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 8 Oct 2025 12:55:19 -0700 Subject: [PATCH 18/22] Add `setuptools` requirements --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2131d3f1..27eed85e 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ if __name__ == "__main__": ### Quick Installation > [!NOTE] -> **Requirements**: Python 3.10+, PyTorch 2.0+ (ROCm version), ROCm 6.3.1+ HIP runtime, and Triton +> **Requirements**: Python 3.10+, PyTorch 2.0+ (ROCm version), ROCm 6.3.1+ HIP runtime, Triton, and setuptools>=61 For a quick installation directly from the repository: From f06972ad313619265b36bcf9142687758ffb6685 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 8 Oct 2025 12:55:43 -0700 Subject: [PATCH 19/22] Name generic functions` gpu*` --- iris/hip.py | 87 +++++++++++++++++++++++++---------------------------- 1 file changed, 41 insertions(+), 46 deletions(-) diff --git a/iris/hip.py b/iris/hip.py index a13acd07..423ef88d 100644 --- 
a/iris/hip.py +++ b/iris/hip.py @@ -12,41 +12,36 @@ _is_amd_backend = True try: rt_path = "libamdhip64.so" - hip_runtime = ctypes.cdll.LoadLibrary(rt_path) + gpu_runtime = ctypes.cdll.LoadLibrary(rt_path) except OSError: try: rt_path = "libcudart.so" - hip_runtime = ctypes.cdll.LoadLibrary(rt_path) + gpu_runtime = ctypes.cdll.LoadLibrary(rt_path) _is_amd_backend = False except OSError: - # Default to HIP for backward compatibility rt_path = "libamdhip64.so" - hip_runtime = ctypes.cdll.LoadLibrary(rt_path) + gpu_runtime = ctypes.cdll.LoadLibrary(rt_path) -def hip_try(err): +def gpu_try(err): if err != 0: if _is_amd_backend: - hip_runtime.hipGetErrorString.restype = ctypes.c_char_p - error_string = hip_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") + gpu_runtime.hipGetErrorString.restype = ctypes.c_char_p + error_string = gpu_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") raise RuntimeError(f"HIP error code {err}: {error_string}") else: - hip_runtime.cudaGetErrorString.restype = ctypes.c_char_p - error_string = hip_runtime.cudaGetErrorString(ctypes.c_int(err)).decode("utf-8") + gpu_runtime.cudaGetErrorString.restype = ctypes.c_char_p + error_string = gpu_runtime.cudaGetErrorString(ctypes.c_int(err)).decode("utf-8") raise RuntimeError(f"CUDA error code {err}: {error_string}") -class hipIpcMemHandle_t(ctypes.Structure): - if _is_amd_backend: - _fields_ = [("reserved", ctypes.c_char * 64)] - else: - _fields_ = [("reserved", ctypes.c_char * 128)] - - def get_ipc_handle_size(): """Return the IPC handle size for the current backend.""" return 64 if _is_amd_backend else 128 +class gpuIpcMemHandle_t(ctypes.Structure): + _fields_ = [("reserved", ctypes.c_char * get_ipc_handle_size())] + def open_ipc_handle(ipc_handle_data, rank): ptr = ctypes.c_void_p() @@ -54,15 +49,15 @@ def open_ipc_handle(ipc_handle_data, rank): if _is_amd_backend: hipIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) - hip_runtime.hipIpcOpenMemHandle.argtypes = [ + gpu_runtime.hipIpcOpenMemHandle.argtypes = [ ctypes.POINTER(ctypes.c_void_p), - hipIpcMemHandle_t, + gpuIpcMemHandle_t, ctypes.c_uint, ] else: - hip_runtime.cudaIpcOpenMemHandle.argtypes = [ + gpu_runtime.cudaIpcOpenMemHandle.argtypes = [ ctypes.POINTER(ctypes.c_void_p), - hipIpcMemHandle_t, + gpuIpcMemHandle_t, ctypes.c_uint, ] cudaIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) @@ -77,21 +72,21 @@ def open_ipc_handle(ipc_handle_data, rank): raw_memory = ctypes.create_string_buffer(handle_size) ctypes.memset(raw_memory, 0x00, handle_size) - ipc_handle_struct = hipIpcMemHandle_t.from_buffer(raw_memory) + ipc_handle_struct = gpuIpcMemHandle_t.from_buffer(raw_memory) ipc_handle_data_bytes = bytes(ipc_handle_data) ctypes.memmove(raw_memory, ipc_handle_data_bytes, handle_size) if _is_amd_backend: - hip_try( - hip_runtime.hipIpcOpenMemHandle( + gpu_try( + gpu_runtime.hipIpcOpenMemHandle( ctypes.byref(ptr), ipc_handle_struct, hipIpcMemLazyEnablePeerAccess, ) ) else: - hip_try( - hip_runtime.cudaIpcOpenMemHandle( + gpu_try( + gpu_runtime.cudaIpcOpenMemHandle( ctypes.byref(ptr), ipc_handle_struct, cudaIpcMemLazyEnablePeerAccess, @@ -102,36 +97,36 @@ def open_ipc_handle(ipc_handle_data, rank): def get_ipc_handle(ptr, rank): - ipc_handle = hipIpcMemHandle_t() + ipc_handle = gpuIpcMemHandle_t() if _is_amd_backend: - hip_try(hip_runtime.hipIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) + gpu_try(gpu_runtime.hipIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) else: - hip_try(hip_runtime.cudaIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) + 
gpu_try(gpu_runtime.cudaIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) return ipc_handle def count_devices(): device_count = ctypes.c_int() if _is_amd_backend: - hip_try(hip_runtime.hipGetDeviceCount(ctypes.byref(device_count))) + gpu_try(gpu_runtime.hipGetDeviceCount(ctypes.byref(device_count))) else: - hip_try(hip_runtime.cudaGetDeviceCount(ctypes.byref(device_count))) + gpu_try(gpu_runtime.cudaGetDeviceCount(ctypes.byref(device_count))) return device_count.value def set_device(gpu_id): if _is_amd_backend: - hip_try(hip_runtime.hipSetDevice(gpu_id)) + gpu_try(gpu_runtime.hipSetDevice(gpu_id)) else: - hip_try(hip_runtime.cudaSetDevice(gpu_id)) + gpu_try(gpu_runtime.cudaSetDevice(gpu_id)) def get_device_id(): device_id = ctypes.c_int() if _is_amd_backend: - hip_try(hip_runtime.hipGetDevice(ctypes.byref(device_id))) + gpu_try(gpu_runtime.hipGetDevice(ctypes.byref(device_id))) else: - hip_try(hip_runtime.cudaGetDevice(ctypes.byref(device_id))) + gpu_try(gpu_runtime.cudaGetDevice(ctypes.byref(device_id))) return device_id.value @@ -143,10 +138,10 @@ def get_cu_count(device_id=None): if _is_amd_backend: hipDeviceAttributeMultiprocessorCount = 63 - hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(cu_count), hipDeviceAttributeMultiprocessorCount, device_id)) + gpu_try(gpu_runtime.hipDeviceGetAttribute(ctypes.byref(cu_count), hipDeviceAttributeMultiprocessorCount, device_id)) else: cudaDevAttrMultiProcessorCount = 16 - hip_try(hip_runtime.cudaDeviceGetAttribute(ctypes.byref(cu_count), cudaDevAttrMultiProcessorCount, device_id)) + gpu_try(gpu_runtime.cudaDeviceGetAttribute(ctypes.byref(cu_count), cudaDevAttrMultiProcessorCount, device_id)) return cu_count.value @@ -188,16 +183,16 @@ def get_wall_clock_rate(device_id): if _is_amd_backend: hipDeviceAttributeWallClockRate = 10017 - status = hip_runtime.hipDeviceGetAttribute( + status = gpu_runtime.hipDeviceGetAttribute( ctypes.byref(wall_clock_rate), hipDeviceAttributeWallClockRate, device_id ) else: cudaDevAttrClockRate = 13 - status = hip_runtime.cudaDeviceGetAttribute( + status = gpu_runtime.cudaDeviceGetAttribute( ctypes.byref(wall_clock_rate), cudaDevAttrClockRate, device_id ) - hip_try(status) + gpu_try(status) return wall_clock_rate.value @@ -228,7 +223,7 @@ def get_num_xcc(device_id=None): return 8 hipDeviceAttributeNumberOfXccs = 10018 xcc_count = ctypes.c_int() - hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(xcc_count), hipDeviceAttributeNumberOfXccs, device_id)) + gpu_try(gpu_runtime.hipDeviceGetAttribute(ctypes.byref(xcc_count), hipDeviceAttributeNumberOfXccs, device_id)) return xcc_count.value @@ -237,10 +232,10 @@ def malloc_fine_grained(size): if _is_amd_backend: hipDeviceMallocFinegrained = 0x1 - hip_try(hip_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained)) + gpu_try(gpu_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained)) else: # CUDA doesn't have direct equivalent, use regular malloc - hip_try(hip_runtime.cudaMalloc(ctypes.byref(ptr), size)) + gpu_try(gpu_runtime.cudaMalloc(ctypes.byref(ptr), size)) return ptr @@ -248,14 +243,14 @@ def malloc_fine_grained(size): def hip_malloc(size): ptr = ctypes.c_void_p() if _is_amd_backend: - hip_try(hip_runtime.hipMalloc(ctypes.byref(ptr), size)) + gpu_try(gpu_runtime.hipMalloc(ctypes.byref(ptr), size)) else: - hip_try(hip_runtime.cudaMalloc(ctypes.byref(ptr), size)) + gpu_try(gpu_runtime.cudaMalloc(ctypes.byref(ptr), size)) return ptr def hip_free(ptr): if _is_amd_backend: - hip_try(hip_runtime.hipFree(ptr)) + 
From caf6ed0b6977957d481d97add595b4235b9baefb Mon Sep 17 00:00:00 2001
From: Muhammad Awad
Date: Wed, 8 Oct 2025 12:56:12 -0700
Subject: [PATCH 20/22] Add necessary conversion for RCCL

---
 iris/iris.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/iris/iris.py b/iris/iris.py
index 5a9b5142..22627058 100644
--- a/iris/iris.py
+++ b/iris/iris.py
@@ -96,8 +96,10 @@ def __init__(self, heap_size=1 << 30):

         distributed_barrier()

-        all_ipc_handles = distributed_allgather(np.frombuffer(ipc_handle, dtype=np.uint8))
-        all_heap_bases = distributed_allgather(np.array([heap_bases[cur_rank]], dtype=np.uint64))
+        all_ipc_handles = distributed_allgather(np.frombuffer(ipc_handle, dtype=np.uint8).copy())
+        heap_base_bytes = np.array([heap_bases[cur_rank]], dtype=np.uint64).tobytes()
+        all_heap_bases_bytes = distributed_allgather(np.frombuffer(heap_base_bytes, dtype=np.uint8).copy())
+        all_heap_bases = np.frombuffer(all_heap_bases_bytes.tobytes(), dtype=np.uint64).reshape(num_ranks, -1)

         distributed_barrier()
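The .copy() calls in this patch matter because np.frombuffer over a bytes object returns a read-only view; handing that view to the RCCL-backed allgather requires an owning, writable array, and the uint64 heap bases are round-tripped through uint8 bytes so every gathered payload shares one dtype. A small sketch of the round trip under those assumptions (np.stack stands in for distributed_allgather, which is not defined here):

    import numpy as np

    heap_base = np.array([0x7F0000000000], dtype=np.uint64)

    view = np.frombuffer(heap_base.tobytes(), dtype=np.uint8)
    assert not view.flags.writeable          # frombuffer over bytes is read-only

    payload = view.copy()                    # owning, writable buffer, safe to gather
    gathered = np.stack([payload, payload])  # stand-in for a 2-rank allgather

    recovered = np.frombuffer(gathered.tobytes(), dtype=np.uint64).reshape(2, -1)
    assert recovered[0, 0] == heap_base[0]   # heap base survives the byte round trip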
From 73b0835c16163e7f5aef4a50689c20f4dc13a255 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Wed, 8 Oct 2025 19:57:16 +0000
Subject: [PATCH 21/22] Apply Ruff auto-fixes

---
 iris/hip.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/iris/hip.py b/iris/hip.py
index 423ef88d..89807860 100644
--- a/iris/hip.py
+++ b/iris/hip.py
@@ -39,6 +39,7 @@ def get_ipc_handle_size():
     """Return the IPC handle size for the current backend."""
     return 64 if _is_amd_backend else 128

+
 class gpuIpcMemHandle_t(ctypes.Structure):
     _fields_ = [("reserved", ctypes.c_char * get_ipc_handle_size())]

@@ -46,7 +47,7 @@ class gpuIpcMemHandle_t(ctypes.Structure):
 def open_ipc_handle(ipc_handle_data, rank):
     ptr = ctypes.c_void_p()
     handle_size = get_ipc_handle_size()
-
+
     if _is_amd_backend:
         hipIpcMemLazyEnablePeerAccess = ctypes.c_uint(1)
         gpu_runtime.hipIpcOpenMemHandle.argtypes = [
@@ -61,7 +62,7 @@ def open_ipc_handle(ipc_handle_data, rank):
             ctypes.c_uint,
         ]
         cudaIpcMemLazyEnablePeerAccess = ctypes.c_uint(1)
-
+
     if isinstance(ipc_handle_data, np.ndarray):
         if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != handle_size:
             raise ValueError(f"ipc_handle_data must be a {handle_size}-element uint8 numpy array")
@@ -135,10 +136,12 @@ def get_cu_count(device_id=None):
         device_id = get_device_id()

     cu_count = ctypes.c_int()
-
+
     if _is_amd_backend:
         hipDeviceAttributeMultiprocessorCount = 63
-        gpu_try(gpu_runtime.hipDeviceGetAttribute(ctypes.byref(cu_count), hipDeviceAttributeMultiprocessorCount, device_id))
+        gpu_try(
+            gpu_runtime.hipDeviceGetAttribute(ctypes.byref(cu_count), hipDeviceAttributeMultiprocessorCount, device_id)
+        )
     else:
         cudaDevAttrMultiProcessorCount = 16
         gpu_try(gpu_runtime.cudaDeviceGetAttribute(ctypes.byref(cu_count), cudaDevAttrMultiProcessorCount, device_id))
@@ -150,7 +153,7 @@ def get_rocm_version():
     if not _is_amd_backend:
         # Not applicable for CUDA
         return (-1, -1)
-
+
     major, minor = -1, -1

     # Try hipconfig --path first
@@ -180,7 +183,7 @@ def get_rocm_version():

 def get_wall_clock_rate(device_id):
     wall_clock_rate = ctypes.c_int()
-
+
     if _is_amd_backend:
         hipDeviceAttributeWallClockRate = 10017
         status = gpu_runtime.hipDeviceGetAttribute(
@@ -188,10 +191,8 @@ def get_wall_clock_rate(device_id):
         )
     else:
         cudaDevAttrClockRate = 13
-        status = gpu_runtime.cudaDeviceGetAttribute(
-            ctypes.byref(wall_clock_rate), cudaDevAttrClockRate, device_id
-        )
-
+        status = gpu_runtime.cudaDeviceGetAttribute(ctypes.byref(wall_clock_rate), cudaDevAttrClockRate, device_id)
+
     gpu_try(status)
     return wall_clock_rate.value

@@ -199,7 +200,7 @@ def get_wall_clock_rate(device_id):
 def get_arch_string(device_id=None):
     if device_id is None:
         device_id = get_device_id()
-
+
     if _is_amd_backend:
         arch_full = torch.cuda.get_device_properties(device_id).gcnArchName
         arch_name = arch_full.split(":")[0]
@@ -213,11 +214,11 @@ def get_arch_string(device_id=None):
 def get_num_xcc(device_id=None):
     if device_id is None:
         device_id = get_device_id()
-
+
     if not _is_amd_backend:
         # XCC is AMD-specific, return 1 for CUDA
         return 1
-
+
     rocm_major, _ = get_rocm_version()
     if rocm_major < 7:
         return 8
@@ -229,14 +230,14 @@ def get_num_xcc(device_id=None):

 def malloc_fine_grained(size):
     ptr = ctypes.c_void_p()
-
+
     if _is_amd_backend:
         hipDeviceMallocFinegrained = 0x1
         gpu_try(gpu_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained))
     else:
         # CUDA doesn't have direct equivalent, use regular malloc
         gpu_try(gpu_runtime.cudaMalloc(ctypes.byref(ptr), size))
-
+
     return ptr

From abaa8539999b5d386acabda1e1ab3229b027db24 Mon Sep 17 00:00:00 2001
From: Muhammad Awad
Date: Wed, 8 Oct 2025 13:27:46 -0700
Subject: [PATCH 22/22] Remove the git ignore comment

---
 .gitignore | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 78645c5a..cbaac1bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,4 @@ slurm*.out
 *.egg-info
 examples/gemm/results/*
-asm/
-
-# Backend configuration (generated
\ No newline at end of file
+asm/
\ No newline at end of file
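With the series applied, iris/hip.py presents one backend-agnostic surface regardless of which runtime was found. A rough usage sketch of those helpers follows; the from iris import hip path and the handle sizes in the comment are assumptions taken from this series, not verified against the installed package:

    from iris import hip

    hip.set_device(0)
    print("devices:", hip.count_devices())
    print("IPC handle bytes:", hip.get_ipc_handle_size())  # 64 on HIP, 128 on CUDA in this series

    buf = hip.hip_malloc(1 << 20)        # hipMalloc or cudaMalloc underneath
    handle = hip.get_ipc_handle(buf, 0)  # handle another rank could pass to open_ipc_handle
    hip.hip_free(buf)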