From b44d8fd4c4be4e46039917c1142d1197af1adb87 Mon Sep 17 00:00:00 2001
From: ncclementi <natyclementi@gmail.com>
Date: Mon, 30 Mar 2026 20:26:14 +0000
Subject: [PATCH] update nvlink checks

---
 rapids_cli/doctor/checks/nvlink.py | 39 +++++++++++++----
 rapids_cli/tests/test_nvlink.py    | 69 +++++++++++++++++++++++++++---
 2 files changed, 93 insertions(+), 15 deletions(-)

diff --git a/rapids_cli/doctor/checks/nvlink.py b/rapids_cli/doctor/checks/nvlink.py
index 22bbdd1..22dca1f 100644
--- a/rapids_cli/doctor/checks/nvlink.py
+++ b/rapids_cli/doctor/checks/nvlink.py
@@ -5,22 +5,61 @@
 import pynvml
 
 
-def check_nvlink_status(verbose=True):
-    """Check the system for NVLink with 2 or more GPUs."""
+def check_nvlink_status(verbose=True, **kwargs):
+    """Check NVLink status across all GPUs.
+
+    Extra keyword arguments are accepted and ignored.
+
+    Returns ``False`` when the check does not apply (fewer than 2 GPUs, or no
+    link reports NVLink support), a summary string when every supported link
+    is active and ``verbose`` is true, and ``None`` otherwise.
+
+    Raises ``ValueError`` if NVML cannot be initialised or if any supported
+    link is inactive.
+    """
     try:
         pynvml.nvmlInit()
     except pynvml.NVMLError as e:
         raise ValueError("GPU not found. Please ensure GPUs are installed.") from e
 
     device_count = pynvml.nvmlDeviceGetCount()
+
+    # NVLink requires at least 2 GPUs to be meaningful. A single GPU has nothing
+    # to link to, so there is nothing to check.
     if device_count < 2:
         return False
 
-    for i in range(device_count):
-        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-        for nvlink_id in range(pynvml.NVML_NVLINK_MAX_LINKS):
+    # Note: this check assumes a homogeneous GPU environment (all GPUs of the same
+    # model). Mixed configurations — e.g. some NVLink-capable GPUs alongside some
+    # that are not — are not handled and may produce misleading results.
+
+    failed_links: list[tuple[int, int]] = []
+    links_checked = 0
+
+    for gpu_idx in range(device_count):
+        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)
+        for link_id in range(pynvml.NVML_NVLINK_MAX_LINKS):
             try:
-                pynvml.nvmlDeviceGetNvLinkState(handle, 0)
-                return True
-            except pynvml.NVMLError as e:
-                raise ValueError(f"NVLink {nvlink_id} Status Check Failed") from e
+                # nvmlDeviceGetNvLinkState(device, link) returns NVML_FEATURE_ENABLED
+                # if the link is active, or NVML_FEATURE_DISABLED if it is not.
+                state = pynvml.nvmlDeviceGetNvLinkState(handle, link_id)
+            except pynvml.NVMLError_NotSupported:
+                # This link is not supported: skip it, but keep going so that
+                # inactive links already recorded are still reported below.
+                continue
+            links_checked += 1
+            if state == pynvml.NVML_FEATURE_DISABLED:
+                failed_links.append((gpu_idx, link_id))
+
+    if failed_links:
+        details = ", ".join(f"GPU {gpu} link {link}" for gpu, link in failed_links)
+        raise ValueError(f"NVLink inactive on: {details}")
+
+    if not links_checked:
+        # No link reported NVLink support, so there is nothing to check —
+        # skip like the single-GPU case above.
+        return False
+
+    if verbose:
+        return f"All NVLinks active across {device_count} GPUs"
+    return None
diff --git a/rapids_cli/tests/test_nvlink.py b/rapids_cli/tests/test_nvlink.py
index e2d82c7..5af39d0 100644
--- a/rapids_cli/tests/test_nvlink.py
+++ b/rapids_cli/tests/test_nvlink.py
@@ -7,19 +7,32 @@
 from rapids_cli.doctor.checks.nvlink import check_nvlink_status
 
 
-def test_check_nvlink_status_success():
+@pytest.mark.parametrize(
+    "verbose, expected",
+    [
+        (True, "All NVLinks active across 2 GPUs"),
+        (False, None),
+    ],
+)
+def test_check_nvlink_status_success(verbose, expected):
+    """Two GPUs with every link enabled: summary string if verbose, else None."""
+    import pynvml
+
     mock_handle = MagicMock()
     with (
         patch("pynvml.nvmlInit"),
         patch("pynvml.nvmlDeviceGetCount", return_value=2),
         patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
-        patch("pynvml.nvmlDeviceGetNvLinkState", return_value=1),
+        patch(
+            "pynvml.nvmlDeviceGetNvLinkState", return_value=pynvml.NVML_FEATURE_ENABLED
+        ),
     ):
-        result = check_nvlink_status(verbose=True)
-        assert result is True
+        result = check_nvlink_status(verbose=verbose)
+        assert result == expected
 
 
 def test_check_nvlink_status_single_gpu():
+    """Single GPU — NVLink is not applicable, check skips early."""
     with (
         patch("pynvml.nvmlInit"),
         patch("pynvml.nvmlDeviceGetCount", return_value=1),
@@ -29,6 +42,7 @@ def test_check_nvlink_status_single_gpu():
 
 
 def test_check_nvlink_status_no_gpu():
+    """nvmlInit fails — no GPUs installed."""
     import pynvml
 
     with patch("pynvml.nvmlInit", side_effect=pynvml.NVMLError(1)):
@@ -38,7 +52,8 @@ def test_check_nvlink_status_no_gpu():
             check_nvlink_status(verbose=False)
 
 
-def test_check_nvlink_status_nvml_error():
+def test_check_nvlink_status_not_supported():
+    """NVLink is not supported on this system — check skips silently like single-GPU case."""
     import pynvml
 
     mock_handle = MagicMock()
@@ -50,5 +65,47 @@ def test_check_nvlink_status_nvml_error():
             "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported
         ),
     ):
-        with pytest.raises(ValueError, match="NVLink 0 Status Check Failed"):
+        result = check_nvlink_status(verbose=False)
+        assert result is False
+
+
+def test_check_nvlink_status_link_inactive():
+    """Every link on both GPUs reports disabled — the check raises ValueError."""
+    import pynvml
+
+    mock_handle = MagicMock()
+    with (
+        patch("pynvml.nvmlInit"),
+        patch("pynvml.nvmlDeviceGetCount", return_value=2),
+        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
+        patch(
+            "pynvml.nvmlDeviceGetNvLinkState", return_value=pynvml.NVML_FEATURE_DISABLED
+        ),
+    ):
+        with pytest.raises(ValueError, match="NVLink inactive on:"):
+            check_nvlink_status(verbose=False)
+
+
+def test_check_nvlink_status_partial_failure():
+    """Some links active, some inactive — all failures are reported in a single error."""
+    import pynvml
+
+    mock_handle = MagicMock()
+
+    # Simulate: link 0 active, link 1 inactive, rest active
+    def mock_link_state(handle, link_id):
+        if link_id == 1:
+            return pynvml.NVML_FEATURE_DISABLED
+        return pynvml.NVML_FEATURE_ENABLED
+
+    with (
+        patch("pynvml.nvmlInit"),
+        patch("pynvml.nvmlDeviceGetCount", return_value=2),
+        patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
+        patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=mock_link_state),
+    ):
+        with pytest.raises(ValueError, match="NVLink inactive on:") as exc_info:
             check_nvlink_status(verbose=False)
+        # Pin the whole message: link 1 of each GPU is listed, and no active
+        # link is reported alongside them.
+        assert str(exc_info.value) == "NVLink inactive on: GPU 0 link 1, GPU 1 link 1"