From 752cac49c38fe5a939de5ff52daf969185fdbe12 Mon Sep 17 00:00:00 2001 From: myersCody Date: Tue, 17 Mar 2026 15:11:56 -0400 Subject: [PATCH 1/2] Fix gpu_uuid to be physical gpu --- nise/__init__.py | 2 +- nise/generators/ocp/ocp_generator.py | 33 ++++++++----- tests/test_ocp_generator.py | 69 +++++++++++++++++++++++----- 3 files changed, 80 insertions(+), 24 deletions(-) diff --git a/nise/__init__.py b/nise/__init__.py index 2d7d288d..d4e7f5c0 100644 --- a/nise/__init__.py +++ b/nise/__init__.py @@ -1,5 +1,5 @@ from .helpers import gcp_calculate_persistent_disk_usage_amount from .helpers import gcp_calculate_usage_amount_in_pricing -__version__ = "5.3.9" +__version__ = "5.4.0" VERSION = __version__.split(".") diff --git a/nise/generators/ocp/ocp_generator.py b/nise/generators/ocp/ocp_generator.py index 34b1869a..771c5183 100644 --- a/nise/generators/ocp/ocp_generator.py +++ b/nise/generators/ocp/ocp_generator.py @@ -1666,23 +1666,32 @@ def _gen_gpus(self): # noqa: C901 if not all([mig_profile, mig_strategy]): raise ValueError(f"MIG instance for pod '{pod_name}' requires mig_profile and mig_strategy") mig_name = f"nise.ocp.mig.{node_name}.{pod_name}.{gpu_idx}.{mig_idx}" - mig_uuid = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}" - raw_mig_instance_id = mig_spec.get("mig_instance_id", mig_idx + 1) - try: - # MIG instance IDs are numeric; default to 1-based per-GPU-slice index when not provided. - mig_instance_id = int(raw_mig_instance_id) - except (ValueError, TypeError) as exc: - raise ValueError( - f"pod {pod_name}: mig_instance_id must be an integer value, got {raw_mig_instance_id}" - ) from exc - + # One gpu_uuid (GPU-...) per physical GPU; each MIG partition gets a unique mig_instance_id (MIG-...). + raw_mig_id = mig_spec.get("mig_instance_id") + if raw_mig_id is not None: + if isinstance(raw_mig_id, str): + mig_partition_id = raw_mig_id + elif isinstance(raw_mig_id, int): + mig_partition_id = f"MIG-{raw_mig_id}" + else: + try: + mig_partition_id = f"MIG-{int(raw_mig_id)}" + except (ValueError, TypeError) as exc: + raise ValueError( + f"pod {pod_name}: mig_instance_id must be a string or integer, " + f"got {type(raw_mig_id).__name__}" + ) from exc + else: + mig_partition_id = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}" + + # Physical GPU uuid is shared by all MIG slices on this YAML gpu entry. pod_gpus.append( { - "gpu_uuid": mig_uuid, + "gpu_uuid": parent_gpu_uuid, "gpu_model_name": gpu_model, "gpu_vendor_name": GPU_VENDOR, "gpu_memory_capacity_mib": gpu_memory, - "mig_instance_id": mig_instance_id, + "mig_instance_id": mig_partition_id, "mig_profile": mig_profile, "mig_strategy": mig_strategy, } diff --git a/tests/test_ocp_generator.py b/tests/test_ocp_generator.py index 9d794f4f..d9e4a21c 100644 --- a/tests/test_ocp_generator.py +++ b/tests/test_ocp_generator.py @@ -1283,9 +1283,9 @@ def test_gen_gpus_yaml_with_mig_instances(self): self.assertEqual(gpu["gpu_model_name"], "H100") self.assertEqual(gpu["mig_profile"], "3g.40gb") self.assertEqual(gpu["mig_strategy"], "mixed") - self.assertIn("MIG-", gpu["gpu_uuid"]) - self.assertIsInstance(gpu["mig_instance_id"], int) - self.assertEqual(gpu["mig_instance_id"], 1) + self.assertIn("GPU-", gpu["gpu_uuid"]) + self.assertIsInstance(gpu["mig_instance_id"], str) + self.assertTrue(gpu["mig_instance_id"].startswith("MIG-")) def test_gen_gpus_raises_when_mig_instance_missing_required_fields(self): """Test that ValueError is raised when a MIG instance lacks mig_profile or mig_strategy.""" @@ -1296,15 +1296,61 @@ def test_gen_gpus_raises_when_mig_instance_missing_required_fields(self): self.assertIn("mig_profile", str(ctx.exception)) self.assertIn("mig_strategy", str(ctx.exception)) - def test_gen_gpus_raises_when_mig_instance_id_is_not_integer(self): - """Test that ValueError is raised when mig_instance_id cannot be coerced to int.""" + def test_gen_gpus_raises_when_mig_instance_id_invalid_type(self): + """Test that ValueError is raised when mig_instance_id is not a string or integer.""" attrs = self._mig_gpu_attributes( - pod_name="invalid-mig-id-pod", mig_instance_overrides={"mig_instance_id": "not-an-int"} + pod_name="invalid-mig-id-pod", mig_instance_overrides={"mig_instance_id": {}} ) with self.assertRaises(ValueError) as ctx: OCPGenerator(self.two_hours_ago, self.now, attrs) self.assertIn("invalid-mig-id-pod", str(ctx.exception)) - self.assertIn("mig_instance_id must be an integer value", str(ctx.exception)) + self.assertIn("mig_instance_id must be a string or integer", str(ctx.exception)) + + def test_mig_instances_share_same_physical_gpu_uuid(self): + """All MIG slices on one YAML gpu share gpu_uuid; mig_instance_id is unique per slice.""" + gpu = { + "gpu_model": "A100", + "gpu_memory_capacity_mib": 40960, + "mig_instances": [ + {"mig_profile": "1g.5gb", "mig_strategy": "mixed"}, + {"mig_profile": "2g.10gb", "mig_strategy": "mixed"}, + {"mig_profile": "4g.40gb", "mig_strategy": "mixed"}, + ], + } + attrs = { + "nodes": [ + { + "node_name": "mig-node", + "cpu_cores": 16, + "memory_gig": 64, + "namespaces": { + "ns": { + "pods": [ + { + "pod_name": "triple-mig-pod", + "cpu_request": 4, + "mem_request_gig": 16, + "cpu_limit": 8, + "mem_limit_gig": 32, + "gpus": [gpu], + }, + ] + } + }, + } + ] + } + generator = OCPGenerator(self.two_hours_ago, self.now, attrs) + pod_gpus = generator.gpus["triple-mig-pod"] + self.assertEqual(len(pod_gpus), 3) + physical_uuids = {g["gpu_uuid"] for g in pod_gpus} + self.assertEqual(len(physical_uuids), 1) + self.assertTrue(next(iter(physical_uuids)).startswith("GPU-")) + mig_ids = [g["mig_instance_id"] for g in pod_gpus] + self.assertEqual(len(set(mig_ids)), 3) + for mid in mig_ids: + self.assertIsInstance(mid, str) + self.assertTrue(mid.startswith("MIG-")) def test_gen_hourly_gpu_usage_includes_mig_fields(self): """Test that GPU usage rows include MIG fields when pod has MIG instances.""" @@ -1316,8 +1362,9 @@ def test_gen_hourly_gpu_usage_includes_mig_fields(self): row = gpu_data[0] self.assertEqual(row["mig_profile"], "3g.40gb") self.assertEqual(row["mig_strategy"], "mixed") - self.assertIsInstance(row["mig_instance_id"], int) - self.assertEqual(row["mig_instance_id"], 1) + self.assertIsInstance(row["mig_instance_id"], str) + self.assertTrue(row["mig_instance_id"].startswith("MIG-")) + self.assertIn("GPU-", row["gpu_uuid"]) def test_gen_gpus_random_generation(self): """Test random GPU generation (10% of pods).""" @@ -1397,7 +1444,7 @@ def test_update_gpu_data(self): "gpu_vendor_name": GPU_VENDOR, "gpu_memory_capacity_mib": 15360, "gpu_pod_uptime": 3000.123456, - "mig_instance_id": 1, + "mig_instance_id": "MIG-partition-1", "mig_profile": "3g.40gb", "mig_strategy": "mixed", } @@ -1410,7 +1457,7 @@ def test_update_gpu_data(self): self.assertEqual(updated_row["gpu_vendor_name"], GPU_VENDOR) self.assertEqual(updated_row["gpu_memory_capacity_mib"], 15360) self.assertEqual(updated_row["gpu_pod_uptime"], 3000.123456) - self.assertEqual(updated_row["mig_instance_id"], 1) + self.assertEqual(updated_row["mig_instance_id"], "MIG-partition-1") self.assertEqual(updated_row["mig_profile"], "3g.40gb") self.assertEqual(updated_row["mig_strategy"], "mixed") From 5f2f70acb10aa9bd7b65683dbdb09dd0d1893fc0 Mon Sep 17 00:00:00 2001 From: myersCody Date: Tue, 17 Mar 2026 15:23:16 -0400 Subject: [PATCH 2/2] Improve patch & lint --- nise/generators/ocp/ocp_generator.py | 36 +++++++++--------- tests/test_ocp_generator.py | 56 ++++++++++++++++++++++++++-- 2 files changed, 72 insertions(+), 20 deletions(-) diff --git a/nise/generators/ocp/ocp_generator.py b/nise/generators/ocp/ocp_generator.py index 771c5183..45ef4712 100644 --- a/nise/generators/ocp/ocp_generator.py +++ b/nise/generators/ocp/ocp_generator.py @@ -1604,6 +1604,22 @@ def _gen_hourly_namespace_label_usage(self, **kwargs): ) yield row + @staticmethod + def _resolve_mig_partition_id(pod_name, mig_name, raw_mig_id): + """Stable MIG partition id: generated from mig_name, or from YAML string/int.""" + if raw_mig_id is None: + return f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}" + if isinstance(raw_mig_id, str): + return raw_mig_id + if isinstance(raw_mig_id, int): + return f"MIG-{raw_mig_id}" + try: + return f"MIG-{int(raw_mig_id)}" + except (ValueError, TypeError) as exc: + raise ValueError( + f"pod {pod_name}: mig_instance_id must be a string or integer, got {type(raw_mig_id).__name__}" + ) from exc + def _gen_gpus(self): # noqa: C901 """Create GPUs for pods and nodes that need them.""" gpus = {} @@ -1666,23 +1682,9 @@ def _gen_gpus(self): # noqa: C901 if not all([mig_profile, mig_strategy]): raise ValueError(f"MIG instance for pod '{pod_name}' requires mig_profile and mig_strategy") mig_name = f"nise.ocp.mig.{node_name}.{pod_name}.{gpu_idx}.{mig_idx}" - # One gpu_uuid (GPU-...) per physical GPU; each MIG partition gets a unique mig_instance_id (MIG-...). - raw_mig_id = mig_spec.get("mig_instance_id") - if raw_mig_id is not None: - if isinstance(raw_mig_id, str): - mig_partition_id = raw_mig_id - elif isinstance(raw_mig_id, int): - mig_partition_id = f"MIG-{raw_mig_id}" - else: - try: - mig_partition_id = f"MIG-{int(raw_mig_id)}" - except (ValueError, TypeError) as exc: - raise ValueError( - f"pod {pod_name}: mig_instance_id must be a string or integer, " - f"got {type(raw_mig_id).__name__}" - ) from exc - else: - mig_partition_id = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}" + mig_partition_id = self._resolve_mig_partition_id( + pod_name, mig_name, mig_spec.get("mig_instance_id") + ) # Physical GPU uuid is shared by all MIG slices on this YAML gpu entry. pod_gpus.append( diff --git a/tests/test_ocp_generator.py b/tests/test_ocp_generator.py index d9e4a21c..746974d3 100644 --- a/tests/test_ocp_generator.py +++ b/tests/test_ocp_generator.py @@ -21,6 +21,8 @@ from datetime import datetime from datetime import timedelta from unittest import TestCase +from uuid import NAMESPACE_DNS +from uuid import uuid5 from unittest.mock import Mock from unittest.mock import patch @@ -1298,9 +1300,7 @@ def test_gen_gpus_raises_when_mig_instance_missing_required_fields(self): def test_gen_gpus_raises_when_mig_instance_id_invalid_type(self): """Test that ValueError is raised when mig_instance_id is not a string or integer.""" - attrs = self._mig_gpu_attributes( - pod_name="invalid-mig-id-pod", mig_instance_overrides={"mig_instance_id": {}} - ) + attrs = self._mig_gpu_attributes(pod_name="invalid-mig-id-pod", mig_instance_overrides={"mig_instance_id": {}}) with self.assertRaises(ValueError) as ctx: OCPGenerator(self.two_hours_ago, self.now, attrs) self.assertIn("invalid-mig-id-pod", str(ctx.exception)) @@ -1566,3 +1566,53 @@ def test_gpu_pod_uptime_matches_pod_seconds(self): unique_uptimes = set(uptimes) self.assertEqual(len(unique_uptimes), 1, f"Expected all GPUs to have same uptime, got {unique_uptimes}") self.assertEqual(unique_uptimes.pop(), specific_pod_seconds) + + +class ResolveMigPartitionIdTest(TestCase): + """Tests for OCPGenerator._resolve_mig_partition_id.""" + + def test_none_uses_stable_uuid_from_mig_name(self): + mig_name = "nise.ocp.mig.node-a.pod-b.0.1" + expected = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}" + result = OCPGenerator._resolve_mig_partition_id("pod-b", mig_name, None) + self.assertEqual(result, expected) + self.assertTrue(result.startswith("MIG-")) + + def test_none_same_mig_name_same_id(self): + mig_name = "nise.ocp.mig.n.p.0.0" + a = OCPGenerator._resolve_mig_partition_id("p", mig_name, None) + b = OCPGenerator._resolve_mig_partition_id("p", mig_name, None) + self.assertEqual(a, b) + + def test_none_different_mig_name_different_id(self): + a = OCPGenerator._resolve_mig_partition_id("p", "nise.ocp.mig.n.p.0.0", None) + b = OCPGenerator._resolve_mig_partition_id("p", "nise.ocp.mig.n.p.0.1", None) + self.assertNotEqual(a, b) + + def test_string_returned_unchanged(self): + self.assertEqual( + OCPGenerator._resolve_mig_partition_id("pod", "mig", "MIG-abc-123"), + "MIG-abc-123", + ) + self.assertEqual( + OCPGenerator._resolve_mig_partition_id("pod", "mig", "custom-partition"), + "custom-partition", + ) + + def test_int_prefixed_with_mig(self): + self.assertEqual(OCPGenerator._resolve_mig_partition_id("p", "m", 7), "MIG-7") + self.assertEqual(OCPGenerator._resolve_mig_partition_id("p", "m", 0), "MIG-0") + + def test_coerces_numeric_non_int_via_int(self): + self.assertEqual(OCPGenerator._resolve_mig_partition_id("p", "m", 3.9), "MIG-3") + + def test_invalid_type_raises_value_error(self): + with self.assertRaises(ValueError) as ctx: + OCPGenerator._resolve_mig_partition_id("my-pod", "m", {}) + self.assertIn("my-pod", str(ctx.exception)) + self.assertIn("mig_instance_id must be a string or integer", str(ctx.exception)) + self.assertIn("dict", str(ctx.exception)) + + def test_invalid_type_list(self): + with self.assertRaises(ValueError): + OCPGenerator._resolve_mig_partition_id("p", "m", [])