From aabcb3c5021f8997c428fe9e28ab83724b88fa9e Mon Sep 17 00:00:00 2001 From: myersCody Date: Fri, 13 Mar 2026 13:28:59 -0400 Subject: [PATCH 1/3] Remove support for profile MIG mem & parent slice count --- nise/__init__.py | 2 +- nise/generators/ocp/ocp_generator.py | 20 ++------------------ tests/ocp_gpu_static_report.yml | 7 ------- tests/test_ocp_generator.py | 28 +++------------------------- 4 files changed, 6 insertions(+), 51 deletions(-) diff --git a/nise/__init__.py b/nise/__init__.py index d44673ef..bf7957b8 100644 --- a/nise/__init__.py +++ b/nise/__init__.py @@ -1,5 +1,5 @@ from .helpers import gcp_calculate_persistent_disk_usage_amount from .helpers import gcp_calculate_usage_amount_in_pricing -__version__ = "5.3.7" +__version__ = "5.3.8" VERSION = __version__.split(".") diff --git a/nise/generators/ocp/ocp_generator.py b/nise/generators/ocp/ocp_generator.py index ef53d7cd..28f6d3ae 100644 --- a/nise/generators/ocp/ocp_generator.py +++ b/nise/generators/ocp/ocp_generator.py @@ -222,9 +222,7 @@ "mig_instance_uuid", "mig_profile", "mig_slice_count", - "parent_gpu_max_slices", "parent_gpu_uuid", - "mig_memory_capacity_mib", ) COST_OCP_REPORT_TYPE_TO_COLS = { OCP_POD_USAGE: OCP_POD_USAGE_COLUMNS, @@ -1647,7 +1645,6 @@ def _gen_gpus(self): # noqa: C901 gpu_model = gpu_spec.get("gpu_model", choice(GPU_MODELS)) name = f"nise.ocp.gpu.{node_name}.{pod_name}.{gpu_idx}" parent_gpu_uuid = f"GPU-{uuid5(NAMESPACE_DNS, name)}" - max_slices = gpu_spec.get("gpu_max_slices") gpu_memory = gpu_spec.get("gpu_memory_capacity_mib", GPU_MEMORY_CAPACITY.get(gpu_model, 15360)) mig_instances = gpu_spec.get("mig_instances", []) if not mig_instances: @@ -1660,24 +1657,17 @@ def _gen_gpus(self): # noqa: C901 "mig_instance_uuid": None, "mig_profile": None, "mig_slice_count": None, - "parent_gpu_max_slices": None, "parent_gpu_uuid": None, - "mig_memory_capacity_mib": None, } ) continue - if not max_slices: - raise ValueError(f"GPU with MIG instances for pod '{pod_name}' requires gpu_max_slices") - for mig_idx, mig_spec in enumerate(mig_instances): mig_profile = mig_spec.get("mig_profile") mig_slice_count = mig_spec.get("mig_slice_count") - mig_memory = mig_spec.get("mig_memory_capacity_mib") - if not all([mig_profile, mig_slice_count, mig_memory]): + if not all([mig_profile, mig_slice_count]): raise ValueError( - f"MIG instance for pod '{pod_name}' requires mig_profile, " - "mig_slice_count, and mig_memory_capacity_mib" + f"MIG instance for pod '{pod_name}' requires mig_profile and mig_slice_count" ) mig_name = f"nise.ocp.mig.{node_name}.{pod_name}.{gpu_idx}.{mig_idx}" mig_instance_uuid = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}" @@ -1690,9 +1680,7 @@ def _gen_gpus(self): # noqa: C901 "mig_instance_uuid": mig_instance_uuid, "mig_profile": mig_profile, "mig_slice_count": mig_slice_count, - "parent_gpu_max_slices": max_slices, "parent_gpu_uuid": parent_gpu_uuid, - "mig_memory_capacity_mib": mig_memory, } ) @@ -1736,9 +1724,7 @@ def _gen_hourly_gpu_usage(self, **kwargs): mig_instance_uuid=gpu.get("mig_instance_uuid"), mig_profile=gpu.get("mig_profile"), mig_slice_count=gpu.get("mig_slice_count"), - parent_gpu_max_slices=gpu.get("parent_gpu_max_slices"), parent_gpu_uuid=gpu.get("parent_gpu_uuid"), - mig_memory_capacity_mib=gpu.get("mig_memory_capacity_mib"), **kwargs, ) yield row @@ -1757,9 +1743,7 @@ def _update_gpu_data(self, row, start, end, **kwargs): "mig_instance_uuid": kwargs.get("mig_instance_uuid"), "mig_profile": kwargs.get("mig_profile"), "mig_slice_count": kwargs.get("mig_slice_count"), - "parent_gpu_max_slices": kwargs.get("parent_gpu_max_slices"), "parent_gpu_uuid": kwargs.get("parent_gpu_uuid"), - "mig_memory_capacity_mib": kwargs.get("mig_memory_capacity_mib"), } row.update(data) return row diff --git a/tests/ocp_gpu_static_report.yml b/tests/ocp_gpu_static_report.yml index 0e2b7d0f..e3614807 100644 --- a/tests/ocp_gpu_static_report.yml +++ b/tests/ocp_gpu_static_report.yml @@ -61,33 +61,26 @@ generators: - gpu: gpu_model: "H100" gpu_memory_capacity_mib: 81920 - gpu_max_slices: 7 mig_instances: - mig_instance: mig_profile: "3g.40gb" mig_slice_count: 3 - mig_memory_capacity_mib: 40960 - mig_instance: mig_profile: "3g.40gb" mig_slice_count: 3 - mig_memory_capacity_mib: 40960 - gpu: gpu_model: "A100" gpu_memory_capacity_mib: 40960 - gpu_max_slices: 7 mig_instances: - mig_instance: mig_profile: "1g.5gb" mig_slice_count: 1 - mig_memory_capacity_mib: 5120 - mig_instance: mig_profile: "2g.10gb" mig_slice_count: 2 - mig_memory_capacity_mib: 10240 - mig_instance: mig_profile: "4g.20gb" mig_slice_count: 4 - mig_memory_capacity_mib: 20480 - gpu: gpu_model: "H100" gpu_memory_capacity_mib: 81920 diff --git a/tests/test_ocp_generator.py b/tests/test_ocp_generator.py index d5052320..cbb14f86 100644 --- a/tests/test_ocp_generator.py +++ b/tests/test_ocp_generator.py @@ -1198,12 +1198,10 @@ def _mig_gpu_attributes(self, pod_name="mig-pod", gpu_overrides=None, mig_instan gpu = { "gpu_model": "H100", "gpu_memory_capacity_mib": 81920, - "gpu_max_slices": 7, "mig_instances": [ { "mig_profile": "3g.40gb", "mig_slice_count": 3, - "mig_memory_capacity_mib": 40960, }, ], } @@ -1285,34 +1283,20 @@ def test_gen_gpus_yaml_with_mig_instances(self): self.assertEqual(gpu["gpu_model_name"], "H100") self.assertEqual(gpu["mig_profile"], "3g.40gb") self.assertEqual(gpu["mig_slice_count"], 3) - self.assertEqual(gpu["mig_memory_capacity_mib"], 40960) - self.assertEqual(gpu["parent_gpu_max_slices"], 7) self.assertIn("MIG-", gpu["mig_instance_uuid"]) self.assertIn("GPU-", gpu["parent_gpu_uuid"]) self.assertEqual(gpu["gpu_uuid"], gpu["mig_instance_uuid"]) - def test_gen_gpus_raises_when_mig_without_gpu_max_slices(self): - """Test that ValueError is raised when MIG instances are set but gpu_max_slices is missing.""" - attrs = self._mig_gpu_attributes(pod_name="bad-mig-pod") - del attrs["nodes"][0]["namespaces"]["mig-namespace"]["pods"][0]["gpus"][0]["gpu_max_slices"] - with self.assertRaises(ValueError) as ctx: - OCPGenerator(self.two_hours_ago, self.now, attrs) - self.assertIn("gpu_max_slices", str(ctx.exception)) - def test_gen_gpus_raises_when_mig_instance_missing_required_fields(self): - """Test that ValueError is raised when a MIG instance lacks mig_profile, mig_slice_count, or mig_memory.""" - attrs = self._mig_gpu_attributes( - pod_name="incomplete-mig-pod", - mig_instance_overrides={"mig_memory_capacity_mib": None}, - ) + """Test that ValueError is raised when a MIG instance lacks mig_profile or mig_slice_count.""" + attrs = self._mig_gpu_attributes(pod_name="incomplete-mig-pod") del attrs["nodes"][0]["namespaces"]["mig-namespace"]["pods"][0]["gpus"][0]["mig_instances"][0][ - "mig_memory_capacity_mib" + "mig_slice_count" ] with self.assertRaises(ValueError) as ctx: OCPGenerator(self.two_hours_ago, self.now, attrs) self.assertIn("mig_profile", str(ctx.exception)) self.assertIn("mig_slice_count", str(ctx.exception)) - self.assertIn("mig_memory_capacity_mib", str(ctx.exception)) def test_gen_hourly_gpu_usage_includes_mig_fields(self): """Test that GPU usage rows include MIG fields when pod has MIG instances.""" @@ -1324,8 +1308,6 @@ def test_gen_hourly_gpu_usage_includes_mig_fields(self): row = gpu_data[0] self.assertEqual(row["mig_profile"], "3g.40gb") self.assertEqual(row["mig_slice_count"], 3) - self.assertEqual(row["mig_memory_capacity_mib"], 40960) - self.assertEqual(row["parent_gpu_max_slices"], 7) self.assertIn("MIG-", row["mig_instance_uuid"]) self.assertIn("GPU-", row["parent_gpu_uuid"]) @@ -1410,9 +1392,7 @@ def test_update_gpu_data(self): "mig_instance_uuid": "MIG-test-uuid", "mig_profile": "3g.40gb", "mig_slice_count": 3, - "parent_gpu_max_slices": 7, "parent_gpu_uuid": "GPU-parent-uuid", - "mig_memory_capacity_mib": 40960, } updated_row = generator._update_gpu_data(row, self.two_hours_ago, self.now, **kwargs) self.assertEqual(updated_row["node"], "test-node") @@ -1426,9 +1406,7 @@ def test_update_gpu_data(self): self.assertEqual(updated_row["mig_instance_uuid"], "MIG-test-uuid") self.assertEqual(updated_row["mig_profile"], "3g.40gb") self.assertEqual(updated_row["mig_slice_count"], 3) - self.assertEqual(updated_row["parent_gpu_max_slices"], 7) self.assertEqual(updated_row["parent_gpu_uuid"], "GPU-parent-uuid") - self.assertEqual(updated_row["mig_memory_capacity_mib"], 40960) def test_gpu_usage_with_multiple_gpus_per_pod(self): """Test that multiple GPUs per pod generate separate rows.""" From 692d84f3bbf7e7c2641f5c2e3ed87f139d0f92f6 Mon Sep 17 00:00:00 2001 From: myersCody Date: Fri, 13 Mar 2026 13:29:33 -0400 Subject: [PATCH 2/3] lint --- nise/generators/ocp/ocp_generator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nise/generators/ocp/ocp_generator.py b/nise/generators/ocp/ocp_generator.py index 28f6d3ae..42df295c 100644 --- a/nise/generators/ocp/ocp_generator.py +++ b/nise/generators/ocp/ocp_generator.py @@ -1666,9 +1666,7 @@ def _gen_gpus(self): # noqa: C901 mig_profile = mig_spec.get("mig_profile") mig_slice_count = mig_spec.get("mig_slice_count") if not all([mig_profile, mig_slice_count]): - raise ValueError( - f"MIG instance for pod '{pod_name}' requires mig_profile and mig_slice_count" - ) + raise ValueError(f"MIG instance for pod '{pod_name}' requires mig_profile and mig_slice_count") mig_name = f"nise.ocp.mig.{node_name}.{pod_name}.{gpu_idx}.{mig_idx}" mig_instance_uuid = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}" pod_gpus.append( From abfdd28531ee3102102c269b936a66e40a84edaf Mon Sep 17 00:00:00 2001 From: myersCody Date: Fri, 13 Mar 2026 14:17:02 -0400 Subject: [PATCH 3/3] Address requests --- nise/generators/ocp/ocp_generator.py | 19 +++++++------------ tests/test_ocp_generator.py | 14 +++++--------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/nise/generators/ocp/ocp_generator.py b/nise/generators/ocp/ocp_generator.py index 42df295c..628a61fb 100644 --- a/nise/generators/ocp/ocp_generator.py +++ b/nise/generators/ocp/ocp_generator.py @@ -219,10 +219,9 @@ "gpu_vendor_name", "gpu_memory_capacity_mib", "gpu_pod_uptime", - "mig_instance_uuid", + "mig_instance_id", "mig_profile", "mig_slice_count", - "parent_gpu_uuid", ) COST_OCP_REPORT_TYPE_TO_COLS = { OCP_POD_USAGE: OCP_POD_USAGE_COLUMNS, @@ -1654,10 +1653,9 @@ def _gen_gpus(self): # noqa: C901 "gpu_model_name": gpu_model, "gpu_vendor_name": GPU_VENDOR, "gpu_memory_capacity_mib": gpu_memory, - "mig_instance_uuid": None, + "mig_instance_id": None, "mig_profile": None, "mig_slice_count": None, - "parent_gpu_uuid": None, } ) continue @@ -1668,17 +1666,16 @@ def _gen_gpus(self): # noqa: C901 if not all([mig_profile, mig_slice_count]): raise ValueError(f"MIG instance for pod '{pod_name}' requires mig_profile and mig_slice_count") mig_name = f"nise.ocp.mig.{node_name}.{pod_name}.{gpu_idx}.{mig_idx}" - mig_instance_uuid = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}" + mig_instance_id = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}" pod_gpus.append( { - "gpu_uuid": mig_instance_uuid, + "gpu_uuid": mig_instance_id, "gpu_model_name": gpu_model, "gpu_vendor_name": GPU_VENDOR, "gpu_memory_capacity_mib": gpu_memory, - "mig_instance_uuid": mig_instance_uuid, + "mig_instance_id": mig_instance_id, "mig_profile": mig_profile, "mig_slice_count": mig_slice_count, - "parent_gpu_uuid": parent_gpu_uuid, } ) @@ -1719,10 +1716,9 @@ def _gen_hourly_gpu_usage(self, **kwargs): gpu_vendor_name=gpu["gpu_vendor_name"], gpu_memory_capacity_mib=gpu["gpu_memory_capacity_mib"], gpu_pod_uptime=gpu_pod_uptime, - mig_instance_uuid=gpu.get("mig_instance_uuid"), + mig_instance_id=gpu.get("mig_instance_id"), mig_profile=gpu.get("mig_profile"), mig_slice_count=gpu.get("mig_slice_count"), - parent_gpu_uuid=gpu.get("parent_gpu_uuid"), **kwargs, ) yield row @@ -1738,10 +1734,9 @@ def _update_gpu_data(self, row, start, end, **kwargs): "gpu_vendor_name": kwargs.get("gpu_vendor_name"), "gpu_memory_capacity_mib": kwargs.get("gpu_memory_capacity_mib"), "gpu_pod_uptime": kwargs.get("gpu_pod_uptime"), - "mig_instance_uuid": kwargs.get("mig_instance_uuid"), + "mig_instance_id": kwargs.get("mig_instance_id"), "mig_profile": kwargs.get("mig_profile"), "mig_slice_count": kwargs.get("mig_slice_count"), - "parent_gpu_uuid": kwargs.get("parent_gpu_uuid"), } row.update(data) return row diff --git a/tests/test_ocp_generator.py b/tests/test_ocp_generator.py index cbb14f86..ddec1138 100644 --- a/tests/test_ocp_generator.py +++ b/tests/test_ocp_generator.py @@ -1283,9 +1283,8 @@ def test_gen_gpus_yaml_with_mig_instances(self): self.assertEqual(gpu["gpu_model_name"], "H100") self.assertEqual(gpu["mig_profile"], "3g.40gb") self.assertEqual(gpu["mig_slice_count"], 3) - self.assertIn("MIG-", gpu["mig_instance_uuid"]) - self.assertIn("GPU-", gpu["parent_gpu_uuid"]) - self.assertEqual(gpu["gpu_uuid"], gpu["mig_instance_uuid"]) + self.assertIn("MIG-", gpu["mig_instance_id"]) + self.assertEqual(gpu["gpu_uuid"], gpu["mig_instance_id"]) def test_gen_gpus_raises_when_mig_instance_missing_required_fields(self): """Test that ValueError is raised when a MIG instance lacks mig_profile or mig_slice_count.""" @@ -1308,8 +1307,7 @@ def test_gen_hourly_gpu_usage_includes_mig_fields(self): row = gpu_data[0] self.assertEqual(row["mig_profile"], "3g.40gb") self.assertEqual(row["mig_slice_count"], 3) - self.assertIn("MIG-", row["mig_instance_uuid"]) - self.assertIn("GPU-", row["parent_gpu_uuid"]) + self.assertIn("MIG-", row["mig_instance_id"]) def test_gen_gpus_random_generation(self): """Test random GPU generation (10% of pods).""" @@ -1389,10 +1387,9 @@ def test_update_gpu_data(self): "gpu_vendor_name": GPU_VENDOR, "gpu_memory_capacity_mib": 15360, "gpu_pod_uptime": 3000.123456, - "mig_instance_uuid": "MIG-test-uuid", + "mig_instance_id": "MIG-test-uuid", "mig_profile": "3g.40gb", "mig_slice_count": 3, - "parent_gpu_uuid": "GPU-parent-uuid", } updated_row = generator._update_gpu_data(row, self.two_hours_ago, self.now, **kwargs) self.assertEqual(updated_row["node"], "test-node") @@ -1403,10 +1400,9 @@ def test_update_gpu_data(self): self.assertEqual(updated_row["gpu_vendor_name"], GPU_VENDOR) self.assertEqual(updated_row["gpu_memory_capacity_mib"], 15360) self.assertEqual(updated_row["gpu_pod_uptime"], 3000.123456) - self.assertEqual(updated_row["mig_instance_uuid"], "MIG-test-uuid") + self.assertEqual(updated_row["mig_instance_id"], "MIG-test-uuid") self.assertEqual(updated_row["mig_profile"], "3g.40gb") self.assertEqual(updated_row["mig_slice_count"], 3) - self.assertEqual(updated_row["parent_gpu_uuid"], "GPU-parent-uuid") def test_gpu_usage_with_multiple_gpus_per_pod(self): """Test that multiple GPUs per pod generate separate rows."""