diff --git a/nise/__init__.py b/nise/__init__.py index bf7957b8..2d7d288d 100644 --- a/nise/__init__.py +++ b/nise/__init__.py @@ -1,5 +1,5 @@ from .helpers import gcp_calculate_persistent_disk_usage_amount from .helpers import gcp_calculate_usage_amount_in_pricing -__version__ = "5.3.8" +__version__ = "5.3.9" VERSION = __version__.split(".") diff --git a/nise/generators/ocp/ocp_generator.py b/nise/generators/ocp/ocp_generator.py index 628a61fb..34b1869a 100644 --- a/nise/generators/ocp/ocp_generator.py +++ b/nise/generators/ocp/ocp_generator.py @@ -221,7 +221,7 @@ "gpu_pod_uptime", "mig_instance_id", "mig_profile", - "mig_slice_count", + "mig_strategy", ) COST_OCP_REPORT_TYPE_TO_COLS = { OCP_POD_USAGE: OCP_POD_USAGE_COLUMNS, @@ -1655,27 +1655,36 @@ def _gen_gpus(self): # noqa: C901 "gpu_memory_capacity_mib": gpu_memory, "mig_instance_id": None, "mig_profile": None, - "mig_slice_count": None, + "mig_strategy": None, } ) continue for mig_idx, mig_spec in enumerate(mig_instances): mig_profile = mig_spec.get("mig_profile") - mig_slice_count = mig_spec.get("mig_slice_count") - if not all([mig_profile, mig_slice_count]): - raise ValueError(f"MIG instance for pod '{pod_name}' requires mig_profile and mig_slice_count") + mig_strategy = mig_spec.get("mig_strategy") + if not all([mig_profile, mig_strategy]): + raise ValueError(f"MIG instance for pod '{pod_name}' requires mig_profile and mig_strategy") mig_name = f"nise.ocp.mig.{node_name}.{pod_name}.{gpu_idx}.{mig_idx}" - mig_instance_id = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}" + mig_uuid = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}" + raw_mig_instance_id = mig_spec.get("mig_instance_id", mig_idx + 1) + try: + # MIG instance IDs are numeric; default to 1-based per-GPU-slice index when not provided. + mig_instance_id = int(raw_mig_instance_id) + except (ValueError, TypeError) as exc: + raise ValueError( + f"pod {pod_name}: mig_instance_id must be an integer value, got {raw_mig_instance_id}" + ) from exc + pod_gpus.append( { - "gpu_uuid": mig_instance_id, + "gpu_uuid": mig_uuid, "gpu_model_name": gpu_model, "gpu_vendor_name": GPU_VENDOR, "gpu_memory_capacity_mib": gpu_memory, "mig_instance_id": mig_instance_id, "mig_profile": mig_profile, - "mig_slice_count": mig_slice_count, + "mig_strategy": mig_strategy, } ) @@ -1718,7 +1727,7 @@ def _gen_hourly_gpu_usage(self, **kwargs): gpu_pod_uptime=gpu_pod_uptime, mig_instance_id=gpu.get("mig_instance_id"), mig_profile=gpu.get("mig_profile"), - mig_slice_count=gpu.get("mig_slice_count"), + mig_strategy=gpu.get("mig_strategy"), **kwargs, ) yield row @@ -1736,7 +1745,7 @@ def _update_gpu_data(self, row, start, end, **kwargs): "gpu_pod_uptime": kwargs.get("gpu_pod_uptime"), "mig_instance_id": kwargs.get("mig_instance_id"), "mig_profile": kwargs.get("mig_profile"), - "mig_slice_count": kwargs.get("mig_slice_count"), + "mig_strategy": kwargs.get("mig_strategy"), } row.update(data) return row diff --git a/tests/ocp_gpu_static_report.yml b/tests/ocp_gpu_static_report.yml index e3614807..72f78515 100644 --- a/tests/ocp_gpu_static_report.yml +++ b/tests/ocp_gpu_static_report.yml @@ -64,26 +64,33 @@ generators: mig_instances: - mig_instance: mig_profile: "3g.40gb" - mig_slice_count: 3 + mig_strategy: "mixed" - mig_instance: mig_profile: "3g.40gb" - mig_slice_count: 3 + mig_strategy: "mixed" - gpu: gpu_model: "A100" gpu_memory_capacity_mib: 40960 mig_instances: - mig_instance: mig_profile: "1g.5gb" - mig_slice_count: 1 + mig_strategy: "mixed" - mig_instance: mig_profile: "2g.10gb" - mig_slice_count: 2 + mig_strategy: "mixed" - mig_instance: mig_profile: "4g.20gb" - mig_slice_count: 4 + mig_strategy: "mixed" - gpu: gpu_model: "H100" gpu_memory_capacity_mib: 81920 - gpu: gpu_model: "H100" gpu_memory_capacity_mib: 81920 + - gpu: + gpu_model: "GH100" + gpu_memory_capacity_mib: 40960 + mig_instances: + - mig_instance: + mig_profile: "3g.40gb" + mig_strategy: "single" diff --git a/tests/test_ocp_generator.py b/tests/test_ocp_generator.py index ddec1138..9d794f4f 100644 --- a/tests/test_ocp_generator.py +++ b/tests/test_ocp_generator.py @@ -1201,7 +1201,7 @@ def _mig_gpu_attributes(self, pod_name="mig-pod", gpu_overrides=None, mig_instan "mig_instances": [ { "mig_profile": "3g.40gb", - "mig_slice_count": 3, + "mig_strategy": "mixed", }, ], } @@ -1282,20 +1282,29 @@ def test_gen_gpus_yaml_with_mig_instances(self): gpu = pod_gpus[0] self.assertEqual(gpu["gpu_model_name"], "H100") self.assertEqual(gpu["mig_profile"], "3g.40gb") - self.assertEqual(gpu["mig_slice_count"], 3) - self.assertIn("MIG-", gpu["mig_instance_id"]) - self.assertEqual(gpu["gpu_uuid"], gpu["mig_instance_id"]) + self.assertEqual(gpu["mig_strategy"], "mixed") + self.assertIn("MIG-", gpu["gpu_uuid"]) + self.assertIsInstance(gpu["mig_instance_id"], int) + self.assertEqual(gpu["mig_instance_id"], 1) def test_gen_gpus_raises_when_mig_instance_missing_required_fields(self): - """Test that ValueError is raised when a MIG instance lacks mig_profile or mig_slice_count.""" + """Test that ValueError is raised when a MIG instance lacks mig_profile or mig_strategy.""" attrs = self._mig_gpu_attributes(pod_name="incomplete-mig-pod") - del attrs["nodes"][0]["namespaces"]["mig-namespace"]["pods"][0]["gpus"][0]["mig_instances"][0][ - "mig_slice_count" - ] + del attrs["nodes"][0]["namespaces"]["mig-namespace"]["pods"][0]["gpus"][0]["mig_instances"][0]["mig_strategy"] with self.assertRaises(ValueError) as ctx: OCPGenerator(self.two_hours_ago, self.now, attrs) self.assertIn("mig_profile", str(ctx.exception)) - self.assertIn("mig_slice_count", str(ctx.exception)) + self.assertIn("mig_strategy", str(ctx.exception)) + + def test_gen_gpus_raises_when_mig_instance_id_is_not_integer(self): + """Test that ValueError is raised when mig_instance_id cannot be coerced to int.""" + attrs = self._mig_gpu_attributes( + pod_name="invalid-mig-id-pod", mig_instance_overrides={"mig_instance_id": "not-an-int"} + ) + with self.assertRaises(ValueError) as ctx: + OCPGenerator(self.two_hours_ago, self.now, attrs) + self.assertIn("invalid-mig-id-pod", str(ctx.exception)) + self.assertIn("mig_instance_id must be an integer value", str(ctx.exception)) def test_gen_hourly_gpu_usage_includes_mig_fields(self): """Test that GPU usage rows include MIG fields when pod has MIG instances.""" @@ -1306,8 +1315,9 @@ def test_gen_hourly_gpu_usage_includes_mig_fields(self): self.assertGreater(len(gpu_data), 0) row = gpu_data[0] self.assertEqual(row["mig_profile"], "3g.40gb") - self.assertEqual(row["mig_slice_count"], 3) - self.assertIn("MIG-", row["mig_instance_id"]) + self.assertEqual(row["mig_strategy"], "mixed") + self.assertIsInstance(row["mig_instance_id"], int) + self.assertEqual(row["mig_instance_id"], 1) def test_gen_gpus_random_generation(self): """Test random GPU generation (10% of pods).""" @@ -1387,9 +1397,9 @@ def test_update_gpu_data(self): "gpu_vendor_name": GPU_VENDOR, "gpu_memory_capacity_mib": 15360, "gpu_pod_uptime": 3000.123456, - "mig_instance_id": "MIG-test-uuid", + "mig_instance_id": 1, "mig_profile": "3g.40gb", - "mig_slice_count": 3, + "mig_strategy": "mixed", } updated_row = generator._update_gpu_data(row, self.two_hours_ago, self.now, **kwargs) self.assertEqual(updated_row["node"], "test-node") @@ -1400,9 +1410,9 @@ def test_update_gpu_data(self): self.assertEqual(updated_row["gpu_vendor_name"], GPU_VENDOR) self.assertEqual(updated_row["gpu_memory_capacity_mib"], 15360) self.assertEqual(updated_row["gpu_pod_uptime"], 3000.123456) - self.assertEqual(updated_row["mig_instance_id"], "MIG-test-uuid") + self.assertEqual(updated_row["mig_instance_id"], 1) self.assertEqual(updated_row["mig_profile"], "3g.40gb") - self.assertEqual(updated_row["mig_slice_count"], 3) + self.assertEqual(updated_row["mig_strategy"], "mixed") def test_gpu_usage_with_multiple_gpus_per_pod(self): """Test that multiple GPUs per pod generate separate rows."""