Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nise/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .helpers import gcp_calculate_persistent_disk_usage_amount
from .helpers import gcp_calculate_usage_amount_in_pricing

__version__ = "5.3.8"
__version__ = "5.3.9"
VERSION = __version__.split(".")
29 changes: 19 additions & 10 deletions nise/generators/ocp/ocp_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@
"gpu_pod_uptime",
"mig_instance_id",
"mig_profile",
"mig_slice_count",
"mig_strategy",
)
COST_OCP_REPORT_TYPE_TO_COLS = {
OCP_POD_USAGE: OCP_POD_USAGE_COLUMNS,
Expand Down Expand Up @@ -1655,27 +1655,36 @@ def _gen_gpus(self): # noqa: C901
"gpu_memory_capacity_mib": gpu_memory,
"mig_instance_id": None,
"mig_profile": None,
"mig_slice_count": None,
"mig_strategy": None,
}
)
continue

for mig_idx, mig_spec in enumerate(mig_instances):
mig_profile = mig_spec.get("mig_profile")
mig_slice_count = mig_spec.get("mig_slice_count")
if not all([mig_profile, mig_slice_count]):
raise ValueError(f"MIG instance for pod '{pod_name}' requires mig_profile and mig_slice_count")
mig_strategy = mig_spec.get("mig_strategy")
if not all([mig_profile, mig_strategy]):
raise ValueError(f"MIG instance for pod '{pod_name}' requires mig_profile and mig_strategy")
mig_name = f"nise.ocp.mig.{node_name}.{pod_name}.{gpu_idx}.{mig_idx}"
mig_instance_id = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}"
mig_uuid = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}"
raw_mig_instance_id = mig_spec.get("mig_instance_id", mig_idx + 1)
try:
# MIG instance IDs are numeric; default to 1-based per-GPU-slice index when not provided.
mig_instance_id = int(raw_mig_instance_id)
except (ValueError, TypeError) as exc:
raise ValueError(
f"pod {pod_name}: mig_instance_id must be an integer value, got {raw_mig_instance_id}"
) from exc

pod_gpus.append(
{
"gpu_uuid": mig_instance_id,
"gpu_uuid": mig_uuid,
"gpu_model_name": gpu_model,
"gpu_vendor_name": GPU_VENDOR,
"gpu_memory_capacity_mib": gpu_memory,
"mig_instance_id": mig_instance_id,
"mig_profile": mig_profile,
"mig_slice_count": mig_slice_count,
"mig_strategy": mig_strategy,
}
)

Expand Down Expand Up @@ -1718,7 +1727,7 @@ def _gen_hourly_gpu_usage(self, **kwargs):
gpu_pod_uptime=gpu_pod_uptime,
mig_instance_id=gpu.get("mig_instance_id"),
mig_profile=gpu.get("mig_profile"),
mig_slice_count=gpu.get("mig_slice_count"),
mig_strategy=gpu.get("mig_strategy"),
**kwargs,
)
yield row
Expand All @@ -1736,7 +1745,7 @@ def _update_gpu_data(self, row, start, end, **kwargs):
"gpu_pod_uptime": kwargs.get("gpu_pod_uptime"),
"mig_instance_id": kwargs.get("mig_instance_id"),
"mig_profile": kwargs.get("mig_profile"),
"mig_slice_count": kwargs.get("mig_slice_count"),
"mig_strategy": kwargs.get("mig_strategy"),
}
row.update(data)
return row
Expand Down
17 changes: 12 additions & 5 deletions tests/ocp_gpu_static_report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,26 +64,33 @@ generators:
mig_instances:
- mig_instance:
mig_profile: "3g.40gb"
mig_slice_count: 3
mig_strategy: "mixed"
- mig_instance:
mig_profile: "3g.40gb"
mig_slice_count: 3
mig_strategy: "mixed"
- gpu:
gpu_model: "A100"
gpu_memory_capacity_mib: 40960
mig_instances:
- mig_instance:
mig_profile: "1g.5gb"
mig_slice_count: 1
mig_strategy: "mixed"
- mig_instance:
mig_profile: "2g.10gb"
mig_slice_count: 2
mig_strategy: "mixed"
- mig_instance:
mig_profile: "4g.20gb"
mig_slice_count: 4
mig_strategy: "mixed"
- gpu:
gpu_model: "H100"
gpu_memory_capacity_mib: 81920
- gpu:
gpu_model: "H100"
gpu_memory_capacity_mib: 81920
- gpu:
gpu_model: "GH100"
gpu_memory_capacity_mib: 40960
mig_instances:
- mig_instance:
mig_profile: "3g.40gb"
mig_strategy: "single"
40 changes: 25 additions & 15 deletions tests/test_ocp_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -1201,7 +1201,7 @@ def _mig_gpu_attributes(self, pod_name="mig-pod", gpu_overrides=None, mig_instan
"mig_instances": [
{
"mig_profile": "3g.40gb",
"mig_slice_count": 3,
"mig_strategy": "mixed",
},
],
}
Expand Down Expand Up @@ -1282,20 +1282,29 @@ def test_gen_gpus_yaml_with_mig_instances(self):
gpu = pod_gpus[0]
self.assertEqual(gpu["gpu_model_name"], "H100")
self.assertEqual(gpu["mig_profile"], "3g.40gb")
self.assertEqual(gpu["mig_slice_count"], 3)
self.assertIn("MIG-", gpu["mig_instance_id"])
self.assertEqual(gpu["gpu_uuid"], gpu["mig_instance_id"])
self.assertEqual(gpu["mig_strategy"], "mixed")
self.assertIn("MIG-", gpu["gpu_uuid"])
self.assertIsInstance(gpu["mig_instance_id"], int)
self.assertEqual(gpu["mig_instance_id"], 1)

def test_gen_gpus_raises_when_mig_instance_missing_required_fields(self):
"""Test that ValueError is raised when a MIG instance lacks mig_profile or mig_slice_count."""
"""Test that ValueError is raised when a MIG instance lacks mig_profile or mig_strategy."""
attrs = self._mig_gpu_attributes(pod_name="incomplete-mig-pod")
del attrs["nodes"][0]["namespaces"]["mig-namespace"]["pods"][0]["gpus"][0]["mig_instances"][0][
"mig_slice_count"
]
del attrs["nodes"][0]["namespaces"]["mig-namespace"]["pods"][0]["gpus"][0]["mig_instances"][0]["mig_strategy"]
with self.assertRaises(ValueError) as ctx:
OCPGenerator(self.two_hours_ago, self.now, attrs)
self.assertIn("mig_profile", str(ctx.exception))
self.assertIn("mig_slice_count", str(ctx.exception))
self.assertIn("mig_strategy", str(ctx.exception))

def test_gen_gpus_raises_when_mig_instance_id_is_not_integer(self):
    """A mig_instance_id that cannot be converted to int must raise ValueError.

    The error text is expected to name the offending pod and to state the
    integer requirement.
    """
    bad_id_override = {"mig_instance_id": "not-an-int"}
    static_attrs = self._mig_gpu_attributes(
        pod_name="invalid-mig-id-pod",
        mig_instance_overrides=bad_id_override,
    )

    # Building the generator is what triggers the GPU/MIG validation.
    with self.assertRaises(ValueError) as raised:
        OCPGenerator(self.two_hours_ago, self.now, static_attrs)

    # Checked after the context manager exits; the captured exception stays available.
    error_text = str(raised.exception)
    for expected_fragment in ("invalid-mig-id-pod", "mig_instance_id must be an integer value"):
        self.assertIn(expected_fragment, error_text)

def test_gen_hourly_gpu_usage_includes_mig_fields(self):
"""Test that GPU usage rows include MIG fields when pod has MIG instances."""
Expand All @@ -1306,8 +1315,9 @@ def test_gen_hourly_gpu_usage_includes_mig_fields(self):
self.assertGreater(len(gpu_data), 0)
row = gpu_data[0]
self.assertEqual(row["mig_profile"], "3g.40gb")
self.assertEqual(row["mig_slice_count"], 3)
self.assertIn("MIG-", row["mig_instance_id"])
self.assertEqual(row["mig_strategy"], "mixed")
self.assertIsInstance(row["mig_instance_id"], int)
self.assertEqual(row["mig_instance_id"], 1)

def test_gen_gpus_random_generation(self):
"""Test random GPU generation (10% of pods)."""
Expand Down Expand Up @@ -1387,9 +1397,9 @@ def test_update_gpu_data(self):
"gpu_vendor_name": GPU_VENDOR,
"gpu_memory_capacity_mib": 15360,
"gpu_pod_uptime": 3000.123456,
"mig_instance_id": "MIG-test-uuid",
"mig_instance_id": 1,
"mig_profile": "3g.40gb",
"mig_slice_count": 3,
"mig_strategy": "mixed",
}
updated_row = generator._update_gpu_data(row, self.two_hours_ago, self.now, **kwargs)
self.assertEqual(updated_row["node"], "test-node")
Expand All @@ -1400,9 +1410,9 @@ def test_update_gpu_data(self):
self.assertEqual(updated_row["gpu_vendor_name"], GPU_VENDOR)
self.assertEqual(updated_row["gpu_memory_capacity_mib"], 15360)
self.assertEqual(updated_row["gpu_pod_uptime"], 3000.123456)
self.assertEqual(updated_row["mig_instance_id"], "MIG-test-uuid")
self.assertEqual(updated_row["mig_instance_id"], 1)
self.assertEqual(updated_row["mig_profile"], "3g.40gb")
self.assertEqual(updated_row["mig_slice_count"], 3)
self.assertEqual(updated_row["mig_strategy"], "mixed")

def test_gpu_usage_with_multiple_gpus_per_pod(self):
"""Test that multiple GPUs per pod generate separate rows."""
Expand Down
Loading