Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nise/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .helpers import gcp_calculate_persistent_disk_usage_amount
from .helpers import gcp_calculate_usage_amount_in_pricing

__version__ = "5.3.9"
__version__ = "5.4.0"
VERSION = __version__.split(".")
33 changes: 22 additions & 11 deletions nise/generators/ocp/ocp_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -1604,6 +1604,22 @@ def _gen_hourly_namespace_label_usage(self, **kwargs):
)
yield row

@staticmethod
def _resolve_mig_partition_id(pod_name, mig_name, raw_mig_id):
"""Stable MIG partition id: generated from mig_name, or from YAML string/int."""
if raw_mig_id is None:
return f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}"
if isinstance(raw_mig_id, str):
return raw_mig_id
if isinstance(raw_mig_id, int):
return f"MIG-{raw_mig_id}"
try:
return f"MIG-{int(raw_mig_id)}"
except (ValueError, TypeError) as exc:
raise ValueError(
f"pod {pod_name}: mig_instance_id must be a string or integer, got {type(raw_mig_id).__name__}"
) from exc

def _gen_gpus(self): # noqa: C901
"""Create GPUs for pods and nodes that need them."""
gpus = {}
Expand Down Expand Up @@ -1666,23 +1682,18 @@ def _gen_gpus(self): # noqa: C901
if not all([mig_profile, mig_strategy]):
raise ValueError(f"MIG instance for pod '{pod_name}' requires mig_profile and mig_strategy")
mig_name = f"nise.ocp.mig.{node_name}.{pod_name}.{gpu_idx}.{mig_idx}"
mig_uuid = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}"
raw_mig_instance_id = mig_spec.get("mig_instance_id", mig_idx + 1)
try:
# MIG instance IDs are numeric; default to 1-based per-GPU-slice index when not provided.
mig_instance_id = int(raw_mig_instance_id)
except (ValueError, TypeError) as exc:
raise ValueError(
f"pod {pod_name}: mig_instance_id must be an integer value, got {raw_mig_instance_id}"
) from exc
mig_partition_id = self._resolve_mig_partition_id(
pod_name, mig_name, mig_spec.get("mig_instance_id")
)

# Physical GPU uuid is shared by all MIG slices on this YAML gpu entry.
pod_gpus.append(
{
"gpu_uuid": mig_uuid,
"gpu_uuid": parent_gpu_uuid,
"gpu_model_name": gpu_model,
"gpu_vendor_name": GPU_VENDOR,
"gpu_memory_capacity_mib": gpu_memory,
"mig_instance_id": mig_instance_id,
"mig_instance_id": mig_partition_id,
"mig_profile": mig_profile,
"mig_strategy": mig_strategy,
}
Expand Down
123 changes: 110 additions & 13 deletions tests/test_ocp_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
from datetime import datetime
from datetime import timedelta
from unittest import TestCase
from uuid import NAMESPACE_DNS
from uuid import uuid5
from unittest.mock import Mock
from unittest.mock import patch

Expand Down Expand Up @@ -1283,9 +1285,9 @@ def test_gen_gpus_yaml_with_mig_instances(self):
self.assertEqual(gpu["gpu_model_name"], "H100")
self.assertEqual(gpu["mig_profile"], "3g.40gb")
self.assertEqual(gpu["mig_strategy"], "mixed")
self.assertIn("MIG-", gpu["gpu_uuid"])
self.assertIsInstance(gpu["mig_instance_id"], int)
self.assertEqual(gpu["mig_instance_id"], 1)
self.assertIn("GPU-", gpu["gpu_uuid"])
self.assertIsInstance(gpu["mig_instance_id"], str)
self.assertTrue(gpu["mig_instance_id"].startswith("MIG-"))

def test_gen_gpus_raises_when_mig_instance_missing_required_fields(self):
"""Test that ValueError is raised when a MIG instance lacks mig_profile or mig_strategy."""
Expand All @@ -1296,15 +1298,59 @@ def test_gen_gpus_raises_when_mig_instance_missing_required_fields(self):
self.assertIn("mig_profile", str(ctx.exception))
self.assertIn("mig_strategy", str(ctx.exception))

def test_gen_gpus_raises_when_mig_instance_id_is_not_integer(self):
"""Test that ValueError is raised when mig_instance_id cannot be coerced to int."""
attrs = self._mig_gpu_attributes(
pod_name="invalid-mig-id-pod", mig_instance_overrides={"mig_instance_id": "not-an-int"}
)
def test_gen_gpus_raises_when_mig_instance_id_invalid_type(self):
"""Test that ValueError is raised when mig_instance_id is not a string or integer."""
attrs = self._mig_gpu_attributes(pod_name="invalid-mig-id-pod", mig_instance_overrides={"mig_instance_id": {}})
with self.assertRaises(ValueError) as ctx:
OCPGenerator(self.two_hours_ago, self.now, attrs)
self.assertIn("invalid-mig-id-pod", str(ctx.exception))
self.assertIn("mig_instance_id must be an integer value", str(ctx.exception))
self.assertIn("mig_instance_id must be a string or integer", str(ctx.exception))

def test_mig_instances_share_same_physical_gpu_uuid(self):
"""All MIG slices on one YAML gpu share gpu_uuid; mig_instance_id is unique per slice."""
gpu = {
"gpu_model": "A100",
"gpu_memory_capacity_mib": 40960,
"mig_instances": [
{"mig_profile": "1g.5gb", "mig_strategy": "mixed"},
{"mig_profile": "2g.10gb", "mig_strategy": "mixed"},
{"mig_profile": "4g.40gb", "mig_strategy": "mixed"},
],
}
attrs = {
"nodes": [
{
"node_name": "mig-node",
"cpu_cores": 16,
"memory_gig": 64,
"namespaces": {
"ns": {
"pods": [
{
"pod_name": "triple-mig-pod",
"cpu_request": 4,
"mem_request_gig": 16,
"cpu_limit": 8,
"mem_limit_gig": 32,
"gpus": [gpu],
},
]
}
},
}
]
}
generator = OCPGenerator(self.two_hours_ago, self.now, attrs)
pod_gpus = generator.gpus["triple-mig-pod"]
self.assertEqual(len(pod_gpus), 3)
physical_uuids = {g["gpu_uuid"] for g in pod_gpus}
self.assertEqual(len(physical_uuids), 1)
self.assertTrue(next(iter(physical_uuids)).startswith("GPU-"))
mig_ids = [g["mig_instance_id"] for g in pod_gpus]
self.assertEqual(len(set(mig_ids)), 3)
for mid in mig_ids:
self.assertIsInstance(mid, str)
self.assertTrue(mid.startswith("MIG-"))

def test_gen_hourly_gpu_usage_includes_mig_fields(self):
"""Test that GPU usage rows include MIG fields when pod has MIG instances."""
Expand All @@ -1316,8 +1362,9 @@ def test_gen_hourly_gpu_usage_includes_mig_fields(self):
row = gpu_data[0]
self.assertEqual(row["mig_profile"], "3g.40gb")
self.assertEqual(row["mig_strategy"], "mixed")
self.assertIsInstance(row["mig_instance_id"], int)
self.assertEqual(row["mig_instance_id"], 1)
self.assertIsInstance(row["mig_instance_id"], str)
self.assertTrue(row["mig_instance_id"].startswith("MIG-"))
self.assertIn("GPU-", row["gpu_uuid"])

def test_gen_gpus_random_generation(self):
"""Test random GPU generation (10% of pods)."""
Expand Down Expand Up @@ -1397,7 +1444,7 @@ def test_update_gpu_data(self):
"gpu_vendor_name": GPU_VENDOR,
"gpu_memory_capacity_mib": 15360,
"gpu_pod_uptime": 3000.123456,
"mig_instance_id": 1,
"mig_instance_id": "MIG-partition-1",
"mig_profile": "3g.40gb",
"mig_strategy": "mixed",
}
Expand All @@ -1410,7 +1457,7 @@ def test_update_gpu_data(self):
self.assertEqual(updated_row["gpu_vendor_name"], GPU_VENDOR)
self.assertEqual(updated_row["gpu_memory_capacity_mib"], 15360)
self.assertEqual(updated_row["gpu_pod_uptime"], 3000.123456)
self.assertEqual(updated_row["mig_instance_id"], 1)
self.assertEqual(updated_row["mig_instance_id"], "MIG-partition-1")
self.assertEqual(updated_row["mig_profile"], "3g.40gb")
self.assertEqual(updated_row["mig_strategy"], "mixed")

Expand Down Expand Up @@ -1519,3 +1566,53 @@ def test_gpu_pod_uptime_matches_pod_seconds(self):
unique_uptimes = set(uptimes)
self.assertEqual(len(unique_uptimes), 1, f"Expected all GPUs to have same uptime, got {unique_uptimes}")
self.assertEqual(unique_uptimes.pop(), specific_pod_seconds)


class ResolveMigPartitionIdTest(TestCase):
"""Tests for OCPGenerator._resolve_mig_partition_id."""

def test_none_uses_stable_uuid_from_mig_name(self):
mig_name = "nise.ocp.mig.node-a.pod-b.0.1"
expected = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}"
result = OCPGenerator._resolve_mig_partition_id("pod-b", mig_name, None)
self.assertEqual(result, expected)
self.assertTrue(result.startswith("MIG-"))

def test_none_same_mig_name_same_id(self):
mig_name = "nise.ocp.mig.n.p.0.0"
a = OCPGenerator._resolve_mig_partition_id("p", mig_name, None)
b = OCPGenerator._resolve_mig_partition_id("p", mig_name, None)
self.assertEqual(a, b)

def test_none_different_mig_name_different_id(self):
a = OCPGenerator._resolve_mig_partition_id("p", "nise.ocp.mig.n.p.0.0", None)
b = OCPGenerator._resolve_mig_partition_id("p", "nise.ocp.mig.n.p.0.1", None)
self.assertNotEqual(a, b)

def test_string_returned_unchanged(self):
self.assertEqual(
OCPGenerator._resolve_mig_partition_id("pod", "mig", "MIG-abc-123"),
"MIG-abc-123",
)
self.assertEqual(
OCPGenerator._resolve_mig_partition_id("pod", "mig", "custom-partition"),
"custom-partition",
)

def test_int_prefixed_with_mig(self):
self.assertEqual(OCPGenerator._resolve_mig_partition_id("p", "m", 7), "MIG-7")
self.assertEqual(OCPGenerator._resolve_mig_partition_id("p", "m", 0), "MIG-0")

def test_coerces_numeric_non_int_via_int(self):
self.assertEqual(OCPGenerator._resolve_mig_partition_id("p", "m", 3.9), "MIG-3")

def test_invalid_type_raises_value_error(self):
with self.assertRaises(ValueError) as ctx:
OCPGenerator._resolve_mig_partition_id("my-pod", "m", {})
self.assertIn("my-pod", str(ctx.exception))
self.assertIn("mig_instance_id must be a string or integer", str(ctx.exception))
self.assertIn("dict", str(ctx.exception))

def test_invalid_type_list(self):
with self.assertRaises(ValueError):
OCPGenerator._resolve_mig_partition_id("p", "m", [])
Loading