Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nise/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .helpers import gcp_calculate_persistent_disk_usage_amount
from .helpers import gcp_calculate_usage_amount_in_pricing

__version__ = "5.3.7"
__version__ = "5.3.8"
VERSION = __version__.split(".")
41 changes: 9 additions & 32 deletions nise/generators/ocp/ocp_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,12 +219,9 @@
"gpu_vendor_name",
"gpu_memory_capacity_mib",
"gpu_pod_uptime",
"mig_instance_uuid",
"mig_instance_id",
"mig_profile",
"mig_slice_count",
"parent_gpu_max_slices",
"parent_gpu_uuid",
"mig_memory_capacity_mib",
)
COST_OCP_REPORT_TYPE_TO_COLS = {
OCP_POD_USAGE: OCP_POD_USAGE_COLUMNS,
Expand Down Expand Up @@ -1647,7 +1644,6 @@ def _gen_gpus(self): # noqa: C901
gpu_model = gpu_spec.get("gpu_model", choice(GPU_MODELS))
name = f"nise.ocp.gpu.{node_name}.{pod_name}.{gpu_idx}"
parent_gpu_uuid = f"GPU-{uuid5(NAMESPACE_DNS, name)}"
max_slices = gpu_spec.get("gpu_max_slices")
gpu_memory = gpu_spec.get("gpu_memory_capacity_mib", GPU_MEMORY_CAPACITY.get(gpu_model, 15360))
mig_instances = gpu_spec.get("mig_instances", [])
if not mig_instances:
Expand All @@ -1657,42 +1653,29 @@ def _gen_gpus(self): # noqa: C901
"gpu_model_name": gpu_model,
"gpu_vendor_name": GPU_VENDOR,
"gpu_memory_capacity_mib": gpu_memory,
"mig_instance_uuid": None,
"mig_instance_id": None,
"mig_profile": None,
"mig_slice_count": None,
"parent_gpu_max_slices": None,
"parent_gpu_uuid": None,
"mig_memory_capacity_mib": None,
}
)
continue

if not max_slices:
raise ValueError(f"GPU with MIG instances for pod '{pod_name}' requires gpu_max_slices")

for mig_idx, mig_spec in enumerate(mig_instances):
mig_profile = mig_spec.get("mig_profile")
mig_slice_count = mig_spec.get("mig_slice_count")
mig_memory = mig_spec.get("mig_memory_capacity_mib")
if not all([mig_profile, mig_slice_count, mig_memory]):
raise ValueError(
f"MIG instance for pod '{pod_name}' requires mig_profile, "
"mig_slice_count, and mig_memory_capacity_mib"
)
if not all([mig_profile, mig_slice_count]):
raise ValueError(f"MIG instance for pod '{pod_name}' requires mig_profile and mig_slice_count")
mig_name = f"nise.ocp.mig.{node_name}.{pod_name}.{gpu_idx}.{mig_idx}"
mig_instance_uuid = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}"
mig_instance_id = f"MIG-{uuid5(NAMESPACE_DNS, mig_name)}"
pod_gpus.append(
{
"gpu_uuid": mig_instance_uuid,
"gpu_uuid": mig_instance_id,
"gpu_model_name": gpu_model,
"gpu_vendor_name": GPU_VENDOR,
"gpu_memory_capacity_mib": gpu_memory,
"mig_instance_uuid": mig_instance_uuid,
"mig_instance_id": mig_instance_id,
"mig_profile": mig_profile,
"mig_slice_count": mig_slice_count,
"parent_gpu_max_slices": max_slices,
"parent_gpu_uuid": parent_gpu_uuid,
"mig_memory_capacity_mib": mig_memory,
}
)

Expand Down Expand Up @@ -1733,12 +1716,9 @@ def _gen_hourly_gpu_usage(self, **kwargs):
gpu_vendor_name=gpu["gpu_vendor_name"],
gpu_memory_capacity_mib=gpu["gpu_memory_capacity_mib"],
gpu_pod_uptime=gpu_pod_uptime,
mig_instance_uuid=gpu.get("mig_instance_uuid"),
mig_instance_id=gpu.get("mig_instance_id"),
mig_profile=gpu.get("mig_profile"),
mig_slice_count=gpu.get("mig_slice_count"),
parent_gpu_max_slices=gpu.get("parent_gpu_max_slices"),
parent_gpu_uuid=gpu.get("parent_gpu_uuid"),
mig_memory_capacity_mib=gpu.get("mig_memory_capacity_mib"),
**kwargs,
)
yield row
Expand All @@ -1754,12 +1734,9 @@ def _update_gpu_data(self, row, start, end, **kwargs):
"gpu_vendor_name": kwargs.get("gpu_vendor_name"),
"gpu_memory_capacity_mib": kwargs.get("gpu_memory_capacity_mib"),
"gpu_pod_uptime": kwargs.get("gpu_pod_uptime"),
"mig_instance_uuid": kwargs.get("mig_instance_uuid"),
"mig_instance_id": kwargs.get("mig_instance_id"),
"mig_profile": kwargs.get("mig_profile"),
"mig_slice_count": kwargs.get("mig_slice_count"),
"parent_gpu_max_slices": kwargs.get("parent_gpu_max_slices"),
"parent_gpu_uuid": kwargs.get("parent_gpu_uuid"),
"mig_memory_capacity_mib": kwargs.get("mig_memory_capacity_mib"),
}
row.update(data)
return row
Expand Down
7 changes: 0 additions & 7 deletions tests/ocp_gpu_static_report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,33 +61,26 @@ generators:
- gpu:
gpu_model: "H100"
gpu_memory_capacity_mib: 81920
gpu_max_slices: 7
mig_instances:
- mig_instance:
mig_profile: "3g.40gb"
mig_slice_count: 3
mig_memory_capacity_mib: 40960
- mig_instance:
mig_profile: "3g.40gb"
mig_slice_count: 3
mig_memory_capacity_mib: 40960
- gpu:
gpu_model: "A100"
gpu_memory_capacity_mib: 40960
gpu_max_slices: 7
mig_instances:
- mig_instance:
mig_profile: "1g.5gb"
mig_slice_count: 1
mig_memory_capacity_mib: 5120
- mig_instance:
mig_profile: "2g.10gb"
mig_slice_count: 2
mig_memory_capacity_mib: 10240
- mig_instance:
mig_profile: "4g.20gb"
mig_slice_count: 4
mig_memory_capacity_mib: 20480
- gpu:
gpu_model: "H100"
gpu_memory_capacity_mib: 81920
Expand Down
42 changes: 8 additions & 34 deletions tests/test_ocp_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -1198,12 +1198,10 @@ def _mig_gpu_attributes(self, pod_name="mig-pod", gpu_overrides=None, mig_instan
gpu = {
"gpu_model": "H100",
"gpu_memory_capacity_mib": 81920,
"gpu_max_slices": 7,
"mig_instances": [
{
"mig_profile": "3g.40gb",
"mig_slice_count": 3,
"mig_memory_capacity_mib": 40960,
},
],
}
Expand Down Expand Up @@ -1285,34 +1283,19 @@ def test_gen_gpus_yaml_with_mig_instances(self):
self.assertEqual(gpu["gpu_model_name"], "H100")
self.assertEqual(gpu["mig_profile"], "3g.40gb")
self.assertEqual(gpu["mig_slice_count"], 3)
self.assertEqual(gpu["mig_memory_capacity_mib"], 40960)
self.assertEqual(gpu["parent_gpu_max_slices"], 7)
self.assertIn("MIG-", gpu["mig_instance_uuid"])
self.assertIn("GPU-", gpu["parent_gpu_uuid"])
self.assertEqual(gpu["gpu_uuid"], gpu["mig_instance_uuid"])

def test_gen_gpus_raises_when_mig_without_gpu_max_slices(self):
"""Test that ValueError is raised when MIG instances are set but gpu_max_slices is missing."""
attrs = self._mig_gpu_attributes(pod_name="bad-mig-pod")
del attrs["nodes"][0]["namespaces"]["mig-namespace"]["pods"][0]["gpus"][0]["gpu_max_slices"]
with self.assertRaises(ValueError) as ctx:
OCPGenerator(self.two_hours_ago, self.now, attrs)
self.assertIn("gpu_max_slices", str(ctx.exception))
self.assertIn("MIG-", gpu["mig_instance_id"])
self.assertEqual(gpu["gpu_uuid"], gpu["mig_instance_id"])

def test_gen_gpus_raises_when_mig_instance_missing_required_fields(self):
"""Test that ValueError is raised when a MIG instance lacks mig_profile, mig_slice_count, or mig_memory."""
attrs = self._mig_gpu_attributes(
pod_name="incomplete-mig-pod",
mig_instance_overrides={"mig_memory_capacity_mib": None},
)
"""Test that ValueError is raised when a MIG instance lacks mig_profile or mig_slice_count."""
attrs = self._mig_gpu_attributes(pod_name="incomplete-mig-pod")
del attrs["nodes"][0]["namespaces"]["mig-namespace"]["pods"][0]["gpus"][0]["mig_instances"][0][
"mig_memory_capacity_mib"
"mig_slice_count"
]
with self.assertRaises(ValueError) as ctx:
OCPGenerator(self.two_hours_ago, self.now, attrs)
self.assertIn("mig_profile", str(ctx.exception))
self.assertIn("mig_slice_count", str(ctx.exception))
Comment on lines +1291 to 1298
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This test is good, but it only covers the case where mig_slice_count is missing. To make it more robust and ensure that the validation works for all required fields, you could parameterize it to test the removal of each required field (mig_profile and mig_slice_count) individually. Using subTest would be a good way to achieve this without creating a separate test method for each field.

Suggested change
attrs = self._mig_gpu_attributes(pod_name="incomplete-mig-pod")
del attrs["nodes"][0]["namespaces"]["mig-namespace"]["pods"][0]["gpus"][0]["mig_instances"][0][
"mig_memory_capacity_mib"
"mig_slice_count"
]
with self.assertRaises(ValueError) as ctx:
OCPGenerator(self.two_hours_ago, self.now, attrs)
self.assertIn("mig_profile", str(ctx.exception))
self.assertIn("mig_slice_count", str(ctx.exception))
for field_to_remove in ["mig_profile", "mig_slice_count"]:
with self.subTest(field_to_remove=field_to_remove):
attrs = self._mig_gpu_attributes(pod_name="incomplete-mig-pod")
del attrs["nodes"][0]["namespaces"]["mig-namespace"]["pods"][0]["gpus"][0]["mig_instances"][0][
field_to_remove
]
with self.assertRaises(ValueError) as ctx:
OCPGenerator(self.two_hours_ago, self.now, attrs)
self.assertIn("mig_profile", str(ctx.exception))
self.assertIn("mig_slice_count", str(ctx.exception))

self.assertIn("mig_memory_capacity_mib", str(ctx.exception))

def test_gen_hourly_gpu_usage_includes_mig_fields(self):
"""Test that GPU usage rows include MIG fields when pod has MIG instances."""
Expand All @@ -1324,10 +1307,7 @@ def test_gen_hourly_gpu_usage_includes_mig_fields(self):
row = gpu_data[0]
self.assertEqual(row["mig_profile"], "3g.40gb")
self.assertEqual(row["mig_slice_count"], 3)
self.assertEqual(row["mig_memory_capacity_mib"], 40960)
self.assertEqual(row["parent_gpu_max_slices"], 7)
self.assertIn("MIG-", row["mig_instance_uuid"])
self.assertIn("GPU-", row["parent_gpu_uuid"])
self.assertIn("MIG-", row["mig_instance_id"])

def test_gen_gpus_random_generation(self):
"""Test random GPU generation (10% of pods)."""
Expand Down Expand Up @@ -1407,12 +1387,9 @@ def test_update_gpu_data(self):
"gpu_vendor_name": GPU_VENDOR,
"gpu_memory_capacity_mib": 15360,
"gpu_pod_uptime": 3000.123456,
"mig_instance_uuid": "MIG-test-uuid",
"mig_instance_id": "MIG-test-uuid",
"mig_profile": "3g.40gb",
"mig_slice_count": 3,
"parent_gpu_max_slices": 7,
"parent_gpu_uuid": "GPU-parent-uuid",
"mig_memory_capacity_mib": 40960,
}
updated_row = generator._update_gpu_data(row, self.two_hours_ago, self.now, **kwargs)
self.assertEqual(updated_row["node"], "test-node")
Expand All @@ -1423,12 +1400,9 @@ def test_update_gpu_data(self):
self.assertEqual(updated_row["gpu_vendor_name"], GPU_VENDOR)
self.assertEqual(updated_row["gpu_memory_capacity_mib"], 15360)
self.assertEqual(updated_row["gpu_pod_uptime"], 3000.123456)
self.assertEqual(updated_row["mig_instance_uuid"], "MIG-test-uuid")
self.assertEqual(updated_row["mig_instance_id"], "MIG-test-uuid")
self.assertEqual(updated_row["mig_profile"], "3g.40gb")
self.assertEqual(updated_row["mig_slice_count"], 3)
self.assertEqual(updated_row["parent_gpu_max_slices"], 7)
self.assertEqual(updated_row["parent_gpu_uuid"], "GPU-parent-uuid")
self.assertEqual(updated_row["mig_memory_capacity_mib"], 40960)

def test_gpu_usage_with_multiple_gpus_per_pod(self):
"""Test that multiple GPUs per pod generate separate rows."""
Expand Down
Loading