From 82f7edd648ff729d2329165b2e8b4ac4e4610062 Mon Sep 17 00:00:00 2001 From: "Mr. Creditcard" Date: Wed, 25 Feb 2026 17:10:14 +0100 Subject: [PATCH] Add nvidia_smi_clock_speed check plugin --- .../collection/agent_based/nvidia_smi.py | 157 ++++++++++++++++ .../collection/graphing/gpu_clock_speed.py | 84 +++++++++ .../collection/agent_based/test_nvidia_smi.py | 174 ++++++++++++++++++ 3 files changed, 415 insertions(+) create mode 100644 cmk/plugins/collection/graphing/gpu_clock_speed.py diff --git a/cmk/plugins/collection/agent_based/nvidia_smi.py b/cmk/plugins/collection/agent_based/nvidia_smi.py index c9c9a5c6cf4..681914a6fe8 100644 --- a/cmk/plugins/collection/agent_based/nvidia_smi.py +++ b/cmk/plugins/collection/agent_based/nvidia_smi.py @@ -51,6 +51,7 @@ ] MiB = 1024.0**2 +MHz_to_Hz_factor = 1_000_000 class PowerManagement(Enum): @@ -96,6 +97,17 @@ class Utilization(BaseModel): decoder_util: float | None +class Clock(BaseModel): + graphics_clock: float | None + sm_clock: float | None + mem_clock: float | None + video_clock: float | None + graphics_clock_max: float | None + sm_clock_max: float | None + mem_clock_max: float | None + video_clock_max: float | None + + class GPU(BaseModel): id: str product_name: str | None @@ -103,6 +115,7 @@ class GPU(BaseModel): power_readings: PowerReadings temperature: Temperature utilization: Utilization + clock: Clock class Section(BaseModel): @@ -247,6 +260,20 @@ def parse_nvidia_smi(string_table: StringTable) -> Section: encoder_util=get_float_from_element(gpu.find("utilization/encoder_util"), "%"), decoder_util=get_float_from_element(gpu.find("utilization/decoder_util"), "%"), ), + clock=Clock( + graphics_clock=get_float_from_element(gpu.find("clocks/graphics_clock"), "MHz"), + sm_clock=get_float_from_element(gpu.find("clocks/sm_clock"), "MHz"), + mem_clock=get_float_from_element(gpu.find("clocks/mem_clock"), "MHz"), + video_clock=get_float_from_element(gpu.find("clocks/video_clock"), "MHz"), + 
graphics_clock_max=get_float_from_element( + gpu.find("max_clocks/graphics_clock"), "MHz" + ), + sm_clock_max=get_float_from_element(gpu.find("max_clocks/sm_clock"), "MHz"), + mem_clock_max=get_float_from_element(gpu.find("max_clocks/mem_clock"), "MHz"), + video_clock_max=get_float_from_element( + gpu.find("max_clocks/video_clock"), "MHz" + ), + ), ) for gpu in xml.findall("gpu") }, @@ -522,3 +549,133 @@ def check_nvidia_smi_memory_util( check_default_parameters=MemoryParams(levels_total=None, levels_bar1=None, levels_fb=None), check_function=check_nvidia_smi_memory_util, ) + + +class ClockParams(TypedDict): + levels_graphics_upper: tuple[float, float] | None + levels_graphics_lower: tuple[float, float] | None + levels_sm_upper: tuple[float, float] | None + levels_sm_lower: tuple[float, float] | None + levels_mem_upper: tuple[float, float] | None + levels_mem_lower: tuple[float, float] | None + levels_video_upper: tuple[float, float] | None + levels_video_lower: tuple[float, float] | None + + +def discover_nvidia_smi_clock_speed(section: Section) -> DiscoveryResult: + for gpu_id, gpu in section.gpus.items(): + if gpu.clock.graphics_clock is not None: + yield Service(item=gpu_id) + + +def check_nvidia_smi_clock_speed( + item: str, + params: ClockParams, + section: Section, +) -> CheckResult: + if not (gpu := section.gpus.get(item)): + return + + graphics_clock = gpu.clock.graphics_clock + sm_clock = gpu.clock.sm_clock + mem_clock = gpu.clock.mem_clock + video_clock = gpu.clock.video_clock + graphics_clock_max = gpu.clock.graphics_clock_max + sm_clock_max = gpu.clock.sm_clock_max + mem_clock_max = gpu.clock.mem_clock_max + video_clock_max = gpu.clock.video_clock_max + + info_texts: dict[str, str | None] = { + "graphics_clock": None, + "sm_clock": None, + "mem_clock": None, + "video_clock": None, + } + + if graphics_clock is not None and graphics_clock_max is not None: + info_texts["graphics_clock"] = ( + f"Graphics clock: {graphics_clock} MHz / {graphics_clock_max} 
MHz" + ) + if sm_clock is not None and sm_clock_max is not None: + info_texts["sm_clock"] = f"SM clock: {sm_clock} MHz / {sm_clock_max} MHz" + if mem_clock is not None and mem_clock_max is not None: + info_texts["mem_clock"] = f"MEM clock: {mem_clock} MHz / {mem_clock_max} MHz" + if video_clock is not None and video_clock_max is not None: + info_texts["video_clock"] = f"Video clock: {video_clock} MHz / {video_clock_max} MHz" + + info_text: str = ", ".join([info for info in info_texts.values() if info is not None]) + + yield Result(state=State.OK, summary=info_text or "No clock speed information available") + + # nvidia-smi reports clock speeds in MHz, Checkmk expects them in Hz for render.frequency + if graphics_clock is not None and graphics_clock_max is not None: + yield from check_levels_v1( + graphics_clock * MHz_to_Hz_factor, + levels_upper=params.get("levels_graphics_upper"), + levels_lower=params.get("levels_graphics_lower"), + render_func=render.frequency, + metric_name="graphics_clock", + boundaries=(0.0, graphics_clock_max * MHz_to_Hz_factor), + label="Graphics clock", + notice_only=True, + ) + yield Metric("graphics_clock_max", graphics_clock_max * MHz_to_Hz_factor) + + if sm_clock is not None and sm_clock_max is not None: + yield from check_levels_v1( + sm_clock * MHz_to_Hz_factor, + levels_upper=params.get("levels_sm_upper"), + levels_lower=params.get("levels_sm_lower"), + render_func=render.frequency, + metric_name="sm_clock", + boundaries=(0.0, sm_clock_max * MHz_to_Hz_factor), + label="SM clock", + notice_only=True, + ) + yield Metric("sm_clock_max", sm_clock_max * MHz_to_Hz_factor) + + if mem_clock is not None and mem_clock_max is not None: + yield from check_levels_v1( + mem_clock * MHz_to_Hz_factor, + levels_upper=params.get("levels_mem_upper"), + levels_lower=params.get("levels_mem_lower"), + render_func=render.frequency, + metric_name="mem_clock", + boundaries=(0.0, mem_clock_max * MHz_to_Hz_factor), + label="MEM clock", + notice_only=True, + ) + yield 
Metric("mem_clock_max", 
mem_clock_max * MHz_to_Hz_factor) + + if video_clock is not None and video_clock_max is not None: + yield from check_levels_v1( + video_clock * MHz_to_Hz_factor, + levels_upper=params.get("levels_video_upper"), + levels_lower=params.get("levels_video_lower"), + render_func=render.frequency, + metric_name="video_clock", + boundaries=(0.0, video_clock_max * MHz_to_Hz_factor), + label="Video clock", + notice_only=True, + ) + yield Metric("video_clock_max", video_clock_max * MHz_to_Hz_factor) + + +check_plugin_nvidia_smi_clock_speed = CheckPlugin( + name="nvidia_smi_clock_speed", + service_name="Nvidia GPU Clock speed %s", + sections=["nvidia_smi"], + discovery_function=discover_nvidia_smi_clock_speed, + check_ruleset_name="nvidia_smi_clock_speed", + check_default_parameters=ClockParams( + levels_graphics_upper=None, + levels_graphics_lower=None, + levels_sm_upper=None, + levels_sm_lower=None, + levels_mem_upper=None, + levels_mem_lower=None, + levels_video_upper=None, + levels_video_lower=None, + ), + check_function=check_nvidia_smi_clock_speed, +) diff --git a/cmk/plugins/collection/graphing/gpu_clock_speed.py b/cmk/plugins/collection/graphing/gpu_clock_speed.py new file mode 100644 index 00000000000..06a027facc9 --- /dev/null +++ b/cmk/plugins/collection/graphing/gpu_clock_speed.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024 Checkmk GmbH - License: GNU General Public License v2 +# This file is part of Checkmk (https://checkmk.com). It is subject to the terms and +# conditions defined in the file COPYING, which is part of this source code package. 
+ +from cmk.graphing.v1 import graphs, metrics, Title + +UNIT_HZ = metrics.Unit(metrics.SINotation("Hz")) + +metric_graphics_clock = metrics.Metric( + name="graphics_clock", + title=Title("Graphics Clock speed"), + unit=UNIT_HZ, + color=metrics.Color.GREEN, +) +metric_graphics_clock_max = metrics.Metric( + name="graphics_clock_max", + title=Title("Graphics Clock speed (Max)"), + unit=UNIT_HZ, + color=metrics.Color.PURPLE, +) + +graph_graphics_clock = graphs.Graph( + name="graphics_clock", + title=Title("Graphics Clock speed"), + simple_lines=["graphics_clock", "graphics_clock_max"], +) + +metric_sm_clock = metrics.Metric( + name="sm_clock", + title=Title("SM Clock speed"), + unit=UNIT_HZ, + color=metrics.Color.GREEN, +) +metric_sm_clock_max = metrics.Metric( + name="sm_clock_max", + title=Title("SM Clock speed (Max)"), + unit=UNIT_HZ, + color=metrics.Color.PURPLE, +) + +graph_sm_clock = graphs.Graph( + name="sm_clock", + title=Title("SM Clock speed"), + simple_lines=["sm_clock", "sm_clock_max"], +) + +metric_mem_clock = metrics.Metric( + name="mem_clock", + title=Title("MEM Clock speed"), + unit=UNIT_HZ, + color=metrics.Color.GREEN, +) +metric_mem_clock_max = metrics.Metric( + name="mem_clock_max", + title=Title("MEM Clock speed (Max)"), + unit=UNIT_HZ, + color=metrics.Color.PURPLE, +) + +graph_mem_clock = graphs.Graph( + name="mem_clock", + title=Title("MEM Clock speed"), + simple_lines=["mem_clock", "mem_clock_max"], +) + +metric_video_clock = metrics.Metric( + name="video_clock", + title=Title("Video Clock speed"), + unit=UNIT_HZ, + color=metrics.Color.GREEN, +) +metric_video_clock_max = metrics.Metric( + name="video_clock_max", + title=Title("Video Clock speed (Max)"), + unit=UNIT_HZ, + color=metrics.Color.PURPLE, +) + +graph_video_clock = graphs.Graph( + name="video_clock", + title=Title("Video Clock speed"), + simple_lines=["video_clock", "video_clock_max"], +) diff --git a/tests/unit/cmk/plugins/collection/agent_based/test_nvidia_smi.py 
b/tests/unit/cmk/plugins/collection/agent_based/test_nvidia_smi.py index 80ab829ec94..b66873b9f85 100644 --- a/tests/unit/cmk/plugins/collection/agent_based/test_nvidia_smi.py +++ b/tests/unit/cmk/plugins/collection/agent_based/test_nvidia_smi.py @@ -268,6 +268,16 @@ def empty_value_store(monkeypatch: pytest.MonkeyPatch) -> None: encoder_util=3.0, decoder_util=8.0, ), + clock=nvidia_smi.Clock( + graphics_clock=1244.0, + sm_clock=1244.0, + mem_clock=6993.0, + video_clock=1154.0, + graphics_clock_max=2475.0, + sm_clock_max=2475.0, + mem_clock_max=7001.0, + video_clock_max=1950.0, + ), ) }, ) @@ -581,3 +591,167 @@ def test_check_nvidia_smi_memory_util( expected_result: CheckResult, ) -> None: assert list(nvidia_smi.check_nvidia_smi_memory_util(item, params, section)) == expected_result + + +# ------------------------------------------------------------------------ + + +@pytest.mark.parametrize( + "section, expected_result", + [ + ( + SECTION, + [Service(item="00000000:0B:00.0")], + ), + ], +) +def test_discover_nvidia_smi_clock_speed( + section: nvidia_smi.Section, + expected_result: DiscoveryResult, +) -> None: + assert list(nvidia_smi.discover_nvidia_smi_clock_speed(section)) == expected_result + + +@pytest.mark.parametrize( + "item, params, section, expected_result", + [ + ( + "00000000:0B:00.0", + {}, + SECTION, + [ + Result( + state=State.OK, + summary="Graphics clock: 1244.0 MHz / 2475.0 MHz, SM clock: 1244.0 MHz / 2475.0 MHz, MEM clock: 6993.0 MHz / 7001.0 MHz, Video clock: 1154.0 MHz / 1950.0 MHz", + ), + Result(state=State.OK, notice="Graphics clock: 1.24 GHz"), + Metric("graphics_clock", 1244_000_000.0, boundaries=(0.0, 2475_000_000.0)), + Metric("graphics_clock_max", 2475_000_000.0), + Result(state=State.OK, notice="SM clock: 1.24 GHz"), + Metric("sm_clock", 1244_000_000.0, boundaries=(0.0, 2475_000_000.0)), + Metric("sm_clock_max", 2475_000_000.0), + Result(state=State.OK, notice="MEM clock: 6.99 GHz"), + Metric("mem_clock", 6993_000_000.0, boundaries=(0.0, 
7001_000_000.0)), + Metric("mem_clock_max", 7001_000_000.0), + Result(state=State.OK, notice="Video clock: 1.15 GHz"), + Metric("video_clock", 1154_000_000.0, boundaries=(0.0, 1950_000_000.0)), + Metric("video_clock_max", 1950_000_000.0), + ], + ), + ( + "00000000:0B:00.0", + nvidia_smi.ClockParams( + levels_graphics_upper=(1200_000_000.0, 1900_000_000.0), + levels_graphics_lower=None, + levels_sm_upper=(1000_000_000.0, 1200_000_000.0), + levels_sm_lower=None, + levels_mem_upper=(6900_000_000.0, 7000_000_000.0), + levels_mem_lower=None, + levels_video_upper=(900_000_000.0, 1100_000_000.0), + levels_video_lower=None, + ), + SECTION, + [ + Result( + state=State.OK, + summary="Graphics clock: 1244.0 MHz / 2475.0 MHz, SM clock: 1244.0 MHz / 2475.0 MHz, MEM clock: 6993.0 MHz / 7001.0 MHz, Video clock: 1154.0 MHz / 1950.0 MHz", + ), + Result( + state=State.WARN, + notice="Graphics clock: 1.24 GHz (warn/crit at 1.20 GHz/1.90 GHz)", + ), + Metric( + "graphics_clock", + 1244_000_000.0, + levels=(1200_000_000.0, 1900_000_000.0), + boundaries=(0.0, 2475_000_000.0), + ), + Metric("graphics_clock_max", 2475_000_000.0), + Result( + state=State.CRIT, + notice="SM clock: 1.24 GHz (warn/crit at 1.00 GHz/1.20 GHz)", + ), + Metric( + "sm_clock", + 1244_000_000.0, + levels=(1000_000_000.0, 1200_000_000.0), + boundaries=(0.0, 2475_000_000.0), + ), + Metric("sm_clock_max", 2475_000_000.0), + Result( + state=State.WARN, + notice="MEM clock: 6.99 GHz (warn/crit at 6.90 GHz/7.00 GHz)", + ), + Metric( + "mem_clock", + 6993_000_000.0, + levels=(6900_000_000.0, 7000_000_000.0), + boundaries=(0.0, 7001_000_000.0), + ), + Metric("mem_clock_max", 7001_000_000.0), + Result( + state=State.CRIT, + notice="Video clock: 1.15 GHz (warn/crit at 900 MHz/1.10 GHz)", + ), + Metric( + "video_clock", + 1154_000_000.0, + levels=(900_000_000.0, 1100_000_000.0), + boundaries=(0.0, 1950_000_000.0), + ), + Metric("video_clock_max", 1950_000_000.0), + ], + ), + ( + "00000000:0B:00.0", + nvidia_smi.ClockParams( + 
levels_graphics_upper=None, + 
levels_graphics_lower=(1700_000_000.0, 1500_000_000.0), + levels_sm_upper=None, + levels_sm_lower=(1300_000_000.0, 1100_000_000.0), + levels_mem_upper=None, + levels_mem_lower=(7000_000_000.0, 6900_000_000.0), + levels_video_upper=None, + levels_video_lower=(1300_000_000.0, 1200_000_000.0), + ), + SECTION, + [ + Result( + state=State.OK, + summary="Graphics clock: 1244.0 MHz / 2475.0 MHz, SM clock: 1244.0 MHz / 2475.0 MHz, MEM clock: 6993.0 MHz / 7001.0 MHz, Video clock: 1154.0 MHz / 1950.0 MHz", + ), + Result( + state=State.CRIT, + notice="Graphics clock: 1.24 GHz (warn/crit below 1.70 GHz/1.50 GHz)", + ), + Metric("graphics_clock", 1244_000_000.0, boundaries=(0.0, 2475_000_000.0)), + Metric("graphics_clock_max", 2475_000_000.0), + Result( + state=State.WARN, + notice="SM clock: 1.24 GHz (warn/crit below 1.30 GHz/1.10 GHz)", + ), + Metric("sm_clock", 1244_000_000.0, boundaries=(0.0, 2475_000_000.0)), + Metric("sm_clock_max", 2475_000_000.0), + Result( + state=State.WARN, + notice="MEM clock: 6.99 GHz (warn/crit below 7.00 GHz/6.90 GHz)", + ), + Metric("mem_clock", 6993_000_000.0, boundaries=(0.0, 7001_000_000.0)), + Metric("mem_clock_max", 7001_000_000.0), + Result( + state=State.CRIT, + notice="Video clock: 1.15 GHz (warn/crit below 1.30 GHz/1.20 GHz)", + ), + Metric("video_clock", 1154_000_000.0, boundaries=(0.0, 1950_000_000.0)), + Metric("video_clock_max", 1950_000_000.0), + ], + ), + ], +) +def test_check_nvidia_smi_clock_speed( + item: str, + params: nvidia_smi.ClockParams, + section: nvidia_smi.Section, + expected_result: CheckResult, +) -> None: + assert list(nvidia_smi.check_nvidia_smi_clock_speed(item, params, section)) == expected_result