From 0c98c08964e43b908e6362b1d74c8fd1c4d24049 Mon Sep 17 00:00:00 2001 From: Klaus Schuetz Date: Tue, 24 Mar 2026 10:43:13 +0100 Subject: [PATCH 1/3] add module vsan capacity --- checkvsphere/vcmd/vsan.py | 126 ++++++++++++++++++++++++++++++++++++++ docs/cmd/vsan.md | 11 +++- 2 files changed, 136 insertions(+), 1 deletion(-) diff --git a/checkvsphere/vcmd/vsan.py b/checkvsphere/vcmd/vsan.py index eb0c994..ffda152 100644 --- a/checkvsphere/vcmd/vsan.py +++ b/checkvsphere/vcmd/vsan.py @@ -115,6 +115,8 @@ def run(): check_objecthealth(check, clusters) elif args.mode == "healthtest": check_healthtest(check, clusters) + elif args.mode == "capacity": + check_capacity(check, clusters, vhs) else: raise Exception("WHAT?") @@ -179,6 +181,114 @@ def check_objecthealth(check, clusters): (status, message) = check.check_messages(separator='\n', separator_all='\n', **opts) check.exit(status, message) +def check_capacity(check, clusters, vhs): + """ + Checks vSAN capacity, including slack and resync. + Provides performance data and status for Icinga. 
+ Optional debugging via args.debug + """ + try: + vcMos = vsu.GetVsanVcMos( + args._si._stub, + context=sslContext(args), + version=vsu.GetLatestVmodlVersion(args.host, int(args.port)) + ) + vsan_space_system = vcMos['vsan-cluster-space-report-system'] + except KeyError: + check.exit(CRITICAL, "vsan-cluster-space-report-system API nicht verfügbar!") + except Exception as e: + check.exit(CRITICAL, f"vsan API Fehler: {e}") + + # Default Thresholds (Effective Free %) + warn_eff = args.warning if args.warning is not None else 25 + crit_eff = args.critical if args.critical is not None else 15 + + for cluster in clusters: + try: + if not cluster['configurationEx'].vsanConfigInfo.enabled: + continue + if isbanned(args, cluster['name'], 'exclude'): + continue + if not isallowed(args, cluster['name'], 'include'): + continue + + if getattr(args, 'debug', False): + print(f"DEBUG: Cluster={cluster['name']}, MoRef={cluster['moref']}") + print("DEBUG: vsan_space_system methods:", dir(vsan_space_system)) + + # Try ManagedStorageSpaceUsage, fallback QuerySpaceUsage + try: + if getattr(args, 'debug', False): + print("DEBUG: Versuch QueryVsanManagedStorageSpaceUsage") + capacity = vsan_space_system.QueryVsanManagedStorageSpaceUsage(cluster['moref']) + except Exception as e1: + if getattr(args, 'debug', False): + print(f"DEBUG: QueryVsanManagedStorageSpaceUsage failed ({e1}), fallback QuerySpaceUsage") + capacity = vsan_space_system.QuerySpaceUsage(cluster['moref']) + + if getattr(args, 'debug', False): + print("DEBUG: Capacity abgerufen:", capacity) + + # Correct Usage-Calculation + total = getattr(capacity, 'totalCapacityB', 0) + used = getattr(getattr(capacity, 'spaceOverview', None), 'usedB', 0) + free = getattr(capacity, 'freeCapacityB', 0) + slack = getattr(capacity, 'slackSpaceB', 0) + resync = getattr(capacity, 'resyncSpaceB', 0) + + effective_free = max(0, free - slack - resync) + usage_pct = (used / total) * 100 if total > 0 else 0 + effective_free_pct = (effective_free / 
total) * 100 if total > 0 else 0 + + # Calculate status + state = OK + if effective_free_pct < crit_eff: + state = CRITICAL + elif effective_free_pct < warn_eff: + state = WARNING + + # Perfdata rounded Values & Thresholds + check.add_perfdata(label=f"{cluster['name']}_usage", + value=round(usage_pct, 1), + warning=warn_eff, + critical=crit_eff, + uom='%') + check.add_perfdata(label=f"{cluster['name']}_free_gb", + value=round(free / 1024**3, 1), + uom='GB') + check.add_perfdata(label=f"{cluster['name']}_slack_gb", + value=round(slack / 1024**3, 1), + uom='GB') + check.add_perfdata(label=f"{cluster['name']}_resync_gb", + value=round(resync / 1024**3, 1), + uom='GB') + check.add_perfdata(label=f"{cluster['name']}_effective_free_gb", + value=round(effective_free / 1024**3, 1), + warning=warn_eff, + critical=crit_eff, + uom='GB') + + # Message + check.add_message( + state, + f"{cluster['name']}: usage={round(usage_pct,1)}% " + f"(free={round(free/1024*3,1)}GB, slack={round(slack/1024*3,1)}GB, " + f"resync={round(resync/1024**3,1)}GB, effective_free={round(effective_free_pct,1)}%)" + ) + + except Exception as e: + if getattr(args, 'debug', False): + print(f"DEBUG ERROR: Cluster={cluster['name']}, Exception={e}") + check.add_message(CRITICAL, f"{cluster['name']}: Fehler beim Abfragen: {e}") + + # All OK Option + opts = {} + if not getattr(args, 'verbose', False): + opts['allok'] = "everything is fine" + + status, message = check.check_messages(separator='\n', separator_all='\n', **opts) + check.exit(status, message) + def sslContext(args): context = ssl.create_default_context() context.check_hostname = False @@ -209,10 +319,26 @@ def get_argparser(): 'choices': [ 'objecthealth', 'healthtest', + 'capacity' ], 'help': 'which runtime mode to check' } }) + parser.add_optional_arguments({ + 'name_or_flags': ['--warning'], + 'options': { + 'type': float, + 'help': 'Warning threshold for usage in percent' + } + }) + + parser.add_optional_arguments({ + 'name_or_flags': 
['--critical'], + 'options': { + 'type': float, + 'help': 'Critical threshold for usage in percent' + } + }) return parser diff --git a/docs/cmd/vsan.md b/docs/cmd/vsan.md index 5c68130..010e01b 100644 --- a/docs/cmd/vsan.md +++ b/docs/cmd/vsan.md @@ -16,7 +16,7 @@ options: |---|---| | `--vihost HOSTNAME` | (optional) the name of the HostSystem to check, if omitted the first HostSystem found is checked, which is handy if you run this check directly against the host | | `--maintenance-state STATE` | one of OK, WARNING, CRITICAL, UNKNOWN. The status to use when the host is in maintenance mode, this defaults to UNKNOWN | -| `--mode MODE` | one of objecthealth, healthtest | +| `--mode MODE` | one of objecthealth, healthtest, capacity | | `--include REGEX` | (optional) REGEX is checked against the cluster name | | `--exclude REGEX` | (optional) REGEX is checked against the cluster name | | `--include-group REGEX` | (optional) only with `--mode healthtest`, REGEX is checked against the tests' group name | @@ -25,6 +25,8 @@ options: | `--exclude-test REGEX` | (optional) only with `--mode healthtest`, REGEX is checked against the test name | | `--cache` | fetch cached data from the API when available and not outdated | | `--verbose` | show also tests the where OK | +| `--warning` | warning free threshold for capacity | +| `--critical` | critical free threshold for capacity | ### `--mode healthtest` @@ -45,6 +47,13 @@ REGEX of `--include`, `--exclude` is matched against cluster name. This is an in depth check of the "vSAN object health" test. It's not very well tested yet. +### `--mode capacity` + +REGEX of `--include`, `--exclude` is matched against cluster name. + +This Checks vSAN capacity, including slack and resync. +Provides performance data. 
Uses --warning and --critical for free threshold + ## Examples ``` From 11f624c6e187c7a225809537c57611d963d06cc0 Mon Sep 17 00:00:00 2001 From: Klaus Schuetz Date: Wed, 25 Mar 2026 08:38:17 +0100 Subject: [PATCH 2/3] correct language and thresholds --- checkvsphere/vcmd/vsan.py | 60 ++++++++++----------------------------- 1 file changed, 15 insertions(+), 45 deletions(-) diff --git a/checkvsphere/vcmd/vsan.py b/checkvsphere/vcmd/vsan.py index ffda152..5d68359 100644 --- a/checkvsphere/vcmd/vsan.py +++ b/checkvsphere/vcmd/vsan.py @@ -65,6 +65,7 @@ def run(): args = parser.get_args() check = Check() + check.set_threshold(warning=args.warning, critical=args.critical) args._si = service_instance.connect(args) @@ -184,8 +185,6 @@ def check_objecthealth(check, clusters): def check_capacity(check, clusters, vhs): """ Checks vSAN capacity, including slack and resync. - Provides performance data and status for Icinga. - Optional debugging via args.debug """ try: vcMos = vsu.GetVsanVcMos( @@ -195,13 +194,9 @@ def check_capacity(check, clusters, vhs): ) vsan_space_system = vcMos['vsan-cluster-space-report-system'] except KeyError: - check.exit(CRITICAL, "vsan-cluster-space-report-system API nicht verfügbar!") + check.exit(CRITICAL, "vsan-cluster-space-report-system API not avilable!") except Exception as e: - check.exit(CRITICAL, f"vsan API Fehler: {e}") - - # Default Thresholds (Effective Free %) - warn_eff = args.warning if args.warning is not None else 25 - crit_eff = args.critical if args.critical is not None else 15 + check.exit(CRITICAL, f"vsan API error: {e}") for cluster in clusters: try: @@ -219,7 +214,7 @@ def check_capacity(check, clusters, vhs): # Try ManagedStorageSpaceUsage, fallback QuerySpaceUsage try: if getattr(args, 'debug', False): - print("DEBUG: Versuch QueryVsanManagedStorageSpaceUsage") + print("DEBUG: Try QueryVsanManagedStorageSpaceUsage") capacity = vsan_space_system.QueryVsanManagedStorageSpaceUsage(cluster['moref']) except Exception as e1: if 
getattr(args, 'debug', False): @@ -227,7 +222,7 @@ def check_capacity(check, clusters, vhs): capacity = vsan_space_system.QuerySpaceUsage(cluster['moref']) if getattr(args, 'debug', False): - print("DEBUG: Capacity abgerufen:", capacity) + print("DEBUG: Capacity querried:", capacity) # Correct Usage-Calculation total = getattr(capacity, 'totalCapacityB', 0) @@ -240,19 +235,14 @@ def check_capacity(check, clusters, vhs): usage_pct = (used / total) * 100 if total > 0 else 0 effective_free_pct = (effective_free / total) * 100 if total > 0 else 0 - # Calculate status - state = OK - if effective_free_pct < crit_eff: - state = CRITICAL - elif effective_free_pct < warn_eff: - state = WARNING + # This checks the usage_pct against args.warning and args.critical + state = check.check_threshold(usage_pct) - # Perfdata rounded Values & Thresholds check.add_perfdata(label=f"{cluster['name']}_usage", value=round(usage_pct, 1), - warning=warn_eff, - critical=crit_eff, - uom='%') + uom='%', + threshold=check.threshold) + check.add_perfdata(label=f"{cluster['name']}_free_gb", value=round(free / 1024**3, 1), uom='GB') @@ -264,24 +254,18 @@ def check_capacity(check, clusters, vhs): uom='GB') check.add_perfdata(label=f"{cluster['name']}_effective_free_gb", value=round(effective_free / 1024**3, 1), - warning=warn_eff, - critical=crit_eff, uom='GB') - # Message check.add_message( state, f"{cluster['name']}: usage={round(usage_pct,1)}% " - f"(free={round(free/1024*3,1)}GB, slack={round(slack/1024*3,1)}GB, " + f"(free={round(free/1024**3,1)}GB, slack={round(slack/1024**3,1)}GB, " f"resync={round(resync/1024**3,1)}GB, effective_free={round(effective_free_pct,1)}%)" ) except Exception as e: - if getattr(args, 'debug', False): - print(f"DEBUG ERROR: Cluster={cluster['name']}, Exception={e}") - check.add_message(CRITICAL, f"{cluster['name']}: Fehler beim Abfragen: {e}") - - # All OK Option + check.add_message(CRITICAL, f"{cluster['name']}: Error while querying: {e}") + opts = {} if not 
getattr(args, 'verbose', False): opts['allok'] = "everything is fine" @@ -324,22 +308,8 @@ def get_argparser(): 'help': 'which runtime mode to check' } }) - parser.add_optional_arguments({ - 'name_or_flags': ['--warning'], - 'options': { - 'type': float, - 'help': 'Warning threshold for usage in percent' - } - }) - - parser.add_optional_arguments({ - 'name_or_flags': ['--critical'], - 'options': { - 'type': float, - 'help': 'Critical threshold for usage in percent' - } - }) - + parser.add_optional_arguments(CheckArgument.WARNING_THRESHOLD) + parser.add_optional_arguments(CheckArgument.CRITICAL_THRESHOLD) return parser def import_vsan(): From 358b57b36c432089058d4f9aa29f9e48565d0702 Mon Sep 17 00:00:00 2001 From: Klaus Schuetz Date: Wed, 25 Mar 2026 08:38:17 +0100 Subject: [PATCH 3/3] correct language and threshold, adapt doc --- docs/cmd/vsan.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/cmd/vsan.md b/docs/cmd/vsan.md index 010e01b..3fe03af 100644 --- a/docs/cmd/vsan.md +++ b/docs/cmd/vsan.md @@ -25,8 +25,8 @@ options: | `--exclude-test REGEX` | (optional) only with `--mode healthtest`, REGEX is checked against the test name | | `--cache` | fetch cached data from the API when available and not outdated | | `--verbose` | show also tests the where OK | -| `--warning` | warning free threshold for capacity | -| `--critical` | critical free threshold for capacity | +| `--warning` | warning threshold for `--mode capacity`, checked against the used capacity in percent | +| `--critical` | critical threshold for `--mode capacity`, checked against the used capacity in percent | ### `--mode healthtest` @@ -52,7 +52,7 @@ tested yet. REGEX of `--include`, `--exclude` is matched against cluster name. This Checks vSAN capacity, including slack and resync. -Provides performance data. Uses --warning and --critical for free threshold +Provides performance data. `--warning` and `--critical` are checked against the used capacity in percent. ## Examples