From ef7f2e5c74dcc9ae736e5b0a3495dcef9b936f41 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Thu, 23 Apr 2026 14:21:14 +0200 Subject: [PATCH 1/5] feat(observability): add runner VM hostmetrics Grafana dashboard Adds a read-only Grafana dashboard (editable: false) for runner VM host-level metrics to be served via cos-configuration-k8s using the grafana-dashboard relation, which provisions it as an immutable filesystem dashboard in Grafana. The dashboard covers: - CPU utilisation by state and load averages - Memory usage by state - Disk I/O throughput and operations - Filesystem usage % by mount point - Network traffic, errors and drops Template variables: - github_job_id: filter by GitHub Actions workflow run job ID - instance: filter by runner hostname Metric names follow the OpenTelemetry hostmetrics receiver prometheus convention (e.g. system_cpu_time_seconds_total). The github_job_id label is expected to be set as a resource attribute by the otelcol pipeline collecting metrics from the runner VMs. 
Related: ISD-5152 --- grafana_dashboards/runner_vm_hostmetrics.json | 542 ++++++++++++++++++ 1 file changed, 542 insertions(+) create mode 100644 grafana_dashboards/runner_vm_hostmetrics.json diff --git a/grafana_dashboards/runner_vm_hostmetrics.json b/grafana_dashboards/runner_vm_hostmetrics.json new file mode 100644 index 00000000..c5faecf5 --- /dev/null +++ b/grafana_dashboards/runner_vm_hostmetrics.json @@ -0,0 +1,542 @@ +{ + "__inputs": [ + { + "name": "prometheusds", + "label": "Prometheus / Mimir", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "bargauge", + "name": "Bar gauge", + "version": "" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "description": "Host-level resource metrics (CPU, memory, disk, network, load) for GitHub Actions runner VMs. 
Filter by github_job_id to inspect a specific workflow run.", + "editable": false, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "title": "Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ] + }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, + "id": 1, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "title": "CPU Usage", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": "100 - (avg by (instance) (rate(system_cpu_time_seconds_total{state=\"idle\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval])) * 100)", + "legendFormat": "{{instance}}", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ] + }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, + "id": 2, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, 
+ "title": "Memory Usage", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": "100 * (1 - (system_memory_usage_bytes{state=\"free\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"} / sum without (state) (system_memory_usage_bytes{instance=~\"$instance\",github_job_id=~\"$github_job_id\"})))", + "legendFormat": "{{instance}}", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ] + }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "id": 3, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "title": "Filesystem Usage (/)", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": "100 * system_filesystem_usage_bytes{state=\"used\",mountpoint=\"/\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"} / (system_filesystem_usage_bytes{state=\"used\",mountpoint=\"/\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"} + system_filesystem_usage_bytes{state=\"free\",mountpoint=\"/\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"})", + "legendFormat": "{{instance}}", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 12, 
"y": 1 }, + "id": 4, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "title": "Network In", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": "sum by (instance) (rate(system_network_io_bytes_total{direction=\"receive\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval]))", + "legendFormat": "{{instance}}", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "id": 5, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "title": "Network Out", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": "sum by (instance) (rate(system_network_io_bytes_total{direction=\"transmit\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval]))", + "legendFormat": "{{instance}}", + "refId": "A" + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 101, + "title": "CPU", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 10, "gradientMode": "none", "showPoints": "never" }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "idle" }, 
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "system" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "user" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "iowait" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 10, + "options": { "tooltip": { "mode": "multi", "sort": "desc" }, "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } }, + "title": "CPU Time by State", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": "avg by (state) (rate(system_cpu_time_seconds_total{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval])) * 100", + "legendFormat": "{{state}}", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 2, "fillOpacity": 0, "showPoints": "never" }, + "unit": "short", + "min": 0 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 11, + "options": { "tooltip": { "mode": "multi", "sort": "desc" }, "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] } }, + "title": "Load Average", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": "system_cpu_load_average_1m{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}", + "legendFormat": "1m", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": 
"system_cpu_load_average_5m{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}", + "legendFormat": "5m", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": "system_cpu_load_average_15m{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}", + "legendFormat": "15m", + "refId": "C" + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "id": 102, + "title": "Memory", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 20, "gradientMode": "none", "showPoints": "never", "stacking": { "mode": "normal", "group": "A" } }, + "unit": "bytes", + "min": 0 + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "used" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "cached" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "buffered" }, "properties": [{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "free" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] } + ] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 15 }, + "id": 20, + "options": { "tooltip": { "mode": "multi", "sort": "desc" }, "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } }, + "title": "Memory Usage by State", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": "system_memory_usage_bytes{instance=~\"$instance\",github_job_id=~\"$github_job_id\",state=~\"used|cached|buffered|free\"}", + "legendFormat": "{{state}}", + "refId": "A" + } + ] + }, + { + "collapsed": 
false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, + "id": 103, + "title": "Disk I/O", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 10, "showPoints": "never" }, + "unit": "Bps", + "min": 0 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "id": 30, + "options": { "tooltip": { "mode": "multi", "sort": "desc" }, "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } }, + "title": "Disk Throughput", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": "rate(system_disk_io_bytes_total{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval])", + "legendFormat": "{{device}} {{direction}}", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 10, "showPoints": "never" }, + "unit": "iops", + "min": 0 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "id": 31, + "options": { "tooltip": { "mode": "multi", "sort": "desc" }, "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } }, + "title": "Disk Operations/s", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": "rate(system_disk_operations_total{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval])", + "legendFormat": "{{device}} {{direction}}", + "refId": "A" + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 }, + "id": 104, + "title": "Filesystem", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + 
"fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 90 } + ] + }, + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 33 }, + "id": 40, + "options": { "displayMode": "gradient", "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showUnfilled": true }, + "title": "Filesystem Usage %", + "type": "bargauge", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": "100 * system_filesystem_usage_bytes{state=\"used\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"} / (system_filesystem_usage_bytes{state=\"used\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"} + system_filesystem_usage_bytes{state=\"free\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"})", + "legendFormat": "{{mountpoint}} ({{device}})", + "refId": "A" + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 41 }, + "id": 105, + "title": "Network", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 10, "showPoints": "never" }, + "unit": "Bps", + "min": 0 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 42 }, + "id": 50, + "options": { "tooltip": { "mode": "multi", "sort": "desc" }, "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } }, + "title": "Network Traffic", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": 
"rate(system_network_io_bytes_total{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval])", + "legendFormat": "{{device}} {{direction}}", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "lineWidth": 1, "fillOpacity": 10, "showPoints": "never" }, + "unit": "pps", + "min": 0 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 42 }, + "id": 51, + "options": { "tooltip": { "mode": "multi", "sort": "desc" }, "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } }, + "title": "Network Errors & Drops", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": "rate(system_network_errors_total{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval])", + "legendFormat": "errors {{device}} {{direction}}", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "expr": "rate(system_network_dropped_total{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval])", + "legendFormat": "drops {{device}} {{direction}}", + "refId": "B" + } + ] + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": ["github-runner", "hostmetrics"], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "prometheusds", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + }, + { + "current": {}, + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "definition": "label_values(system_cpu_time_seconds_total, github_job_id)", + "hide": 0, + "includeAll": true, + "allValue": ".*", + "label": "Job ID", + "multi": false, + "name": "github_job_id", + "options": [], + "query": { + "query": 
"label_values(system_cpu_time_seconds_total, github_job_id)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "definition": "label_values(system_cpu_time_seconds_total{github_job_id=~\"$github_job_id\"}, instance)", + "hide": 0, + "includeAll": true, + "allValue": ".*", + "label": "Instance", + "multi": true, + "name": "instance", + "options": [], + "query": { + "query": "label_values(system_cpu_time_seconds_total{github_job_id=~\"$github_job_id\"}, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "GitHub Runner VM Hostmetrics", + "uid": "github-runner-vm-hostmetrics", + "version": 1 +} From c4dc6df5d084c9c1c7993143acc9dc76e60bd955 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Thu, 23 Apr 2026 14:31:32 +0200 Subject: [PATCH 2/5] docs: document observability layout and rename dashboard directory Rename grafana_dashboards/ to runner_grafana_dashboards/ to make the purpose explicit at the repo root level (runner VM host metrics, not charm workload metrics). 
Update README with: - Repository layout overview - Observability section explaining the cos-configuration-k8s delivery mechanism and the immutability guarantee - Table of conventions for where dashboards live and what grafana_dashboards_path value to use in Terraform --- README.md | 36 +++++++++++++++++-- .../runner_vm_hostmetrics.json | 0 2 files changed, 33 insertions(+), 3 deletions(-) rename {grafana_dashboards => runner_grafana_dashboards}/runner_vm_hostmetrics.json (100%) diff --git a/README.md b/README.md index 836514ba..3fb2aeec 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,39 @@ # GitHub runner operators - ![WIP](https://img.shields.io/badge/status-WIP-yellow) A monorepo containing charms to operate Self-Hosted GitHub Action Runners. -At the moment, it contains initial code for the `webhook-gateway` -application, that receives and forwards GitHub webhooks to an AMQP queue. +## Repository layout + +``` +charms/ + planner-operator/ # Juju charm: GitHub runner planner + cos_custom/ + grafana_dashboards/ # Grafana dashboards for the planner charm + # (served via cos-configuration-k8s, path: charms/planner-operator/cos_custom/grafana_dashboards) + webhook-gateway-operator/ # Juju charm: GitHub webhook gateway + +runner_grafana_dashboards/ # Grafana dashboards for runner VM host metrics + # (served via cos-configuration-k8s, path: runner_grafana_dashboards) +``` + +## Observability: Grafana dashboards + +Dashboards in this repo are delivered to Grafana through +[`cos-configuration-k8s`](https://charmhub.io/cos-configuration-k8s), which syncs +JSON files from this Git repository and provisions them via the `grafana-dashboard` +relation. Provisioned dashboards are **immutable** in Grafana regardless of user +role — they cannot be edited or deleted through the UI. 
+ +### Conventions + +| Directory | Purpose | `grafana_dashboards_path` config value | +|---|---|---| +| `charms/<charm-name>/cos_custom/grafana_dashboards/` | Dashboards for a specific charm's workload metrics | `charms/<charm-name>/cos_custom/grafana_dashboards` | +| `runner_grafana_dashboards/` | Dashboards for runner VM host-level metrics (CPU, memory, disk, network) | `runner_grafana_dashboards` | + +Dashboard JSON files must use `__inputs` to declare the datasource (type `prometheus`) and +set `"editable": false`. Metric names follow the +[OpenTelemetry hostmetrics receiver](https://opentelemetry.io/docs/collector/components/#receiver) +Prometheus naming convention (e.g. `system_cpu_time_seconds_total`). diff --git a/grafana_dashboards/runner_vm_hostmetrics.json b/runner_grafana_dashboards/runner_vm_hostmetrics.json similarity index 100% rename from grafana_dashboards/runner_vm_hostmetrics.json rename to runner_grafana_dashboards/runner_vm_hostmetrics.json From 44d577e330f820a1a0e09ed9fd502510cc4dec13 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Fri, 24 Apr 2026 10:50:29 +0200 Subject: [PATCH 3/5] fix: align dashboard labels with OTel config from github-runner-operator Replace github_job_id with github_job and instance with github_runner to match the actual attribute labels set by the pre-job OTel config (see canonical/github-runner-operator#781). Add github_repository and github_workflow template variables so the dashboard can be filtered the same way as the existing PS6 hostmetrics dashboard. 
--- .../runner_vm_hostmetrics.json | 911 +++++++++++++++--- 1 file changed, 757 insertions(+), 154 deletions(-) diff --git a/runner_grafana_dashboards/runner_vm_hostmetrics.json b/runner_grafana_dashboards/runner_vm_hostmetrics.json index c5faecf5..30499bb5 100644 --- a/runner_grafana_dashboards/runner_vm_hostmetrics.json +++ b/runner_grafana_dashboards/runner_vm_hostmetrics.json @@ -42,7 +42,7 @@ "version": "" } ], - "description": "Host-level resource metrics (CPU, memory, disk, network, load) for GitHub Actions runner VMs. Filter by github_job_id to inspect a specific workflow run.", + "description": "Host-level resource metrics (CPU, memory, disk, network, load) for GitHub Actions runner VMs. Filter by repository, workflow, job and runner to inspect a specific run.", "editable": false, "fiscalYearStartMonth": 0, "graphTooltip": 1, @@ -51,22 +51,41 @@ "panels": [ { "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, "id": 100, "title": "Overview", "type": "row" }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { + "mode": "thresholds" + }, "thresholds": { "mode": "absolute", "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 70 }, - { "color": "red", "value": 90 } + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } ] }, "unit": "percent", @@ -75,31 +94,66 @@ }, "overrides": [] }, - "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, "id": 1, - "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + 
"options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, "title": "CPU Usage", "type": "stat", "targets": [ { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": "100 - (avg by (instance) (rate(system_cpu_time_seconds_total{state=\"idle\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval])) * 100)", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "100 - (avg by (github_runner) (rate(system_cpu_time_seconds_total{state=\"idle\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval])) * 100)", "legendFormat": "{{instance}}", "refId": "A" } ] }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { + "mode": "thresholds" + }, "thresholds": { "mode": "absolute", "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 70 }, - { "color": "red", "value": 90 } + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } ] }, "unit": "percent", @@ -108,31 +162,66 @@ }, "overrides": [] }, - "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 1 + }, "id": 2, - "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + 
"fields": "", + "values": false + }, + "textMode": "auto" + }, "title": "Memory Usage", "type": "stat", "targets": [ { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": "100 * (1 - (system_memory_usage_bytes{state=\"free\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"} / sum without (state) (system_memory_usage_bytes{instance=~\"$instance\",github_job_id=~\"$github_job_id\"})))", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "100 * (1 - (system_memory_usage_bytes{state=\"free\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"} / sum without (state) (system_memory_usage_bytes{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"})))", "legendFormat": "{{instance}}", "refId": "A" } ] }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { + "mode": "thresholds" + }, "thresholds": { "mode": "absolute", "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 70 }, - { "color": "red", "value": 90 } + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } ] }, "unit": "percent", @@ -141,63 +230,152 @@ }, "overrides": [] }, - "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 1 + }, "id": 3, - "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": 
"", + "values": false + }, + "textMode": "auto" + }, "title": "Filesystem Usage (/)", "type": "stat", "targets": [ { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": "100 * system_filesystem_usage_bytes{state=\"used\",mountpoint=\"/\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"} / (system_filesystem_usage_bytes{state=\"used\",mountpoint=\"/\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"} + system_filesystem_usage_bytes{state=\"free\",mountpoint=\"/\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"})", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "100 * system_filesystem_usage_bytes{state=\"used\",mountpoint=\"/\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"} / (system_filesystem_usage_bytes{state=\"used\",mountpoint=\"/\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"} + system_filesystem_usage_bytes{state=\"free\",mountpoint=\"/\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"})", "legendFormat": "{{instance}}", "refId": "A" } ] }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, - "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, "unit": "Bps" }, "overrides": [] }, - "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, "id": 4, - "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": 
false }, "textMode": "auto" }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, "title": "Network In", "type": "stat", "targets": [ { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": "sum by (instance) (rate(system_network_io_bytes_total{direction=\"receive\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval]))", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum by (github_runner) (rate(system_network_io_bytes_total{direction=\"receive\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval]))", "legendFormat": "{{instance}}", "refId": "A" } ] }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, - "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, "unit": "Bps" }, "overrides": [] }, - "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, "id": 5, - "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, "title": "Network Out", "type": "stat", "targets": [ { - 
"datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": "sum by (instance) (rate(system_network_io_bytes_total{direction=\"transmit\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval]))", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "sum by (github_runner) (rate(system_network_io_bytes_total{direction=\"transmit\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval]))", "legendFormat": "{{instance}}", "refId": "A" } @@ -205,74 +383,201 @@ }, { "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, "id": 101, "title": "CPU", "type": "row" }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, "fieldConfig": { "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "lineWidth": 1, "fillOpacity": 10, "gradientMode": "none", "showPoints": "never" }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "gradientMode": "none", + "showPoints": "never" + }, "unit": "percent", "min": 0, "max": 100 }, "overrides": [ - { "matcher": { "id": "byName", "options": "idle" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "system" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "user" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "iowait" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] } + { + "matcher": { + "id": "byName", + "options": "idle" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "system" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "user" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "iowait" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } ] }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, "id": 10, - "options": { "tooltip": { "mode": "multi", "sort": "desc" }, "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "mean", + "max" + ] + } + }, "title": "CPU Time by State", "type": "timeseries", "targets": [ { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": "avg by (state) (rate(system_cpu_time_seconds_total{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval])) * 100", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "avg by (state) (rate(system_cpu_time_seconds_total{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval])) * 100", "legendFormat": "{{state}}", "refId": "A" } ] }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, "fieldConfig": { "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "lineWidth": 2, "fillOpacity": 0, "showPoints": "never" }, + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"lineWidth": 2, + "fillOpacity": 0, + "showPoints": "never" + }, "unit": "short", "min": 0 }, "overrides": [] }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, "id": 11, - "options": { "tooltip": { "mode": "multi", "sort": "desc" }, "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["lastNotNull"] } }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "lastNotNull" + ] + } + }, "title": "Load Average", "type": "timeseries", "targets": [ { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": "system_cpu_load_average_1m{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "system_cpu_load_average_1m{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}", "legendFormat": "1m", "refId": "A" }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": "system_cpu_load_average_5m{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "system_cpu_load_average_5m{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}", "legendFormat": "5m", "refId": "B" }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": "system_cpu_load_average_15m{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "system_cpu_load_average_15m{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}", "legendFormat": "15m", "refId": "C" } @@ -280,36 +585,132 @@ }, { "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 
}, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, "id": 102, "title": "Memory", "type": "row" }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, "fieldConfig": { "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "lineWidth": 1, "fillOpacity": 20, "gradientMode": "none", "showPoints": "never", "stacking": { "mode": "normal", "group": "A" } }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "lineWidth": 1, + "fillOpacity": 20, + "gradientMode": "none", + "showPoints": "never", + "stacking": { + "mode": "normal", + "group": "A" + } + }, "unit": "bytes", "min": 0 }, "overrides": [ - { "matcher": { "id": "byName", "options": "used" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "cached" }, "properties": [{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "buffered" }, "properties": [{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }] }, - { "matcher": { "id": "byName", "options": "free" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] } + { + "matcher": { + "id": "byName", + "options": "used" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "buffered" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + 
] + } ] }, - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 15 }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 15 + }, "id": 20, - "options": { "tooltip": { "mode": "multi", "sort": "desc" }, "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "mean", + "max" + ] + } + }, "title": "Memory Usage by State", "type": "timeseries", "targets": [ { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": "system_memory_usage_bytes{instance=~\"$instance\",github_job_id=~\"$github_job_id\",state=~\"used|cached|buffered|free\"}", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "system_memory_usage_bytes{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\",state=~\"used|cached|buffered|free\"}", "legendFormat": "{{state}}", "refId": "A" } @@ -317,56 +718,121 @@ }, { "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, "id": 103, "title": "Disk I/O", "type": "row" }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, "fieldConfig": { "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "lineWidth": 1, "fillOpacity": 10, "showPoints": "never" }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + }, "unit": "Bps", "min": 0 }, "overrides": [] }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, "id": 30, - "options": { "tooltip": { "mode": "multi", "sort": "desc" }, "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } }, + "options": { + 
"tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "mean", + "max" + ] + } + }, "title": "Disk Throughput", "type": "timeseries", "targets": [ { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": "rate(system_disk_io_bytes_total{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval])", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "rate(system_disk_io_bytes_total{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval])", "legendFormat": "{{device}} {{direction}}", "refId": "A" } ] }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, "fieldConfig": { "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "lineWidth": 1, "fillOpacity": 10, "showPoints": "never" }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + }, "unit": "iops", "min": 0 }, "overrides": [] }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, "id": 31, - "options": { "tooltip": { "mode": "multi", "sort": "desc" }, "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "mean", + "max" + ] + } + }, "title": "Disk Operations/s", "type": "timeseries", "targets": [ { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": "rate(system_disk_operations_total{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval])", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": 
"rate(system_disk_operations_total{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval])", "legendFormat": "{{device}} {{direction}}", "refId": "A" } @@ -374,22 +840,41 @@ }, { "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, "id": 104, "title": "Filesystem", "type": "row" }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, "fieldConfig": { "defaults": { - "color": { "mode": "thresholds" }, + "color": { + "mode": "thresholds" + }, "thresholds": { "mode": "absolute", "steps": [ - { "color": "green", "value": null }, - { "color": "yellow", "value": 70 }, - { "color": "red", "value": 90 } + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } ] }, "unit": "percent", @@ -398,15 +883,34 @@ }, "overrides": [] }, - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 33 }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 33 + }, "id": 40, - "options": { "displayMode": "gradient", "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showUnfilled": true }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true + }, "title": "Filesystem Usage %", "type": "bargauge", "targets": [ { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": "100 * system_filesystem_usage_bytes{state=\"used\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"} / (system_filesystem_usage_bytes{state=\"used\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"} + 
system_filesystem_usage_bytes{state=\"free\",instance=~\"$instance\",github_job_id=~\"$github_job_id\"})", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "100 * system_filesystem_usage_bytes{state=\"used\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"} / (system_filesystem_usage_bytes{state=\"used\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"} + system_filesystem_usage_bytes{state=\"free\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"})", "legendFormat": "{{mountpoint}} ({{device}})", "refId": "A" } @@ -414,62 +918,130 @@ }, { "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 41 }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 41 + }, "id": 105, "title": "Network", "type": "row" }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, "fieldConfig": { "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "lineWidth": 1, "fillOpacity": 10, "showPoints": "never" }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + }, "unit": "Bps", "min": 0 }, "overrides": [] }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 42 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, "id": 50, - "options": { "tooltip": { "mode": "multi", "sort": "desc" }, "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "mean", + "max" + ] + } + }, "title": "Network Traffic", "type": "timeseries", "targets": [ { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": 
"rate(system_network_io_bytes_total{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval])", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "rate(system_network_io_bytes_total{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval])", "legendFormat": "{{device}} {{direction}}", "refId": "A" } ] }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, "fieldConfig": { "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { "lineWidth": 1, "fillOpacity": 10, "showPoints": "never" }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + }, "unit": "pps", "min": 0 }, "overrides": [] }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 42 }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, "id": 51, - "options": { "tooltip": { "mode": "multi", "sort": "desc" }, "legend": { "displayMode": "list", "placement": "bottom", "calcs": ["mean", "max"] } }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "list", + "placement": "bottom", + "calcs": [ + "mean", + "max" + ] + } + }, "title": "Network Errors & Drops", "type": "timeseries", "targets": [ { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": "rate(system_network_errors_total{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval])", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "rate(system_network_errors_total{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval])", "legendFormat": "errors {{device}} {{direction}}", "refId": "A" }, { - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "expr": 
"rate(system_network_dropped_total{instance=~\"$instance\",github_job_id=~\"$github_job_id\"}[$__rate_interval])", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "expr": "rate(system_network_dropped_total{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval])", "legendFormat": "drops {{device}} {{direction}}", "refId": "B" } @@ -478,65 +1050,96 @@ ], "refresh": "30s", "schemaVersion": 38, - "tags": ["github-runner", "hostmetrics"], + "tags": [ + "github-runner", + "hostmetrics" + ], "templating": { "list": [ { - "current": {}, - "hide": 0, - "includeAll": false, - "label": "Datasource", - "multi": false, "name": "prometheusds", - "options": [], + "type": "datasource", + "pluginId": "prometheus", "query": "prometheus", + "label": "Prometheus", + "hide": 0, "refresh": 1, - "type": "datasource" + "includeAll": false, + "multi": false }, { - "current": {}, - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "definition": "label_values(system_cpu_time_seconds_total, github_job_id)", - "hide": 0, + "name": "github_repository", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "label": "Repository", + "query": "label_values(system_cpu_time_seconds_total, github_repository)", + "refresh": 2, + "sort": 1, "includeAll": true, - "allValue": ".*", - "label": "Job ID", "multi": false, - "name": "github_job_id", - "options": [], - "query": { - "query": "label_values(system_cpu_time_seconds_total, github_job_id)", - "refId": "StandardVariableQuery" + "allValue": ".*", + "hide": 0 + }, + { + "name": "github_workflow", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" }, + "label": "Workflow", + "query": "label_values(system_cpu_time_seconds_total{github_repository=~\"$github_repository\"}, github_workflow)", "refresh": 2, "sort": 1, - "type": "query" + "includeAll": true, + "multi": 
false, + "allValue": ".*", + "hide": 0 }, { - "current": {}, - "datasource": { "type": "prometheus", "uid": "${prometheusds}" }, - "definition": "label_values(system_cpu_time_seconds_total{github_job_id=~\"$github_job_id\"}, instance)", - "hide": 0, + "name": "github_job", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" + }, + "label": "Job", + "query": "label_values(system_cpu_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\"}, github_job)", + "refresh": 2, + "sort": 1, "includeAll": true, + "multi": false, "allValue": ".*", - "label": "Instance", - "multi": true, - "name": "instance", - "options": [], - "query": { - "query": "label_values(system_cpu_time_seconds_total{github_job_id=~\"$github_job_id\"}, instance)", - "refId": "StandardVariableQuery" + "hide": 0 + }, + { + "name": "github_runner", + "type": "query", + "datasource": { + "type": "prometheus", + "uid": "${prometheusds}" }, + "label": "Runner", + "query": "label_values(system_cpu_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\"}, github_runner)", "refresh": 2, "sort": 1, - "type": "query" + "includeAll": true, + "multi": true, + "allValue": ".*", + "hide": 0 } ] }, - "time": { "from": "now-1h", "to": "now" }, + "time": { + "from": "now-1h", + "to": "now" + }, "timepicker": {}, "timezone": "browser", "title": "GitHub Runner VM Hostmetrics", "uid": "github-runner-vm-hostmetrics", "version": 1 -} +} \ No newline at end of file From ed3412f15600a24fa8c5216fa3462e5a1ce49b18 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Mon, 27 Apr 2026 12:10:08 +0200 Subject: [PATCH 4/5] refactor(observability): mirror upstream OTel hostmetrics dashboard layout Restructure the runner VM hostmetrics dashboard to follow the upstream OpenTelemetry hostmetrics dashboard (Grafana gnetId 24638): Overview row of CPU/Memory/Root FS gauges plus 
Load/Cores/Total Memory stats, then CPU, Memory, Disk I/O, Filesystem and Network sections with read/write and rx/tx split axes. Make every templating variable support "All" via includeAll, multi-select and allValue ".*", and switch all label matchers to =~ so regex interpolation works. --- .../runner_vm_hostmetrics.json | 1411 ++++++++--------- 1 file changed, 683 insertions(+), 728 deletions(-) diff --git a/runner_grafana_dashboards/runner_vm_hostmetrics.json b/runner_grafana_dashboards/runner_vm_hostmetrics.json index 30499bb5..79f7c1a3 100644 --- a/runner_grafana_dashboards/runner_vm_hostmetrics.json +++ b/runner_grafana_dashboards/runner_vm_hostmetrics.json @@ -29,6 +29,12 @@ "name": "Bar gauge", "version": "" }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, { "type": "panel", "id": "stat", @@ -42,7 +48,23 @@ "version": "" } ], - "description": "Host-level resource metrics (CPU, memory, disk, network, load) for GitHub Actions runner VMs. Filter by repository, workflow, job and runner to inspect a specific run.", + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Host-level resource metrics (CPU, memory, disk, filesystem, network) for GitHub Actions runner VMs, based on the OpenTelemetry hostmetrics receiver. 
Filter by repository, workflow, job and runner; select \"All\" to aggregate across the matching set.", "editable": false, "fiscalYearStartMonth": 0, "graphTooltip": 1, @@ -51,999 +73,943 @@ "panels": [ { "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, "id": 100, + "panels": [], "title": "Overview", "type": "row" }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, + "color": {"mode": "thresholds"}, + "mappings": [], + "max": 1, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 70 - }, - { - "color": "red", - "value": 90 - } + {"color": "green", "value": null}, + {"color": "#EAB839", "value": 0.6}, + {"color": "red", "value": 0.8} ] }, - "unit": "percent", - "min": 0, - "max": 100 + "unit": "percentunit" }, "overrides": [] }, - "gridPos": { - "h": 4, - "w": 4, - "x": 0, - "y": 1 - }, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, "id": 1, "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "center", "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "showThresholdLabels": false, + "showThresholdMarkers": true }, - "title": "CPU Usage", - "type": "stat", + "title": "CPU", + "type": "gauge", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": "100 - (avg by (github_runner) (rate(system_cpu_time_seconds_total{state=\"idle\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval])) * 100)", - "legendFormat": "{{instance}}", + 
"datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "1 - avg(rate(system_cpu_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",state=\"idle\"}[$__rate_interval]))", + "legendFormat": "__auto", + "range": true, "refId": "A" } ] }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, + "color": {"mode": "thresholds"}, + "mappings": [], + "max": 1, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 70 - }, - { - "color": "red", - "value": 90 - } + {"color": "green", "value": null}, + {"color": "#EAB839", "value": 0.6}, + {"color": "red", "value": 0.8} ] }, - "unit": "percent", - "min": 0, - "max": 100 + "unit": "percentunit" }, "overrides": [] }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 1 - }, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, "id": 2, "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "center", "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "showThresholdLabels": false, + "showThresholdMarkers": true }, - "title": "Memory Usage", - "type": "stat", + "title": "Memory", + "type": "gauge", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": "100 * (1 - (system_memory_usage_bytes{state=\"free\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"} / sum without (state) 
(system_memory_usage_bytes{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"})))", - "legendFormat": "{{instance}}", + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "1 - (sum(system_memory_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",state=\"free\"}) / sum(system_memory_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}))", + "legendFormat": "__auto", + "range": true, "refId": "A" } ] }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, + "color": {"mode": "thresholds"}, + "mappings": [], + "max": 1, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 70 - }, - { - "color": "red", - "value": 90 - } + {"color": "green", "value": null}, + {"color": "#EAB839", "value": 0.7}, + {"color": "red", "value": 0.9} ] }, - "unit": "percent", - "min": 0, - "max": 100 + "unit": "percentunit" }, "overrides": [] }, - "gridPos": { - "h": 4, - "w": 4, - "x": 8, - "y": 1 - }, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, "id": 3, "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "center", "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "showThresholdLabels": false, + "showThresholdMarkers": true }, - "title": "Filesystem Usage (/)", - "type": "stat", + "title": "Root FS", + "type": "gauge", "targets": [ { - "datasource": { - 
"type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": "100 * system_filesystem_usage_bytes{state=\"used\",mountpoint=\"/\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"} / (system_filesystem_usage_bytes{state=\"used\",mountpoint=\"/\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"} + system_filesystem_usage_bytes{state=\"free\",mountpoint=\"/\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"})", - "legendFormat": "{{instance}}", + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum(system_filesystem_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",mountpoint=\"/\",state=\"used\"}) / sum(system_filesystem_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",mountpoint=\"/\"})", + "legendFormat": "__auto", + "range": true, "refId": "A" } ] }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "blue", - "value": null - } - ] - }, - "unit": "Bps" + "color": {"mode": "palette-classic"}, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "short" }, "overrides": [] }, - "gridPos": { - "h": 4, - "w": 6, - "x": 12, - "y": 1 - }, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, "id": 4, "options": { "colorMode": "value", "graphMode": "area", - "justifyMode": "center", + "justifyMode": "auto", "orientation": "auto", - "reduceOptions": { - "calcs": [ - 
"lastNotNull" - ], - "fields": "", - "values": false - }, + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto" }, - "title": "Network In", + "title": "Load 1m", "type": "stat", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": "sum by (github_runner) (rate(system_network_io_bytes_total{direction=\"receive\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval]))", - "legendFormat": "{{instance}}", + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "avg(system_cpu_load_average_1m{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"})", + "legendFormat": "__auto", + "range": true, "refId": "A" } ] }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "blue", - "value": null - } - ] - }, - "unit": "Bps" + "color": {"mode": "palette-classic"}, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "short" }, "overrides": [] }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, "id": 5, "options": { "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", + "graphMode": "none", + "justifyMode": "auto", "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "textMode": "auto" + }, + "title": "CPU Cores", + "type": "stat", + "targets": [ + { + "datasource": {"type": "prometheus", "uid": 
"${prometheusds}"}, + "editorMode": "code", + "expr": "count(count by (cpu) (system_cpu_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ] + }, + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "bytes" }, + "overrides": [] + }, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 1}, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto" }, - "title": "Network Out", + "title": "Total Memory", "type": "stat", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": "sum by (github_runner) (rate(system_network_io_bytes_total{direction=\"transmit\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval]))", - "legendFormat": "{{instance}}", + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum(system_memory_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"})", + "legendFormat": "__auto", + "range": true, "refId": "A" } ] }, { "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 5 - }, - "id": 101, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "id": 110, + "panels": [], "title": "CPU", "type": "row" }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, 
"fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, + "color": {"mode": "palette-classic"}, "custom": { - "lineWidth": 1, - "fillOpacity": 10, + "drawStyle": "line", + "fillOpacity": 20, "gradientMode": "none", - "showPoints": "never" + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"} }, - "unit": "percent", + "mappings": [], + "max": 1, "min": 0, - "max": 100 + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "percentunit" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "idle" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "system" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "user" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "iowait" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "orange", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 6 + "overrides": [] }, - "id": 10, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 6}, + "id": 11, "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" - }, - "legend": { - "displayMode": "list", - "placement": "bottom", - "calcs": [ - "mean", - "max" - ] - } + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} }, - "title": "CPU Time by State", + "title": "CPU Utilization", "type": "timeseries", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": "avg by 
(state) (rate(system_cpu_time_seconds_total{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval])) * 100", - "legendFormat": "{{state}}", + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "1 - avg(rate(system_cpu_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",state=\"idle\"}[$__rate_interval]))", + "legendFormat": "CPU Utilization", + "range": true, "refId": "A" } ] }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, + "color": {"mode": "palette-classic"}, "custom": { + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "lineInterpolation": "smooth", "lineWidth": 2, - "fillOpacity": 0, - "showPoints": "never" + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"} }, - "unit": "short", - "min": 0 + "mappings": [], + "min": 0, + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "short" }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 6 + "overrides": [ + { + "matcher": {"id": "byName", "options": "cores"}, + "properties": [ + {"id": "color", "value": {"fixedColor": "red", "mode": "fixed"}}, + {"id": "custom.lineStyle", "value": {"dash": [10, 10], "fill": "dash"}} + ] + } + ] }, - "id": 11, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 6}, + "id": 12, "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" - }, - "legend": { - "displayMode": "list", - "placement": "bottom", - "calcs": [ - "lastNotNull" - ] - } + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + 
"tooltip": {"mode": "multi", "sort": "desc"} }, - "title": "Load Average", + "title": "System Load", "type": "timeseries", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": "system_cpu_load_average_1m{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}", + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "system_cpu_load_average_1m{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}", "legendFormat": "1m", + "range": true, "refId": "A" }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": "system_cpu_load_average_5m{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}", + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "system_cpu_load_average_5m{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}", "legendFormat": "5m", + "range": true, "refId": "B" }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": "system_cpu_load_average_15m{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}", + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "system_cpu_load_average_15m{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}", "legendFormat": "15m", + "range": true, "refId": "C" + }, + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "count(count by (cpu) 
(system_cpu_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}))", + "legendFormat": "cores", + "range": true, + "refId": "D" } ] }, { "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 14 - }, - "id": 102, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 14}, + "id": 120, + "panels": [], "title": "Memory", "type": "row" }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, + "color": {"mode": "palette-classic"}, "custom": { - "lineWidth": 1, - "fillOpacity": 20, + "drawStyle": "line", + "fillOpacity": 50, "gradientMode": "none", + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, "showPoints": "never", - "stacking": { - "mode": "normal", - "group": "A" - } + "spanNulls": false, + "stacking": {"group": "A", "mode": "normal"} }, - "unit": "bytes", - "min": 0 + "mappings": [], + "min": 0, + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "bytes" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "used" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "buffered" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "purple", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 15 + 
"overrides": [] }, - "id": 20, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 15}, + "id": 21, "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" + "legend": {"calcs": ["mean", "lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "title": "Memory Usage", + "type": "timeseries", + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum(system_memory_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",state=\"used\"})", + "legendFormat": "used", + "range": true, + "refId": "A" }, - "legend": { - "displayMode": "list", - "placement": "bottom", - "calcs": [ - "mean", - "max" - ] + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum(system_memory_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",state=\"cached\"})", + "legendFormat": "cached", + "range": true, + "refId": "B" + }, + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum(system_memory_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",state=\"buffered\"})", + "legendFormat": "buffered", + "range": true, + "refId": "C" + }, + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum(system_memory_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",state=\"free\"})", + "legendFormat": "free", + "range": true, + "refId": "D" } + ] + }, + { + "datasource": {"type": "prometheus", "uid": 
"${prometheusds}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"} + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "percentunit" + }, + "overrides": [] }, - "title": "Memory Usage by State", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 15}, + "id": 22, + "options": { + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "title": "Memory Utilization", "type": "timeseries", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": "system_memory_usage_bytes{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\",state=~\"used|cached|buffered|free\"}", - "legendFormat": "{{state}}", + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "1 - (sum(system_memory_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",state=\"free\"}) / sum(system_memory_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}))", + "legendFormat": "Memory Utilization", + "range": true, "refId": "A" } ] }, { "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 23 - }, - "id": 103, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 23}, + "id": 130, + "panels": [], "title": "Disk I/O", "type": "row" }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + 
"datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, + "color": {"mode": "palette-classic"}, "custom": { + "axisCenteredZero": true, + "axisLabel": "read (-) / write (+)", + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "lineInterpolation": "smooth", "lineWidth": 1, - "fillOpacity": 10, - "showPoints": "never" + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"} }, - "unit": "Bps", - "min": 0 + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "Bps" }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 24 + "overrides": [ + { + "matcher": {"id": "byRegexp", "options": "/.*read/"}, + "properties": [{"id": "custom.transform", "value": "negative-Y"}] + } + ] }, - "id": 30, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 24}, + "id": 31, "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "title": "Disk I/O Throughput", + "type": "timeseries", + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum by (device) (rate(system_disk_io_bytes_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",direction=\"read\"}[$__rate_interval]))", + "legendFormat": "{{device}} read", + "range": true, + "refId": "A" }, - "legend": { - "displayMode": "list", - "placement": "bottom", - "calcs": [ - "mean", - "max" - ] + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum by (device) 
(rate(system_disk_io_bytes_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",direction=\"write\"}[$__rate_interval]))", + "legendFormat": "{{device}} write", + "range": true, + "refId": "B" } + ] + }, + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": true, + "axisLabel": "read (-) / write (+)", + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"} + }, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "iops" + }, + "overrides": [ + { + "matcher": {"id": "byRegexp", "options": "/.*read/"}, + "properties": [{"id": "custom.transform", "value": "negative-Y"}] + } + ] + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 24}, + "id": 32, + "options": { + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} }, - "title": "Disk Throughput", + "title": "Disk IOPS", "type": "timeseries", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": "rate(system_disk_io_bytes_total{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval])", - "legendFormat": "{{device}} {{direction}}", + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum by (device) (rate(system_disk_operations_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",direction=\"read\"}[$__rate_interval]))", + 
"legendFormat": "{{device}} read", + "range": true, "refId": "A" + }, + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum by (device) (rate(system_disk_operations_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",direction=\"write\"}[$__rate_interval]))", + "legendFormat": "{{device}} write", + "range": true, + "refId": "B" } ] }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, + "color": {"mode": "palette-classic"}, "custom": { + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "lineInterpolation": "smooth", "lineWidth": 1, - "fillOpacity": 10, - "showPoints": "never" + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"} }, - "unit": "iops", - "min": 0 + "mappings": [], + "max": 1, + "min": 0, + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "percentunit" }, "overrides": [] }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 24 - }, - "id": 31, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 32}, + "id": 33, "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" - }, - "legend": { - "displayMode": "list", - "placement": "bottom", - "calcs": [ - "mean", - "max" - ] - } + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} }, - "title": "Disk Operations/s", + "title": "Disk Busy %", "type": "timeseries", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": 
"rate(system_disk_operations_total{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval])", - "legendFormat": "{{device}} {{direction}}", + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum by (device) (rate(system_disk_io_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}[$__rate_interval]))", + "legendFormat": "{{device}}", + "range": true, "refId": "A" } ] }, { "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 32 - }, - "id": 104, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 40}, + "id": 140, + "panels": [], "title": "Filesystem", "type": "row" }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, + "color": {"mode": "thresholds"}, + "mappings": [], + "max": 1, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 70 - }, - { - "color": "red", - "value": 90 - } + {"color": "green", "value": null}, + {"color": "#EAB839", "value": 0.7}, + {"color": "red", "value": 0.9} ] }, - "unit": "percent", - "min": 0, - "max": 100 + "unit": "percentunit" }, "overrides": [] }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 33 - }, - "id": 40, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 41}, + "id": 41, "options": { - "displayMode": "gradient", + "displayMode": "lcd", "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showUnfilled": true + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "showUnfilled": true, + "valueMode": "color" }, - "title": "Filesystem Usage %", + 
"title": "Filesystem Utilization", "type": "bargauge", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum by (mountpoint) (system_filesystem_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",state=\"used\"}) / sum by (mountpoint) (system_filesystem_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",state=~\"used|free\"})", + "legendFormat": "{{mountpoint}}", + "range": true, + "refId": "A" + } + ] + }, + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "drawStyle": "line", + "fillOpacity": 50, + "gradientMode": "none", + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"} }, - "expr": "100 * system_filesystem_usage_bytes{state=\"used\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"} / (system_filesystem_usage_bytes{state=\"used\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"} + system_filesystem_usage_bytes{state=\"free\",github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"})", - "legendFormat": "{{mountpoint}} ({{device}})", + "mappings": [], + "min": 0, + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 41}, + "id": 42, + "options": { + "legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom", 
"showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "title": "Filesystem Usage", + "type": "timeseries", + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum by (mountpoint) (system_filesystem_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",state=\"used\"})", + "legendFormat": "{{mountpoint}} used", + "range": true, "refId": "A" + }, + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum by (mountpoint) (system_filesystem_usage_bytes{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",state=\"free\"})", + "legendFormat": "{{mountpoint}} free", + "range": true, + "refId": "B" } ] }, { "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 41 - }, - "id": 105, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 49}, + "id": 150, + "panels": [], "title": "Network", "type": "row" }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, + "color": {"mode": "palette-classic"}, "custom": { + "axisCenteredZero": true, + "axisLabel": "rx (-) / tx (+)", + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "lineInterpolation": "smooth", "lineWidth": 1, - "fillOpacity": 10, - "showPoints": "never" + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"} }, - "unit": "Bps", - "min": 0 + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "bps" }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 42 + "overrides": [ 
+ { + "matcher": {"id": "byRegexp", "options": "/.*receive/"}, + "properties": [{"id": "custom.transform", "value": "negative-Y"}] + } + ] }, - "id": 50, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 50}, + "id": 51, "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "title": "Network Throughput", + "type": "timeseries", + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum by (device) (rate(system_network_io_bytes_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",direction=\"receive\"}[$__rate_interval])) * 8", + "legendFormat": "{{device}} receive", + "range": true, + "refId": "A" }, - "legend": { - "displayMode": "list", - "placement": "bottom", - "calcs": [ - "mean", - "max" - ] + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum by (device) (rate(system_network_io_bytes_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",direction=\"transmit\"}[$__rate_interval])) * 8", + "legendFormat": "{{device}} transmit", + "range": true, + "refId": "B" } + ] + }, + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": true, + "axisLabel": "rx (-) / tx (+)", + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"} + }, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": 
"green", "value": null}]}, + "unit": "pps" + }, + "overrides": [ + { + "matcher": {"id": "byRegexp", "options": "/.*receive/"}, + "properties": [{"id": "custom.transform", "value": "negative-Y"}] + } + ] }, - "title": "Network Traffic", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 50}, + "id": 52, + "options": { + "legend": {"calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "title": "Network Packets", "type": "timeseries", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": "rate(system_network_io_bytes_total{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval])", - "legendFormat": "{{device}} {{direction}}", + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum by (device) (rate(system_network_packets_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",direction=\"receive\"}[$__rate_interval]))", + "legendFormat": "{{device}} receive", + "range": true, "refId": "A" + }, + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum by (device) (rate(system_network_packets_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\",direction=\"transmit\"}[$__rate_interval]))", + "legendFormat": "{{device}} transmit", + "range": true, + "refId": "B" } ] }, { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, + "color": {"mode": "palette-classic"}, "custom": { + "drawStyle": "line", + "fillOpacity": 20, + 
"gradientMode": "none", + "lineInterpolation": "smooth", "lineWidth": 1, - "fillOpacity": 10, - "showPoints": "never" + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"} }, - "unit": "pps", - "min": 0 + "mappings": [], + "min": 0, + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "short" }, "overrides": [] }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 42 - }, - "id": 51, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 58}, + "id": 53, "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" - }, - "legend": { - "displayMode": "list", - "placement": "bottom", - "calcs": [ - "mean", - "max" - ] - } + "legend": {"calcs": ["sum"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} }, - "title": "Network Errors & Drops", + "title": "Network Errors", "type": "timeseries", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": "rate(system_network_errors_total{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval])", - "legendFormat": "errors {{device}} {{direction}}", + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum by (device, direction) (rate(system_network_errors_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}[$__rate_interval]))", + "legendFormat": "{{device}} {{direction}} errors", + "range": true, "refId": "A" + } + ] + }, + { + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + 
"showPoints": "never", + "spanNulls": false, + "stacking": {"group": "A", "mode": "none"} + }, + "mappings": [], + "min": 0, + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "short" }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 58}, + "id": 54, + "options": { + "legend": {"calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "title": "TCP Connections", + "type": "timeseries", + "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, - "expr": "rate(system_network_dropped_total{github_runner=~\"$github_runner\",github_job=~\"$github_job\",github_repository=~\"$github_repository\"}[$__rate_interval])", - "legendFormat": "drops {{device}} {{direction}}", - "refId": "B" + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, + "editorMode": "code", + "expr": "sum by (state) (system_network_connections{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"})", + "legendFormat": "{{state}}", + "range": true, + "refId": "A" } ] } @@ -1052,7 +1018,8 @@ "schemaVersion": 38, "tags": [ "github-runner", - "hostmetrics" + "hostmetrics", + "opentelemetry" ], "templating": { "list": [ @@ -1070,60 +1037,48 @@ { "name": "github_repository", "type": "query", - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "label": "Repository", - "query": "label_values(system_cpu_time_seconds_total, github_repository)", + "query": "label_values(system_cpu_load_average_1m, github_repository)", "refresh": 2, "sort": 1, "includeAll": true, - "multi": false, + "multi": true, "allValue": ".*", "hide": 0 }, { "name": "github_workflow", "type": "query", - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + 
"datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "label": "Workflow", - "query": "label_values(system_cpu_time_seconds_total{github_repository=~\"$github_repository\"}, github_workflow)", + "query": "label_values(system_cpu_load_average_1m{github_repository=~\"$github_repository\"}, github_workflow)", "refresh": 2, "sort": 1, "includeAll": true, - "multi": false, + "multi": true, "allValue": ".*", "hide": 0 }, { "name": "github_job", "type": "query", - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "label": "Job", - "query": "label_values(system_cpu_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\"}, github_job)", + "query": "label_values(system_cpu_load_average_1m{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\"}, github_job)", "refresh": 2, "sort": 1, "includeAll": true, - "multi": false, + "multi": true, "allValue": ".*", "hide": 0 }, { "name": "github_runner", "type": "query", - "datasource": { - "type": "prometheus", - "uid": "${prometheusds}" - }, + "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "label": "Runner", - "query": "label_values(system_cpu_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\"}, github_runner)", + "query": "label_values(system_cpu_load_average_1m{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\"}, github_runner)", "refresh": 2, "sort": 1, "includeAll": true, @@ -1142,4 +1097,4 @@ "title": "GitHub Runner VM Hostmetrics", "uid": "github-runner-vm-hostmetrics", "version": 1 -} \ No newline at end of file +} From 9b4111d66792e8c701cd4e9010c0fb121e8c46a7 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Mon, 27 Apr 2026 13:19:55 +0200 Subject: [PATCH 5/5] fix(observability): correct multi-runner 
aggregations in hostmetrics dashboard When the runner variable resolves to multiple series (multi-select or "All"), several panels previously produced misleading values: - CPU Cores stat / System Load "cores" reference: count(count by (cpu) ...) collapses cpu indexes across runners, returning the max-cores-on-any-host rather than fleet total. Group by github_runner so cpu indexes stay distinct, then expose total cores in the stat panel and per-runner cores on the load panel (so the reference aligns with the averaged load lines). - System Load 1m/5m/15m: bare metric returns one series per runner with identical legends ("1m"/"5m"/"15m"), making the chart unreadable. Wrap in avg() to get one fleet-average line per period. - Disk Busy %: sum by (device) of fractional busy time can exceed 1 with multiple runners and gets silently clamped by max:1. Switch to avg by (device) so the value stays a meaningful 0-1 fleet average. Also soften the README guidance on editable: false. cos-configuration-k8s provisions dashboards from the filesystem, which makes them read-only in Grafana regardless of the flag, so the explicit "must" requirement was contradicted by existing dashboards in charms/planner-operator/. --- README.md | 6 ++++-- .../runner_vm_hostmetrics.json | 20 +++++++++---------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 3fb2aeec..8ed78118 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,9 @@ role — they cannot be edited or deleted through the UI. | `charms//cos_custom/grafana_dashboards/` | Dashboards for a specific charm's workload metrics | `charms//cos_custom/grafana_dashboards` | | `runner_grafana_dashboards/` | Dashboards for runner VM host-level metrics (CPU, memory, disk, network) | `runner_grafana_dashboards` | -Dashboard JSON files must use `__inputs` to declare the datasource (type `prometheus`) and -set `"editable": false`. 
Metric names follow the +Dashboard JSON files should use `__inputs` to declare the datasource (type `prometheus`). +Setting `"editable": false` is recommended for clarity, but is not strictly required: +dashboards delivered through `cos-configuration-k8s` are filesystem-provisioned and +therefore read-only in Grafana regardless of the JSON flag. Metric names follow the [OpenTelemetry hostmetrics receiver](https://opentelemetry.io/docs/collector/components/#receiver) Prometheus naming convention (e.g. `system_cpu_time_seconds_total`). diff --git a/runner_grafana_dashboards/runner_vm_hostmetrics.json b/runner_grafana_dashboards/runner_vm_hostmetrics.json index 79f7c1a3..0be5da9c 100644 --- a/runner_grafana_dashboards/runner_vm_hostmetrics.json +++ b/runner_grafana_dashboards/runner_vm_hostmetrics.json @@ -263,7 +263,7 @@ { "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "editorMode": "code", - "expr": "count(count by (cpu) (system_cpu_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}))", + "expr": "count(count by (cpu, github_runner) (system_cpu_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}))", "legendFormat": "__auto", "range": true, "refId": "A" @@ -398,32 +398,32 @@ { "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "editorMode": "code", - "expr": "system_cpu_load_average_1m{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}", - "legendFormat": "1m", + "expr": "avg(system_cpu_load_average_1m{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"})", + "legendFormat": "1m (avg)", "range": true, "refId": "A" }, { "datasource": {"type": 
"prometheus", "uid": "${prometheusds}"}, "editorMode": "code", - "expr": "system_cpu_load_average_5m{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}", - "legendFormat": "5m", + "expr": "avg(system_cpu_load_average_5m{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"})", + "legendFormat": "5m (avg)", "range": true, "refId": "B" }, { "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "editorMode": "code", - "expr": "system_cpu_load_average_15m{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}", - "legendFormat": "15m", + "expr": "avg(system_cpu_load_average_15m{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"})", + "legendFormat": "15m (avg)", "range": true, "refId": "C" }, { "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "editorMode": "code", - "expr": "count(count by (cpu) (system_cpu_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}))", - "legendFormat": "cores", + "expr": "count(count by (cpu, github_runner) (system_cpu_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"})) / count(count by (github_runner) (system_cpu_load_average_1m{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}))", + "legendFormat": "cores (per runner)", "range": true, "refId": "D" } @@ -702,7 +702,7 @@ { "datasource": {"type": "prometheus", "uid": "${prometheusds}"}, "editorMode": 
"code", - "expr": "sum by (device) (rate(system_disk_io_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}[$__rate_interval]))", + "expr": "avg by (device) (rate(system_disk_io_time_seconds_total{github_repository=~\"$github_repository\",github_workflow=~\"$github_workflow\",github_job=~\"$github_job\",github_runner=~\"$github_runner\"}[$__rate_interval]))", "legendFormat": "{{device}}", "range": true, "refId": "A"