From 1663a491f70ee8db6bbdd85d88996ea7414c6dde Mon Sep 17 00:00:00 2001 From: TerrifiedBug Date: Thu, 5 Mar 2026 17:48:11 +0000 Subject: [PATCH] fix: correct CPU usage calculation to account for idle time The CPU graph was pinned at 100% because host_cpu_seconds_total from Vector is a per-core, per-mode counter. Summing all modes (including idle) across all cores meant the delta always exceeded wall-clock time, so (delta/dt)*100 was always >100% and got clamped. Fix: track idle CPU seconds separately and compute utilization as (total - idle) / total * 100, which is core-count independent and gives accurate whole-server CPU utilization. Changes across the full stack: - Agent scraper: filter by mode label, sum idle+iowait separately - Agent structs/heartbeat: add CpuSecondsIdle field - Server heartbeat route: accept and store cpuSecondsIdle - Prisma schema + migration: add cpuSecondsIdle column - Fleet router: return new field - Frontend chart: new formula using idle delta --- agent/internal/agent/heartbeat.go | 1 + agent/internal/client/client.go | 1 + agent/internal/metrics/scraper.go | 5 +++++ .../migration.sql | 2 ++ prisma/schema.prisma | 1 + src/app/api/agent/heartbeat/route.ts | 2 ++ src/components/fleet/node-metrics-charts.tsx | 13 +++++-------- src/server/routers/fleet.ts | 1 + 8 files changed, 18 insertions(+), 8 deletions(-) create mode 100644 prisma/migrations/20260305100000_add_cpu_seconds_idle/migration.sql diff --git a/agent/internal/agent/heartbeat.go b/agent/internal/agent/heartbeat.go index fc9a4e0..088172e 100644 --- a/agent/internal/agent/heartbeat.go +++ b/agent/internal/agent/heartbeat.go @@ -60,6 +60,7 @@ func buildHeartbeat(sup *supervisor.Supervisor, vectorVersion string, deployment MemoryUsedBytes: sr.Host.MemoryUsedBytes, MemoryFreeBytes: sr.Host.MemoryFreeBytes, CpuSecondsTotal: sr.Host.CpuSecondsTotal, + CpuSecondsIdle: sr.Host.CpuSecondsIdle, LoadAvg1: sr.Host.LoadAvg1, LoadAvg5: sr.Host.LoadAvg5, LoadAvg15: sr.Host.LoadAvg15, diff --git a/agent/internal/client/client.go b/agent/internal/client/client.go index 9707588..bdb0634 100644 --- a/agent/internal/client/client.go +++ b/agent/internal/client/client.go @@ -172,6 +172,7 @@ type HostMetrics struct { MemoryUsedBytes int64 `json:"memoryUsedBytes"` MemoryFreeBytes int64 `json:"memoryFreeBytes"` CpuSecondsTotal float64 `json:"cpuSecondsTotal"` + CpuSecondsIdle float64 `json:"cpuSecondsIdle"` LoadAvg1 float64 `json:"loadAvg1"` LoadAvg5 float64 `json:"loadAvg5"` LoadAvg15 float64 `json:"loadAvg15"` diff --git a/agent/internal/metrics/scraper.go b/agent/internal/metrics/scraper.go index b268c9d..45351fb 100644 --- a/agent/internal/metrics/scraper.go +++ b/agent/internal/metrics/scraper.go @@ -37,6 +37,7 @@ type HostMetrics struct { MemoryUsedBytes int64 MemoryFreeBytes int64 CpuSecondsTotal float64 + CpuSecondsIdle float64 LoadAvg1 float64 LoadAvg5 float64 LoadAvg15 float64 @@ -145,6 +146,10 @@ func ScrapePrometheus(metricsPort int) ScrapeResult { sr.Host.MemoryFreeBytes += int64(value) case "host_cpu_seconds_total": sr.Host.CpuSecondsTotal += value + mode := labels["mode"] + if mode == "idle" || mode == "iowait" { + sr.Host.CpuSecondsIdle += value + } case "host_load1": sr.Host.LoadAvg1 += value case "host_load5": diff --git a/prisma/migrations/20260305100000_add_cpu_seconds_idle/migration.sql b/prisma/migrations/20260305100000_add_cpu_seconds_idle/migration.sql new file mode 100644 index 0000000..4a03aab --- /dev/null +++ b/prisma/migrations/20260305100000_add_cpu_seconds_idle/migration.sql @@ -0,0 +1,2 @@ +-- AlterTable +ALTER TABLE "NodeMetric" ADD COLUMN "cpuSecondsIdle" DOUBLE PRECISION NOT NULL DEFAULT 0; diff --git a/prisma/schema.prisma b/prisma/schema.prisma index 3e68e44..6af5408 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -118,6 +118,7 @@ model NodeMetric { memoryUsedBytes BigInt @default(0) memoryFreeBytes BigInt @default(0) cpuSecondsTotal Float @default(0) + cpuSecondsIdle Float @default(0) loadAvg1 Float @default(0) loadAvg5 Float @default(0) loadAvg15 Float @default(0) diff --git a/src/app/api/agent/heartbeat/route.ts b/src/app/api/agent/heartbeat/route.ts index b0de6bc..68ea223 100644 --- a/src/app/api/agent/heartbeat/route.ts +++ b/src/app/api/agent/heartbeat/route.ts @@ -47,6 +47,7 @@ const heartbeatSchema = z.object({ memoryUsedBytes: z.number().optional(), memoryFreeBytes: z.number().optional(), cpuSecondsTotal: z.number().optional(), + cpuSecondsIdle: z.number().optional(), loadAvg1: z.number().optional(), loadAvg5: z.number().optional(), loadAvg15: z.number().optional(), @@ -268,6 +269,7 @@ export async function POST(request: Request) { memoryUsedBytes: hostMetrics.memoryUsedBytes ?? 0, memoryFreeBytes: hostMetrics.memoryFreeBytes ?? 0, cpuSecondsTotal: hostMetrics.cpuSecondsTotal ?? 0, + cpuSecondsIdle: hostMetrics.cpuSecondsIdle ?? 0, loadAvg1: hostMetrics.loadAvg1 ?? 0, loadAvg5: hostMetrics.loadAvg5 ?? 0, loadAvg15: hostMetrics.loadAvg15 ?? 0, diff --git a/src/components/fleet/node-metrics-charts.tsx b/src/components/fleet/node-metrics-charts.tsx index 0784a1d..bbd35e2 100644 --- a/src/components/fleet/node-metrics-charts.tsx +++ b/src/components/fleet/node-metrics-charts.tsx @@ -93,17 +93,14 @@ export function NodeMetricsCharts({ nodeId }: NodeMetricsChartsProps) { const fsUsed = Number(m.fsUsedBytes); const fsPercent = fsTotal > 0 ? (fsUsed / fsTotal) * 100 : 0; - // CPU% computed as delta of cpuSecondsTotal between consecutive samples + // CPU% = (busy time / total time) across all cores let cpuPercent = 0; if (i > 0) { const prev = raw[i - 1]; - const dtSeconds = - (new Date(m.timestamp).getTime() - - new Date(prev.timestamp).getTime()) / - 1000; - if (dtSeconds > 0) { - const cpuDelta = m.cpuSecondsTotal - prev.cpuSecondsTotal; - cpuPercent = (cpuDelta / dtSeconds) * 100; + const totalDelta = m.cpuSecondsTotal - prev.cpuSecondsTotal; + const idleDelta = m.cpuSecondsIdle - prev.cpuSecondsIdle; + if (totalDelta > 0) { + cpuPercent = ((totalDelta - idleDelta) / totalDelta) * 100; if (cpuPercent < 0) cpuPercent = 0; if (cpuPercent > 100) cpuPercent = 100; } diff --git a/src/server/routers/fleet.ts b/src/server/routers/fleet.ts index ec7cf47..ce03887 100644 --- a/src/server/routers/fleet.ts +++ b/src/server/routers/fleet.ts @@ -192,6 +192,7 @@ export const fleetRouter = router({ memoryUsedBytes: true, memoryFreeBytes: true, cpuSecondsTotal: true, + cpuSecondsIdle: true, loadAvg1: true, loadAvg5: true, loadAvg15: true,