Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions agent/internal/agent/heartbeat.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ func buildHeartbeat(sup *supervisor.Supervisor, vectorVersion string, deployment
MemoryUsedBytes: sr.Host.MemoryUsedBytes,
MemoryFreeBytes: sr.Host.MemoryFreeBytes,
CpuSecondsTotal: sr.Host.CpuSecondsTotal,
CpuSecondsIdle: sr.Host.CpuSecondsIdle,
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Misaligned indentation on new field

The CpuSecondsIdle line uses one fewer tab than every other field in the same struct literal. Running gofmt would flag this. While it compiles fine, it breaks visual alignment and will cause noisy diffs in future edits.

Suggested change
			CpuSecondsIdle:   sr.Host.CpuSecondsIdle,
				CpuSecondsIdle:   sr.Host.CpuSecondsIdle,
Prompt To Fix With AI
This is a comment left during a code review.
Path: agent/internal/agent/heartbeat.go
Line: 63

Comment:
**Misaligned indentation on new field**

The `CpuSecondsIdle` line uses one fewer tab than every other field in the same struct literal. Running `gofmt` would flag this. While it compiles fine, it breaks visual alignment and will cause noisy diffs in future edits.

```suggestion
				CpuSecondsIdle:   sr.Host.CpuSecondsIdle,
```

How can I resolve this? If you propose a fix, please make it concise.

LoadAvg1: sr.Host.LoadAvg1,
LoadAvg5: sr.Host.LoadAvg5,
LoadAvg15: sr.Host.LoadAvg15,
Expand Down
1 change: 1 addition & 0 deletions agent/internal/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ type HostMetrics struct {
MemoryUsedBytes int64 `json:"memoryUsedBytes"`
MemoryFreeBytes int64 `json:"memoryFreeBytes"`
CpuSecondsTotal float64 `json:"cpuSecondsTotal"`
CpuSecondsIdle float64 `json:"cpuSecondsIdle"`
LoadAvg1 float64 `json:"loadAvg1"`
LoadAvg5 float64 `json:"loadAvg5"`
LoadAvg15 float64 `json:"loadAvg15"`
Expand Down
5 changes: 5 additions & 0 deletions agent/internal/metrics/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ type HostMetrics struct {
MemoryUsedBytes int64
MemoryFreeBytes int64
CpuSecondsTotal float64
CpuSecondsIdle float64
LoadAvg1 float64
LoadAvg5 float64
LoadAvg15 float64
Expand Down Expand Up @@ -145,6 +146,10 @@ func ScrapePrometheus(metricsPort int) ScrapeResult {
sr.Host.MemoryFreeBytes += int64(value)
case "host_cpu_seconds_total":
sr.Host.CpuSecondsTotal += value
mode := labels["mode"]
if mode == "idle" || mode == "iowait" {
sr.Host.CpuSecondsIdle += value
}
Comment on lines +150 to +152
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider excluding iowait from the idle bucket

Classifying iowait as "idle" means I/O-bound workloads will show artificially low CPU utilization on the graph. For example, a system that's 80% blocked on disk reads will report ~20% CPU busy even though it's clearly under stress.

Standard tools like iostat and htop report iowait as a separate category precisely to make I/O pressure visible. The field is also named CpuSecondsIdle, which implies pure idle time.

If the intent is "CPU not doing compute work", renaming the field to CpuSecondsNonBusy (and documenting that it includes iowait) would at least make the semantics explicit. Alternatively, tracking idle only and displaying iowait as a separate series in the chart gives users richer diagnostic information.

Prompt To Fix With AI
This is a comment left during a code review.
Path: agent/internal/metrics/scraper.go
Line: 150-152

Comment:
**Consider excluding `iowait` from the idle bucket**

Classifying `iowait` as "idle" means I/O-bound workloads will show artificially low CPU utilization on the graph. For example, a system that's 80% blocked on disk reads will report ~20% CPU busy even though it's clearly under stress.

Standard tools like `iostat` and `htop` report `iowait` as a separate category precisely to make I/O pressure visible. The field is also named `CpuSecondsIdle`, which implies pure idle time.

If the intent is "CPU not doing compute work", renaming the field to `CpuSecondsNonBusy` (and documenting that it includes iowait) would at least make the semantics explicit. Alternatively, tracking `idle` only and displaying `iowait` as a separate series in the chart gives users richer diagnostic information.

How can I resolve this? If you propose a fix, please make it concise.

case "host_load1":
sr.Host.LoadAvg1 += value
case "host_load5":
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- AlterTable
-- Adds the "cpuSecondsIdle" column to "NodeMetric". The column is NOT NULL with
-- DEFAULT 0, so rows that already exist take the value 0.
ALTER TABLE "NodeMetric" ADD COLUMN "cpuSecondsIdle" DOUBLE PRECISION NOT NULL DEFAULT 0;
1 change: 1 addition & 0 deletions prisma/schema.prisma
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ model NodeMetric {
memoryUsedBytes BigInt @default(0)
memoryFreeBytes BigInt @default(0)
cpuSecondsTotal Float @default(0)
cpuSecondsIdle Float @default(0)
loadAvg1 Float @default(0)
loadAvg5 Float @default(0)
loadAvg15 Float @default(0)
Expand Down
2 changes: 2 additions & 0 deletions src/app/api/agent/heartbeat/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ const heartbeatSchema = z.object({
memoryUsedBytes: z.number().optional(),
memoryFreeBytes: z.number().optional(),
cpuSecondsTotal: z.number().optional(),
cpuSecondsIdle: z.number().optional(),
loadAvg1: z.number().optional(),
loadAvg5: z.number().optional(),
loadAvg15: z.number().optional(),
Expand Down Expand Up @@ -268,6 +269,7 @@ export async function POST(request: Request) {
memoryUsedBytes: hostMetrics.memoryUsedBytes ?? 0,
memoryFreeBytes: hostMetrics.memoryFreeBytes ?? 0,
cpuSecondsTotal: hostMetrics.cpuSecondsTotal ?? 0,
cpuSecondsIdle: hostMetrics.cpuSecondsIdle ?? 0,
loadAvg1: hostMetrics.loadAvg1 ?? 0,
loadAvg5: hostMetrics.loadAvg5 ?? 0,
loadAvg15: hostMetrics.loadAvg15 ?? 0,
Expand Down
13 changes: 5 additions & 8 deletions src/components/fleet/node-metrics-charts.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -93,17 +93,14 @@ export function NodeMetricsCharts({ nodeId }: NodeMetricsChartsProps) {
const fsUsed = Number(m.fsUsedBytes);
const fsPercent = fsTotal > 0 ? (fsUsed / fsTotal) * 100 : 0;

// CPU% computed as delta of cpuSecondsTotal between consecutive samples
// CPU% = (busy time / total time) across all cores
let cpuPercent = 0;
if (i > 0) {
const prev = raw[i - 1];
const dtSeconds =
(new Date(m.timestamp).getTime() -
new Date(prev.timestamp).getTime()) /
1000;
if (dtSeconds > 0) {
const cpuDelta = m.cpuSecondsTotal - prev.cpuSecondsTotal;
cpuPercent = (cpuDelta / dtSeconds) * 100;
const totalDelta = m.cpuSecondsTotal - prev.cpuSecondsTotal;
const idleDelta = m.cpuSecondsIdle - prev.cpuSecondsIdle;
if (totalDelta > 0) {
cpuPercent = ((totalDelta - idleDelta) / totalDelta) * 100;
if (cpuPercent < 0) cpuPercent = 0;
if (cpuPercent > 100) cpuPercent = 100;
}
Expand Down
1 change: 1 addition & 0 deletions src/server/routers/fleet.ts
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ export const fleetRouter = router({
memoryUsedBytes: true,
memoryFreeBytes: true,
cpuSecondsTotal: true,
cpuSecondsIdle: true,
loadAvg1: true,
loadAvg5: true,
loadAvg15: true,
Expand Down
Loading