From cde981983cbca4c1748a3e65c9b7c516ab10ece4 Mon Sep 17 00:00:00 2001 From: hsinhoyeh Date: Thu, 23 Apr 2026 20:49:15 +0800 Subject: [PATCH] feat: prevent disk-full cascade failures with ZFS reservations and earlier alerts Addresses the incident where a full ZFS pool caused PostgreSQL to crash (couldn't write its PID file), which cascaded into Caddy OOM and a full web UI outage. ZFS reservations for core services (idempotent, applied on each EnsurePostgres/Caddy/VictoriaMetrics/Security call): - postgres: 5GB reserved - caddy: 2GB reserved - security: 2GB reserved - victoria: 2GB reserved Total 11GB guaranteed for core services even if user containers fill the pool. The ZFS set is skipped (with a logged note) on non-ZFS pools. Alert rule changes: - New DiskUsageWarning at 70% for 10m (early heads-up to plan expansion) - Lower DiskAlmostFull from 95% to 90% for 2m (more reaction time) - HighDiskUsage description now warns that core services may fail Co-Authored-By: Claude Opus 4.6 (1M context) --- internal/server/alert_rules.go | 16 +++++++++++++--- internal/server/core_services.go | 28 ++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/internal/server/alert_rules.go b/internal/server/alert_rules.go index d73ac0f..35f3a88 100644 --- a/internal/server/alert_rules.go +++ b/internal/server/alert_rules.go @@ -16,6 +16,16 @@ const DefaultAlertRules = `groups: summary: "High memory usage detected" description: "System memory usage is above 90% for more than 5 minutes (current: {{ $value | printf \"%.1f\" }}%%)." + - alert: DiskUsageWarning + expr: system_disk_used_bytes / system_disk_total_bytes * 100 > 70 + for: 10m + labels: + severity: warning + source: default + annotations: + summary: "Disk usage approaching capacity" + description: "System disk usage is above 70% for more than 10 minutes (current: {{ $value | printf \"%.1f\" }}%%). Plan disk expansion or cleanup." 
+ - alert: HighDiskUsage expr: system_disk_used_bytes / system_disk_total_bytes * 100 > 85 for: 5m @@ -24,17 +34,17 @@ const DefaultAlertRules = `groups: source: default annotations: summary: "High disk usage detected" - description: "System disk usage is above 85% for more than 5 minutes (current: {{ $value | printf \"%.1f\" }}%%)." + description: "System disk usage is above 85% for more than 5 minutes (current: {{ $value | printf \"%.1f\" }}%%). Act soon — core services (PostgreSQL, Caddy) may fail if disk fills." - alert: DiskAlmostFull - expr: system_disk_used_bytes / system_disk_total_bytes * 100 > 95 + expr: system_disk_used_bytes / system_disk_total_bytes * 100 > 90 for: 2m labels: severity: critical source: default annotations: summary: "Disk almost full" - description: "System disk usage is above 95% for more than 2 minutes (current: {{ $value | printf \"%.1f\" }}%%). Immediate action required." + description: "System disk usage is above 90% for more than 2 minutes (current: {{ $value | printf \"%.1f\" }}%%). IMMEDIATE action required — core services will fail at 100%." - alert: HighCPULoad expr: system_cpu_load_5m > system_cpu_count * 0.8 diff --git a/internal/server/core_services.go b/internal/server/core_services.go index 5d7fe0b..9fbda63 100644 --- a/internal/server/core_services.go +++ b/internal/server/core_services.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "log" + "os/exec" "strings" "time" @@ -93,6 +94,21 @@ func NewCoreServices(incusClient *incus.Client, config CoreServicesConfig) *Core } } +// ensureCoreReservation sets a ZFS reservation on a core container's dataset so +// it can always write even when user containers fill the pool. This prevents +// cascade failures (e.g. PostgreSQL crashing when the pool is 100% full +// because user containers overflowed their quotas). +// +// Idempotent — safe to call repeatedly. Skips on non-ZFS pools (logs a note). 
+func (cs *CoreServices) ensureCoreReservation(containerName, size string) { + dataset := fmt.Sprintf("incus-pool/containers/containers/%s", containerName) + cmd := exec.Command("zfs", "set", "reservation="+size, dataset) + if out, err := cmd.CombinedOutput(); err != nil { + // ZFS may not be present (dir-backed pool) — not fatal + log.Printf("Note: skipping ZFS reservation for %s: %v (output: %s)", containerName, err, string(out)) + } +} + // ensurePostgresRestartPolicy adds a systemd override so postgresql@16-main // auto-restarts on failure. Ubuntu 24.04 runs postgres via postgresql@16-main.service // (a template unit), not plain postgresql.service. Idempotent — safe to call repeatedly. @@ -109,6 +125,9 @@ func (cs *CoreServices) ensurePostgresRestartPolicy() { // EnsurePostgres ensures PostgreSQL container is running and returns the connection string func (cs *CoreServices) EnsurePostgres(ctx context.Context) (string, error) { + // Reserve 5GB so postgres can always write (prevents cascade failures) + cs.ensureCoreReservation(CorePostgresContainer, "5G") + // Check if container already exists info, err := cs.incusClient.GetContainer(CorePostgresContainer) if err == nil { @@ -316,6 +335,9 @@ func (cs *CoreServices) GetPostgresIP() string { // EnsureCaddy ensures Caddy container is running and returns the admin URL func (cs *CoreServices) EnsureCaddy(ctx context.Context, baseDomain string) (string, error) { + // Reserve 2GB so Caddy can always write (prevents cascade failures) + cs.ensureCoreReservation(CoreCaddyContainer, "2G") + // Check if container already exists info, err := cs.incusClient.GetContainer(CoreCaddyContainer) if err == nil { @@ -506,6 +528,9 @@ func (cs *CoreServices) updateGrafanaDashboard() { // EnsureVictoriaMetrics ensures the Victoria Metrics + Grafana container is running func (cs *CoreServices) EnsureVictoriaMetrics(ctx context.Context, postgresIP string) (string, error) { + // Reserve 2GB so metrics/alerts keep flowing even when pool is 
full + cs.ensureCoreReservation(CoreVictoriaMetricsContainer, "2G") + // Check if container already exists info, err := cs.incusClient.GetContainer(CoreVictoriaMetricsContainer) if err == nil { @@ -1184,6 +1209,9 @@ func (cs *CoreServices) waitForAlertmanager(ctx context.Context) error { // EnsureSecurity ensures the ClamAV security container is running func (cs *CoreServices) EnsureSecurity(ctx context.Context) error { + // Reserve 2GB so scans can continue even when pool is full + cs.ensureCoreReservation(CoreSecurityContainer, "2G") + // Check if container already exists info, err := cs.incusClient.GetContainer(CoreSecurityContainer) if err == nil {