Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions internal/server/alert_rules.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@ const DefaultAlertRules = `groups:
summary: "High memory usage detected"
description: "System memory usage is above 90% for more than 5 minutes (current: {{ $value | printf \"%.1f\" }}%%)."

- alert: DiskUsageWarning
expr: system_disk_used_bytes / system_disk_total_bytes * 100 > 70
for: 10m
labels:
severity: warning
source: default
annotations:
summary: "Disk usage approaching capacity"
description: "System disk usage is above 70% for more than 10 minutes (current: {{ $value | printf \"%.1f\" }}%%). Plan disk expansion or cleanup."

- alert: HighDiskUsage
expr: system_disk_used_bytes / system_disk_total_bytes * 100 > 85
for: 5m
Expand All @@ -24,17 +34,17 @@ const DefaultAlertRules = `groups:
source: default
annotations:
summary: "High disk usage detected"
description: "System disk usage is above 85% for more than 5 minutes (current: {{ $value | printf \"%.1f\" }}%%)."
description: "System disk usage is above 85% for more than 5 minutes (current: {{ $value | printf \"%.1f\" }}%%). Act soon — core services (PostgreSQL, Caddy) may fail if disk fills."

- alert: DiskAlmostFull
expr: system_disk_used_bytes / system_disk_total_bytes * 100 > 95
expr: system_disk_used_bytes / system_disk_total_bytes * 100 > 90
for: 2m
labels:
severity: critical
source: default
annotations:
summary: "Disk almost full"
description: "System disk usage is above 95% for more than 2 minutes (current: {{ $value | printf \"%.1f\" }}%%). Immediate action required."
description: "System disk usage is above 90% for more than 2 minutes (current: {{ $value | printf \"%.1f\" }}%%). IMMEDIATE action required — core services will fail at 100%."

- alert: HighCPULoad
expr: system_cpu_load_5m > system_cpu_count * 0.8
Expand Down
28 changes: 28 additions & 0 deletions internal/server/core_services.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"context"
"fmt"
"log"
"os/exec"
"strings"
"time"

Expand Down Expand Up @@ -93,6 +94,21 @@
}
}

// ensureCoreReservation sets a ZFS reservation on a core container's dataset so
// it can always write even when user containers fill the pool. This prevents
// cascade failures (e.g. PostgreSQL crashing when the pool is 100% full
// because user containers overflowed their quotas).
//
// Idempotent — safe to call repeatedly. Silently skips on non-ZFS pools.
func (cs *CoreServices) ensureCoreReservation(containerName, size string) {
dataset := fmt.Sprintf("incus-pool/containers/containers/%s", containerName)
cmd := exec.Command("zfs", "set", "reservation="+size, dataset)

Check failure

Code scanning / gosec

Subprocess launched with variable Error

Subprocess launched with a potential tainted input or cmd arguments
if out, err := cmd.CombinedOutput(); err != nil {
// ZFS may not be present (dir-backed pool) — not fatal
log.Printf("Note: skipping ZFS reservation for %s: %v (output: %s)", containerName, err, string(out))
}
}

// ensurePostgresRestartPolicy adds a systemd override so postgresql@16-main
// auto-restarts on failure. Ubuntu 24.04 runs postgres via postgresql@16-main.service
// (a template unit), not plain postgresql.service. Idempotent — safe to call repeatedly.
Expand All @@ -109,6 +125,9 @@

// EnsurePostgres ensures PostgreSQL container is running and returns the connection string
func (cs *CoreServices) EnsurePostgres(ctx context.Context) (string, error) {
// Reserve 5GB so postgres can always write (prevents cascade failures)
cs.ensureCoreReservation(CorePostgresContainer, "5G")

// Check if container already exists
info, err := cs.incusClient.GetContainer(CorePostgresContainer)
if err == nil {
Expand Down Expand Up @@ -316,6 +335,9 @@

// EnsureCaddy ensures Caddy container is running and returns the admin URL
func (cs *CoreServices) EnsureCaddy(ctx context.Context, baseDomain string) (string, error) {
// Reserve 2GB so Caddy can always write (prevents cascade failures)
cs.ensureCoreReservation(CoreCaddyContainer, "2G")

// Check if container already exists
info, err := cs.incusClient.GetContainer(CoreCaddyContainer)
if err == nil {
Expand Down Expand Up @@ -506,6 +528,9 @@

// EnsureVictoriaMetrics ensures the Victoria Metrics + Grafana container is running
func (cs *CoreServices) EnsureVictoriaMetrics(ctx context.Context, postgresIP string) (string, error) {
// Reserve 2GB so metrics/alerts keep flowing even when pool is full
cs.ensureCoreReservation(CoreVictoriaMetricsContainer, "2G")

// Check if container already exists
info, err := cs.incusClient.GetContainer(CoreVictoriaMetricsContainer)
if err == nil {
Expand Down Expand Up @@ -1184,6 +1209,9 @@

// EnsureSecurity ensures the ClamAV security container is running
func (cs *CoreServices) EnsureSecurity(ctx context.Context) error {
// Reserve 2GB so scans can continue even when pool is full
cs.ensureCoreReservation(CoreSecurityContainer, "2G")

// Check if container already exists
info, err := cs.incusClient.GetContainer(CoreSecurityContainer)
if err == nil {
Expand Down
Loading