|
---
# PrometheusRule with the alerts for the operator-controller and catalogd
# workloads: panic detection, OOM events, and memory / CPU / API-call-rate
# thresholds.
#
# Templating note: the label and value placeholders inside the description
# annotations are meant to be expanded by Prometheus when an alert fires, not
# by Helm at render time. Each placeholder is therefore wrapped in a Helm
# raw-string (backtick) action so that Helm emits it verbatim.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: controller-alerts
  # Quoted so the rendered namespace is always parsed as a YAML string, even
  # if the configured value is empty or looks like a number or boolean.
  namespace: {{ .Values.namespaces.olmv1.name | quote }}
spec:
  groups:
    # Panic counters: these alerts have no "for" clause, so they fire as soon
    # as the corresponding counter is above zero.
    - name: controller-panic
      rules:
        - alert: reconciler-panic
          annotations:
            description: 'controller of pod {{`{{ $labels.pod }}`}} experienced panic(s); count={{`{{ $value }}`}}'
          expr: controller_runtime_reconcile_panics_total{} > 0
        - alert: webhook-panic
          annotations:
            description: 'controller webhook of pod {{`{{ $labels.pod }}`}} experienced panic(s); count={{`{{ $value }}`}}'
          expr: controller_runtime_webhook_panics_total{} > 0

    # Resource consumption of the "manager" containers and API request rates.
    # Except for oom-events, every rule below requires its condition to hold
    # for 5m before firing (for) and then keeps firing for 1d after the
    # condition clears (keep_firing_for).
    - name: resource-usage
      rules:
        # Fires immediately when any container reports an OOM event.
        - alert: oom-events
          annotations:
            description: 'container {{`{{ $labels.container }}`}} of pod {{`{{ $labels.pod }}`}} experienced OOM event(s); count={{`{{ $value }}`}}'
          expr: container_oom_events_total > 0

        # Memory growth: per-second derivative of the summed working set over
        # a 5m subquery window; threshold is 100_000 (B/sec per description).
        - alert: operator-controller-memory-growth
          annotations:
            description: 'operator-controller pod memory usage growing at a high rate for 5 minutes: {{`{{ $value | humanize }}`}}B/sec'
          expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 100_000
          for: 5m
          keep_firing_for: 1d
        - alert: catalogd-memory-growth
          annotations:
            description: 'catalogd pod memory usage growing at a high rate for 5 minutes: {{`{{ $value | humanize }}`}}B/sec'
          expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 100_000
          for: 5m
          keep_firing_for: 1d

        # Absolute memory usage: summed working-set bytes above a fixed
        # per-component threshold (100_000_000 and 75_000_000 bytes).
        - alert: operator-controller-memory-usage
          annotations:
            description: 'operator-controller pod using high memory resources for the last 5 minutes: {{`{{ $value | humanize }}`}}B'
          expr: sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"}) > 100_000_000
          for: 5m
          keep_firing_for: 1d
        - alert: catalogd-memory-usage
          annotations:
            description: 'catalogd pod using high memory resources for the last 5 minutes: {{`{{ $value | humanize }}`}}B'
          expr: sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"}) > 75_000_000
          for: 5m
          keep_firing_for: 1d

        # CPU usage: 5m rate of consumed CPU seconds scaled by 100 so the
        # value reads as a percentage; threshold is 20.
        - alert: operator-controller-cpu-usage
          annotations:
            description: 'operator-controller using high cpu resource for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}%'
          expr: rate(container_cpu_usage_seconds_total{pod=~"operator-controller.*",container="manager"}[5m]) * 100 > 20
          for: 5m
          keep_firing_for: 1d
        - alert: catalogd-cpu-usage
          annotations:
            description: 'catalogd using high cpu resources for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}%'
          expr: rate(container_cpu_usage_seconds_total{pod=~"catalogd.*",container="manager"}[5m]) * 100 > 20
          for: 5m
          keep_firing_for: 1d

        # API call rate: summed 5m rate of client requests for each job,
        # in requests per second (thresholds 10 and 5).
        - alert: operator-controller-api-call-rate
          annotations:
            description: 'operator-controller making excessive API calls for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}/sec'
          expr: sum(rate(rest_client_requests_total{job=~"operator-controller-service"}[5m])) > 10
          for: 5m
          keep_firing_for: 1d
        - alert: catalogd-api-call-rate
          annotations:
            description: 'catalogd making excessive API calls for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}/sec'
          expr: sum(rate(rest_client_requests_total{job=~"catalogd-service"}[5m])) > 5
          for: 5m
          keep_firing_for: 1d
0 commit comments