From f6d2f16fe9372366c93c91c7a6c7a1f7950f32ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luis=20D=C3=ADaz?= Date: Fri, 17 Oct 2025 11:44:41 +0200 Subject: [PATCH 1/4] testing new alerting to report ip of instance down --- ansible/roles/prometheus/files/alert_rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/prometheus/files/alert_rules.yml b/ansible/roles/prometheus/files/alert_rules.yml index d94ccc17..0f2309f8 100644 --- a/ansible/roles/prometheus/files/alert_rules.yml +++ b/ansible/roles/prometheus/files/alert_rules.yml @@ -12,7 +12,7 @@ groups: expr: up != 1 for: 5m annotations: - summary: '{{ $labels.instance }} is not `up`' + summary: '{{ $labels.instance }} ({{$labels.ec2_host}}) is not `up`' - alert: systemd # yes, just "systemd", it's unclear what's going wrong :-) expr: node_systemd_system_running != 1 # that's basically output of `systemctl is-system-running` From 74944d3ca58208c2fc08911c046f34eaaa06bfb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luis=20D=C3=ADaz?= Date: Fri, 17 Oct 2025 12:05:13 +0200 Subject: [PATCH 2/4] Set alert time to 1m to make testing easier --- ansible/roles/prometheus/files/alert_rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/prometheus/files/alert_rules.yml b/ansible/roles/prometheus/files/alert_rules.yml index 0f2309f8..92af7e6c 100644 --- a/ansible/roles/prometheus/files/alert_rules.yml +++ b/ansible/roles/prometheus/files/alert_rules.yml @@ -10,7 +10,7 @@ groups: # including http scraping failure - alert: InstanceDown expr: up != 1 - for: 5m + for: 1m # TODO set back to 5m when we finish testing annotations: summary: '{{ $labels.instance }} ({{$labels.ec2_host}}) is not `up`' From ecdecc3903c0721d7ab46530c4891537553c7603 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luis=20D=C3=ADaz?= Date: Fri, 17 Oct 2025 12:16:02 +0200 Subject: [PATCH 3/4] improve legibility of alert message --- ansible/roles/prometheus/files/alert_rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/prometheus/files/alert_rules.yml b/ansible/roles/prometheus/files/alert_rules.yml index 92af7e6c..2bdeca66 100644 --- a/ansible/roles/prometheus/files/alert_rules.yml +++ b/ansible/roles/prometheus/files/alert_rules.yml @@ -12,7 +12,7 @@ groups: expr: up != 1 for: 1m # TODO set back to 5m when we finish testing annotations: - summary: '{{ $labels.instance }} ({{$labels.ec2_host}}) is not `up`' + summary: '{{ $labels.instance }} {{if $labels.ec2_host}} ({{$labels.ec2_host}}) {{end}} is not `up`' - alert: systemd # yes, just "systemd", it's unclear what's going wrong :-) expr: node_systemd_system_running != 1 # that's basically output of `systemctl is-system-running` From ce670a842e768705cd42528e8fc0b972bec9d416 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luis=20D=C3=ADaz?= Date: Fri, 17 Oct 2025 12:30:50 +0200 Subject: [PATCH 4/4] roll back short alert interval --- ansible/roles/prometheus/files/alert_rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/prometheus/files/alert_rules.yml b/ansible/roles/prometheus/files/alert_rules.yml index 2bdeca66..81ca3da5 100644 --- a/ansible/roles/prometheus/files/alert_rules.yml +++ b/ansible/roles/prometheus/files/alert_rules.yml @@ -10,7 +10,7 @@ groups: # including http scraping failure - alert: InstanceDown expr: up != 1 - for: 1m # TODO set back to 5m when we finish testing + for: 5m annotations: summary: '{{ $labels.instance }} {{if $labels.ec2_host}} ({{$labels.ec2_host}}) {{end}} is not `up`'