From 3f0ff32f77807addb028b49c05ca4965a2fa65b3 Mon Sep 17 00:00:00 2001 From: Saeed Padari Date: Fri, 31 Jan 2025 00:49:40 +0330 Subject: [PATCH 01/12] update to version 1.7.0 and add more alert rules --- .../prometheus-openstack-exporter/Chart.yaml | 4 +- .../templates/prometheusrule.yaml | 168 +++++++++++++----- .../prometheus-openstack-exporter/values.yaml | 2 +- 3 files changed, 129 insertions(+), 45 deletions(-) diff --git a/charts/prometheus-openstack-exporter/Chart.yaml b/charts/prometheus-openstack-exporter/Chart.yaml index b9c89e720..4753b5b03 100644 --- a/charts/prometheus-openstack-exporter/Chart.yaml +++ b/charts/prometheus-openstack-exporter/Chart.yaml @@ -1,5 +1,5 @@ --- apiVersion: v1 name: prometheus-openstack-exporter -version: 0.4.3 -appVersion: v1.6.0 +version: 0.4.4 +appVersion: v1.7.0 diff --git a/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml b/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml index 0b88676c5..eb5200134 100644 --- a/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml +++ b/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml @@ -8,25 +8,45 @@ metadata: {{ include "openstack-exporter.labels" . | indent 4 }} spec: groups: + - name: keystone + rules: + - alert: KeystoneDown + for: 5m + expr: 'openstack_identity_up != 1' + labels: + severity: critical + annotations: + summary: "OpenStack Keystone service down" + description: "OpenStack Keystone service down" + + - name: glance + rules: + - alert: GlanceDown + for: 5m + expr: 'openstack_glance_up != 1' + labels: + severity: critical + annotations: + summary: "OpenStack Glance service down" + description: "OpenStack Glance service down" + - name: cinder rules: - - alert: CinderAgentDown - expr: | - openstack_cinder_agent_state != 1 + - alert: CinderDown + for: 5m + expr: 'openstack_cinder_up != 1' labels: - severity: P4 + severity: critical annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - is being reported as down. + summary: "OpenStack Cinder service down" + description: "OpenStack Cinder service down" - alert: CinderAgentDown for: 5m expr: | - openstack_cinder_agent_state != 1 + openstack_cinder_agent_state{adminState="enabled"} != 1 labels: - severity: P3 + severity: critical annotations: summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" description: > @@ -39,7 +59,7 @@ spec: expr: | openstack_cinder_agent_state{adminState!="enabled"} labels: - severity: P5 + severity: warning annotations: summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" description: > @@ -52,33 +72,50 @@ spec: expr: | openstack_cinder_volume_status{status=~"error.*"} labels: - severity: P4 + severity: warning annotations: summary: "[`{{`{{$labels.id}}`}}`] Volume in ERROR state" description: > The volume `{{`{{$labels.id}}`}}` has been in ERROR state for over 24 hours. It must be cleaned up or removed in order to provide a consistent customer experience. + - alert: CinderVolumeInDeleting + expr: 'openstack_cinder_volume_status == 7' + for: 10m + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Volume in deleting state" + description: > + The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes" + + - alert: CinderVolumeInCreating + expr: 'openstack_cinder_volume_status == 0' + for: 10m + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Volume in creating state" + description: > + The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes" - name: neutron rules: - - alert: NeutronAgentDown - expr: | - openstack_neutron_agent_state != 1 + - alert: NeutronDown + for: 5m + expr: 'openstack_neutron_up != 1' labels: - severity: P4 + severity: critical annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - is being reported as down. + summary: "OpenStack Neutron service down" + description: "OpenStack Neutron service down" - alert: NeutronAgentDown for: 5m expr: | - openstack_neutron_agent_state != 1 + openstack_neutron_agent_state{adminState="up"} != 1 labels: - severity: P3 + severity: critical annotations: summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" description: > @@ -91,7 +128,7 @@ spec: expr: | openstack_neutron_agent_state{adminState!="up"} labels: - severity: P5 + severity: warning annotations: summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" description: > @@ -103,7 +140,7 @@ spec: expr: | openstack_neutron_port{binding_vif_type="binding_failed"} != 0 labels: - severity: P3 + severity: warning annotations: summary: "[`{{`{{$labels.device_owner}}`}}`] `{{`{{$labels.mac_address}}`}}` binding failed" description: > @@ -114,7 +151,7 @@ spec: expr: | sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""}) / sum by (network_id) (openstack_neutron_network_ip_availabilities_total{project_id!=""}) * 100 > 80 labels: - severity: P4 + severity: warning annotations: summary: "[`{{`{{$labels.network_name}}`}}`] `{{`{{$labels.subnet_name}}`}}` running out of IPs" description: > @@ -122,26 +159,23 @@ spec: is currently at `{{`{{$value}}`}}`% utilization. If the IP addresses run out, it will impact the provisioning of new ports. - - name: nova rules: - - alert: NovaAgentDown - expr: | - openstack_nova_agent_state != 1 + - alert: NovaDown + for: 5m + expr: 'openstack_nova_up != 1' labels: - severity: P4 + severity: critical annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - is being reported as down. + summary: "OpenStack Nova service down" + description: "OpenStack Nova service down" - alert: NovaAgentDown for: 5m expr: | - openstack_nova_agent_state != 1 + openstack_nova_agent_state{adminState="enabled"} != 1 labels: - severity: P3 + severity: critical annotations: summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" description: > @@ -154,7 +188,7 @@ spec: expr: | openstack_nova_agent_state{adminState!="enabled"} labels: - severity: P5 + severity: warning annotations: summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" description: > @@ -163,23 +197,73 @@ spec: as quickly as possible. - alert: NovaInstanceInError - for: 24h + for: 15m expr: | openstack_nova_server_status{status="ERROR"} labels: - severity: P4 + severity: warning annotations: summary: "[`{{`{{$labels.id}}`}}`] Instance in ERROR state" description: > - The instance `{{`{{$labels.id}}`}}` has been in ERROR state for over 24 hours. It must + The instance `{{`{{$labels.id}}`}}` has been in ERROR state for over 15 minutes. It must be cleaned up or removed in order to provide a consistent customer experience. + - alert: NovaInstanceInBuilding + for: 15m + expr: 'openstack_nova_server_status == 1' + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Instance in BUILD state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in BUILD state for over 15 minutes. + + - alert: NovaInstanceInRESIZE + for: 15m + expr: 'openstack_nova_server_status == 10' + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Instance in RESIZE state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in RESIZE state for over 15 minutes. + + - alert: NovaInstanceInUNKNOWN + for: 15m + expr: 'openstack_nova_server_status == 13' + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Instance in UNKNOWN state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in UNKNOWN state for over 15 minutes. + + - alert: NovaInstanceInVERIFY_RESIZE + for: 15m + expr: 'openstack_nova_server_status == 14' + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Instance in VERIFY_RESIZE state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in VERIFY_RESIZE state for over 15 minutes. + + - alert: NovaInstanceInMIGRATING + for: 15m + expr: 'openstack_nova_server_status == 15' + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Instance in MIGRATING state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in MIGRATING state for over 15 minutes. + - alert: NovaFailureRisk for: 6h expr: | (sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) - max(openstack_nova_memory_used_bytes)) / sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) * 100 < 0.25 labels: - severity: P4 + severity: warning annotations: summary: "[nova] Failure risk" description: > @@ -201,7 +285,7 @@ spec: (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) ) * 100 > 75 labels: - severity: P4 + severity: warning annotations: summary: "[nova] Capacity risk" description: > diff --git a/charts/prometheus-openstack-exporter/values.yaml b/charts/prometheus-openstack-exporter/values.yaml index 03381b96e..8de570251 100644 --- a/charts/prometheus-openstack-exporter/values.yaml +++ b/charts/prometheus-openstack-exporter/values.yaml @@ -8,7 +8,7 @@ replicaCount: 1 image: repository: ghcr.io/openstack-exporter/openstack-exporter - tag: 1.6.0 + tag: 1.7.0 pullPolicy: Always serviceMonitor: From f66efa233858360a55cd6ec560b2eab3624c71ee Mon Sep 17 00:00:00 2001 From: Saeed Padari Date: Sun, 2 Feb 2025 14:29:50 +0330 Subject: [PATCH 02/12] add rule for loadbalancer --- .../prometheus-openstack-exporter/Chart.yaml | 2 +- .../templates/prometheusrule.yaml | 88 +++++++++++++++---- 2 files changed, 70 insertions(+), 20 deletions(-) diff --git a/charts/prometheus-openstack-exporter/Chart.yaml b/charts/prometheus-openstack-exporter/Chart.yaml index 4753b5b03..a10f87732 100644 --- a/charts/prometheus-openstack-exporter/Chart.yaml +++ b/charts/prometheus-openstack-exporter/Chart.yaml @@ -1,5 +1,5 @@ --- apiVersion: v1 name: prometheus-openstack-exporter -version: 0.4.4 +version: 0.4.5 appVersion: v1.7.0 diff --git a/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml b/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml index eb5200134..ba581617b 100644 --- a/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml +++ b/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml @@ -80,14 +80,14 @@ spec: be cleaned up or removed in order to provide a consistent customer experience. - alert: CinderVolumeInDeleting - expr: 'openstack_cinder_volume_status == 7' - for: 10m - labels: - severity: warning - annotations: - summary: "[`{{`{{$labels.id}}`}}`] Volume in deleting state" - description: > - The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes" + expr: 'openstack_cinder_volume_status == 7' + for: 10m + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Volume in deleting state" + description: > + The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes" - alert: CinderVolumeInCreating expr: 'openstack_cinder_volume_status == 0' @@ -197,22 +197,22 @@ spec: as quickly as possible. - alert: NovaInstanceInError - for: 15m + for: 10m expr: | openstack_nova_server_status{status="ERROR"} labels: - severity: warning + severity: critical annotations: summary: "[`{{`{{$labels.id}}`}}`] Instance in ERROR state" description: > - The instance `{{`{{$labels.id}}`}}` has been in ERROR state for over 15 minutes. It must + The instance `{{`{{$labels.id}}`}}` has been in ERROR state for over 10 minutes. It must be cleaned up or removed in order to provide a consistent customer experience. - alert: NovaInstanceInBuilding for: 15m expr: 'openstack_nova_server_status == 1' labels: - severity: warning + severity: critical annotations: summary: "[`{{`{{$labels.id}}`}}`] Instance in BUILD state" description: > @@ -222,7 +222,7 @@ spec: for: 15m expr: 'openstack_nova_server_status == 10' labels: - severity: warning + severity: critical annotations: summary: "[`{{`{{$labels.id}}`}}`] Instance in RESIZE state" description: > @@ -232,7 +232,7 @@ spec: for: 15m expr: 'openstack_nova_server_status == 13' labels: - severity: warning + severity: critical annotations: summary: "[`{{`{{$labels.id}}`}}`] Instance in UNKNOWN state" description: > @@ -242,21 +242,21 @@ spec: for: 15m expr: 'openstack_nova_server_status == 14' labels: - severity: warning + severity: critical annotations: summary: "[`{{`{{$labels.id}}`}}`] Instance in VERIFY_RESIZE state" description: > The instance `{{`{{$labels.id}}`}}` has been in VERIFY_RESIZE state for over 15 minutes. - alert: NovaInstanceInMIGRATING - for: 15m + for: 30m expr: 'openstack_nova_server_status == 15' labels: - severity: warning + severity: critical annotations: summary: "[`{{`{{$labels.id}}`}}`] Instance in MIGRATING state" description: > - The instance `{{`{{$labels.id}}`}}` has been in MIGRATING state for over 15 minutes. + The instance `{{`{{$labels.id}}`}}` has been in MIGRATING state for over 30 minutes. - alert: NovaFailureRisk for: 6h @@ -272,7 +272,7 @@ spec: failures occur. Please ensure that adequate amount of infrastructure is assigned to this deployment to prevent this. - - alert: NovaCapacity + - alert: NovaCapacityNearFull for: 6h expr: | sum ( @@ -286,9 +286,59 @@ spec: ) * 100 > 75 labels: severity: warning + annotations: + summary: "[nova] near full Capacity risk" + description: > + The cloud capacity is currently at `{{`{{$value}}`}}` which means there is a risk of running + out of capacity due to the timeline required to add new nodes. Please ensure that adequate + amount of infrastructure is assigned to this deployment to prevent this. + + - alert: NovaCapacityFull + for: 6h + expr: | + sum ( + openstack_nova_memory_used_bytes + + on(hostname) group_left(adminState) + (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) + ) / sum ( + openstack_nova_memory_available_bytes + + on(hostname) group_left(adminState) + (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) + ) * 100 > 85 + labels: + severity: critical annotations: summary: "[nova] Capacity risk" description: > The cloud capacity is currently at `{{`{{$value}}`}}` which means there is a risk of running out of capacity due to the timeline required to add new nodes. Please ensure that adequate amount of infrastructure is assigned to this deployment to prevent this. + + - name: octavia + rules: + - alert: LoadbalancerDown + for: 5m + expr: 'openstack_loadbalancer_up != 1' + labels: + severity: critical + annotations: + summary: "OpenStack loadbalancer service down" + description: "OpenStack loadbalancer service down" + + - alert: LoadbalancerNotActive + for: 5m + expr: openstack_loadbalancer_loadbalancer_status{provisioning_status!="ACTIVE"} + labels: + severity: critical + annotations: + summary: "OpenStack loadbalancer `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" + description: "OpenStack loadbalancer `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" + + - alert: LoadbalancerPoolNotActive + for: 5m + expr: openstack_loadbalancer_pool_status{provisioning_status!="ACTIVE"} + labels: + severity: critical + annotations: + summary: "OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" + description: "OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" From a170411bdaf306a4e70305b64bc525165b184e1d Mon Sep 17 00:00:00 2001 From: Saeed Padari Date: Mon, 3 Feb 2025 00:40:55 +0330 Subject: [PATCH 03/12] add rule in values --- .../prometheus-openstack-exporter/Chart.yaml | 2 +- .../templates/prometheusrule.yaml | 380 +++--------------- .../prometheus-openstack-exporter/values.yaml | 342 ++++++++++++++++ 3 files changed, 389 insertions(+), 335 deletions(-) diff --git a/charts/prometheus-openstack-exporter/Chart.yaml b/charts/prometheus-openstack-exporter/Chart.yaml index a10f87732..656dab4f9 100644 --- a/charts/prometheus-openstack-exporter/Chart.yaml +++ b/charts/prometheus-openstack-exporter/Chart.yaml @@ -1,5 +1,5 @@ --- apiVersion: v1 name: prometheus-openstack-exporter -version: 0.4.5 +version: 0.5.1 appVersion: v1.7.0 diff --git a/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml b/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml index ba581617b..563b68053 100644 --- a/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml +++ b/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml @@ -5,340 +5,52 @@ metadata: name: {{ include "openstack-exporter.fullname" . }} namespace: {{ .Release.Namespace }} labels: -{{ include "openstack-exporter.labels" . | indent 4 }} + {{- include "openstack-exporter.labels" . | indent 4 }} spec: groups: - - name: keystone + {{- range $groupName, $group := .Values.promethuesRules }} + {{- if (dig "enabled" true $group )}} + - name: {{ $groupName }} rules: - - alert: KeystoneDown - for: 5m - expr: 'openstack_identity_up != 1' - labels: - severity: critical - annotations: - summary: "OpenStack Keystone service down" - description: "OpenStack Keystone service down" - - - name: glance - rules: - - alert: GlanceDown - for: 5m - expr: 'openstack_glance_up != 1' - labels: - severity: critical - annotations: - summary: "OpenStack Glance service down" - description: "OpenStack Glance service down" - - - name: cinder - rules: - - alert: CinderDown - for: 5m - expr: 'openstack_cinder_up != 1' - labels: - severity: critical - annotations: - summary: "OpenStack Cinder service down" - description: "OpenStack Cinder service down" - - - alert: CinderAgentDown - for: 5m - expr: | - openstack_cinder_agent_state{adminState="enabled"} != 1 - labels: - severity: critical - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - is being reported as down for 5 minutes. This can affect volume operations so it must - be resolved as quickly as possible. - - - alert: CinderAgentDisabled - for: 1h - expr: | - openstack_cinder_agent_state{adminState!="enabled"} - labels: - severity: warning - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - has been disabled for 60 minutes. This can affect volume operations so it must be resolved - as quickly as possible. - - - alert: CinderVolumeInError - for: 24h - expr: | - openstack_cinder_volume_status{status=~"error.*"} - labels: - severity: warning - annotations: - summary: "[`{{`{{$labels.id}}`}}`] Volume in ERROR state" - description: > - The volume `{{`{{$labels.id}}`}}` has been in ERROR state for over 24 hours. It must - be cleaned up or removed in order to provide a consistent customer experience. - - - alert: CinderVolumeInDeleting - expr: 'openstack_cinder_volume_status == 7' - for: 10m - labels: - severity: warning - annotations: - summary: "[`{{`{{$labels.id}}`}}`] Volume in deleting state" - description: > - The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes" - - - alert: CinderVolumeInCreating - expr: 'openstack_cinder_volume_status == 0' - for: 10m - labels: - severity: warning - annotations: - summary: "[`{{`{{$labels.id}}`}}`] Volume in creating state" - description: > - The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes" - - - name: neutron - rules: - - alert: NeutronDown - for: 5m - expr: 'openstack_neutron_up != 1' - labels: - severity: critical - annotations: - summary: "OpenStack Neutron service down" - description: "OpenStack Neutron service down" - - - alert: NeutronAgentDown - for: 5m - expr: | - openstack_neutron_agent_state{adminState="up"} != 1 - labels: - severity: critical - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - is being reported as down for 5 minutes. This can affect network operations so it must - be resolved as quickly as possible. - - - alert: NeutronAgentDisabled - for: 1h - expr: | - openstack_neutron_agent_state{adminState!="up"} - labels: - severity: warning - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - has been disabled for 60 minutes. This can affect network operations so it must be resolved - as quickly as possible. - - - alert: NeutronBindingFailedPorts - expr: | - openstack_neutron_port{binding_vif_type="binding_failed"} != 0 - labels: - severity: warning - annotations: - summary: "[`{{`{{$labels.device_owner}}`}}`] `{{`{{$labels.mac_address}}`}}` binding failed" - description: > - The NIC `{{`{{$labels.mac_address}}`}}` of `{{`{{$labels.device_owner}}`}}` - has binding failed port now. - - - alert: NeutronNetworkOutOfIPs - expr: | - sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""}) / sum by (network_id) (openstack_neutron_network_ip_availabilities_total{project_id!=""}) * 100 > 80 - labels: - severity: warning - annotations: - summary: "[`{{`{{$labels.network_name}}`}}`] `{{`{{$labels.subnet_name}}`}}` running out of IPs" - description: > - The subnet `{{`{{$labels.subnet_name}}`}}` within `{{`{{$labels.network_name}}`}}` - is currently at `{{`{{$value}}`}}`% utilization. If the IP addresses run out, it will - impact the provisioning of new ports. - - - name: nova - rules: - - alert: NovaDown - for: 5m - expr: 'openstack_nova_up != 1' - labels: - severity: critical - annotations: - summary: "OpenStack Nova service down" - description: "OpenStack Nova service down" - - - alert: NovaAgentDown - for: 5m - expr: | - openstack_nova_agent_state{adminState="enabled"} != 1 - labels: - severity: critical - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - is being reported as down. This can affect compute operations so it must be resolved - as quickly as possible. - - - alert: NovaAgentDisabled - for: 1h - expr: | - openstack_nova_agent_state{adminState!="enabled"} - labels: - severity: warning - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - has been disabled for 60 minutes. This can affect compute operations so it must be resolved - as quickly as possible. - - - alert: NovaInstanceInError - for: 10m - expr: | - openstack_nova_server_status{status="ERROR"} - labels: - severity: critical - annotations: - summary: "[`{{`{{$labels.id}}`}}`] Instance in ERROR state" - description: > - The instance `{{`{{$labels.id}}`}}` has been in ERROR state for over 10 minutes. It must - be cleaned up or removed in order to provide a consistent customer experience. - - - alert: NovaInstanceInBuilding - for: 15m - expr: 'openstack_nova_server_status == 1' - labels: - severity: critical - annotations: - summary: "[`{{`{{$labels.id}}`}}`] Instance in BUILD state" - description: > - The instance `{{`{{$labels.id}}`}}` has been in BUILD state for over 15 minutes. - - - alert: NovaInstanceInRESIZE - for: 15m - expr: 'openstack_nova_server_status == 10' - labels: - severity: critical - annotations: - summary: "[`{{`{{$labels.id}}`}}`] Instance in RESIZE state" - description: > - The instance `{{`{{$labels.id}}`}}` has been in RESIZE state for over 15 minutes. - - - alert: NovaInstanceInUNKNOWN - for: 15m - expr: 'openstack_nova_server_status == 13' - labels: - severity: critical - annotations: - summary: "[`{{`{{$labels.id}}`}}`] Instance in UNKNOWN state" - description: > - The instance `{{`{{$labels.id}}`}}` has been in UNKNOWN state for over 15 minutes. - - - alert: NovaInstanceInVERIFY_RESIZE - for: 15m - expr: 'openstack_nova_server_status == 14' - labels: - severity: critical - annotations: - summary: "[`{{`{{$labels.id}}`}}`] Instance in VERIFY_RESIZE state" - description: > - The instance `{{`{{$labels.id}}`}}` has been in VERIFY_RESIZE state for over 15 minutes. - - - alert: NovaInstanceInMIGRATING - for: 30m - expr: 'openstack_nova_server_status == 15' - labels: - severity: critical - annotations: - summary: "[`{{`{{$labels.id}}`}}`] Instance in MIGRATING state" - description: > - The instance `{{`{{$labels.id}}`}}` has been in MIGRATING state for over 30 minutes. - - - alert: NovaFailureRisk - for: 6h - expr: | - (sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) - max(openstack_nova_memory_used_bytes)) / sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) * 100 < 0.25 - labels: - severity: warning - annotations: - summary: "[nova] Failure risk" - description: > - The cloud capacity will be at `{{`{{$value}}`}}` in the event of the failure of a single - hypervisor which puts the cloud at risk of not being able to recover should any hypervisor - failures occur. Please ensure that adequate amount of infrastructure is assigned to this - deployment to prevent this. - - - alert: NovaCapacityNearFull - for: 6h - expr: | - sum ( - openstack_nova_memory_used_bytes - + on(hostname) group_left(adminState) - (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) - ) / sum ( - openstack_nova_memory_available_bytes - + on(hostname) group_left(adminState) - (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) - ) * 100 > 75 - labels: - severity: warning - annotations: - summary: "[nova] near full Capacity risk" - description: > - The cloud capacity is currently at `{{`{{$value}}`}}` which means there is a risk of running - out of capacity due to the timeline required to add new nodes. Please ensure that adequate - amount of infrastructure is assigned to this deployment to prevent this. - - - alert: NovaCapacityFull - for: 6h - expr: | - sum ( - openstack_nova_memory_used_bytes - + on(hostname) group_left(adminState) - (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) - ) / sum ( - openstack_nova_memory_available_bytes - + on(hostname) group_left(adminState) - (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) - ) * 100 > 85 - labels: - severity: critical - annotations: - summary: "[nova] Capacity risk" - description: > - The cloud capacity is currently at `{{`{{$value}}`}}` which means there is a risk of running - out of capacity due to the timeline required to add new nodes. Please ensure that adequate - amount of infrastructure is assigned to this deployment to prevent this. - - - name: octavia - rules: - - alert: LoadbalancerDown - for: 5m - expr: 'openstack_loadbalancer_up != 1' - labels: - severity: critical - annotations: - summary: "OpenStack loadbalancer service down" - description: "OpenStack loadbalancer service down" - - - alert: LoadbalancerNotActive - for: 5m - expr: openstack_loadbalancer_loadbalancer_status{provisioning_status!="ACTIVE"} - labels: - severity: critical - annotations: - summary: "OpenStack loadbalancer `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" - description: "OpenStack loadbalancer `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" - - - alert: LoadbalancerPoolNotActive - for: 5m - expr: openstack_loadbalancer_pool_status{provisioning_status!="ACTIVE"} - labels: - severity: critical - annotations: - summary: "OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" - description: "OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" + {{- range $ruleName, $rule := $group.rules }} + {{- if (dig "enabled" true $rule )}} + - # {{ $ruleName }} + {{- with $rule.alert }} + alert: {{ . }} + {{- end }} + + {{- with $rule.expr }} + expr: {{ tpl . $ | quote }} + {{- end }} + + {{- with $rule.record }} + record: {{ . }} + {{- end }} + + {{- with $rule.for }} + for: {{ . }} + {{- end }} + + {{- with $rule.keep_firing_for }} + keep_firing_for: {{ . }} + {{- end }} + + {{- with $rule.labels }} + labels: + {{- range $k,$v := . }} + {{ $k }}: {{ tpl $v $ | quote }} + {{- end }} + {{- end }} + + {{- with $rule.annotations }} + annotations: + {{- range $k, $v := . }} + {{ $k }}: | + {{- tpl $v $ | nindent 10 }} + {{- end }} + {{- end }} + + {{- end }} + {{- end }} + {{- end }} + {{- end }} diff --git a/charts/prometheus-openstack-exporter/values.yaml b/charts/prometheus-openstack-exporter/values.yaml index 8de570251..38c5b8caf 100644 --- a/charts/prometheus-openstack-exporter/values.yaml +++ b/charts/prometheus-openstack-exporter/values.yaml @@ -69,3 +69,345 @@ clouds_yaml_config: | # ... # cloud2: # ... + +promethuesRules: + keystone: + enabled: true + rules: + - alert: KeystoneDown + for: 5m + expr: 'openstack_identity_up != 1' + labels: + severity: critical + annotations: + summary: "OpenStack Keystone service down" + description: "OpenStack Keystone service down" + + glance: + enabled: true + rules: + - alert: GlanceDown + for: 5m + expr: 'openstack_glance_up != 1' + labels: + severity: critical + annotations: + summary: "OpenStack Glance service down" + description: "OpenStack Glance service down" + + cinder: + enabled: true + rules: + - alert: CinderDown + for: 5m + expr: 'openstack_cinder_up != 1' + labels: + severity: critical + annotations: + summary: "OpenStack Cinder service down" + description: "OpenStack Cinder service down" + + - alert: CinderAgentDown + for: 5m + expr: | + openstack_cinder_agent_state{adminState="enabled"} != 1 + labels: + severity: critical + annotations: + summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" + description: > + The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` + is being reported as down for 5 minutes. This can affect volume operations so it must + be resolved as quickly as possible. + + - alert: CinderAgentDisabled + for: 1h + expr: | + openstack_cinder_agent_state{adminState!="enabled"} + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" + description: > + The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` + has been disabled for 60 minutes. This can affect volume operations so it must be resolved + as quickly as possible. + + - alert: CinderVolumeInError + for: 24h + expr: | + openstack_cinder_volume_status{status=~"error.*"} + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Volume in ERROR state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in ERROR state for over 24 hours. It must + be cleaned up or removed in order to provide a consistent customer experience. + + - alert: CinderVolumeInDeleting + expr: 'openstack_cinder_volume_status == 7' + for: 10m + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Volume in deleting state" + description: > + The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes" + + - alert: CinderVolumeInCreating + expr: 'openstack_cinder_volume_status == 0' + for: 10m + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Volume in creating state" + description: > + The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes" + + neutron: + enabled: true + rules: + - alert: NeutronDown + for: 5m + expr: 'openstack_neutron_up != 1' + labels: + severity: critical + annotations: + summary: "OpenStack Neutron service down" + description: "OpenStack Neutron service down" + + - alert: NeutronAgentDown + for: 5m + expr: | + openstack_neutron_agent_state{adminState="up"} != 1 + labels: + severity: critical + annotations: + summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" + description: > + The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` + is being reported as down for 5 minutes. This can affect network operations so it must + be resolved as quickly as possible. + + - alert: NeutronAgentDisabled + for: 1h + expr: | + openstack_neutron_agent_state{adminState!="up"} + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" + description: > + The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` + has been disabled for 60 minutes. This can affect network operations so it must be resolved + as quickly as possible. + + - alert: NeutronBindingFailedPorts + expr: | + openstack_neutron_port{binding_vif_type="binding_failed"} != 0 + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.device_owner}}`}}`] `{{`{{$labels.mac_address}}`}}` binding failed" + description: > + The NIC `{{`{{$labels.mac_address}}`}}` of `{{`{{$labels.device_owner}}`}}` + has binding failed port now. + + - alert: NeutronNetworkOutOfIPs + expr: | + sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""}) / sum by (network_id) (openstack_neutron_network_ip_availabilities_total{project_id!=""}) * 100 > 80 + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.network_name}}`}}`] `{{`{{$labels.subnet_name}}`}}` running out of IPs" + description: > + The subnet `{{`{{$labels.subnet_name}}`}}` within `{{`{{$labels.network_name}}`}}` + is currently at `{{`{{$value}}`}}`% utilization. If the IP addresses run out, it will + impact the provisioning of new ports. + + nova: + enabled: true + rules: + - alert: NovaDown + for: 5m + expr: 'openstack_nova_up != 1' + labels: + severity: critical + annotations: + summary: "OpenStack Nova service down" + description: "OpenStack Nova service down" + + - alert: NovaAgentDown + for: 5m + expr: | + openstack_nova_agent_state{adminState="enabled"} != 1 + labels: + severity: critical + annotations: + summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" + description: > + The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` + is being reported as down. This can affect compute operations so it must be resolved + as quickly as possible. + + - alert: NovaAgentDisabled + for: 1h + expr: | + openstack_nova_agent_state{adminState!="enabled"} + labels: + severity: warning + annotations: + summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" + description: > + The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` + has been disabled for 60 minutes. This can affect compute operations so it must be resolved + as quickly as possible. + + - alert: NovaInstanceInError + for: 10m + expr: | + openstack_nova_server_status{status="ERROR"} + labels: + severity: critical + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Instance in ERROR state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in ERROR state for over 10 minutes. It must + be cleaned up or removed in order to provide a consistent customer experience. + + - alert: NovaInstanceInBuilding + for: 15m + expr: 'openstack_nova_server_status == 1' + labels: + severity: critical + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Instance in BUILD state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in BUILD state for over 15 minutes. + + - alert: NovaInstanceInRESIZE + for: 15m + expr: 'openstack_nova_server_status == 10' + labels: + severity: critical + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Instance in RESIZE state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in RESIZE state for over 15 minutes. + + - alert: NovaInstanceInUNKNOWN + for: 15m + expr: 'openstack_nova_server_status == 13' + labels: + severity: critical + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Instance in UNKNOWN state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in UNKNOWN state for over 15 minutes. + + - alert: NovaInstanceInVERIFY_RESIZE + for: 15m + expr: 'openstack_nova_server_status == 14' + labels: + severity: critical + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Instance in VERIFY_RESIZE state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in VERIFY_RESIZE state for over 15 minutes. + + - alert: NovaInstanceInMIGRATING + for: 30m + expr: 'openstack_nova_server_status == 15' + labels: + severity: critical + annotations: + summary: "[`{{`{{$labels.id}}`}}`] Instance in MIGRATING state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in MIGRATING state for over 30 minutes. + + - alert: NovaFailureRisk + for: 6h + expr: | + (sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) - max(openstack_nova_memory_used_bytes)) / sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) * 100 < 0.25 + labels: + severity: warning + annotations: + summary: "[nova] Failure risk" + description: > + The cloud capacity will be at `{{`{{$value}}`}}` in the event of the failure of a single + hypervisor which puts the cloud at risk of not being able to recover should any hypervisor + failures occur. Please ensure that adequate amount of infrastructure is assigned to this + deployment to prevent this. + + - alert: NovaCapacityNearFull + for: 6h + expr: | + sum ( + openstack_nova_memory_used_bytes + + on(hostname) group_left(adminState) + (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) + ) / sum ( + openstack_nova_memory_available_bytes + + on(hostname) group_left(adminState) + (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) + ) * 100 > 75 + labels: + severity: warning + annotations: + summary: "[nova] near full Capacity risk" + description: > + The cloud capacity is currently at `{{`{{$value}}`}}` which means there is a risk of running + out of capacity due to the timeline required to add new nodes. Please ensure that adequate + amount of infrastructure is assigned to this deployment to prevent this. + + - alert: NovaCapacityFull + for: 6h + expr: | + sum ( + openstack_nova_memory_used_bytes + + on(hostname) group_left(adminState) + (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) + ) / sum ( + openstack_nova_memory_available_bytes + + on(hostname) group_left(adminState) + (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) + ) * 100 > 85 + labels: + severity: critical + annotations: + summary: "[nova] Capacity risk" + description: > + The cloud capacity is currently at `{{`{{$value}}`}}` which means there is a risk of running + out of capacity due to the timeline required to add new nodes. Please ensure that adequate + amount of infrastructure is assigned to this deployment to prevent this. + + octavia: + enabled: true + rules: + - alert: LoadbalancerDown + for: 5m + expr: 'openstack_loadbalancer_up != 1' + labels: + severity: critical + annotations: + summary: "OpenStack loadbalancer service down" + description: "OpenStack loadbalancer service down" + + - alert: LoadbalancerNotActive + for: 5m + expr: openstack_loadbalancer_loadbalancer_status{provisioning_status!="ACTIVE"} + labels: + severity: critical + annotations: + summary: "OpenStack loadbalancer `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" + description: "OpenStack loadbalancer `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" + + - alert: LoadbalancerPoolNotActive + for: 5m + expr: openstack_loadbalancer_pool_status{provisioning_status!="ACTIVE"} + labels: + severity: critical + annotations: + summary: "OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" + description: "OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" From 08cdf7118ff58dbd80737284ec1d6a2e3a1b33ba Mon Sep 17 00:00:00 2001 From: Saeed Padari Date: Tue, 4 Feb 2025 15:51:26 +0330 Subject: [PATCH 04/12] add rule for manila --- .../prometheus-openstack-exporter/Chart.yaml | 2 +- .../prometheus-openstack-exporter/values.yaml | 162 ++++++++++-------- 2 files changed, 88 insertions(+), 76 deletions(-) diff --git a/charts/prometheus-openstack-exporter/Chart.yaml b/charts/prometheus-openstack-exporter/Chart.yaml index 656dab4f9..9d19eafe1 100644 --- a/charts/prometheus-openstack-exporter/Chart.yaml +++ b/charts/prometheus-openstack-exporter/Chart.yaml @@ -1,5 +1,5 @@ --- apiVersion: v1 name: prometheus-openstack-exporter -version: 0.5.1 +version: 0.5.2 appVersion: v1.7.0 diff --git a/charts/prometheus-openstack-exporter/values.yaml b/charts/prometheus-openstack-exporter/values.yaml index 38c5b8caf..84eef4541 100644 --- a/charts/prometheus-openstack-exporter/values.yaml +++ b/charts/prometheus-openstack-exporter/values.yaml @@ -76,7 +76,7 @@ promethuesRules: rules: - alert: KeystoneDown for: 5m - expr: 'openstack_identity_up != 1' + expr: openstack_identity_up != 1 labels: severity: critical annotations: @@ -88,7 +88,7 @@ promethuesRules: rules: - alert: GlanceDown for: 5m - expr: 'openstack_glance_up != 1' + expr: openstack_glance_up != 1 labels: severity: critical annotations: @@ -100,7 +100,7 @@ promethuesRules: rules: - alert: CinderDown for: 5m - expr: 'openstack_cinder_up != 1' + expr: openstack_cinder_up != 1 labels: severity: critical annotations: @@ -109,68 +109,65 @@ promethuesRules: - alert: CinderAgentDown for: 5m - expr: | - openstack_cinder_agent_state{adminState="enabled"} != 1 + expr: openstack_cinder_agent_state{adminState="enabled"} != 1 labels: severity: critical annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" + summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` down" description: > The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - is being reported as down for 5 minutes. This can affect volume operations so it must + is being reported as down for 5 minutes. This can affect volume operations so it must be resolved as quickly as possible. - alert: CinderAgentDisabled for: 1h - expr: | - openstack_cinder_agent_state{adminState!="enabled"} + expr: openstack_cinder_agent_state{adminState!="enabled"} labels: severity: warning annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" + summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` disabled" description: > The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - has been disabled for 60 minutes. This can affect volume operations so it must be resolved + has been disabled for 60 minutes. This can affect volume operations so it must be resolved as quickly as possible. - alert: CinderVolumeInError - for: 24h - expr: | - openstack_cinder_volume_status{status=~"error.*"} + for: 15m + expr: openstack_cinder_volume_status{status=~"error.*"} labels: - severity: warning + severity: critical annotations: - summary: "[`{{`{{$labels.id}}`}}`] Volume in ERROR state" + summary: "`{{`{{$labels.name}}`}}` Volume in ERROR state" description: > - The volume `{{`{{$labels.id}}`}}` has been in ERROR state for over 24 hours. It must - be cleaned up or removed in order to provide a consistent customer experience. + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + It must be cleaned up or removed in order to provide a consistent customer experience. - alert: CinderVolumeInDeleting - expr: 'openstack_cinder_volume_status == 7' - for: 10m + expr: openstack_cinder_volume_status == 7 + for: 15m labels: - severity: warning + severity: warning annotations: - summary: "[`{{`{{$labels.id}}`}}`] Volume in deleting state" - description: > - The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes" + summary: "`{{`{{$labels.name}}`}}` Volume in DELETING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. - alert: CinderVolumeInCreating - expr: 'openstack_cinder_volume_status == 0' - for: 10m + expr: openstack_cinder_volume_status == 0 + for: 15m labels: severity: warning annotations: - summary: "[`{{`{{$labels.id}}`}}`] Volume in creating state" + summary: "`{{`{{$labels.name}}`}}` Volume in CREATING state" description: > - The volume `{{`{{$labels.id}}`}}` is stuck in deleting status for more than 10 minutes" + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. neutron: enabled: true rules: - alert: NeutronDown for: 5m - expr: 'openstack_neutron_up != 1' + expr: openstack_neutron_up != 1 labels: severity: critical annotations: @@ -179,12 +176,11 @@ promethuesRules: - alert: NeutronAgentDown for: 5m - expr: | - openstack_neutron_agent_state{adminState="up"} != 1 + expr: openstack_neutron_agent_state{adminState="up"} != 1 labels: severity: critical annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" + summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` down" description: > The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` is being reported as down for 5 minutes. This can affect network operations so it must @@ -192,24 +188,22 @@ promethuesRules: - alert: NeutronAgentDisabled for: 1h - expr: | - openstack_neutron_agent_state{adminState!="up"} + expr: openstack_neutron_agent_state{adminState!="up"} labels: severity: warning annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" + summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` disabled" description: > The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - has been disabled for 60 minutes. This can affect network operations so it must be resolved + has been disabled for 60 minutes. This can affect network operations so it must be resolved as quickly as possible. - alert: NeutronBindingFailedPorts - expr: | - openstack_neutron_port{binding_vif_type="binding_failed"} != 0 + expr: openstack_neutron_port{binding_vif_type="binding_failed"} != 0 labels: severity: warning annotations: - summary: "[`{{`{{$labels.device_owner}}`}}`] `{{`{{$labels.mac_address}}`}}` binding failed" + summary: "`{{`{{$labels.device_owner}}`}}` `{{`{{$labels.mac_address}}`}}` binding failed" description: > The NIC `{{`{{$labels.mac_address}}`}}` of `{{`{{$labels.device_owner}}`}}` has binding failed port now. @@ -220,10 +214,10 @@ promethuesRules: labels: severity: warning annotations: - summary: "[`{{`{{$labels.network_name}}`}}`] `{{`{{$labels.subnet_name}}`}}` running out of IPs" + summary: "`{{`{{$labels.network_name}}`}}` `{{`{{$labels.subnet_name}}`}}` running out of IPs" description: > The subnet `{{`{{$labels.subnet_name}}`}}` within `{{`{{$labels.network_name}}`}}` - is currently at `{{`{{$value}}`}}`% utilization. If the IP addresses run out, it will + is currently at `{{`{{$value}}`}}`% utilization. If the IP addresses run out, it will impact the provisioning of new ports. nova: @@ -231,7 +225,7 @@ promethuesRules: rules: - alert: NovaDown for: 5m - expr: 'openstack_nova_up != 1' + expr: openstack_nova_up != 1 labels: severity: critical annotations: @@ -240,91 +234,88 @@ promethuesRules: - alert: NovaAgentDown for: 5m - expr: | - openstack_nova_agent_state{adminState="enabled"} != 1 + expr: openstack_nova_agent_state{adminState="enabled"} != 1 labels: severity: critical annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" + summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` down" description: > The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - is being reported as down. This can affect compute operations so it must be resolved + is being reported as down. This can affect compute operations so it must be resolved as quickly as possible. - alert: NovaAgentDisabled for: 1h - expr: | - openstack_nova_agent_state{adminState!="enabled"} + expr: openstack_nova_agent_state{adminState!="enabled"} labels: severity: warning annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" + summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` disabled" description: > The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - has been disabled for 60 minutes. This can affect compute operations so it must be resolved + has been disabled for 60 minutes. This can affect compute operations so it must be resolved as quickly as possible. - alert: NovaInstanceInError for: 10m - expr: | - openstack_nova_server_status{status="ERROR"} + expr: openstack_nova_server_status{status="ERROR"} labels: severity: critical annotations: - summary: "[`{{`{{$labels.id}}`}}`] Instance in ERROR state" + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" description: > - The instance `{{`{{$labels.id}}`}}` has been in ERROR state for over 10 minutes. It must - be cleaned up or removed in order to provide a consistent customer experience. + The instance `{{`{{$labels.id}}`}}` on host `{{`{{$labels.hypervisor_hostname}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 10 minutes. + It must be cleaned up or removed in order to provide a consistent customer experience. - alert: NovaInstanceInBuilding for: 15m - expr: 'openstack_nova_server_status == 1' + expr: openstack_nova_server_status == 1 labels: severity: critical annotations: - summary: "[`{{`{{$labels.id}}`}}`] Instance in BUILD state" + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" description: > - The instance `{{`{{$labels.id}}`}}` has been in BUILD state for over 15 minutes. + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. - alert: NovaInstanceInRESIZE for: 15m - expr: 'openstack_nova_server_status == 10' + expr: openstack_nova_server_status == 10 labels: severity: critical annotations: - summary: "[`{{`{{$labels.id}}`}}`] Instance in RESIZE state" + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" description: > - The instance `{{`{{$labels.id}}`}}` has been in RESIZE state for over 15 minutes. + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. - alert: NovaInstanceInUNKNOWN for: 15m - expr: 'openstack_nova_server_status == 13' + expr: openstack_nova_server_status == 13 labels: severity: critical annotations: - summary: "[`{{`{{$labels.id}}`}}`] Instance in UNKNOWN state" + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" description: > - The instance `{{`{{$labels.id}}`}}` has been in UNKNOWN state for over 15 minutes. + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. - alert: NovaInstanceInVERIFY_RESIZE for: 15m - expr: 'openstack_nova_server_status == 14' + expr: openstack_nova_server_status == 14 labels: severity: critical annotations: - summary: "[`{{`{{$labels.id}}`}}`] Instance in VERIFY_RESIZE state" + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" description: > - The instance `{{`{{$labels.id}}`}}` has been in VERIFY_RESIZE state for over 15 minutes. + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. - alert: NovaInstanceInMIGRATING for: 30m - expr: 'openstack_nova_server_status == 15' + expr: openstack_nova_server_status == 15 labels: severity: critical annotations: - summary: "[`{{`{{$labels.id}}`}}`] Instance in MIGRATING state" + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" description: > - The instance `{{`{{$labels.id}}`}}` has been in MIGRATING state for over 30 minutes. + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 30 minutes. - alert: NovaFailureRisk for: 6h @@ -337,7 +328,7 @@ promethuesRules: description: > The cloud capacity will be at `{{`{{$value}}`}}` in the event of the failure of a single hypervisor which puts the cloud at risk of not being able to recover should any hypervisor - failures occur. Please ensure that adequate amount of infrastructure is assigned to this + failures occur. Please ensure that adequate amount of infrastructure is assigned to this deployment to prevent this. - alert: NovaCapacityNearFull @@ -379,7 +370,7 @@ promethuesRules: summary: "[nova] Capacity risk" description: > The cloud capacity is currently at `{{`{{$value}}`}}` which means there is a risk of running - out of capacity due to the timeline required to add new nodes. Please ensure that adequate + out of capacity due to the timeline required to add new nodes. Please ensure that adequate amount of infrastructure is assigned to this deployment to prevent this. octavia: @@ -387,7 +378,7 @@ promethuesRules: rules: - alert: LoadbalancerDown for: 5m - expr: 'openstack_loadbalancer_up != 1' + expr: openstack_loadbalancer_up != 1 labels: severity: critical annotations: @@ -401,7 +392,7 @@ promethuesRules: severity: critical annotations: summary: "OpenStack loadbalancer `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" - description: "OpenStack loadbalancer `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" + description: "OpenStack loadbalancer `{{`{{$labels.id}}`}}` provisioning status is `{{`{{$labels.provisioning_status}}`}}`" - alert: LoadbalancerPoolNotActive for: 5m @@ -410,4 +401,25 @@ promethuesRules: severity: critical annotations: summary: "OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" - description: "OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" + description: "OpenStack loadbalancer pool `{{`{{$labels.id}}`}}` provisioning status is `{{`{{$labels.provisioning_status}}`}}`" + + manila: + enabled: true + rules: + - alert: ManilaDown + for: 5m + expr: openstack_sharev2_up != 1 + labels: + severity: critical + annotations: + summary: "OpenStack shared file system (Manila) service down" + description: "OpenStack shared file system (Manila) service down" + + - alert: ManilaStatusNotActive + for: 5m + expr: openstack_sharev2_share_status{status!="available"} + labels: + severity: critical + annotations: + summary: "OpenStack Share `{{`{{$labels.name}}`}}` status is not ACTIVE" + description: "OpenStack Share `{{`{{$labels.id}}`}}` status is `{{`{{$labels.status}}`}}`" From 0ff28234e6c7c3292ae4a0e8ddd5a6fe99d1b7d7 Mon Sep 17 00:00:00 2001 From: Saeed Padari Date: Sun, 13 Apr 2025 23:52:36 +0330 Subject: [PATCH 05/12] rule(cinder): add new rule for volume status --- README.md | 30 ++++++++++- .../prometheus-openstack-exporter/Chart.yaml | 2 +- .../templates/service.yaml | 2 +- .../prometheus-openstack-exporter/values.yaml | 50 +++++++++++++++++++ 4 files changed, 81 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 34f23795a..de902c973 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ git clone https://github.com/openstack-exporter/helm-charts.git # Package the chart cd helm-charts/charts/prometheus-openstack-exporter/ -helm package . +helm package . # Get chart version & install version="$(awk '/^version:/{ print $NF }' Chart.yaml)" @@ -26,3 +26,31 @@ helm install prometheus-openstack-exporter prometheus-openstack-exporter-${versi ## Contributing Please fill pull requests or issues under Github. + + + +## OpenStack volumes can be in the following states: +openstack_cinder_volume_status: + +| Status | Value | +|---------------------|---------| +|"creating" | 0 | +|"available" | 1 | +|"reserved" | 2 | +|"attaching" | 3 | +|"detaching" | 4 | +|"in-use" | 5 | +|"maintenance" | 6 | +|"deleting" | 7 | +|"awaiting-transfer" | 8 | +|"error" | 9 | +|"error_deleting" | 10 | +|"backing-up" | 11 | +|"restoring-backup" | 12 | +|"error_backing-up" | 13 | +|"error_restoring" | 14 | +|"error_extending" | 15 | +|"downloading" | 16 | +|"uploading" | 17 | +|"retyping" | 18 | +|"extending" | 19 | diff --git a/charts/prometheus-openstack-exporter/Chart.yaml b/charts/prometheus-openstack-exporter/Chart.yaml index 9d19eafe1..e331de5c7 100644 --- a/charts/prometheus-openstack-exporter/Chart.yaml +++ b/charts/prometheus-openstack-exporter/Chart.yaml @@ -1,5 +1,5 @@ --- apiVersion: v1 name: prometheus-openstack-exporter -version: 0.5.2 +version: 0.5.3 appVersion: v1.7.0 diff --git a/charts/prometheus-openstack-exporter/templates/service.yaml b/charts/prometheus-openstack-exporter/templates/service.yaml index 95aa278ee..7c8024493 100644 --- a/charts/prometheus-openstack-exporter/templates/service.yaml +++ b/charts/prometheus-openstack-exporter/templates/service.yaml @@ -13,4 +13,4 @@ spec: port: 9180 targetPort: metrics selector: -{{- include "openstack-exporter.labels" . | indent 4 }} \ No newline at end of file +{{- include "openstack-exporter.labels" . | indent 4 }} diff --git a/charts/prometheus-openstack-exporter/values.yaml b/charts/prometheus-openstack-exporter/values.yaml index 84eef4541..56cd9b0c1 100644 --- a/charts/prometheus-openstack-exporter/values.yaml +++ b/charts/prometheus-openstack-exporter/values.yaml @@ -142,6 +142,26 @@ promethuesRules: The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. It must be cleaned up or removed in order to provide a consistent customer experience. + - alert: CinderVolumeInAttaching + expr: openstack_cinder_volume_status == 3 + for: 10m + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in ATTACHING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 10 minutes. + + - alert: CinderVolumeInDetaching + expr: openstack_cinder_volume_status == 4 + for: 10m + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in DETACHING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 10 minutes. + - alert: CinderVolumeInDeleting expr: openstack_cinder_volume_status == 7 for: 15m @@ -162,6 +182,36 @@ promethuesRules: description: > The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + - alert: CinderVolumeInBackingUp + expr: openstack_cinder_volume_status == 12 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in BackingUp state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeInRestoringBackup + expr: openstack_cinder_volume_status == 12 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in RESTORING-BACKUP state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeInExtending + expr: openstack_cinder_volume_status == 19 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in EXTENDING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + neutron: enabled: true rules: From 69683a759d98d52ce8b300fe9e03eb3e0ee8bd81 Mon Sep 17 00:00:00 2001 From: Saeed Padari Date: Sat, 31 May 2025 18:05:29 +0330 Subject: [PATCH 06/12] update rule for all volume status --- .../prometheus-openstack-exporter/Chart.yaml | 2 +- .../prometheus-openstack-exporter/values.yaml | 69 +++++++++++++++++-- 2 files changed, 66 insertions(+), 5 deletions(-) diff --git a/charts/prometheus-openstack-exporter/Chart.yaml b/charts/prometheus-openstack-exporter/Chart.yaml index e331de5c7..1c1896ae0 100644 --- a/charts/prometheus-openstack-exporter/Chart.yaml +++ b/charts/prometheus-openstack-exporter/Chart.yaml @@ -1,5 +1,5 @@ --- apiVersion: v1 name: prometheus-openstack-exporter -version: 0.5.3 +version: 0.5.4 appVersion: v1.7.0 diff --git a/charts/prometheus-openstack-exporter/values.yaml b/charts/prometheus-openstack-exporter/values.yaml index 56cd9b0c1..e80b158dc 100644 --- a/charts/prometheus-openstack-exporter/values.yaml +++ b/charts/prometheus-openstack-exporter/values.yaml @@ -131,6 +131,7 @@ promethuesRules: has been disabled for 60 minutes. This can affect volume operations so it must be resolved as quickly as possible. + # value = (9 | 10 | 13 | 14 | 15) - alert: CinderVolumeInError for: 15m expr: openstack_cinder_volume_status{status=~"error.*"} @@ -142,6 +143,26 @@ promethuesRules: The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. It must be cleaned up or removed in order to provide a consistent customer experience. + - alert: CinderVolumeInCreating + expr: openstack_cinder_volume_status == 0 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in CREATING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeInReserved + expr: openstack_cinder_volume_status == 2 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in RESERVED state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + - alert: CinderVolumeInAttaching expr: openstack_cinder_volume_status == 3 for: 10m @@ -162,6 +183,16 @@ promethuesRules: description: > The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 10 minutes. + - alert: CinderVolumeInMaintenance + expr: openstack_cinder_volume_status == 6 + for: 20m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in MAINTENANCE state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + - alert: CinderVolumeInDeleting expr: openstack_cinder_volume_status == 7 for: 15m @@ -172,18 +203,18 @@ promethuesRules: description: > The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. - - alert: CinderVolumeInCreating - expr: openstack_cinder_volume_status == 0 + - alert: CinderVolumeInAwaitingTransfer + expr: openstack_cinder_volume_status == 8 for: 15m labels: severity: warning annotations: - summary: "`{{`{{$labels.name}}`}}` Volume in CREATING state" + summary: "`{{`{{$labels.name}}`}}` Volume in AwaitingTransfer state" description: > The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. - alert: CinderVolumeInBackingUp - expr: openstack_cinder_volume_status == 12 + expr: openstack_cinder_volume_status == 11 for: 15m labels: severity: warning @@ -202,6 +233,36 @@ promethuesRules: description: > The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + - alert: CinderVolumeInDownloading + expr: openstack_cinder_volume_status == 16 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in DOWNLOADING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeInUploading + expr: openstack_cinder_volume_status == 17 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in UPLOADING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeInRetyping + expr: openstack_cinder_volume_status == 18 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in RETYPING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + - alert: CinderVolumeInExtending expr: openstack_cinder_volume_status == 19 for: 15m From 8a883949184da3d8c9275f18b53d7767d27992cc Mon Sep 17 00:00:00 2001 From: Saeed Padari Date: Sun, 1 Jun 2025 10:50:06 +0330 Subject: [PATCH 07/12] new rule for qouta --- README.md | 29 ++- .../prometheus-openstack-exporter/Chart.yaml | 2 +- .../prometheus-openstack-exporter/values.yaml | 189 +++++++++++++++++- 3 files changed, 217 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index de902c973..558a9b608 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Please fill pull requests or issues under Github. -## OpenStack volumes can be in the following states: +## OpenStack volumes can be in the following status: openstack_cinder_volume_status: | Status | Value | @@ -54,3 +54,30 @@ openstack_cinder_volume_status: |"uploading" | 17 | |"retyping" | 18 | |"extending" | 19 | + +## OpenStack server can be in the following status: +openstack_nova_server_status: + +| Status | Value | Description +|-------------------|-------|--------------------| +| ACTIVE | 0 | +| BUILD | 1 | The server has not finished the original build process. +| BUILD(spawning) | 2 | The server has not finished the original build process but networking works (HP Cloud specific) +| DELETED | 3 | The server is deleted. +| ERROR | 4 | The server is in error. +| HARD_REBOOT | 5 | The server is hard rebooting. +| PASSWORD | 6 | The password is being reset on the server. +| REBOOT | 7 | The server is in a soft reboot state. +| REBUILD | 8 | The server is currently being rebuilt from an image. +| RESCUE | 9 | The server is in rescue mode. +| RESIZE | 10 | Server is performing the differential copy of data that changed during its initial copy. +| SHUTOFF | 11 | The virtual machine (VM) was powered down by the user, but not through the OpenStack Compute API. +| SUSPENDED | 12 | The server is suspended, either by request or necessity. +| UNKNOWN | 13 | The state of the server is unknown. Contact your cloud provider. +| VERIFY_RESIZE | 14 | System is awaiting confirmation that the server is operational after a move or resize. +| MIGRATING | 15 | The server is migrating. This is caused by a live migration (moving a server that is active) action. +| PAUSED | 16 | The server is paused. +| REVERT_RESIZE | 17 | The resize or migration of a server failed for some reason. The destination server is being cleaned up and the original source server is restarting. +| SHELVED | 18 | The server is in shelved state. Depends on the shelve offload time, the server will be automatically shelved off loaded. +| SHELVED_OFFLOADED | 19 | The shelved server is offloaded (removed from the compute host) and it needs unshelved action to be used again. +| SOFT_DELETED | 20 | The server is marked as deleted but will remain in the cloud for some configurable amount of time. diff --git a/charts/prometheus-openstack-exporter/Chart.yaml b/charts/prometheus-openstack-exporter/Chart.yaml index 1c1896ae0..e243340e8 100644 --- a/charts/prometheus-openstack-exporter/Chart.yaml +++ b/charts/prometheus-openstack-exporter/Chart.yaml @@ -1,5 +1,5 @@ --- apiVersion: v1 name: prometheus-openstack-exporter -version: 0.5.4 +version: 0.6.1 appVersion: v1.7.0 diff --git a/charts/prometheus-openstack-exporter/values.yaml b/charts/prometheus-openstack-exporter/values.yaml index e80b158dc..b518978a3 100644 --- a/charts/prometheus-openstack-exporter/values.yaml +++ b/charts/prometheus-openstack-exporter/values.yaml @@ -95,6 +95,15 @@ promethuesRules: summary: "OpenStack Glance service down" description: "OpenStack Glance service down" + - alert: GlanceImageStatusNotActive + for: 5m + expr: openstack_glance_image_created_at{status!="active"} + labels: + severity: warning + annotations: + summary: "OpenStack Image `{{`{{$labels.name}}`}}` status is not ACTIVE" + description: "OpenStack Image `{{`{{$labels.id}}`}}` status is `{{`{{$labels.status}}`}}`" + cinder: enabled: true rules: @@ -273,6 +282,102 @@ promethuesRules: description: > The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + - alert: CinderVolumeQuotaAlmostFull + expr: (openstack_cinder_limits_volume_max_gb - openstack_cinder_limits_volume_used_gb) < 200 + for: 30m + labels: + severity: warning + annotations: + summary: "Cinder volume quota almost full for project `{{ $labels.tenant }}`" + description: > + Project '{{ $labels.tenant_id }}' is nearing its Cinder volume quota limit. + Only {{ $value }} GB of volume space remains out of the allocated quota. + Consider cleaning up unused volumes or requesting a quota increase. + + - alert: CinderVolumeQuotaFull + expr: (openstack_cinder_limits_volume_max_gb - openstack_cinder_limits_volume_used_gb) < 50 + for: 5m + labels: + severity: critical + annotations: + summary: "Cinder volume quota full for project `{{ $labels.tenant }}`" + description: > + Project '{{ $labels.tenant_id }}' is nearing its Cinder volume quota limit. + Only {{ $value }} GB of volume space remains out of the allocated quota. + Consider cleaning up unused volumes or requesting a quota increase. + + - alert: CinderBackupQuotaAlmostFull + expr: (openstack_cinder_limits_backup_max_gb - openstack_cinder_limits_backup_used_gb) < 200 + for: 30m + labels: + severity: warning + annotations: + summary: "Cinder backup quota almost full for project `{{ $labels.tenant }}`" + description: > + Project '{{ $labels.tenant_id }}' is nearing its Cinder backup quota limit. + Only {{ $value }} GB of backup space remains out of the allocated quota. + Consider cleaning up unused backup or requesting a quota increase. + + - alert: CinderBackupQuotaFull + expr: (openstack_cinder_limits_backup_max_gb - openstack_cinder_limits_backup_used_gb) < 50 + for: 5m + labels: + severity: critical + annotations: + summary: "Cinder backup quota full for project `{{ $labels.tenant }}`" + description: > + Project '{{ $labels.tenant_id }}' is nearing its Cinder volume quota limit. + Only {{ $value }} GB of backup space remains out of the allocated quota. + Consider cleaning up unused backup or requesting a quota increase. + + placement: + enabled: true + rules: + - alert: PlacementDown + for: 5m + expr: openstack_placement_up != 1 + labels: + severity: critical + annotations: + summary: "OpenStack Placement service down" + description: "OpenStack Placement service down" + + - alert: PlacementLowMemoryResource + for: 5m + expr: | + ( openstack_placement_resource_allocation_ratio{resourcetype="MEMORY_MB"} * on (hostname) + openstack_placement_resource_total{resourcetype="MEMORY_MB"} + ) - + ( openstack_placement_resource_usage{resourcetype="MEMORY_MB"} + on (hostname) + openstack_placement_resource_reserved{resourcetype="MEMORY_MB"} + ) < 32768 + labels: + severity: warning + annotations: + summary: "Low memory on host `{{ $labels.hostname }}`" + description: > + The available memory (after accounting for usage and reserved memory) on host {{ $labels.hostname }} + is below 32GB. This could lead to scheduling issues or performance degradation for new instances. + Consider investigating memory usage or increasing available resources. + + - alert: PlacementLowMemoryResource + for: 5m + expr: | + ( openstack_placement_resource_allocation_ratio{resourcetype="MEMORY_MB"} * on (hostname) + openstack_placement_resource_total{resourcetype="MEMORY_MB"} + ) - + ( openstack_placement_resource_usage{resourcetype="MEMORY_MB"} + on (hostname) + openstack_placement_resource_reserved{resourcetype="MEMORY_MB"} + ) < 12288 + labels: + severity: critical + annotations: + summary: "Low memory on host `{{ $labels.hostname }}`" + description: > + The available memory (after accounting for usage and reserved memory) on host {{ $labels.hostname }} + is below 12GB. This could lead to scheduling issues or performance degradation for new instances. + Consider investigating memory usage or increasing available resources. + neutron: enabled: true rules: @@ -331,6 +436,15 @@ promethuesRules: is currently at `{{`{{$value}}`}}`% utilization. If the IP addresses run out, it will impact the provisioning of new ports. + - alert: NeutronrouterNotActive + for: 5m + expr: openstack_neutron_router{status!="ACTIVE"} + labels: + severity: critical + annotations: + summary: "OpenStack neutron router `{{`{{$labels.name}}`}}` status is not ACTIVE" + description: "OpenStack neutron router `{{`{{$labels.id}}`}}` status is `{{`{{$labels.status}}`}}`" + nova: enabled: true rules: @@ -388,6 +502,46 @@ promethuesRules: description: > The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + - alert: NovaInstanceInSpawning + for: 10m + expr: openstack_nova_server_status == 2 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: NovaInstanceInHardReboot + for: 30m + expr: openstack_nova_server_status == 5 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 30 minutes. + + - alert: NovaInstanceInReboot + for: 30m + expr: openstack_nova_server_status == 7 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 30 minutes. + + - alert: NovaInstanceInRebuild + for: 30m + expr: openstack_nova_server_status == 8 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 30 minutes. + - alert: NovaInstanceInRESIZE for: 15m expr: openstack_nova_server_status == 10 @@ -409,7 +563,7 @@ promethuesRules: The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. - alert: NovaInstanceInVERIFY_RESIZE - for: 15m + for: 10m expr: openstack_nova_server_status == 14 labels: severity: critical @@ -484,6 +638,39 @@ promethuesRules: out of capacity due to the timeline required to add new nodes. Please ensure that adequate amount of infrastructure is assigned to this deployment to prevent this. + - alert: NovaInstanceQuotaAlmostFull + expr: (openstack_nova_limits_instances_max - openstack_nova_limits_instances_used) < 2 + for: 15m + labels: + severity: warning + annotations: + summary: "Nova instance quota almost full for project `{{ $labels.tenant }}`" + description: > + Project '{{ $labels.tenant_id }}' is nearing its Nova instance quota limit. + Only {{ $value }} instance remains to reach quota. + + - alert: NovaMemoryQuotaAlmostFull + expr: (openstack_nova_limits_memory_max - openstack_nova_limits_memory_used) < 12288 + for: 10m + labels: + severity: warning + annotations: + summary: "Nova memory quota almost full for project `{{ $labels.tenant }}`" + description: > + Project '{{ $labels.tenant_id }}' is nearing its Nova memory quota limit. + Only {{ $value }} MB memory remains to reach quota. + + - alert: NovaCpuQuotaAlmostFull + expr: (openstack_nova_limits_vcpus_max - openstack_nova_limits_vcpus_used) < 10 + for: 10m + labels: + severity: warning + annotations: + summary: "Nova vcpu quota almost full for project `{{ $labels.tenant }}`" + description: > + Project '{{ $labels.tenant_id }}' is nearing its Nova vcpu quota limit. + Only {{ $value }} vcpu remains to reach quota. + octavia: enabled: true rules: From a8891d2e1e7349bb9e7eefdc34d71982a9ad187d Mon Sep 17 00:00:00 2001 From: Saeed Padari Date: Sun, 1 Jun 2025 11:19:03 +0330 Subject: [PATCH 08/12] fix typo --- .../prometheus-openstack-exporter/values.yaml | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/charts/prometheus-openstack-exporter/values.yaml b/charts/prometheus-openstack-exporter/values.yaml index b518978a3..9cf5f0da5 100644 --- a/charts/prometheus-openstack-exporter/values.yaml +++ b/charts/prometheus-openstack-exporter/values.yaml @@ -288,10 +288,10 @@ promethuesRules: labels: severity: warning annotations: - summary: "Cinder volume quota almost full for project `{{ $labels.tenant }}`" + summary: "Cinder volume quota almost full for project `{{`{{$labels.tenant}}`}}`" description: > - Project '{{ $labels.tenant_id }}' is nearing its Cinder volume quota limit. - Only {{ $value }} GB of volume space remains out of the allocated quota. + Project `{{`{{$labels.tenant_id}}`}}` is nearing its Cinder volume quota limit. + Only `{{`{{$value}}`}}` GB of volume space remains out of the allocated quota. Consider cleaning up unused volumes or requesting a quota increase. - alert: CinderVolumeQuotaFull @@ -300,10 +300,10 @@ promethuesRules: labels: severity: critical annotations: - summary: "Cinder volume quota full for project `{{ $labels.tenant }}`" + summary: "Cinder volume quota full for project `{{`{{$labels.tenant}}`}}`" description: > - Project '{{ $labels.tenant_id }}' is nearing its Cinder volume quota limit. - Only {{ $value }} GB of volume space remains out of the allocated quota. + Project `{{`{{$labels.tenant_id}}`}}` is nearing its Cinder volume quota limit. + Only `{{`{{$value}}`}}` GB of volume space remains out of the allocated quota. Consider cleaning up unused volumes or requesting a quota increase. - alert: CinderBackupQuotaAlmostFull @@ -312,10 +312,10 @@ promethuesRules: labels: severity: warning annotations: - summary: "Cinder backup quota almost full for project `{{ $labels.tenant }}`" + summary: "Cinder backup quota almost full for project `{{`{{$labels.tenant}}`}}`" description: > - Project '{{ $labels.tenant_id }}' is nearing its Cinder backup quota limit. - Only {{ $value }} GB of backup space remains out of the allocated quota. + Project `{{`{{$labels.tenant_id}}`}}` is nearing its Cinder backup quota limit. + Only `{{`{{$value}}`}}` GB of backup space remains out of the allocated quota. Consider cleaning up unused backup or requesting a quota increase. - alert: CinderBackupQuotaFull @@ -324,10 +324,10 @@ promethuesRules: labels: severity: critical annotations: - summary: "Cinder backup quota full for project `{{ $labels.tenant }}`" + summary: "Cinder backup quota full for project `{{`{{$labels.tenant}}`}}`" description: > - Project '{{ $labels.tenant_id }}' is nearing its Cinder volume quota limit. - Only {{ $value }} GB of backup space remains out of the allocated quota. + Project `{{`{{$labels.tenant_id}}`}}` is nearing its Cinder volume quota limit. + Only `{{`{{$value}}`}}` GB of backup space remains out of the allocated quota. Consider cleaning up unused backup or requesting a quota increase. placement: @@ -354,9 +354,9 @@ promethuesRules: labels: severity: warning annotations: - summary: "Low memory on host `{{ $labels.hostname }}`" + summary: "Low memory on host `{{`{{$labels.hostname}}`}}`" description: > - The available memory (after accounting for usage and reserved memory) on host {{ $labels.hostname }} + The available memory (after accounting for usage and reserved memory) on host `{{`{{$labels.hostname}}`}}` is below 32GB. This could lead to scheduling issues or performance degradation for new instances. Consider investigating memory usage or increasing available resources. @@ -372,9 +372,9 @@ promethuesRules: labels: severity: critical annotations: - summary: "Low memory on host `{{ $labels.hostname }}`" + summary: "Low memory on host `{{`{{$labels.hostname}}`}}`" description: > - The available memory (after accounting for usage and reserved memory) on host {{ $labels.hostname }} + The available memory (after accounting for usage and reserved memory) on host `{{`{{$labels.hostname}}`}}` is below 12GB. This could lead to scheduling issues or performance degradation for new instances. Consider investigating memory usage or increasing available resources. @@ -644,10 +644,10 @@ promethuesRules: labels: severity: warning annotations: - summary: "Nova instance quota almost full for project `{{ $labels.tenant }}`" + summary: "Nova instance quota almost full for project `{{`{{$labels.tenant}}`}}`" description: > - Project '{{ $labels.tenant_id }}' is nearing its Nova instance quota limit. - Only {{ $value }} instance remains to reach quota. + Project `{{`{{$labels.tenant_id}}`}}` is nearing its Nova instance quota limit. + Only `{{`{{$value}}`}}` instance remains to reach quota. - alert: NovaMemoryQuotaAlmostFull expr: (openstack_nova_limits_memory_max - openstack_nova_limits_memory_used) < 12288 @@ -655,10 +655,10 @@ promethuesRules: labels: severity: warning annotations: - summary: "Nova memory quota almost full for project `{{ $labels.tenant }}`" + summary: "Nova memory quota almost full for project `{{`{{$labels.tenant}}`}}`" description: > - Project '{{ $labels.tenant_id }}' is nearing its Nova memory quota limit. - Only {{ $value }} MB memory remains to reach quota. + Project `{{`{{$labels.tenant_id}}`}}` is nearing its Nova memory quota limit. + Only `{{`{{$value}}`}}` MB memory remains to reach quota. - alert: NovaCpuQuotaAlmostFull expr: (openstack_nova_limits_vcpus_max - openstack_nova_limits_vcpus_used) < 10 @@ -666,10 +666,10 @@ promethuesRules: labels: severity: warning annotations: - summary: "Nova vcpu quota almost full for project `{{ $labels.tenant }}`" + summary: "Nova vcpu quota almost full for project `{{`{{$labels.tenant}}`}}`" description: > - Project '{{ $labels.tenant_id }}' is nearing its Nova vcpu quota limit. - Only {{ $value }} vcpu remains to reach quota. + Project `{{`{{$labels.tenant_id}}`}}` is nearing its Nova vcpu quota limit. + Only `{{`{{$value}}`}}` vcpu remains to reach quota. octavia: enabled: true From a9f3243aaa14673467c01a09b22232639d77c5dc Mon Sep 17 00:00:00 2001 From: Saeed Padari Date: Sun, 1 Jun 2025 11:19:49 +0330 Subject: [PATCH 09/12] update version --- charts/prometheus-openstack-exporter/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/prometheus-openstack-exporter/Chart.yaml b/charts/prometheus-openstack-exporter/Chart.yaml index e243340e8..9081f970a 100644 --- a/charts/prometheus-openstack-exporter/Chart.yaml +++ b/charts/prometheus-openstack-exporter/Chart.yaml @@ -1,5 +1,5 @@ --- apiVersion: v1 name: prometheus-openstack-exporter -version: 0.6.1 +version: 0.6.2 appVersion: v1.7.0 From ce9c27ac0ab310962cb0c1e157723accbf1cc819 Mon Sep 17 00:00:00 2001 From: Saeed Padari Date: Mon, 11 Aug 2025 14:45:18 +0330 Subject: [PATCH 10/12] tune rules for cinder and neutron --- charts/prometheus-openstack-exporter/values.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/charts/prometheus-openstack-exporter/values.yaml b/charts/prometheus-openstack-exporter/values.yaml index 9cf5f0da5..9ea864284 100644 --- a/charts/prometheus-openstack-exporter/values.yaml +++ b/charts/prometheus-openstack-exporter/values.yaml @@ -142,7 +142,7 @@ promethuesRules: # value = (9 | 10 | 13 | 14 | 15) - alert: CinderVolumeInError - for: 15m + for: 5m expr: openstack_cinder_volume_status{status=~"error.*"} labels: severity: critical @@ -156,7 +156,7 @@ promethuesRules: expr: openstack_cinder_volume_status == 0 for: 15m labels: - severity: warning + severity: critical annotations: summary: "`{{`{{$labels.name}}`}}` Volume in CREATING state" description: > @@ -206,7 +206,7 @@ promethuesRules: expr: openstack_cinder_volume_status == 7 for: 15m labels: - severity: warning + severity: critical annotations: summary: "`{{`{{$labels.name}}`}}` Volume in DELETING state" description: > @@ -276,7 +276,7 @@ promethuesRules: expr: openstack_cinder_volume_status == 19 for: 15m labels: - severity: warning + severity: critical annotations: summary: "`{{`{{$labels.name}}`}}` Volume in EXTENDING state" description: > @@ -416,6 +416,7 @@ promethuesRules: - alert: NeutronBindingFailedPorts expr: openstack_neutron_port{binding_vif_type="binding_failed"} != 0 + for: 2m labels: severity: warning annotations: @@ -427,6 +428,7 @@ promethuesRules: - alert: NeutronNetworkOutOfIPs expr: | sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""}) / sum by (network_id) (openstack_neutron_network_ip_availabilities_total{project_id!=""}) * 100 > 80 + for: 15m labels: severity: warning annotations: From c367684117d5268eb106c37d4456507edbbf4c90 Mon Sep 17 00:00:00 2001 From: Saeed Padari Date: Mon, 11 Aug 2025 14:46:41 +0330 Subject: [PATCH 11/12] update chart version --- charts/prometheus-openstack-exporter/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/prometheus-openstack-exporter/Chart.yaml b/charts/prometheus-openstack-exporter/Chart.yaml index 9081f970a..595babd5e 100644 --- a/charts/prometheus-openstack-exporter/Chart.yaml +++ b/charts/prometheus-openstack-exporter/Chart.yaml @@ -1,5 +1,5 @@ --- apiVersion: v1 name: prometheus-openstack-exporter -version: 0.6.2 +version: 0.6.3 appVersion: v1.7.0 From f6901bea98b97bc8e2ad72a63cdad9db7649fb8c Mon Sep 17 00:00:00 2001 From: Saeed Padari Date: Mon, 6 Oct 2025 15:24:18 +0330 Subject: [PATCH 12/12] remove NovaAgentDisabled rule --- charts/prometheus-openstack-exporter/values.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/charts/prometheus-openstack-exporter/values.yaml b/charts/prometheus-openstack-exporter/values.yaml index 9ea864284..570825666 100644 --- a/charts/prometheus-openstack-exporter/values.yaml +++ b/charts/prometheus-openstack-exporter/values.yaml @@ -471,18 +471,6 @@ promethuesRules: is being reported as down. This can affect compute operations so it must be resolved as quickly as possible. - - alert: NovaAgentDisabled - for: 1h - expr: openstack_nova_agent_state{adminState!="enabled"} - labels: - severity: warning - annotations: - summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` disabled" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - has been disabled for 60 minutes. This can affect compute operations so it must be resolved - as quickly as possible. - - alert: NovaInstanceInError for: 10m expr: openstack_nova_server_status{status="ERROR"}