diff --git a/README.md b/README.md index 34f23795a..558a9b608 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ git clone https://github.com/openstack-exporter/helm-charts.git # Package the chart cd helm-charts/charts/prometheus-openstack-exporter/ -helm package . +helm package . # Get chart version & install version="$(awk '/^version:/{ print $NF }' Chart.yaml)" @@ -26,3 +26,58 @@ helm install prometheus-openstack-exporter prometheus-openstack-exporter-${versi ## Contributing Please fill pull requests or issues under Github. + + + +## OpenStack volumes can be in the following status: +openstack_cinder_volume_status: + +| Status | Value | +|---------------------|---------| +|"creating" | 0 | +|"available" | 1 | +|"reserved" | 2 | +|"attaching" | 3 | +|"detaching" | 4 | +|"in-use" | 5 | +|"maintenance" | 6 | +|"deleting" | 7 | +|"awaiting-transfer" | 8 | +|"error" | 9 | +|"error_deleting" | 10 | +|"backing-up" | 11 | +|"restoring-backup" | 12 | +|"error_backing-up" | 13 | +|"error_restoring" | 14 | +|"error_extending" | 15 | +|"downloading" | 16 | +|"uploading" | 17 | +|"retyping" | 18 | +|"extending" | 19 | + +## OpenStack server can be in the following status: +openstack_nova_server_status: + +| Status | Value | Description +|-------------------|-------|--------------------| +| ACTIVE | 0 | +| BUILD | 1 | The server has not finished the original build process. +| BUILD(spawning) | 2 | The server has not finished the original build process but networking works (HP Cloud specific) +| DELETED | 3 | The server is deleted. +| ERROR | 4 | The server is in error. +| HARD_REBOOT | 5 | The server is hard rebooting. +| PASSWORD | 6 | The password is being reset on the server. +| REBOOT | 7 | The server is in a soft reboot state. +| REBUILD | 8 | The server is currently being rebuilt from an image. +| RESCUE | 9 | The server is in rescue mode. +| RESIZE | 10 | Server is performing the differential copy of data that changed during its initial copy. 
+| SHUTOFF | 11 | The virtual machine (VM) was powered down by the user, but not through the OpenStack Compute API. +| SUSPENDED | 12 | The server is suspended, either by request or necessity. +| UNKNOWN | 13 | The state of the server is unknown. Contact your cloud provider. +| VERIFY_RESIZE | 14 | System is awaiting confirmation that the server is operational after a move or resize. +| MIGRATING | 15 | The server is migrating. This is caused by a live migration (moving a server that is active) action. +| PAUSED | 16 | The server is paused. +| REVERT_RESIZE | 17 | The resize or migration of a server failed for some reason. The destination server is being cleaned up and the original source server is restarting. +| SHELVED | 18 | The server is in shelved state. Depends on the shelve offload time, the server will be automatically shelved off loaded. +| SHELVED_OFFLOADED | 19 | The shelved server is offloaded (removed from the compute host) and it needs unshelved action to be used again. +| SOFT_DELETED | 20 | The server is marked as deleted but will remain in the cloud for some configurable amount of time. diff --git a/charts/prometheus-openstack-exporter/Chart.yaml b/charts/prometheus-openstack-exporter/Chart.yaml index b9c89e720..595babd5e 100644 --- a/charts/prometheus-openstack-exporter/Chart.yaml +++ b/charts/prometheus-openstack-exporter/Chart.yaml @@ -1,5 +1,5 @@ --- apiVersion: v1 name: prometheus-openstack-exporter -version: 0.4.3 -appVersion: v1.6.0 +version: 0.6.3 +appVersion: v1.7.0 diff --git a/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml b/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml index 0b88676c5..563b68053 100644 --- a/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml +++ b/charts/prometheus-openstack-exporter/templates/prometheusrule.yaml @@ -5,206 +5,52 @@ metadata: name: {{ include "openstack-exporter.fullname" . 
}} namespace: {{ .Release.Namespace }} labels: -{{ include "openstack-exporter.labels" . | indent 4 }} + {{- include "openstack-exporter.labels" . | indent 4 }} spec: groups: - - name: cinder + {{- range $groupName, $group := .Values.promethuesRules }} + {{- if (dig "enabled" true $group )}} + - name: {{ $groupName }} rules: - - alert: CinderAgentDown - expr: | - openstack_cinder_agent_state != 1 - labels: - severity: P4 - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - is being reported as down. - - - alert: CinderAgentDown - for: 5m - expr: | - openstack_cinder_agent_state != 1 - labels: - severity: P3 - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - is being reported as down for 5 minutes. This can affect volume operations so it must - be resolved as quickly as possible. - - - alert: CinderAgentDisabled - for: 1h - expr: | - openstack_cinder_agent_state{adminState!="enabled"} - labels: - severity: P5 - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - has been disabled for 60 minutes. This can affect volume operations so it must be resolved - as quickly as possible. - - - alert: CinderVolumeInError - for: 24h - expr: | - openstack_cinder_volume_status{status=~"error.*"} - labels: - severity: P4 - annotations: - summary: "[`{{`{{$labels.id}}`}}`] Volume in ERROR state" - description: > - The volume `{{`{{$labels.id}}`}}` has been in ERROR state for over 24 hours. It must - be cleaned up or removed in order to provide a consistent customer experience. 
- - - - name: neutron - rules: - - alert: NeutronAgentDown - expr: | - openstack_neutron_agent_state != 1 - labels: - severity: P4 - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - is being reported as down. - - - alert: NeutronAgentDown - for: 5m - expr: | - openstack_neutron_agent_state != 1 - labels: - severity: P3 - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - is being reported as down for 5 minutes. This can affect network operations so it must - be resolved as quickly as possible. - - - alert: NeutronAgentDisabled - for: 1h - expr: | - openstack_neutron_agent_state{adminState!="up"} - labels: - severity: P5 - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - has been disabled for 60 minutes. This can affect network operations so it must be resolved - as quickly as possible. - - - alert: NeutronBindingFailedPorts - expr: | - openstack_neutron_port{binding_vif_type="binding_failed"} != 0 - labels: - severity: P3 - annotations: - summary: "[`{{`{{$labels.device_owner}}`}}`] `{{`{{$labels.mac_address}}`}}` binding failed" - description: > - The NIC `{{`{{$labels.mac_address}}`}}` of `{{`{{$labels.device_owner}}`}}` - has binding failed port now. 
- - - alert: NeutronNetworkOutOfIPs - expr: | - sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""}) / sum by (network_id) (openstack_neutron_network_ip_availabilities_total{project_id!=""}) * 100 > 80 - labels: - severity: P4 - annotations: - summary: "[`{{`{{$labels.network_name}}`}}`] `{{`{{$labels.subnet_name}}`}}` running out of IPs" - description: > - The subnet `{{`{{$labels.subnet_name}}`}}` within `{{`{{$labels.network_name}}`}}` - is currently at `{{`{{$value}}`}}`% utilization. If the IP addresses run out, it will - impact the provisioning of new ports. - - - - name: nova - rules: - - alert: NovaAgentDown - expr: | - openstack_nova_agent_state != 1 - labels: - severity: P4 - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - is being reported as down. - - - alert: NovaAgentDown - for: 5m - expr: | - openstack_nova_agent_state != 1 - labels: - severity: P3 - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` down" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - is being reported as down. This can affect compute operations so it must be resolved - as quickly as possible. - - - alert: NovaAgentDisabled - for: 1h - expr: | - openstack_nova_agent_state{adminState!="enabled"} - labels: - severity: P5 - annotations: - summary: "[`{{`{{$labels.hostname}}`}}`] `{{`{{$labels.exported_service}}`}}` disabled" - description: > - The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` - has been disabled for 60 minutes. This can affect compute operations so it must be resolved - as quickly as possible. 
- - - alert: NovaInstanceInError - for: 24h - expr: | - openstack_nova_server_status{status="ERROR"} - labels: - severity: P4 - annotations: - summary: "[`{{`{{$labels.id}}`}}`] Instance in ERROR state" - description: > - The instance `{{`{{$labels.id}}`}}` has been in ERROR state for over 24 hours. It must - be cleaned up or removed in order to provide a consistent customer experience. - - - alert: NovaFailureRisk - for: 6h - expr: | - (sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) - max(openstack_nova_memory_used_bytes)) / sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) * 100 < 0.25 - labels: - severity: P4 - annotations: - summary: "[nova] Failure risk" - description: > - The cloud capacity will be at `{{`{{$value}}`}}` in the event of the failure of a single - hypervisor which puts the cloud at risk of not being able to recover should any hypervisor - failures occur. Please ensure that adequate amount of infrastructure is assigned to this - deployment to prevent this. - - - alert: NovaCapacity - for: 6h - expr: | - sum ( - openstack_nova_memory_used_bytes - + on(hostname) group_left(adminState) - (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) - ) / sum ( - openstack_nova_memory_available_bytes - + on(hostname) group_left(adminState) - (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) - ) * 100 > 75 - labels: - severity: P4 - annotations: - summary: "[nova] Capacity risk" - description: > - The cloud capacity is currently at `{{`{{$value}}`}}` which means there is a risk of running - out of capacity due to the timeline required to add new nodes. Please ensure that adequate - amount of infrastructure is assigned to this deployment to prevent this. + {{- range $ruleName, $rule := $group.rules }} + {{- if (dig "enabled" true $rule )}} + - # {{ $ruleName }} + {{- with $rule.alert }} + alert: {{ . 
}} + {{- end }} + + {{- with $rule.expr }} + expr: {{ tpl . $ | quote }} + {{- end }} + + {{- with $rule.record }} + record: {{ . }} + {{- end }} + + {{- with $rule.for }} + for: {{ . }} + {{- end }} + + {{- with $rule.keep_firing_for }} + keep_firing_for: {{ . }} + {{- end }} + + {{- with $rule.labels }} + labels: + {{- range $k,$v := . }} + {{ $k }}: {{ tpl $v $ | quote }} + {{- end }} + {{- end }} + + {{- with $rule.annotations }} + annotations: + {{- range $k, $v := . }} + {{ $k }}: | + {{- tpl $v $ | nindent 10 }} + {{- end }} + {{- end }} + + {{- end }} + {{- end }} + {{- end }} + {{- end }} diff --git a/charts/prometheus-openstack-exporter/templates/service.yaml b/charts/prometheus-openstack-exporter/templates/service.yaml index 95aa278ee..7c8024493 100644 --- a/charts/prometheus-openstack-exporter/templates/service.yaml +++ b/charts/prometheus-openstack-exporter/templates/service.yaml @@ -13,4 +13,4 @@ spec: port: 9180 targetPort: metrics selector: -{{- include "openstack-exporter.labels" . | indent 4 }} \ No newline at end of file +{{- include "openstack-exporter.labels" . | indent 4 }} diff --git a/charts/prometheus-openstack-exporter/values.yaml b/charts/prometheus-openstack-exporter/values.yaml index 03381b96e..570825666 100644 --- a/charts/prometheus-openstack-exporter/values.yaml +++ b/charts/prometheus-openstack-exporter/values.yaml @@ -8,7 +8,7 @@ replicaCount: 1 image: repository: ghcr.io/openstack-exporter/openstack-exporter - tag: 1.6.0 + tag: 1.7.0 pullPolicy: Always serviceMonitor: @@ -69,3 +69,645 @@ clouds_yaml_config: | # ... # cloud2: # ... 
+ +promethuesRules: + keystone: + enabled: true + rules: + - alert: KeystoneDown + for: 5m + expr: openstack_identity_up != 1 + labels: + severity: critical + annotations: + summary: "OpenStack Keystone service down" + description: "OpenStack Keystone service down" + + glance: + enabled: true + rules: + - alert: GlanceDown + for: 5m + expr: openstack_glance_up != 1 + labels: + severity: critical + annotations: + summary: "OpenStack Glance service down" + description: "OpenStack Glance service down" + + - alert: GlanceImageStatusNotActive + for: 5m + expr: openstack_glance_image_created_at{status!="active"} + labels: + severity: warning + annotations: + summary: "OpenStack Image `{{`{{$labels.name}}`}}` status is not ACTIVE" + description: "OpenStack Image `{{`{{$labels.id}}`}}` status is `{{`{{$labels.status}}`}}`" + + cinder: + enabled: true + rules: + - alert: CinderDown + for: 5m + expr: openstack_cinder_up != 1 + labels: + severity: critical + annotations: + summary: "OpenStack Cinder service down" + description: "OpenStack Cinder service down" + + - alert: CinderAgentDown + for: 5m + expr: openstack_cinder_agent_state{adminState="enabled"} != 1 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` down" + description: > + The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` + is being reported as down for 5 minutes. This can affect volume operations so it must + be resolved as quickly as possible. + + - alert: CinderAgentDisabled + for: 1h + expr: openstack_cinder_agent_state{adminState!="enabled"} + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` disabled" + description: > + The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` + has been disabled for 60 minutes. This can affect volume operations so it must be resolved + as quickly as possible. 
+ + # value = (9 | 10 | 13 | 14 | 15) + - alert: CinderVolumeInError + for: 5m + expr: openstack_cinder_volume_status{status=~"error.*"} + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in ERROR state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + It must be cleaned up or removed in order to provide a consistent customer experience. + + - alert: CinderVolumeInCreating + expr: openstack_cinder_volume_status == 0 + for: 15m + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in CREATING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeInReserved + expr: openstack_cinder_volume_status == 2 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in RESERVED state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeInAttaching + expr: openstack_cinder_volume_status == 3 + for: 10m + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in ATTACHING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 10 minutes. + + - alert: CinderVolumeInDetaching + expr: openstack_cinder_volume_status == 4 + for: 10m + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in DETACHING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 10 minutes. 
+ + - alert: CinderVolumeInMaintenance + expr: openstack_cinder_volume_status == 6 + for: 20m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in MAINTENANCE state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeInDeleting + expr: openstack_cinder_volume_status == 7 + for: 15m + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in DELETING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeInAwaitingTransfer + expr: openstack_cinder_volume_status == 8 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in AwaitingTransfer state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeInBackingUp + expr: openstack_cinder_volume_status == 11 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in BackingUp state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeInRestoringBackup + expr: openstack_cinder_volume_status == 12 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in RESTORING-BACKUP state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeInDownloading + expr: openstack_cinder_volume_status == 16 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in DOWNLOADING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. 
+ + - alert: CinderVolumeInUploading + expr: openstack_cinder_volume_status == 17 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in UPLOADING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeInRetyping + expr: openstack_cinder_volume_status == 18 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in RETYPING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeInExtending + expr: openstack_cinder_volume_status == 19 + for: 15m + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Volume in EXTENDING state" + description: > + The volume `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: CinderVolumeQuotaAlmostFull + expr: (openstack_cinder_limits_volume_max_gb - openstack_cinder_limits_volume_used_gb) < 200 + for: 30m + labels: + severity: warning + annotations: + summary: "Cinder volume quota almost full for project `{{`{{$labels.tenant}}`}}`" + description: > + Project `{{`{{$labels.tenant_id}}`}}` is nearing its Cinder volume quota limit. + Only `{{`{{$value}}`}}` GB of volume space remains out of the allocated quota. + Consider cleaning up unused volumes or requesting a quota increase. + + - alert: CinderVolumeQuotaFull + expr: (openstack_cinder_limits_volume_max_gb - openstack_cinder_limits_volume_used_gb) < 50 + for: 5m + labels: + severity: critical + annotations: + summary: "Cinder volume quota full for project `{{`{{$labels.tenant}}`}}`" + description: > + Project `{{`{{$labels.tenant_id}}`}}` is nearing its Cinder volume quota limit. + Only `{{`{{$value}}`}}` GB of volume space remains out of the allocated quota. + Consider cleaning up unused volumes or requesting a quota increase. 
+ + - alert: CinderBackupQuotaAlmostFull + expr: (openstack_cinder_limits_backup_max_gb - openstack_cinder_limits_backup_used_gb) < 200 + for: 30m + labels: + severity: warning + annotations: + summary: "Cinder backup quota almost full for project `{{`{{$labels.tenant}}`}}`" + description: > + Project `{{`{{$labels.tenant_id}}`}}` is nearing its Cinder backup quota limit. + Only `{{`{{$value}}`}}` GB of backup space remains out of the allocated quota. + Consider cleaning up unused backup or requesting a quota increase. + + - alert: CinderBackupQuotaFull + expr: (openstack_cinder_limits_backup_max_gb - openstack_cinder_limits_backup_used_gb) < 50 + for: 5m + labels: + severity: critical + annotations: + summary: "Cinder backup quota full for project `{{`{{$labels.tenant}}`}}`" + description: > + Project `{{`{{$labels.tenant_id}}`}}` is nearing its Cinder volume quota limit. + Only `{{`{{$value}}`}}` GB of backup space remains out of the allocated quota. + Consider cleaning up unused backup or requesting a quota increase. + + placement: + enabled: true + rules: + - alert: PlacementDown + for: 5m + expr: openstack_placement_up != 1 + labels: + severity: critical + annotations: + summary: "OpenStack Placement service down" + description: "OpenStack Placement service down" + + - alert: PlacementLowMemoryResource + for: 5m + expr: | + ( openstack_placement_resource_allocation_ratio{resourcetype="MEMORY_MB"} * on (hostname) + openstack_placement_resource_total{resourcetype="MEMORY_MB"} + ) - + ( openstack_placement_resource_usage{resourcetype="MEMORY_MB"} + on (hostname) + openstack_placement_resource_reserved{resourcetype="MEMORY_MB"} + ) < 32768 + labels: + severity: warning + annotations: + summary: "Low memory on host `{{`{{$labels.hostname}}`}}`" + description: > + The available memory (after accounting for usage and reserved memory) on host `{{`{{$labels.hostname}}`}}` + is below 32GB. 
This could lead to scheduling issues or performance degradation for new instances. + Consider investigating memory usage or increasing available resources. + + - alert: PlacementLowMemoryResource + for: 5m + expr: | + ( openstack_placement_resource_allocation_ratio{resourcetype="MEMORY_MB"} * on (hostname) + openstack_placement_resource_total{resourcetype="MEMORY_MB"} + ) - + ( openstack_placement_resource_usage{resourcetype="MEMORY_MB"} + on (hostname) + openstack_placement_resource_reserved{resourcetype="MEMORY_MB"} + ) < 12288 + labels: + severity: critical + annotations: + summary: "Low memory on host `{{`{{$labels.hostname}}`}}`" + description: > + The available memory (after accounting for usage and reserved memory) on host `{{`{{$labels.hostname}}`}}` + is below 12GB. This could lead to scheduling issues or performance degradation for new instances. + Consider investigating memory usage or increasing available resources. + + neutron: + enabled: true + rules: + - alert: NeutronDown + for: 5m + expr: openstack_neutron_up != 1 + labels: + severity: critical + annotations: + summary: "OpenStack Neutron service down" + description: "OpenStack Neutron service down" + + - alert: NeutronAgentDown + for: 5m + expr: openstack_neutron_agent_state{adminState="up"} != 1 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` down" + description: > + The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` + is being reported as down for 5 minutes. This can affect network operations so it must + be resolved as quickly as possible. 
+ + - alert: NeutronAgentDisabled + for: 1h + expr: openstack_neutron_agent_state{adminState!="up"} + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` disabled" + description: > + The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` + has been disabled for 60 minutes. This can affect network operations so it must be resolved + as quickly as possible. + + - alert: NeutronBindingFailedPorts + expr: openstack_neutron_port{binding_vif_type="binding_failed"} != 0 + for: 2m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.device_owner}}`}}` `{{`{{$labels.mac_address}}`}}` binding failed" + description: > + The NIC `{{`{{$labels.mac_address}}`}}` of `{{`{{$labels.device_owner}}`}}` + has binding failed port now. + + - alert: NeutronNetworkOutOfIPs + expr: | + sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""}) / sum by (network_id) (openstack_neutron_network_ip_availabilities_total{project_id!=""}) * 100 > 80 + for: 15m + labels: + severity: warning + annotations: + summary: "`{{`{{$labels.network_name}}`}}` `{{`{{$labels.subnet_name}}`}}` running out of IPs" + description: > + The subnet `{{`{{$labels.subnet_name}}`}}` within `{{`{{$labels.network_name}}`}}` + is currently at `{{`{{$value}}`}}`% utilization. If the IP addresses run out, it will + impact the provisioning of new ports. 
+ + - alert: NeutronrouterNotActive + for: 5m + expr: openstack_neutron_router{status!="ACTIVE"} + labels: + severity: critical + annotations: + summary: "OpenStack neutron router `{{`{{$labels.name}}`}}` status is not ACTIVE" + description: "OpenStack neutron router `{{`{{$labels.id}}`}}` status is `{{`{{$labels.status}}`}}`" + + nova: + enabled: true + rules: + - alert: NovaDown + for: 5m + expr: openstack_nova_up != 1 + labels: + severity: critical + annotations: + summary: "OpenStack Nova service down" + description: "OpenStack Nova service down" + + - alert: NovaAgentDown + for: 5m + expr: openstack_nova_agent_state{adminState="enabled"} != 1 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.hostname}}`}}` `{{`{{$labels.exported_service}}`}}` down" + description: > + The service `{{`{{$labels.exported_service}}`}}` running on `{{`{{$labels.hostname}}`}}` + is being reported as down. This can affect compute operations so it must be resolved + as quickly as possible. + + - alert: NovaInstanceInError + for: 10m + expr: openstack_nova_server_status{status="ERROR"} + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" + description: > + The instance `{{`{{$labels.id}}`}}` on host `{{`{{$labels.hypervisor_hostname}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 10 minutes. + It must be cleaned up or removed in order to provide a consistent customer experience. + + - alert: NovaInstanceInBuilding + for: 15m + expr: openstack_nova_server_status == 1 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. 
+ + - alert: NovaInstanceInSpawning + for: 10m + expr: openstack_nova_server_status == 2 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: NovaInstanceInHardReboot + for: 30m + expr: openstack_nova_server_status == 5 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 30 minutes. + + - alert: NovaInstanceInReboot + for: 30m + expr: openstack_nova_server_status == 7 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 30 minutes. + + - alert: NovaInstanceInRebuild + for: 30m + expr: openstack_nova_server_status == 8 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 30 minutes. + + - alert: NovaInstanceInRESIZE + for: 15m + expr: openstack_nova_server_status == 10 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. 
+ + - alert: NovaInstanceInUNKNOWN + for: 15m + expr: openstack_nova_server_status == 13 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: NovaInstanceInVERIFY_RESIZE + for: 10m + expr: openstack_nova_server_status == 14 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 15 minutes. + + - alert: NovaInstanceInMIGRATING + for: 30m + expr: openstack_nova_server_status == 15 + labels: + severity: critical + annotations: + summary: "`{{`{{$labels.name}}`}}` Instance in `{{`{{$labels.status}}`}}` state" + description: > + The instance `{{`{{$labels.id}}`}}` has been in `{{`{{$labels.status}}`}}` state for over 30 minutes. + + - alert: NovaFailureRisk + for: 6h + expr: | + (sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) - max(openstack_nova_memory_used_bytes)) / sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) * 100 < 0.25 + labels: + severity: warning + annotations: + summary: "[nova] Failure risk" + description: > + The cloud capacity will be at `{{`{{$value}}`}}` in the event of the failure of a single + hypervisor which puts the cloud at risk of not being able to recover should any hypervisor + failures occur. Please ensure that adequate amount of infrastructure is assigned to this + deployment to prevent this. 
+ + - alert: NovaCapacityNearFull + for: 6h + expr: | + sum ( + openstack_nova_memory_used_bytes + + on(hostname) group_left(adminState) + (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) + ) / sum ( + openstack_nova_memory_available_bytes + + on(hostname) group_left(adminState) + (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) + ) * 100 > 75 + labels: + severity: warning + annotations: + summary: "[nova] near full Capacity risk" + description: > + The cloud capacity is currently at `{{`{{$value}}`}}` which means there is a risk of running + out of capacity due to the timeline required to add new nodes. Please ensure that adequate + amount of infrastructure is assigned to this deployment to prevent this. + + - alert: NovaCapacityFull + for: 6h + expr: | + sum ( + openstack_nova_memory_used_bytes + + on(hostname) group_left(adminState) + (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) + ) / sum ( + openstack_nova_memory_available_bytes + + on(hostname) group_left(adminState) + (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) + ) * 100 > 85 + labels: + severity: critical + annotations: + summary: "[nova] Capacity risk" + description: > + The cloud capacity is currently at `{{`{{$value}}`}}` which means there is a risk of running + out of capacity due to the timeline required to add new nodes. Please ensure that adequate + amount of infrastructure is assigned to this deployment to prevent this. + + - alert: NovaInstanceQuotaAlmostFull + expr: (openstack_nova_limits_instances_max - openstack_nova_limits_instances_used) < 2 + for: 15m + labels: + severity: warning + annotations: + summary: "Nova instance quota almost full for project `{{`{{$labels.tenant}}`}}`" + description: > + Project `{{`{{$labels.tenant_id}}`}}` is nearing its Nova instance quota limit. + Only `{{`{{$value}}`}}` instance remains to reach quota. 
+ + - alert: NovaMemoryQuotaAlmostFull + expr: (openstack_nova_limits_memory_max - openstack_nova_limits_memory_used) < 12288 + for: 10m + labels: + severity: warning + annotations: + summary: "Nova memory quota almost full for project `{{`{{$labels.tenant}}`}}`" + description: > + Project `{{`{{$labels.tenant_id}}`}}` is nearing its Nova memory quota limit. + Only `{{`{{$value}}`}}` MB memory remains to reach quota. + + - alert: NovaCpuQuotaAlmostFull + expr: (openstack_nova_limits_vcpus_max - openstack_nova_limits_vcpus_used) < 10 + for: 10m + labels: + severity: warning + annotations: + summary: "Nova vcpu quota almost full for project `{{`{{$labels.tenant}}`}}`" + description: > + Project `{{`{{$labels.tenant_id}}`}}` is nearing its Nova vcpu quota limit. + Only `{{`{{$value}}`}}` vcpu remains to reach quota. + + octavia: + enabled: true + rules: + - alert: LoadbalancerDown + for: 5m + expr: openstack_loadbalancer_up != 1 + labels: + severity: critical + annotations: + summary: "OpenStack loadbalancer service down" + description: "OpenStack loadbalancer service down" + + - alert: LoadbalancerNotActive + for: 5m + expr: openstack_loadbalancer_loadbalancer_status{provisioning_status!="ACTIVE"} + labels: + severity: critical + annotations: + summary: "OpenStack loadbalancer `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" + description: "OpenStack loadbalancer `{{`{{$labels.id}}`}}` provisioning status is `{{`{{$labels.provisioning_status}}`}}`" + + - alert: LoadbalancerPoolNotActive + for: 5m + expr: openstack_loadbalancer_pool_status{provisioning_status!="ACTIVE"} + labels: + severity: critical + annotations: + summary: "OpenStack loadbalancer pool `{{`{{$labels.name}}`}}` provisioning status is not ACTIVE" + description: "OpenStack loadbalancer pool `{{`{{$labels.id}}`}}` provisioning status is `{{`{{$labels.provisioning_status}}`}}`" + + manila: + enabled: true + rules: + - alert: ManilaDown + for: 5m + expr: openstack_sharev2_up != 1 + labels: + 
severity: critical + annotations: + summary: "OpenStack shared file system (Manila) service down" + description: "OpenStack shared file system (Manila) service down" + + - alert: ManilaStatusNotActive + for: 5m + expr: openstack_sharev2_share_status{status!="available"} + labels: + severity: critical + annotations: + summary: "OpenStack Share `{{`{{$labels.name}}`}}` status is not AVAILABLE" + description: "OpenStack Share `{{`{{$labels.id}}`}}` status is `{{`{{$labels.status}}`}}`"