diff --git a/conf/ems/9.6.0/ems.yaml b/conf/ems/9.6.0/ems.yaml index 4ebfe64ce..64167df6c 100644 --- a/conf/ems/9.6.0/ems.yaml +++ b/conf/ems/9.6.0/ems.yaml @@ -1041,4 +1041,9 @@ events: - name: smbc.pfo.completed exports: - - parameters.dstpath => dst_path \ No newline at end of file + - parameters.dstpath => dst_path + + - name: callhome.data.outage.detected + exports: + - ^^node.name => node + - parameters.subject => subject \ No newline at end of file diff --git a/container/prometheus/ems_alert_rules.yml b/container/prometheus/ems_alert_rules.yml index d5e3085d7..d1184041f 100644 --- a/container/prometheus/ems_alert_rules.yml +++ b/container/prometheus/ems_alert_rules.yml @@ -531,6 +531,30 @@ groups: impact: "Availability" runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#nvram-battery-low" + - alert: Data Outage Detected + expr: last_over_time(ems_events{message="callhome.data.outage.detected"}[1d]) == 1 + labels: + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} + annotations: + summary: "Call home for {{ $labels.subject }} on node {{ $labels.node }}" + impact: "Availability" + runbook: "https://netapp.github.io/harvest/nightly/resources/ems-alert-runbook/#data-outage-detected" + - alert: HA Interconnect Down expr: last_over_time(ems_events{message="callhome.hainterconnect.down"}[1d]) == 1 labels: diff --git a/docs/resources/ems-alert-runbook.md b/docs/resources/ems-alert-runbook.md index 74981f4ee..4c2ea6b87 100644 --- a/docs/resources/ems-alert-runbook.md +++ b/docs/resources/ems-alert-runbook.md @@ -401,6 +401,20 @@ Perform the following corrective actions: 2. If the battery was replaced recently or the system was non-operational for an extended period of time, monitor the battery to verify that it is charging properly. 3. Contact NetApp technical support if the battery runtime continues to decrease below critical levels, and the storage system shuts down automatically. +### Data Outage Detected + +**Impact**: Availability + +**EMS Event**: `callhome.data.outage.detected` + +This message occurs when the system detects that it has encountered an outage prior to this boot. +If your system is configured to do so, it generates and transmits an AutoSupport (or 'call home') message to NetApp technical support and to the configured destinations. +Successful delivery of an AutoSupport message significantly improves problem determination and resolution. + +**Remediation** + +Contact NetApp technical support. + ### NetBIOS Name Conflict **Impact**: Availability diff --git a/integration/test/alert_rule_test.go b/integration/test/alert_rule_test.go index 357643ea4..1fdd73013 100644 --- a/integration/test/alert_rule_test.go +++ b/integration/test/alert_rule_test.go @@ -185,8 +185,8 @@ func parseEmsLabels(exports *node.Node) string { var labels []string if exports != nil { for _, export := range exports.GetAllChildContentS() { - name, display, _, _ := template.ParseMetric(export) - if strings.HasPrefix(name, "parameters") { + _, display, _, _ := template.ParseMetric(export) + if display != "" { labels = append(labels, display) } }