diff --git a/config/prometheus/alerts.yaml b/config/prometheus/alerts.yaml index 3fb47178f3..f006fd193b 100644 --- a/config/prometheus/alerts.yaml +++ b/config/prometheus/alerts.yaml @@ -40,3 +40,11 @@ spec: annotations: description: "Workload is not protected for disaster recovery (DRPC: {{ $labels.obj_name }}, Namespace: {{ $labels.obj_namespace }}). Inspect DRPC status.conditions for details." alert_type: "DisasterRecovery" + - alert: UnsupportedConsistencyGroupingEnabled + expr: ramen_unsupported_consistency_grouping_enabled == 1 + for: 10m + labels: + severity: warning + annotations: + description: "Unsupported consistency grouping is enabled for disaster recovery (DRPC: {{ $labels.obj_name }}, Namespace: {{ $labels.obj_namespace }})." + alert_type: "DisasterRecovery" diff --git a/internal/controller/drplacementcontrol_controller.go b/internal/controller/drplacementcontrol_controller.go index 30370a39c7..106a2e4815 100644 --- a/internal/controller/drplacementcontrol_controller.go +++ b/internal/controller/drplacementcontrol_controller.go @@ -351,6 +351,28 @@ func (r *DRPlacementControlReconciler) setWorkloadProtectionMetric(workloadProte workloadProtectionMetrics.WorkloadProtectionStatus.Set(float64(protected)) } +// setCGEnabledMetric sets the CGEnabled gauge from the annotations on the DRPC: +// 1 when the rmnutil.IsCGEnabledAnnotation annotation is "true", 0 otherwise. +// It does nothing when cgEnabledMetrics is nil. +func (r *DRPlacementControlReconciler) setCGEnabledMetric(drpc *rmn.DRPlacementControl, + cgEnabledMetrics *CGEnabledMetrics, log logr.Logger, +) { + if cgEnabledMetrics == nil { + return + } + + log.Info(fmt.Sprintf("setting metric: (%s)", CGEnabled)) + + enabled := 0 + + drpcAnnotations := drpc.GetAnnotations() + if drpcAnnotations != nil && drpcAnnotations[rmnutil.IsCGEnabledAnnotation] == "true" { + enabled = 1 + } + + cgEnabledMetrics.CGEnabled.Set(float64(enabled)) +} + //nolint:funlen func (r *DRPlacementControlReconciler) createDRPCInstance( ctx context.Context, @@ 
-461,6 +483,17 @@ func (r *DRPlacementControlReconciler) createWorkloadProtectionMetricsInstance( } } +func (r *DRPlacementControlReconciler) createCGEnabledMetricsInstance( + drpc *rmn.DRPlacementControl, +) *CGEnabledMetrics { + cgEnabledLabels := CGEnabledMetricLabels(drpc) + cgEnabledMetrics := NewCGEnabledMetric(cgEnabledLabels) + + return &CGEnabledMetrics{ + CGEnabled: cgEnabledMetrics.CGEnabled, + } +} + // isBeingDeleted returns true if either DRPC, user placement, or both are being deleted func isBeingDeleted(drpc *rmn.DRPlacementControl, usrPl client.Object) bool { return rmnutil.ResourceIsDeleted(drpc) || @@ -710,6 +743,9 @@ func (r *DRPlacementControlReconciler) finalizeDRPC(ctx context.Context, drpc *r workloadProtectionLabels := WorkloadProtectionStatusLabels(drpc) DeleteWorkloadProtectionStatusMetric(workloadProtectionLabels) + cgEnabledMetricLabels := CGEnabledMetricLabels(drpc) + DeleteCGEnabledMetric(cgEnabledMetricLabels) // remove this DRPC's gauge series during finalization + return nil } @@ -1584,6 +1620,9 @@ func (r *DRPlacementControlReconciler) setDRPCMetrics(ctx context.Context, workloadProtectionMetrics := r.createWorkloadProtectionMetricsInstance(drpc) r.setWorkloadProtectionMetric(workloadProtectionMetrics, drpc.Status.Conditions, log) + cgEnabledMetrics := r.createCGEnabledMetricsInstance(drpc) + r.setCGEnabledMetric(drpc, cgEnabledMetrics, log) + drPolicy, err := GetDRPolicy(ctx, r.Client, drpc, log) if err != nil { return fmt.Errorf("failed to get DRPolicy %w", err) diff --git a/internal/controller/metrics.go b/internal/controller/metrics.go index b8265615da..4bd761875e 100644 --- a/internal/controller/metrics.go +++ b/internal/controller/metrics.go @@ -22,6 +22,7 @@ const ( LastSyncDurationSeconds = "last_sync_duration_seconds" LastSyncDataBytes = "last_sync_data_bytes" WorkloadProtectionStatus = "workload_protection_status" + CGEnabled = "unsupported_consistency_grouping_enabled" ) type SyncTimeMetrics struct { @@ -43,6 +44,9 @@ type SyncDataBytesMetrics struct { type 
WorkloadProtectionMetrics struct { WorkloadProtectionStatus prometheus.Gauge } +type CGEnabledMetrics struct { + CGEnabled prometheus.Gauge +} type SyncMetrics struct { SyncTimeMetrics @@ -90,6 +94,12 @@ var ( ObjName, // Name of the resoure [drpc-name] ObjNamespace, // DRPC namespace } + + cgEnabledMetricLabels = []string{ + ObjType, // Name of the type of the resource [drpc] + ObjName, // Name of the resource [drpc-name] + ObjNamespace, // DRPC namespace + } ) var ( @@ -137,6 +147,15 @@ var ( }, workloadProtectionStatusLabels, ) + + cgEnabled = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: CGEnabled, + Namespace: metricNamespace, + Help: "Unsupported consistency grouping enabled status", + }, + cgEnabledMetricLabels, + ) ) // lastSyncTime metrics reports value from lastGrpupSyncTime taken from DRPC status @@ -234,6 +253,25 @@ func DeleteWorkloadProtectionStatusMetric(labels prometheus.Labels) bool { return workloadProtectionStatus.Delete(labels) } +// CGEnabledMetricLabels returns the labels (type, name, namespace) that identify the CGEnabled metric of a DRPC +func CGEnabledMetricLabels(drpc *rmn.DRPlacementControl) prometheus.Labels { + return prometheus.Labels{ + ObjType: "DRPlacementControl", + ObjName: drpc.Name, + ObjNamespace: drpc.Namespace, + } +} + +func NewCGEnabledMetric(labels prometheus.Labels) CGEnabledMetrics { + return CGEnabledMetrics{ + CGEnabled: cgEnabled.With(labels), + } +} + +func DeleteCGEnabledMetric(labels prometheus.Labels) bool { + return cgEnabled.Delete(labels) +} + func init() { // Register custom metrics with the global prometheus registry metrics.Registry.MustRegister(dRPolicySyncInterval) @@ -241,4 +279,5 @@ func init() { metrics.Registry.MustRegister(lastSyncDuration) metrics.Registry.MustRegister(lastSyncDataBytes) metrics.Registry.MustRegister(workloadProtectionStatus) + metrics.Registry.MustRegister(cgEnabled) }