diff --git a/internal/controllers/scheduling/controller.go b/internal/controllers/scheduling/controller.go index 6d31d971..c58dc61c 100644 --- a/internal/controllers/scheduling/controller.go +++ b/internal/controllers/scheduling/controller.go @@ -111,6 +111,9 @@ func (c *controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu nextSlot := c.getNextCooldownSlot(comps) logger.Info("listed compositions", "compositionCount", len(comps.Items), "nextCooldownSlot", nextSlot) + // Reset the compositionHealth metric before iterating through compositions + compositionHealth.Reset() + var inFlight int var op *op for _, comp := range comps.Items { @@ -122,7 +125,10 @@ func (c *controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu if missedReconciliation(&comp, c.watchdogThreshold) { synth := synthsByName[comp.Spec.Synthesizer.Name] stuckReconciling.WithLabelValues(comp.Spec.Synthesizer.Name, getSynthOwner(&synth)).Inc() + compositionHealth.WithLabelValues(comp.Name, comp.Namespace, comp.Spec.Synthesizer.Name).Set(1) logger.Info("detected composition missed reconciliation", "compositionName", comp.Name, "compositionNamespace", comp.Namespace, "synthesizerName", comp.Spec.Synthesizer.Name) + } else { + compositionHealth.WithLabelValues(comp.Name, comp.Namespace, comp.Spec.Synthesizer.Name).Set(0) } synth, ok := synthsByName[comp.Spec.Synthesizer.Name] diff --git a/internal/controllers/scheduling/controller_test.go b/internal/controllers/scheduling/controller_test.go index ec3ce631..8eae71b7 100644 --- a/internal/controllers/scheduling/controller_test.go +++ b/internal/controllers/scheduling/controller_test.go @@ -10,6 +10,7 @@ import ( apiv1 "github.com/Azure/eno/api/v1" "github.com/Azure/eno/internal/testutil" + prometheustestutil "github.com/prometheus/client_golang/prometheus/testutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -715,3 +716,54 @@ func TestRetryContention(t *testing.T) { return err == nil && secondComp.Status.InFlightSynthesis != nil }) } + +// TestCompositionHealthMetrics proves that the compositionHealth metric is set correctly during reconciliation. +func TestCompositionHealthMetrics(t *testing.T) { + ctx := testutil.NewContext(t) + cli := testutil.NewClient(t) + + c := &controller{client: cli, concurrencyLimit: 10, watchdogThreshold: time.Millisecond * 100} + + synth := &apiv1.Synthesizer{} + synth.Name = "test-synth" + require.NoError(t, cli.Create(ctx, synth)) + + // Create a healthy composition (recently reconciled) + healthyComp := &apiv1.Composition{} + healthyComp.Name = "healthy-comp" + healthyComp.Namespace = "default" + healthyComp.Finalizers = []string{"eno.azure.io/cleanup"} + healthyComp.Spec.Synthesizer.Name = synth.Name + require.NoError(t, cli.Create(ctx, healthyComp)) + + healthyComp.Status.CurrentSynthesis = &apiv1.Synthesis{ + UUID: "healthy-uuid", + Reconciled: ptr.To(metav1.Now()), + } + require.NoError(t, cli.Status().Update(ctx, healthyComp)) + + // Create a stuck composition (initialized long ago, not reconciled) + stuckComp := &apiv1.Composition{} + stuckComp.Name = "stuck-comp" + stuckComp.Namespace = "default" + stuckComp.Finalizers = []string{"eno.azure.io/cleanup"} + stuckComp.Spec.Synthesizer.Name = synth.Name + require.NoError(t, cli.Create(ctx, stuckComp)) + + stuckComp.Status.CurrentSynthesis = &apiv1.Synthesis{ + UUID: "stuck-uuid", + Initialized: ptr.To(metav1.NewTime(time.Now().Add(-time.Hour))), // initialized long ago + } + require.NoError(t, cli.Status().Update(ctx, stuckComp)) + + // Run reconciliation + _, err := c.Reconcile(ctx, ctrl.Request{}) + require.NoError(t, err) + + // Verify metrics + healthyValue := prometheustestutil.ToFloat64(compositionHealth.WithLabelValues("healthy-comp", "default", "test-synth")) + assert.Equal(t, float64(0), healthyValue, "healthy composition should have health value 0") + + stuckValue := prometheustestutil.ToFloat64(compositionHealth.WithLabelValues("stuck-comp", "default", "test-synth")) + assert.Equal(t, float64(1), stuckValue, "stuck composition should have health value 1") +} diff --git a/internal/controllers/scheduling/metrics.go b/internal/controllers/scheduling/metrics.go index 480f43d5..b5832d73 100644 --- a/internal/controllers/scheduling/metrics.go +++ b/internal/controllers/scheduling/metrics.go @@ -30,10 +30,17 @@ var ( Help: "Number of compositions that have not been reconciled since a period after their current synthesis was initialized", }, []string{"synthesizer", "owner"}, ) + + compositionHealth = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "eno_composition_health", + Help: "Health status of each composition (0 = healthy, 1 = stuck/unhealthy)", + }, []string{"composition_name", "composition_namespace", "synthesizer_name"}, + ) ) func init() { - metrics.Registry.MustRegister(freeSynthesisSlots, schedulingLatency, stuckReconciling) + metrics.Registry.MustRegister(freeSynthesisSlots, schedulingLatency, stuckReconciling, compositionHealth) } func missedReconciliation(comp *apiv1.Composition, threshold time.Duration) bool {