Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions internal/controllers/scheduling/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ func (c *controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
nextSlot := c.getNextCooldownSlot(comps)
logger.Info("listed compositions", "compositionCount", len(comps.Items), "nextCooldownSlot", nextSlot)

// Reset the compositionHealth metric before iterating through compositions
compositionHealth.Reset()

var inFlight int
var op *op
for _, comp := range comps.Items {
Expand All @@ -122,7 +125,10 @@ func (c *controller) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
if missedReconciliation(&comp, c.watchdogThreshold) {
synth := synthsByName[comp.Spec.Synthesizer.Name]
stuckReconciling.WithLabelValues(comp.Spec.Synthesizer.Name, getSynthOwner(&synth)).Inc()
compositionHealth.WithLabelValues(comp.Name, comp.Namespace, comp.Spec.Synthesizer.Name).Set(1)
logger.Info("detected composition missed reconciliation", "compositionName", comp.Name, "compositionNamespace", comp.Namespace, "synthesizerName", comp.Spec.Synthesizer.Name)
} else {
compositionHealth.WithLabelValues(comp.Name, comp.Namespace, comp.Spec.Synthesizer.Name).Set(0)
}

synth, ok := synthsByName[comp.Spec.Synthesizer.Name]
Expand Down
52 changes: 52 additions & 0 deletions internal/controllers/scheduling/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

apiv1 "github.com/Azure/eno/api/v1"
"github.com/Azure/eno/internal/testutil"
prometheustestutil "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -715,3 +716,54 @@ func TestRetryContention(t *testing.T) {
return err == nil && secondComp.Status.InFlightSynthesis != nil
})
}

// TestCompositionHealthMetrics proves that the compositionHealth metric is set correctly during reconciliation.
//
// It creates two compositions that share one synthesizer:
//   - "healthy-comp", whose current synthesis carries a Reconciled timestamp of now, and
//   - "stuck-comp", whose current synthesis was Initialized an hour ago and has no Reconciled timestamp,
//
// then runs a single Reconcile pass with a 100ms watchdog threshold and expects the gauge
// to read 0 for the former and 1 for the latter.
func TestCompositionHealthMetrics(t *testing.T) {
	ctx := testutil.NewContext(t)
	cli := testutil.NewClient(t)

	c := &controller{client: cli, concurrencyLimit: 10, watchdogThreshold: time.Millisecond * 100}

	synth := &apiv1.Synthesizer{}
	synth.Name = "test-synth"
	require.NoError(t, cli.Create(ctx, synth))

	// Create a healthy composition (recently reconciled)
	healthyComp := &apiv1.Composition{}
	healthyComp.Name = "healthy-comp"
	healthyComp.Namespace = "default"
	healthyComp.Finalizers = []string{"eno.azure.io/cleanup"}
	healthyComp.Spec.Synthesizer.Name = synth.Name
	require.NoError(t, cli.Create(ctx, healthyComp))

	// Status is written through the status client after the object has been created.
	healthyComp.Status.CurrentSynthesis = &apiv1.Synthesis{
		UUID:       "healthy-uuid",
		Reconciled: ptr.To(metav1.Now()),
	}
	require.NoError(t, cli.Status().Update(ctx, healthyComp))

	// Create a stuck composition (initialized long ago, not reconciled)
	stuckComp := &apiv1.Composition{}
	stuckComp.Name = "stuck-comp"
	stuckComp.Namespace = "default"
	stuckComp.Finalizers = []string{"eno.azure.io/cleanup"}
	stuckComp.Spec.Synthesizer.Name = synth.Name
	require.NoError(t, cli.Create(ctx, stuckComp))

	stuckComp.Status.CurrentSynthesis = &apiv1.Synthesis{
		UUID:        "stuck-uuid",
		Initialized: ptr.To(metav1.NewTime(time.Now().Add(-time.Hour))), // initialized long ago
	}
	require.NoError(t, cli.Status().Update(ctx, stuckComp))

	// Run reconciliation
	_, err := c.Reconcile(ctx, ctrl.Request{})
	require.NoError(t, err)

	// Count the exported series before looking any of them up: WithLabelValues creates a
	// zero-valued series on first access, so without this check the "healthy == 0"
	// assertion below would pass even if Reconcile never set the gauge.
	assert.Equal(t, 2, prometheustestutil.CollectAndCount(compositionHealth), "reconcile should export exactly one health series per composition")

	// Verify metrics
	healthyValue := prometheustestutil.ToFloat64(compositionHealth.WithLabelValues("healthy-comp", "default", "test-synth"))
	assert.Equal(t, float64(0), healthyValue, "healthy composition should have health value 0")

	stuckValue := prometheustestutil.ToFloat64(compositionHealth.WithLabelValues("stuck-comp", "default", "test-synth"))
	assert.Equal(t, float64(1), stuckValue, "stuck composition should have health value 1")
}
9 changes: 8 additions & 1 deletion internal/controllers/scheduling/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,17 @@ var (
Help: "Number of compositions that have not been reconciled since a period after their current synthesis was initialized",
}, []string{"synthesizer", "owner"},
)

compositionHealth = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "eno_composition_health",
Help: "Health status of each composition (0 = healthy, 1 = stuck/unhealthy)",
}, []string{"composition_name", "composition_namespace", "synthesizer_name"},
)
)

// init registers the scheduling controller's collectors with metrics.Registry.
func init() {
	// Each collector is registered exactly once: MustRegister panics when a collector
	// that is already registered is passed to it again.
	metrics.Registry.MustRegister(freeSynthesisSlots, schedulingLatency, stuckReconciling, compositionHealth)
}

func missedReconciliation(comp *apiv1.Composition, threshold time.Duration) bool {
Expand Down
Loading