From ee719d6e0d6217a17c32b267dafefbb8547296bf Mon Sep 17 00:00:00 2001 From: Patrick Derks Date: Thu, 19 Feb 2026 11:18:06 +0100 Subject: [PATCH] feat: add prometheus metrics --- api/v1/store_env.go | 18 ++ cmd/main.go | 7 +- go.mod | 2 +- helm/templates/deployment.yaml | 4 + helm/templates/metrics-service.yaml | 29 +++ helm/values.yaml | 4 + internal/config/config.go | 6 +- internal/controller/store_controller.go | 2 + internal/controller/store_status.go | 12 + internal/metrics/metrics.go | 278 ++++++++++++++++++++++++ 10 files changed, 358 insertions(+), 4 deletions(-) create mode 100644 helm/templates/metrics-service.yaml create mode 100644 internal/metrics/metrics.go diff --git a/api/v1/store_env.go b/api/v1/store_env.go index f8f8693d..c2f4abb0 100644 --- a/api/v1/store_env.go +++ b/api/v1/store_env.go @@ -8,6 +8,17 @@ import ( corev1 "k8s.io/api/core/v1" ) +// operatorServiceURL is set once at operator startup and injected into every +// store container as K8S_OPERATOR_URL so the Shopware consumer knows how to +// reach the operator. +var operatorServiceURL string + +// SetOperatorServiceURL stores the operator's service URL globally so it can +// be included in every store's environment without modifying the Store CRD. 
+func SetOperatorServiceURL(url string) { + operatorServiceURL = url +} + // TODO: If building more than one instance print a warning for the cache to use // redis func (s *Store) getAppCache() []corev1.EnvVar { @@ -509,6 +520,13 @@ func (s *Store) GetEnv() []corev1.EnvVar { }, } + if operatorServiceURL != "" { + c = append(c, corev1.EnvVar{ + Name: "K8S_OPERATOR_URL", + Value: operatorServiceURL, + }) + } + if s.Spec.ShopConfiguration.UsageDataConsent == "revoked" { c = append(c, corev1.EnvVar{ Name: "SHOPWARE_USAGE_DATA_CONSENT", diff --git a/cmd/main.go b/cmd/main.go index a7fcbef9..4cd6f12d 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -33,6 +33,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "github.com/go-logr/zapr" shopv1 "github.com/shopware/shopware-operator/api/v1" @@ -68,6 +69,8 @@ func main() { os.Exit(1) } + shopv1.SetOperatorServiceURL(cfg.OperatorServiceURL) + logger := logging.NewLogger(cfg.LogLevel, cfg.LogFormat). With(zapz.String("service", "shopware-operator")). With(zapz.String("operator_version", version)). 
@@ -84,8 +87,8 @@ func main() { } mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ - Scheme: scheme, - // Metrics: metricsserver.Options{BindAddress: cfg.MetricsAddr}, + Scheme: scheme, + Metrics: metricsserver.Options{BindAddress: cfg.MetricsAddr}, HealthProbeBindAddress: cfg.ProbeAddr, Cache: cache.Options{ DefaultNamespaces: map[string]cache.Config{ diff --git a/go.mod b/go.mod index 358295c5..9690d9f7 100644 --- a/go.mod +++ b/go.mod @@ -80,7 +80,7 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/prometheus/client_golang v1.22.0 // indirect + github.com/prometheus/client_golang v1.22.0 github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.62.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect diff --git a/helm/templates/deployment.yaml b/helm/templates/deployment.yaml index 28a1e05b..8659a40c 100644 --- a/helm/templates/deployment.yaml +++ b/helm/templates/deployment.yaml @@ -79,6 +79,10 @@ spec: value: "{{ .Values.logFormat | default "json" }}" - name: DISABLE_CHECKS value: "{{ .Values.disableChecks | default "false" }}" + {{- if .Values.metrics.enabled }} + - name: OPERATOR_SERVICE_URL + value: "http://shopware-operator.{{ .Release.Namespace }}.svc.cluster.local:{{ .Values.metrics.port | default 8080 }}" + {{- end }} {{- if and (hasKey .Values "events") (hasKey .Values.events "nats") (.Values.events.nats.enable) }} - name: NATS_ENABLE value: "true" diff --git a/helm/templates/metrics-service.yaml b/helm/templates/metrics-service.yaml new file mode 100644 index 00000000..7bd666b3 --- /dev/null +++ b/helm/templates/metrics-service.yaml @@ -0,0 +1,29 @@ +{{- if not .Values.crds.installOnly }} +{{- if .Values.metrics.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: shopware-operator + namespace: {{ 
.Release.Namespace }} + labels: + app.kubernetes.io/component: metrics + app.kubernetes.io/created-by: shopware-operator + app.kubernetes.io/instance: shopware-operator + app.kubernetes.io/managed-by: shopware-operator + app.kubernetes.io/name: service + app.kubernetes.io/part-of: shopware-operator + control-plane: shopware-operator +{{- with .Values.labels }} + {{- toYaml . | nindent 4 }} +{{- end }} +spec: + type: ClusterIP + selector: + control-plane: shopware-operator + ports: + - name: http-metrics + port: {{ .Values.metrics.port | default 8080 }} + targetPort: 8080 + protocol: TCP +{{- end }} +{{- end }} diff --git a/helm/values.yaml b/helm/values.yaml index c552c801..be4b3c1c 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -94,3 +94,7 @@ logFormat: json # Disable check for s3/database/fastly and Opensearch checks. Useful if network access is not given for one of the services. # This is a global level. You can also control this per store. disableChecks: false + +metrics: + enabled: false + port: 8080 diff --git a/internal/config/config.go b/internal/config/config.go index b628f614..23f90500 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -58,9 +58,13 @@ type StoreConfig struct { NatsHandler NatsHandler `env:",prefix=NATS_"` // Metrics and health probe configuration - MetricsAddr string `env:"METRICS_BIND_ADDRESS, default=0"` + MetricsAddr string `env:"METRICS_BIND_ADDRESS, default=:8080"` ProbeAddr string `env:"HEALTH_PROBE_BIND_ADDRESS, default=:8081"` + // OperatorServiceURL is exposed to store containers as K8S_OPERATOR_URL so + // the Shopware consumer knows how to reach the operator service. 
+ OperatorServiceURL string `env:"OPERATOR_SERVICE_URL"` + EnableLeaderElection bool `env:"LEADER_ELECT, default=true"` DisableChecks bool `env:"DISABLE_CHECKS, default=false"` Namespace string `env:"NAMESPACE, default=default"` diff --git a/internal/controller/store_controller.go b/internal/controller/store_controller.go index 418db9a3..f76312b9 100644 --- a/internal/controller/store_controller.go +++ b/internal/controller/store_controller.go @@ -15,6 +15,7 @@ import ( "github.com/shopware/shopware-operator/internal/job" "github.com/shopware/shopware-operator/internal/k8s" "github.com/shopware/shopware-operator/internal/logging" + "github.com/shopware/shopware-operator/internal/metrics" "github.com/shopware/shopware-operator/internal/pdb" "github.com/shopware/shopware-operator/internal/secret" "github.com/shopware/shopware-operator/internal/service" @@ -160,6 +161,7 @@ func (r *StoreReconciler) Reconcile( // } if !store.DeletionTimestamp.IsZero() { + metrics.RemoveStoreMetrics(store) return shortRequeue, nil } diff --git a/internal/controller/store_status.go b/internal/controller/store_status.go index 39188382..9725683e 100644 --- a/internal/controller/store_status.go +++ b/internal/controller/store_status.go @@ -8,10 +8,12 @@ import ( "github.com/aws/aws-sdk-go-v2/aws" v1 "github.com/shopware/shopware-operator/api/v1" + "github.com/shopware/shopware-operator/internal/cronjob" "github.com/shopware/shopware-operator/internal/deployment" "github.com/shopware/shopware-operator/internal/job" "github.com/shopware/shopware-operator/internal/k8s" "github.com/shopware/shopware-operator/internal/logging" + "github.com/shopware/shopware-operator/internal/metrics" "github.com/shopware/shopware-operator/internal/util" "go.uber.org/zap" corev1 "k8s.io/api/core/v1" @@ -117,6 +119,16 @@ func (r *StoreReconciler) reconcileCRStatus( logging.FromContext(ctx).Infow("Update store status", zap.Any("status", store.Status)) r.SendEvent(ctx, *store, "Update store status") + 
metrics.UpdateStoreMetrics(store)
+
+	scheduledCronJob, err := cronjob.GetScheduledCronJob(ctx, r.Client, *store)
+	if err != nil {
+		if !k8serrors.IsNotFound(err) {
+			logging.FromContext(ctx).Warnw("failed to get scheduled task cronjob for metrics", zap.Error(err))
+		}
+		scheduledCronJob = nil
+	}
+	metrics.UpdateScheduledTaskMetrics(store, scheduledCronJob)
 
 	return writeStoreStatus(ctx, r.Client, types.NamespacedName{
 		Namespace: store.Namespace,
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
new file mode 100644
index 00000000..b1faeeb8
--- /dev/null
+++ b/internal/metrics/metrics.go
@@ -0,0 +1,278 @@
+// Package metrics exposes per-store Prometheus gauges through the
+// controller-runtime metrics registry.
+package metrics
+
+import (
+	"strconv"
+	"strings"
+
+	"github.com/prometheus/client_golang/prometheus"
+	v1 "github.com/shopware/shopware-operator/api/v1"
+	batchv1 "k8s.io/api/batch/v1"
+	"sigs.k8s.io/controller-runtime/pkg/metrics"
+)
+
+var (
+	storeState = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "shopware_store_state",
+		Help: "Current state of a Shopware store (1 for active state, 0 otherwise)",
+	}, []string{"store", "namespace", "state"})
+
+	storeCurrentImage = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "shopware_store_current_image",
+		Help: "Current image of a Shopware store",
+	}, []string{"store", "namespace", "image"})
+
+	storeDeploymentReplicasAvailable = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "shopware_store_deployment_replicas_available",
+		Help: "Available replica count per deployment type",
+	}, []string{"store", "namespace", "deployment_type"})
+
+	storeDeploymentReplicasDesired = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "shopware_store_deployment_replicas_desired",
+		Help: "Desired replica count per deployment type",
+	}, []string{"store", "namespace", "deployment_type"})
+
+	storeDeploymentState = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "shopware_store_deployment_state",
+		Help: "Current state of a store deployment (1 for active state, 0 otherwise)",
+	}, []string{"store", "namespace", "deployment_type", "state"})
+
+	storeUsageDataConsent = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "shopware_store_usage_data_consent",
+		Help: "Usage data consent status (1 for allowed, 0 for revoked)",
+	}, []string{"store", "namespace"})
+
+	storeHPAEnabled = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "shopware_store_hpa_enabled",
+		Help: "Whether the HorizontalPodAutoscaler is enabled (1) or disabled (0)",
+	}, []string{"store", "namespace"})
+
+	storeHPAMinReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "shopware_store_hpa_min_replicas",
+		Help: "HPA minimum replicas",
+	}, []string{"store", "namespace"})
+
+	storeHPAMaxReplicas = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "shopware_store_hpa_max_replicas",
+		Help: "HPA maximum replicas",
+	}, []string{"store", "namespace"})
+
+	storeScheduledTaskSuspended = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "shopware_store_scheduled_task_suspended",
+		Help: "Whether the scheduled task CronJob is suspended (1) or active (0)",
+	}, []string{"store", "namespace"})
+
+	storeScheduledTaskLastRunStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "shopware_store_scheduled_task_last_run_status",
+		Help: "Status of the latest scheduled task run (1 for success, -1 for failure, 0 for unknown/no runs)",
+	}, []string{"store", "namespace"})
+
+	storeScheduledTaskLastSuccessTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "shopware_store_scheduled_task_last_success_timestamp",
+		Help: "Unix timestamp of the last successful scheduled task run",
+	}, []string{"store", "namespace"})
+
+	// allStates lists the store states that each get their own
+	// shopware_store_state series.
+	allStates = []v1.StatefulAppState{
+		v1.StateWait,
+		v1.StateSetup,
+		v1.StateSetupError,
+		v1.StateInitializing,
+		v1.StateMigration,
+		v1.StateMigrationError,
+		v1.StateReady,
+	}
+
+	// allDeploymentStates lists the deployment states that each get their own
+	// shopware_store_deployment_state series.
+	allDeploymentStates = []v1.DeploymentState{
+		v1.DeploymentStateUnknown,
+		v1.DeploymentStateError,
+		v1.DeploymentStateNotFound,
+		v1.DeploymentStateRunning,
+		v1.DeploymentStateScaling,
+	}
+)
+
+// init registers every store gauge with the controller-runtime registry.
+// MustRegister panics if a collector cannot be registered.
+func init() {
+	metrics.Registry.MustRegister(
+		storeState,
+		storeCurrentImage,
+		storeDeploymentReplicasAvailable,
+		storeDeploymentReplicasDesired,
+		storeDeploymentState,
+		storeUsageDataConsent,
+		storeHPAEnabled,
+		storeHPAMinReplicas,
+		storeHPAMaxReplicas,
+		storeScheduledTaskSuspended,
+		storeScheduledTaskLastRunStatus,
+		storeScheduledTaskLastSuccessTime,
+	)
+}
+
+// UpdateStoreMetrics sets the state, image, usage-data-consent, HPA and
+// per-deployment gauges of a store from its spec and status.
+func UpdateStoreMetrics(store *v1.Store) {
+	name := store.Name
+	ns := store.Namespace
+
+	// Store state: one series per known state, 1 only for the current one.
+	for _, s := range allStates {
+		val := float64(0)
+		if store.Status.State == s {
+			val = 1
+		}
+		storeState.WithLabelValues(name, ns, string(s)).Set(val)
+	}
+
+	// Current image: the tag is a label, so series for previous tags have to
+	// be removed before the new one is set. An empty tag leaves the existing
+	// series untouched.
+	if store.Status.CurrentImageTag != "" {
+		storeCurrentImage.DeletePartialMatch(prometheus.Labels{
+			"store":     name,
+			"namespace": ns,
+		})
+		storeCurrentImage.WithLabelValues(name, ns, store.Status.CurrentImageTag).Set(1)
+	}
+
+	// Usage data consent.
+	// NOTE(review): any value other than "allowed" (including an empty field)
+	// is reported as 0, while GetEnv only treats "revoked" specially - confirm
+	// that an unset consent should count as revoked here.
+	val := float64(0)
+	if store.Spec.ShopConfiguration.UsageDataConsent == "allowed" {
+		val = 1
+	}
+	storeUsageDataConsent.WithLabelValues(name, ns).Set(val)
+
+	// HPA: with the autoscaler disabled all three gauges are reset to 0.
+	hpa := store.Spec.HorizontalPodAutoscaler
+	if hpa.Enabled {
+		storeHPAEnabled.WithLabelValues(name, ns).Set(1)
+		storeHPAMaxReplicas.WithLabelValues(name, ns).Set(float64(hpa.MaxReplicas))
+		if hpa.MinReplicas != nil {
+			storeHPAMinReplicas.WithLabelValues(name, ns).Set(float64(*hpa.MinReplicas))
+		} else {
+			storeHPAMinReplicas.WithLabelValues(name, ns).Set(0)
+		}
+	} else {
+		storeHPAEnabled.WithLabelValues(name, ns).Set(0)
+		storeHPAMinReplicas.WithLabelValues(name, ns).Set(0)
+		storeHPAMaxReplicas.WithLabelValues(name, ns).Set(0)
+	}
+
+	// Deployment metrics
+	setDeploymentMetrics(name, ns, "admin", store.Status.AdminState)
+	setDeploymentMetrics(name, ns, "storefront", store.Status.StorefrontState)
+	setDeploymentMetrics(name, ns, "worker", store.Status.WorkerState)
+}
+
+// setDeploymentMetrics publishes the replica and state gauges for one
+// deployment type ("admin", "storefront" or "worker") of a store.
+func setDeploymentMetrics(name, ns, deploymentType string, cond v1.DeploymentCondition) {
+	// cond.Ready has the form "available/desired". Only the available part is
+	// read from it; the desired gauge comes from cond.StoreReplicas.
+	available := availableReplicas(cond.Ready)
+
+	storeDeploymentReplicasAvailable.WithLabelValues(name, ns, deploymentType).Set(float64(available))
+	storeDeploymentReplicasDesired.WithLabelValues(name, ns, deploymentType).Set(float64(cond.StoreReplicas))
+
+	// Deployment state: the series matching cond.State is 1, all others 0.
+	for _, s := range allDeploymentStates {
+		val := float64(0)
+		if cond.State == s {
+			val = 1
+		}
+		storeDeploymentState.WithLabelValues(name, ns, deploymentType, string(s)).Set(val)
+	}
+}
+
+// availableReplicas returns the available count from a ready string of the
+// form "available/desired". It returns 0 when the separator is missing or the
+// part before it is not a non-negative decimal integer.
+func availableReplicas(ready string) int {
+	before, _, found := strings.Cut(ready, "/")
+	if !found {
+		return 0
+	}
+	n, err := strconv.Atoi(strings.TrimSpace(before))
+	if err != nil || n < 0 {
+		return 0
+	}
+	return n
+}
+
+// UpdateScheduledTaskMetrics sets the suspended, last-run-status and
+// last-success gauges of a store from its scheduled task CronJob. A nil
+// cronJob sets all three gauges to 0.
+func UpdateScheduledTaskMetrics(store *v1.Store, cronJob *batchv1.CronJob) {
+	name := store.Name
+	ns := store.Namespace
+
+	var suspended, lastRun, lastSuccess float64
+	if cronJob != nil {
+		if cronJob.Spec.Suspend != nil && *cronJob.Spec.Suspend {
+			suspended = 1
+		}
+		lastRun = scheduledTaskRunStatus(cronJob.Status)
+		if t := cronJob.Status.LastSuccessfulTime; t != nil {
+			lastSuccess = float64(t.Unix())
+		}
+	}
+
+	storeScheduledTaskSuspended.WithLabelValues(name, ns).Set(suspended)
+	storeScheduledTaskLastRunStatus.WithLabelValues(name, ns).Set(lastRun)
+	storeScheduledTaskLastSuccessTime.WithLabelValues(name, ns).Set(lastSuccess)
+}
+
+// scheduledTaskRunStatus classifies the latest run of a CronJob by comparing
+// LastSuccessfulTime with LastScheduleTime. It returns 1 when a success is
+// recorded that is not older than the last schedule time (or no schedule time
+// is recorded), -1 when a scheduled run has no such success and no job is
+// active, and 0 when a job is still active or the CronJob never ran.
+func scheduledTaskRunStatus(status batchv1.CronJobStatus) float64 {
+	succeeded, scheduled := status.LastSuccessfulTime, status.LastScheduleTime
+	switch {
+	case succeeded != nil && (scheduled == nil || !succeeded.Before(scheduled)):
+		return 1
+	case scheduled == nil:
+		// Never scheduled and never succeeded.
+		return 0
+	case len(status.Active) > 0:
+		// Scheduled after the last success, but a job is still running.
+		return 0
+	default:
+		// Scheduled after the last success and nothing is active: failed.
+		return -1
+	}
+}
+
+// RemoveStoreMetrics removes all metrics for a deleted store.
+func RemoveStoreMetrics(store *v1.Store) {
+	// Matching on store and namespace only drops every series of the store,
+	// whatever its remaining label values (state, image, deployment_type) are.
+	selector := prometheus.Labels{"store": store.Name, "namespace": store.Namespace}
+	gauges := []*prometheus.GaugeVec{
+		storeState,
+		storeCurrentImage,
+		storeDeploymentReplicasAvailable,
+		storeDeploymentReplicasDesired,
+		storeDeploymentState,
+		storeUsageDataConsent,
+		storeHPAEnabled,
+		storeHPAMinReplicas,
+		storeHPAMaxReplicas,
+		storeScheduledTaskSuspended,
+		storeScheduledTaskLastRunStatus,
+		storeScheduledTaskLastSuccessTime,
+	}
+	for _, g := range gauges {
+		g.DeletePartialMatch(selector)
+	}
+}