diff --git a/.changes/unreleased/operator-Fixed-20260420-150000.yaml b/.changes/unreleased/operator-Fixed-20260420-150000.yaml
new file mode 100644
index 000000000..b5a961284
--- /dev/null
+++ b/.changes/unreleased/operator-Fixed-20260420-150000.yaml
@@ -0,0 +1,14 @@
+project: operator
+kind: Fixed
+body: |
+  Fixed a SASL bootstrap-user password drift that left rpk unable to authenticate after
+  the bootstrap user Secret was rotated. When the Secret was deleted — for example, to
+  migrate from a Helm-era Secret to operator-managed ownership — the operator regenerated
+  it with a fresh random password, but the running Redpanda cluster retained the original
+  password in its internal SCRAM DB because the sidecar configwatcher only ever created
+  the internal superuser and never updated its password. The configwatcher now mirrors the
+  Secret's password into Redpanda's SCRAM DB via the admin API whenever its user-sync
+  runs — at pod start and on fsnotify events when the mounted Secret changes — so a
+  rotated bootstrap user Secret propagates into the running cluster and rpk keeps
+  authenticating after the next pod restart.
+time: 2026-04-20T15:00:00.000000+00:00
diff --git a/acceptance/features/bootstrap-user.feature b/acceptance/features/bootstrap-user.feature
new file mode 100644
index 000000000..d597aa3f0
--- /dev/null
+++ b/acceptance/features/bootstrap-user.feature
@@ -0,0 +1,61 @@
+Feature: SASL bootstrap user secret lifecycle
+
+  # Regression test for a password-drift bug surfaced by users migrating from
+  # the legacy Helm deploy flow to operator-managed clusters.
+  #
+  # Scenario the user hit in production:
+  #   1. A Redpanda cluster was originally bootstrapped by the old Helm chart
+  #      with a pre-existing `-bootstrap-user` Secret holding a known
+  #      password, referenced from `auth.sasl.bootstrapUser.secretKeyRef`.
+  #   2. During cleanup, the user deletes the pre-existing Secret and removes
+  #      the `bootstrapUser` block from the CR, expecting the operator to take
+  #      full ownership of the Secret (the documented "let the operator
+  #      manage it" path).
+  #   3. The operator's render state looks for a default-named Secret, does not
+  #      find it, and generates a new Secret with a freshly randomized password.
+  #   4. Previously the running Redpanda kept the original password in its
+  #      internal SCRAM DB because the sidecar configwatcher only ever *created*
+  #      the internal superuser and never updated it. `rpk` inside any pod that
+  #      restarted after the rotation then failed with
+  #      `SASL_AUTHENTICATION_FAILED: Invalid credentials`.
+  #
+  # Fix: the configwatcher now mirrors the Secret's password into Redpanda's
+  # SCRAM DB on every sync via the admin API's `UpdateUser`, so a rotated
+  # bootstrap user Secret propagates into the running cluster.
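+  #
+  # The step implementations referenced by this scenario live in
+  # acceptance/steps/bootstrap_user.go. They assume the operator's default naming,
+  # i.e. a Secret called "<cluster>-bootstrap-user" whose password is stored under
+  # its "password" key.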
+ @skip:gke @skip:aks @skip:eks + Scenario: Bootstrap user secret deleted and regenerated; rpk still authenticates + Given I apply Kubernetes manifest: + """ + --- + apiVersion: cluster.redpanda.com/v1alpha2 + kind: Redpanda + metadata: + name: bootstrap-regen + spec: + clusterSpec: + image: + repository: ${DEFAULT_REDPANDA_REPO} + tag: ${DEFAULT_REDPANDA_TAG} + statefulset: + replicas: 1 + sideCars: + image: + tag: dev + repository: localhost/redpanda-operator + controllers: + image: + tag: dev + repository: localhost/redpanda-operator + external: + enabled: false + auth: + sasl: + enabled: true + """ + And cluster "bootstrap-regen" is stable with 1 nodes + And rpk is configured correctly in "bootstrap-regen" cluster + When I delete the bootstrap user secret for cluster "bootstrap-regen" + And the bootstrap user secret for cluster "bootstrap-regen" is regenerated with a new password + And I restart all pods in cluster "bootstrap-regen" + Then cluster "bootstrap-regen" is stable with 1 nodes + And rpk is configured correctly in "bootstrap-regen" cluster diff --git a/acceptance/steps/bootstrap_user.go b/acceptance/steps/bootstrap_user.go new file mode 100644 index 000000000..08df4f3f9 --- /dev/null +++ b/acceptance/steps/bootstrap_user.go @@ -0,0 +1,109 @@ +// Copyright 2026 Redpanda Data, Inc. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.md +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0 + +package steps + +import ( + "context" + "fmt" + "time" + + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + framework "github.com/redpanda-data/redpanda-operator/harpoon" +) + +// bootstrapUserPasswordKey is used to stash the pre-deletion password on the +// scenario context so later steps can assert the operator actually rotated it. +type bootstrapUserPasswordKey string + +func bootstrapUserSecretName(cluster string) string { + return fmt.Sprintf("%s-bootstrap-user", cluster) +} + +// iDeleteTheBootstrapUserSecretForCluster records the current bootstrap user +// password and then deletes the Secret. The recorded password is stored on the +// returned context so a follow-up step can confirm the operator regenerates it +// with a *different* value. 
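+// The recorded value is keyed per cluster (see bootstrapUserPasswordKey above), so
+// scenarios that touch more than one cluster within the same context do not collide.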
+func iDeleteTheBootstrapUserSecretForCluster(ctx context.Context, t framework.TestingT, cluster string) context.Context { + name := bootstrapUserSecretName(cluster) + + var secret corev1.Secret + require.NoError(t, t.Get(ctx, t.ResourceKey(name), &secret)) + + password := string(secret.Data["password"]) + require.NotEmpty(t, password, "bootstrap user secret %q has no password", name) + + t.Logf("Recorded original bootstrap user password (length %d), deleting secret %q", len(password), name) + require.NoError(t, t.Delete(ctx, &secret)) + + require.Eventually(t, func() bool { + var check corev1.Secret + err := t.Get(ctx, t.ResourceKey(name), &check) + return apierrors.IsNotFound(err) + }, 30*time.Second, 2*time.Second, "secret %q was never deleted", name) + + return context.WithValue(ctx, bootstrapUserPasswordKey(cluster), password) +} + +// theBootstrapUserSecretForClusterIsRegenerated waits until the operator has +// recreated the Secret and confirms the new password differs from the recorded +// original. This guards against the test accidentally observing the pre-delete +// Secret before reconciliation runs. +func theBootstrapUserSecretForClusterIsRegenerated(ctx context.Context, t framework.TestingT, cluster string) { + name := bootstrapUserSecretName(cluster) + original, _ := ctx.Value(bootstrapUserPasswordKey(cluster)).(string) + require.NotEmpty(t, original, "no recorded bootstrap user password for cluster %q — did the delete step run?", cluster) + + require.Eventually(t, func() bool { + var secret corev1.Secret + if err := t.Get(ctx, t.ResourceKey(name), &secret); err != nil { + t.Logf("waiting for secret %q to reappear: %v", name, err) + return false + } + current := string(secret.Data["password"]) + if current == "" { + t.Logf("secret %q has no password yet", name) + return false + } + if current == original { + t.Logf("secret %q still holds the original password", name) + return false + } + t.Logf("secret %q now holds a regenerated password (length %d, differs from original)", name, len(current)) + return true + }, 2*time.Minute, 5*time.Second, "secret %q was never regenerated with a different password", name) +} + +// iRestartAllPodsInCluster deletes every Pod owned by the cluster's +// StatefulSet so that each replacement Pod's `RPK_USER` / `RPK_PASS` env vars +// are re-materialized from the current bootstrap user Secret. The StatefulSet +// controller takes care of bringing the replacement Pods back. 
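+// Note that this step only issues the deletions; it does not wait for the
+// replacements to become Ready. The scenario's follow-up "cluster ... is stable"
+// step provides that wait.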
+func iRestartAllPodsInCluster(ctx context.Context, t framework.TestingT, cluster string) { + var sts appsv1.StatefulSet + require.NoError(t, t.Get(ctx, t.ResourceKey(cluster), &sts)) + + selector, err := metav1.LabelSelectorAsSelector(sts.Spec.Selector) + require.NoError(t, err) + + var pods corev1.PodList + require.NoError(t, t.List(ctx, &pods, client.InNamespace(t.Namespace()), client.MatchingLabelsSelector{Selector: selector})) + require.NotEmpty(t, pods.Items, "no pods found for cluster %q", cluster) + + for i := range pods.Items { + pod := &pods.Items[i] + t.Logf("Deleting pod %q to force re-read of bootstrap user env vars", pod.Name) + require.NoError(t, t.Delete(ctx, pod)) + } +} diff --git a/acceptance/steps/register.go b/acceptance/steps/register.go index cd8179ddc..7186aa439 100644 --- a/acceptance/steps/register.go +++ b/acceptance/steps/register.go @@ -162,6 +162,11 @@ func init() { framework.RegisterStep(`^service "([^"]*)" should not have field managers:$`, checkResourceNoFieldManagers) framework.RegisterStep(`^cluster "([^"]*)" should have sync error:$`, checkClusterHasSyncError) + // Bootstrap user lifecycle steps + framework.RegisterStep(`^I delete the bootstrap user secret for cluster "([^"]*)"$`, iDeleteTheBootstrapUserSecretForCluster) + framework.RegisterStep(`^the bootstrap user secret for cluster "([^"]*)" is regenerated with a new password$`, theBootstrapUserSecretForClusterIsRegenerated) + framework.RegisterStep(`^I restart all pods in cluster "([^"]*)"$`, iRestartAllPodsInCluster) + // Debug steps framework.RegisterStep(`^I become debuggable$`, sleepALongTime) } diff --git a/operator/internal/configwatcher/configwatcher.go b/operator/internal/configwatcher/configwatcher.go index 4a24e0f2a..e8f0dc24d 100644 --- a/operator/internal/configwatcher/configwatcher.go +++ b/operator/internal/configwatcher/configwatcher.go @@ -200,11 +200,14 @@ func (w *ConfigWatcher) SyncUsers(ctx context.Context, path string) { w.log.Info("synchronizing users in file", "file", path) - // sync our internal superuser first + // sync our internal superuser first. We mirror the Secret's password + // into Redpanda's SCRAM DB on every sync so that a rotated bootstrap + // user Secret (e.g. the operator regenerating it after it was deleted) + // actually propagates into the running cluster. syncInternalUser never + // falls back to delete-and-recreate — dropping the internal superuser + // even briefly could strand the operator. internalSuperuser, password, mechanism := getInternalUser() - // the internal user should only ever be created once, so don't - // update its password ever. - w.syncUser(ctx, internalSuperuser, password, mechanism, false) + w.syncInternalUser(ctx, internalSuperuser, password, mechanism) users := []string{internalSuperuser} @@ -248,6 +251,28 @@ func (w *ConfigWatcher) setSuperusers(ctx context.Context, users []string) { } } +// syncInternalUser ensures Redpanda's internal superuser record matches the +// password currently mounted for the pod. Unlike syncUser, this path never +// falls back to a delete/recreate: dropping the internal superuser — even +// for the brief window between DeleteUser and CreateUser — could strand the +// operator. Sending UpdateUser with the current password is idempotent on +// the Redpanda side, so this is safe to invoke on every sync. 
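+// If CreateUser fails for any reason other than the user already existing, the
+// error is only logged and this sync gives up; the next run (at pod start or on
+// an fsnotify event) retries.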
+func (w *ConfigWatcher) syncInternalUser(ctx context.Context, user, password, mechanism string) { + w.log.Info("synchronizing internal user", "user", user) + + err := w.adminClient.CreateUser(ctx, user, password, mechanism) + if err == nil { + return + } + if !strings.Contains(err.Error(), "already exists") { + w.log.Error(err, "could not create internal user", "user", user) + return + } + if err := w.adminClient.UpdateUser(ctx, user, password, mechanism); err != nil { + w.log.Error(err, "could not update internal user password", "user", user) + } +} + func (w *ConfigWatcher) syncUser(ctx context.Context, user, password, mechanism string, recreate bool) { w.log.Info("synchronizing user", "user", user) diff --git a/operator/internal/configwatcher/configwatcher_test.go b/operator/internal/configwatcher/configwatcher_test.go index afba61940..066e1175d 100644 --- a/operator/internal/configwatcher/configwatcher_test.go +++ b/operator/internal/configwatcher/configwatcher_test.go @@ -14,6 +14,7 @@ import ( "fmt" "strings" "testing" + "time" "github.com/go-logr/logr/testr" "github.com/redpanda-data/common-go/rpadmin" @@ -21,6 +22,8 @@ import ( "github.com/stretchr/testify/require" "github.com/testcontainers/testcontainers-go" "github.com/testcontainers/testcontainers-go/modules/redpanda" + "github.com/twmb/franz-go/pkg/kgo" + "github.com/twmb/franz-go/pkg/sasl/scram" "sigs.k8s.io/controller-runtime/pkg/log" "github.com/redpanda-data/redpanda-operator/operator/internal/configwatcher" @@ -40,6 +43,8 @@ func TestConfigWatcher(t *testing.T) { ctx, "redpandadata/redpanda:v24.2.4", redpanda.WithSuperusers("user"), + redpanda.WithEnableSASL(), + redpanda.WithEnableKafkaAuthorization(), testcontainers.WithEnv(map[string]string{ "RP_BOOTSTRAP_USER": fmt.Sprintf("%s:%s:%s", user, password, saslMechanism), }), @@ -116,6 +121,31 @@ func TestConfigWatcher(t *testing.T) { require.ElementsMatch(t, superusers, clusterUsers) + // Simulate the bootstrap user Secret being rotated (the operator + // regenerating it after it was deleted). getInternalUser() reads the + // password out of RPK_PASS on every sync, so flipping the env var and + // re-running SyncUsers is enough to exercise the rotation path. + // + // Without the fix, SyncUsers would call CreateUser, see "already + // exists", and return — leaving Redpanda's SCRAM DB pointed at the + // original password forever. With the fix, it follows up with + // UpdateUser so the rotated password actually takes effect. We + // validate via a Kafka SASL handshake because admin-API basic auth + // is derivable from rpk-config (the only way the configwatcher + // authenticates) and therefore can't prove the SCRAM DB changed. + const rotatedPassword = "rotated-password-after-secret-regen" + t.Setenv("RPK_PASS", rotatedPassword) + + watcher.SyncUsers(ctx, "/etc/secret/users/users.txt") + + kafkaBroker, err := container.KafkaSeedBroker(ctx) + require.NoError(t, err) + + require.Error(t, kafkaSASLHandshake(ctx, kafkaBroker, user, password), + "original password must no longer authenticate after rotation") + require.NoError(t, kafkaSASLHandshake(ctx, kafkaBroker, user, rotatedPassword), + "rotated password must authenticate after SyncUsers propagates it") + cancel() select { @@ -142,3 +172,22 @@ rpk: func createUserLine(user, password, mechanism string) string { return user + ":" + password + ":" + mechanism } + +// kafkaSASLHandshake opens a short-lived kgo client against the Kafka listener +// with SCRAM-SHA-512 credentials and pings each seed broker. 
The SASL +// handshake runs as part of broker connection setup, so a non-nil return +// means the credentials were rejected (or the broker was unreachable). +func kafkaSASLHandshake(ctx context.Context, broker, user, password string) error { + client, err := kgo.NewClient( + kgo.SeedBrokers(broker), + kgo.SASL((&scram.Auth{User: user, Pass: password}).AsSha512Mechanism()), + ) + if err != nil { + return err + } + defer client.Close() + + pingCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + return client.Ping(pingCtx) +}