14 changes: 14 additions & 0 deletions .changes/unreleased/operator-Fixed-20260420-150000.yaml
@@ -0,0 +1,14 @@
project: operator
kind: Fixed
body: |
Fixed a SASL bootstrap-user password drift that left clients unable to authenticate
after the bootstrap user Secret was rotated. When the Secret was deleted — for example,
to migrate from a Helm-era Secret to operator-managed ownership — the operator
regenerated it with a fresh random password, but the running Redpanda cluster retained
the original password in its internal SCRAM DB because the sidecar configwatcher only
ever created the internal superuser and never updated it. The configwatcher now mirrors
the Secret's password into Redpanda's SCRAM DB via the admin API whenever its user-sync
runs — at pod start and on fsnotify events when the mounted Secret changes — so a
rotated bootstrap user Secret propagates into the running cluster and rpk keeps
authenticating after the next pod restart.
time: 2026-04-20T15:00:00.000000+00:00
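
The "at pod start and on fsnotify events" trigger above is what makes the rotation self-healing. A minimal sketch of what such a watch-and-sync loop can look like, assuming the fsnotify library; `watchAndSync`, `usersPath`, and `syncUsers` are illustrative names, not the operator's actual API:

package configwatchersketch

import (
	"context"
	"log"
	"path/filepath"

	"github.com/fsnotify/fsnotify"
)

// watchAndSync runs the user sync once at startup and again on every change
// to the mounted Secret. Kubernetes updates mounted Secrets with an atomic
// symlink swap, so the directory is watched rather than the file itself.
func watchAndSync(ctx context.Context, usersPath string, syncUsers func(context.Context, string)) error {
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		return err
	}
	defer watcher.Close()

	if err := watcher.Add(filepath.Dir(usersPath)); err != nil {
		return err
	}

	syncUsers(ctx, usersPath) // pod start

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case ev := <-watcher.Events:
			// The symlink swap surfaces as Create/Write events in the directory.
			if ev.Op&(fsnotify.Create|fsnotify.Write) != 0 {
				syncUsers(ctx, usersPath) // mounted Secret changed
			}
		case err := <-watcher.Errors:
			log.Printf("watch error: %v", err)
		}
	}
}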
61 changes: 61 additions & 0 deletions acceptance/features/bootstrap-user.feature
@@ -0,0 +1,61 @@
Feature: SASL bootstrap user secret lifecycle

# Regression test for a password-drift bug surfaced by users migrating from
# the legacy Helm deploy flow to operator-managed clusters.
#
# Scenario the user hit in production:
# 1. A Redpanda cluster was originally bootstrapped by the old Helm chart
# with a pre-existing `<fullname>-bootstrap-user` Secret holding a known
# password, referenced from `auth.sasl.bootstrapUser.secretKeyRef`.
# 2. During cleanup, the user deletes the pre-existing Secret and removes
# the `bootstrapUser` block from the CR, expecting the operator to take
# full ownership of the Secret (the documented "let the operator
# manage it" path).
# 3. The operator's render state looks for a default-named Secret, does not
# find it, and generates a new Secret with a freshly-randomized password.
# 4. Previously the running Redpanda kept the original password in its
# internal SCRAM DB because the sidecar configwatcher only ever *created*
# the internal superuser and never updated it. `rpk` inside any pod that
# restarted after the rotation then failed with
# `SASL_AUTHENTICATION_FAILED: Invalid credentials`.
#
# Fix: the configwatcher now mirrors the Secret's password into Redpanda's
# SCRAM DB on every sync via the admin API's `UpdateUser`, so a rotated
# bootstrap user Secret propagates into the running cluster.
@skip:gke @skip:aks @skip:eks
Scenario: Bootstrap user secret deleted and regenerated; rpk still authenticates
Given I apply Kubernetes manifest:
"""
---
apiVersion: cluster.redpanda.com/v1alpha2
kind: Redpanda
metadata:
name: bootstrap-regen
spec:
clusterSpec:
image:
repository: ${DEFAULT_REDPANDA_REPO}
tag: ${DEFAULT_REDPANDA_TAG}
statefulset:
replicas: 1
sideCars:
image:
tag: dev
repository: localhost/redpanda-operator
controllers:
image:
tag: dev
repository: localhost/redpanda-operator
external:
enabled: false
auth:
sasl:
enabled: true
"""
And cluster "bootstrap-regen" is stable with 1 nodes
And rpk is configured correctly in "bootstrap-regen" cluster
When I delete the bootstrap user secret for cluster "bootstrap-regen"
And the bootstrap user secret for cluster "bootstrap-regen" is regenerated with a new password
And I restart all pods in cluster "bootstrap-regen"
Then cluster "bootstrap-regen" is stable with 1 nodes
And rpk is configured correctly in "bootstrap-regen" cluster
109 changes: 109 additions & 0 deletions acceptance/steps/bootstrap_user.go
@@ -0,0 +1,109 @@
// Copyright 2026 Redpanda Data, Inc.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.md
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0

package steps

import (
"context"
"fmt"
"time"

"github.com/stretchr/testify/require"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

framework "github.com/redpanda-data/redpanda-operator/harpoon"
)

// bootstrapUserPasswordKey is used to stash the pre-deletion password on the
// scenario context so later steps can assert the operator actually rotated it.
type bootstrapUserPasswordKey string

func bootstrapUserSecretName(cluster string) string {
return fmt.Sprintf("%s-bootstrap-user", cluster)
}

// iDeleteTheBootstrapUserSecretForCluster records the current bootstrap user
// password and then deletes the Secret. The recorded password is stored on the
// returned context so a follow-up step can confirm the operator regenerates it
// with a *different* value.
func iDeleteTheBootstrapUserSecretForCluster(ctx context.Context, t framework.TestingT, cluster string) context.Context {
name := bootstrapUserSecretName(cluster)

var secret corev1.Secret
require.NoError(t, t.Get(ctx, t.ResourceKey(name), &secret))

password := string(secret.Data["password"])
require.NotEmpty(t, password, "bootstrap user secret %q has no password", name)

t.Logf("Recorded original bootstrap user password (length %d), deleting secret %q", len(password), name)
require.NoError(t, t.Delete(ctx, &secret))

require.Eventually(t, func() bool {
var check corev1.Secret
err := t.Get(ctx, t.ResourceKey(name), &check)
return apierrors.IsNotFound(err)
}, 30*time.Second, 2*time.Second, "secret %q was never deleted", name)

return context.WithValue(ctx, bootstrapUserPasswordKey(cluster), password)
}

// theBootstrapUserSecretForClusterIsRegenerated waits until the operator has
// recreated the Secret and confirms the new password differs from the recorded
// original. This guards against the test accidentally observing the pre-delete
// Secret before reconciliation runs.
func theBootstrapUserSecretForClusterIsRegenerated(ctx context.Context, t framework.TestingT, cluster string) {
name := bootstrapUserSecretName(cluster)
original, _ := ctx.Value(bootstrapUserPasswordKey(cluster)).(string)
require.NotEmpty(t, original, "no recorded bootstrap user password for cluster %q — did the delete step run?", cluster)

require.Eventually(t, func() bool {
var secret corev1.Secret
if err := t.Get(ctx, t.ResourceKey(name), &secret); err != nil {
t.Logf("waiting for secret %q to reappear: %v", name, err)
return false
}
current := string(secret.Data["password"])
if current == "" {
t.Logf("secret %q has no password yet", name)
return false
}
if current == original {
t.Logf("secret %q still holds the original password", name)
return false
}
t.Logf("secret %q now holds a regenerated password (length %d, differs from original)", name, len(current))
return true
}, 2*time.Minute, 5*time.Second, "secret %q was never regenerated with a different password", name)
}

// iRestartAllPodsInCluster deletes every Pod owned by the cluster's
// StatefulSet so that each replacement Pod's `RPK_USER` / `RPK_PASS` env vars
// are re-materialized from the current bootstrap user Secret. The StatefulSet
// controller takes care of bringing the replacement Pods back.
func iRestartAllPodsInCluster(ctx context.Context, t framework.TestingT, cluster string) {
var sts appsv1.StatefulSet
require.NoError(t, t.Get(ctx, t.ResourceKey(cluster), &sts))

selector, err := metav1.LabelSelectorAsSelector(sts.Spec.Selector)
require.NoError(t, err)

var pods corev1.PodList
require.NoError(t, t.List(ctx, &pods, client.InNamespace(t.Namespace()), client.MatchingLabelsSelector{Selector: selector}))
require.NotEmpty(t, pods.Items, "no pods found for cluster %q", cluster)

for i := range pods.Items {
pod := &pods.Items[i]
t.Logf("Deleting pod %q to force re-read of bootstrap user env vars", pod.Name)
require.NoError(t, t.Delete(ctx, pod))
}
}
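
These steps lean on the fact that env vars sourced from a Secret via secretKeyRef are resolved once, at container start, and never live-updated — hence the forced pod deletion. A sketch of how such wiring typically looks, under the assumption that the chart maps the bootstrap Secret into RPK_USER/RPK_PASS this way (the `username` key is an assumption; only the `password` key is read by the steps above):

package stepsketch

import corev1 "k8s.io/api/core/v1"

// bootstrapUserEnv illustrates why the restart is needed: these secretKeyRef
// values are captured when the container starts, so a rotated Secret only
// reaches RPK_USER/RPK_PASS after the pod is recreated.
func bootstrapUserEnv(secretName string) []corev1.EnvVar {
	ref := func(key string) *corev1.EnvVarSource {
		return &corev1.EnvVarSource{
			SecretKeyRef: &corev1.SecretKeySelector{
				LocalObjectReference: corev1.LocalObjectReference{Name: secretName},
				Key:                  key,
			},
		}
	}
	return []corev1.EnvVar{
		{Name: "RPK_USER", ValueFrom: ref("username")}, // key name assumed
		{Name: "RPK_PASS", ValueFrom: ref("password")},
	}
}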
5 changes: 5 additions & 0 deletions acceptance/steps/register.go
@@ -162,6 +162,11 @@ func init() {
framework.RegisterStep(`^service "([^"]*)" should not have field managers:$`, checkResourceNoFieldManagers)
framework.RegisterStep(`^cluster "([^"]*)" should have sync error:$`, checkClusterHasSyncError)

// Bootstrap user lifecycle steps
framework.RegisterStep(`^I delete the bootstrap user secret for cluster "([^"]*)"$`, iDeleteTheBootstrapUserSecretForCluster)
framework.RegisterStep(`^the bootstrap user secret for cluster "([^"]*)" is regenerated with a new password$`, theBootstrapUserSecretForClusterIsRegenerated)
framework.RegisterStep(`^I restart all pods in cluster "([^"]*)"$`, iRestartAllPodsInCluster)

// Debug steps
framework.RegisterStep(`^I become debuggable$`, sleepALongTime)
}
33 changes: 29 additions & 4 deletions operator/internal/configwatcher/configwatcher.go
@@ -200,11 +200,14 @@ func (w *ConfigWatcher) SyncUsers(ctx context.Context, path string) {

w.log.Info("synchronizing users in file", "file", path)

// sync our internal superuser first. We mirror the Secret's password
// into Redpanda's SCRAM DB on every sync so that a rotated bootstrap
// user Secret (e.g. the operator regenerating it after it was deleted)
// actually propagates into the running cluster. syncInternalUser never
// falls back to delete-and-recreate — dropping the internal superuser
// even briefly could strand the operator.
internalSuperuser, password, mechanism := getInternalUser()
w.syncInternalUser(ctx, internalSuperuser, password, mechanism)

users := []string{internalSuperuser}

@@ -248,6 +251,28 @@ func (w *ConfigWatcher) setSuperusers(ctx context.Context, users []string) {
}
}

// syncInternalUser ensures Redpanda's internal superuser record matches the
// password currently mounted for the pod. Unlike syncUser, this path never
// falls back to a delete/recreate: dropping the internal superuser — even
// for the brief window between DeleteUser and CreateUser — could strand the
// operator. Sending UpdateUser with the current password is idempotent on
// the Redpanda side, so this is safe to invoke on every sync.
func (w *ConfigWatcher) syncInternalUser(ctx context.Context, user, password, mechanism string) {
w.log.Info("synchronizing internal user", "user", user)

err := w.adminClient.CreateUser(ctx, user, password, mechanism)
if err == nil {
return
}
if !strings.Contains(err.Error(), "already exists") {
w.log.Error(err, "could not create internal user", "user", user)
return
}
if err := w.adminClient.UpdateUser(ctx, user, password, mechanism); err != nil {
w.log.Error(err, "could not update internal user password", "user", user)
}
}

func (w *ConfigWatcher) syncUser(ctx context.Context, user, password, mechanism string, recreate bool) {
w.log.Info("synchronizing user", "user", user)

49 changes: 49 additions & 0 deletions operator/internal/configwatcher/configwatcher_test.go
@@ -14,13 +14,16 @@ import (
"fmt"
"strings"
"testing"
"time"

"github.com/go-logr/logr/testr"
"github.com/redpanda-data/common-go/rpadmin"
"github.com/spf13/afero"
"github.com/stretchr/testify/require"
"github.com/testcontainers/testcontainers-go"
"github.com/testcontainers/testcontainers-go/modules/redpanda"
"github.com/twmb/franz-go/pkg/kgo"
"github.com/twmb/franz-go/pkg/sasl/scram"
"sigs.k8s.io/controller-runtime/pkg/log"

"github.com/redpanda-data/redpanda-operator/operator/internal/configwatcher"
@@ -40,6 +43,8 @@ func TestConfigWatcher(t *testing.T) {
ctx,
"redpandadata/redpanda:v24.2.4",
redpanda.WithSuperusers("user"),
redpanda.WithEnableSASL(),
redpanda.WithEnableKafkaAuthorization(),
testcontainers.WithEnv(map[string]string{
"RP_BOOTSTRAP_USER": fmt.Sprintf("%s:%s:%s", user, password, saslMechanism),
}),
@@ -116,6 +121,31 @@

require.ElementsMatch(t, superusers, clusterUsers)

// Simulate the bootstrap user Secret being rotated (the operator
// regenerating it after it was deleted). getInternalUser() reads the
// password out of RPK_PASS on every sync, so flipping the env var and
// re-running SyncUsers is enough to exercise the rotation path.
//
// Without the fix, SyncUsers would call CreateUser, see "already
// exists", and return — leaving Redpanda's SCRAM DB pointed at the
// original password forever. With the fix, it follows up with
// UpdateUser so the rotated password actually takes effect. We
// validate via a Kafka SASL handshake rather than the admin API:
// the configwatcher's admin client authenticates with basic-auth
// credentials derived from the rpk config, so a successful admin
// call wouldn't prove that the SCRAM DB itself changed.
const rotatedPassword = "rotated-password-after-secret-regen"
t.Setenv("RPK_PASS", rotatedPassword)

watcher.SyncUsers(ctx, "/etc/secret/users/users.txt")

kafkaBroker, err := container.KafkaSeedBroker(ctx)
require.NoError(t, err)

require.Error(t, kafkaSASLHandshake(ctx, kafkaBroker, user, password),
"original password must no longer authenticate after rotation")
require.NoError(t, kafkaSASLHandshake(ctx, kafkaBroker, user, rotatedPassword),
"rotated password must authenticate after SyncUsers propagates it")

cancel()

select {
@@ -142,3 +172,22 @@ rpk:
func createUserLine(user, password, mechanism string) string {
return user + ":" + password + ":" + mechanism
}

// kafkaSASLHandshake opens a short-lived kgo client against the Kafka listener
// with SCRAM-SHA-512 credentials and pings each seed broker. The SASL
// handshake runs as part of broker connection setup, so a non-nil return
// means the credentials were rejected (or the broker was unreachable).
func kafkaSASLHandshake(ctx context.Context, broker, user, password string) error {
client, err := kgo.NewClient(
kgo.SeedBrokers(broker),
kgo.SASL((&scram.Auth{User: user, Pass: password}).AsSha512Mechanism()),
)
if err != nil {
return err
}
defer client.Close()

pingCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
return client.Ping(pingCtx)
}
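
One helper the test depends on but this diff doesn't show is getInternalUser. The comments in the test imply it re-reads the sidecar's credentials from the environment on every sync, which is why flipping RPK_PASS is enough to simulate a rotation. A rough sketch of that assumed shape — the mechanism env var name is a guess; only RPK_USER and RPK_PASS appear elsewhere in this PR:

package configwatchersketch

import "os"

// getInternalUser (assumed shape): credentials are read from the environment
// on every call rather than cached, so SyncUsers always sees the password
// currently mounted for the pod.
func getInternalUser() (user, password, mechanism string) {
	return os.Getenv("RPK_USER"),
		os.Getenv("RPK_PASS"),
		os.Getenv("RPK_SASL_MECHANISM") // env var name assumed
}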