9 changes: 9 additions & 0 deletions Taskfile.yml
@@ -384,13 +384,21 @@ tasks:
- quay.io/jetstack/cert-manager-controller:{{.SECOND_TEST_CERTMANAGER_VERSION}}
- quay.io/jetstack/cert-manager-cainjector:{{.SECOND_TEST_CERTMANAGER_VERSION}}
- quay.io/jetstack/cert-manager-webhook:{{.SECOND_TEST_CERTMANAGER_VERSION}}
# cert-manager v1.17.1 is used inside vclusters (pkg/testutil.CertManagerVersion).
- quay.io/jetstack/cert-manager-controller:v1.17.1
- quay.io/jetstack/cert-manager-cainjector:v1.17.1
- quay.io/jetstack/cert-manager-webhook:v1.17.1
- quay.io/jetstack/cert-manager-startupapicheck:v1.17.1
- '{{.TEST_REDPANDA_REPO}}:{{.TEST_REDPANDA_VERSION}}'
- '{{.DEFAULT_TEST_UPGRADE_REDPANDA_REPO}}:{{.TEST_UPGRADE_REDPANDA_VERSION}}'
- redpandadata/redpanda-operator:v25.1.3
- redpandadata/redpanda-operator:v25.2.2
- redpandadata/redpanda-operator:v25.3.1
- redpandadata/redpanda-operator:{{.TEST_UPGRADE_OPERATOR_VERSION}}
- ghcr.io/loft-sh/vcluster-pro:{{.TEST_VCLUSTER_VERSION}}
# vcluster's embedded k8s distro — the tag is hardcoded in
# pkg/vcluster/vcluster.go's DefaultValues.
- ghcr.io/loft-sh/kubernetes:v1.33.4
- registry.k8s.io/kube-controller-manager:{{.TEST_KUBE_VERSION}}
- registry.k8s.io/kube-apiserver:{{.TEST_KUBE_VERSION}}
- coredns/coredns:{{.TEST_COREDNS_VERSION}}
@@ -408,6 +416,7 @@ tasks:
- redpandadata/redpanda:v25.2.1
- redpandadata/redpanda:v25.2.11
- redpandadata/redpanda:v26.1.1
- redpandadata/redpanda:v26.1.5

cmds:
- |
71 changes: 71 additions & 0 deletions acceptance/features/stretch-cluster-ghost-node-ejection.feature
@@ -0,0 +1,71 @@
@multicluster
@serial
Feature: StretchCluster ghost node ejection

When a node in a stretch cluster becomes permanently unreachable, Redpanda's
continuous data balancing should automatically decommission (eject) the ghost
node after the configured timeouts elapse.

The partition balancer's default timeouts are measured in hours, so we scale
all the interrelated tunables down together per Redpanda's validators.cc
rules; scaling one without the others breaks implicit assumptions. The
ejection timer runs for partition_autobalancing_node_autodecommission_timeout_sec
from the moment a quorum of live brokers agrees the node has been missing for
too long, so the total expected ejection time is ~90 seconds after the outage
begins.

@skip:gke @skip:aks @skip:eks
Scenario: Ghost node is ejected after a regional outage
Given I create a multicluster operator named "ghost" with 3 nodes
And I apply a multicluster Kubernetes manifest to "ghost":
"""
---
apiVersion: cluster.redpanda.com/v1alpha2
kind: StretchCluster
metadata:
name: cluster
namespace: default
spec:
external:
enabled: false
rbac:
enabled: true
config:
cluster:
partition_autobalancing_mode: continuous
partition_autobalancing_node_availability_timeout_sec: 45
partition_autobalancing_node_autodecommission_timeout_sec: 90
partition_autobalancing_tick_interval_ms: 10000
health_monitor_tick_interval: 5000
node_status_interval: 5000
"""
And I apply a NodePool Kubernetes manifest to "ghost":
"""
spec:
clusterRef:
group: cluster.redpanda.com
kind: StretchCluster
name: cluster
replicas: 1
image:
repository: redpandadata/redpanda
tag: v26.1.5
sidecarImage:
repository: localhost/redpanda-operator
tag: dev
services:
perPod:
remote:
enabled: false
"""
And I expect 3 statefulsets in 3 kubernetes cluster to be created and eventually ready
And I expect all 3 NodePools in "ghost" to be eventually bound and deployed
# Verify the cluster starts healthy with all 3 nodes.
When I execute "rpk cluster health" command in the statefulset container in each cluster
Then the cluster health output should show 3 nodes across all clusters in "ghost"
# Simulate a regional outage by taking a non-controller region offline. We
# avoid the controller region so the remaining cluster retains a stable
# controller for the decommission decision.
When I take a non-controller region of "ghost" offline
# Wait for the ghost node to be ejected. With our config this takes ~90s
# (partition_autobalancing_node_autodecommission_timeout_sec).
Then the cluster health output should eventually show 2 nodes in the remaining clusters of "ghost"
2 changes: 2 additions & 0 deletions acceptance/main_test.go
@@ -82,6 +82,8 @@ var setupSuite = sync.OnceValues(func() (*framework.Suite, error) {
"redpandadata/redpanda-operator:v25.3.1",
"redpandadata/redpanda:v25.1.1",
"redpandadata/redpanda:v25.2.1",
// Image used by ghost node ejection feature.
"redpandadata/redpanda:v26.1.5",
// Images used by upgrade and upgrade-regressions features.
"redpandadata/redpanda:v25.2.11",
"redpandadata/redpanda-unstable:v25.3.1-rc4",
5 changes: 5 additions & 0 deletions acceptance/steps/register.go
@@ -129,6 +129,11 @@ func init() {
framework.RegisterStep(`^the "([^"]*)" region of "([^"]*)" should reflect the updated StretchCluster spec$`, regionReflectsUpdatedSpec)
framework.RegisterStep(`^the operator in the "([^"]*)" region of "([^"]*)" should eventually be running and reconciling$`, operatorInRegionRecovering)

// Ghost node ejection scenario steps
framework.RegisterStep(`^I take a non-controller region of "([^"]*)" offline$`, takeNonControllerRegionOffline)
framework.RegisterStep(`^the cluster health output should show (\d+) nodes across all clusters in "([^"]*)"$`, expectClusterHealthNodeCount)
framework.RegisterStep(`^the cluster health output should eventually show (\d+) nodes in the remaining clusters of "([^"]*)"$`, expectEventualNodeCountInRemainingClusters)

// Scaling scenario steps
framework.RegisterStep(`^cluster "([^"]*)" should be stable with (\d+) nodes$`, checkClusterStableWithCount)
framework.RegisterStep(`^cluster "([^"]*)" is stable with (\d+) nodes$`, checkClusterStableWithCount)
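The ghost-node registrations above point at step implementations that live elsewhere in this PR and are not shown in this hunk. As a rough illustration of how the region-offline step could work given the host-node pinning added in stretch.go, here is a minimal sketch: the step signature, the loadNodes and isController helpers, and the in-place mutation of the offline flag are assumptions not taken from this diff, while `k3d node delete` is the real k3d CLI command.

```go
// Hypothetical sketch only; assumes the framework passes the captured cluster
// name to the handler and that loadNodes returns the vclusterNodes stashed by
// stashNodes (os/exec import assumed).
func takeNonControllerRegionOffline(ctx context.Context, clusterName string) {
	t := framework.T(ctx)
	nodes := loadNodes(ctx, clusterName) // hypothetical: retrieve stashed vclusterNodes

	for _, node := range nodes {
		if node.isController(ctx) { // hypothetical controller-leader check
			continue
		}
		t.Logf("deleting k3d node %q to take region %q offline", node.k3dNodeName, node.logicalName)
		// Deleting the pinned host node takes down every workload in this
		// region at once (operator, Redpanda broker, cert-manager).
		out, err := exec.CommandContext(ctx, "k3d", "node", "delete", node.k3dNodeName).CombinedOutput()
		require.NoError(t, err, string(out))
		node.offline = true // ApplyAll, DeleteAll, and cleanup helpers skip offline regions
		return
	}
	t.Fatalf("no non-controller region found in %q", clusterName)
}
```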
88 changes: 83 additions & 5 deletions acceptance/steps/stretch.go
@@ -243,6 +243,10 @@ func (v vclusterNodes) ApplyNodepoolsWithDifferentNamePerCluster(ctx context.Con
func (v vclusterNodes) DeleteNodepools(ctx context.Context, manifest *godog.DocString) {
t := framework.T(ctx)
for _, node := range v {
if node.offline {
t.Logf("skipping NodePool cleanup for offline region %q", node.logicalName)
continue
}
fullManifest := nodepoolManifest(nameMap[node.logicalName], manifest)
t.Logf("applying manifest to %q", node.Name())
require.NoError(t, node.KubectlDelete(ctx, fullManifest))
@@ -252,6 +256,10 @@ func (v vclusterNodes) DeleteNodepools(ctx context.Context, manifest *godog.DocS
func (v vclusterNodes) DeleteAll(ctx context.Context, manifest []byte) {
t := framework.T(ctx)
for _, node := range v {
if node.offline {
t.Logf("skipping manifest cleanup for offline region %q", node.logicalName)
continue
}
require.NoError(t, node.KubectlDelete(ctx, manifest))
}
}
@@ -300,6 +308,11 @@ type vclusterNode struct {
// offline is set to true when the region is intentionally taken offline for
// disaster-recovery tests. ApplyAll and similar helpers skip offline nodes.
offline bool
// k3dNodeName is the name of the k3d agent node this vcluster's workloads
// are pinned to via `sync.fromHost.nodes.selector.labels`. The
// ghost-node-ejection test deletes this node to simulate a regional
// outage; it's empty if pinning wasn't applied.
k3dNodeName string
}

func (n *vclusterNode) APIServer() string {
@@ -385,7 +398,8 @@ func createNetworkedVClusterOperators(ctx context.Context, t framework.TestingT,
redpandaLicense := os.Getenv(LicenseEnvVar)
require.NotEmpty(t, redpandaLicense, LicenseEnvVar+" env var must be set")

vclusters := createVClusters(ctx, t, clusters)
k3dNodeNames := pickK3dAgentNodes(ctx, t, clusters)
vclusters := createVClusters(ctx, t, clusters, k3dNodeNames)
assignOperatorServiceIPs(ctx, t, vclusters, namespace)
peers := bootstrapTLS(ctx, t, vclusters, namespace)
deployOperators(ctx, t, vclusters, namespace, redpandaLicense, peers)
@@ -399,8 +413,8 @@ func createNetworkedVClusterOperators(ctx context.Context, t framework.TestingT,
return stashNodes(ctx, clusterName, vclusters)
}

func createVClusters(ctx context.Context, t framework.TestingT, clusters int32) []*vclusterNode {
t.Logf("creating %d vclusters", clusters)
func createVClusters(ctx context.Context, t framework.TestingT, clusters int32, k3dNodeNames []string) []*vclusterNode {
t.Logf("creating %d vclusters pinned to k3d nodes %v", clusters, k3dNodeNames)

// Generate a unique per-test suffix so that vcluster host namespaces never
// collide when tests run in parallel or back-to-back (a terminating namespace
Expand All @@ -425,13 +439,16 @@ func createVClusters(ctx context.Context, t framework.TestingT, clusters int32)
// with a letter); apirand.String can return strings starting with digits.
actualName := fmt.Sprintf("vc-%s-%d", suffix, i)

vClusterValues := vcluster.DefaultValues + networkingValues(i, clusters, suffix)
vClusterValues := vcluster.DefaultValues +
networkingValues(i, clusters, suffix) +
pinningValues(k3dNodeNames[i])
cluster, err := vcluster.New(ctx, t.RestConfig(), vcluster.WithName(actualName), vcluster.WithValues(helm.RawYAML(vClusterValues)))
require.NoError(t, err)
scheme := t.Scheme()
cluster.SetScheme(scheme)

t.Logf("finished creating vcluster %d (logical: %q, actual: %q)", i+1, logicalName, cluster.Name())
t.Logf("finished creating vcluster %d (logical: %q, actual: %q, pinned to k3d node %q)",
i+1, logicalName, cluster.Name(), k3dNodeNames[i])

cleanupWrapper(t, func(ctx context.Context) {
if err := cluster.Delete(); err != nil {
@@ -453,6 +470,7 @@ func createVClusters(ctx context.Context, t framework.TestingT, clusters int32)
Cluster: cluster,
apiServer: fmt.Sprintf("https://%s", actualName),
logicalName: logicalName,
k3dNodeName: k3dNodeNames[i],
}
}(i)
}
@@ -461,6 +479,66 @@ func createVClusters(ctx context.Context, t framework.TestingT, clusters int32)
return nodes
}

// pickK3dAgentNodes returns `clusters` worker-node hostnames from the host
// k3d cluster so each vcluster can be pinned to a distinct host node. Uses
// the built-in `kubernetes.io/hostname` label — no extra labeling needed.
// Skips the control-plane node and fails if there are not enough workers.
func pickK3dAgentNodes(ctx context.Context, t framework.TestingT, clusters int32) []string {
hostClient, err := client.New(t.RestConfig(), client.Options{})
require.NoError(t, err)

var nodeList corev1.NodeList
require.NoError(t, hostClient.List(ctx, &nodeList))

var workerNodes []corev1.Node
for _, n := range nodeList.Items {
if _, isControlPlane := n.Labels["node-role.kubernetes.io/control-plane"]; isControlPlane {
continue
}
workerNodes = append(workerNodes, n)
}
require.GreaterOrEqual(t, int32(len(workerNodes)), clusters,
"need at least %d worker nodes in host cluster, got %d", clusters, len(workerNodes))

// Sort for deterministic assignment within a single test run.
slices.SortFunc(workerNodes, func(a, b corev1.Node) int {
return strings.Compare(a.Name, b.Name)
})

names := make([]string, clusters)
for i := int32(0); i < clusters; i++ {
names[i] = workerNodes[i].Name
}
return names
}

// pinningValues returns vcluster helm values that pin the control plane pod
// to a specific k3d node via `kubernetes.io/hostname` and filter the nodes
// visible inside the vcluster to just that same host node. Combined with the
// virtual scheduler, this ensures all synced workloads (operator, Redpanda,
// cert-manager) land on that single host node — so the ghost node ejection
// test can take them all down at once via `k3d node delete`.
func pinningValues(k3dNodeName string) string {
return fmt.Sprintf(`
controlPlane:
statefulSet:
scheduling:
nodeSelector:
kubernetes.io/hostname: %s
advanced:
virtualScheduler:
enabled: true
sync:
fromHost:
nodes:
enabled: true
selector:
all: false
labels:
kubernetes.io/hostname: %s
`, k3dNodeName, k3dNodeName)
}

// stretchClusterResourceName is the StretchCluster resource name used in all
// multicluster acceptance tests. Per-pod service names are prefixed with this
// value (e.g. "cluster-first-0").