9 changes: 9 additions & 0 deletions Taskfile.yml
@@ -384,13 +384,21 @@ tasks:
- quay.io/jetstack/cert-manager-controller:{{.SECOND_TEST_CERTMANAGER_VERSION}}
- quay.io/jetstack/cert-manager-cainjector:{{.SECOND_TEST_CERTMANAGER_VERSION}}
- quay.io/jetstack/cert-manager-webhook:{{.SECOND_TEST_CERTMANAGER_VERSION}}
# cert-manager v1.17.1 is used inside vclusters (pkg/testutil.CertManagerVersion).
- quay.io/jetstack/cert-manager-controller:v1.17.1
- quay.io/jetstack/cert-manager-cainjector:v1.17.1
- quay.io/jetstack/cert-manager-webhook:v1.17.1
- quay.io/jetstack/cert-manager-startupapicheck:v1.17.1
- '{{.TEST_REDPANDA_REPO}}:{{.TEST_REDPANDA_VERSION}}'
- '{{.DEFAULT_TEST_UPGRADE_REDPANDA_REPO}}:{{.TEST_UPGRADE_REDPANDA_VERSION}}'
- redpandadata/redpanda-operator:v25.1.3
- redpandadata/redpanda-operator:v25.2.2
- redpandadata/redpanda-operator:v25.3.1
- redpandadata/redpanda-operator:{{.TEST_UPGRADE_OPERATOR_VERSION}}
- ghcr.io/loft-sh/vcluster-pro:{{.TEST_VCLUSTER_VERSION}}
# vcluster's embedded k8s distro — the tag is hardcoded in
# pkg/vcluster/vcluster.go's DefaultValues.
- ghcr.io/loft-sh/kubernetes:v1.33.4
- registry.k8s.io/kube-controller-manager:{{.TEST_KUBE_VERSION}}
- registry.k8s.io/kube-apiserver:{{.TEST_KUBE_VERSION}}
- coredns/coredns:{{.TEST_COREDNS_VERSION}}
@@ -408,6 +416,7 @@ tasks:
- redpandadata/redpanda:v25.2.1
- redpandadata/redpanda:v25.2.11
- redpandadata/redpanda:v26.1.1
- redpandadata/redpanda:v26.1.5

cmds:
- |
71 changes: 71 additions & 0 deletions acceptance/features/stretch-cluster-ghost-node-ejection.feature
@@ -0,0 +1,71 @@
@multicluster
@serial
Feature: StretchCluster ghost node ejection

When a node in a stretch cluster becomes permanently unreachable, Redpanda's
continuous data balancing should automatically decommission (eject) the ghost
node after the configured timeouts elapse.

The partition balancer's default timeouts are measured in hours, so we scale
all the interrelated tunables down together per Redpanda's validators.cc
rules; scaling one without the others breaks implicit assumptions. The
ejection timer runs for partition_autobalancing_node_autodecommission_timeout_sec
from the moment a quorum of live brokers agrees the node has been missing for
too long, so the total expected ejection time is ~90 seconds after the outage
begins.

@skip:gke @skip:aks @skip:eks
Scenario: Ghost node is ejected after a regional outage
Given I create a multicluster operator named "ghost" with 3 nodes
And I apply a multicluster Kubernetes manifest to "ghost":
"""
---
apiVersion: cluster.redpanda.com/v1alpha2
kind: StretchCluster
metadata:
name: cluster
namespace: default
spec:
external:
enabled: false
rbac:
enabled: true
config:
cluster:
partition_autobalancing_mode: continuous
partition_autobalancing_node_availability_timeout_sec: 45
partition_autobalancing_node_autodecommission_timeout_sec: 90
partition_autobalancing_tick_interval_ms: 10000
health_monitor_tick_interval: 5000
node_status_interval: 5000
"""
And I apply a NodePool Kubernetes manifest to "ghost":
"""
spec:
clusterRef:
group: cluster.redpanda.com
kind: StretchCluster
name: cluster
replicas: 1
image:
repository: redpandadata/redpanda
tag: v26.1.5
sidecarImage:
repository: localhost/redpanda-operator
tag: dev
services:
perPod:
remote:
enabled: false
"""
And I expect 3 statefulsets in 3 kubernetes cluster to be created and eventually ready
And I expect all 3 NodePools in "ghost" to be eventually bound and deployed
# Verify the cluster starts healthy with all 3 nodes.
When I execute "rpk cluster health" command in the statefulset container in each cluster
Then the cluster health output should show 3 nodes across all clusters in "ghost"
# Simulate a regional outage by taking a non-controller region offline. We
# avoid the controller region so the remaining cluster retains a stable
# controller for the decommission decision.
When I take a non-controller region of "ghost" offline
# Wait for the ghost node to be ejected. With our config this takes ~90s
# (partition_autobalancing_node_autodecommission_timeout_sec).
Then the cluster health output should eventually show 2 nodes in the remaining clusters of "ghost"
2 changes: 2 additions & 0 deletions acceptance/main_test.go
@@ -82,6 +82,8 @@ var setupSuite = sync.OnceValues(func() (*framework.Suite, error) {
"redpandadata/redpanda-operator:v25.3.1",
"redpandadata/redpanda:v25.1.1",
"redpandadata/redpanda:v25.2.1",
// Image used by ghost node ejection feature.
"redpandadata/redpanda:v26.1.5",
// Images used by upgrade and upgrade-regressions features.
"redpandadata/redpanda:v25.2.11",
"redpandadata/redpanda-unstable:v25.3.1-rc4",
5 changes: 5 additions & 0 deletions acceptance/steps/register.go
@@ -129,6 +129,11 @@ func init() {
framework.RegisterStep(`^the "([^"]*)" region of "([^"]*)" should reflect the updated StretchCluster spec$`, regionReflectsUpdatedSpec)
framework.RegisterStep(`^the operator in the "([^"]*)" region of "([^"]*)" should eventually be running and reconciling$`, operatorInRegionRecovering)

// Ghost node ejection scenario steps
framework.RegisterStep(`^I take a non-controller region of "([^"]*)" offline$`, takeNonControllerRegionOffline)
framework.RegisterStep(`^the cluster health output should show (\d+) nodes across all clusters in "([^"]*)"$`, expectClusterHealthNodeCount)
framework.RegisterStep(`^the cluster health output should eventually show (\d+) nodes in the remaining clusters of "([^"]*)"$`, expectEventualNodeCountInRemainingClusters)

// Scaling scenario steps
framework.RegisterStep(`^cluster "([^"]*)" should be stable with (\d+) nodes$`, checkClusterStableWithCount)
framework.RegisterStep(`^cluster "([^"]*)" is stable with (\d+) nodes$`, checkClusterStableWithCount)
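The ghost-node registrations above point at step implementations that live elsewhere in this PR and are not shown in this hunk. As a rough illustration of how the region-offline step could work given the host-node pinning added in stretch.go, here is a minimal sketch: the step signature, the loadNodes and isController helpers, and the in-place mutation of the offline flag are assumptions not taken from this diff, while `k3d node delete` is the real k3d CLI command.

```go
// Hypothetical sketch only; assumes the framework passes the captured cluster
// name to the handler and that loadNodes returns the vclusterNodes stashed by
// stashNodes (os/exec import assumed).
func takeNonControllerRegionOffline(ctx context.Context, clusterName string) {
	t := framework.T(ctx)
	nodes := loadNodes(ctx, clusterName) // hypothetical: retrieve stashed vclusterNodes

	for _, node := range nodes {
		if node.isController(ctx) { // hypothetical controller-leader check
			continue
		}
		t.Logf("deleting k3d node %q to take region %q offline", node.k3dNodeName, node.logicalName)
		// Deleting the pinned host node takes down every workload in this
		// region at once (operator, Redpanda broker, cert-manager).
		out, err := exec.CommandContext(ctx, "k3d", "node", "delete", node.k3dNodeName).CombinedOutput()
		require.NoError(t, err, string(out))
		node.offline = true // ApplyAll, DeleteAll, and cleanup helpers skip offline regions
		return
	}
	t.Fatalf("no non-controller region found in %q", clusterName)
}
```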
88 changes: 83 additions & 5 deletions acceptance/steps/stretch.go
@@ -243,6 +243,10 @@ func (v vclusterNodes) ApplyNodepoolsWithDifferentNamePerCluster(ctx context.Con
func (v vclusterNodes) DeleteNodepools(ctx context.Context, manifest *godog.DocString) {
t := framework.T(ctx)
for _, node := range v {
if node.offline {
t.Logf("skipping NodePool cleanup for offline region %q", node.logicalName)
continue
}
fullManifest := nodepoolManifest(nameMap[node.logicalName], manifest)
t.Logf("applying manifest to %q", node.Name())
require.NoError(t, node.KubectlDelete(ctx, fullManifest))
@@ -252,6 +256,10 @@ func (v vclusterNodes) DeleteNodepools(ctx context.Context, manifest *godog.DocS
func (v vclusterNodes) DeleteAll(ctx context.Context, manifest []byte) {
t := framework.T(ctx)
for _, node := range v {
if node.offline {
t.Logf("skipping manifest cleanup for offline region %q", node.logicalName)
continue
}
require.NoError(t, node.KubectlDelete(ctx, manifest))
}
}
@@ -300,6 +308,11 @@ type vclusterNode struct {
// offline is set to true when the region is intentionally taken offline for
// disaster-recovery tests. ApplyAll and similar helpers skip offline nodes.
offline bool
// k3dNodeName is the name of the k3d agent node this vcluster's workloads
// are pinned to via `sync.fromHost.nodes.selector.labels`. The
// ghost-node-ejection test deletes this node to simulate a regional
// outage; it's empty if pinning wasn't applied.
k3dNodeName string
}

func (n *vclusterNode) APIServer() string {
@@ -385,7 +398,8 @@ func createNetworkedVClusterOperators(ctx context.Context, t framework.TestingT,
redpandaLicense := os.Getenv(LicenseEnvVar)
require.NotEmpty(t, redpandaLicense, LicenseEnvVar+" env var must be set")

vclusters := createVClusters(ctx, t, clusters)
k3dNodeNames := pickK3dAgentNodes(ctx, t, clusters)
vclusters := createVClusters(ctx, t, clusters, k3dNodeNames)
assignOperatorServiceIPs(ctx, t, vclusters, namespace)
peers := bootstrapTLS(ctx, t, vclusters, namespace)
deployOperators(ctx, t, vclusters, namespace, redpandaLicense, peers)
@@ -399,8 +413,8 @@ func createNetworkedVClusterOperators(ctx context.Context, t framework.TestingT,
return stashNodes(ctx, clusterName, vclusters)
}

func createVClusters(ctx context.Context, t framework.TestingT, clusters int32) []*vclusterNode {
t.Logf("creating %d vclusters", clusters)
func createVClusters(ctx context.Context, t framework.TestingT, clusters int32, k3dNodeNames []string) []*vclusterNode {
t.Logf("creating %d vclusters pinned to k3d nodes %v", clusters, k3dNodeNames)

// Generate a unique per-test suffix so that vcluster host namespaces never
// collide when tests run in parallel or back-to-back (a terminating namespace
Expand All @@ -425,13 +439,16 @@ func createVClusters(ctx context.Context, t framework.TestingT, clusters int32)
// with a letter); apirand.String can return strings starting with digits.
actualName := fmt.Sprintf("vc-%s-%d", suffix, i)

vClusterValues := vcluster.DefaultValues + networkingValues(i, clusters, suffix)
vClusterValues := vcluster.DefaultValues +
networkingValues(i, clusters, suffix) +
pinningValues(k3dNodeNames[i])
cluster, err := vcluster.New(ctx, t.RestConfig(), vcluster.WithName(actualName), vcluster.WithValues(helm.RawYAML(vClusterValues)))
require.NoError(t, err)
scheme := t.Scheme()
cluster.SetScheme(scheme)

t.Logf("finished creating vcluster %d (logical: %q, actual: %q)", i+1, logicalName, cluster.Name())
t.Logf("finished creating vcluster %d (logical: %q, actual: %q, pinned to k3d node %q)",
i+1, logicalName, cluster.Name(), k3dNodeNames[i])

cleanupWrapper(t, func(ctx context.Context) {
if err := cluster.Delete(); err != nil {
@@ -453,6 +470,7 @@ func createVClusters(ctx context.Context, t framework.TestingT, clusters int32)
Cluster: cluster,
apiServer: fmt.Sprintf("https://%s", actualName),
logicalName: logicalName,
k3dNodeName: k3dNodeNames[i],
}
}(i)
}
@@ -461,6 +479,66 @@ func createVClusters(ctx context.Context, t framework.TestingT, clusters int32)
return nodes
}

// pickK3dAgentNodes returns `clusters` worker-node hostnames from the host
// k3d cluster so each vcluster can be pinned to a distinct host node. Uses
// the built-in `kubernetes.io/hostname` label — no extra labeling needed.
// Skips the control-plane node and fails if there are not enough workers.
func pickK3dAgentNodes(ctx context.Context, t framework.TestingT, clusters int32) []string {
hostClient, err := client.New(t.RestConfig(), client.Options{})
require.NoError(t, err)

var nodeList corev1.NodeList
require.NoError(t, hostClient.List(ctx, &nodeList))

var workerNodes []corev1.Node
for _, n := range nodeList.Items {
if _, isControlPlane := n.Labels["node-role.kubernetes.io/control-plane"]; isControlPlane {
continue
}
workerNodes = append(workerNodes, n)
}
require.GreaterOrEqual(t, int32(len(workerNodes)), clusters,
"need at least %d worker nodes in host cluster, got %d", clusters, len(workerNodes))

// Sort for deterministic assignment within a single test run.
slices.SortFunc(workerNodes, func(a, b corev1.Node) int {
return strings.Compare(a.Name, b.Name)
})

names := make([]string, clusters)
for i := int32(0); i < clusters; i++ {
names[i] = workerNodes[i].Name
}
return names
}

// pinningValues returns vcluster helm values that pin the control plane pod
// to a specific k3d node via `kubernetes.io/hostname` and filter the nodes
// visible inside the vcluster to just that same host node. Combined with the
// virtual scheduler, this ensures all synced workloads (operator, Redpanda,
// cert-manager) land on that single host node — so the ghost node ejection
// test can take them all down at once via `k3d node delete`.
func pinningValues(k3dNodeName string) string {
return fmt.Sprintf(`
controlPlane:
statefulSet:
scheduling:
nodeSelector:
kubernetes.io/hostname: %s
advanced:
virtualScheduler:
enabled: true
sync:
fromHost:
nodes:
enabled: true
selector:
all: false
labels:
kubernetes.io/hostname: %s
`, k3dNodeName, k3dNodeName)
}

// stretchClusterResourceName is the StretchCluster resource name used in all
// multicluster acceptance tests. Per-pod service names are prefixed with this
// value (e.g. "cluster-first-0").