diff --git a/Taskfile.yml b/Taskfile.yml index 7a1143ffc..cf3209a27 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -359,7 +359,8 @@ tasks: DEFAULT_TEST_UPGRADE_REDPANDA_REPO: redpandadata/redpanda DEFAULT_TEST_UPGRADE_REDPANDA_VERSION: v24.3.11 DEFAULT_TEST_UPGRADE_OPERATOR_VERSION: v2.3.9-24.3.11 - DEFAULT_TEST_VCLUSTER_VERSION: 0.31.2 + DEFAULT_TEST_VCLUSTER_VERSION: 0.33.1 + DEFAULT_TEST_VCLUSTER_KUBE_VERSION: v1.35.0 DEFAULT_TEST_KUBE_VERSION: v1.32.13 DEFAULT_TEST_COREDNS_VERSION: 1.11.1 TEST_CERTMANAGER_VERSION: '{{ .TEST_CERTMANAGER_VERSION | default .DEFAULT_TEST_CERTMANAGER_VERSION }}' @@ -370,6 +371,7 @@ tasks: TEST_UPGRADE_REDPANDA_VERSION: '{{ .TEST_UPGRADE_REDPANDA_VERSION | default .DEFAULT_TEST_UPGRADE_REDPANDA_VERSION }}' TEST_UPGRADE_OPERATOR_VERSION: '{{ .TEST_UPGRADE_OPERATOR_VERSION | default .DEFAULT_TEST_UPGRADE_OPERATOR_VERSION }}' TEST_VCLUSTER_VERSION: '{{ .TEST_VCLUSTER_VERSION | default .DEFAULT_TEST_VCLUSTER_VERSION }}' + TEST_VCLUSTER_KUBE_VERSION: '{{ .TEST_VCLUSTER_KUBE_VERSION | default .DEFAULT_TEST_VCLUSTER_KUBE_VERSION }}' TEST_KUBE_VERSION: '{{ .TEST_KUBE_VERSION | default .DEFAULT_TEST_KUBE_VERSION }}' TEST_COREDNS_VERSION: '{{ .TEST_COREDNS_VERSION | default .DEFAULT_TEST_COREDNS_VERSION }}' IMAGES: @@ -396,9 +398,8 @@ tasks: - redpandadata/redpanda-operator:v25.3.1 - redpandadata/redpanda-operator:{{.TEST_UPGRADE_OPERATOR_VERSION}} - ghcr.io/loft-sh/vcluster-pro:{{.TEST_VCLUSTER_VERSION}} - # vcluster's embedded k8s distro — the tag is hardcoded in - # pkg/vcluster/vcluster.go's DefaultValues. - - ghcr.io/loft-sh/kubernetes:v1.33.4 + - ghcr.io/loft-sh/kubernetes:{{.TEST_VCLUSTER_KUBE_VERSION}} + - ghcr.io/loft-sh/vm-container:latest - registry.k8s.io/kube-controller-manager:{{.TEST_KUBE_VERSION}} - registry.k8s.io/kube-apiserver:{{.TEST_KUBE_VERSION}} - coredns/coredns:{{.TEST_COREDNS_VERSION}} diff --git a/ci/scripts/run-in-nix-docker.sh b/ci/scripts/run-in-nix-docker.sh index 43d48c2e3..f49e15549 100755 --- a/ci/scripts/run-in-nix-docker.sh +++ b/ci/scripts/run-in-nix-docker.sh @@ -32,6 +32,9 @@ fi # Spray and pray work is done, now we want to fail on errors. set -e +VCLUSTER_STATE="${VCLUSTER_STATE:-/tmp/vcluster-ctl-config-vol}" +mkdir -p "$VCLUSTER_STATE" + # BUILDKITE environment variables are used by `buildkite-agent` cli to upload artifact printenv | { grep '^BUILDKITE.*=' || true; } > env-file-for-buildkite @@ -67,10 +70,12 @@ docker run --rm -it \ -e ACCEPTANCE_ARTIFACTS_DIR \ -e INTEGRATION_ARTIFACTS_DIR \ -e REDPANDA_SAMPLE_LICENSE \ + -e VCLUSTER_STATE="$VCLUSTER_STATE" \ --user 0:$(id -g) \ --privileged \ --net=host \ --ipc=host \ --volume ${DOCKER_SOCKET:7}:/var/run/docker.sock \ --volume $(pwd):/work \ + --volume "$VCLUSTER_STATE:$VCLUSTER_STATE" \ $IMAGE_SHA "$@" diff --git a/ci/vcluster.nix b/ci/vcluster.nix index 3537b6dcd..b7d74430f 100644 --- a/ci/vcluster.nix +++ b/ci/vcluster.nix @@ -5,16 +5,16 @@ }: let pname = "vcluster"; - version = "0.23.0"; + version = "0.33.1"; src = { # Update hashes with: nix hash to-sri --type sha256 (nix-prefetch-url $URL) aarch64-darwin = fetchurl { url = "https://github.com/loft-sh/vcluster/releases/download/v${version}/vcluster-darwin-arm64"; - hash = "sha256-3NWvQramcxvZaewc3hMvYGhSOIIsnWD3mqU/WNA723c="; + hash = "sha256-eWzsLSSUXwca3R/DAxqUz5nyL+ihwJdcO+2qg+2Lt8U="; }; x86_64-linux = fetchurl { url = "https://github.com/loft-sh/vcluster/releases/download/v${version}/vcluster-linux-amd64"; - hash = "sha256-4b/UKWj/1hef0A2tKlluiWVhi94FQL1qAmmTc5Q0DMc="; + hash = "sha256-vtDpEiOZbORptgGH2eyRkZh9K36kt4W7qfpqznYW3+g="; }; }.${stdenv.system} or (throw "${pname}-${version}: ${stdenv.system} is unsupported."); in diff --git a/operator/internal/controller/redpanda/multicluster_controller_test.go b/operator/internal/controller/redpanda/multicluster_controller_test.go index 4a027330b..e89b4bf50 100644 --- a/operator/internal/controller/redpanda/multicluster_controller_test.go +++ b/operator/internal/controller/redpanda/multicluster_controller_test.go @@ -15,6 +15,7 @@ import ( "fmt" "os" "slices" + "strings" "testing" "time" @@ -28,6 +29,7 @@ import ( apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" @@ -91,7 +93,7 @@ func (s *MulticlusterControllerSuite) SetupSuite() { Tag: "dev", } - s.mc = testenv.NewMulticluster(t, s.ctx, testenv.MulticlusterOptions{ + s.mc = testenv.NewMulticlusterVind(t, s.ctx, testenv.MulticlusterOptions{ Name: "multicluster", ClusterSize: 3, Scheme: controller.MulticlusterScheme, @@ -115,6 +117,9 @@ func (s *MulticlusterControllerSuite) TestManagesFinalizers() { ObjectMeta: metav1.ObjectMeta{ Name: nn.Name, }, + Spec: redpandav1alpha2.StretchClusterSpec{ + External: &redpandav1alpha2.External{Enabled: ptr.To(false)}, + }, }) for i, env := range s.mc.Envs { @@ -155,6 +160,7 @@ func (s *MulticlusterControllerSuite) TestSpecConsistencyConditionSetOnDrift() { }, Spec: redpandav1alpha2.StretchClusterSpec{ CommonLabels: map[string]string{"env": "prod"}, + External: &redpandav1alpha2.External{Enabled: ptr.To(false)}, }, }) @@ -188,9 +194,13 @@ func (s *MulticlusterControllerSuite) TestSpecConsistencyConditionSetOnDrift() { // Introduce drift: patch the spec on one cluster only. driftedEnv := s.mc.Envs[0] var sc redpandav1alpha2.StretchCluster - require.NoError(t, driftedEnv.Client().Get(ctx, nn, &sc)) - sc.Spec.CommonLabels = map[string]string{"env": "staging"} - require.NoError(t, driftedEnv.Client().Update(ctx, &sc)) + require.NoError(t, retry.RetryOnConflict(retry.DefaultRetry, func() error { + if err := driftedEnv.Client().Get(ctx, nn, &sc); err != nil { + return err + } + sc.Spec.CommonLabels = map[string]string{"env": "staging"} + return driftedEnv.Client().Update(ctx, &sc) + })) // The reconciler should detect drift and set the condition. require.Eventually(t, func() bool { @@ -220,9 +230,13 @@ func (s *MulticlusterControllerSuite) TestSpecConsistencyConditionSetOnDrift() { } // Fix the drift: align the spec back. - require.NoError(t, driftedEnv.Client().Get(ctx, nn, &sc)) - sc.Spec.CommonLabels = map[string]string{"env": "prod"} - require.NoError(t, driftedEnv.Client().Update(ctx, &sc)) + require.NoError(t, retry.RetryOnConflict(retry.DefaultRetry, func() error { + if err := driftedEnv.Client().Get(ctx, nn, &sc); err != nil { + return err + } + sc.Spec.CommonLabels = map[string]string{"env": "prod"} + return driftedEnv.Client().Update(ctx, &sc) + })) // The condition should go back to True. require.Eventually(t, func() bool { @@ -293,7 +307,11 @@ func (s *MulticlusterControllerSuite) TestIssuerRef() { }, }, } - require.NoError(t, cl.Create(ctx, issuer), "creating Issuer in cluster %d", i) + require.NoError(t, retry.OnError(retry.DefaultBackoff, func(err error) bool { + return strings.Contains(err.Error(), "failed to call webhook") + }, func() error { + return cl.Create(ctx, issuer) + }), "creating Issuer in cluster %d", i) } // Step 3: Create a StretchCluster with IssuerRef pointing to the user's Issuer. @@ -304,6 +322,7 @@ func (s *MulticlusterControllerSuite) TestIssuerRef() { Name: scName, }, Spec: redpandav1alpha2.StretchClusterSpec{ + External: &redpandav1alpha2.External{Enabled: ptr.To(false)}, TLS: &redpandav1alpha2.TLS{ Enabled: ptr.To(true), Certs: map[string]*redpandav1alpha2.Certificate{ @@ -453,6 +472,7 @@ func (s *MulticlusterControllerSuite) TestUserProvidedCA() { Name: scName, }, Spec: redpandav1alpha2.StretchClusterSpec{ + External: &redpandav1alpha2.External{Enabled: ptr.To(false)}, TLS: &redpandav1alpha2.TLS{ Enabled: ptr.To(true), Certs: map[string]*redpandav1alpha2.Certificate{ diff --git a/operator/internal/testenv/multicluster.go b/operator/internal/testenv/multicluster.go index 24c90eba4..bedd549fb 100644 --- a/operator/internal/testenv/multicluster.go +++ b/operator/internal/testenv/multicluster.go @@ -17,6 +17,7 @@ import ( "strings" "sync" "testing" + "time" "github.com/go-logr/logr" "github.com/redpanda-data/common-go/kube" @@ -30,10 +31,9 @@ import ( "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" - "github.com/redpanda-data/redpanda-operator/pkg/helm" - "github.com/redpanda-data/redpanda-operator/pkg/k3d" "github.com/redpanda-data/redpanda-operator/pkg/multicluster" "github.com/redpanda-data/redpanda-operator/pkg/testutil" + "github.com/redpanda-data/redpanda-operator/pkg/vcluster" ) // MulticlusterEnv wraps multiple [Env] instances configured as a Raft-based @@ -82,148 +82,6 @@ type MulticlusterOptions struct { SetupFn func(multicluster.Manager) error } -// NewMulticluster creates a multicluster test environment with the given options. -// It creates ClusterSize k3d-backed environments, configures Raft-based leader -// election, sets up RBAC, and registers controllers via SetupFn. -func NewMulticluster(t *testing.T, ctx context.Context, opts MulticlusterOptions) *MulticlusterEnv { - t.Helper() - - if opts.ClusterSize == 0 { - opts.ClusterSize = 3 - } - if opts.Name == "" { - opts.Name = "mc" - } - if opts.CIDRBlock == 0 { - opts.CIDRBlock = 100 - } - if opts.Namespace == "" { - opts.Namespace = opts.Name - } - - ports := testutil.FreePorts(t, opts.ClusterSize) - - // Pre-create the shared Docker network so parallel k3d cluster creation - // doesn't race on network creation. - _ = exec.Command("docker", "network", "create", opts.Name).Run() //nolint:errcheck // ignore "already exists" - - // Pre-create k3d clusters in parallel to speed up bootstrapping. - // Each cluster gets a unique pod/service CIDR so that pods can communicate - // across clusters when routes are set up (flat networking). - // k3d.GetOrCreate is safe to call concurrently (uses file-based locking). - var wg sync.WaitGroup - errs := make([]error, opts.ClusterSize) - for i := range opts.ClusterSize { - wg.Add(1) - go func(idx int) { - defer wg.Done() - k3dOpts := []k3d.ClusterOpt{ - k3d.WithAgents(1), - k3d.WithNetwork(opts.Name), - } - // Non-overlapping CIDRs so pods can communicate across clusters. - k3dOpts = append(k3dOpts, k3d.WithCIDRs( - fmt.Sprintf("10.%d.0.0/16", opts.CIDRBlock+idx), - fmt.Sprintf("10.%d.0.0/16", opts.CIDRBlock+opts.ClusterSize+idx), - )) - _, errs[idx] = k3d.GetOrCreate( - fmt.Sprintf("%s-%d", opts.Name, idx), - k3dOpts..., - ) - }(i) - } - wg.Wait() - for i, err := range errs { - require.NoError(t, err, "creating k3d cluster %d", i) - } - - setupFlatNetworking(t, opts.Name, opts.ClusterSize) - - envs := make([]*Env, opts.ClusterSize) - for i := range opts.ClusterSize { - envs[i] = New(t, Options{ - Name: fmt.Sprintf("%s-%d", opts.Name, i), - Agents: 1, - Scheme: opts.Scheme, - CRDs: opts.CRDs, - Network: opts.Name, - Namespace: opts.Namespace, - Logger: opts.Logger.WithName(fmt.Sprintf("%s-%d", opts.Name, i)), - ImportImages: opts.ImportImages, - WatchAllNamespaces: opts.WatchAllNamespaces, - SkipNamespaceClient: opts.WatchAllNamespaces, - SkipVCluster: true, - }) - } - - if opts.InstallCertManager { - var cmWg sync.WaitGroup - cmErrs := make([]error, opts.ClusterSize) - for i, env := range envs { - cmWg.Add(1) - go func(idx int, env *Env) { //nolint:gosec // ctx is suite-scoped, not request-scoped - defer cmWg.Done() - hc, err := helm.New(helm.Options{KubeConfig: env.RESTConfig()}) - if err != nil { - cmErrs[idx] = fmt.Errorf("creating helm client for cluster %d: %w", idx, err) - return - } - if err := hc.RepoAdd(ctx, "jetstack", "https://charts.jetstack.io"); err != nil { - cmErrs[idx] = fmt.Errorf("adding jetstack repo on cluster %d: %w", idx, err) - return - } - if _, err := hc.Upgrade(ctx, "cert-manager", "jetstack/cert-manager", helm.UpgradeOptions{ - Install: true, - CreateNamespace: true, - Namespace: "cert-manager", - Version: "v1.17.2", - Values: map[string]any{ - "installCRDs": true, - }, - }); err != nil { - cmErrs[idx] = fmt.Errorf("installing cert-manager on cluster %d: %w", idx, err) - } - }(i, env) - } - cmWg.Wait() - for i, err := range cmErrs { - require.NoError(t, err, "installing cert-manager on cluster %d", i) - } - } - - peers := make([]multicluster.RaftCluster, opts.ClusterSize) - for i, env := range envs { - peers[i] = multicluster.RaftCluster{ - Name: env.Name, - Address: fmt.Sprintf("127.0.0.1:%d", ports[i]), - Kubeconfig: env.RESTConfig(), - } - } - - managers := make([]multicluster.Manager, opts.ClusterSize) - for i, env := range envs { - idx := i - sa := setupMulticlusterRBAC(t, ctx, env) - env.SetupMulticlusterManager( - sa, - fmt.Sprintf("127.0.0.1:%d", ports[i]), - peers, - func(mgr multicluster.Manager) error { - managers[idx] = mgr - if opts.SetupFn != nil { - return opts.SetupFn(mgr) - } - return nil - }, - ) - } - - return &MulticlusterEnv{ - Envs: envs, - Managers: managers, - } -} - // PrimaryManager returns the first non-nil Manager. Panics if none exist. func (m *MulticlusterEnv) PrimaryManager() multicluster.Manager { for _, mgr := range m.Managers { @@ -298,9 +156,9 @@ func (m *MulticlusterEnv) ApplyAll(t *testing.T, ctx context.Context, objs ...cl obj.SetManagedFields(nil) obj.SetResourceVersion("") obj.GetObjectKind().SetGroupVersionKind(gvk) - copy := obj.DeepCopyObject().(client.Object) + objCopy := obj.DeepCopyObject().(client.Object) t.Logf("[ApplyAll] applying %s/%s (ns=%s) to %s", gvk.Kind, obj.GetName(), obj.GetNamespace(), env.Name) - require.NoError(t, env.Client().Patch(ctx, copy, client.Apply, client.ForceOwnership, client.FieldOwner("tests"))) //nolint:staticcheck // TODO + require.NoError(t, env.Client().Patch(ctx, objCopy, client.Apply, client.ForceOwnership, client.FieldOwner("tests"))) //nolint:staticcheck // TODO t.Logf("[ApplyAll] applied %s/%s (ns=%s) to %s successfully", gvk.Kind, obj.GetName(), obj.GetNamespace(), env.Name) } } @@ -399,41 +257,6 @@ func (m *MulticlusterEnv) findPodByIP(ctx context.Context, namespace, ip string) return "", -1 } -// setupFlatNetworking adds routes between all k3d cluster nodes so that pods -// on different clusters can reach each other via their non-overlapping CIDRs. -// Each cluster i uses pod CIDR 10.{100+i}.0.0/16. We add a route on every node -// in cluster i pointing 10.{100+j}.0.0/16 to the server node of cluster j. -func setupFlatNetworking(t *testing.T, name string, clusterSize int) { - t.Helper() - - // Resolve the Docker-internal IP of each cluster's server node. - serverIPs := make([]string, clusterSize) - for i := range clusterSize { - container := fmt.Sprintf("k3d-%s-%d-server-0", name, i) - out, err := exec.Command("docker", "inspect", "-f", "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}", container).Output() - require.NoError(t, err, "inspecting container %s", container) - serverIPs[i] = string(out[:len(out)-1]) // trim trailing newline - } - - // For each cluster, add routes to every other cluster's pod CIDR via that cluster's server IP. - for i := range clusterSize { - // All node containers for cluster i: server + agents. - nodes := []string{fmt.Sprintf("k3d-%s-%d-server-0", name, i)} - nodes = append(nodes, fmt.Sprintf("k3d-%s-%d-agent-0", name, i)) - - for j := range clusterSize { - if i == j { - continue - } - cidr := fmt.Sprintf("10.%d.0.0/16", 100+j) - for _, node := range nodes { - // Ignore errors — route may already exist from a previous run. - _ = exec.Command("docker", "exec", node, "ip", "route", "add", cidr, "via", serverIPs[j]).Run() - } - } - } -} - // FixupCrossClusterServices patches per-pod Services on each cluster that // reference pods running on OTHER clusters. It removes the selector (so // kube doesn't fight with us) and creates an EndpointSlice pointing @@ -530,6 +353,168 @@ func (m *MulticlusterEnv) FixupCrossClusterServices(t *testing.T, ctx context.Co } } +// NewMulticlusterVind creates a multicluster test environment using vind +// (vCluster in Docker) clusters instead of k3d. It provides the same +// interface as NewMulticluster but uses Docker-based virtual clusters +// that share a Docker network for flat networking. Images available on the +// host Docker daemon are immediately accessible inside vind clusters. +func NewMulticlusterVind(t *testing.T, ctx context.Context, opts MulticlusterOptions) *MulticlusterEnv { + t.Helper() + + if opts.ClusterSize == 0 { + opts.ClusterSize = 3 + } + if opts.Name == "" { + opts.Name = "mc" + } + if opts.CIDRBlock == 0 { + opts.CIDRBlock = 100 + } + if opts.Namespace == "" { + opts.Namespace = opts.Name + } + + ports := testutil.FreePorts(t, opts.ClusterSize) + + // Pre-create the shared Docker network so parallel vind creation + // doesn't race on network creation. + _ = exec.Command("docker", "network", "create", opts.Name).Run() //nolint:errcheck // ignore "already exists" + + vindOpts := func(idx int) []vcluster.VindOpt { + o := []vcluster.VindOpt{ + vcluster.VindWithNetwork(opts.Name), + vcluster.VindWithPodCIDR(fmt.Sprintf("10.%d.0.0/16", opts.CIDRBlock+idx)), + vcluster.VindWithServiceCIDR(fmt.Sprintf("10.%d.0.0/16", opts.CIDRBlock+opts.ClusterSize+idx)), + } + if opts.InstallCertManager { + o = append(o, vcluster.VindWithCertManager()) + } + return o + } + + // Create vind clusters in parallel. CreateVind uses a file-based lock + // to serialize the binary extraction across parallel test processes. + clusters := make([]*vcluster.VindCluster, opts.ClusterSize) + var wg sync.WaitGroup + errs := make([]error, opts.ClusterSize) + for i := range opts.ClusterSize { + wg.Add(1) + go func(idx int) { + defer wg.Done() + clusters[idx], errs[idx] = vcluster.CreateVind( + fmt.Sprintf("%s-%d", opts.Name, idx), + vindOpts(idx)..., + ) + }(i) + } + wg.Wait() + for i, err := range errs { + require.NoError(t, err, "creating vind cluster %d", i) + } + + // Register vind cleanup BEFORE creating Envs so it runs AFTER + // Env cleanups (t.Cleanup is LIFO). + t.Cleanup(func() { + if testutil.Retain() { + return + } + for _, cluster := range clusters { + if err := cluster.Delete(); err != nil { + t.Logf("WARNING: failed to delete vind cluster %s: %v", cluster.Name(), err) + } + } + }) + + setupVindFlatNetworking(t, clusters, opts.CIDRBlock) + + envs := make([]*Env, opts.ClusterSize) + for i, cluster := range clusters { + envs[i] = NewFromConfig(t, cluster.RESTConfig(), Options{ + Name: cluster.Name(), + Scheme: opts.Scheme, + CRDs: opts.CRDs, + Namespace: opts.Namespace, + Logger: opts.Logger.WithName(cluster.Name()), + WatchAllNamespaces: opts.WatchAllNamespaces, + SkipNamespaceClient: opts.WatchAllNamespaces, + }) + } + + if opts.InstallCertManager { + for i, env := range envs { + require.NoError(t, testutil.WaitForCertManagerWebhook(ctx, env.Client(), 2*time.Minute), + "waiting for cert-manager webhook on cluster %d", i) + } + } + + peers := make([]multicluster.RaftCluster, opts.ClusterSize) + for i, env := range envs { + peers[i] = multicluster.RaftCluster{ + Name: env.Name, + Address: fmt.Sprintf("127.0.0.1:%d", ports[i]), + Kubeconfig: env.RESTConfig(), + } + } + + managers := make([]multicluster.Manager, opts.ClusterSize) + for i, env := range envs { + idx := i + sa := setupMulticlusterRBAC(t, ctx, env) + env.SetupMulticlusterManager( + sa, + fmt.Sprintf("127.0.0.1:%d", ports[i]), + peers, + func(mgr multicluster.Manager) error { + managers[idx] = mgr + if opts.SetupFn != nil { + return opts.SetupFn(mgr) + } + return nil + }, + ) + } + + return &MulticlusterEnv{ + Envs: envs, + Managers: managers, + } +} + +// setupVindFlatNetworking adds routes between all vind cluster containers so +// that pods on different clusters can reach each other via non-overlapping +// CIDRs. Each cluster i uses pod CIDR 10.{cidrBlock+i}.0.0/16. +func setupVindFlatNetworking(t *testing.T, clusters []*vcluster.VindCluster, cidrBlock int) { + t.Helper() + + clusterSize := len(clusters) + + // Get the Docker-internal IP of each cluster's control plane container. + cpIPs := make([]string, clusterSize) + for i, cluster := range clusters { + ip, err := cluster.CPContainerIP() + require.NoError(t, err, "getting CP IP for cluster %d", i) + cpIPs[i] = ip + } + + // For each cluster, add routes to every other cluster's pod CIDR + // via that cluster's CP container IP. + for i, cluster := range clusters { + containers, err := cluster.Containers() + require.NoError(t, err, "listing containers for cluster %d", i) + + for j := range clusterSize { + if i == j { + continue + } + cidr := fmt.Sprintf("10.%d.0.0/16", cidrBlock+j) + for _, container := range containers { + // Ignore errors — route may already exist from a previous run. + _ = exec.Command("docker", "exec", container, "ip", "route", "add", cidr, "via", cpIPs[j]).Run() //nolint:gosec,errcheck + } + } + } +} + // setupMulticlusterRBAC creates a ServiceAccount with cluster-admin privileges // for running the operator in tests. func setupMulticlusterRBAC(t *testing.T, ctx context.Context, env *Env) string { diff --git a/operator/internal/testenv/testenv.go b/operator/internal/testenv/testenv.go index 11e168c3f..59f3cca30 100644 --- a/operator/internal/testenv/testenv.go +++ b/operator/internal/testenv/testenv.go @@ -221,6 +221,82 @@ func New(t *testing.T, options Options) *Env { return env } +// NewFromConfig creates an Env from an existing REST config without creating +// any k3d or vCluster infrastructure. Used for vind-based environments where +// cluster creation is handled externally. +func NewFromConfig(t *testing.T, config *rest.Config, options Options) *Env { + t.Helper() + + if options.Scheme == nil { + options.Scheme = goclientscheme.Scheme + } + if options.Logger.IsZero() { + options.Logger = logr.Discard() + } + + if len(options.CRDs) > 0 { + crds, err := envtest.InstallCRDs(config, envtest.CRDInstallOptions{ + CRDs: dupCRDs(options.CRDs), + ErrorIfPathMissing: false, + }) + if err != nil && !k8sapierrors.IsAlreadyExists(err) && !strings.Contains(err.Error(), "already exists") { + require.NoError(t, err) + } + if err == nil { + require.Equal(t, len(options.CRDs), len(crds)) + } + } + + c, err := client.New(config, client.Options{Scheme: options.Scheme}) + require.NoError(t, err) + + ctx, cancel := context.WithCancel(context.Background()) //nolint:gosec + + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{GenerateName: "testenv-"}} + if options.Namespace != "" { + ns = &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: options.Namespace}} + } + createErr := c.Create(ctx, ns) + if !k8sapierrors.IsAlreadyExists(createErr) { + require.NoError(t, createErr) + } + + var otelClient client.Client + if options.SkipNamespaceClient { + otelClient = otelkube.NewClient(c) + } else { + otelClient = otelkube.NewClient(client.NewNamespacedClient(c, ns.Name)) + } + + g, ctx := errgroup.WithContext(ctx) + + env := &Env{ + t: t, + scheme: options.Scheme, + logger: options.Logger, + namespace: ns, + group: g, + ctx: ctx, + cancel: cancel, + config: config, + client: otelClient, + watchAllNamespaces: options.WatchAllNamespaces, + Name: options.Name, + } + + t.Logf("Executing in namespace %q on %s", ns.Name, config.Host) + + t.Cleanup(func() { + if t.Failed() { + env.dumpDiagnostics() + } + env.cancel() + assert.NoError(env.t, env.group.Wait()) + }) + + return env +} + func (e *Env) Client() client.Client { return e.client } diff --git a/operator/pkg/client/stretch_cluster_test.go b/operator/pkg/client/stretch_cluster_test.go index 4cab21a2c..22c7ddd29 100644 --- a/operator/pkg/client/stretch_cluster_test.go +++ b/operator/pkg/client/stretch_cluster_test.go @@ -54,6 +54,7 @@ func TestMulticlusterStretchClusterFactory(t *testing.T) { type StretchClusterFactorySuite struct { suite.Suite + ctx context.Context mc *testenv.MulticlusterEnv factory *internalclient.Factory redpandaImage lifecycle.Image @@ -65,6 +66,7 @@ var _ suite.SetupAllSuite = (*StretchClusterFactorySuite)(nil) func (s *StretchClusterFactorySuite) SetupSuite() { t := s.T() ctx := trace.Test(t) + s.ctx = ctx cloudSecrets := lifecycle.CloudSecretsFlags{CloudSecretsEnabled: false} s.redpandaImage = lifecycle.Image{ @@ -82,7 +84,7 @@ func (s *StretchClusterFactorySuite) SetupSuite() { // SetupFn) and the test's factory. Since NewMulticluster hasn't returned // yet when SetupFn runs, we capture the pointer and set it after. var mc *testenv.MulticlusterEnv - s.mc = testenv.NewMulticluster(t, ctx, testenv.MulticlusterOptions{ + s.mc = testenv.NewMulticlusterVind(t, s.ctx, testenv.MulticlusterOptions{ Name: "sc-factory", ClusterSize: 3, Scheme: controller.MulticlusterScheme, diff --git a/pkg/testutil/testutil.go b/pkg/testutil/testutil.go index daa2ea35a..b03e2a56a 100644 --- a/pkg/testutil/testutil.go +++ b/pkg/testutil/testutil.go @@ -28,8 +28,8 @@ var ( ) const ( - CertManagerVersion = "v1.17.1" - VClusterVersion = "v0.31.2" + CertManagerVersion = "v1.17.2" + VClusterVersion = "v0.33.1" ) func GetVClusterImageTag() string { diff --git a/pkg/vcluster/vind.go b/pkg/vcluster/vind.go new file mode 100644 index 000000000..b217ce14f --- /dev/null +++ b/pkg/vcluster/vind.go @@ -0,0 +1,284 @@ +// Copyright 2026 Redpanda Data, Inc. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.md +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0 + +package vcluster + +import ( + "bytes" + "os" + "os/exec" + "path/filepath" + "strings" + "text/template" + + "github.com/cockroachdb/errors" + "golang.org/x/sys/unix" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + + "github.com/redpanda-data/redpanda-operator/pkg/testutil" +) + +// VindCluster represents a vCluster in Docker (vind) cluster. +type VindCluster struct { + name string + config *rest.Config + kubeconfigData []byte +} + +// VindOpt configures a vind cluster. +type VindOpt func(*vindConfig) + +type vindConfig struct { + network string + podCIDR string + serviceCIDR string + installCertMgr bool +} + +// VindWithNetwork sets the Docker network for the vind cluster. +func VindWithNetwork(network string) VindOpt { + return func(c *vindConfig) { c.network = network } +} + +// VindWithPodCIDR sets the pod CIDR for the vind cluster. +func VindWithPodCIDR(cidr string) VindOpt { + return func(c *vindConfig) { c.podCIDR = cidr } +} + +// VindWithServiceCIDR sets the service CIDR for the vind cluster. +func VindWithServiceCIDR(cidr string) VindOpt { + return func(c *vindConfig) { c.serviceCIDR = cidr } +} + +// VindWithCertManager deploys cert-manager inside the vind cluster via the +// experimental.deploy.vcluster.helm mechanism. +func VindWithCertManager() VindOpt { + return func(c *vindConfig) { c.installCertMgr = true } +} + +// CreateVind creates a new vind (vCluster in Docker) cluster. +// If a cluster with the same name already exists, it is upgraded. +// A file-based lock serializes creation across parallel test processes +// because the vind binary extraction is not concurrent-safe. +// +//nolint:gosec // this code is for tests +func CreateVind(name string, opts ...VindOpt) (*VindCluster, error) { + cfg := &vindConfig{} + for _, opt := range opts { + opt(cfg) + } + + // Use a file lock to coordinate vind cluster creation across parallel + // test processes (go test -p=N). The vind binary extraction to + // ~/.vcluster/docker/ races when multiple processes create concurrently. + // A single global lock is used because the extraction path is shared. + unlock, err := vindLockFile("create") + if err != nil { + return nil, errors.Wrap(err, "acquiring vind lock") + } + defer unlock() + + args := []string{ + "create", name, + "--driver", "docker", + "--connect=false", + "--upgrade", + } + // this is required when running inside the docker container (Docker in Docker in the CI) + if vclusterVolDir := os.Getenv("VCLUSTER_STATE"); vclusterVolDir != "" { + args = append(args, "--config", vclusterVolDir+"/config.json") + } + + valuesYAML := buildVindValues(cfg) + if valuesYAML != "" { + f, err := os.CreateTemp("", "vind-values-*.yaml") + if err != nil { + return nil, errors.Wrap(err, "creating temp values file") + } + defer os.Remove(f.Name()) + if _, err := f.WriteString(valuesYAML); err != nil { + return nil, errors.Wrap(err, "writing values file") + } + f.Close() + args = append(args, "-f", f.Name()) + } + cmd := exec.Command("vcluster", args...) + + out, err := cmd.CombinedOutput() + if err != nil { + return nil, errors.Wrapf(err, "command: %s failed with: %s", cmd.String(), out) + } + + kubeconfigData, config, err := vindKubeconfig(name) + if err != nil { + return nil, err + } + + return &VindCluster{name: name, config: config, kubeconfigData: kubeconfigData}, nil +} + +// vindLockFile acquires an exclusive file lock to coordinate vind operations +// across parallel test processes. +// +//nolint:gosec // best-effort cleanup in test infrastructure +func vindLockFile(name string) (func(), error) { + lockPath := filepath.Join(os.TempDir(), "vind-"+name+".lock") + f, err := os.OpenFile(lockPath, os.O_CREATE|os.O_RDWR, 0o600) + if err != nil { + return nil, err + } + if err := unix.Flock(int(f.Fd()), unix.LOCK_EX); err != nil { + f.Close() //nolint:errcheck + return nil, err + } + return func() { + _ = unix.Flock(int(f.Fd()), unix.LOCK_UN) + _ = f.Close() + }, nil +} + +//nolint:gosec // this code is for tests +func vindKubeconfig(name string) ([]byte, *rest.Config, error) { + out, err := exec.Command( + "vcluster", "connect", + "--silent", "--print", "--driver=docker", + name, + ).CombinedOutput() + if err != nil { + return nil, nil, errors.Wrapf(err, "vcluster connect --print %s: %s", name, out) + } + + config, err := clientcmd.RESTConfigFromKubeConfig(out) + if err != nil { + return nil, nil, errors.Wrap(err, "parsing vind kubeconfig") + } + + return out, config, nil +} + +// Name returns the cluster name. +func (v *VindCluster) Name() string { return v.name } + +// RESTConfig returns a copy of the cluster's REST config. +func (v *VindCluster) RESTConfig() *rest.Config { return rest.CopyConfig(v.config) } + +// WriteKubeconfig writes the raw kubeconfig to the given path. This is useful +// for tools (like the helm CLI) that need a kubeconfig file rather than a +// rest.Config, since the kube.RestToConfig round-trip loses fields like +// insecure-skip-tls-verify. +func (v *VindCluster) WriteKubeconfig(path string) error { + return os.WriteFile(path, v.kubeconfigData, 0o600) +} + +// Delete removes the vind cluster. +// +//nolint:gosec // this code is for tests +func (v *VindCluster) Delete() error { + out, err := exec.Command("vcluster", "delete", v.name, "--driver", "docker").CombinedOutput() + if err != nil { + return errors.Wrapf(err, "vcluster delete %s: %s", v.name, out) + } + return nil +} + +// Containers returns all Docker container names for this vind cluster +// (control plane + worker nodes). +// +//nolint:gosec // this code is for tests +func (v *VindCluster) Containers() ([]string, error) { + out, err := exec.Command("docker", "ps", "--format", "{{.Names}}").CombinedOutput() + if err != nil { + return nil, errors.Wrapf(err, "listing containers: %s", out) + } + cpName := "vcluster.cp." + v.name + nodePrefix := "vcluster.node." + v.name + "." + var result []string + for _, name := range strings.Split(strings.TrimSpace(string(out)), "\n") { + if name == cpName || strings.HasPrefix(name, nodePrefix) { + result = append(result, name) + } + } + return result, nil +} + +// CPContainerIP returns the Docker-internal IP of the control plane container. +// +//nolint:gosec // this code is for tests +func (v *VindCluster) CPContainerIP() (string, error) { + container := "vcluster.cp." + v.name + out, err := exec.Command( + "docker", "inspect", "-f", + "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}", + container, + ).Output() + if err != nil { + return "", errors.Wrapf(err, "inspecting container %s", container) + } + ip := strings.TrimSpace(string(out)) + if ip == "" { + return "", errors.Newf("no IP found for container %s", container) + } + return ip, nil +} + +var vindValuesTemplate = template.Must(template.New("vind-values").Parse(` +{{- if or .Network .InstallCertMgr }} +experimental: + {{- if .Network }} + docker: + network: {{ .Network }} + {{- end }} + {{- if .InstallCertMgr }} + deploy: + vcluster: + helm: + - chart: + name: cert-manager + repo: https://charts.jetstack.io + version: {{ .CertManagerVersion }} + values: | + installCRDs: true + global: + leaderElection: + renewDeadline: 10s + retryPeriod: 5s + release: + name: cert-manager + namespace: cert-manager + {{- end }} +{{- end }} +{{- if or .PodCIDR .ServiceCIDR }} +networking: + {{- if .PodCIDR }} + podCIDR: {{ .PodCIDR }} + {{- end }} + {{- if .ServiceCIDR }} + serviceCIDR: {{ .ServiceCIDR }} + {{- end }} +{{- end }} +`)) + +func buildVindValues(cfg *vindConfig) string { + var buf bytes.Buffer + _ = vindValuesTemplate.Execute(&buf, struct { + Network string + PodCIDR string + ServiceCIDR string + InstallCertMgr bool + CertManagerVersion string + }{ + Network: cfg.network, + PodCIDR: cfg.podCIDR, + ServiceCIDR: cfg.serviceCIDR, + InstallCertMgr: cfg.installCertMgr, + CertManagerVersion: testutil.CertManagerVersion, + }) + return buf.String() +}