43 changes: 43 additions & 0 deletions api/v1/mdb/mongodb_types.go
@@ -25,6 +25,7 @@ import (
"github.com/mongodb/mongodb-kubernetes/pkg/dns"
"github.com/mongodb/mongodb-kubernetes/pkg/fcv"
"github.com/mongodb/mongodb-kubernetes/pkg/kube"
"github.com/mongodb/mongodb-kubernetes/pkg/multicluster"
"github.com/mongodb/mongodb-kubernetes/pkg/util"
"github.com/mongodb/mongodb-kubernetes/pkg/util/env"
"github.com/mongodb/mongodb-kubernetes/pkg/util/stringutil"
@@ -1665,6 +1666,48 @@ func (m *MongoDbSpec) IsMultiCluster() bool {
return m.GetTopology() == ClusterTopologyMultiCluster
}

func (m *MongoDbSpec) GetShardClusterSpecList() ClusterSpecList {
if m.IsMultiCluster() {
return m.ShardSpec.ClusterSpecList
} else {
return ClusterSpecList{
{
ClusterName: multicluster.LegacyCentralClusterName,
Members: m.MongodsPerShardCount,
MemberConfig: m.MemberConfig,
},
}
}
}

func (m *MongoDbSpec) GetMongosClusterSpecList() ClusterSpecList {
if m.IsMultiCluster() {
return m.MongosSpec.ClusterSpecList
} else {
return ClusterSpecList{
{
ClusterName: multicluster.LegacyCentralClusterName,
Members: m.MongosCount,
ExternalAccessConfiguration: m.ExternalAccessConfiguration,
},
}
}
}

func (m *MongoDbSpec) GetConfigSrvClusterSpecList() ClusterSpecList {
if m.IsMultiCluster() {
return m.ConfigSrvSpec.ClusterSpecList
} else {
return ClusterSpecList{
{
ClusterName: multicluster.LegacyCentralClusterName,
Members: m.ConfigServerCount,
MemberConfig: m.MemberConfig,
},
}
}
}
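These accessors normalize the legacy single-cluster topology into a one-item `ClusterSpecList` keyed by `multicluster.LegacyCentralClusterName`, so callers can iterate member clusters uniformly regardless of topology. A minimal sketch of a hypothetical caller, assuming the same package context; the function and the StatefulSet naming are illustrative, not part of this PR:

```go
// Hypothetical consumer: one loop covers both topologies, because the
// single-cluster case is exposed as a one-item list by the accessors above.
func shardStatefulSetNames(spec *MongoDbSpec) []string {
	var names []string
	for _, item := range spec.GetShardClusterSpecList() {
		// e.g. one StatefulSet per member cluster; the name format here is made up.
		names = append(names, fmt.Sprintf("%s-%d-members", item.ClusterName, item.Members))
	}
	return names
}
```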

type MongoDBConnectionStringBuilder struct {
MongoDB
hostnames []string
19 changes: 16 additions & 3 deletions api/v1/mdb/shardedcluster.go
@@ -91,12 +91,25 @@ func (s *ShardedClusterComponentSpec) GetAgentConfig() *AgentConfig {
return &s.Agent
}

func (s *ShardedClusterComponentSpec) ClusterSpecItemExists(clusterName string) bool {
return s.getClusterSpecItemOrNil(clusterName) != nil
}

func (s *ShardedClusterComponentSpec) GetClusterSpecItem(clusterName string) ClusterSpecItem {
if clusterSpecItem := s.getClusterSpecItemOrNil(clusterName); clusterSpecItem != nil {
return *clusterSpecItem
}

// it should never occur - we preprocess all clusterSpecLists
panic(fmt.Errorf("clusterName %s not found in clusterSpecList", clusterName))
}

func (s *ShardedClusterComponentSpec) getClusterSpecItemOrNil(clusterName string) *ClusterSpecItem {
for i := range s.ClusterSpecList {
if s.ClusterSpecList[i].ClusterName == clusterName {
return s.ClusterSpecList[i]
return &s.ClusterSpecList[i]
}
}
// it should never occur - we preprocess all clusterSpecLists
panic(fmt.Errorf("clusterName %s not found in clusterSpecList", clusterName))

return nil
}
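`GetClusterSpecItem` still panics on an unknown cluster name (the spec lists are preprocessed), while the new `ClusterSpecItemExists` lets callers probe first. A hypothetical guard sketch; the variable names are illustrative:

```go
// Hypothetical caller-side guard for a cluster that may have been removed from the spec.
if componentSpec.ClusterSpecItemExists(memberClusterName) {
	item := componentSpec.GetClusterSpecItem(memberClusterName) // safe: the item exists
	_ = item.Members
} else {
	// e.g. the member cluster was removed from the spec list; handle it without panicking
}
```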
3 changes: 3 additions & 0 deletions api/v1/om/appdb_types.go
@@ -494,6 +494,9 @@ func (m *AppDBSpec) GetMemberClusterSpecByName(memberClusterName string) mdbv1.C

// In case the member cluster is not found in the cluster spec list, we return an empty ClusterSpecItem
// with 0 members to handle the case of removing a cluster from the spec list without a panic.
// TODO: this is not ideal, because we don't consider other fields that were removed, e.g. MemberConfig.
// When scaling down we should consider the full spec that was used to create the cluster.
// Create a ticket
return mdbv1.ClusterSpecItem{
ClusterName: memberClusterName,
Members: 0,
@@ -0,0 +1,13 @@
---
kind: fix
date: 2025-10-06
---

* **MultiClusterSharded**: Block removing a non-zero member cluster from a MongoDB resource. This prevents scaling down a member cluster whose current configuration is no longer available, which can lead to unexpected issues. Previously the operator crashed in that scenario; after the fix it marks the reconciliation as `Failed` with an appropriate message. An example of an unsafe scenario that is now blocked (illustrated by the sketch after this list):
* User has 2 member clusters: `main` is used for application traffic, `read-analytics` is used for read-only analytics
* `main` cluster has 7 voting members
* `read-analytics` cluster has 3 non-voting members
* User decides to remove the `read-analytics` cluster by deleting its `clusterSpecItem` entry completely
* The operator scales down members of the `read-analytics` cluster one by one
* Because the configuration no longer specifies voting options and `priority` defaults to 1, the operator removes one member, but the remaining two members are reconfigured as voting members
* The replica set now contains 9 voting members, which is not [supported by MongoDB](https://www.mongodb.com/docs/manual/reference/limits/#mongodb-limit-Number-of-Voting-Members-of-a-Replica-Set)
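For illustration, a minimal standalone Go sketch of the two-cluster spec described above; the types are simplified stand-ins, not the operator's `ClusterSpecItem`/`MemberConfig` definitions:

```go
package main

import "fmt"

// Simplified stand-ins used only to illustrate the scenario above.
type memberOptions struct{ Votes int }

type clusterSpecItem struct {
	ClusterName  string
	Members      int
	MemberConfig []memberOptions
}

func main() {
	spec := []clusterSpecItem{
		{ClusterName: "main", Members: 7}, // voting members serving application traffic
		{
			ClusterName:  "read-analytics",
			Members:      3,
			MemberConfig: []memberOptions{{Votes: 0}, {Votes: 0}, {Votes: 0}}, // non-voting
		},
	}
	// Deleting spec[1] wholesale discards the non-voting configuration;
	// the operator now rejects that removal instead of scaling down blindly.
	fmt.Println(spec)
}
```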
@lsierant lsierant Oct 6, 2025


In practice there is code in the operator that limits the number of voting members to 7 and marks the rest as non-voting. But it's still a valid case of losing configuration and ending up with a non-deterministic configuration of voting members.

https://github.com/mongodb/mongodb-kubernetes/blob/master/controllers/om/deployment.go#L1206-L1218
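A rough sketch of the kind of voter cap the linked code applies; the struct and function below are simplified stand-ins, not the actual `deployment.go` logic:

```go
// Illustrative only: cap voters at 7 and demote the rest to non-voting.
type rsMember struct {
	Votes    int
	Priority float64
}

func limitVotingMembers(members []rsMember) {
	const maxVoters = 7
	for i := range members {
		if i < maxVoters {
			members[i].Votes = 1
		} else {
			members[i].Votes = 0
			members[i].Priority = 0 // non-voting members must have priority 0
		}
	}
}
```

Which members end up demoted depends only on their position in the slice, which is the non-determinism the comment calls out.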

19 changes: 10 additions & 9 deletions controllers/operator/authentication_test.go
@@ -65,7 +65,7 @@ func TestX509ClusterAuthentication_CanBeEnabled_IfX509AuthenticationIsEnabled_Sh
ctx := context.Background()
scWithTls := test.DefaultClusterBuilder().EnableTLS().EnableX509().SetName("sc-with-tls").SetTLSCA("custom-ca").Build()

reconciler, _, client, _, err := defaultClusterReconciler(ctx, nil, "", "", scWithTls, nil)
reconciler, _, client, _, err := defaultShardedClusterReconciler(ctx, nil, "", "", scWithTls, nil)
require.NoError(t, err)
addKubernetesTlsResources(ctx, client, scWithTls)

@@ -76,7 +76,7 @@ func TestX509CanBeEnabled_WhenThereAreOnlyTlsDeployments_ShardedCluster(t *testi
ctx := context.Background()
scWithTls := test.DefaultClusterBuilder().EnableTLS().EnableX509().SetName("sc-with-tls").SetTLSCA("custom-ca").Build()

reconciler, _, client, _, err := defaultClusterReconciler(ctx, nil, "", "", scWithTls, nil)
reconciler, _, client, _, err := defaultShardedClusterReconciler(ctx, nil, "", "", scWithTls, nil)
require.NoError(t, err)
addKubernetesTlsResources(ctx, client, scWithTls)

@@ -333,7 +333,7 @@ func TestX509InternalClusterAuthentication_CanBeEnabledWithScram_ShardedCluster(
EnableX509InternalClusterAuth().
Build()

r, _, kubeClient, omConnectionFactory, _ := defaultClusterReconciler(ctx, nil, "", "", sc, nil)
r, _, kubeClient, omConnectionFactory, _ := defaultShardedClusterReconciler(ctx, nil, "", "", sc, nil)
addKubernetesTlsResources(ctx, r.client, sc)
checkReconcileSuccessful(ctx, t, r, sc, kubeClient)

@@ -770,15 +770,16 @@ func Test_NoAdditionalDomainsPresent(t *testing.T) {
// The default secret we create does not contain additional domains so it will not be valid for this RS
rs.Spec.Security.TLSConfig.AdditionalCertificateDomains = []string{"foo"}

reconciler, _, client, _, err := defaultClusterReconciler(ctx, nil, "", "", rs, nil)
require.NoError(t, err)
addKubernetesTlsResources(ctx, client, rs)
kubeClient, omConnectionFactory := mock.NewDefaultFakeClient(rs)
reconciler := newReplicaSetReconciler(ctx, kubeClient, nil, "", "", false, false, omConnectionFactory.GetConnectionFunc)
Collaborator Author


this was previously using the sharded cluster reconciler instead of the replica set reconciler

addKubernetesTlsResources(ctx, kubeClient, rs)

secret := &corev1.Secret{}
certSecret := &corev1.Secret{}

_ = client.Get(ctx, types.NamespacedName{Name: fmt.Sprintf("%s-cert", rs.Name), Namespace: rs.Namespace}, secret)
_ = kubeClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf("%s-cert", rs.Name), Namespace: rs.Namespace}, certSecret)

err = certs.VerifyAndEnsureCertificatesForStatefulSet(ctx, reconciler.SecretClient, reconciler.SecretClient, fmt.Sprintf("%s-cert", rs.Name), certs.ReplicaSetConfig(*rs), nil)
err := certs.VerifyAndEnsureCertificatesForStatefulSet(ctx, reconciler.SecretClient, reconciler.SecretClient, fmt.Sprintf("%s-cert", rs.Name), certs.ReplicaSetConfig(*rs), nil)
require.Error(t, err)
for i := 0; i < rs.Spec.Members; i++ {
expectedErrorMessage := fmt.Sprintf("domain %s-%d.foo is not contained in the list of DNSNames", rs.Name, i)
assert.Contains(t, err.Error(), expectedErrorMessage)