diff --git a/example/cmd/microctl/cluster_members.go b/example/cmd/microctl/cluster_members.go
index 7c1fdb65..97d373f8 100644
--- a/example/cmd/microctl/cluster_members.go
+++ b/example/cmd/microctl/cluster_members.go
@@ -139,7 +139,8 @@ func (c *cmdClusterMembersList) listLocalClusterMembers(m *microcluster.MicroClu
 type cmdClusterMemberRemove struct {
 	common *CmdControl
 
-	flagForce bool
+	flagForce   bool
+	flagAddress string
 }
 
 func (c *cmdClusterMemberRemove) command() *cobra.Command {
@@ -150,7 +151,8 @@ func (c *cmdClusterMemberRemove) command() *cobra.Command {
 	}
 
 	cmd.Flags().BoolVarP(&c.flagForce, "force", "f", false, "Forcibly remove the cluster member")
+	cmd.Flags().StringVar(&c.flagAddress, "address", "", "Optional fallback address of the cluster member to remove")
 
 	return cmd
 }
 
@@ -164,7 +166,7 @@ func (c *cmdClusterMemberRemove) run(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
-	return m.RemoveClusterMember(cmd.Context(), args[0], c.flagForce)
+	return m.RemoveClusterMember(cmd.Context(), args[0], c.flagAddress, c.flagForce)
 }
 
 type cmdClusterEdit struct {
diff --git a/example/test/main.sh b/example/test/main.sh
index 7ac7b59e..8dac73c8 100755
--- a/example/test/main.sh
+++ b/example/test/main.sh
@@ -488,11 +488,112 @@ test_membership_consistency() {
     cat /tmp/token_error
   fi
 
+  # Test that the force flag with an explicit fallback address overrides the consistency check during removal
+  echo " -> Testing member force removal with inconsistent state"
+  if microctl --state-dir "${test_dir}/c1" cluster remove c2 --force --address 127.0.0.1:9002; then
+    echo " -> Member c2 force removal succeeded as expected"
+  else
+    echo "ERROR: Member force removal should have succeeded despite inconsistent state"
+    exit 1
+  fi
+
+  # Generating a new token should now succeed
+  echo " -> Testing token generation after resolving inconsistency"
+  if microctl --state-dir "${test_dir}/c1" tokens add c5 2>/tmp/token_resp; then
+    echo " -> Token generation succeeded as expected after resolving inconsistency"
+    cat /tmp/token_resp
+  else
+    echo "ERROR: Token generation should have succeeded after resolving inconsistency"
+    cat /tmp/token_resp
+    exit 1
+  fi
 
   echo " -> Membership consistency checks working as expected"
 
   shutdown_systems
 }
 
+test_truststore_force_removal() {
+  echo "Testing truststore force removal"
+
+  new_systems 3 --heartbeat 2s
+
+  # Bootstrap first member
+  microctl --state-dir "${test_dir}/c1" init "c1" 127.0.0.1:9001 --bootstrap
+
+  # Join second and third members
+  token_c2=$(microctl --state-dir "${test_dir}/c1" tokens add "c2")
+  microctl --state-dir "${test_dir}/c2" init "c2" 127.0.0.1:9002 --token "${token_c2}"
+
+  token_c3=$(microctl --state-dir "${test_dir}/c1" tokens add "c3")
+  microctl --state-dir "${test_dir}/c3" init "c3" 127.0.0.1:9003 --token "${token_c3}"
+
+  # Wait for the cluster to stabilize
+  echo " -> Waiting for cluster to stabilize"
+  retry_count=0
+  max_retries=10
+  while [[ -n "$(microctl --state-dir "${test_dir}/c1" cluster list -f yaml | yq '.[] | select(.role == "PENDING")')" ]] && [[ ${retry_count} -lt ${max_retries} ]]; do
+    sleep 2
+    retry_count=$((retry_count + 1))
+  done
+
+  echo " -> Cluster established with 3 members"
+  microctl --state-dir "${test_dir}/c1" cluster list
+
+  # Test force removal of a non-existing member with a random address (must fail)
+  echo " -> Testing force removal of non-existing member with random address"
+  if microctl --state-dir "${test_dir}/c1" cluster remove nonexist --force --address 127.0.0.1:9999 2>/tmp/remove_error; then
+    echo "ERROR: Force removal of non-existing member should have failed"
should have failed" + cat /tmp/remove_error + exit 1 + else + echo " -> Force removal of non-existing member failed as expected" + cat /tmp/remove_error + fi + + # Simulate truststore corruption: remove c3 from truststore while keeping DB and dqlite entries + # Need to remove from all nodes' truststores to prevent repopulation + echo " -> Simulating truststore deletion of c3 from all nodes (keeping DB and dqlite entries)" + rm -f "${test_dir}/c1/truststore/c3.yaml" + rm -f "${test_dir}/c2/truststore/c3.yaml" + rm -f "${test_dir}/c3/truststore/c3.yaml" + + # Attempt normal removal should fail (membership inconsistency detected) + echo " -> Testing normal removal of c3 (should fail due to missing truststore)" + if microctl --state-dir "${test_dir}/c1" cluster remove c3 --address 127.0.0.1:9003 2>/tmp/remove_error; then + echo "ERROR: Normal removal should have failed" + exit 1 + else + echo " -> Normal removal blocked as expected" + cat /tmp/remove_error + fi + + # Force remove with explicit address should succeed + echo " -> Testing force removal of c3 with address override" + if microctl --state-dir "${test_dir}/c1" cluster remove c3 --force --address 127.0.0.1:9003; then + echo " -> Force removal of c3 succeeded" + else + echo "ERROR: Force removal should have succeeded" + exit 1 + fi + + # Now generate a new token - this should succeed because membership is now consistent + echo " -> Testing token generation after force removal (should succeed)" + if microctl --state-dir "${test_dir}/c1" tokens add "c4" 2>/tmp/token_resp; then + echo " -> Token generation succeeded - membership is now consistent" + cat /tmp/token_resp + else + echo "ERROR: Token generation should have succeeded after force removal" + cat /tmp/token_resp + exit 1 + fi + + echo "SUCCESS: Force removal of non-existing member and random address blocked as expected" + echo "SUCCESS: Force removal of truststore-orphaned node successful" + echo "SUCCESS: Verified membership consistency restored after force removal" + + shutdown_systems +} + test_parallel_joins() { echo "Testing parallel joins" @@ -580,6 +681,7 @@ if [ "${1:-"all"}" = "all" ] || [ "${1}" = "" ]; then test_join_token_before_cluster_formed test_extended_endpoints test_membership_consistency + test_truststore_force_removal test_parallel_joins elif [ "${1}" = "recover" ]; then test_recover @@ -595,6 +697,8 @@ elif [ "${1}" = "extended" ]; then test_extended_endpoints elif [ "${1}" = "membership" ]; then test_membership_consistency +elif [ "${1}" = "force-removal" ]; then + test_truststore_force_removal elif [ "${1}" = "parallel-join" ]; then test_parallel_joins else diff --git a/internal/rest/client/cluster.go b/internal/rest/client/cluster.go index 9fcbddb4..4d29c75d 100644 --- a/internal/rest/client/cluster.go +++ b/internal/rest/client/cluster.go @@ -58,7 +58,11 @@ func ResetClusterMember(ctx context.Context, c types.Client, name string, force } // DeleteClusterMember deletes the cluster member with the given name. -func DeleteClusterMember(ctx context.Context, c types.Client, name string, force bool) error { +// If `address` is non-empty it is sent as a query parameter to target dqlite +// removal by address when the truststore cannot map name to address. +// Dqlite does not track names, so this is useful when the name is no longer +// resolvable. 
+func DeleteClusterMember(ctx context.Context, c types.Client, name string, address string, force bool) error {
 	queryCtx, cancel := withTimeoutIfUnset(ctx)
 	defer cancel()
 
@@ -67,6 +71,10 @@ func DeleteClusterMember(ctx context.Context, c types.Client, name string, force
 		endpoint = endpoint.WithQuery("force", "1")
 	}
 
+	if address != "" {
+		endpoint = endpoint.WithQuery("address", address)
+	}
+
 	return c.Query(queryCtx, "DELETE", types.PublicEndpoint, &endpoint.URL, nil, nil)
 }
diff --git a/internal/rest/resources/cluster.go b/internal/rest/resources/cluster.go
index fc2c8768..8551a897 100644
--- a/internal/rest/resources/cluster.go
+++ b/internal/rest/resources/cluster.go
@@ -417,18 +417,49 @@ func resetClusterMember(ctx context.Context, s state.State, force bool) (reExec
 // clusterMemberDelete Removes a cluster member from dqlite and re-execs its daemon.
 func clusterMemberDelete(s state.State, r *http.Request) response.Response {
 	force := r.URL.Query().Get("force") == "1"
+	addr := r.URL.Query().Get("address")
 
 	name, err := url.PathUnescape(mux.Vars(r)["name"])
 	if err != nil {
 		return response.SmartError(err)
 	}
 
-	allRemotes := s.Truststore().RemotesByName()
-	remote, ok := allRemotes[name]
-	if !ok {
-		return response.SmartError(fmt.Errorf("No remote exists with the given name %q", name))
+	ctx := r.Context()
+
+	logger, err := log.LoggerFromContext(ctx)
+	if err != nil {
+		return response.InternalError(err)
 	}
 
-	ctx := r.Context()
+	allRemotes := s.Truststore().RemotesByName()
+	remote, remotePresent := allRemotes[name]
+
+	// Determine which address to use for the dqlite removal:
+	// - If the remote exists in the truststore and no address was provided, use the truststore address.
+	// - If the remote is missing and no address was provided, require an explicit address.
+	// - If an address was provided, it must match the truststore address (if the remote exists) or be valid (if not).
+	if remotePresent && addr == "" {
+		addr = remote.Address.String()
+	} else if !remotePresent && addr == "" {
+		// Without a truststore entry or an explicit address we cannot proceed.
+		return response.SmartError(fmt.Errorf("Cluster member %q not found in truststore; please provide a node address", name))
+	} else if remotePresent && addr != "" && remote.Address.String() != addr {
+		// Reject a provided address that doesn't match the truststore address for this remote name.
+		return response.SmartError(fmt.Errorf("Provided address %q does not match the address %q of the remote with name %q", addr, remote.Address.String(), name))
+	} else if !remotePresent && addr != "" {
+		// The remote is missing from the truststore; validate the fallback address format.
+		addrPort, err := types.ParseAddrPort(addr)
+		if err != nil {
+			return response.SmartError(fmt.Errorf("Invalid address %q: %w", addr, err))
+		}
+
+		// Ensure the fallback address isn't claimed by another remote in the truststore.
+		existingRemote := s.Truststore().RemoteByAddress(addrPort)
+		if existingRemote != nil {
+			return response.SmartError(fmt.Errorf("Address %q is already used by remote %q (address %q); an address is only a fallback for %q when it is missing from the truststore", addr, existingRemote.Name, existingRemote.Address.String(), name))
+		}
+
+		logger.Warn("Cluster member not found in truststore; proceeding with provided fallback address", slog.String("member", name), slog.String("address", addr))
+	}
 
 	// Check cluster membership consistency before allowing removals (unless forced)
 	// This ensures core_cluster_members, truststore, and dqlite are all in sync
@@ -454,14 +485,9 @@ func clusterMemberDelete(s state.State, r *http.Request) response.Response {
 		return response.SmartError(err)
 	}
 
-	logger, err := log.LoggerFromContext(ctx)
-	if err != nil {
-		return response.InternalError(err)
-	}
-
 	// If we are not the leader, just forward the request.
 	if leaderInfo.Address != s.Address().Host {
-		if allRemotes[name].Address.String() == s.Address().Host {
+		if addr == s.Address().Host {
 			// If the member being removed is ourselves and we are not the leader, then lock the
 			// clusterPutDisableMu before we forward the request to the leader, so that when the leader
 			// goes on to request clusterPutDisable back to ourselves it won't be actioned until we
@@ -482,7 +508,7 @@ func clusterMemberDelete(s state.State, r *http.Request) response.Response {
 			return response.SmartError(err)
 		}
 
-		err = internalClient.DeleteClusterMember(ctx, client, name, force)
+		err = internalClient.DeleteClusterMember(ctx, client, name, addr, force)
 		if err != nil {
 			return response.SmartError(err)
 		}
@@ -511,7 +537,7 @@ func clusterMemberDelete(s state.State, r *http.Request) response.Response {
 
 	index := -1
 	for i, node := range info {
-		if node.Address == remote.Address.String() {
+		if node.Address == addr {
 			index = i
 			break
 		}
@@ -519,7 +545,7 @@ func clusterMemberDelete(s state.State, r *http.Request) response.Response {
 
 	// If we can't find the node in dqlite, that means it failed to fully initialize. It still might have a record in our database so continue along anyway.
 	if index < 0 {
-		logger.Error(fmt.Sprintf("No dqlite record exists for %q, deleting from internal record instead", remote.Name))
+		logger.Error("No dqlite record exists for the member, deleting from internal record instead", slog.String("member", name))
 	}
 
 	var clusterMembers []cluster.CoreClusterMember
@@ -533,6 +559,20 @@ func clusterMemberDelete(s state.State, r *http.Request) response.Response {
 		return response.SmartError(err)
 	}
 
+	// Check whether the member has a record in the database.
+	memberInDB := false
+	for _, m := range clusterMembers {
+		if m.Address == addr {
+			memberInDB = true
+			break
+		}
+	}
+
+	// If the member is in neither dqlite nor the database, there is nothing to remove.
+	if index < 0 && !memberInDB {
+		return response.SmartError(fmt.Errorf("Cluster member %q with address %q not found in dqlite or database", name, addr))
+	}
+
 	numPending := 0
 	for _, clusterMember := range clusterMembers {
 		if clusterMember.Role == cluster.Pending {
@@ -549,7 +589,7 @@ func clusterMemberDelete(s state.State, r *http.Request) response.Response {
 	}
 
 	// If we are removing the leader of a 2-node cluster, ensure the remaining node is a voter.
-	if len(info) == 2 && allRemotes[name].Address.String() == leaderInfo.Address {
+	if len(info) == 2 && addr == leaderInfo.Address {
 		for _, node := range info {
 			if node.Address != leaderInfo.Address && node.Role != dqliteClient.Voter {
 				err = leader.Assign(ctx, node.ID, dqliteClient.Voter)
@@ -567,10 +607,10 @@ func clusterMemberDelete(s state.State, r *http.Request) response.Response {
 	}
 
 	// If we are the leader and removing ourselves, reassign the leader role and perform the removal from there.
-	if allRemotes[name].Address.String() == leaderInfo.Address {
+	if remotePresent && addr == leaderInfo.Address {
 		otherNodes := []uint64{}
 		for _, node := range info {
-			if node.Address != allRemotes[name].Address.String() && node.Role == dqliteClient.Voter {
+			if node.Address != addr && node.Role == dqliteClient.Voter {
 				otherNodes = append(otherNodes, node.ID)
 			}
 		}
@@ -605,7 +645,7 @@ func clusterMemberDelete(s state.State, r *http.Request) response.Response {
 		clusterDisableMu.Unlock()
 	}()
 
-	err = internalClient.DeleteClusterMember(ctx, client, name, force)
+	err = internalClient.DeleteClusterMember(ctx, client, name, addr, force)
 	if err != nil {
 		return response.SmartError(err)
 	}
@@ -632,22 +672,38 @@ func clusterMemberDelete(s state.State, r *http.Request) response.Response {
 		return response.SmartError(err)
 	}
 
-	// Set the forwarded flag so that the the system to be removed knows the removal is in progress.
-	c, err := internalClient.New(remote.URL(), s.ServerCert(), publicKey, true)
-	if err != nil {
-		return response.SmartError(err)
+	var memberURL *url.URL
+	if !remotePresent {
+		memberURL, err = url.Parse("https://" + addr)
+		if err != nil {
+			return response.SmartError(fmt.Errorf("Invalid address %q: %w", addr, err))
+		}
+	} else {
+		memberURL = remote.URL()
 	}
 
 	// Tell the cluster member to run its PreRemove hook and return.
-	err = internalClient.RunPreRemoveHook(ctx, c.UseTarget(name), types.HookRemoveMemberOptions{Force: force})
-	if err != nil && !force {
-		return response.SmartError(err)
+	// Set the forwarded flag so that the system to be removed knows the removal is in progress.
+	c, err := internalClient.New(memberURL, s.ServerCert(), publicKey, true)
+	if err != nil {
+		if !force {
+			return response.SmartError(err)
+		}
+
+		logger.Warn("Failed creating client for the member's PreRemove hook, forcing removal", slog.String("error", err.Error()))
+	} else {
+		err = internalClient.RunPreRemoveHook(ctx, c.UseTarget(name), types.HookRemoveMemberOptions{Force: force})
+		if err != nil && !force {
+			return response.SmartError(err)
+		}
 	}
 
-	// Remove the cluster member from the database.
+	// Remove the cluster member record from the database by its address; at this
+	// point addr is guaranteed to be non-empty.
 	err = s.Database().Transaction(ctx, func(ctx context.Context, tx *sql.Tx) error {
-		return cluster.DeleteCoreClusterMember(ctx, tx, remote.Address.String())
+		return cluster.DeleteCoreClusterMember(ctx, tx, addr)
 	})
+
 	if err != nil && !force {
 		return response.SmartError(err)
 	}
@@ -660,10 +716,10 @@ func clusterMemberDelete(s state.State, r *http.Request) response.Response {
 		}
 	}
 
-	url := api.NewURL()
-	url.URL = *s.FileSystem().ControlSocket()
+	u := api.NewURL()
+	u.URL = *s.FileSystem().ControlSocket()
 
-	localClient, err := s.Connect().Member(&url.URL, false, nil)
+	localClient, err := s.Connect().Member(&u.URL, false, nil)
 	if err != nil {
 		return response.SmartError(err)
 	}
@@ -673,14 +729,18 @@ func clusterMemberDelete(s state.State, r *http.Request) response.Response {
 		return response.SmartError(err)
 	}
 
-	client, err := s.Connect().Member(remote.URL(), false, publicKey)
+	client, err := s.Connect().Member(memberURL, false, publicKey)
 	if err != nil {
-		return response.SmartError(err)
-	}
+		if !force {
+			return response.SmartError(err)
+		}
 
-	err = internalClient.ResetClusterMember(ctx, client, name, force)
-	if err != nil && !force {
-		return response.SmartError(err)
+		logger.Warn("Failed connecting to cluster member to perform a node reset", slog.String("error", err.Error()), slog.Bool("force", force))
+	} else {
+		err = internalClient.ResetClusterMember(ctx, client, name, force)
+		if err != nil && !force {
+			return response.SmartError(err)
+		}
 	}
 
 	intState, err := internalState.ToInternal(s)
diff --git a/internal/rest/resources/control.go b/internal/rest/resources/control.go
index f13663e4..083e41f1 100644
--- a/internal/rest/resources/control.go
+++ b/internal/rest/resources/control.go
@@ -131,7 +131,7 @@ func controlPost(state state.State, r *http.Request) response.Response {
 		<-r.Context().Done()
 
 		// Use `force=1` to ensure the node is fully removed, in case its listener hasn't been set up.
-		err = internalClient.DeleteClusterMember(context.Background(), client, req.Name, true)
+		err = internalClient.DeleteClusterMember(context.Background(), client, req.Name, "", true)
 		if err != nil {
 			logger.Error("Failed to clean up cluster state after join failure", slog.String("error", err.Error()))
 		}
diff --git a/microcluster/app.go b/microcluster/app.go
index f5f54463..e83c7e63 100644
--- a/microcluster/app.go
+++ b/microcluster/app.go
@@ -323,13 +323,17 @@ func (m *MicroCluster) RevokeJoinToken(ctx context.Context, name string) error {
 }
 
 // RemoveClusterMember removes a member from the cluster.
-func (m *MicroCluster) RemoveClusterMember(ctx context.Context, name string, force bool) error {
+// If `address` is non-empty, it is sent as a query parameter so that the
+// dqlite removal can be targeted by address when the truststore cannot map
+// the name to an address. Dqlite does not track names, so this is useful
+// when the name is no longer resolvable.
+func (m *MicroCluster) RemoveClusterMember(ctx context.Context, name string, address string, force bool) error {
 	c, err := m.LocalClient()
 	if err != nil {
 		return err
 	}
 
-	return internalClient.DeleteClusterMember(ctx, c, name, address, force)
+	return internalClient.DeleteClusterMember(ctx, c, name, address, force)
 }
 
 // LocalClient returns a client connected to the local control socket.
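Usage sketch (reviewer note, not part of the patch): the snippet below shows how the changed RemoveClusterMember signature is called from the public Go API, mirroring what the new test exercises via `microctl cluster remove c3 --force --address 127.0.0.1:9003`. It is a minimal sketch only; the import path is an assumption (adjust to the version in go.mod), the daemon setup is elided, and `removeMembers` is a hypothetical helper.

package main

import (
	"context"
	"log"

	// Assumed module path; adjust to the major version your go.mod pins.
	"github.com/canonical/microcluster/v2/microcluster"
)

// removeMembers is illustrative only; it assumes `app` is an initialized
// *microcluster.MicroCluster handle with a running local daemon.
func removeMembers(ctx context.Context, app *microcluster.MicroCluster) {
	// Healthy path: the truststore still maps "c2" to an address, so the
	// fallback address is left empty and force stays off.
	if err := app.RemoveClusterMember(ctx, "c2", "", false); err != nil {
		log.Printf("removal of c2 failed: %v", err)
	}

	// Truststore-orphaned path: "c3" can no longer be resolved by name, so
	// its dqlite entry is targeted via the last known address, with force on.
	if err := app.RemoveClusterMember(ctx, "c3", "127.0.0.1:9003", true); err != nil {
		log.Printf("forced removal of c3 failed: %v", err)
	}
}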