
Commit 6fe9092

Dev: sbd: Improve leverage maintenance mode

1 parent 2b481a7

4 files changed: +57, -14 lines

crmsh/sbd.py

Lines changed: 5 additions & 2 deletions
@@ -710,6 +710,9 @@ def init_and_deploy_sbd(self):
         self._load_attributes_from_bootstrap()
 
         with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
+                return
+
             self.initialize_sbd()
             self.update_configuration()
             self.enable_sbd_service()
@@ -723,7 +726,7 @@ def init_and_deploy_sbd(self):
             # because the stonith-watchdog-timeout property requires sbd.service to be active.
             restart_cluster_first = self.diskless_sbd and not ServiceManager().service_is_active(constants.SBD_SERVICE)
             if restart_cluster_first:
-                SBDManager.restart_cluster_if_possible(with_maintenance_mode=enabled)
+                bootstrap.restart_cluster()
 
             self.configure_sbd()
             bootstrap.adjust_properties(with_sbd=True)
@@ -733,7 +736,7 @@ def init_and_deploy_sbd(self):
             # This helps prevent unexpected issues, such as nodes being fenced
             # due to large SBD_WATCHDOG_TIMEOUT values combined with smaller timeouts.
             if not restart_cluster_first:
-                SBDManager.restart_cluster_if_possible(with_maintenance_mode=enabled)
+                bootstrap.restart_cluster()
 
     def join_sbd(self, remote_user, peer_host):
        '''
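
Read together, the three hunks move the safety decision to a single choke point: instead of each restart site deciding for itself via SBDManager.restart_cluster_if_possible(with_maintenance_mode=enabled), able_to_restart_cluster(enabled) now runs once as a precondition, and both restart sites call a plain bootstrap.restart_cluster(). A condensed sketch of the resulting flow (intermediate steps elided; the names are those in the hunks above):

def init_and_deploy_sbd(self):
    self._load_attributes_from_bootstrap()

    with utils.leverage_maintenance_mode() as enabled:
        # The safety decision now happens once, up front.
        if not utils.able_to_restart_cluster(enabled):
            return

        self.initialize_sbd()
        self.update_configuration()
        self.enable_sbd_service()

        # Diskless SBD must restart before configure_sbd(), since the
        # stonith-watchdog-timeout property needs sbd.service active.
        restart_cluster_first = (self.diskless_sbd and
            not ServiceManager().service_is_active(constants.SBD_SERVICE))
        if restart_cluster_first:
            bootstrap.restart_cluster()

        self.configure_sbd()
        bootstrap.adjust_properties(with_sbd=True)
        # ... remaining steps elided ...
        if not restart_cluster_first:
            bootstrap.restart_cluster()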

crmsh/ui_sbd.py

Lines changed: 18 additions & 12 deletions
@@ -492,8 +492,11 @@ def _device_remove(self, devices_to_remove: typing.List[str]):
 
         logger.info("Remove devices: %s", ';'.join(devices_to_remove))
         update_dict = {"SBD_DEVICE": ";".join(left_device_list)}
-        sbd.SBDManager.update_sbd_configuration(update_dict)
-        sbd.SBDManager.restart_cluster_if_possible()
+        with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
+                return
+            sbd.SBDManager.update_sbd_configuration(update_dict)
+            bootstrap.restart_cluster()
 
     @command.completers_repeating(sbd_device_completer)
     def do_device(self, context, *args) -> bool:
@@ -578,17 +581,20 @@ def do_purge(self, context, *args) -> bool:
 
         utils.check_all_nodes_reachable("purging SBD")
 
-        if args and args[0] == "crashdump":
-            self._set_crashdump_option(delete=True)
-            update_dict = self._set_crashdump_in_sysconfig(restore=True)
-            if update_dict:
-                sbd.SBDManager.update_sbd_configuration(update_dict)
-                sbd.SBDManager.restart_cluster_if_possible()
-            return True
+        with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
+                return False
 
-        sbd.purge_sbd_from_cluster()
-        sbd.SBDManager.restart_cluster_if_possible()
-        return True
+            if args and args[0] == "crashdump":
+                self._set_crashdump_option(delete=True)
+                update_dict = self._set_crashdump_in_sysconfig(restore=True)
+                if update_dict:
+                    sbd.SBDManager.update_sbd_configuration(update_dict)
+            else:
+                sbd.purge_sbd_from_cluster()
+
+            bootstrap.restart_cluster()
+            return True
 
     def _print_sbd_type(self):
         if not self.service_manager.service_is_active(constants.SBD_SERVICE):
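
Both rewrites lean on the same Python guarantee: returning from inside a with block still runs the context manager's __exit__, so whatever cleanup leverage_maintenance_mode() performs on exit (presumably reverting maintenance mode) also happens on the early-return path. A toy stand-in illustrating that guarantee, with the real context manager replaced by prints:

from contextlib import contextmanager

@contextmanager
def toy_maintenance_mode():
    # Stand-in for utils.leverage_maintenance_mode(): yields whether
    # maintenance mode was enabled, and always cleans up on exit.
    print("maintenance mode on")
    try:
        yield True
    finally:
        print("maintenance mode off")  # runs even on early return

def guarded_action(ok_to_restart):
    with toy_maintenance_mode() as enabled:
        if not ok_to_restart:
            return  # __exit__ (the finally block) still fires here
        print("restarting cluster")

guarded_action(False)
# Output:
# maintenance mode on
# maintenance mode off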

crmsh/utils.py

Lines changed: 27 additions & 0 deletions
@@ -3306,4 +3306,31 @@ def validate_and_get_reachable_nodes(
             member_list.remove(node)
 
     return member_list + remote_list
+
+
+def able_to_restart_cluster(in_maintenance_mode: bool) -> bool:
+    """
+    Check whether it is able to restart cluster now
+    1. If pacemaker is not running, return True
+    2. If no non-stonith resource is running, return True
+    3. If in maintenance mode and DLM is not running, return True
+    4. Otherwise, return False with warning messages to guide user
+    """
+    if not ServiceManager().service_is_active(constants.PCMK_SERVICE):
+        return True
+    crm_mon_parser = xmlutil.CrmMonXmlParser()
+    if not crm_mon_parser.is_non_stonith_resource_running():
+        return True
+    elif in_maintenance_mode:
+        if is_dlm_running():
+            dlm_related_ids = crm_mon_parser.get_resource_top_parent_id_set_via_type(constants.DLM_CONTROLD_RA)
+            logger.warning("Please stop DLM related resources (%s) and try again", ', '.join(dlm_related_ids))
+            return False
+        else:
+            return True
+    else:
+        logger.warning("Please stop all running resources and try again")
+        logger.warning("Or run this command with -F/--force option to leverage maintenance mode")
+        logger.warning("Understand risks that running RA has no cluster protection while the cluster is in maintenance mode and restarting")
+        return False
 # vim:ts=4:sw=4:et:
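
The docstring's four rules form a small decision table. A sketch of how they could be exercised with mocks; this assumes crmsh is importable in the test environment, and the mock targets mirror the names the hunk uses inside crmsh.utils:

import unittest
from unittest import mock

from crmsh import utils


class TestAbleToRestartCluster(unittest.TestCase):

    @mock.patch('crmsh.utils.ServiceManager')
    def test_pacemaker_down(self, mock_sm):
        # Rule 1: pacemaker not running -> restart is trivially safe.
        mock_sm.return_value.service_is_active.return_value = False
        self.assertTrue(utils.able_to_restart_cluster(False))

    @mock.patch('crmsh.utils.xmlutil.CrmMonXmlParser')
    @mock.patch('crmsh.utils.ServiceManager')
    def test_only_stonith_running(self, mock_sm, mock_parser):
        # Rule 2: no non-stonith resource running -> safe.
        mock_sm.return_value.service_is_active.return_value = True
        mock_parser.return_value.is_non_stonith_resource_running.return_value = False
        self.assertTrue(utils.able_to_restart_cluster(False))

    @mock.patch('crmsh.utils.is_dlm_running')
    @mock.patch('crmsh.utils.xmlutil.CrmMonXmlParser')
    @mock.patch('crmsh.utils.ServiceManager')
    def test_maintenance_mode_no_dlm(self, mock_sm, mock_parser, mock_dlm):
        # Rule 3: in maintenance mode with no DLM running -> safe.
        mock_sm.return_value.service_is_active.return_value = True
        mock_parser.return_value.is_non_stonith_resource_running.return_value = True
        mock_dlm.return_value = False
        self.assertTrue(utils.able_to_restart_cluster(True))

    @mock.patch('crmsh.utils.xmlutil.CrmMonXmlParser')
    @mock.patch('crmsh.utils.ServiceManager')
    def test_resources_running_no_maintenance(self, mock_sm, mock_parser):
        # Rule 4: running resources, no maintenance mode -> refused.
        mock_sm.return_value.service_is_active.return_value = True
        mock_parser.return_value.is_non_stonith_resource_running.return_value = True
        self.assertFalse(utils.able_to_restart_cluster(False))


if __name__ == '__main__':
    unittest.main()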

crmsh/xmlutil.py

Lines changed: 7 additions & 0 deletions
@@ -1609,6 +1609,13 @@ def is_resource_started(self, ra):
         xpath = f'//resource[(@id="{ra}" or @resource_agent="{ra}") and @active="true" and @role="Started"]'
         return bool(self.xml_elem.xpath(xpath))
 
+    def get_resource_top_parent_id_set_via_type(self, ra_type):
+        """
+        Given configured ra type, get the topmost parent ra id set
+        """
+        xpath = f'//resource[@resource_agent="{ra_type}"]'
+        return set([get_topmost_rsc(elem).get('id') for elem in self.xml_elem.xpath(xpath)])
+
     def get_resource_id_list_via_type(self, ra_type):
         """
         Given configured ra type, get the ra id list
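
get_topmost_rsc() is a pre-existing xmlutil helper, so the new method maps every matching <resource> element to the id of its outermost wrapper; for DLM this collapses the per-node controld clone instances into the single clone id that able_to_restart_cluster() asks the user to stop. A standalone illustration, with a simplified crm_mon snippet and a hand-rolled walk-up standing in for the real helper:

from lxml import etree

crm_mon_xml = """
<resources>
  <clone id="cl-dlm" multi_state="false">
    <resource id="dlm" resource_agent="ocf:pacemaker:controld" active="true"/>
    <resource id="dlm" resource_agent="ocf:pacemaker:controld" active="true"/>
  </clone>
  <resource id="stonith-sbd" resource_agent="stonith:external/sbd" active="true"/>
</resources>
"""

def topmost(elem):
    # Walk up until the parent is the <resources> container.
    while elem.getparent() is not None and elem.getparent().tag != 'resources':
        elem = elem.getparent()
    return elem

root = etree.fromstring(crm_mon_xml)
ids = {topmost(e).get('id')
       for e in root.xpath('//resource[@resource_agent="ocf:pacemaker:controld"]')}
print(ids)  # {'cl-dlm'} -- both controld instances collapse to the clone id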
