diff --git a/crmsh/sbd.py b/crmsh/sbd.py
index 1ef2e950e..a0253ce68 100644
--- a/crmsh/sbd.py
+++ b/crmsh/sbd.py
@@ -594,22 +594,6 @@ def enable_sbd_service(self):
                 logger.info("Enable %s on node %s", constants.SBD_SERVICE, node)
                 service_manager.enable_service(constants.SBD_SERVICE, node)
 
-    @staticmethod
-    def restart_cluster_if_possible(with_maintenance_mode=False):
-        if not ServiceManager().service_is_active(constants.PCMK_SERVICE):
-            return
-        if not xmlutil.CrmMonXmlParser().is_non_stonith_resource_running():
-            bootstrap.restart_cluster()
-        elif with_maintenance_mode:
-            if not utils.is_dlm_running():
-                bootstrap.restart_cluster()
-            else:
-                logger.warning("Resource is running, need to restart cluster service manually on each node")
-        else:
-            logger.warning("Resource is running, need to restart cluster service manually on each node")
-            logger.warning("Or, run with `crm -F` or `--force` option, the `sbd` subcommand will leverage maintenance mode for any changes that require restarting sbd.service")
-            logger.warning("Understand risks that running RA has no cluster protection while the cluster is in maintenance mode and restarting")
-
     def configure_sbd(self):
         '''
         Configure fence_sbd resource and related properties
@@ -747,6 +731,9 @@ def init_and_deploy_sbd(self, restart_first=False):
         self._load_attributes_from_bootstrap()
 
         with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
+                return
+
             self.initialize_sbd()
             self.update_configuration()
             self.enable_sbd_service()
@@ -761,7 +748,7 @@ def init_and_deploy_sbd(self, restart_first=False):
             restart_cluster_first = restart_first or \
                     (self.diskless_sbd and not ServiceManager().service_is_active(constants.SBD_SERVICE))
             if restart_cluster_first:
-                SBDManager.restart_cluster_if_possible(with_maintenance_mode=enabled)
+                bootstrap.restart_cluster()
 
             self.configure_sbd()
             bootstrap.adjust_properties(with_sbd=True)
@@ -771,7 +758,7 @@ def init_and_deploy_sbd(self, restart_first=False):
             # This helps prevent unexpected issues, such as nodes being fenced
             # due to large SBD_WATCHDOG_TIMEOUT values combined with smaller timeouts.
             if not restart_cluster_first:
-                SBDManager.restart_cluster_if_possible(with_maintenance_mode=enabled)
+                bootstrap.restart_cluster()
 
     def join_sbd(self, remote_user, peer_host):
         '''
diff --git a/crmsh/ui_sbd.py b/crmsh/ui_sbd.py
index d9bae7e6e..2733f33e9 100644
--- a/crmsh/ui_sbd.py
+++ b/crmsh/ui_sbd.py
@@ -517,8 +517,11 @@ def _device_remove(self, devices_to_remove: typing.List[str]):
 
         logger.info("Remove devices: %s", ';'.join(devices_to_remove))
         update_dict = {"SBD_DEVICE": ";".join(left_device_list)}
-        sbd.SBDManager.update_sbd_configuration(update_dict)
-        sbd.SBDManager.restart_cluster_if_possible()
+        with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
+                return
+            sbd.SBDManager.update_sbd_configuration(update_dict)
+            bootstrap.restart_cluster()
 
     @command.completers_repeating(sbd_device_completer)
     def do_device(self, context, *args) -> bool:
@@ -601,22 +604,34 @@ def do_purge(self, context, *args) -> bool:
 
         if not self._service_is_active(constants.SBD_SERVICE):
             return False
 
+        purge_crashdump = False
+        if args:
+            if args[0] == "crashdump":
+                if not self._is_crashdump_configured():
+                    logger.error("SBD crashdump is not configured")
+                    return False
+                purge_crashdump = True
+            else:
+                logger.error("Invalid argument: %s", ' '.join(args))
+                logger.info("Usage: crm sbd purge [crashdump]")
+                return False
+
         utils.check_all_nodes_reachable("purging SBD")
 
-        if args and args[0] == "crashdump":
-            if not self._is_crashdump_configured():
-                logger.error("SBD crashdump is not configured")
+        with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
                 return False
-            self._set_crashdump_option(delete=True)
-            update_dict = self._set_crashdump_in_sysconfig(restore=True)
-            if update_dict:
-                sbd.SBDManager.update_sbd_configuration(update_dict)
-                sbd.SBDManager.restart_cluster_if_possible()
-            return True
-        sbd.purge_sbd_from_cluster()
-        sbd.SBDManager.restart_cluster_if_possible()
-        return True
+            if purge_crashdump:
+                self._set_crashdump_option(delete=True)
+                update_dict = self._set_crashdump_in_sysconfig(restore=True)
+                if update_dict:
+                    sbd.SBDManager.update_sbd_configuration(update_dict)
+            else:
+                sbd.purge_sbd_from_cluster()
+
+            bootstrap.restart_cluster()
+            return True
 
     def _print_sbd_type(self):
         if not self.service_manager.service_is_active(constants.SBD_SERVICE):
diff --git a/crmsh/utils.py b/crmsh/utils.py
index a8a133ecc..3b7002077 100644
--- a/crmsh/utils.py
+++ b/crmsh/utils.py
@@ -3306,4 +3306,31 @@ def validate_and_get_reachable_nodes(
             member_list.remove(node)
 
     return member_list + remote_list
+
+
+def able_to_restart_cluster(in_maintenance_mode: bool = False) -> bool:
+    """
+    Check whether it is able to restart cluster now
+    1. If pacemaker is not running, return True
+    2. If no non-stonith resource is running, return True
+    3. If in maintenance mode and DLM is not running, return True
+    4. Otherwise, return False with warning messages to guide user
+    """
+    if not ServiceManager().service_is_active(constants.PCMK_SERVICE):
+        return True
+    crm_mon_parser = xmlutil.CrmMonXmlParser()
+    if not crm_mon_parser.is_non_stonith_resource_running():
+        return True
+    elif in_maintenance_mode:
+        if is_dlm_running():
+            dlm_related_ids = crm_mon_parser.get_resource_top_parent_id_set_via_type(constants.DLM_CONTROLD_RA)
+            logger.warning("Please stop DLM related resources (%s) and try again", ', '.join(dlm_related_ids))
+            return False
+        else:
+            return True
+    else:
+        logger.warning("Please stop all running resources and try again")
+        logger.warning("Or run this command with -F/--force option to leverage maintenance mode")
+        logger.warning("Understand risks that running RA has no cluster protection while the cluster is in maintenance mode and restarting")
+        return False
 # vim:ts=4:sw=4:et:
diff --git a/crmsh/xmlutil.py b/crmsh/xmlutil.py
index 5377d6f29..f9eef51d0 100644
--- a/crmsh/xmlutil.py
+++ b/crmsh/xmlutil.py
@@ -1627,6 +1627,13 @@ def is_resource_started(self, ra):
         xpath = f'//resource[(@id="{ra}" or @resource_agent="{ra}") and @active="true" and @role="Started"]'
         return bool(self.xml_elem.xpath(xpath))
 
+    def get_resource_top_parent_id_set_via_type(self, ra_type):
+        """
+        Given configured ra type, get the topmost parent ra id set
+        """
+        xpath = f'//resource[@resource_agent="{ra_type}"]'
+        return set([get_topmost_rsc(elem).get('id') for elem in self.xml_elem.xpath(xpath)])
+
     def get_resource_id_list_via_type(self, ra_type):
         """
         Given configured ra type, get the ra id list
diff --git a/test/unittests/test_sbd.py b/test/unittests/test_sbd.py
index 351853f45..52d4fdf87 100644
--- a/test/unittests/test_sbd.py
+++ b/test/unittests/test_sbd.py
@@ -406,57 +406,6 @@ def test_enable_sbd_service(self, mock_list_cluster_nodes, mock_ServiceManager,
             call("Enable %s on node %s", constants.SBD_SERVICE, 'node2')
         ])
 
-    @patch('crmsh.xmlutil.CrmMonXmlParser')
-    @patch('crmsh.sbd.ServiceManager')
-    def test_restart_cluster_if_possible_return(self, mock_ServiceManager, mock_CrmMonXmlParser):
-        mock_ServiceManager.return_value.service_is_active.return_value = False
-        SBDManager.restart_cluster_if_possible()
-        mock_ServiceManager.return_value.service_is_active.assert_called_once_with(constants.PCMK_SERVICE)
-        mock_CrmMonXmlParser.assert_not_called()
-
-    @patch('logging.Logger.warning')
-    @patch('crmsh.utils.is_dlm_running')
-    @patch('crmsh.xmlutil.CrmMonXmlParser')
-    @patch('crmsh.sbd.ServiceManager')
-    def test_restart_cluster_if_possible_manually(
-            self, mock_ServiceManager, mock_CrmMonXmlParser, mock_is_dlm_running, mock_logger_warning,
-    ):
-        mock_ServiceManager.return_value.service_is_active.return_value = True
-        mock_CrmMonXmlParser.return_value.is_non_stonith_resource_running.return_value = True
-        mock_is_dlm_running.return_value = False
-        SBDManager.restart_cluster_if_possible()
-        mock_ServiceManager.return_value.service_is_active.assert_called_once_with(constants.PCMK_SERVICE)
-        mock_logger_warning.assert_has_calls([
-            call("Resource is running, need to restart cluster service manually on each node"),
-            call("Or, run with `crm -F` or `--force` option, the `sbd` subcommand will leverage maintenance mode for any changes that require restarting sbd.service"),
-            call("Understand risks that running RA has no cluster protection while the cluster is in maintenance mode and restarting")
-        ])
-
-    @patch('logging.Logger.warning')
-    @patch('crmsh.utils.is_dlm_running')
-    @patch('crmsh.xmlutil.CrmMonXmlParser')
-    @patch('crmsh.sbd.ServiceManager')
-    def test_restart_cluster_if_possible_dlm_running(
-            self, mock_ServiceManager, mock_CrmMonXmlParser, mock_is_dlm_running, mock_logger_warning,
-    ):
-        mock_ServiceManager.return_value.service_is_active.return_value = True
-        mock_CrmMonXmlParser.return_value.is_non_stonith_resource_running.return_value = True
-        mock_is_dlm_running.return_value = True
-        SBDManager.restart_cluster_if_possible(with_maintenance_mode=True)
-        mock_ServiceManager.return_value.service_is_active.assert_called_once_with(constants.PCMK_SERVICE)
-        mock_logger_warning.assert_called_once_with("Resource is running, need to restart cluster service manually on each node")
-
-    @patch('crmsh.bootstrap.restart_cluster')
-    @patch('logging.Logger.warning')
-    @patch('crmsh.xmlutil.CrmMonXmlParser')
-    @patch('crmsh.sbd.ServiceManager')
-    def test_restart_cluster_if_possible(self, mock_ServiceManager, mock_CrmMonXmlParser, mock_logger_warning, mock_restart_cluster):
-        mock_ServiceManager.return_value.service_is_active.return_value = True
-        mock_CrmMonXmlParser.return_value.is_non_stonith_resource_running.return_value = False
-        SBDManager.restart_cluster_if_possible()
-        mock_ServiceManager.return_value.service_is_active.assert_called_once_with(constants.PCMK_SERVICE)
-        mock_restart_cluster.assert_called_once()
-
     @patch('crmsh.bootstrap.prompt_for_string')
     def test_prompt_for_sbd_device_diskless(self, mock_prompt_for_string):
         mock_prompt_for_string.return_value = "none"
@@ -644,10 +593,10 @@ def test_init_and_deploy_sbd_not_config_sbd(self, mock_ServiceManager):
         sbdmanager_instance._load_attributes_from_bootstrap.assert_not_called()
 
     @patch('crmsh.bootstrap.adjust_properties')
-    @patch('crmsh.sbd.SBDManager.restart_cluster_if_possible')
+    @patch('crmsh.bootstrap.restart_cluster')
     @patch('crmsh.sbd.SBDManager.enable_sbd_service')
     @patch('crmsh.sbd.ServiceManager')
-    def test_init_and_deploy_sbd(self, mock_ServiceManager, mock_enable_sbd_service, mock_restart_cluster_if_possible, mock_adjust_properties):
+    def test_init_and_deploy_sbd(self, mock_ServiceManager, mock_enable_sbd_service, mock_restart_cluster, mock_adjust_properties):
         mock_bootstrap_ctx = Mock(cluster_is_running=True)
         sbdmanager_instance = SBDManager(bootstrap_context=mock_bootstrap_ctx)
         sbdmanager_instance.get_sbd_device_from_bootstrap = Mock()
diff --git a/test/unittests/test_ui_sbd.py b/test/unittests/test_ui_sbd.py
index e8f3c2b5e..0e6ab406b 100644
--- a/test/unittests/test_ui_sbd.py
+++ b/test/unittests/test_ui_sbd.py
@@ -469,14 +469,14 @@ def test_device_remove_last_dev(self):
             self.sbd_instance_diskbased._device_remove(["/dev/sda1"])
         self.assertEqual(str(e.exception), "Not allowed to remove all devices")
 
-    @mock.patch('crmsh.sbd.SBDManager.restart_cluster_if_possible')
+    @mock.patch('crmsh.bootstrap.restart_cluster')
     @mock.patch('crmsh.sbd.SBDManager.update_sbd_configuration')
     @mock.patch('logging.Logger.info')
-    def test_device_remove(self, mock_logger_info, mock_update_sbd_configuration, mock_restart_cluster_if_possible):
+    def test_device_remove(self, mock_logger_info, mock_update_sbd_configuration, mock_restart_cluster):
         self.sbd_instance_diskbased.device_list_from_config = ["/dev/sda1", "/dev/sda2"]
         self.sbd_instance_diskbased._device_remove(["/dev/sda1"])
         mock_update_sbd_configuration.assert_called_once_with({"SBD_DEVICE": "/dev/sda2"})
-        mock_restart_cluster_if_possible.assert_called_once()
+        mock_restart_cluster.assert_called_once()
         mock_logger_info.assert_called_once_with("Remove devices: %s", "/dev/sda1")
 
     def test_do_device_no_service(self):
@@ -571,9 +571,10 @@ def test_do_purge_no_service(self, mock_purge_sbd_from_cluster):
         self.assertFalse(res)
         mock_purge_sbd_from_cluster.assert_not_called()
 
+    @mock.patch('crmsh.bootstrap.restart_cluster')
     @mock.patch('crmsh.utils.check_all_nodes_reachable')
     @mock.patch('crmsh.sbd.purge_sbd_from_cluster')
-    def test_do_purge(self, mock_purge_sbd_from_cluster, mock_check_all_nodes_reachable):
+    def test_do_purge(self, mock_purge_sbd_from_cluster, mock_check_all_nodes_reachable, mock_restart_cluster):
         self.sbd_instance_diskbased._load_attributes = mock.Mock()
         self.sbd_instance_diskbased._service_is_active = mock.Mock(return_value=True)
         res = self.sbd_instance_diskbased.do_purge(mock.Mock())