23 changes: 5 additions & 18 deletions crmsh/sbd.py
@@ -594,22 +594,6 @@ def enable_sbd_service(self):
logger.info("Enable %s on node %s", constants.SBD_SERVICE, node)
service_manager.enable_service(constants.SBD_SERVICE, node)

@staticmethod
def restart_cluster_if_possible(with_maintenance_mode=False):
if not ServiceManager().service_is_active(constants.PCMK_SERVICE):
return
if not xmlutil.CrmMonXmlParser().is_non_stonith_resource_running():
bootstrap.restart_cluster()
elif with_maintenance_mode:
if not utils.is_dlm_running():
bootstrap.restart_cluster()
else:
logger.warning("Resource is running, need to restart cluster service manually on each node")
else:
logger.warning("Resource is running, need to restart cluster service manually on each node")
logger.warning("Or, run with `crm -F` or `--force` option, the `sbd` subcommand will leverage maintenance mode for any changes that require restarting sbd.service")
logger.warning("Understand risks that running RA has no cluster protection while the cluster is in maintenance mode and restarting")

def configure_sbd(self):
'''
Configure fence_sbd resource and related properties
@@ -747,6 +731,9 @@ def init_and_deploy_sbd(self, restart_first=False):
self._load_attributes_from_bootstrap()

with utils.leverage_maintenance_mode() as enabled:
if not utils.able_to_restart_cluster(enabled):
return

self.initialize_sbd()
self.update_configuration()
self.enable_sbd_service()
@@ -761,7 +748,7 @@
restart_cluster_first = restart_first or \
(self.diskless_sbd and not ServiceManager().service_is_active(constants.SBD_SERVICE))
if restart_cluster_first:
SBDManager.restart_cluster_if_possible(with_maintenance_mode=enabled)
bootstrap.restart_cluster()

self.configure_sbd()
bootstrap.adjust_properties(with_sbd=True)
@@ -771,7 +758,7 @@
# This helps prevent unexpected issues, such as nodes being fenced
# due to large SBD_WATCHDOG_TIMEOUT values combined with smaller timeouts.
if not restart_cluster_first:
SBDManager.restart_cluster_if_possible(with_maintenance_mode=enabled)
bootstrap.restart_cluster()

def join_sbd(self, remote_user, peer_host):
'''
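Taken together, the crmsh/sbd.py hunks above replace the removed restart_cluster_if_possible() helper with an explicit safety check followed by a direct restart. A condensed, non-verbatim sketch of the resulting init_and_deploy_sbd flow (names resolve as in crmsh/sbd.py; the exact nesting of the later steps inside the with-block is inferred from the context lines, not shown verbatim in the diff):

def init_and_deploy_sbd(self, restart_first=False):
    # Condensed reading of the hunks above; not the full method body.
    self._load_attributes_from_bootstrap()

    with utils.leverage_maintenance_mode() as enabled:
        # Bail out early if restarting is unsafe; the helper logs guidance.
        if not utils.able_to_restart_cluster(enabled):
            return

        self.initialize_sbd()
        self.update_configuration()
        self.enable_sbd_service()

        restart_cluster_first = restart_first or \
            (self.diskless_sbd and not ServiceManager().service_is_active(constants.SBD_SERVICE))
        if restart_cluster_first:
            bootstrap.restart_cluster()

        self.configure_sbd()
        bootstrap.adjust_properties(with_sbd=True)

        # Restart afterwards if we did not already, so the new SBD timeouts
        # take effect before a node could be fenced with stale values.
        if not restart_cluster_first:
            bootstrap.restart_cluster()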
43 changes: 29 additions & 14 deletions crmsh/ui_sbd.py
@@ -517,8 +517,11 @@ def _device_remove(self, devices_to_remove: typing.List[str]):

logger.info("Remove devices: %s", ';'.join(devices_to_remove))
update_dict = {"SBD_DEVICE": ";".join(left_device_list)}
sbd.SBDManager.update_sbd_configuration(update_dict)
sbd.SBDManager.restart_cluster_if_possible()
with utils.leverage_maintenance_mode() as enabled:
if not utils.able_to_restart_cluster(enabled):
return
sbd.SBDManager.update_sbd_configuration(update_dict)
bootstrap.restart_cluster()

@command.completers_repeating(sbd_device_completer)
def do_device(self, context, *args) -> bool:
@@ -601,22 +604,34 @@ def do_purge(self, context, *args) -> bool:
if not self._service_is_active(constants.SBD_SERVICE):
return False

purge_crashdump = False
if args:
if args[0] == "crashdump":
if not self._is_crashdump_configured():
logger.error("SBD crashdump is not configured")
return False
purge_crashdump = True
else:
logger.error("Invalid argument: %s", ' '.join(args))
logger.info("Usage: crm sbd purge [crashdump]")
return False

utils.check_all_nodes_reachable("purging SBD")

if args and args[0] == "crashdump":
if not self._is_crashdump_configured():
logger.error("SBD crashdump is not configured")
with utils.leverage_maintenance_mode() as enabled:
if not utils.able_to_restart_cluster(enabled):
return False
self._set_crashdump_option(delete=True)
update_dict = self._set_crashdump_in_sysconfig(restore=True)
if update_dict:
sbd.SBDManager.update_sbd_configuration(update_dict)
sbd.SBDManager.restart_cluster_if_possible()
return True

sbd.purge_sbd_from_cluster()
sbd.SBDManager.restart_cluster_if_possible()
return True
if purge_crashdump:
self._set_crashdump_option(delete=True)
update_dict = self._set_crashdump_in_sysconfig(restore=True)
if update_dict:
sbd.SBDManager.update_sbd_configuration(update_dict)
else:
sbd.purge_sbd_from_cluster()

bootstrap.restart_cluster()
return True

def _print_sbd_type(self):
if not self.service_manager.service_is_active(constants.SBD_SERVICE):
27 changes: 27 additions & 0 deletions crmsh/utils.py
@@ -3306,4 +3306,31 @@ def validate_and_get_reachable_nodes(
member_list.remove(node)

return member_list + remote_list


def able_to_restart_cluster(in_maintenance_mode: bool = False) -> bool:
"""
Check whether the cluster can be restarted now:
1. If pacemaker is not running, return True
2. If no non-stonith resource is running, return True
3. If in maintenance mode and DLM is not running, return True
4. Otherwise, return False and log warnings to guide the user
"""
if not ServiceManager().service_is_active(constants.PCMK_SERVICE):
return True
crm_mon_parser = xmlutil.CrmMonXmlParser()
if not crm_mon_parser.is_non_stonith_resource_running():
return True
elif in_maintenance_mode:
if is_dlm_running():
dlm_related_ids = crm_mon_parser.get_resource_top_parent_id_set_via_type(constants.DLM_CONTROLD_RA)
logger.warning("Please stop DLM related resources (%s) and try again", ', '.join(dlm_related_ids))
return False
else:
return True
else:
logger.warning("Please stop all running resources and try again")
logger.warning("Or run this command with -F/--force option to leverage maintenance mode")
logger.warning("Understand risks that running RA has no cluster protection while the cluster is in maintenance mode and restarting")
return False
# vim:ts=4:sw=4:et:
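This PR removes the unit tests for restart_cluster_if_possible() but adds no visible tests for the new helper in this diff. A hedged sketch of what such a test could look like, patterned on the removed tests further below; the patch targets (e.g. crmsh.utils.ServiceManager) are assumptions based on how able_to_restart_cluster() references those names:

from unittest import TestCase
from unittest.mock import patch

from crmsh import constants, utils


class TestAbleToRestartCluster(TestCase):

    @patch('crmsh.utils.ServiceManager')
    def test_pacemaker_not_active(self, mock_ServiceManager):
        # Branch 1: pacemaker is down, so restarting is always allowed.
        mock_ServiceManager.return_value.service_is_active.return_value = False
        self.assertTrue(utils.able_to_restart_cluster())
        mock_ServiceManager.return_value.service_is_active.assert_called_once_with(constants.PCMK_SERVICE)

    @patch('logging.Logger.warning')
    @patch('crmsh.xmlutil.CrmMonXmlParser')
    @patch('crmsh.utils.ServiceManager')
    def test_resources_running_without_maintenance(self, mock_ServiceManager, mock_parser, mock_warning):
        # Branch 4: non-stonith resources are running and maintenance mode is off.
        mock_ServiceManager.return_value.service_is_active.return_value = True
        mock_parser.return_value.is_non_stonith_resource_running.return_value = True
        self.assertFalse(utils.able_to_restart_cluster(in_maintenance_mode=False))
        self.assertEqual(mock_warning.call_count, 3)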
7 changes: 7 additions & 0 deletions crmsh/xmlutil.py
@@ -1627,6 +1627,13 @@ def is_resource_started(self, ra):
xpath = f'//resource[(@id="{ra}" or @resource_agent="{ra}") and @active="true" and @role="Started"]'
return bool(self.xml_elem.xpath(xpath))

def get_resource_top_parent_id_set_via_type(self, ra_type):
"""
Given a configured RA type, return the set of topmost parent RA ids
"""
xpath = f'//resource[@resource_agent="{ra_type}"]'
return {get_topmost_rsc(elem).get('id') for elem in self.xml_elem.xpath(xpath)}

def get_resource_id_list_via_type(self, ra_type):
"""
Given configured ra type, get the ra id list
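The parser helper above is what able_to_restart_cluster() uses to name DLM-related resources in its warning. A minimal usage sketch, assuming get_topmost_rsc() resolves a controld primitive to its outermost clone or group and that constants.DLM_CONTROLD_RA holds the controld agent name:

# Sketch only: list the top-level ids of DLM-related resources.
parser = xmlutil.CrmMonXmlParser()
dlm_ids = parser.get_resource_top_parent_id_set_via_type(constants.DLM_CONTROLD_RA)
if dlm_ids:
    # e.g. {'dlm-clone'} when the controld primitive is wrapped in a clone
    logger.warning("Please stop DLM related resources (%s) and try again", ', '.join(dlm_ids))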
55 changes: 2 additions & 53 deletions test/unittests/test_sbd.py
@@ -406,57 +406,6 @@ def test_enable_sbd_service(self, mock_list_cluster_nodes, mock_ServiceManager,
call("Enable %s on node %s", constants.SBD_SERVICE, 'node2')
])

@patch('crmsh.xmlutil.CrmMonXmlParser')
@patch('crmsh.sbd.ServiceManager')
def test_restart_cluster_if_possible_return(self, mock_ServiceManager, mock_CrmMonXmlParser):
mock_ServiceManager.return_value.service_is_active.return_value = False
SBDManager.restart_cluster_if_possible()
mock_ServiceManager.return_value.service_is_active.assert_called_once_with(constants.PCMK_SERVICE)
mock_CrmMonXmlParser.assert_not_called()

@patch('logging.Logger.warning')
@patch('crmsh.utils.is_dlm_running')
@patch('crmsh.xmlutil.CrmMonXmlParser')
@patch('crmsh.sbd.ServiceManager')
def test_restart_cluster_if_possible_manually(
self, mock_ServiceManager, mock_CrmMonXmlParser, mock_is_dlm_running, mock_logger_warning,
):
mock_ServiceManager.return_value.service_is_active.return_value = True
mock_CrmMonXmlParser.return_value.is_non_stonith_resource_running.return_value = True
mock_is_dlm_running.return_value = False
SBDManager.restart_cluster_if_possible()
mock_ServiceManager.return_value.service_is_active.assert_called_once_with(constants.PCMK_SERVICE)
mock_logger_warning.assert_has_calls([
call("Resource is running, need to restart cluster service manually on each node"),
call("Or, run with `crm -F` or `--force` option, the `sbd` subcommand will leverage maintenance mode for any changes that require restarting sbd.service"),
call("Understand risks that running RA has no cluster protection while the cluster is in maintenance mode and restarting")
])

@patch('logging.Logger.warning')
@patch('crmsh.utils.is_dlm_running')
@patch('crmsh.xmlutil.CrmMonXmlParser')
@patch('crmsh.sbd.ServiceManager')
def test_restart_cluster_if_possible_dlm_running(
self, mock_ServiceManager, mock_CrmMonXmlParser, mock_is_dlm_running, mock_logger_warning,
):
mock_ServiceManager.return_value.service_is_active.return_value = True
mock_CrmMonXmlParser.return_value.is_non_stonith_resource_running.return_value = True
mock_is_dlm_running.return_value = True
SBDManager.restart_cluster_if_possible(with_maintenance_mode=True)
mock_ServiceManager.return_value.service_is_active.assert_called_once_with(constants.PCMK_SERVICE)
mock_logger_warning.assert_called_once_with("Resource is running, need to restart cluster service manually on each node")

@patch('crmsh.bootstrap.restart_cluster')
@patch('logging.Logger.warning')
@patch('crmsh.xmlutil.CrmMonXmlParser')
@patch('crmsh.sbd.ServiceManager')
def test_restart_cluster_if_possible(self, mock_ServiceManager, mock_CrmMonXmlParser, mock_logger_warning, mock_restart_cluster):
mock_ServiceManager.return_value.service_is_active.return_value = True
mock_CrmMonXmlParser.return_value.is_non_stonith_resource_running.return_value = False
SBDManager.restart_cluster_if_possible()
mock_ServiceManager.return_value.service_is_active.assert_called_once_with(constants.PCMK_SERVICE)
mock_restart_cluster.assert_called_once()

@patch('crmsh.bootstrap.prompt_for_string')
def test_prompt_for_sbd_device_diskless(self, mock_prompt_for_string):
mock_prompt_for_string.return_value = "none"
@@ -644,10 +593,10 @@ def test_init_and_deploy_sbd_not_config_sbd(self, mock_ServiceManager):
sbdmanager_instance._load_attributes_from_bootstrap.assert_not_called()

@patch('crmsh.bootstrap.adjust_properties')
@patch('crmsh.sbd.SBDManager.restart_cluster_if_possible')
@patch('crmsh.bootstrap.restart_cluster')
@patch('crmsh.sbd.SBDManager.enable_sbd_service')
@patch('crmsh.sbd.ServiceManager')
def test_init_and_deploy_sbd(self, mock_ServiceManager, mock_enable_sbd_service, mock_restart_cluster_if_possible, mock_adjust_properties):
def test_init_and_deploy_sbd(self, mock_ServiceManager, mock_enable_sbd_service, mock_restart_cluster, mock_adjust_properties):
mock_bootstrap_ctx = Mock(cluster_is_running=True)
sbdmanager_instance = SBDManager(bootstrap_context=mock_bootstrap_ctx)
sbdmanager_instance.get_sbd_device_from_bootstrap = Mock()
9 changes: 5 additions & 4 deletions test/unittests/test_ui_sbd.py
@@ -469,14 +469,14 @@ def test_device_remove_last_dev(self):
self.sbd_instance_diskbased._device_remove(["/dev/sda1"])
self.assertEqual(str(e.exception), "Not allowed to remove all devices")

@mock.patch('crmsh.sbd.SBDManager.restart_cluster_if_possible')
@mock.patch('crmsh.bootstrap.restart_cluster')
@mock.patch('crmsh.sbd.SBDManager.update_sbd_configuration')
@mock.patch('logging.Logger.info')
def test_device_remove(self, mock_logger_info, mock_update_sbd_configuration, mock_restart_cluster_if_possible):
def test_device_remove(self, mock_logger_info, mock_update_sbd_configuration, mock_restart_cluster):
self.sbd_instance_diskbased.device_list_from_config = ["/dev/sda1", "/dev/sda2"]
self.sbd_instance_diskbased._device_remove(["/dev/sda1"])
mock_update_sbd_configuration.assert_called_once_with({"SBD_DEVICE": "/dev/sda2"})
mock_restart_cluster_if_possible.assert_called_once()
mock_restart_cluster.assert_called_once()
mock_logger_info.assert_called_once_with("Remove devices: %s", "/dev/sda1")

def test_do_device_no_service(self):
@@ -571,9 +571,10 @@ def test_do_purge_no_service(self, mock_purge_sbd_from_cluster):
self.assertFalse(res)
mock_purge_sbd_from_cluster.assert_not_called()

@mock.patch('crmsh.bootstrap.restart_cluster')
@mock.patch('crmsh.utils.check_all_nodes_reachable')
@mock.patch('crmsh.sbd.purge_sbd_from_cluster')
def test_do_purge(self, mock_purge_sbd_from_cluster, mock_check_all_nodes_reachable):
def test_do_purge(self, mock_purge_sbd_from_cluster, mock_check_all_nodes_reachable, mock_restart_cluster):
self.sbd_instance_diskbased._load_attributes = mock.Mock()
self.sbd_instance_diskbased._service_is_active = mock.Mock(return_value=True)
res = self.sbd_instance_diskbased.do_purge(mock.Mock())