
Commit 6fe9092

Dev: sbd: Improve leverage maintenance mode

1 parent 2b481a7

4 files changed: +57, -14 lines

crmsh/sbd.py

Lines changed: 5 additions & 2 deletions
@@ -710,6 +710,9 @@ def init_and_deploy_sbd(self):
         self._load_attributes_from_bootstrap()
 
         with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
+                return
+
             self.initialize_sbd()
             self.update_configuration()
             self.enable_sbd_service()
@@ -723,7 +726,7 @@ def init_and_deploy_sbd(self):
             # because the stonith-watchdog-timeout property requires sbd.service to be active.
             restart_cluster_first = self.diskless_sbd and not ServiceManager().service_is_active(constants.SBD_SERVICE)
             if restart_cluster_first:
-                SBDManager.restart_cluster_if_possible(with_maintenance_mode=enabled)
+                bootstrap.restart_cluster()
 
             self.configure_sbd()
             bootstrap.adjust_properties(with_sbd=True)
@@ -733,7 +736,7 @@ def init_and_deploy_sbd(self):
             # This helps prevent unexpected issues, such as nodes being fenced
             # due to large SBD_WATCHDOG_TIMEOUT values combined with smaller timeouts.
             if not restart_cluster_first:
-                SBDManager.restart_cluster_if_possible(with_maintenance_mode=enabled)
+                bootstrap.restart_cluster()
 
     def join_sbd(self, remote_user, peer_host):
        '''
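
Read together, the three hunks move the safety decision to a single choke point: instead of each restart site deciding for itself via SBDManager.restart_cluster_if_possible(with_maintenance_mode=enabled), able_to_restart_cluster(enabled) now runs once as a precondition, and both restart sites call a plain bootstrap.restart_cluster(). A condensed sketch of the resulting flow (intermediate steps elided; the names are those in the hunks above):

def init_and_deploy_sbd(self):
    self._load_attributes_from_bootstrap()

    with utils.leverage_maintenance_mode() as enabled:
        # The safety decision now happens once, up front.
        if not utils.able_to_restart_cluster(enabled):
            return

        self.initialize_sbd()
        self.update_configuration()
        self.enable_sbd_service()

        # Diskless SBD must restart before configure_sbd(), since the
        # stonith-watchdog-timeout property needs sbd.service active.
        restart_cluster_first = (self.diskless_sbd and
            not ServiceManager().service_is_active(constants.SBD_SERVICE))
        if restart_cluster_first:
            bootstrap.restart_cluster()

        self.configure_sbd()
        bootstrap.adjust_properties(with_sbd=True)
        # ... remaining steps elided ...
        if not restart_cluster_first:
            bootstrap.restart_cluster()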

crmsh/ui_sbd.py

Lines changed: 18 additions & 12 deletions
@@ -492,8 +492,11 @@ def _device_remove(self, devices_to_remove: typing.List[str]):
 
         logger.info("Remove devices: %s", ';'.join(devices_to_remove))
         update_dict = {"SBD_DEVICE": ";".join(left_device_list)}
-        sbd.SBDManager.update_sbd_configuration(update_dict)
-        sbd.SBDManager.restart_cluster_if_possible()
+        with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
+                return
+            sbd.SBDManager.update_sbd_configuration(update_dict)
+            bootstrap.restart_cluster()
 
     @command.completers_repeating(sbd_device_completer)
     def do_device(self, context, *args) -> bool:
@@ -578,17 +581,20 @@ def do_purge(self, context, *args) -> bool:
 
         utils.check_all_nodes_reachable("purging SBD")
 
-        if args and args[0] == "crashdump":
-            self._set_crashdump_option(delete=True)
-            update_dict = self._set_crashdump_in_sysconfig(restore=True)
-            if update_dict:
-                sbd.SBDManager.update_sbd_configuration(update_dict)
-                sbd.SBDManager.restart_cluster_if_possible()
-            return True
+        with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
+                return False
 
-        sbd.purge_sbd_from_cluster()
-        sbd.SBDManager.restart_cluster_if_possible()
-        return True
+            if args and args[0] == "crashdump":
+                self._set_crashdump_option(delete=True)
+                update_dict = self._set_crashdump_in_sysconfig(restore=True)
+                if update_dict:
+                    sbd.SBDManager.update_sbd_configuration(update_dict)
+            else:
+                sbd.purge_sbd_from_cluster()
+
+            bootstrap.restart_cluster()
+            return True
 
     def _print_sbd_type(self):
         if not self.service_manager.service_is_active(constants.SBD_SERVICE):
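
Both rewrites lean on the same Python guarantee: returning from inside a with block still runs the context manager's __exit__, so whatever cleanup leverage_maintenance_mode() performs on exit (presumably reverting maintenance mode) also happens on the early-return path. A toy stand-in illustrating that guarantee, with the real context manager replaced by prints:

from contextlib import contextmanager

@contextmanager
def toy_maintenance_mode():
    # Stand-in for utils.leverage_maintenance_mode(): yields whether
    # maintenance mode was enabled, and always cleans up on exit.
    print("maintenance mode on")
    try:
        yield True
    finally:
        print("maintenance mode off")  # runs even on early return

def guarded_action(ok_to_restart):
    with toy_maintenance_mode() as enabled:
        if not ok_to_restart:
            return  # __exit__ (the finally block) still fires here
        print("restarting cluster")

guarded_action(False)
# Output:
# maintenance mode on
# maintenance mode off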

crmsh/utils.py

Lines changed: 27 additions & 0 deletions
@@ -3306,4 +3306,31 @@ def validate_and_get_reachable_nodes(
             member_list.remove(node)
 
     return member_list + remote_list
+
+
+def able_to_restart_cluster(in_maintenance_mode: bool) -> bool:
+    """
+    Check whether it is able to restart cluster now
+    1. If pacemaker is not running, return True
+    2. If no non-stonith resource is running, return True
+    3. If in maintenance mode and DLM is not running, return True
+    4. Otherwise, return False with warning messages to guide user
+    """
+    if not ServiceManager().service_is_active(constants.PCMK_SERVICE):
+        return True
+    crm_mon_parser = xmlutil.CrmMonXmlParser()
+    if not crm_mon_parser.is_non_stonith_resource_running():
+        return True
+    elif in_maintenance_mode:
+        if is_dlm_running():
+            dlm_related_ids = crm_mon_parser.get_resource_top_parent_id_set_via_type(constants.DLM_CONTROLD_RA)
+            logger.warning("Please stop DLM related resources (%s) and try again", ', '.join(dlm_related_ids))
+            return False
+        else:
+            return True
+    else:
+        logger.warning("Please stop all running resources and try again")
+        logger.warning("Or run this command with -F/--force option to leverage maintenance mode")
+        logger.warning("Understand risks that running RA has no cluster protection while the cluster is in maintenance mode and restarting")
+        return False
 # vim:ts=4:sw=4:et:
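
The docstring's four rules form a small decision table. A sketch of how they could be exercised with mocks; this assumes crmsh is importable in the test environment, and the mock targets mirror the names the hunk uses inside crmsh.utils:

import unittest
from unittest import mock

from crmsh import utils


class TestAbleToRestartCluster(unittest.TestCase):

    @mock.patch('crmsh.utils.ServiceManager')
    def test_pacemaker_down(self, mock_sm):
        # Rule 1: pacemaker not running -> restart is trivially safe.
        mock_sm.return_value.service_is_active.return_value = False
        self.assertTrue(utils.able_to_restart_cluster(False))

    @mock.patch('crmsh.utils.xmlutil.CrmMonXmlParser')
    @mock.patch('crmsh.utils.ServiceManager')
    def test_only_stonith_running(self, mock_sm, mock_parser):
        # Rule 2: no non-stonith resource running -> safe.
        mock_sm.return_value.service_is_active.return_value = True
        mock_parser.return_value.is_non_stonith_resource_running.return_value = False
        self.assertTrue(utils.able_to_restart_cluster(False))

    @mock.patch('crmsh.utils.is_dlm_running')
    @mock.patch('crmsh.utils.xmlutil.CrmMonXmlParser')
    @mock.patch('crmsh.utils.ServiceManager')
    def test_maintenance_mode_no_dlm(self, mock_sm, mock_parser, mock_dlm):
        # Rule 3: in maintenance mode with no DLM running -> safe.
        mock_sm.return_value.service_is_active.return_value = True
        mock_parser.return_value.is_non_stonith_resource_running.return_value = True
        mock_dlm.return_value = False
        self.assertTrue(utils.able_to_restart_cluster(True))

    @mock.patch('crmsh.utils.xmlutil.CrmMonXmlParser')
    @mock.patch('crmsh.utils.ServiceManager')
    def test_resources_running_no_maintenance(self, mock_sm, mock_parser):
        # Rule 4: running resources, no maintenance mode -> refused.
        mock_sm.return_value.service_is_active.return_value = True
        mock_parser.return_value.is_non_stonith_resource_running.return_value = True
        self.assertFalse(utils.able_to_restart_cluster(False))


if __name__ == '__main__':
    unittest.main()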

crmsh/xmlutil.py

Lines changed: 7 additions & 0 deletions
@@ -1609,6 +1609,13 @@ def is_resource_started(self, ra):
         xpath = f'//resource[(@id="{ra}" or @resource_agent="{ra}") and @active="true" and @role="Started"]'
         return bool(self.xml_elem.xpath(xpath))
 
+    def get_resource_top_parent_id_set_via_type(self, ra_type):
+        """
+        Given configured ra type, get the topmost parent ra id set
+        """
+        xpath = f'//resource[@resource_agent="{ra_type}"]'
+        return set([get_topmost_rsc(elem).get('id') for elem in self.xml_elem.xpath(xpath)])
+
     def get_resource_id_list_via_type(self, ra_type):
         """
         Given configured ra type, get the ra id list
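
get_topmost_rsc() is a pre-existing xmlutil helper, so the new method maps every matching <resource> element to the id of its outermost wrapper; for DLM this collapses the per-node controld clone instances into the single clone id that able_to_restart_cluster() asks the user to stop. A standalone illustration, with a simplified crm_mon snippet and a hand-rolled walk-up standing in for the real helper:

from lxml import etree

crm_mon_xml = """
<resources>
  <clone id="cl-dlm" multi_state="false">
    <resource id="dlm" resource_agent="ocf:pacemaker:controld" active="true"/>
    <resource id="dlm" resource_agent="ocf:pacemaker:controld" active="true"/>
  </clone>
  <resource id="stonith-sbd" resource_agent="stonith:external/sbd" active="true"/>
</resources>
"""

def topmost(elem):
    # Walk up until the parent is the <resources> container.
    while elem.getparent() is not None and elem.getparent().tag != 'resources':
        elem = elem.getparent()
    return elem

root = etree.fromstring(crm_mon_xml)
ids = {topmost(e).get('id')
       for e in root.xpath('//resource[@resource_agent="ocf:pacemaker:controld"]')}
print(ids)  # {'cl-dlm'} -- both controld instances collapse to the clone id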
