diff --git a/crmsh/bootstrap.py b/crmsh/bootstrap.py index 4aba415dc..e074f5da9 100644 --- a/crmsh/bootstrap.py +++ b/crmsh/bootstrap.py @@ -2773,7 +2773,7 @@ def adjust_stonith_timeout(with_sbd: bool = False): Adjust stonith-timeout for sbd and other scenarios """ if ServiceManager().service_is_active(constants.SBD_SERVICE) or with_sbd: - sbd.SBDTimeout.adjust_sbd_timeout_related_cluster_configuration() + sbd.SBDTimeoutChecker(fix=True, warn=False, from_bootstrap=True).check_and_fix() else: value = get_stonith_timeout_generally_expected() if value: diff --git a/crmsh/sbd.py b/crmsh/sbd.py index 1ef2e950e..a61c9a785 100644 --- a/crmsh/sbd.py +++ b/crmsh/sbd.py @@ -26,18 +26,21 @@ def get_sbd_device_metadata(dev, timeout_only=False, remote=None) -> dict: Extract metadata from sbd device header ''' sbd_info = {} - try: - out = sh.cluster_shell().get_stdout_or_raise_error(f"sbd -d {dev} dump", remote) - except Exception: - return sbd_info - pattern = r"UUID\s+:\s+(\S+)|Timeout\s+\((\w+)\)\s+:\s+(\d+)" + + out = sh.cluster_shell().get_stdout_or_raise_error(f"sbd -d {dev} dump", remote) matches = re.findall(pattern, out) for uuid, timeout_type, timeout_value in matches: if uuid and not timeout_only: sbd_info["uuid"] = uuid elif timeout_type and timeout_value: sbd_info[timeout_type] = int(timeout_value) + + if "msgwait" not in sbd_info: + raise ValueError(f"Cannot find msgwait timeout in sbd device {dev}") + if "watchdog" not in sbd_info: + raise ValueError(f"Cannot find watchdog timeout in sbd device {dev}") + return sbd_info @staticmethod @@ -141,14 +144,8 @@ def check_devices_metadata_consistent(dev_list) -> bool: if len(dev_list) < 2: return consistent first_dev_metadata = SBDUtils.get_sbd_device_metadata(dev_list[0], timeout_only=True) - if not first_dev_metadata: - logger.warning(f"Cannot get metadata for {dev_list[0]}") - return False for dev in dev_list[1:]: this_dev_metadata = SBDUtils.get_sbd_device_metadata(dev, timeout_only=True) - if not this_dev_metadata: - logger.warning(f"Cannot get metadata for {dev}") - return False if this_dev_metadata != first_dev_metadata: logger.warning(f"Device {dev} doesn't have the same metadata as {dev_list[0]}") consistent = False @@ -194,13 +191,18 @@ def __init__(self, context=None): Init function ''' self.context = context + self.disk_based = None self.sbd_msgwait = None self.stonith_timeout = None self.sbd_watchdog_timeout = self.SBD_WATCHDOG_TIMEOUT_DEFAULT self.stonith_watchdog_timeout = None self.two_node_without_qdevice = False + self.qdevice_sync_timeout = None + self.crashdump_watchdog_timeout = None + if self.context: + self._initialize_timeout_in_bootstrap() - def initialize_timeout(self): + def _initialize_timeout_in_bootstrap(self): self._set_sbd_watchdog_timeout() if self.context.diskless_sbd: self._adjust_sbd_watchdog_timeout_with_diskless_and_qdevice() @@ -232,21 +234,6 @@ def _set_sbd_msgwait(self): sbd_msgwait = sbd_msgwait_default self.sbd_msgwait = sbd_msgwait - @classmethod - def get_advised_sbd_timeout(cls, diskless=False) -> typing.Tuple[int, int]: - ''' - Get suitable sbd_watchdog_timeout and sbd_msgwait - ''' - ctx = bootstrap.Context() - ctx.diskless_sbd = diskless - ctx.load_profiles() - time_inst = cls(ctx) - time_inst.initialize_timeout() - - sbd_watchdog_timeout = time_inst.sbd_watchdog_timeout - sbd_msgwait = None if diskless else time_inst.sbd_msgwait - return sbd_watchdog_timeout, sbd_msgwait - def _adjust_sbd_watchdog_timeout_with_diskless_and_qdevice(self): ''' When using diskless SBD with Qdevice, adjust value of sbd_watchdog_timeout @@ -264,16 +251,6 @@ def _adjust_sbd_watchdog_timeout_with_diskless_and_qdevice(self): logger.warning("sbd_watchdog_timeout is set to {} for qdevice, it was {}".format(self.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE, self.sbd_watchdog_timeout)) self.sbd_watchdog_timeout = self.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE - @staticmethod - def get_sbd_msgwait(dev): - ''' - Get msgwait for sbd device - ''' - res = SBDUtils.get_sbd_device_metadata(dev).get("msgwait") - if not res: - raise ValueError(f"Cannot get sbd msgwait for {dev}") - return res - @staticmethod def get_sbd_watchdog_timeout(): ''' @@ -284,38 +261,47 @@ def get_sbd_watchdog_timeout(): raise ValueError("Cannot get the value of SBD_WATCHDOG_TIMEOUT") return int(res) - @staticmethod - def get_stonith_watchdog_timeout_expected(): - ''' - Returns the value of the stonith-watchdog-timeout cluster property. - Default is 2 * SBD_WATCHDOG_TIMEOUT. - ''' - default = 2 * SBDTimeout.get_sbd_watchdog_timeout() - if not ServiceManager().service_is_active(constants.PCMK_SERVICE): - return default - value = utils.get_property("stonith-watchdog-timeout", get_default=False) - return int(value) if value else default + def get_stonith_watchdog_timeout_expected(self): + if self.crashdump_watchdog_timeout: + return SBDTimeout.get_sbd_watchdog_timeout() + self.crashdump_watchdog_timeout + else: + return 2 * SBDTimeout.get_sbd_watchdog_timeout() def _load_configurations(self): ''' Load necessary configurations for both disk-based/disk-less sbd ''' self.two_node_without_qdevice = utils.is_2node_cluster_without_qdevice() - + self.crashdump_watchdog_timeout = SBDUtils.get_crashdump_watchdog_timeout() dev_list = SBDUtils.get_sbd_device_from_config() if dev_list: # disk-based self.disk_based = True - self.msgwait = SBDTimeout.get_sbd_msgwait(dev_list[0]) + first_dev = dev_list[0] + device_metadata = SBDUtils.get_sbd_device_metadata(first_dev) + self.sbd_msgwait = device_metadata.get("msgwait") + self.sbd_watchdog_timeout = device_metadata.get("watchdog") self.pcmk_delay_max = utils.get_pcmk_delay_max(self.two_node_without_qdevice) else: # disk-less self.disk_based = False self.sbd_watchdog_timeout = SBDTimeout.get_sbd_watchdog_timeout() self.stonith_watchdog_timeout = SBDTimeout.get_stonith_watchdog_timeout_expected() + if corosync.is_qdevice_configured() and ServiceManager().service_is_active("corosync-qdevice.service"): + self.qdevice_sync_timeout = utils.get_qdevice_sync_timeout() self.sbd_delay_start_value_expected = self.get_sbd_delay_start_expected() if utils.detect_virt() else "no" self.sbd_delay_start_value_from_config = SBDUtils.get_sbd_value_from_config("SBD_DELAY_START") + if not self.sbd_delay_start_value_from_config: + logger.error("No SBD_DELAY_START entry found in %s", SBDManager.SYSCONFIG_SBD) + raise utils.TerminateSubCommand + self.sbd_systemd_start_timeout_expected = self.get_sbd_systemd_start_timeout_expected() logger.debug("Inspect SBDTimeout: %s", vars(self)) + def get_sbd_msgwait_expected(self): + if self.crashdump_watchdog_timeout: + return 2 * self.sbd_watchdog_timeout + self.crashdump_watchdog_timeout + else: + return 2 * self.sbd_watchdog_timeout + def get_stonith_timeout_expected(self): ''' Get stonith-timeout value for sbd cases, formulas are: @@ -326,7 +312,7 @@ def get_stonith_timeout_expected(self): stonith_timeout = max(value_from_sbd, constants.STONITH_TIMEOUT_DEFAULT) + token + consensus ''' if self.disk_based: - value_from_sbd = int(1.2*self.msgwait) + value_from_sbd = int(1.2*self.sbd_msgwait) else: value_from_sbd = int(1.2*max(self.stonith_watchdog_timeout, 2*self.sbd_watchdog_timeout)) @@ -349,11 +335,19 @@ def get_sbd_delay_start_expected(self): ''' token_and_consensus_timeout = corosync.token_and_consensus_timeout() if self.disk_based: - value = token_and_consensus_timeout + self.pcmk_delay_max + self.msgwait + value = token_and_consensus_timeout + self.pcmk_delay_max + self.sbd_msgwait else: value = token_and_consensus_timeout + 2*self.sbd_watchdog_timeout return value + def get_sbd_systemd_start_timeout_expected(self): + default_value = SBDTimeout.get_default_systemd_start_timeout() + if self.sbd_delay_start_value_from_config.isdigit(): + calculated_value = int(1.2*int(self.sbd_delay_start_value_from_config)) + return max(calculated_value, default_value) + else: + return default_value + @staticmethod def get_sbd_delay_start_sec_from_sysconfig(): ''' @@ -383,74 +377,6 @@ def get_default_systemd_start_timeout() -> int: out = sh.cluster_shell().get_stdout_or_raise_error(SBDTimeout.SHOW_DEFAULT_START_TIMEOUT_CMD) return utils.get_systemd_timeout_start_in_sec(out) - def adjust_systemd_start_timeout(self): - ''' - Adjust start timeout for sbd when set SBD_DELAY_START - ''' - sbd_delay_start_value = SBDUtils.get_sbd_value_from_config("SBD_DELAY_START") - if sbd_delay_start_value == "no": - SBDTimeout.restore_systemd_start_timeout() - return - expected_start_timeout = int(1.2*int(sbd_delay_start_value)) - actual_start_timeout = SBDTimeout.get_sbd_systemd_start_timeout() - default_start_timeout = SBDTimeout.get_default_systemd_start_timeout() - if expected_start_timeout == actual_start_timeout: - return - elif expected_start_timeout > default_start_timeout: - logger.info("Adjust systemd start timeout for sbd.service to %ds, it was %ds", - expected_start_timeout, actual_start_timeout) - SBDTimeout.set_systemd_start_timeout(expected_start_timeout) - else: - if os.path.isdir(SBDManager.SBD_SYSTEMD_DELAY_START_DIR): - logger.info("Restore systemd start timeout for sbd.service to default %ds, it was %ds", - default_start_timeout, actual_start_timeout) - SBDTimeout.restore_systemd_start_timeout() - - @staticmethod - def set_systemd_start_timeout(value): - utils.mkdirp(SBDManager.SBD_SYSTEMD_DELAY_START_DIR) - sbd_delay_start_file = os.path.join(SBDManager.SBD_SYSTEMD_DELAY_START_DIR, "sbd_delay_start.conf") - utils.str2file(f"[Service]\nTimeoutStartSec={value}", sbd_delay_start_file) - bootstrap.sync_path(SBDManager.SBD_SYSTEMD_DELAY_START_DIR) - utils.cluster_run_cmd("systemctl daemon-reload") - - @staticmethod - def restore_systemd_start_timeout(): - test_dir_cmd = f"test -d {SBDManager.SBD_SYSTEMD_DELAY_START_DIR}" - rm_dir_cmd = f"rm -rf {SBDManager.SBD_SYSTEMD_DELAY_START_DIR}" - reload_cmd = "systemctl daemon-reload" - utils.cluster_run_cmd(f"{test_dir_cmd} && {rm_dir_cmd} && {reload_cmd} || exit 0") - - def adjust_stonith_timeout(self): - ''' - Adjust stonith-timeout property - ''' - utils.set_property("stonith-timeout", self.get_stonith_timeout_expected()) - - def adjust_sbd_delay_start(self): - ''' - Adjust SBD_DELAY_START in /etc/sysconfig/sbd - ''' - expected_value = str(self.sbd_delay_start_value_expected) - config_value = self.sbd_delay_start_value_from_config - if expected_value == config_value: - return - if expected_value == "no" \ - or (not re.search(r'\d+', config_value)) \ - or (int(expected_value) != int(config_value)): - SBDManager.update_sbd_configuration({"SBD_DELAY_START": expected_value}) - - @classmethod - def adjust_sbd_timeout_related_cluster_configuration(cls): - ''' - Adjust sbd timeout related configurations - ''' - cls_inst = cls() - cls_inst._load_configurations() - cls_inst.adjust_sbd_delay_start() - cls_inst.adjust_stonith_timeout() - cls_inst.adjust_systemd_start_timeout() - @staticmethod def able_to_set_stonith_watchdog_timeout(value: int) -> bool: ''' @@ -467,6 +393,187 @@ def able_to_set_stonith_watchdog_timeout(value: int) -> bool: return True +class FixFailure(Exception): + pass + + +class SBDTimeoutChecker(SBDTimeout): + + def __init__(self, warn=True, fix=False, check_category: str = "", from_bootstrap=False): + super().__init__() + self.warn = warn + self.fix = fix + self.check_category = check_category + self.from_bootstrap = from_bootstrap + + def check_and_fix(self) -> bool: + checks_and_fixes = [ + # issue name, check category, check method, fix method + ("SBD disk metadata", "disk_metadata", + self._check_sbd_disk_metadata, self._fix_sbd_disk_metadata), + ("SBD_WATCHDOG_TIMEOUT", "sysconfig", + self._check_sbd_watchdog_timeout, self._fix_sbd_watchdog_timeout), + ("SBD_DELAY_START", "sysconfig", + self._check_sbd_delay_start, self._fix_sbd_delay_start), + ("systemd start timeout for sbd.service", "property", + self._check_sbd_systemd_start_timeout, self._fix_sbd_systemd_start_timeout), + ("stonith-watchdog-timeout property", "property", + self._check_stonith_watchdog_timeout, self._fix_stonith_watchdog_timeout), + ("stonith-timeout property", "property", + self._check_stonith_timeout, self._fix_stonith_timeout) + ] + + if not self.from_bootstrap and not ServiceManager().service_is_active(constants.SBD_SERVICE): + if self.warn: + logger.warning("%s is not active, skip SBD timeout checks", constants.SBD_SERVICE) + raise utils.TerminateSubCommand + + if not self._check_config_consistency(): + raise utils.TerminateSubCommand + + self._load_configurations() + + for name, category, check_method, fix_method in checks_and_fixes: + if (self.check_category and self.check_category != category) or check_method(): + continue + elif not self.fix: + return False + else: + fix_method() + self._load_configurations() + if not check_method(): + raise FixFailure(f"Failed to fix {name} issue") + return True + + def _check_config_consistency(self) -> bool: + consistent = True + # Don't check consistency during bootstrap process + if self.from_bootstrap: + return consistent + + this_node = utils.this_node() + other_nodes = utils.list_cluster_nodes_except_me() + + diff_output = utils.remote_diff_this(corosync.conf(), other_nodes, this_node) + if diff_output: + logger.warning("corosync.conf is not consistent across cluster nodes") + consistent = False + + diff_output = utils.remote_diff_this(SBDManager.SYSCONFIG_SBD, other_nodes, this_node) + if diff_output: + logger.warning("%s is not consistent across cluster nodes", SBDManager.SYSCONFIG_SBD) + consistent = False + + configured_devices = SBDUtils.get_sbd_device_from_config() + if not SBDUtils.check_devices_metadata_consistent(configured_devices): + logger.warning("SBD device metadata is not consistent across cluster nodes") + consistent = False + + if not consistent: + logger.warning("Please ensure the configurations are consistent across all cluster nodes") + return consistent + + def _check_sbd_disk_metadata(self) -> bool: + if self.disk_based: + expected_msgwait = self.get_sbd_msgwait_expected() + if self.sbd_msgwait < expected_msgwait: + if self.warn: + logger.warning("It's recommended that SBD msgwait(now %d) >= %d", self.sbd_msgwait, expected_msgwait) + return False + return True + + def _fix_sbd_disk_metadata(self) -> None: + expected_msgwait = self.get_sbd_msgwait_expected() + logger.info("Adjusting sbd msgwait to %d", expected_msgwait) + cmd = f"crm sbd configure msgwait-timeout={expected_msgwait} watchdog-timeout={self.sbd_watchdog_timeout}" + output = sh.cluster_shell().get_stdout_or_raise_error(cmd) + if output: + print(output) + + def _check_sbd_watchdog_timeout(self) -> bool: + if self.disk_based or not self.qdevice_sync_timeout: + return True + if self.sbd_watchdog_timeout < self.qdevice_sync_timeout: + if self.warn: + logger.warning("It's recommended that SBD_WATCHDOG_TIMEOUT(now %d) >= qdevice sync timeout(now %d)", + self.sbd_watchdog_timeout, self.qdevice_sync_timeout) + return False + return True + + def _fix_sbd_watchdog_timeout(self): + advised_watchdog_timeout = self.qdevice_sync_timeout + SBDTimeout.QDEVICE_SYNC_TIMEOUT_MARGIN + SBDManager.update_sbd_configuration({"SBD_WATCHDOG_TIMEOUT": str(advised_watchdog_timeout)}) + + def _check_sbd_delay_start(self) -> bool: + expected_value = str(self.sbd_delay_start_value_expected) + config_value = self.sbd_delay_start_value_from_config + if expected_value != config_value: + if self.warn: + logger.warning("It's recommended that SBD_DELAY_START is set to %s, now is %s", + expected_value, config_value) + return False + return True + + def _fix_sbd_delay_start(self): + advised_value = str(self.sbd_delay_start_value_expected) + SBDManager.update_sbd_configuration({"SBD_DELAY_START": advised_value}) + + def _check_sbd_systemd_start_timeout(self) -> bool: + actual_start_timeout = SBDTimeout.get_sbd_systemd_start_timeout() + if actual_start_timeout != self.sbd_systemd_start_timeout_expected: + if self.warn: + logger.warning("It's recommended that systemd start timeout for sbd.service is set to %ds, now is %ds", + self.sbd_systemd_start_timeout_expected, actual_start_timeout) + return False + return True + + def _fix_sbd_systemd_start_timeout(self): + logger.info("Adjusting systemd start timeout for sbd.service to %ds", self.sbd_systemd_start_timeout_expected) + utils.mkdirp(SBDManager.SBD_SYSTEMD_DELAY_START_DIR) + sbd_delay_start_file = os.path.join(SBDManager.SBD_SYSTEMD_DELAY_START_DIR, "sbd_delay_start.conf") + utils.str2file(f"[Service]\nTimeoutStartSec={self.sbd_systemd_start_timeout_expected}", sbd_delay_start_file) + bootstrap.sync_path(SBDManager.SBD_SYSTEMD_DELAY_START_DIR) + utils.cluster_run_cmd("systemctl daemon-reload") + + def _check_stonith_watchdog_timeout(self) -> bool: + value = utils.get_property("stonith-watchdog-timeout", get_default=False) + if self.disk_based: + if value: + if self.warn: + logger.warning("It's recommended that stonith-watchdog-timeout is not set when using disk-based SBD") + return False + else: + if not value or int(value) < self.stonith_watchdog_timeout: + if self.warn: + logger.warning("It's recommended that stonith-watchdog-timeout is set to %d, now is %s", + self.stonith_watchdog_timeout, value if value else "not set") + return False + return True + + def _fix_stonith_watchdog_timeout(self): + if self.disk_based: + logger.info("Removing stonith-watchdog-timeout property") + utils.delete_property("stonith-watchdog-timeout") + else: + logger.info("Adjusting stonith-watchdog-timeout to %d", self.stonith_watchdog_timeout) + utils.set_property("stonith-watchdog-timeout", self.stonith_watchdog_timeout) + + def _check_stonith_timeout(self) -> bool: + expected_value = self.get_stonith_timeout_expected() + value = utils.get_property("stonith-timeout", get_default=False) + if value and int(value) == expected_value: + return True + if self.warn: + logger.warning("It's recommended that stonith-timeout is set to %d, now is %s", + expected_value, value if value else "not set") + return False + + def _fix_stonith_timeout(self): + expected_value = self.get_stonith_timeout_expected() + logger.info("Adjusting stonith-timeout to %d", expected_value) + utils.set_property("stonith-timeout", expected_value) + + class SBDManager: SYSCONFIG_SBD = "/etc/sysconfig/sbd" SYSCONFIG_SBD_TEMPLATE = "/usr/share/fillup-templates/sysconfig.sbd" @@ -524,7 +631,6 @@ def _load_attributes_from_bootstrap(self): return if not self.timeout_dict: timeout_inst = SBDTimeout(self.bootstrap_context) - timeout_inst.initialize_timeout() self.timeout_dict["watchdog"] = timeout_inst.sbd_watchdog_timeout if self.diskless_sbd: self.update_dict["SBD_WATCHDOG_TIMEOUT"] = str(timeout_inst.sbd_watchdog_timeout) diff --git a/crmsh/ui_cluster.py b/crmsh/ui_cluster.py index 4e7963ea6..fe8d3d1b4 100644 --- a/crmsh/ui_cluster.py +++ b/crmsh/ui_cluster.py @@ -28,8 +28,7 @@ from .sh import ShellUtils from .ui_node import parse_option_for_nodes from . import constants - - +from . import sbd from . import log from .utils import TerminateSubCommand @@ -794,10 +793,8 @@ def do_geo_init_arbitrator(self, context, *args): bootstrap.bootstrap_arbitrator(geo_context) return True - @command.completers(compl.choice([ - 'hawk2', - 'sles16', - ])) + HEALTH_COMPONENTS = ['hawk2', 'sles16', 'sbd'] + @command.completers(compl.choice(HEALTH_COMPONENTS)) def do_health(self, context, *args): ''' Extensive health check. @@ -805,7 +802,7 @@ def do_health(self, context, *args): if not args: return Cluster._do_health_legacy() parser = argparse.ArgumentParser('health') - parser.add_argument('component', choices=['hawk2', 'sles16']) + parser.add_argument('component', choices=Cluster.HEALTH_COMPONENTS) parser.add_argument('-f', '--fix', action='store_true') parsed_args, remaining_args = parser.parse_known_args(args) match parsed_args.component: @@ -840,6 +837,22 @@ def do_health(self, context, *args): logger.error("hawk2: passwordless ssh authentication: FAIL.") logger.warning('Please run "crm cluster health hawk2 --fix"') return False + case 'sbd': + fix = parsed_args.fix + try: + warn = not fix + result = sbd.SBDTimeoutChecker(fix=fix, warn=warn).check_and_fix() + except sbd.FixFailure as e: + logger.error('%s', e) + return False + if result: + logger.info('SBD: Check sbd timeout configuration: OK.') + return True + else: + logger.error('SBD: Check sbd timeout configuration: FAIL.') + if not fix: + logger.warning('Please run "crm cluster health sbd --fix"') + return False case 'sles16': try: if parsed_args.fix: diff --git a/crmsh/ui_sbd.py b/crmsh/ui_sbd.py index d9bae7e6e..600c6b99d 100644 --- a/crmsh/ui_sbd.py +++ b/crmsh/ui_sbd.py @@ -107,9 +107,9 @@ class SBD(command.UI): PCMK_ATTRS = ( "have-watchdog", "stonith-timeout", - "stonith-enabled" + "stonith-enabled", + "stonith-watchdog-timeout" ) - PCMK_ATTRS_DISKLESS = ('stonith-watchdog-timeout',) PARSE_RE = re.compile( # To extract key, suffix and value from these possible arguments: # watchdog-timeout=30 @@ -222,11 +222,7 @@ def _show_property(self) -> None: out = self.cluster_shell.get_stdout_or_raise_error("crm configure show") logger.info("crm sbd configure show property") - if self.device_list_from_config: - attrs = self.PCMK_ATTRS - else: - attrs = self.PCMK_ATTRS + self.PCMK_ATTRS_DISKLESS - regex = f"({'|'.join(attrs)})=(\\S+)" + regex = f"({'|'.join(self.PCMK_ATTRS)})=(\\S+)" matches = re.findall(regex, out) for match in matches: print(f"{match[0]}={match[1]}") @@ -244,6 +240,7 @@ def _show_property(self) -> None: print(f"TimeoutStartUSec={systemd_start_timeout}") def _configure_show(self, args) -> None: + check_rc = True if len(args) > 2: raise self.SyntaxError("Invalid argument") elif len(args) == 2: @@ -256,6 +253,7 @@ def _configure_show(self, args) -> None: self._show_property() case _: raise self.SyntaxError(f"Unknown argument: {args[1]}") + check_rc = sbd.SBDTimeoutChecker(check_category=args[1]).check_and_fix() else: self._show_disk_metadata() if self.device_list_from_config: @@ -263,6 +261,10 @@ def _configure_show(self, args) -> None: self._show_sysconfig() print() self._show_property() + check_rc = sbd.SBDTimeoutChecker().check_and_fix() + + if not check_rc: + logger.info('Please run "crm cluster health sbd --fix" to fix the above warning') def _parse_args(self, args: tuple[str, ...]) -> dict[str, int|str]: ''' diff --git a/crmsh/utils.py b/crmsh/utils.py index a8a133ecc..66b5805a1 100644 --- a/crmsh/utils.py +++ b/crmsh/utils.py @@ -1858,9 +1858,10 @@ def remote_diff_this(local_path, nodes, this_node): if isinstance(result, crmsh.parallax.Error): raise ValueError("Failed on %s: %s" % (host, str(result))) path = result - _, s = ShellUtils().get_stdout("diff -U 0 -d -b --label %s --label %s %s %s" % - (host, this_node, path, local_path)) - page_string(s) + _, output = ShellUtils().get_stdout("diff -U 0 -d -b --label %s --label %s %s %s" % + (host, this_node, path, local_path)) + page_string(output) + return output def remote_diff(local_path, nodes): diff --git a/doc/crm.8.adoc b/doc/crm.8.adoc index c2b167832..6d2ac1c91 100644 --- a/doc/crm.8.adoc +++ b/doc/crm.8.adoc @@ -1084,13 +1084,15 @@ Usage 2: Topic-Specified Health Check Verifies the health of a specified topic. ............... -health hawk2|sles16 [--local] [--fix] +health hawk2|sbd|sles16 [--local] [--fix] ............... * `hawk2`: check or fix key-based ssh authentication for user hacluster, which is needed by hawk2. ** `--fix`: attempts to automatically resolve any detected issues, eg. hacluster passwordless +* `sbd`: check or fix SBD timeout-related configurations. + ** `--fix`: attempts to automatically resolve any detected issues. * `sles16`: check whether the cluster is good to migrate to SLES 16. ** `--local`: run checks in local mode ** `--fix`: attempts to automatically resolve any detected issues. @@ -2216,6 +2218,7 @@ Main functionailities include: - Show contents of /etc/sysconfig/sbd - Show SBD related cluster properties - Update the SBD related configuration parameters +- Give warnings for timeout-related misconfigurations - NOTE: sbd crashdump is used for debugging. Understand the risks and run `crm sbd purge crashdump` afterward For more details on SBD and related parameters, please see man sbd(8). diff --git a/test/features/bootstrap_sbd_delay.feature b/test/features/bootstrap_sbd_delay.feature index 4a5fcfbc7..f5a6b9274 100644 --- a/test/features/bootstrap_sbd_delay.feature +++ b/test/features/bootstrap_sbd_delay.feature @@ -305,3 +305,79 @@ Feature: configure sbd delay start correctly And Property "priority" in "rsc_defaults" is "0" And Cluster property "priority-fencing-delay" is "0" And Parameter "pcmk_delay_max" not configured in "stonith-sbd" + + @clean + Scenario: Check and fix sbd-related timeout values for disk-based sbd + Given Cluster service is "stopped" on "hanode1" + Given Cluster service is "stopped" on "hanode2" + When Run "crm cluster init -y" on "hanode1" + Then Cluster service is "started" on "hanode1" + When Run "crm cluster join -c hanode1 -y" on "hanode2" + Then Cluster service is "started" on "hanode2" + When Run "crm cluster init sbd -s /dev/sda1 -y" on "hanode1" + Then Service "sbd" is "started" on "hanode1" + And Service "sbd" is "started" on "hanode2" + # check /etc/sysconf/sbd consistency + When Run "sed -i 's/SBD_DELAY_START=.*/SBD_DELAY_START="no"/' /etc/sysconfig/sbd" on "hanode2" + When Try "crm sbd configure show" + Then Expected "/etc/sysconfig/sbd is not consistent across cluster nodes" in stderr + When Try "crm cluster health sbd" + Then Expected "/etc/sysconfig/sbd is not consistent across cluster nodes" in stderr + When Run "sed -i 's/SBD_DELAY_START=.*/SBD_DELAY_START=71/' /etc/sysconfig/sbd" on "hanode2" + When Run "crm cluster health sbd" on "hanode1" + Then Expected "SBD: Check sbd timeout configuration: OK" in stdout + # check sbd disk metadata + When Run "sbd -1 15 -4 16 -d /dev/sda1 create" on "hanode1" + When Try "crm sbd configur show disk_metadata" on "hanode1" + Then Expected "It's recommended that msgwait(now 16) >= 2*watchdog timeout(now 15)" in stderr + When Try "crm cluster health sbd" on "hanode1" + Then Expected "It's recommended that msgwait(now 16) >= 2*watchdog timeout(now 15)" in stderr + When Run "crm cluster health sbd --fix" on "hanode1" + Then Expected "SBD: Check sbd timeout configuration: OK" in stdout + # check SBD_DELAY_START + When Run "sed -i 's/SBD_DELAY_START=.*/SBD_DELAY_START=40/' /etc/sysconfig/sbd" on "hanode1" + When Run "sed -i 's/SBD_DELAY_START=.*/SBD_DELAY_START=40/' /etc/sysconfig/sbd" on "hanode2" + When Try "crm sbd configure show" on "hanode1" + Then Expected "It's recommended that SBD_DELAY_START is set to 71, now is 40" in stderr + When Try "crm cluster health sbd" on "hanode1" + Then Expected "It's recommended that SBD_DELAY_START is set to 71, now is 40" in stderr + When Run "crm cluster health sbd --fix" on "hanode1" + Then Expected "SBD: Check sbd timeout configuration: OK" in stdout + # check stonith-timeout + When Run "crm configure property stonith-timeout=50" on "hanode1" + When Try "crm sbd configure show" on "hanode1" + Then Expected "It's recommended that stonith-timeout is set to 71, now is 50" in stderr + When Try "crm cluster health sbd" on "hanode1" + Then Expected "It's recommended that stonith-timeout is set to 71, now is 50" in stderr + When Run "crm cluster health sbd --fix" on "hanode1" + Then Expected "SBD: Check sbd timeout configuration: OK" in stdout + # Adjust token timeout in corosync.conf + When Run "sed -i 's/token: .*/token: 10000/' /etc/corosync/corosync.conf" on "hanode1" + When Run "sed -i 's/token: .*/token: 10000/' /etc/corosync/corosync.conf" on "hanode2" + When Run "corosync-cfgtool -R" on "hanode1" + When Try "crm sbd configure show" on "hanode1" + Then Expected "It's recommended that SBD_DELAY_START is set to 82, now is 71" in stderr + When Try "crm cluster health sbd" on "hanode1" + Then Expected "It's recommended that SBD_DELAY_START is set to 82, now is 71" in stderr + When Run "crm cluster health sbd --fix" on "hanode1" + Then Expected "SBD: Check sbd timeout configuration: OK" in stdout + + @clean + Scenario: Check and fix sbd-related timeout values for diskless sbd + Given Cluster service is "stopped" on "hanode1" + Given Cluster service is "stopped" on "hanode2" + When Run "crm cluster init -y" on "hanode1" + Then Cluster service is "started" on "hanode1" + When Run "crm cluster join -c hanode1 -y" on "hanode2" + Then Cluster service is "started" on "hanode2" + When Run "crm cluster init sbd -S -y" on "hanode1" + Then Service "sbd" is "started" on "hanode1" + And Service "sbd" is "started" on "hanode2" + # Delete stonith-watchdog-timeout + When Delete property "stonith-watchdog-timeout" from cluster + When Try "crm sbd configure show" on "hanode1" + Then Expected "It's recommended that stonith-watchdog-timeout is set to 30, now is not set" in stderr + When Try "crm cluster health sbd" on "hanode1" + Then Expected "It's recommended that stonith-watchdog-timeout is set to 30, now is not set" in stderr + When Run "crm cluster health sbd --fix" on "hanode1" + Then Expected "SBD: Check sbd timeout configuration: OK" in stdout diff --git a/test/features/bootstrap_sbd_normal.feature b/test/features/bootstrap_sbd_normal.feature index e2322e7e0..df43c92e2 100644 --- a/test/features/bootstrap_sbd_normal.feature +++ b/test/features/bootstrap_sbd_normal.feature @@ -81,14 +81,12 @@ Feature: crmsh bootstrap sbd management Given Cluster service is "stopped" on "hanode1" Given Cluster service is "stopped" on "hanode2" When Run "crm cluster init ssh -y" on "hanode1" - And Run "crm cluster init csync2 -y" on "hanode1" And Run "crm cluster init corosync -y" on "hanode1" And Run "crm cluster init sbd -s /dev/sda1 -y" on "hanode1" And Run "crm cluster init cluster -y" on "hanode1" Then Cluster service is "started" on "hanode1" And Service "sbd" is "started" on "hanode1" When Run "crm cluster join ssh -y -c hanode1" on "hanode2" - And Run "crm cluster join csync2 -y -c hanode1" on "hanode2" And Run "crm cluster join ssh_merge -y -c hanode1" on "hanode2" And Run "crm cluster join cluster -y -c hanode1" on "hanode2" Then Cluster service is "started" on "hanode2" @@ -100,14 +98,12 @@ Feature: crmsh bootstrap sbd management Given Cluster service is "stopped" on "hanode1" Given Cluster service is "stopped" on "hanode2" When Run "crm cluster init ssh -y" on "hanode1" - And Run "crm cluster init csync2 -y" on "hanode1" And Run "crm cluster init corosync -y" on "hanode1" And Run "crm cluster init sbd -S -y" on "hanode1" And Run "crm cluster init cluster -y" on "hanode1" Then Cluster service is "started" on "hanode1" And Service "sbd" is "started" on "hanode1" When Run "crm cluster join ssh -y -c hanode1" on "hanode2" - And Run "crm cluster join csync2 -y -c hanode1" on "hanode2" And Run "crm cluster join ssh_merge -y -c hanode1" on "hanode2" And Run "crm cluster join cluster -y -c hanode1" on "hanode2" Then Cluster service is "started" on "hanode2" diff --git a/test/features/steps/step_implementation.py b/test/features/steps/step_implementation.py index 471745737..2c86cef92 100644 --- a/test/features/steps/step_implementation.py +++ b/test/features/steps/step_implementation.py @@ -479,14 +479,21 @@ def step_impl(context, key, value): @then('SBD option "{key}" value for "{dev}" is "{value}"') def step_impl(context, key, dev, value): - res = sbd.SBDTimeout.get_sbd_msgwait(dev) + res = sbd.SBDUtils.get_sbd_device_metadata(dev).get(key) assert_eq(int(value), res) + @then('Start timeout for sbd.service is "{value}" seconds') def step_impl(context, value): systemd_start_timeout = sbd.SBDTimeout.get_sbd_systemd_start_timeout() assert_eq(int(value), systemd_start_timeout) + +@when('Delete property "{key}" from cluster') +def step_impl(context, key): + crmutils.delete_property(key) + + @then('Cluster property "{key}" is "{value}"') def step_impl(context, key, value): res = crmutils.get_property(key) diff --git a/test/unittests/test_bootstrap.py b/test/unittests/test_bootstrap.py index dc6463c66..cb6389219 100644 --- a/test/unittests/test_bootstrap.py +++ b/test/unittests/test_bootstrap.py @@ -1539,12 +1539,15 @@ def test_adjust_pcmk_delay(self, mock_cib_inst, mock_run, mock_debug): bootstrap.adjust_pcmk_delay_max(False) mock_run.assert_called_once_with("crm resource param res_1 delete pcmk_delay_max") - @mock.patch('crmsh.sbd.SBDTimeout.adjust_sbd_timeout_related_cluster_configuration') + @mock.patch('crmsh.sbd.SBDTimeoutChecker') @mock.patch('crmsh.service_manager.ServiceManager.service_is_active') - def test_adjust_stonith_timeout_sbd(self, mock_is_active, mock_sbd_adjust_timeout): + def test_adjust_stonith_timeout_sbd(self, mock_is_active, mock_sbd_checker): + mock_sbd_checker_inst = mock.Mock() + mock_sbd_checker.return_value = mock_sbd_checker_inst + mock_sbd_checker_inst.check_and_fix = mock.Mock() mock_is_active.return_value = True bootstrap.adjust_stonith_timeout() - mock_sbd_adjust_timeout.assert_called_once_with() + mock_sbd_checker.assert_called_once_with(fix=True, warn=False, from_bootstrap=True) @mock.patch('crmsh.utils.set_property') @mock.patch('crmsh.bootstrap.get_stonith_timeout_generally_expected') diff --git a/test/unittests/test_sbd.py b/test/unittests/test_sbd.py index 351853f45..597a31a26 100644 --- a/test/unittests/test_sbd.py +++ b/test/unittests/test_sbd.py @@ -22,12 +22,6 @@ def test_get_sbd_device_metadata_success(self, mock_cluster_shell): expected = {'uuid': '1234-5678', 'watchdog': 5, 'msgwait': 10} self.assertEqual(result, expected) - @patch('crmsh.sh.cluster_shell') - def test_get_sbd_device_metadata_exception(self, mock_cluster_shell): - mock_cluster_shell.return_value.get_stdout_or_raise_error.side_effect = Exception - result = SBDUtils.get_sbd_device_metadata("/dev/sbd_device") - self.assertEqual(result, {}) - @patch('crmsh.sh.cluster_shell') def test_get_sbd_device_metadata_timeout_only(self, mock_cluster_shell): mock_cluster_shell.return_value.get_stdout_or_raise_error.return_value = self.TEST_DATA @@ -186,19 +180,6 @@ class TestSBDTimeout(unittest.TestCase): """ Unitary tests for crmsh.sbd.SBDTimeout """ - @patch('crmsh.sbd.SBDUtils.get_sbd_device_metadata') - def test_get_sbd_msgwait_exception(self, mock_get_sbd_device_metadata): - mock_get_sbd_device_metadata.return_value = {} - with self.assertRaises(ValueError) as context: - sbd.SBDTimeout.get_sbd_msgwait("/dev/sbd_device") - self.assertTrue("Cannot get sbd msgwait for /dev/sbd_device" in str(context.exception)) - - @patch('crmsh.sbd.SBDUtils.get_sbd_device_metadata') - def test_get_sbd_msgwait(self, mock_get_sbd_device_metadata): - mock_get_sbd_device_metadata.return_value = {'msgwait': 10} - result = sbd.SBDTimeout.get_sbd_msgwait("/dev/sbd_device") - self.assertEqual(result, 10) - @patch('crmsh.sbd.SBDUtils.get_sbd_value_from_config') def test_get_sbd_watchdog_timeout_exception(self, mock_get_sbd_value_from_config): mock_get_sbd_value_from_config.return_value = None @@ -246,103 +227,12 @@ def test_get_sbd_systemd_start_timeout(self, mock_cluster_shell, mock_get_system mock_cluster_shell.return_value.get_stdout_or_raise_error.assert_called_once_with(sbd.SBDTimeout.SHOW_SBD_START_TIMEOUT_CMD) mock_get_systemd_timeout_start_in_sec.assert_called_once_with("1min 30s") - @patch('crmsh.sbd.SBDTimeout.adjust_systemd_start_timeout') - @patch('crmsh.sbd.SBDTimeout.adjust_stonith_timeout') - @patch('crmsh.sbd.SBDTimeout.adjust_sbd_delay_start') - @patch('crmsh.sbd.SBDTimeout._load_configurations') - def test_adjust_sbd_timeout_related_cluster_configuration(self, mock_load_configurations, mock_adjust_sbd_delay_start, mock_adjust_stonith_timeout, - mock_adjust_systemd_start_timeout): - sbd.SBDTimeout.adjust_sbd_timeout_related_cluster_configuration() - mock_load_configurations.assert_called_once() - mock_adjust_sbd_delay_start.assert_called_once() - mock_adjust_stonith_timeout.assert_called_once() - mock_adjust_systemd_start_timeout.assert_called_once() - - @patch('crmsh.sbd.SBDManager.update_sbd_configuration') - def test_adjust_sbd_delay_start_return(self, mock_update_sbd_configuration): - inst = sbd.SBDTimeout() - inst.sbd_delay_start_value_expected = 100 - inst.sbd_delay_start_value_from_config = "100" - inst.adjust_sbd_delay_start() - mock_update_sbd_configuration.assert_not_called() - - @patch('crmsh.sbd.SBDManager.update_sbd_configuration') - def test_adjust_sbd_delay_start(self, mock_update_sbd_configuration): - inst = sbd.SBDTimeout() - inst.sbd_delay_start_value_expected = "no" - inst.sbd_delay_start_value_from_config = 200 - inst.adjust_sbd_delay_start() - mock_update_sbd_configuration.assert_called_once_with({'SBD_DELAY_START': 'no'}) - - @patch('crmsh.utils.set_property') - def test_adjust_stonith_timeout(self, mock_set_property): - inst = sbd.SBDTimeout() - inst.get_stonith_timeout_expected = MagicMock(return_value=10) - inst.adjust_stonith_timeout() - mock_set_property.assert_called_once_with("stonith-timeout", 10) - - @patch('crmsh.sbd.SBDTimeout.restore_systemd_start_timeout') - @patch('crmsh.sbd.SBDTimeout.get_sbd_systemd_start_timeout') - @patch('crmsh.sbd.SBDUtils.get_sbd_value_from_config') - def test_adjust_systemd_start_timeout_no_delay_start(self, mock_get_sbd_value_from_config, mock_get_sbd_systemd_start_timeout, mock_restore_systemd_start_timeout): - mock_get_sbd_value_from_config.return_value = "no" - inst = sbd.SBDTimeout() - inst.adjust_systemd_start_timeout() - mock_get_sbd_value_from_config.assert_called_once_with("SBD_DELAY_START") - mock_get_sbd_systemd_start_timeout.assert_not_called() - - @patch('crmsh.sbd.SBDTimeout.restore_systemd_start_timeout') - @patch('crmsh.sbd.SBDTimeout.get_default_systemd_start_timeout') - @patch('crmsh.sbd.SBDTimeout.get_sbd_systemd_start_timeout') - @patch('crmsh.sbd.SBDUtils.get_sbd_value_from_config') - def test_adjust_systemd_start_timeout_return( - self, - mock_get_sbd_value_from_config, - mock_get_sbd_systemd_start_timeout, - mock_get_default_systemd_start_timeout, - mock_restore_systemd_start_timeout, - ): - mock_get_sbd_value_from_config.return_value = "10" - mock_get_sbd_systemd_start_timeout.return_value = 90 - mock_get_default_systemd_start_timeout.return_value = 90 - inst = sbd.SBDTimeout() - inst.adjust_systemd_start_timeout() - mock_get_sbd_value_from_config.assert_called_once_with("SBD_DELAY_START") - mock_get_sbd_systemd_start_timeout.assert_called_once() - - @patch('crmsh.utils.cluster_run_cmd') - @patch('crmsh.bootstrap.sync_path') - @patch('crmsh.utils.str2file') - @patch('crmsh.utils.mkdirp') - @patch('crmsh.sbd.SBDTimeout.get_default_systemd_start_timeout') - @patch('crmsh.sbd.SBDTimeout.get_sbd_systemd_start_timeout') - @patch('crmsh.sbd.SBDUtils.get_sbd_value_from_config') - def test_adjust_systemd_start_timeout( - self, - mock_get_sbd_value_from_config, - mock_get_sbd_systemd_start_timeout, - mock_get_default_systemd_start_timeout, - mock_mkdirp, - mock_str2file, - mock_sync_file, - mock_cluster_run_cmd, - ): - mock_get_sbd_value_from_config.return_value = "150" - mock_get_sbd_systemd_start_timeout.return_value = 90 - mock_get_default_systemd_start_timeout.return_value = 90 - inst = sbd.SBDTimeout() - inst.adjust_systemd_start_timeout() - mock_get_sbd_value_from_config.assert_called_once_with("SBD_DELAY_START") - mock_get_sbd_systemd_start_timeout.assert_called_once() - mock_mkdirp.assert_called_once_with(sbd.SBDManager.SBD_SYSTEMD_DELAY_START_DIR) - mock_cluster_run_cmd.assert_called_once_with("systemctl daemon-reload") - @patch('crmsh.corosync.token_and_consensus_timeout') def test_get_sbd_delay_start_expected_diskbased(self, mock_token_and_consensus_timeout): inst = sbd.SBDTimeout() inst.disk_based = True inst.pcmk_delay_max = 10 - inst.msgwait = 5 + inst.sbd_msgwait = 5 mock_token_and_consensus_timeout.return_value = 10 self.assertEqual(inst.get_sbd_delay_start_expected(), 25) @@ -368,7 +258,7 @@ def test_get_stonith_timeout(self, mock_load_configurations, mock_get_sbd_delay_ def test_get_stonith_timeout_expected_diskbased(self, mock_token_and_consensus_timeout, mock_logger_debug): inst = sbd.SBDTimeout() inst.disk_based = True - inst.msgwait = 5 + inst.sbd_msgwait = 5 mock_token_and_consensus_timeout.return_value = 10 result = inst.get_stonith_timeout_expected() self.assertEqual(result, 70) @@ -385,6 +275,268 @@ def test_get_stonith_timeout_expected_diskless(self, mock_token_and_consensus_ti self.assertEqual(result, 80) +class TestSBDTimeoutChecker(unittest.TestCase): + + def setUp(self): + self.instance_check = sbd.SBDTimeoutChecker(fix=False) + self.instance_fix = sbd.SBDTimeoutChecker(fix=True) + + @patch('logging.Logger.warning') + @patch('crmsh.sbd.ServiceManager') + def test_check_and_fix_sbd_not_active(self, mock_service_manager, mock_logger_warning): + mock_service_manager_inst = Mock() + mock_service_manager.return_value = mock_service_manager_inst + mock_service_manager_inst.service_is_active = Mock(return_value=False) + with self.assertRaises(utils.TerminateSubCommand): + self.instance_check.check_and_fix() + mock_service_manager_inst.service_is_active.assert_called_once_with(constants.SBD_SERVICE) + mock_logger_warning.assert_called_once_with('%s is not active, skip SBD timeout checks', constants.SBD_SERVICE) + + @patch('crmsh.sbd.ServiceManager') + def test_check_and_fix_sbd_inconsistent(self, mock_service_manager): + mock_service_manager_inst = Mock() + mock_service_manager.return_value = mock_service_manager_inst + mock_service_manager_inst.service_is_active = Mock(return_value=True) + self.instance_check._check_config_consistency = Mock(return_value=False) + with self.assertRaises(utils.TerminateSubCommand): + self.instance_check.check_and_fix() + mock_service_manager_inst.service_is_active.assert_called_once_with(constants.SBD_SERVICE) + self.instance_check._check_config_consistency.assert_called_once() + + @patch('crmsh.sbd.ServiceManager') + def test_check_and_fix_not_fix(self, mock_service_manager): + mock_service_manager_inst = Mock() + mock_service_manager.return_value = mock_service_manager_inst + mock_service_manager_inst.service_is_active = Mock(return_value=True) + self.instance_check._check_config_consistency = Mock(return_value=True) + self.instance_check._load_configurations = Mock() + self.instance_check._check_sbd_disk_metadata = Mock(return_value=False) + self.instance_check._check_sbd_watchdog_timeout = Mock(return_value=True) + + self.assertFalse(self.instance_check.check_and_fix()) + + mock_service_manager_inst.service_is_active.assert_called_once_with(constants.SBD_SERVICE) + self.instance_check._check_config_consistency.assert_called_once() + self.instance_check._load_configurations.assert_called_once() + self.instance_check._check_sbd_disk_metadata.assert_called_once() + self.instance_check._check_sbd_watchdog_timeout.assert_not_called() + + @patch('crmsh.sbd.ServiceManager') + def test_check_and_fix_fix_failure(self, mock_service_manager): + mock_service_manager_inst = Mock() + mock_service_manager.return_value = mock_service_manager_inst + mock_service_manager_inst.service_is_active = Mock(return_value=True) + self.instance_fix._check_config_consistency = Mock(return_value=True) + self.instance_fix._load_configurations = Mock() + self.instance_fix._check_sbd_disk_metadata = Mock(side_effect=[False, False]) + self.instance_fix._fix_sbd_disk_metadata = Mock() + + with self.assertRaises(sbd.FixFailure) as context: + self.instance_fix.check_and_fix() + self.assertTrue("Failed to fix SBD disk metadata" in str(context.exception)) + + @patch('crmsh.sbd.ServiceManager') + def test_check_and_fix_fix_success(self, mock_service_manager): + mock_service_manager_inst = Mock() + mock_service_manager.return_value = mock_service_manager_inst + mock_service_manager_inst.service_is_active = Mock(return_value=True) + self.instance_fix._check_config_consistency = Mock(return_value=True) + self.instance_fix._load_configurations = Mock() + self.instance_fix._check_sbd_disk_metadata = Mock(return_value=True) + self.instance_fix._check_sbd_watchdog_timeout = Mock(return_value=True) + self.instance_fix._check_sbd_delay_start = Mock(return_value=True) + self.instance_fix._check_sbd_systemd_start_timeout = Mock(return_value=True) + self.instance_fix._check_stonith_watchdog_timeout = Mock(return_value=True) + self.instance_fix._check_stonith_timeout = Mock(return_value=True) + + self.assertTrue(self.instance_fix.check_and_fix()) + + mock_service_manager_inst.service_is_active.assert_called_once_with(constants.SBD_SERVICE) + self.instance_fix._check_config_consistency.assert_called_once() + self.instance_fix._load_configurations.assert_called_once() + self.instance_fix._check_sbd_disk_metadata.assert_called_once() + self.instance_fix._check_sbd_watchdog_timeout.assert_called_once() + self.instance_fix._check_sbd_delay_start.assert_called_once() + self.instance_fix._check_sbd_systemd_start_timeout.assert_called_once() + self.instance_fix._check_stonith_watchdog_timeout.assert_called_once() + self.instance_fix._check_stonith_timeout.assert_called_once() + + @patch('crmsh.sbd.SBDUtils.check_devices_metadata_consistent') + @patch('crmsh.sbd.SBDUtils.get_sbd_device_from_config') + @patch('logging.Logger.warning') + @patch('crmsh.utils.remote_diff_this') + @patch('crmsh.corosync.conf') + @patch('crmsh.utils.list_cluster_nodes_except_me') + @patch('crmsh.utils.this_node') + def test_check_config_consistency(self, mock_this_node, mock_list_cluster_nodes_except_me, mock_corosync_conf, mock_remote_diff_this, mock_logger_warning, mock_get_sbd_device_from_config, mock_check_devices_metadata_consistent): + mock_this_node.return_value = 'node1' + mock_list_cluster_nodes_except_me.return_value = ['node2', 'node3'] + mock_corosync_conf.return_value = "corosync.conf" + mock_remote_diff_this.side_effect = ["diff data1", "diff data2"] + mock_get_sbd_device_from_config.return_value = ['/dev/sbd_device'] + mock_check_devices_metadata_consistent.return_value = False + + self.assertFalse(self.instance_check._check_config_consistency()) + + mock_logger_warning.assert_has_calls([ + call('corosync.conf is not consistent across cluster nodes'), + call('%s is not consistent across cluster nodes', sbd.SBDManager.SYSCONFIG_SBD), + call('SBD device metadata is not consistent across cluster nodes'), + call('Please ensure the configurations are consistent across all cluster nodes') + ]) + + @patch('logging.Logger.warning') + def test_check_sbd_disk_metadata_failure(self, mock_logger_warning): + self.instance_check.disk_based = True + self.instance_check.sbd_msgwait = 1 + self.instance_check.get_sbd_msgwait_expected = Mock(return_value=10) + self.assertFalse(self.instance_check._check_sbd_disk_metadata()) + mock_logger_warning.assert_called_once_with("It's recommended that SBD msgwait(now %d) >= %d", 1, 10) + + def test_check_sbd_disk_metadata_success(self): + self.instance_check.disk_based = True + self.instance_check.sbd_msgwait = 15 + self.instance_check.get_sbd_msgwait_expected = Mock(return_value=10) + self.assertTrue(self.instance_check._check_sbd_disk_metadata()) + + @patch('crmsh.sh.cluster_shell') + @patch('logging.Logger.info') + def test_fix_sbd_disk_metadata(self, mock_logger_info, mock_cluster_shell): + self.instance_fix.sbd_watchdog_timeout = 5 + mock_cluster_shell_inst = Mock() + mock_cluster_shell.return_value = mock_cluster_shell_inst + mock_cluster_shell_inst.get_stdout_or_raise_error.return_value = "data" + self.instance_fix._fix_sbd_disk_metadata() + mock_logger_info.assert_called_once_with("Adjusting sbd msgwait to %d", 10) + + @patch('logging.Logger.warning') + def test_check_sbd_watchdog_timeout_failure(self, mock_logger_warning): + self.instance_check.disk_based = False + self.instance_check.sbd_watchdog_timeout = 3 + self.instance_check.qdevice_sync_timeout = 10 + self.assertFalse(self.instance_check._check_sbd_watchdog_timeout()) + mock_logger_warning.assert_called_once_with("It's recommended that SBD_WATCHDOG_TIMEOUT(now %d) >= qdevice sync timeout(now %d)", 3, 10) + + def test_check_sbd_watchdog_timeout_success(self): + self.instance_check.disk_based = False + self.instance_check.sbd_watchdog_timeout = 15 + self.instance_check.qdevice_sync_timeout = 10 + self.assertTrue(self.instance_check._check_sbd_watchdog_timeout()) + + @patch('crmsh.sbd.SBDManager.update_sbd_configuration') + def test_fix_sbd_watchdog_timeout(self, mock_update_sbd_configuration): + self.instance_fix.qdevice_sync_timeout = 10 + self.instance_fix._fix_sbd_watchdog_timeout() + mock_update_sbd_configuration.assert_called_once_with({"SBD_WATCHDOG_TIMEOUT": str(10+sbd.SBDTimeout.QDEVICE_SYNC_TIMEOUT_MARGIN)}) + + @patch('logging.Logger.warning') + def test_check_sbd_delay_start_failure(self, mock_logger_warning): + self.instance_check.sbd_delay_start_value_expected = "100" + self.instance_check.sbd_delay_start_value_from_config = "30" + self.assertFalse(self.instance_check._check_sbd_delay_start()) + mock_logger_warning.assert_called_once_with("It's recommended that SBD_DELAY_START is set to %s, now is %s", "100", "30") + + def test_check_sbd_delay_start_success(self): + self.instance_check.sbd_delay_start_value_expected = "50" + self.instance_check.sbd_delay_start_value_from_config = "50" + self.assertTrue(self.instance_check._check_sbd_delay_start()) + + @patch('crmsh.sbd.SBDManager.update_sbd_configuration') + def test_fix_sbd_delay_start(self, mock_update_sbd_configuration): + self.instance_fix.sbd_delay_start_value_expected = "80" + self.instance_fix._fix_sbd_delay_start() + mock_update_sbd_configuration.assert_called_once_with({"SBD_DELAY_START": "80"}) + + @patch('logging.Logger.warning') + @patch('crmsh.sbd.SBDTimeout.get_sbd_systemd_start_timeout') + def test_check_sbd_systemd_start_timeout_failure(self, mock_get_sbd_systemd_start_timeout, mock_logger_warning): + mock_get_sbd_systemd_start_timeout.return_value = 30 + self.instance_check.sbd_systemd_start_timeout_expected = 60 + self.assertFalse(self.instance_check._check_sbd_systemd_start_timeout()) + mock_logger_warning.assert_called_once_with("It's recommended that systemd start timeout for sbd.service is set to %ds, now is %ds", 60, 30) + + @patch('crmsh.sbd.SBDTimeout.get_sbd_systemd_start_timeout') + def test_check_sbd_systemd_start_timeout_success(self, mock_get_sbd_systemd_start_timeout): + mock_get_sbd_systemd_start_timeout.return_value = 90 + self.instance_check.sbd_systemd_start_timeout_expected = 90 + self.assertTrue(self.instance_check._check_sbd_systemd_start_timeout()) + + @patch('crmsh.utils.cluster_run_cmd') + @patch('crmsh.bootstrap.sync_path') + @patch('crmsh.utils.str2file') + @patch('os.path.join') + @patch('crmsh.utils.mkdirp') + @patch('logging.Logger.info') + def test_fix_sbd_systemd_start_timeout(self, mock_logger_info, mock_mkdirp, mock_os_path_join, mock_str2file, mock_sync_path, mock_cluster_run_cmd): + mock_os_path_join.return_value = f"{sbd.SBDManager.SBD_SYSTEMD_DELAY_START_DIR}/sbd_delay_start.conf" + self.instance_fix.sbd_systemd_start_timeout_expected = 120 + self.instance_fix._fix_sbd_systemd_start_timeout() + mock_logger_info.assert_called_once_with("Adjusting systemd start timeout for sbd.service to %ds", 120) + + @patch('logging.Logger.warning') + @patch('crmsh.utils.get_property') + def test_check_stonith_watchdog_timeout_disk_based_failure(self, mock_get_property, mock_logger_warning): + self.instance_check.disk_based = True + mock_get_property.return_value = 5 + self.assertFalse(self.instance_check._check_stonith_watchdog_timeout()) + mock_logger_warning.assert_called_once_with("It's recommended that stonith-watchdog-timeout is not set when using disk-based SBD") + + @patch('logging.Logger.warning') + @patch('crmsh.utils.get_property') + def test_check_stonith_watchdog_timeout_disk_less_failure(self, mock_get_property, mock_logger_warning): + self.instance_check.disk_based = False + self.instance_check.stonith_watchdog_timeout = 15 + mock_get_property.return_value = "" + self.assertFalse(self.instance_check._check_stonith_watchdog_timeout()) + mock_logger_warning.assert_called_once_with("It's recommended that stonith-watchdog-timeout is set to %d, now is %s", 15, "not set") + + @patch('crmsh.utils.get_property') + def test_check_stonith_watchdog_timeout_success(self, mock_get_property): + self.instance_check.disk_based = False + self.instance_check.stonith_watchdog_timeout = 20 + mock_get_property.return_value = 25 + self.assertTrue(self.instance_check._check_stonith_watchdog_timeout()) + + @patch('crmsh.utils.delete_property') + @patch('logging.Logger.info') + def test_fix_stonith_watchdog_timeout_disk_based_success(self, mock_logger_info, mock_delete_property): + self.instance_fix.disk_based = True + self.instance_fix._fix_stonith_watchdog_timeout() + mock_logger_info.assert_called_once_with("Removing stonith-watchdog-timeout property") + mock_delete_property.assert_called_once_with('stonith-watchdog-timeout') + + @patch('crmsh.utils.set_property') + @patch('logging.Logger.info') + def test_fix_stonith_watchdog_timeout_disk_less_success(self, mock_logger_info, mock_set_property): + self.instance_fix.disk_based = False + self.instance_fix.stonith_watchdog_timeout = 15 + self.instance_fix._fix_stonith_watchdog_timeout() + mock_logger_info.assert_called_once_with("Adjusting stonith-watchdog-timeout to %d", 15) + mock_set_property.assert_called_once_with('stonith-watchdog-timeout', 15) + + @patch('logging.Logger.warning') + @patch('crmsh.utils.get_property') + def test_check_stonith_timeout_failure(self, mock_get_property, mock_logger_warning): + self.instance_check.get_stonith_timeout_expected = Mock(return_value=60) + mock_get_property.return_value = 30 + self.assertFalse(self.instance_check._check_stonith_timeout()) + mock_logger_warning.assert_called_once_with("It's recommended that stonith-timeout is set to %d, now is %s", 60, 30) + + @patch('crmsh.utils.get_property') + def test_check_stonith_timeout_success(self, mock_get_property): + self.instance_check.get_stonith_timeout_expected = Mock(return_value=50) + mock_get_property.return_value = 50 + self.assertTrue(self.instance_check._check_stonith_timeout()) + + @patch('crmsh.utils.set_property') + @patch('logging.Logger.info') + def test_fix_stonith_timeout(self, mock_logger_info, mock_set_property): + self.instance_fix.get_stonith_timeout_expected = Mock(return_value=70) + self.instance_fix._fix_stonith_timeout() + mock_logger_info.assert_called_once_with("Adjusting stonith-timeout to %d", 70) + mock_set_property.assert_called_once_with('stonith-timeout', 70) + + class TestSBDManager(unittest.TestCase): def test_convert_timeout_dict_to_opt_str(self): diff --git a/test/unittests/test_ui_sbd.py b/test/unittests/test_ui_sbd.py index e8f3c2b5e..694356fe3 100644 --- a/test/unittests/test_ui_sbd.py +++ b/test/unittests/test_ui_sbd.py @@ -239,20 +239,38 @@ def test_configure_show_unknown_arg(self): res = self.sbd_instance_diskbased._configure_show(["xxx1", "xxx2"]) self.assertEqual(str(e.exception), f"Unknown argument: xxx2") - def test_configure_show_disk_metadata(self): + @mock.patch('crmsh.sbd.SBDTimeoutChecker') + def test_configure_show_disk_metadata(self, mock_sbd_timeout_checker): + mock_sbd_timeout_checker_instance = mock.Mock() + mock_sbd_timeout_checker.return_value = mock_sbd_timeout_checker_instance + mock_sbd_timeout_checker_instance.check_and_fix = mock.Mock() self.sbd_instance_diskbased._show_disk_metadata = mock.Mock() self.sbd_instance_diskbased._configure_show(["show", "disk_metadata"]) self.sbd_instance_diskbased._show_disk_metadata.assert_called_once() + mock_sbd_timeout_checker.assert_called_once_with(check_category="disk_metadata") + mock_sbd_timeout_checker_instance.check_and_fix.assert_called_once() + @mock.patch('crmsh.sbd.SBDTimeoutChecker') @mock.patch('crmsh.ui_sbd.SBD._show_sysconfig') - def test_configure_show_sysconfig(self, mock_show_sysconfig): + def test_configure_show_sysconfig(self, mock_show_sysconfig, mock_sbd_timeout_checker): + mock_sbd_timeout_checker_instance = mock.Mock() + mock_sbd_timeout_checker.return_value = mock_sbd_timeout_checker_instance + mock_sbd_timeout_checker_instance.check_and_fix = mock.Mock() self.sbd_instance_diskbased._configure_show(["show", "sysconfig"]) mock_show_sysconfig.assert_called_once() - - def test_configure_show_property(self): + mock_sbd_timeout_checker.assert_called_once_with(check_category="sysconfig") + mock_sbd_timeout_checker_instance.check_and_fix.assert_called_once() + + @mock.patch('crmsh.sbd.SBDTimeoutChecker') + def test_configure_show_property(self, mock_sbd_timeout_checker): + mock_sbd_timeout_checker_instance = mock.Mock() + mock_sbd_timeout_checker.return_value = mock_sbd_timeout_checker_instance + mock_sbd_timeout_checker_instance.check_and_fix = mock.Mock() self.sbd_instance_diskbased._show_property = mock.Mock() self.sbd_instance_diskbased._configure_show(["show", "property"]) self.sbd_instance_diskbased._show_property.assert_called_once() + mock_sbd_timeout_checker.assert_called_once_with(check_category="property") + mock_sbd_timeout_checker_instance.check_and_fix.assert_called_once() def test_parse_re(self): test_data = [ @@ -267,13 +285,18 @@ def test_parse_re(self): self.assertIsNotNone(match) self.assertEqual(match.groups(), expected) + @mock.patch('crmsh.sbd.SBDTimeoutChecker') @mock.patch('crmsh.ui_sbd.SBD._show_sysconfig') @mock.patch('builtins.print') - def test_configure_show(self, mock_print, mock_show_sysconfig): + def test_configure_show(self, mock_print, mock_show_sysconfig, mock_sbd_timeout_checker): + mock_sbd_timeout_checker_instance = mock.Mock() + mock_sbd_timeout_checker.return_value = mock_sbd_timeout_checker_instance + mock_sbd_timeout_checker_instance.check_and_fix = mock.Mock() self.sbd_instance_diskbased._show_disk_metadata = mock.Mock() self.sbd_instance_diskbased._show_property = mock.Mock() self.sbd_instance_diskbased._configure_show(["show"]) mock_print.assert_has_calls([mock.call(), mock.call()]) + mock_sbd_timeout_checker_instance.check_and_fix.assert_called_once() def test_parse_args_invalid_args(self): with self.assertRaises(ui_sbd.SBD.SyntaxError) as e: