ClusterLabs · liangxin1300 · Nov 10, 2025 · Nov 10, 2025 · Nov 6, 2025 · Nov 8, 2025
diff --git a/crmsh/bootstrap.py b/crmsh/bootstrap.py
@@ -2773,7 +2773,7 @@ def adjust_stonith_timeout(with_sbd: bool = False):
     Adjust stonith-timeout for sbd and other scenarios
     """
     if ServiceManager().service_is_active(constants.SBD_SERVICE) or with_sbd:
-        sbd.SBDTimeout.adjust_sbd_timeout_related_cluster_configuration()
+        sbd.SBDTimeoutChecker(fix=True, warn=False, from_bootstrap=True).check_and_fix()
     else:
         value = get_stonith_timeout_generally_expected()
         if value:

diff --git a/crmsh/sbd.py b/crmsh/sbd.py
diff --git a/crmsh/ui_cluster.py b/crmsh/ui_cluster.py
@@ -28,8 +28,7 @@
 from .sh import ShellUtils
 from .ui_node import parse_option_for_nodes
 from . import constants
-
-
+from . import sbd
 from . import log
 from .utils import TerminateSubCommand
 
@@ -794,18 +793,16 @@ def do_geo_init_arbitrator(self, context, *args):
         bootstrap.bootstrap_arbitrator(geo_context)
         return True
 
-    @command.completers(compl.choice([
-        'hawk2',
-        'sles16',
-    ]))
+    HEALTH_COMPONENTS = ['hawk2', 'sles16', 'sbd']
+    @command.completers(compl.choice(HEALTH_COMPONENTS))
     def do_health(self, context, *args):
         '''
         Extensive health check.
         '''
         if not args:
             return Cluster._do_health_legacy()
         parser = argparse.ArgumentParser('health')
-        parser.add_argument('component', choices=['hawk2', 'sles16'])
+        parser.add_argument('component', choices=Cluster.HEALTH_COMPONENTS)
         parser.add_argument('-f', '--fix', action='store_true')
         parsed_args, remaining_args = parser.parse_known_args(args)
         match parsed_args.component:
@@ -840,6 +837,22 @@ def do_health(self, context, *args):
                         logger.error("hawk2: passwordless ssh authentication: FAIL.")
                         logger.warning('Please run "crm cluster health hawk2 --fix"')
                         return False
+            case 'sbd':
+                fix = parsed_args.fix
+                try:
+                    warn = not fix
+                    result = sbd.SBDTimeoutChecker(fix=fix, warn=warn).check_and_fix()
+                except sbd.FixFailure as e:
+                    logger.error('%s', e)
+                    return False
+                if result:
+                    logger.info('SBD: Check sbd timeout configuration: OK.')
+                    return True
+                else:
+                    logger.error('SBD: Check sbd timeout configuration: FAIL.')
+                    if not fix:
+                        logger.warning('Please run "crm cluster health sbd --fix"')
+                    return False
             case 'sles16':
                 try:
                     if parsed_args.fix:

diff --git a/crmsh/ui_sbd.py b/crmsh/ui_sbd.py
@@ -107,9 +107,9 @@ class SBD(command.UI):
     PCMK_ATTRS = (
         "have-watchdog",
         "stonith-timeout",
-        "stonith-enabled"
+        "stonith-enabled",
+        "stonith-watchdog-timeout"
     )
-    PCMK_ATTRS_DISKLESS = ('stonith-watchdog-timeout',)
     PARSE_RE = re.compile(
         # To extract key, suffix and value from these possible arguments:
         # watchdog-timeout=30
@@ -222,11 +222,7 @@ def _show_property(self) -> None:
         out = self.cluster_shell.get_stdout_or_raise_error("crm configure show")
 
         logger.info("crm sbd configure show property")
-        if self.device_list_from_config:
-            attrs = self.PCMK_ATTRS
-        else:
-            attrs = self.PCMK_ATTRS + self.PCMK_ATTRS_DISKLESS
-        regex = f"({'|'.join(attrs)})=(\\S+)"
+        regex = f"({'|'.join(self.PCMK_ATTRS)})=(\\S+)"
         matches = re.findall(regex, out)
         for match in matches:
             print(f"{match[0]}={match[1]}")
@@ -244,6 +240,7 @@ def _show_property(self) -> None:
         print(f"TimeoutStartUSec={systemd_start_timeout}")
 
     def _configure_show(self, args) -> None:
+        check_rc = True
         if len(args) > 2:
             raise self.SyntaxError("Invalid argument")
         elif len(args) == 2:
@@ -256,13 +253,18 @@ def _configure_show(self, args) -> None:
                     self._show_property()
                 case _:
                     raise self.SyntaxError(f"Unknown argument: {args[1]}")
+            check_rc = sbd.SBDTimeoutChecker(check_category=args[1]).check_and_fix()
         else:
             self._show_disk_metadata()
             if self.device_list_from_config:
                 print()
             self._show_sysconfig()
             print()
             self._show_property()
+            check_rc = sbd.SBDTimeoutChecker().check_and_fix()
+
+        if not check_rc:
+            logger.info('Please run "crm cluster health sbd --fix" to fix the above warning')
 
     def _parse_args(self, args: tuple[str, ...]) -> dict[str, int|str]:
         '''

diff --git a/crmsh/utils.py b/crmsh/utils.py
@@ -1858,9 +1858,10 @@ def remote_diff_this(local_path, nodes, this_node):
         if isinstance(result, crmsh.parallax.Error):
             raise ValueError("Failed on %s: %s" % (host, str(result)))
         path = result
-        _, s = ShellUtils().get_stdout("diff -U 0 -d -b --label %s --label %s %s %s" %
-                          (host, this_node, path, local_path))
-        page_string(s)
+        _, output = ShellUtils().get_stdout("diff -U 0 -d -b --label %s --label %s %s %s" %
+                                            (host, this_node, path, local_path))
+        page_string(output)
+    return output
 
 
 def remote_diff(local_path, nodes):

diff --git a/doc/crm.8.adoc b/doc/crm.8.adoc
@@ -1084,13 +1084,15 @@ Usage 2: Topic-Specified Health Check
 Verifies the health of a specified topic.
 
 ...............
-health hawk2|sles16 [--local] [--fix]
+health hawk2|sbd|sles16 [--local] [--fix]
 ...............
 
 * `hawk2`: check or fix key-based ssh authentication for user hacluster, which
   is needed by hawk2.
     ** `--fix`: attempts to automatically resolve any detected issues, eg.
        hacluster passwordless
+* `sbd`: check or fix SBD timeout-related configurations.
+    ** `--fix`: attempts to automatically resolve any detected issues.
 * `sles16`: check whether the cluster is good to migrate to SLES 16.
     ** `--local`: run checks in local mode
     ** `--fix`: attempts to automatically resolve any detected issues.
@@ -2216,6 +2218,7 @@ Main functionailities include:
 - Show contents of /etc/sysconfig/sbd
 - Show SBD related cluster properties
 - Update the SBD related configuration parameters
+- Give warnings for timeout-related misconfigurations
 - NOTE: sbd crashdump is used for debugging. Understand the risks and run `crm sbd purge crashdump` afterward
 
 For more details on SBD and related parameters, please see man sbd(8).

diff --git a/test/features/bootstrap_sbd_delay.feature b/test/features/bootstrap_sbd_delay.feature
@@ -305,3 +305,79 @@ Feature: configure sbd delay start correctly
     And     Property "priority" in "rsc_defaults" is "0"
     And     Cluster property "priority-fencing-delay" is "0"
     And     Parameter "pcmk_delay_max" not configured in "stonith-sbd"
+
+  @clean
+  Scenario: Check and fix sbd-related timeout values for disk-based sbd
+    Given   Cluster service is "stopped" on "hanode1"
+    Given   Cluster service is "stopped" on "hanode2"
+    When    Run "crm cluster init -y" on "hanode1"
+    Then    Cluster service is "started" on "hanode1"
+    When    Run "crm cluster join -c hanode1 -y" on "hanode2"
+    Then    Cluster service is "started" on "hanode2"
+    When    Run "crm cluster init sbd -s /dev/sda1 -y" on "hanode1"
+    Then    Service "sbd" is "started" on "hanode1"
+    And     Service "sbd" is "started" on "hanode2"
+    # check /etc/sysconfig/sbd consistency
+    When    Run "sed -i 's/SBD_DELAY_START=.*/SBD_DELAY_START="no"/' /etc/sysconfig/sbd" on "hanode2"
+    When    Try "crm sbd configure show"
+    Then    Expected "/etc/sysconfig/sbd is not consistent across cluster nodes" in stderr
+    When    Try "crm cluster health sbd"
+    Then    Expected "/etc/sysconfig/sbd is not consistent across cluster nodes" in stderr
+    When    Run "sed -i 's/SBD_DELAY_START=.*/SBD_DELAY_START=71/' /etc/sysconfig/sbd" on "hanode2"
+    When    Run "crm cluster health sbd" on "hanode1"
+    Then    Expected "SBD: Check sbd timeout configuration: OK" in stdout
+    # check sbd disk metadata
+    When    Run "sbd -1 15 -4 16 -d /dev/sda1 create" on "hanode1"
+    When    Try "crm sbd configure show disk_metadata" on "hanode1"
+    Then    Expected "It's recommended that msgwait(now 16) >= 2*watchdog timeout(now 15)" in stderr
+    When    Try "crm cluster health sbd" on "hanode1"
+    Then    Expected "It's recommended that msgwait(now 16) >= 2*watchdog timeout(now 15)" in stderr
+    When    Run "crm cluster health sbd --fix" on "hanode1"
+    Then    Expected "SBD: Check sbd timeout configuration: OK" in stdout
+    # check SBD_DELAY_START
+    When    Run "sed -i 's/SBD_DELAY_START=.*/SBD_DELAY_START=40/' /etc/sysconfig/sbd" on "hanode1"
+    When    Run "sed -i 's/SBD_DELAY_START=.*/SBD_DELAY_START=40/' /etc/sysconfig/sbd" on "hanode2"
+    When    Try "crm sbd configure show" on "hanode1"
+    Then    Expected "It's recommended that SBD_DELAY_START is set to 71, now is 40" in stderr
+    When    Try "crm cluster health sbd" on "hanode1"
+    Then    Expected "It's recommended that SBD_DELAY_START is set to 71, now is 40" in stderr
+    When    Run "crm cluster health sbd --fix" on "hanode1"
+    Then    Expected "SBD: Check sbd timeout configuration: OK" in stdout
+    # check stonith-timeout
+    When    Run "crm configure property stonith-timeout=50" on "hanode1"
+    When    Try "crm sbd configure show" on "hanode1"
+    Then    Expected "It's recommended that stonith-timeout is set to 71, now is 50" in stderr
+    When    Try "crm cluster health sbd" on "hanode1"
+    Then    Expected "It's recommended that stonith-timeout is set to 71, now is 50" in stderr
+    When    Run "crm cluster health sbd --fix" on "hanode1"
+    Then    Expected "SBD: Check sbd timeout configuration: OK" in stdout
+    # Adjust token timeout in corosync.conf
+    When    Run "sed -i 's/token: .*/token: 10000/' /etc/corosync/corosync.conf" on "hanode1"
+    When    Run "sed -i 's/token: .*/token: 10000/' /etc/corosync/corosync.conf" on "hanode2"
+    When    Run "corosync-cfgtool -R" on "hanode1"
+    When    Try "crm sbd configure show" on "hanode1"
+    Then    Expected "It's recommended that SBD_DELAY_START is set to 82, now is 71" in stderr
+    When    Try "crm cluster health sbd" on "hanode1"
+    Then    Expected "It's recommended that SBD_DELAY_START is set to 82, now is 71" in stderr
+    When    Run "crm cluster health sbd --fix" on "hanode1"
+    Then    Expected "SBD: Check sbd timeout configuration: OK" in stdout
+
+  @clean
+  Scenario: Check and fix sbd-related timeout values for diskless sbd
+    Given   Cluster service is "stopped" on "hanode1"
+    Given   Cluster service is "stopped" on "hanode2"
+    When    Run "crm cluster init -y" on "hanode1"
+    Then    Cluster service is "started" on "hanode1"
+    When    Run "crm cluster join -c hanode1 -y" on "hanode2"
+    Then    Cluster service is "started" on "hanode2"
+    When    Run "crm cluster init sbd -S -y" on "hanode1"
+    Then    Service "sbd" is "started" on "hanode1"
+    And     Service "sbd" is "started" on "hanode2"
+    # Delete stonith-watchdog-timeout
+    When    Delete property "stonith-watchdog-timeout" from cluster
+    When    Try "crm sbd configure show" on "hanode1"
+    Then    Expected "It's recommended that stonith-watchdog-timeout is set to 30, now is not set" in stderr
+    When    Try "crm cluster health sbd" on "hanode1"
+    Then    Expected "It's recommended that stonith-watchdog-timeout is set to 30, now is not set" in stderr
+    When    Run "crm cluster health sbd --fix" on "hanode1"
+    Then    Expected "SBD: Check sbd timeout configuration: OK" in stdout
diff --git a/test/features/bootstrap_sbd_normal.feature b/test/features/bootstrap_sbd_normal.feature
@@ -81,14 +81,12 @@ Feature: crmsh bootstrap sbd management
     Given   Cluster service is "stopped" on "hanode1"
     Given   Cluster service is "stopped" on "hanode2"
     When    Run "crm cluster init ssh -y" on "hanode1"
-    And     Run "crm cluster init csync2 -y" on "hanode1"
     And     Run "crm cluster init corosync -y" on "hanode1"
     And     Run "crm cluster init sbd -s /dev/sda1 -y" on "hanode1"
     And     Run "crm cluster init cluster -y" on "hanode1"
     Then    Cluster service is "started" on "hanode1"
     And     Service "sbd" is "started" on "hanode1"
     When    Run "crm cluster join ssh -y -c hanode1" on "hanode2"
-    And     Run "crm cluster join csync2 -y -c hanode1" on "hanode2"
     And     Run "crm cluster join ssh_merge -y -c hanode1" on "hanode2"
     And     Run "crm cluster join cluster -y -c hanode1" on "hanode2"
     Then    Cluster service is "started" on "hanode2"
@@ -100,14 +98,12 @@ Feature: crmsh bootstrap sbd management
     Given   Cluster service is "stopped" on "hanode1"
     Given   Cluster service is "stopped" on "hanode2"
     When    Run "crm cluster init ssh -y" on "hanode1"
-    And     Run "crm cluster init csync2 -y" on "hanode1"
     And     Run "crm cluster init corosync -y" on "hanode1"
     And     Run "crm cluster init sbd -S -y" on "hanode1"
     And     Run "crm cluster init cluster -y" on "hanode1"
     Then    Cluster service is "started" on "hanode1"
     And     Service "sbd" is "started" on "hanode1"
     When    Run "crm cluster join ssh -y -c hanode1" on "hanode2"
-    And     Run "crm cluster join csync2 -y -c hanode1" on "hanode2"
     And     Run "crm cluster join ssh_merge -y -c hanode1" on "hanode2"
     And     Run "crm cluster join cluster -y -c hanode1" on "hanode2"
     Then    Cluster service is "started" on "hanode2"

diff --git a/test/features/steps/step_implementation.py b/test/features/steps/step_implementation.py
@@ -479,14 +479,21 @@ def step_impl(context, key, value):
 
 @then('SBD option "{key}" value for "{dev}" is "{value}"')
 def step_impl(context, key, dev, value):
-    res = sbd.SBDTimeout.get_sbd_msgwait(dev)
+    res = sbd.SBDUtils.get_sbd_device_metadata(dev).get(key)
     assert_eq(int(value), res)
 
+
 @then('Start timeout for sbd.service is "{value}" seconds')
 def step_impl(context, value):
     systemd_start_timeout = sbd.SBDTimeout.get_sbd_systemd_start_timeout()
     assert_eq(int(value), systemd_start_timeout)
 
+
+@when('Delete property "{key}" from cluster')
+def step_impl(context, key):
+    crmutils.delete_property(key)
+
+
 @then('Cluster property "{key}" is "{value}"')
 def step_impl(context, key, value):
     res = crmutils.get_property(key)

diff --git a/test/unittests/test_bootstrap.py b/test/unittests/test_bootstrap.py
@@ -1539,12 +1539,15 @@ def test_adjust_pcmk_delay(self, mock_cib_inst, mock_run, mock_debug):
         bootstrap.adjust_pcmk_delay_max(False)
         mock_run.assert_called_once_with("crm resource param res_1 delete pcmk_delay_max")
 
-    @mock.patch('crmsh.sbd.SBDTimeout.adjust_sbd_timeout_related_cluster_configuration')
+    @mock.patch('crmsh.sbd.SBDTimeoutChecker')
     @mock.patch('crmsh.service_manager.ServiceManager.service_is_active')
-    def test_adjust_stonith_timeout_sbd(self, mock_is_active, mock_sbd_adjust_timeout):
+    def test_adjust_stonith_timeout_sbd(self, mock_is_active, mock_sbd_checker):
+        mock_sbd_checker_inst = mock.Mock()
+        mock_sbd_checker.return_value = mock_sbd_checker_inst
+        mock_sbd_checker_inst.check_and_fix = mock.Mock()
         mock_is_active.return_value = True
         bootstrap.adjust_stonith_timeout()
-        mock_sbd_adjust_timeout.assert_called_once_with()
+        mock_sbd_checker.assert_called_once_with(fix=True, warn=False, from_bootstrap=True)
 
     @mock.patch('crmsh.utils.set_property')
     @mock.patch('crmsh.bootstrap.get_stonith_timeout_generally_expected')