Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crmsh/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -2773,7 +2773,7 @@ def adjust_stonith_timeout(with_sbd: bool = False):
Adjust stonith-timeout for sbd and other scenarios
"""
if ServiceManager().service_is_active(constants.SBD_SERVICE) or with_sbd:
sbd.SBDTimeout.adjust_sbd_timeout_related_cluster_configuration()
sbd.SBDTimeoutChecker(fix=True, warn=False, from_bootstrap=True).check_and_fix()
else:
value = get_stonith_timeout_generally_expected()
if value:
Expand Down
348 changes: 227 additions & 121 deletions crmsh/sbd.py

Large diffs are not rendered by default.

27 changes: 20 additions & 7 deletions crmsh/ui_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@
from .sh import ShellUtils
from .ui_node import parse_option_for_nodes
from . import constants


from . import sbd
from . import log
from .utils import TerminateSubCommand

Expand Down Expand Up @@ -794,18 +793,16 @@ def do_geo_init_arbitrator(self, context, *args):
bootstrap.bootstrap_arbitrator(geo_context)
return True

@command.completers(compl.choice([
'hawk2',
'sles16',
]))
HEALTH_COMPONENTS = ['hawk2', 'sles16', 'sbd']
@command.completers(compl.choice(HEALTH_COMPONENTS))
def do_health(self, context, *args):
'''
Extensive health check.
'''
if not args:
return Cluster._do_health_legacy()
parser = argparse.ArgumentParser('health')
parser.add_argument('component', choices=['hawk2', 'sles16'])
parser.add_argument('component', choices=Cluster.HEALTH_COMPONENTS)
parser.add_argument('-f', '--fix', action='store_true')
parsed_args, remaining_args = parser.parse_known_args(args)
match parsed_args.component:
Expand Down Expand Up @@ -840,6 +837,22 @@ def do_health(self, context, *args):
logger.error("hawk2: passwordless ssh authentication: FAIL.")
logger.warning('Please run "crm cluster health hawk2 --fix"')
return False
case 'sbd':
fix = parsed_args.fix
try:
warn = not fix
result = sbd.SBDTimeoutChecker(fix=fix, warn=warn).check_and_fix()
except sbd.FixFailure as e:
logger.error('%s', e)
return False
if result:
logger.info('SBD: Check sbd timeout configuration: OK.')
return True
else:
logger.error('SBD: Check sbd timeout configuration: FAIL.')
if not fix:
logger.warning('Please run "crm cluster health sbd --fix"')
return False
case 'sles16':
try:
if parsed_args.fix:
Expand Down
16 changes: 9 additions & 7 deletions crmsh/ui_sbd.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,9 @@ class SBD(command.UI):
PCMK_ATTRS = (
"have-watchdog",
"stonith-timeout",
"stonith-enabled"
"stonith-enabled",
"stonith-watchdog-timeout"
)
PCMK_ATTRS_DISKLESS = ('stonith-watchdog-timeout',)
PARSE_RE = re.compile(
# To extract key, suffix and value from these possible arguments:
# watchdog-timeout=30
Expand Down Expand Up @@ -222,11 +222,7 @@ def _show_property(self) -> None:
out = self.cluster_shell.get_stdout_or_raise_error("crm configure show")

logger.info("crm sbd configure show property")
if self.device_list_from_config:
attrs = self.PCMK_ATTRS
else:
attrs = self.PCMK_ATTRS + self.PCMK_ATTRS_DISKLESS
regex = f"({'|'.join(attrs)})=(\\S+)"
regex = f"({'|'.join(self.PCMK_ATTRS)})=(\\S+)"
matches = re.findall(regex, out)
for match in matches:
print(f"{match[0]}={match[1]}")
Expand All @@ -244,6 +240,7 @@ def _show_property(self) -> None:
print(f"TimeoutStartUSec={systemd_start_timeout}")

def _configure_show(self, args) -> None:
check_rc = True
if len(args) > 2:
raise self.SyntaxError("Invalid argument")
elif len(args) == 2:
Expand All @@ -256,13 +253,18 @@ def _configure_show(self, args) -> None:
self._show_property()
case _:
raise self.SyntaxError(f"Unknown argument: {args[1]}")
check_rc = sbd.SBDTimeoutChecker(check_category=args[1]).check_and_fix()
else:
self._show_disk_metadata()
if self.device_list_from_config:
print()
self._show_sysconfig()
print()
self._show_property()
check_rc = sbd.SBDTimeoutChecker().check_and_fix()

if not check_rc:
logger.info('Please run "crm cluster health sbd --fix" to fix the above warning')

def _parse_args(self, args: tuple[str, ...]) -> dict[str, int|str]:
'''
Expand Down
7 changes: 4 additions & 3 deletions crmsh/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1858,9 +1858,10 @@ def remote_diff_this(local_path, nodes, this_node):
if isinstance(result, crmsh.parallax.Error):
raise ValueError("Failed on %s: %s" % (host, str(result)))
path = result
_, s = ShellUtils().get_stdout("diff -U 0 -d -b --label %s --label %s %s %s" %
(host, this_node, path, local_path))
page_string(s)
_, output = ShellUtils().get_stdout("diff -U 0 -d -b --label %s --label %s %s %s" %
(host, this_node, path, local_path))
page_string(output)
return output


def remote_diff(local_path, nodes):
Expand Down
5 changes: 4 additions & 1 deletion doc/crm.8.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -1084,13 +1084,15 @@ Usage 2: Topic-Specified Health Check
Verifies the health of a specified topic.

...............
health hawk2|sles16 [--local] [--fix]
health hawk2|sbd|sles16 [--local] [--fix]
...............

* `hawk2`: check or fix key-based ssh authentication for user hacluster, which
is needed by hawk2.
** `--fix`: attempts to automatically resolve any detected issues, e.g.
passwordless ssh authentication for hacluster
* `sbd`: check or fix SBD timeout-related configurations.
** `--fix`: attempts to automatically resolve any detected issues.
* `sles16`: check whether the cluster is good to migrate to SLES 16.
** `--local`: run checks in local mode
** `--fix`: attempts to automatically resolve any detected issues.
Expand Down Expand Up @@ -2216,6 +2218,7 @@ Main functionalities include:
- Show contents of /etc/sysconfig/sbd
- Show SBD related cluster properties
- Update the SBD related configuration parameters
- Give warnings for timeout-related misconfigurations
- NOTE: sbd crashdump is used for debugging. Understand the risks and run `crm sbd purge crashdump` afterward

For more details on SBD and related parameters, please see man sbd(8).
Expand Down
76 changes: 76 additions & 0 deletions test/features/bootstrap_sbd_delay.feature
Original file line number Diff line number Diff line change
Expand Up @@ -305,3 +305,79 @@ Feature: configure sbd delay start correctly
And Property "priority" in "rsc_defaults" is "0"
And Cluster property "priority-fencing-delay" is "0"
And Parameter "pcmk_delay_max" not configured in "stonith-sbd"

@clean
Scenario: Check and fix sbd-related timeout values for disk-based sbd
Given Cluster service is "stopped" on "hanode1"
Given Cluster service is "stopped" on "hanode2"
When Run "crm cluster init -y" on "hanode1"
Then Cluster service is "started" on "hanode1"
When Run "crm cluster join -c hanode1 -y" on "hanode2"
Then Cluster service is "started" on "hanode2"
When Run "crm cluster init sbd -s /dev/sda1 -y" on "hanode1"
Then Service "sbd" is "started" on "hanode1"
And Service "sbd" is "started" on "hanode2"
# check /etc/sysconfig/sbd consistency across cluster nodes
When Run "sed -i 's/SBD_DELAY_START=.*/SBD_DELAY_START="no"/' /etc/sysconfig/sbd" on "hanode2"
When Try "crm sbd configure show"
Then Expected "/etc/sysconfig/sbd is not consistent across cluster nodes" in stderr
When Try "crm cluster health sbd"
Then Expected "/etc/sysconfig/sbd is not consistent across cluster nodes" in stderr
When Run "sed -i 's/SBD_DELAY_START=.*/SBD_DELAY_START=71/' /etc/sysconfig/sbd" on "hanode2"
When Run "crm cluster health sbd" on "hanode1"
Then Expected "SBD: Check sbd timeout configuration: OK" in stdout
# check sbd disk metadata
When Run "sbd -1 15 -4 16 -d /dev/sda1 create" on "hanode1"
When Try "crm sbd configur show disk_metadata" on "hanode1"
Then Expected "It's recommended that msgwait(now 16) >= 2*watchdog timeout(now 15)" in stderr
When Try "crm cluster health sbd" on "hanode1"
Then Expected "It's recommended that msgwait(now 16) >= 2*watchdog timeout(now 15)" in stderr
When Run "crm cluster health sbd --fix" on "hanode1"
Then Expected "SBD: Check sbd timeout configuration: OK" in stdout
# check SBD_DELAY_START
When Run "sed -i 's/SBD_DELAY_START=.*/SBD_DELAY_START=40/' /etc/sysconfig/sbd" on "hanode1"
When Run "sed -i 's/SBD_DELAY_START=.*/SBD_DELAY_START=40/' /etc/sysconfig/sbd" on "hanode2"
When Try "crm sbd configure show" on "hanode1"
Then Expected "It's recommended that SBD_DELAY_START is set to 71, now is 40" in stderr
When Try "crm cluster health sbd" on "hanode1"
Then Expected "It's recommended that SBD_DELAY_START is set to 71, now is 40" in stderr
When Run "crm cluster health sbd --fix" on "hanode1"
Then Expected "SBD: Check sbd timeout configuration: OK" in stdout
# check stonith-timeout
When Run "crm configure property stonith-timeout=50" on "hanode1"
When Try "crm sbd configure show" on "hanode1"
Then Expected "It's recommended that stonith-timeout is set to 71, now is 50" in stderr
When Try "crm cluster health sbd" on "hanode1"
Then Expected "It's recommended that stonith-timeout is set to 71, now is 50" in stderr
When Run "crm cluster health sbd --fix" on "hanode1"
Then Expected "SBD: Check sbd timeout configuration: OK" in stdout
# Adjust token timeout in corosync.conf
When Run "sed -i 's/token: .*/token: 10000/' /etc/corosync/corosync.conf" on "hanode1"
When Run "sed -i 's/token: .*/token: 10000/' /etc/corosync/corosync.conf" on "hanode2"
When Run "corosync-cfgtool -R" on "hanode1"
When Try "crm sbd configure show" on "hanode1"
Then Expected "It's recommended that SBD_DELAY_START is set to 82, now is 71" in stderr
When Try "crm cluster health sbd" on "hanode1"
Then Expected "It's recommended that SBD_DELAY_START is set to 82, now is 71" in stderr
When Run "crm cluster health sbd --fix" on "hanode1"
Then Expected "SBD: Check sbd timeout configuration: OK" in stdout

@clean
Scenario: Check and fix sbd-related timeout values for diskless sbd
Given Cluster service is "stopped" on "hanode1"
Given Cluster service is "stopped" on "hanode2"
When Run "crm cluster init -y" on "hanode1"
Then Cluster service is "started" on "hanode1"
When Run "crm cluster join -c hanode1 -y" on "hanode2"
Then Cluster service is "started" on "hanode2"
When Run "crm cluster init sbd -S -y" on "hanode1"
Then Service "sbd" is "started" on "hanode1"
And Service "sbd" is "started" on "hanode2"
# Delete stonith-watchdog-timeout
When Delete property "stonith-watchdog-timeout" from cluster
When Try "crm sbd configure show" on "hanode1"
Then Expected "It's recommended that stonith-watchdog-timeout is set to 30, now is not set" in stderr
When Try "crm cluster health sbd" on "hanode1"
Then Expected "It's recommended that stonith-watchdog-timeout is set to 30, now is not set" in stderr
When Run "crm cluster health sbd --fix" on "hanode1"
Then Expected "SBD: Check sbd timeout configuration: OK" in stdout
4 changes: 0 additions & 4 deletions test/features/bootstrap_sbd_normal.feature
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,12 @@ Feature: crmsh bootstrap sbd management
Given Cluster service is "stopped" on "hanode1"
Given Cluster service is "stopped" on "hanode2"
When Run "crm cluster init ssh -y" on "hanode1"
And Run "crm cluster init csync2 -y" on "hanode1"
And Run "crm cluster init corosync -y" on "hanode1"
And Run "crm cluster init sbd -s /dev/sda1 -y" on "hanode1"
And Run "crm cluster init cluster -y" on "hanode1"
Then Cluster service is "started" on "hanode1"
And Service "sbd" is "started" on "hanode1"
When Run "crm cluster join ssh -y -c hanode1" on "hanode2"
And Run "crm cluster join csync2 -y -c hanode1" on "hanode2"
And Run "crm cluster join ssh_merge -y -c hanode1" on "hanode2"
And Run "crm cluster join cluster -y -c hanode1" on "hanode2"
Then Cluster service is "started" on "hanode2"
Expand All @@ -100,14 +98,12 @@ Feature: crmsh bootstrap sbd management
Given Cluster service is "stopped" on "hanode1"
Given Cluster service is "stopped" on "hanode2"
When Run "crm cluster init ssh -y" on "hanode1"
And Run "crm cluster init csync2 -y" on "hanode1"
And Run "crm cluster init corosync -y" on "hanode1"
And Run "crm cluster init sbd -S -y" on "hanode1"
And Run "crm cluster init cluster -y" on "hanode1"
Then Cluster service is "started" on "hanode1"
And Service "sbd" is "started" on "hanode1"
When Run "crm cluster join ssh -y -c hanode1" on "hanode2"
And Run "crm cluster join csync2 -y -c hanode1" on "hanode2"
And Run "crm cluster join ssh_merge -y -c hanode1" on "hanode2"
And Run "crm cluster join cluster -y -c hanode1" on "hanode2"
Then Cluster service is "started" on "hanode2"
Expand Down
9 changes: 8 additions & 1 deletion test/features/steps/step_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,14 +479,21 @@ def step_impl(context, key, value):

@then('SBD option "{key}" value for "{dev}" is "{value}"')
def step_impl(context, key, dev, value):
res = sbd.SBDTimeout.get_sbd_msgwait(dev)
res = sbd.SBDUtils.get_sbd_device_metadata(dev).get(key)
assert_eq(int(value), res)


@then('Start timeout for sbd.service is "{value}" seconds')
def step_impl(context, value):
systemd_start_timeout = sbd.SBDTimeout.get_sbd_systemd_start_timeout()
assert_eq(int(value), systemd_start_timeout)


@when('Delete property "{key}" from cluster')
def step_impl(context, key):
crmutils.delete_property(key)


@then('Cluster property "{key}" is "{value}"')
def step_impl(context, key, value):
res = crmutils.get_property(key)
Expand Down
9 changes: 6 additions & 3 deletions test/unittests/test_bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -1539,12 +1539,15 @@ def test_adjust_pcmk_delay(self, mock_cib_inst, mock_run, mock_debug):
bootstrap.adjust_pcmk_delay_max(False)
mock_run.assert_called_once_with("crm resource param res_1 delete pcmk_delay_max")

@mock.patch('crmsh.sbd.SBDTimeout.adjust_sbd_timeout_related_cluster_configuration')
@mock.patch('crmsh.sbd.SBDTimeoutChecker')
@mock.patch('crmsh.service_manager.ServiceManager.service_is_active')
def test_adjust_stonith_timeout_sbd(self, mock_is_active, mock_sbd_adjust_timeout):
def test_adjust_stonith_timeout_sbd(self, mock_is_active, mock_sbd_checker):
mock_sbd_checker_inst = mock.Mock()
mock_sbd_checker.return_value = mock_sbd_checker_inst
mock_sbd_checker_inst.check_and_fix = mock.Mock()
mock_is_active.return_value = True
bootstrap.adjust_stonith_timeout()
mock_sbd_adjust_timeout.assert_called_once_with()
mock_sbd_checker.assert_called_once_with(fix=True, warn=False, from_bootstrap=True)

@mock.patch('crmsh.utils.set_property')
@mock.patch('crmsh.bootstrap.get_stonith_timeout_generally_expected')
Expand Down
Loading