|
10 | 10 | from . import xmlutil |
11 | 11 | from . import watchdog |
12 | 12 | from . import parallax |
| 13 | +from . import healthcheck |
13 | 14 | from .service_manager import ServiceManager |
14 | 15 | from .sh import ShellUtils |
15 | 16 |
|
@@ -196,6 +197,7 @@ def __init__(self, context=None): |
196 | 197 | self.sbd_watchdog_timeout = self.SBD_WATCHDOG_TIMEOUT_DEFAULT |
197 | 198 | self.stonith_watchdog_timeout = None |
198 | 199 | self.two_node_without_qdevice = False |
| 200 | + self.qdevice_sync_timeout = None |
199 | 201 | if self.context: |
200 | 202 | self._initialize_timeout_in_bootstrap() |
201 | 203 |
|
@@ -292,6 +294,8 @@ def _load_configurations(self): |
292 | 294 | self.disk_based = False |
293 | 295 | self.sbd_watchdog_timeout = SBDTimeout.get_sbd_watchdog_timeout() |
294 | 296 | self.stonith_watchdog_timeout = SBDTimeout.get_stonith_watchdog_timeout_expected() |
| 297 | + if corosync.is_qdevice_configured() and ServiceManager().service_is_active("corosync-qdevice.service") |
| 298 | + self.qdevice_sync_timeout = utils.get_qdevice_sync_timeout() |
295 | 299 | self.sbd_delay_start_value_expected = self.get_sbd_delay_start_expected() if utils.detect_virt() else "no" |
296 | 300 | self.sbd_delay_start_value_from_config = SBDUtils.get_sbd_value_from_config("SBD_DELAY_START") |
297 | 301 |
|
@@ -359,53 +363,183 @@ def get_sbd_systemd_start_timeout() -> int: |
359 | 363 | out = sh.cluster_shell().get_stdout_or_raise_error(SBDTimeout.SHOW_SBD_START_TIMEOUT_CMD) |
360 | 364 | return utils.get_systemd_timeout_start_in_sec(out) |
361 | 365 |
|
362 | | - def adjust_systemd_start_timeout(self): |
| 366 | + |
class SBDTimeoutChecker(SBDTimeout):
    '''
    Check the SBD timeout related settings and, on request, adjust them.

    Every setting has a _check_* method returning True when the setting is
    acceptable, and a matching _adjust_* method that rewrites the setting.
    check_and_fix() drives the pairs in a fixed order.
    '''

    def __init__(self, fix=False, warn=True):
        '''
        fix:  when True, check_and_fix() calls the adjust method of a failed check
        warn: when True, the first run of each check logs a warning on failure
        '''
        super().__init__()
        self.fix = fix
        self.warning_during_check = warn

    def check_and_fix(self) -> bool:
        '''
        Return True if all checks pass (after optional fixes), otherwise False

        Without fix, the first failing check ends the run with False.
        With fix, the adjust method is called, the configuration is reloaded
        and the check is repeated silently; healthcheck.FixFailure is raised
        if the check still fails.
        '''
        checks_and_fixes = [
            # failure name, check function, fix function
            ("SBD disk metadata",
             self._check_sbd_disk_metadata, self._adjust_sbd_disk_metadata),
            ("SBD_WATCHDOG_TIMEOUT",
             self._check_sbd_watchdog_timeout, self._adjust_sbd_watchdog_timeout),
            ("SBD_DELAY_START",
             self._check_sbd_delay_start, self._adjust_sbd_delay_start),
            ("systemd start timeout for sbd.service",
             self._check_sbd_systemd_start_timeout, self._adjust_sbd_systemd_start_timeout),
            ("stonith-watchdog-timeout property",
             self._check_stonith_watchdog_timeout, self._adjust_stonith_watchdog_timeout),
            ("stonith-timeout property",
             self._check_stonith_timeout, self._adjust_stonith_timeout),
        ]

        self._load_configurations()
        for name, check_func, fix_func in checks_and_fixes:
            if check_func(warn=self.warning_during_check):
                continue
            if not self.fix:
                return False
            fix_func()
            # Later checks read the attributes refreshed here
            self._load_configurations()
            if not check_func(warn=False):
                raise healthcheck.FixFailure(f"Failed to fix: {name}")

        return True

    def _check_sbd_disk_metadata(self, warn=True) -> bool:
        '''
        Check msgwait and watchdog timeout for disk-based sbd

        Always True for diskless sbd.
        '''
        if not self.disk_based:
            return True
        if self.sbd_msgwait >= 2*self.sbd_watchdog_timeout:
            return True
        if warn:
            logger.warning("It's recommended that msgwait(now %d) >= 2*watchdog timeout(now %d)", self.sbd_msgwait, self.sbd_watchdog_timeout)
        return False

    def _adjust_sbd_disk_metadata(self):
        '''
        Adjust msgwait to 2*watchdog timeout for disk-based sbd
        '''
        advised_msgwait = 2*self.sbd_watchdog_timeout
        logger.info("Adjusting sbd msgwait to %d", advised_msgwait)
        cmd = f"crm sbd configure msgwait-timeout={advised_msgwait} watchdog-timeout={self.sbd_watchdog_timeout}"
        output = sh.cluster_shell().get_stdout_or_raise_error(cmd)
        if output:
            print(output)

    def _check_sbd_watchdog_timeout(self, warn=True) -> bool:
        '''
        Check SBD_WATCHDOG_TIMEOUT for disk-less sbd

        Only relevant when a qdevice sync timeout was loaded; otherwise True.
        '''
        if self.disk_based or not self.qdevice_sync_timeout:
            return True
        # "<=" rather than "<": the recommendation below is strictly greater,
        # so an equal value has to be reported as well
        if self.sbd_watchdog_timeout <= self.qdevice_sync_timeout:
            if warn:
                logger.warning("It's recommended that SBD_WATCHDOG_TIMEOUT(now %d) > qdevice sync timeout(now %d)",
                               self.sbd_watchdog_timeout, self.qdevice_sync_timeout)
            return False
        return True

    def _adjust_sbd_watchdog_timeout(self):
        '''
        Adjust SBD_WATCHDOG_TIMEOUT for disk-less sbd

        The new value is the qdevice sync timeout plus QDEVICE_SYNC_TIMEOUT_MARGIN.
        '''
        advised_watchdog_timeout = self.qdevice_sync_timeout + SBDTimeout.QDEVICE_SYNC_TIMEOUT_MARGIN
        SBDManager.update_sbd_configuration({"SBD_WATCHDOG_TIMEOUT": str(advised_watchdog_timeout)})
        logger.info("Adjusting SBD_WATCHDOG_TIMEOUT to %d", advised_watchdog_timeout)

    def _check_sbd_delay_start(self, warn=True) -> bool:
        '''
        Check SBD_DELAY_START

        Fails when the expected value is "no" but something else is configured,
        when the configured value is unset or contains no digits, or when it is
        smaller than the expected value. A configured value larger than the
        expected one is accepted.
        '''
        expected_value = str(self.sbd_delay_start_value_expected)
        config_value = self.sbd_delay_start_value_from_config
        if expected_value == config_value:
            return True
        # "not config_value" keeps re.search() away from an unset value
        needs_update = (
            expected_value == "no"
            or not config_value
            or not re.search(r'\d+', config_value)
            or int(expected_value) > int(config_value)
        )
        if not needs_update:
            # Explicit True: falling off the end would return None, which
            # check_and_fix() treats as a failed check
            return True
        if warn:
            logger.warning("It's recommended that SBD_DELAY_START is set to %s, current value is %s", expected_value, config_value)
        return False

    def _adjust_sbd_delay_start(self):
        '''
        Adjust SBD_DELAY_START
        '''
        SBDManager.update_sbd_configuration({"SBD_DELAY_START": str(self.sbd_delay_start_value_expected)})

    def _check_sbd_systemd_start_timeout(self, warn=True) -> bool:
        '''
        Check systemd start timeout for sbd.service

        Nothing to check when SBD_DELAY_START is unset or "no"; otherwise the
        systemd start timeout has to be larger than SBD_DELAY_START.
        '''
        delay_start = self.sbd_delay_start_value_from_config
        if not delay_start or delay_start == "no":
            return True
        systemd_start_timeout = SBDTimeout.get_sbd_systemd_start_timeout()
        if systemd_start_timeout > int(delay_start):
            return True
        if warn:
            logger.warning("It's recommended that systemd start timeout for sbd.service is set to %d, current value is %d",
                           int(1.2*int(delay_start)), systemd_start_timeout)
        return False

    def _adjust_sbd_systemd_start_timeout(self):
        '''
        Adjust systemd start timeout for sbd.service

        Writes a drop-in file with TimeoutSec set to 1.2*SBD_DELAY_START, syncs
        the drop-in directory and runs "systemctl daemon-reload".
        '''
        systemd_start_timeout = int(1.2*int(self.sbd_delay_start_value_from_config))
        utils.mkdirp(SBDManager.SBD_SYSTEMD_DELAY_START_DIR)
        sbd_delay_start_file = f"{SBDManager.SBD_SYSTEMD_DELAY_START_DIR}/sbd_delay_start.conf"
        utils.str2file(f"[Service]\nTimeoutSec={systemd_start_timeout}", sbd_delay_start_file)
        bootstrap.sync_path(SBDManager.SBD_SYSTEMD_DELAY_START_DIR)
        utils.cluster_run_cmd("systemctl daemon-reload")
        logger.info("Adjusted systemd start timeout for sbd.service to %d", systemd_start_timeout)

    def _check_stonith_watchdog_timeout(self, warn=True) -> bool:
        '''
        Check stonith-watchdog-timeout property

        Disk-based sbd: the property should not be set.
        Diskless sbd: the property should be set and >= 2*SBD_WATCHDOG_TIMEOUT.
        '''
        value = utils.get_property("stonith-watchdog-timeout", get_default=False)
        if self.disk_based:
            # The result depends on the property only; warn just controls logging
            if not value:
                return True
            if warn:
                logger.warning("It's recommended to remove stonith-watchdog-timeout property when using disk-based SBD")
            return False
        # An unset property is a failure too; test it before int() is called
        if not value or int(value) < 2*self.sbd_watchdog_timeout:
            if warn:
                logger.warning("It's recommended that stonith-watchdog-timeout(now %s) >= 2*SBD_WATCHDOG_TIMEOUT(now %d)",
                               value, self.sbd_watchdog_timeout)
            return False
        return True

    def _adjust_stonith_watchdog_timeout(self):
        '''
        Adjust stonith-watchdog-timeout property

        Disk-based sbd: delete the property. Diskless sbd: set it to the
        expected value.
        '''
        if self.disk_based:
            utils.delete_property("stonith-watchdog-timeout")
            logger.info("Removed stonith-watchdog-timeout property")
        else:
            advised_value = SBDTimeout.get_stonith_watchdog_timeout_expected()
            utils.set_property("stonith-watchdog-timeout", advised_value)
            logger.info("Adjusted stonith-watchdog-timeout to %d", advised_value)

    def _check_stonith_timeout(self, warn=True) -> bool:
        '''
        Check stonith-timeout property

        Fails when the property is unset or smaller than the expected value.
        '''
        value = utils.get_property("stonith-timeout", get_default=False)
        expected_value = self.get_stonith_timeout_expected()
        if not value or int(value) < expected_value:
            if warn:
                logger.warning("It's recommended that stonith-timeout is set to %d, current value is %s", expected_value, value)
            return False
        return True

    def _adjust_stonith_timeout(self):
        '''
        Adjust stonith-timeout property
        '''
        expected_value = self.get_stonith_timeout_expected()
        utils.set_property("stonith-timeout", expected_value)
        logger.info("Adjusted stonith-timeout to %d", expected_value)
409 | 543 |
|
410 | 544 |
|
411 | 545 | class SBDManager: |
@@ -497,7 +631,6 @@ def update_configuration(self) -> None: |
497 | 631 | utils.sysconfig_set(self.SYSCONFIG_SBD, **self.update_dict) |
498 | 632 | if self.cluster_is_running: |
499 | 633 | bootstrap.sync_path(self.SYSCONFIG_SBD) |
500 | | - logger.info("Already synced %s to all nodes", self.SYSCONFIG_SBD) |
501 | 634 |
|
502 | 635 | @classmethod |
503 | 636 | def update_sbd_configuration(cls, update_dict: typing.Dict[str, str]) -> None: |
|
0 commit comments