From 49e6709e6365a65313017b684acbf095c24ac382 Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Mon, 13 Apr 2026 19:08:50 +0200 Subject: [PATCH 01/15] feat: add etcd lock (#9) * patch: etcd rolling ops version * first working version * fix format * fix linting * add tenacity to integration test * remove unnecessary logs * add dataplatform as reviewes * rename and add integration tests * linting and rebase * first part of comments * more comments answered * more comments answered * fix linting job * fix UT * mark tests as only k8s * fix integration tests * use charmlibs apt * remove sans dns * add dependencies to .toml * add uv lock * add wait in itnegration tests * increate timeout * increase log count * unlimited debug-log * comments review * fix paths * migrate v1 * fix integration tests * fix integration tests * add lock and integration tests * unify operations * add tenacity * draft * fallback implementation * add sync lock and state * feat: advanced rolling ops using etcd (#364) ## Context All the code on this PR is new This implementation is based on [DA241 Spec](https://docs.google.com/document/d/1ez4h6vOOyHy5mu6xDblcBt8PPAtMe7MUp75MtgG1sns/edit?tab=t.0) - The content of `charmlibs/advanced-rollingops/src/charmlibs/advanced_rollingops/_dp_interfaces_v1.py` belongs to another library that is currently being migrated to charmlibs so you can ignore it for now. ## Summary This PR is the first part of the implementation of advanced rolling ops at cluster level. This PR includes: - Management of the client certificate used to connect to etcd - The leader unit creates a self-signed certificate with live of 50 years - Share the certificate with the other units using a peer relation - Implementation of the integration with etcd - Share the mtls certificate - Observe the `resource_created` event - Observe the `endpoints_changed` event - Management of a env file needed to connecto etcd via `etcdctl` This PR does not implement the locking mechanism. 
In here we only test that we can connect to etcd from each unit. ## Current workflow: 1. The unit make a request 2. A new background process is spawn 3. The background process dispatches a Juju hook 4. The unit observes that hook 5. The unit writes and read a value in etcd 6. If the unit was able to connect to etcd, it executes the "restart" function. This is a very simplified workflow to be able to test that the units from different apps can reach etcd. ## To do - Implement the actual locking mechanism - Figure out how to properly install etcdctl * feat: migrate rollingops v1 from charm-rolling-ops repo (#415) * define syn lock backend * fix merge * clean up * fix peer integration tests * fix integration tests * fix integration tests * docstrings * add update status handled and improve integration tests * general cleanup --- CODEOWNERS | 1 + rollingops/CHANGELOG.md | 0 rollingops/README.md | 29 + rollingops/pyproject.toml | 78 +++ .../src/charmlibs/rollingops/__init__.py | 45 ++ .../rollingops/_rollingops_manager.py | 452 +++++++++++++ .../src/charmlibs/rollingops/_version.py | 15 + .../charmlibs/rollingops/common/__init__.py | 15 + .../rollingops/common/_base_worker.py | 273 ++++++++ .../rollingops/common/_exceptions.py | 71 +++ .../charmlibs/rollingops/common/_models.py | 456 ++++++++++++++ .../src/charmlibs/rollingops/common/_utils.py | 120 ++++ .../src/charmlibs/rollingops/etcd/__init__.py | 15 + .../src/charmlibs/rollingops/etcd/_backend.py | 385 ++++++++++++ .../rollingops/etcd/_certificates.py | 168 +++++ .../src/charmlibs/rollingops/etcd/_etcd.py | 482 ++++++++++++++ .../src/charmlibs/rollingops/etcd/_etcdctl.py | 393 ++++++++++++ .../src/charmlibs/rollingops/etcd/_models.py | 213 +++++++ .../charmlibs/rollingops/etcd/_relations.py | 284 +++++++++ .../charmlibs/rollingops/etcd/_rollingops.py | 165 +++++ .../src/charmlibs/rollingops/etcd/_worker.py | 112 ++++ .../src/charmlibs/rollingops/peer/__init__.py | 15 + .../src/charmlibs/rollingops/peer/_backend.py 
| 592 ++++++++++++++++++ .../src/charmlibs/rollingops/peer/_models.py | 392 ++++++++++++ .../charmlibs/rollingops/peer/_rollingops.py | 38 ++ .../src/charmlibs/rollingops/peer/_worker.py | 101 +++ rollingops/src/charmlibs/rollingops/py.typed | 0 rollingops/tests/__init__.py | 13 + rollingops/tests/functional/conftest.py | 15 + rollingops/tests/functional/test_version.py | 21 + rollingops/tests/integration/__init__.py | 13 + .../tests/integration/charms/actions.yaml | 44 ++ rollingops/tests/integration/charms/common.py | 169 +++++ .../tests/integration/charms/k8s/actions.yaml | 1 + .../integration/charms/k8s/charmcraft.yaml | 35 ++ .../integration/charms/k8s/library/README.md | 1 + .../charms/k8s/library/pyproject.toml | 1 + .../tests/integration/charms/k8s/library/src | 1 + .../integration/charms/k8s/pyproject.toml | 1 + .../tests/integration/charms/k8s/src/charm.py | 40 ++ .../integration/charms/k8s/src/common.py | 1 + .../integration/charms/machine/actions.yaml | 1 + .../charms/machine/charmcraft.yaml | 25 + .../charms/machine/library/README.md | 1 + .../charms/machine/library/pyproject.toml | 1 + .../integration/charms/machine/library/src | 1 + .../integration/charms/machine/pyproject.toml | 1 + .../integration/charms/machine/src/charm.py | 49 ++ .../integration/charms/machine/src/common.py | 1 + .../tests/integration/charms/pyproject.toml | 19 + rollingops/tests/integration/conftest.py | 82 +++ rollingops/tests/integration/pack.sh | 31 + .../integration/test_etcd_rolling_ops.py | 502 +++++++++++++++ .../integration/test_peer_rolling_ops.py | 390 ++++++++++++ rollingops/tests/integration/utils.py | 64 ++ rollingops/tests/unit/conftest.py | 314 ++++++++++ rollingops/tests/unit/test_common_models.py | 379 +++++++++++ .../tests/unit/test_etcd_certificates.py | 152 +++++ rollingops/tests/unit/test_etcd_etcdctl.py | 94 +++ rollingops/tests/unit/test_etcd_models.py | 39 ++ .../unit/test_etcd_rollingops_in_charm.py | 352 +++++++++++ 
rollingops/tests/unit/test_peer_models.py | 144 +++++ .../unit/test_peer_rollingops_in_charm.py | 529 ++++++++++++++++ rollingops/tests/unit/test_version.py | 21 + .../tests/unit/test_version_in_charm.py | 38 ++ rollingops/uv.lock | 472 ++++++++++++++ 66 files changed, 8968 insertions(+) create mode 100644 rollingops/CHANGELOG.md create mode 100644 rollingops/README.md create mode 100644 rollingops/pyproject.toml create mode 100644 rollingops/src/charmlibs/rollingops/__init__.py create mode 100644 rollingops/src/charmlibs/rollingops/_rollingops_manager.py create mode 100644 rollingops/src/charmlibs/rollingops/_version.py create mode 100644 rollingops/src/charmlibs/rollingops/common/__init__.py create mode 100644 rollingops/src/charmlibs/rollingops/common/_base_worker.py create mode 100644 rollingops/src/charmlibs/rollingops/common/_exceptions.py create mode 100644 rollingops/src/charmlibs/rollingops/common/_models.py create mode 100644 rollingops/src/charmlibs/rollingops/common/_utils.py create mode 100644 rollingops/src/charmlibs/rollingops/etcd/__init__.py create mode 100644 rollingops/src/charmlibs/rollingops/etcd/_backend.py create mode 100644 rollingops/src/charmlibs/rollingops/etcd/_certificates.py create mode 100644 rollingops/src/charmlibs/rollingops/etcd/_etcd.py create mode 100644 rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py create mode 100644 rollingops/src/charmlibs/rollingops/etcd/_models.py create mode 100644 rollingops/src/charmlibs/rollingops/etcd/_relations.py create mode 100644 rollingops/src/charmlibs/rollingops/etcd/_rollingops.py create mode 100644 rollingops/src/charmlibs/rollingops/etcd/_worker.py create mode 100644 rollingops/src/charmlibs/rollingops/peer/__init__.py create mode 100644 rollingops/src/charmlibs/rollingops/peer/_backend.py create mode 100644 rollingops/src/charmlibs/rollingops/peer/_models.py create mode 100644 rollingops/src/charmlibs/rollingops/peer/_rollingops.py create mode 100644 
rollingops/src/charmlibs/rollingops/peer/_worker.py create mode 100644 rollingops/src/charmlibs/rollingops/py.typed create mode 100644 rollingops/tests/__init__.py create mode 100644 rollingops/tests/functional/conftest.py create mode 100644 rollingops/tests/functional/test_version.py create mode 100644 rollingops/tests/integration/__init__.py create mode 100644 rollingops/tests/integration/charms/actions.yaml create mode 100644 rollingops/tests/integration/charms/common.py create mode 120000 rollingops/tests/integration/charms/k8s/actions.yaml create mode 100644 rollingops/tests/integration/charms/k8s/charmcraft.yaml create mode 120000 rollingops/tests/integration/charms/k8s/library/README.md create mode 120000 rollingops/tests/integration/charms/k8s/library/pyproject.toml create mode 120000 rollingops/tests/integration/charms/k8s/library/src create mode 120000 rollingops/tests/integration/charms/k8s/pyproject.toml create mode 100644 rollingops/tests/integration/charms/k8s/src/charm.py create mode 120000 rollingops/tests/integration/charms/k8s/src/common.py create mode 120000 rollingops/tests/integration/charms/machine/actions.yaml create mode 100644 rollingops/tests/integration/charms/machine/charmcraft.yaml create mode 120000 rollingops/tests/integration/charms/machine/library/README.md create mode 120000 rollingops/tests/integration/charms/machine/library/pyproject.toml create mode 120000 rollingops/tests/integration/charms/machine/library/src create mode 120000 rollingops/tests/integration/charms/machine/pyproject.toml create mode 100644 rollingops/tests/integration/charms/machine/src/charm.py create mode 120000 rollingops/tests/integration/charms/machine/src/common.py create mode 100644 rollingops/tests/integration/charms/pyproject.toml create mode 100644 rollingops/tests/integration/conftest.py create mode 100755 rollingops/tests/integration/pack.sh create mode 100644 rollingops/tests/integration/test_etcd_rolling_ops.py create mode 100644 
rollingops/tests/integration/test_peer_rolling_ops.py create mode 100644 rollingops/tests/integration/utils.py create mode 100644 rollingops/tests/unit/conftest.py create mode 100644 rollingops/tests/unit/test_common_models.py create mode 100644 rollingops/tests/unit/test_etcd_certificates.py create mode 100644 rollingops/tests/unit/test_etcd_etcdctl.py create mode 100644 rollingops/tests/unit/test_etcd_models.py create mode 100644 rollingops/tests/unit/test_etcd_rollingops_in_charm.py create mode 100644 rollingops/tests/unit/test_peer_models.py create mode 100644 rollingops/tests/unit/test_peer_rollingops_in_charm.py create mode 100644 rollingops/tests/unit/test_version.py create mode 100644 rollingops/tests/unit/test_version_in_charm.py create mode 100644 rollingops/uv.lock diff --git a/CODEOWNERS b/CODEOWNERS index 369675b7a..1ca2e5bf4 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -27,6 +27,7 @@ /nginx_k8s/ @canonical/tracing-and-profiling /passwd/ @canonical/charmlibs-maintainers /pathops/ @canonical/charmlibs-maintainers +/rollingops/ @canonical/data /snap/ @canonical/charmlibs-maintainers /sysctl/ @canonical/charmlibs-maintainers /systemd/ @canonical/charmlibs-maintainers diff --git a/rollingops/CHANGELOG.md b/rollingops/CHANGELOG.md new file mode 100644 index 000000000..e69de29bb diff --git a/rollingops/README.md b/rollingops/README.md new file mode 100644 index 000000000..fbe937d06 --- /dev/null +++ b/rollingops/README.md @@ -0,0 +1,29 @@ +# charmlibs.rollingops + +The `rollingops` library. + +`rollingops` provides a rolling-operations manager for Juju charms backed by etcd. + +It coordinates operations across units by using etcd as a shared lock and queue backend, +and uses TLS client credentials to authenticate requests to the etcd cluster. + +To install, add `charmlibs-rollingops` to your Python dependencies. 
Then in your Python code, import as: + +```py +from charmlibs import rollingops +``` + +See the [reference documentation](https://documentation.ubuntu.com/charmlibs/reference/charmlibs/rollingops) for more. + +## Unit tests +```py +just python=3.12 unit rollingops +``` +## Pack +```py +just python=3.12 pack-machine rollingops +``` +## Integration tests +```py +just python=3.12 integration-machine rollingops +``` diff --git a/rollingops/pyproject.toml b/rollingops/pyproject.toml new file mode 100644 index 000000000..bd096196b --- /dev/null +++ b/rollingops/pyproject.toml @@ -0,0 +1,78 @@ +[project] +name = "charmlibs-rollingops" +description = "The charmlibs.rollingops package." +readme = "README.md" +requires-python = ">=3.12" +authors = [ + {name="Data Platform"}, +] +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Intended Audience :: Developers", + "Operating System :: POSIX :: Linux", + "Development Status :: 5 - Production/Stable", +] +dynamic = ["version"] +dependencies = [ + "ops", + "charmlibs-interfaces-tls-certificates>=1.8.1", + "charmlibs-pathops>=1.2.1", + "dpcharmlibs-interfaces==1.0.0", + "tenacity" +] + +[dependency-groups] +lint = [ # installed for `just lint rollingops` (unit, functional, and integration are also installed) + # "typing_extensions", +] +unit = [ # installed for `just unit rollingops` + "ops[testing]", +] +functional = [ # installed for `just functional rollingops` +] +integration = [ # installed for `just integration rollingops` + "jubilant", + "tenacity", + "charmlibs-apt", +] + +[project.urls] +"Repository" = "https://github.com/canonical/charmlibs" +"Issues" = "https://github.com/canonical/charmlibs/issues" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/charmlibs"] + +[tool.hatch.version] +path = "src/charmlibs/rollingops/_version.py" + +[tool.ruff] +extend = 
"../pyproject.toml" +src = ["src", "tests/unit", "tests/functional", "tests/integration"] # correctly sort local imports in tests + +[tool.ruff.lint.extend-per-file-ignores] +# add additional per-file-ignores here to avoid overriding repo-level config +"tests/**/*" = [ + # "E501", # line too long +] + +[tool.pyright] +extends = "../pyproject.toml" +include = ["src", "tests"] +exclude = ["tests/integration/.tmp/**"] +pythonVersion = "3.12" # check no python > 3.12 features are used + +[tool.charmlibs.functional] +ubuntu = [] # ubuntu versions to run functional tests with, e.g. "24.04" (defaults to just "latest") +pebble = [] # pebble versions to run functional tests with, e.g. "v1.0.0", "master" (defaults to no pebble versions) +sudo = false # whether to run functional tests with sudo (defaults to false) + +[tool.charmlibs.integration] +# tags to run integration tests with (defaults to running once with no tag, i.e. tags = ['']) +# Available in CI in tests/integration/pack.sh and integration tests as CHARMLIBS_TAG +tags = [] # Not used by the pack.sh and integration tests generated by the template diff --git a/rollingops/src/charmlibs/rollingops/__init__.py b/rollingops/src/charmlibs/rollingops/__init__.py new file mode 100644 index 000000000..d70339b58 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/__init__.py @@ -0,0 +1,45 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""The charmlibs.rollingops package.""" + +from ._rollingops_manager import RollingOpsManager +from ._version import __version__ as __version__ +from .common._exceptions import ( + RollingOpsDecodingError, + RollingOpsEtcdctlError, + RollingOpsEtcdNotConfiguredError, + RollingOpsFileSystemError, + RollingOpsInvalidLockRequestError, + RollingOpsInvalidSecretContentError, + RollingOpsLibMissingError, + RollingOpsNoRelationError, + RollingOpsSyncLockError, +) +from .common._models import OperationResult, SyncLockBackend + +__all__ = ( + 'OperationResult', + 'RollingOpsDecodingError', + 'RollingOpsEtcdNotConfiguredError', + 'RollingOpsEtcdctlError', + 'RollingOpsFileSystemError', + 'RollingOpsInvalidLockRequestError', + 'RollingOpsInvalidSecretContentError', + 'RollingOpsLibMissingError', + 'RollingOpsManager', + 'RollingOpsNoRelationError', + 'RollingOpsSyncLockError', + 'SyncLockBackend', +) diff --git a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py new file mode 100644 index 000000000..dfc33a52f --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py @@ -0,0 +1,452 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Common rolling-ops interface coordinating etcd-backed and peer-backed execution.""" + +import logging +from contextlib import contextmanager +from typing import Any + +from ops import CharmBase, Object, Relation, RelationBrokenEvent +from ops.framework import EventBase + +from charmlibs.rollingops.common._exceptions import ( + RollingOpsDecodingError, + RollingOpsInvalidLockRequestError, + RollingOpsNoRelationError, + RollingOpsSyncLockError, +) +from charmlibs.rollingops.common._models import ( + Operation, + OperationQueue, + ProcessingBackend, + RollingOpsState, + RollingOpsStatus, + SyncLockBackend, + UnitBackendState, +) +from charmlibs.rollingops.etcd._backend import EtcdRollingOpsBackend +from charmlibs.rollingops.peer._backend import PeerRollingOpsBackend +from charmlibs.rollingops.peer._models import PeerUnitOperations + +logger = logging.getLogger(__name__) + + +class RollingOpsLockGrantedEvent(EventBase): + """Custom event emitted when the background worker grants the lock.""" + + +class RollingOpsEtcdFailedEvent(EventBase): + """Custom event emitted when the etcd worker hits a fatal error.""" + + +class RollingOpsManager(Object): + """Coordinate rolling operations across etcd and peer backends. + + This object exposes a common API for queuing asynchronous rolling + operations and acquiring synchronous locks. It prefers etcd when + available, mirrors operation state into the peer relation, and falls + back to peer-based processing when etcd becomes unavailable or + inconsistent. + """ + + def __init__( + self, + charm: CharmBase, + peer_relation_name: str, + etcd_relation_name: str, + cluster_id: str, + callback_targets: dict[str, Any], + sync_lock_targets: dict[str, type[SyncLockBackend]] | None = None, + ): + """Create a rolling operations manager with etcd and peer backends. 
+ + This manager coordinates rolling operations across two backends: + + - an etcd-backed backend, used when etcd is available + - a peer-relation-backed backend, used as a fallback + + Operations are always persisted in the peer backend. When etcd is + available, operations are also mirrored to etcd and processed there. + If etcd becomes unavailable or unhealthy, this manager falls back to + the peer backend and continues processing from the mirrored state. + + Args: + charm: The charm instance owning this manager. + peer_relation_name: Name of the peer relation used for fallback + state and operation mirroring. + etcd_relation_name: Name of the relation providing etcd access. + cluster_id: Identifier used to scope etcd-backed state for this + rolling-ops instance. + callback_targets: Mapping of callback identifiers to callables + executed when queued operations are granted the lock. + sync_lock_targets: Optional mapping of sync lock backend + identifiers to backend implementations used when acquiring + synchronous locks through the peer fallback path. 
+ """ + super().__init__(charm, 'rolling-ops-manager') + + self.charm = charm + self.peer_relation_name = peer_relation_name + self.etcd_relation_name = etcd_relation_name + self._sync_lock_targets = sync_lock_targets or {} + charm.on.define_event('rollingops_lock_granted', RollingOpsLockGrantedEvent) + charm.on.define_event('rollingops_etcd_failed', RollingOpsEtcdFailedEvent) + + self.peer_backend = PeerRollingOpsBackend( + charm=charm, + relation_name=peer_relation_name, + callback_targets=callback_targets, + ) + self.etcd_backend = EtcdRollingOpsBackend( + charm=charm, + peer_relation_name=peer_relation_name, + etcd_relation_name=etcd_relation_name, + cluster_id=cluster_id, + callback_targets=callback_targets, + ) + self.framework.observe( + charm.on[self.etcd_relation_name].relation_broken, self._on_etcd_relation_broken + ) + self.framework.observe(charm.on.rollingops_lock_granted, self._on_rollingops_lock_granted) + self.framework.observe(charm.on.rollingops_etcd_failed, self._on_rollingops_etcd_failed) + self.framework.observe(charm.on.update_status, self._on_update_status) + + @property + def _peer_relation(self) -> Relation | None: + """Return the peer relation for this charm.""" + return self.model.get_relation(self.peer_relation_name) + + @property + def _backend_state(self) -> UnitBackendState: + """Return the backend selection state stored for the current unit. + + This state determines whether the current unit is managed by the etcd + backend or the peer backend, and is used to control fallback and + recovery decisions. + """ + return UnitBackendState(self.model, self.peer_relation_name, self.model.unit) + + def _on_etcd_relation_broken(self, event: RelationBrokenEvent) -> None: + """Handle the etcd relation being fully removed. + + This method stops the etcd worker process since the required + relation is no longer available. 
+ """ + self._fallback_current_unit_to_peer() + + def _select_processing_backend(self) -> ProcessingBackend: + """Choose which backend should handle new operations for this unit. + + Etcd is preferred when available, but a unit that has fallen back to + peer remains peer-managed until its pending peer work is drained. + This ensures backend transitions happen only from a clean state. + + Returns: + The selected processing backend. + """ + if not self.etcd_backend.is_available(): + logger.info('etcd backend unavailable; selecting peer backend.') + return ProcessingBackend.PEER + + if self._backend_state.is_peer_managed() and not self.peer_backend.has_pending_work(): + logger.info('etcd backend is available. Switching to etcd backend.') + return ProcessingBackend.ETCD + + if self._backend_state.is_etcd_managed(): + logger.info('etcd backend selected.') + return ProcessingBackend.ETCD + + logger.info('peer backend selected.') + return ProcessingBackend.PEER + + def _fallback_current_unit_to_peer(self) -> None: + """Move the current unit to the peer backend and resume processing there. + + This method marks the unit as peer-managed, stops the etcd worker, + and ensures that peer-based processing is running. + + It is used when etcd becomes unavailable, unhealthy, or inconsistent, + so that queued operations can continue without being lost. + """ + self._backend_state.fallback_to_peer() + self.etcd_backend.worker.stop() + self.peer_backend.ensure_processing() + + def request_async_lock( + self, + callback_id: str, + kwargs: dict[str, Any] | None = None, + max_retry: int | None = None, + ) -> None: + """Queue a rolling operation and trigger processing on the active backend. + + A new operation is created and always persisted in the peer backend. + If etcd is currently selected as the processing backend, the operation + is also mirrored to etcd and processing is triggered there. + + If persisting to etcd fails, the manager falls back to peer-based + processing. 
This guarantees that operations remain schedulable even + when etcd is unavailable. + + Args: + callback_id: Identifier of the callback to execute when the + operation is granted the rolling lock. + kwargs: Optional keyword arguments passed to the callback target. + max_retry: Optional maximum number of retries allowed for the + operation. None means infinite retries. + + Raises: + RollingOpsInvalidLockRequestError: If the callback identifier is + unknown, the operation cannot be created, or it cannot be + persisted in the peer backend. + RollingOpsNoRelationError: If the peer relation is not available. + """ + if callback_id not in self.peer_backend.callback_targets: + raise RollingOpsInvalidLockRequestError(f'Unknown callback_id: {callback_id}') + + if not self._peer_relation: + raise RollingOpsNoRelationError('No %s peer relation yet.', self.peer_relation_name) + + if kwargs is None: + kwargs = {} + + backend = self._select_processing_backend() + + try: + operation = Operation.create(callback_id, kwargs, max_retry) + except (RollingOpsDecodingError, ValueError) as e: + logger.error('Failed to create operation: %s', e) + raise RollingOpsInvalidLockRequestError('Failed to create the lock request') from e + + try: + self.peer_backend.enqueue_operation(operation) + except (RollingOpsDecodingError, ValueError) as e: + logger.error('Failed to persists operation in peer backend: %s', e) + raise RollingOpsInvalidLockRequestError( + 'Failed to persists operation in peer backend.'
+ ) from e + + if backend == ProcessingBackend.ETCD: + try: + self.etcd_backend.enqueue_operation(operation) + except Exception as e: + logger.warning( + 'Failed to persist operation in etcd backend; falling back to peer: %s', + e, + ) + backend = ProcessingBackend.PEER + + if backend == ProcessingBackend.ETCD: + self.etcd_backend.ensure_processing() + else: + self._fallback_current_unit_to_peer() + + def _on_rollingops_lock_granted(self, event: RollingOpsLockGrantedEvent) -> None: + """Handle a granted rolling lock and dispatch execution to the active backend. + + If the current unit is peer-managed, the operation is executed through + the peer backend. + + If the current unit is etcd-managed, the operation is executed through + the etcd backend. On successful execution, the result is mirrored back + to the peer relation so that peer state remains consistent and can be + used for fallback. + + If etcd execution fails or mirrored state becomes inconsistent, the + manager falls back to the peer backend and resumes processing there. + """ + if self._backend_state.is_peer_managed(): + logger.info('Executing rollingop on peer backend.') + self.peer_backend._on_rollingops_lock_granted(event) + return + outcome = None + try: + logger.info('Executing rollingop on etcd backend.') + outcome = self.etcd_backend._on_run_with_lock() + except Exception as e: + logger.warning( + 'etcd backend failed while handling rollingops_lock_granted; ' + 'falling back to peer: %s', + e, + ) + self._fallback_current_unit_to_peer() + return + + try: + self.peer_backend.mirror_outcome(outcome) + except RollingOpsDecodingError: + logger.info( + 'Inconsistencies found between peer relation and etcd. ' + 'Falling back to peer backend.' 
+ ) + self._fallback_current_unit_to_peer() + return + logger.info('Execution mirrored to peer relation.') + + def _on_rollingops_etcd_failed(self, event: RollingOpsEtcdFailedEvent) -> None: + """Fall back to peer when the etcd worker reports a fatal failure.""" + logger.warning('Received rollingops_etcd_failed; falling back to peer backend.') + self._fallback_current_unit_to_peer() + + def _get_sync_lock_backend(self, backend_id: str) -> SyncLockBackend: + """Instantiate the configured peer sync lock backend. + + Args: + backend_id: Identifier of the configured sync lock backend. + + Returns: + A new sync lock backend instance. + + Raises: + RollingOpsSyncLockError: If no backend is registered for + the given identifier. + """ + backend_cls = self._sync_lock_targets.get(backend_id, None) + if backend_cls is None: + raise RollingOpsSyncLockError(f'Unknown sync lock backend: {backend_id}.') + + return backend_cls() + + @contextmanager + def acquire_sync_lock(self, backend_id: str, timeout: int): + """Acquire a synchronous lock, using etcd when available and peer as fallback. + + This context manager first attempts to acquire the lock through the + etcd backend. If etcd is available and the lock is acquired, the + protected block is executed under the etcd lock. + + If etcd fails due to an operational error, the manager falls back to + the configured peer sync lock backend identified by `backend_id`. + If etcd acquisition times out, the timeout is propagated and no + fallback occurs. + + On context exit, the acquired lock is released through the backend + that granted it. + + Args: + backend_id: Identifier of the peer sync lock backend to use if + etcd acquisition cannot be used. + timeout: Maximum time in seconds to wait for lock acquisition. + None means infinite time. + + Yields: + None. The protected code runs while the lock is held. + + Raises: + TimeoutError: If lock acquisition through etcd or the peer backend + times out. 
+ RollingOpsSyncLockError: if there is an error when acquiring the lock. + """ + if self.etcd_backend.is_available(): + logger.info('Acquiring sync lock on etcd.') + try: + self.etcd_backend.acquire_sync_lock(timeout) + yield + return + except TimeoutError: + raise + except Exception as e: + # etcd is not reachable or unhealthy + logger.exception( + 'Failed to request etcd sync lock; falling back to peer: %s', + e, + ) + finally: + try: + self.etcd_backend.release_sync_lock() + logger.info('etcd lock released.') + except Exception as e: + logger.exception('Failed to release sync lock: %s', e) + + backend = self._get_sync_lock_backend(backend_id) + logger.info('Acquiring sync lock backend %s.', backend_id) + try: + backend.acquire(timeout=timeout) + except Exception as e: + raise RollingOpsSyncLockError( + f'Failed to acquire sync lock backend {backend_id}' + ) from e + + try: + yield + finally: + try: + backend.release() + logger.info('Sync lock backend %s released.', backend_id) + except Exception as e: + raise RollingOpsSyncLockError( + f'Failed to release sync lock backend {backend_id}' + ) from e + + @property + def state(self) -> RollingOpsState: + """Return the current rolling-ops state for this unit. + + The returned state is always based on the peer relation for the + operation queue, since peer state is the durable fallback source of + truth. + + Status is taken from the etcd backend when this unit is currently + etcd-managed. If status retrieval from etcd fails, the unit falls + back to the peer backend and peer status is returned instead. + + Returns: + A snapshot of the current rolling-ops status, backend selection, + and queued operations for this unit. 
+ """ + if self._peer_relation is None: + return RollingOpsState( + status=RollingOpsStatus.INVALID, + processing_backend=None, + operations=OperationQueue(), + ) + + status = self.peer_backend.get_status() + if self._backend_state.is_etcd_managed(): + try: + status = self.etcd_backend.get_status() + except Exception as e: + logger.exception('Failed to get status: %s', e) + self._fallback_current_unit_to_peer() + status = self.peer_backend.get_status() + + operations = PeerUnitOperations(self.model, self.peer_relation_name, self.model.unit) + return RollingOpsState( + status=status, + processing_backend=self._backend_state.backend, + operations=operations.queue, + ) + + def _on_update_status(self, _: EventBase) -> None: + """Periodic reconciliation of rolling-ops state. + + Ensures the correct backend is active, workers are running, + and fallback is triggered if etcd becomes unhealthy. + """ + if self._backend_state.is_etcd_managed(): + if not self.etcd_backend.is_available(): + logger.warning('etcd unavailable during update_status; falling back.') + self._fallback_current_unit_to_peer() + return + + try: + self.etcd_backend.ensure_processing() + except Exception as e: + logger.warning('etcd worker failed: %s; falling back.', e) + self._fallback_current_unit_to_peer() + return + + else: + self.peer_backend.ensure_processing() diff --git a/rollingops/src/charmlibs/rollingops/_version.py b/rollingops/src/charmlibs/rollingops/_version.py new file mode 100644 index 000000000..867de4948 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/_version.py @@ -0,0 +1,15 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = '0.0.0.dev0' diff --git a/rollingops/src/charmlibs/rollingops/common/__init__.py b/rollingops/src/charmlibs/rollingops/common/__init__.py new file mode 100644 index 000000000..33bb77934 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/common/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common code used by peer and etcd rolling ops.""" diff --git a/rollingops/src/charmlibs/rollingops/common/_base_worker.py b/rollingops/src/charmlibs/rollingops/common/_base_worker.py new file mode 100644 index 000000000..7dc1e46c2 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/common/_base_worker.py @@ -0,0 +1,273 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common class to manager background processes.""" + +import logging +import os +import signal +import subprocess +from sys import version_info + +from ops import CharmBase, Object, Relation + +from charmlibs import pathops +from charmlibs.rollingops.common._exceptions import RollingOpsLibMissingError +from charmlibs.rollingops.common._utils import with_pebble_retry + +logger = logging.getLogger(__name__) + + +class BaseRollingOpsAsyncWorker(Object): + """Base class for external rolling-ops worker processes. + + This class provides the common lifecycle management for background + worker processes used by rolling-ops backends. It is responsible for: + + - locating the worker script inside the charm virtualenv + - building the execution environment for the subprocess + - validating required files before startup + - starting and stopping the worker process + - persisting and retrieving the worker PID through backend-specific storage + + Subclasses define where worker state is stored, how existing workers + should be handled, and which worker script and arguments should be used. + """ + + _pid_field: str + _log_filename: str + + def __init__(self, charm: CharmBase, handle_name: str, peer_relation_name: str): + """Initialize the base rolling-ops worker helper. + + Args: + charm: The charm instance managing the worker process. + handle_name: Framework handle name used for this worker object. + peer_relation_name: Name of the peer relation used by subclasses + to store and retrieve worker state. 
+ """ + super().__init__(charm, handle_name) + self._charm = charm + self._charm_dir = charm.charm_dir + self._peer_relation_name = peer_relation_name + self._handle_name = handle_name + + @property + def _relation(self) -> Relation | None: + """Return the peer relation used for worker state.""" + return self._charm.model.get_relation(self._peer_relation_name) + + def _venv_site_packages(self) -> pathops.LocalPath: + """Return the site-packages path for the charm virtualenv. + + This path is used to locate the rolling-ops worker scripts and ensure + the spawned subprocess can import charm library code. + """ + return pathops.LocalPath( + self._charm_dir + / 'venv' + / 'lib' + / f'python{version_info.major}.{version_info.minor}' + / 'site-packages' + ) + + def _build_env(self) -> dict[str, str]: + """Build the environment used to spawn the worker subprocess. + + The worker runs outside the current Juju hook context, so the Juju + context identifier is removed from the environment. The charm virtualenv + site-packages path is also prepended to ``PYTHONPATH`` so that the + worker can import charm libraries correctly. + + Returns: + A copy of the current environment adjusted for the worker process. + """ + new_env = os.environ.copy() + new_env.pop('JUJU_CONTEXT_ID', None) + + venv_path = self._venv_site_packages() + + for loc in new_env.get('PYTHONPATH', '').split(':'): + path = pathops.LocalPath(loc) + + if path.stem != 'lib': + continue + new_env['PYTHONPATH'] = f'{venv_path.resolve()}:{new_env["PYTHONPATH"]}' + break + return new_env + + def _worker_script_path(self) -> pathops.LocalPath: + """Return the worker script path.""" + raise NotImplementedError + + def _worker_args(self) -> list[str]: + """Return additional backend-specific command-line arguments. + + Subclasses may override this to pass extra arguments required by the + worker process. + + Returns: + A list of command-line arguments to append when starting the worker. 
+ """ + return [] + + def _get_pid_str(self) -> str: + """Return the stored worker PID string. + + Returns: + The stored PID as a string, or an empty string if no PID is stored. + + Raises: + NotImplementedError: If not implemented by a subclass. + """ + raise NotImplementedError + + def _set_pid_str(self, pid: str) -> None: + """Persist the worker PID string. + + Args: + pid: The PID string to persist. An empty string clears the stored PID. + + Raises: + NotImplementedError: If not implemented by a subclass. + """ + raise NotImplementedError + + def _on_existing_worker(self, pid: int) -> bool: + """Handle case where a worker is already running. + + Returns: + True if a new worker should be started, + False if start() should return early. + """ + raise NotImplementedError + + def _validate_startup_paths(self) -> None: + """Validate that the worker runtime files exist before startup. + + This checks that the charm virtualenv site-packages directory exists + and that the backend-specific worker script is present. + + Raises: + RollingOpsLibMissingError: If the virtualenv or worker script + cannot be found. + """ + venv_path = self._venv_site_packages() + if not with_pebble_retry(lambda: venv_path.exists()): + raise RollingOpsLibMissingError( + f'Expected virtualenv site-packages not found: {venv_path}' + ) + + worker = self._worker_script_path() + if not with_pebble_retry(lambda: worker.exists()): + raise RollingOpsLibMissingError(f'Worker script not found: {worker}') + + def _is_pid_alive(self, pid: int) -> bool: + """Return whether the given PID appears to be alive.""" + if pid <= 0: + return False + try: + os.kill(pid, 0) + return True + except ProcessLookupError: + return False + except PermissionError: + return True + + def start(self) -> None: + """Start the worker subprocess if one is not already running. + + Raises: + RollingOpsLibMissingError: If the virtualenv or worker script + required to start the worker is missing. 
+ OSError: If the worker subprocess cannot be started. + """ + pid_str = self._get_pid_str() + if pid_str: + try: + pid = int(pid_str) + except (ValueError, TypeError): + pid = None + + if pid is not None and self._is_pid_alive(pid) and not self._on_existing_worker(pid): + return + + self._validate_startup_paths() + + worker = self._worker_script_path() + env = self._build_env() + + log_out = open(f'/var/log/{self._log_filename}.log', 'a') # noqa: SIM115 + pid = subprocess.Popen( + [ + '/usr/bin/python3', + '-u', + str(worker), + '--unit-name', + self.model.unit.name, + '--charm-dir', + str(self._charm_dir), + *self._worker_args(), + ], + cwd=str(self._charm_dir), + stdout=log_out, + stderr=log_out, + env=env, + ).pid + + self._set_pid_str(str(pid)) + logger.info('Started %s process with PID %s', self._handle_name, pid) + + def stop(self) -> None: + """Stop the running worker subprocess, if one is recorded. + + This method reads the stored PID, sends ``SIGTERM`` to the process, + and falls back to ``SIGKILL`` if termination fails. If the process is + already gone or the stored PID is invalid, worker state is cleaned up. + + The stored PID is cleared when the worker is successfully considered + stopped or no longer present. 
+ """ + pid_str = self._get_pid_str() + + try: + pid = int(pid_str) + except (TypeError, ValueError): + logger.info('Missing PID or invalid PID found in worker state.') + self._set_pid_str('') + return + + try: + os.kill(pid, signal.SIGTERM) + logger.info('Sent SIGTERM to rollingops worker process PID %s.', pid) + except ProcessLookupError: + logger.info('Process PID %s is already gone.', pid) + except PermissionError: + logger.warning('No permission to stop rollingops worker process PID %s.', pid) + return + except OSError: + logger.warning('SIGTERM failed for PID %s, attempting SIGKILL', pid) + try: + os.kill(pid, signal.SIGKILL) + logger.info('Sent SIGKILL to rollingops worker process PID %s', pid) + except ProcessLookupError: + logger.info('Process PID %s exited before SIGKILL', pid) + except PermissionError: + logger.warning('No permission to SIGKILL process PID %s', pid) + return + except OSError: + logger.warning('Failed to SIGKILL process PID %s', pid) + return + + self._set_pid_str('') diff --git a/rollingops/src/charmlibs/rollingops/common/_exceptions.py b/rollingops/src/charmlibs/rollingops/common/_exceptions.py new file mode 100644 index 000000000..209f0c67e --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/common/_exceptions.py @@ -0,0 +1,71 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""Exceptions used in rollingops."""
+
+
+class RollingOpsError(Exception):
+    """General rollingops error."""
+
+
+class RollingOpsNoRelationError(RollingOpsError):
+    """Raised if we are trying to process a lock, but do not appear to have a relation yet."""
+
+
+class RollingOpsNoEtcdRelationError(RollingOpsNoRelationError):
+    """Raised if we are trying to process a lock, but the etcd relation is not available yet."""
+
+
+class RollingOpsFileSystemError(RollingOpsError):
+    """Raised if there is a problem when interacting with the filesystem."""
+
+
+class RollingOpsInvalidLockRequestError(RollingOpsError):
+    """Raised if the lock request is invalid."""
+
+
+class RollingOpsDecodingError(RollingOpsError):
+    """Raised if json content cannot be processed."""
+
+
+class RollingOpsInvalidSecretContentError(RollingOpsError):
+    """Raised if the content of a secret is invalid."""
+
+
+class RollingOpsLibMissingError(RollingOpsError):
+    """Raised if the path to the libraries cannot be resolved."""
+
+
+class RollingOpsEtcdctlError(RollingOpsError):
+    """Base exception for etcdctl command failures."""
+
+
+class RollingOpsEtcdctlRetryableError(RollingOpsEtcdctlError):
+    """A transient etcdctl failure that may succeed on retry."""
+
+
+class RollingOpsEtcdNotConfiguredError(RollingOpsEtcdctlError):
+    """Raised if etcd client has not been configured yet (env file does not exist)."""
+
+
+class RollingOpsEtcdctlFatalError(RollingOpsEtcdctlError):
+    """A non-retryable etcdctl failure."""
+
+
+class RollingOpsEtcdctlParseError(RollingOpsEtcdctlError):
+    """Raised when etcdctl output cannot be parsed."""
+
+
+class RollingOpsSyncLockError(RollingOpsError):
+    """Raised when there is an error during sync lock execution."""
diff --git a/rollingops/src/charmlibs/rollingops/common/_models.py b/rollingops/src/charmlibs/rollingops/common/_models.py
new file mode 100644
index 000000000..162d9f825
--- /dev/null
+++ b/rollingops/src/charmlibs/rollingops/common/_models.py
@@ -0,0 +1,456 @@
+# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Rolling ops common models.""" + +import json +import logging +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from datetime import datetime +from enum import StrEnum +from typing import Any + +from ops import Model, Unit + +from charmlibs.rollingops.common._exceptions import ( + RollingOpsDecodingError, + RollingOpsNoRelationError, +) +from charmlibs.rollingops.common._utils import datetime_to_str, now_timestamp, parse_timestamp + +logger = logging.getLogger(__name__) + + +class OperationResult(StrEnum): + """Callback return values.""" + + RELEASE = 'release' + RETRY_RELEASE = 'retry-release' + RETRY_HOLD = 'retry-hold' + + +class ProcessingBackend(StrEnum): + """Backend responsible for processing a unit's queue.""" + + PEER = 'peer' + ETCD = 'etcd' + + +class RunWithLockStatus(StrEnum): + """Status of an attempt to execute an operation under a distributed lock. + + These values describe what happened when a unit tried to run an + operation while interacting with the lock. + """ + + NOT_GRANTED = 'not_granted' + NO_OPERATION = 'no_operation' + MISSING_CALLBACK = 'missing_callback' + EXECUTED = 'executed' + + +class RollingOpsStatus(StrEnum): + """High-level rolling-ops state for a unit. + + This status reflects whether the unit is currently executing, waiting, + or idle with respect to rolling operations. 
+ """ + + INVALID = 'invalid' + WAITING = 'waiting' + GRANTED = 'granted' + IDLE = 'idle' + + +@dataclass +class RunWithLockOutcome: + """Result of attempting to execute an operation under a distributed lock. + + This object captures both whether an operation was executed and, if so, + the identity and result of that operation. It is used to propagate + execution outcomes across backends (e.g. etcd → peer mirroring). + """ + + status: RunWithLockStatus + op_id: str | None = None + result: OperationResult | None = None + + +@dataclass +class BackendState: + """Unit-scoped backend ownership and recovery state.""" + + processing_backend: str = ProcessingBackend.PEER + etcd_cleanup_needed: str = 'false' + + @property + def cleanup_needed(self) -> bool: + """Return whether stale etcd state must be cleaned before reuse.""" + return self.etcd_cleanup_needed == 'true' + + @cleanup_needed.setter + def cleanup_needed(self, value: bool) -> None: + """Persist whether stale etcd state cleanup is required.""" + self.etcd_cleanup_needed = 'true' if value else 'false' + + @property + def backend(self) -> ProcessingBackend: + """Return which backend owns execution for this unit's queue.""" + if not self.processing_backend: + return ProcessingBackend.PEER + return ProcessingBackend(self.processing_backend) + + @backend.setter + def backend(self, value: ProcessingBackend) -> None: + """Persist the backend owner.""" + self.processing_backend = value + + +class UnitBackendState: + """Manage backend ownership and fallback state for one unit queue.""" + + def __init__(self, model: Model, relation_name: str, unit: Unit): + relation = model.get_relation(relation_name) + if relation is None: + raise RollingOpsNoRelationError() + + self._relation = relation + self.unit = unit + + def _load(self) -> BackendState: + return self._relation.load(BackendState, self.unit, decoder=lambda s: s) + + def _save(self, data: BackendState) -> None: + self._relation.save(data, self.unit, encoder=str) + + 
@property + def backend(self) -> ProcessingBackend: + """Return which backend owns execution for this unit's queue.""" + return self._load().backend + + @property + def cleanup_needed(self) -> bool: + """Return whether etcd cleanup is required before etcd can be reused.""" + return self._load().cleanup_needed + + def fallback_to_peer(self) -> None: + """Switch this unit's queue to peer processing and mark etcd cleanup needed.""" + data = self._load() + data.backend = ProcessingBackend.PEER + data.cleanup_needed = True + self._save(data) + + def clear_fallback(self) -> None: + """Clear the etcd cleanup-needed flag and set the backend to ETCD.""" + data = self._load() + data.backend = ProcessingBackend.ETCD + data.cleanup_needed = False + self._save(data) + + def is_peer_managed(self) -> bool: + """Return whether the peer backend should process this unit's queue.""" + return self.backend == ProcessingBackend.PEER + + def is_etcd_managed(self) -> bool: + """Return whether the etcd backend should process this unit's queue.""" + return self.backend == ProcessingBackend.ETCD + + +@dataclass +class Operation: + """A single queued operation.""" + + callback_id: str + requested_at: datetime + max_retry: int | None + attempt: int + result: OperationResult | None + kwargs: dict[str, Any] = field(default_factory=dict[str, Any]) + + @classmethod + def _validate_fields( + cls, callback_id: Any, kwargs: Any, requested_at: Any, max_retry: Any, attempt: Any + ) -> None: + """Validate the class attributes.""" + if not isinstance(callback_id, str) or not callback_id.strip(): + raise ValueError('callback_id must be a non-empty string') + + if not isinstance(kwargs, dict): + raise ValueError('kwargs must be a dict') + try: + json.dumps(kwargs) + except TypeError as e: + raise ValueError(f'kwargs must be JSON-serializable: {e}') from e + + if not isinstance(requested_at, datetime): + raise ValueError('requested_at must be a datetime') + + if max_retry is not None: + if not 
isinstance(max_retry, int): + raise ValueError('max_retry must be an int') + if max_retry < 0: + raise ValueError('max_retry must be >= 0') + + if not isinstance(attempt, int): + raise ValueError('attempt must be an int') + if attempt < 0: + raise ValueError('attempt must be >= 0') + + def __post_init__(self) -> None: + """Validate the class attributes.""" + self._validate_fields( + self.callback_id, + self.kwargs, + self.requested_at, + self.max_retry, + self.attempt, + ) + + @classmethod + def create( + cls, + callback_id: str, + kwargs: dict[str, Any], + max_retry: int | None = None, + ) -> 'Operation': + """Create a new operation from a callback id and kwargs.""" + return cls( + callback_id=callback_id, + kwargs=kwargs, + requested_at=now_timestamp(), + max_retry=max_retry, + attempt=0, + result=None, + ) + + def _to_dict(self) -> dict[str, str]: + """Dict form (string-only values).""" + return { + 'callback_id': self.callback_id, + 'kwargs': self._kwargs_to_json(), + 'requested_at': datetime_to_str(self.requested_at), + 'max_retry': '' if self.max_retry is None else str(self.max_retry), + 'attempt': str(self.attempt), + 'result': '' if self.result is None else self.result, + } + + def to_string(self) -> str: + """Serialize to a string suitable for a Juju databag.""" + return json.dumps(self._to_dict(), separators=(',', ':')) + + def increase_attempt(self) -> None: + """Increment the attempt counter.""" + self.attempt += 1 + + def is_max_retry_reached(self) -> bool: + """Return True if attempt exceeds max_retry (unless max_retry is None).""" + if self.max_retry is None: + return False + return self.attempt > self.max_retry + + def complete(self) -> None: + """Mark the operation as completed to indicate the lock should be released.""" + self.increase_attempt() + self.result = OperationResult.RELEASE + + def retry_release(self) -> None: + """Mark the operation for retry if it has not reached the max retry.""" + self.increase_attempt() + if 
self.is_max_retry_reached(): + logger.warning('Operation max retry reached. Dropping.') + self.result = OperationResult.RELEASE + else: + self.result = OperationResult.RETRY_RELEASE + + def retry_hold(self) -> None: + """Mark the operation for retry if it has not reached the max retry.""" + self.increase_attempt() + if self.is_max_retry_reached(): + self.result = OperationResult.RELEASE + logger.warning('Operation max retry reached. Dropping.') + else: + self.result = OperationResult.RETRY_HOLD + + @property + def op_id(self) -> str: + """Return the unique identifier for this operation.""" + return f'{datetime_to_str(self.requested_at)}-{self.callback_id}' + + @classmethod + def from_string(cls, data: str) -> 'Operation': + """Deserialize from a Juju databag string. + + Raises: + RollingOpsDecodingError: if data cannot be deserialized. + """ + try: + obj = json.loads(data) + except json.JSONDecodeError as e: + logger.error('Failed to deserialize Operation from %s: %s', data, e) + raise RollingOpsDecodingError( + 'Failed to deserialize data to create an Operation' + ) from e + return cls.from_dict(obj) + + @classmethod + def from_dict(cls, data: dict[str, str]) -> 'Operation': + """Create an Operation from its dict (etcd) representation.""" + try: + return cls( + callback_id=data['callback_id'], + requested_at=parse_timestamp(data['requested_at']), # type: ignore[reportArgumentType] + max_retry=int(data['max_retry']) if data.get('max_retry') else None, + attempt=int(data['attempt']), + kwargs=json.loads(data['kwargs']) if data.get('kwargs') else {}, + result=OperationResult(data['result']) if data.get('result') else None, + ) + + except (json.JSONDecodeError, KeyError, TypeError, ValueError) as e: + logger.error('Failed to deserialize Operation from %s: %s', data, e) + raise RollingOpsDecodingError( + 'Failed to deserialize data to create an Operation' + ) from e + + def _kwargs_to_json(self) -> str: + """Deterministic JSON serialization for kwargs.""" + return 
json.dumps(self.kwargs, sort_keys=True, separators=(',', ':')) + + def __eq__(self, other: object) -> bool: + """Equal for the operation.""" + if not isinstance(other, Operation): + return False + return self.callback_id == other.callback_id and self.kwargs == other.kwargs + + def __hash__(self) -> int: + """Hash for the operation.""" + return hash((self.callback_id, self._kwargs_to_json())) + + +class OperationQueue: + """In-memory FIFO queue of Operations with encode/decode helpers for storing in a databag.""" + + def __init__(self, operations: list[Operation] | None = None): + self.operations: list[Operation] = list(operations or []) + + def __len__(self) -> int: + """Return the number of operations in the queue.""" + return len(self.operations) + + @property + def empty(self) -> bool: + """Return True if there are no queued operations.""" + return not self.operations + + def peek(self) -> Operation | None: + """Return the first operation in the queue if it exists.""" + return self.operations[0] if self.operations else None + + def _peek_last(self) -> Operation | None: + """Return the last operation in the queue if it exists.""" + return self.operations[-1] if self.operations else None + + def dequeue(self) -> Operation | None: + """Drop the first operation in the queue if it exists and return it.""" + return self.operations.pop(0) if self.operations else None + + def increase_attempt(self) -> None: + """Increment the attempt counter for the head operation and persist it.""" + if self.empty: + return + self.operations[0].increase_attempt() + + def enqueue(self, operation: Operation) -> None: + """Append operation only if it is not equal to the tail operation.""" + last_operation = self._peek_last() + if last_operation is not None and last_operation == operation: + return + self.operations.append(operation) + + def to_string(self) -> str: + """Encode entire queue to a single string.""" + items = [op.to_string() for op in self.operations] + return 
json.dumps(items, separators=(',', ':')) + + @classmethod + def from_string(cls, data: str) -> 'OperationQueue': + """Decode queue from a string. + + Raises: + RollingOpsDecodingError: if data cannot be deserialized. + """ + if not data: + return cls() + + try: + items = json.loads(data) + except json.JSONDecodeError as e: + logger.error( + 'Failed to deserialize data to create an OperationQueue from %s: %s', data, e + ) + raise RollingOpsDecodingError( + 'Failed to deserialize data to create an OperationQueue.' + ) from e + if not isinstance(items, list) or not all(isinstance(s, str) for s in items): # type: ignore[reportUnknownVariableType] + raise RollingOpsDecodingError( + 'OperationQueue string must decode to a JSON list of strings.' + ) + + operations = [Operation.from_string(s) for s in items] # type: ignore[reportUnknownVariableType] + return cls(operations) + + +@dataclass +class RollingOpsState: + """Snapshot of the rolling-ops state for a unit. + + This aggregates the current status, the backend responsible for + processing operations, and the unit's operation queue. + """ + + status: RollingOpsStatus + processing_backend: ProcessingBackend | None + operations: OperationQueue + + +class SyncLockBackend(ABC): + """Interface for synchronous lock backends. + + Implementations provide a mechanism to acquire and release a lock + protecting a critical section. These backends are used by the + RollingOpsManager to coordinate synchronous operations within a + single unit when etcd is not available. + """ + + @abstractmethod + def acquire(self, timeout: int | None) -> None: + """Acquire the lock, blocking until it is granted or timeout expires. + + Args: + timeout: Maximum time in seconds to wait for the lock. + None means wait indefinitely. + + Raises: + TimeoutError: If the lock could not be acquired within the timeout. + """ + pass + + @abstractmethod + def release(self) -> None: + """Release the lock. 
+ + Implementations must ensure that only the lock owner can release + the lock and that any associated resources are cleaned up. + """ + pass diff --git a/rollingops/src/charmlibs/rollingops/common/_utils.py b/rollingops/src/charmlibs/rollingops/common/_utils.py new file mode 100644 index 000000000..fbf819bc0 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/common/_utils.py @@ -0,0 +1,120 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Rolling ops common functions.""" + +import logging +import subprocess +from collections.abc import Callable +from datetime import UTC, datetime +from logging.handlers import RotatingFileHandler +from typing import TypeVar + +from ops import pebble +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed + +from charmlibs.pathops import PebbleConnectionError + +logger = logging.getLogger(__name__) +T = TypeVar('T') + + +@retry( + retry=retry_if_exception_type((PebbleConnectionError, pebble.APIError, pebble.ChangeError)), + stop=stop_after_attempt(3), + wait=wait_fixed(10), + reraise=True, +) +def with_pebble_retry[T](func: Callable[[], T]) -> T: + return func() + + +def now_timestamp() -> datetime: + """UTC timestamp.""" + return datetime.now(UTC) + + +def parse_timestamp(timestamp: str) -> datetime | None: + """Parse epoch timestamp string. 
Return None on errors.""" + try: + return datetime.fromtimestamp(float(timestamp), tz=UTC) + except Exception: + return None + + +def datetime_to_str(datetime: datetime) -> str: + return str(datetime.timestamp()) + + +def setup_logging(log_file: str) -> None: + """Configure logging with file rotation. + + This sets up the root logger to write INFO-level (and above) logs + to a rotating file handler. Log files are capped at 10 MB each, + with up to 3 backup files retained. + + This functions is used in the context of the background process. + + Args: + log_file: Path to the log file where logs should be written. + """ + handler = RotatingFileHandler( + log_file, + maxBytes=10 * 1024 * 1024, # 10 MB + backupCount=3, + ) + + formatter = logging.Formatter( + '%(asctime)s [%(levelname)s] [%(process)d] %(name)s: %(message)s' + ) + handler.setFormatter(formatter) + + root = logging.getLogger() + root.setLevel(logging.INFO) + root.addHandler(handler) + + +def dispatch_hook(unit_name: str, charm_dir: str, hook_name: str) -> None: + """Execute a Juju hook on a specific unit via juju-exec. + + This function triggers a charm hook by invoking the charm's `dispatch` + script with the appropriate JUJU_DISPATCH_PATH environment variable. + + Args: + unit_name: The Juju unit name (e.g., "app/0") on which to run the hook. + charm_dir: Filesystem path to the charm directory containing the dispatch script. + hook_name: Name of the hook to dispatch (without the "hooks/" prefix). + + Raises: + subprocess.CalledProcessError: If the juju-exec command fails. + """ + run_cmd = '/usr/bin/juju-exec' + dispatch_sub_cmd = f'JUJU_DISPATCH_PATH=hooks/{hook_name} {charm_dir}/dispatch' + res = subprocess.run([run_cmd, '-u', unit_name, dispatch_sub_cmd], check=False) + res.check_returncode() + logger.info('%s hook dispatched.', hook_name) + + +def dispatch_lock_granted(unit_name: str, charm_dir: str) -> None: + """Dispatch the 'rollingops_lock_granted' hook on a unit. 
+ + Args: + unit_name: The Juju unit name (e.g., "app/0"). + charm_dir: Filesystem path to the charm directory. + + Raises: + subprocess.CalledProcessError: If the hook execution fails. + """ + hook_name = 'rollingops_lock_granted' + dispatch_hook(unit_name, charm_dir, hook_name) diff --git a/rollingops/src/charmlibs/rollingops/etcd/__init__.py b/rollingops/src/charmlibs/rollingops/etcd/__init__.py new file mode 100644 index 000000000..064b097a3 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/etcd/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Rollingops for charms using etcd.""" diff --git a/rollingops/src/charmlibs/rollingops/etcd/_backend.py b/rollingops/src/charmlibs/rollingops/etcd/_backend.py new file mode 100644 index 000000000..7fede9add --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/etcd/_backend.py @@ -0,0 +1,385 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import time +from typing import Any + +from ops import Object, Relation +from ops.charm import ( + CharmBase, + RelationCreatedEvent, + RelationDepartedEvent, +) + +from charmlibs.rollingops.common._exceptions import ( + RollingOpsInvalidLockRequestError, + RollingOpsNoEtcdRelationError, + RollingOpsSyncLockError, +) +from charmlibs.rollingops.common._models import ( + Operation, + OperationResult, + RollingOpsStatus, + RunWithLockOutcome, + RunWithLockStatus, + UnitBackendState, +) +from charmlibs.rollingops.etcd import _etcdctl as etcdctl +from charmlibs.rollingops.etcd._etcd import EtcdLease, EtcdLock, ManagerOperationStore +from charmlibs.rollingops.etcd._models import RollingOpsKeys +from charmlibs.rollingops.etcd._relations import EtcdRequiresV1, SharedClientCertificateManager +from charmlibs.rollingops.etcd._worker import EtcdRollingOpsAsyncWorker + +logger = logging.getLogger(__name__) + + +class EtcdRollingOpsBackend(Object): + """Manage rolling operations using etcd-backed coordination. + + This backend stores operation state in etcd, coordinates asynchronous + execution through an etcd-backed distributed lock, and exposes a + synchronous lock interface for critical sections. + + Each unit manages its own etcd worker process and operation queues. + Operations are scoped using a cluster identifier and a per-unit owner. + """ + + def __init__( + self, + charm: CharmBase, + peer_relation_name: str, + etcd_relation_name: str, + cluster_id: str, + callback_targets: dict[str, Any], + ): + """Initialize the etcd-backed rolling-ops backend. + + Args: + charm: The charm instance owning this backend. + peer_relation_name: Name of the peer relation used for shared + state and worker coordination. + etcd_relation_name: Name of the relation providing etcd access. + cluster_id: Identifier used to scope etcd keys for this rolling-ops + instance. 
+ callback_targets: Mapping from callback identifiers to callables + executed when an operation is granted the asynchronous lock. + """ + super().__init__(charm, 'etcd-rolling-ops-manager') + self._charm = charm + self.peer_relation_name = peer_relation_name + self.etcd_relation_name = etcd_relation_name + self.callback_targets = callback_targets + + owner = f'{self.model.uuid}-{self.model.unit.name}'.replace('/', '-') + self.worker = EtcdRollingOpsAsyncWorker( + charm, peer_relation_name=peer_relation_name, owner=owner, cluster_id=cluster_id + ) + self.keys = RollingOpsKeys.for_owner(cluster_id, owner) + + self.shared_certificates = SharedClientCertificateManager( + charm, + peer_relation_name=peer_relation_name, + ) + + self.etcd = EtcdRequiresV1( + charm, + relation_name=etcd_relation_name, + cluster_id=self.keys.cluster_prefix, + shared_certificates=self.shared_certificates, + ) + + self.keys = RollingOpsKeys.for_owner(cluster_id=cluster_id, owner=owner) + self._async_lock = EtcdLock(lock_key=self.keys.lock_key, owner=owner) + self._sync_lock = EtcdLock(lock_key=self.keys.lock_key, owner=f'{owner}:sync') + self.operations = ManagerOperationStore(self.keys, owner) + self._lease = None + + self.framework.observe( + charm.on[self.peer_relation_name].relation_departed, self._on_peer_relation_departed + ) + self.framework.observe( + charm.on[self.etcd_relation_name].relation_created, self._on_etcd_relation_created + ) + + @property + def _peer_relation(self) -> Relation | None: + """Return the peer relation for this backend.""" + return self.model.get_relation(self.peer_relation_name) + + @property + def _etcd_relation(self) -> Relation | None: + """Return the etcd relation for this backend.""" + return self.model.get_relation(self.etcd_relation_name) + + def is_available(self) -> bool: + """Return whether the etcd backend is currently usable. 
+ + The backend is considered available only if the etcd relation exists + and the etcd client has been initialized successfully. + + Returns: + True if etcd can currently be used, otherwise False. + """ + if self._etcd_relation is None: + return False + try: + etcdctl.ensure_initialized() + except Exception: + return False + return True + + def enqueue_operation(self, operation: Operation) -> None: + """Persist an operation in etcd for this unit. + + Before storing the operation, this method clears any pending fallback + state for the current unit. If the unit had previously fallen back + from etcd to peer processing and cleanup is still required, stale etcd + operation state is removed first so processing can resume from a clean + slate. + + Args: + operation: The operation to enqueue. + + Raises: + RollingOpsNoEtcdRelationError: If the etcd relation does not exist. + RollingOpsEtcdNotConfiguredError: If the etcd client has not been + configured yet. + PebbleConnectionError: If the remote container cannot be reached. + """ + if self._etcd_relation is None: + raise RollingOpsNoEtcdRelationError + + etcdctl.ensure_initialized() + + backend_state = UnitBackendState(self.model, self.peer_relation_name, self.model.unit) + if backend_state.cleanup_needed: + self.operations.clean_up() + backend_state.clear_fallback() + + self.operations.request(operation) + + def ensure_processing(self): + """Ensure that the etcd worker process is running. + + The worker is responsible for acquiring the asynchronous lock and + processing queued operations for this unit. + """ + self.worker.start() + + def _on_etcd_relation_created(self, event: RelationCreatedEvent) -> None: + """Validate that the etcdctl command is available when etcd is related. + + Args: + event: The relation-created event for the etcd relation. 
+ """ + if not etcdctl.is_etcdctl_installed(): + logger.error('%s is not installed.', etcdctl.ETCDCTL_CMD) + + def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None: + """Handle removal of a unit from the peer relation. + + If the current unit is departing, the etcd worker process is stopped + to ensure a clean shutdown and avoid leaving a stale worker running. + + Args: + event: The peer relation departed event. + """ + unit = event.departing_unit + if unit == self.model.unit: + self.worker.stop() + + def request_async_lock( + self, + callback_id: str, + kwargs: dict[str, Any] | None = None, + max_retry: int | None = None, + ) -> None: + """Queue a rolling operation and trigger asynchronous lock acquisition. + + This method creates a new operation representing a callback to execute + once the distributed lock is granted. The operation is appended to the + unit's pending operation queue stored in etcd. + + If the operation is successfully enqueued, the background worker process + responsible for acquiring the distributed lock and processing operations + is started. + + Args: + callback_id: Identifier of the registered callback to execute when + the lock is granted. + kwargs: Optional keyword arguments passed to the callback when + executed. Must be JSON-serializable. + max_retry: Maximum number of retries for the operation. + - None: retry indefinitely + - 0: do not retry on failure + + Raises: + RollingOpsInvalidLockRequestError: If the callback_id is not registered or + invalid parameters were provided. + RollingOpsNoEtcdRelationError: if the etcd relation does not exist + RollingOpsEtcdNotConfiguredError: if etcd client has not been configured yet + PebbleConnectionError: if the remote container cannot be reached. 
+ """ + if callback_id not in self.callback_targets: + raise RollingOpsInvalidLockRequestError(f'Unknown callback_id: {callback_id}') + + if not self._etcd_relation: + raise RollingOpsNoEtcdRelationError + + etcdctl.ensure_initialized() + + if kwargs is None: + kwargs = {} + + operation = Operation.create(callback_id, kwargs, max_retry) + self.operations.request(operation) + self.worker.start() + + def _on_run_with_lock(self) -> RunWithLockOutcome: + """Execute the current operation while holding the distributed lock. + + This method is triggered when the worker determines that the current + unit owns the distributed lock. The method retrieves the head operation + from the in-progress queue and executes its registered callback. + + After execution, the operation is moved to the completed queue and its + updated state is persisted. + + Returns: + A structured outcome describing whether an operation was executed + and, if so, which operation was finalized and with what result. + """ + if not self._async_lock.is_held(): + logger.info('Lock is not granted. Operation will not run.') + return RunWithLockOutcome(status=RunWithLockStatus.NOT_GRANTED) + + if not (operation := self.operations.peek_current()): + logger.info('Lock granted but there is no operation to run.') + return RunWithLockOutcome(status=RunWithLockStatus.NO_OPERATION) + + if not (callback := self.callback_targets.get(operation.callback_id)): + logger.error( + 'Operation %s target was not found. 
Releasing operation without retry.', + operation.callback_id, + ) + self.operations.finalize(operation, OperationResult.RELEASE) + return RunWithLockOutcome( + status=RunWithLockStatus.MISSING_CALLBACK, + op_id=operation.op_id, + result=OperationResult.RELEASE, + ) + logger.info( + 'Executing callback_id=%s, attempt=%s', operation.callback_id, operation.attempt + ) + + try: + result = callback(**operation.kwargs) + except Exception as e: + logger.exception('Operation failed: %s: %s', operation.callback_id, e) + result = OperationResult.RETRY_RELEASE + + match result: + case OperationResult.RETRY_HOLD: + logger.info( + 'Finished %s. Operation will be retried immediately.', operation.callback_id + ) + case OperationResult.RETRY_RELEASE: + logger.info('Finished %s. Operation will be retried later.', operation.callback_id) + case _: + logger.info('Finished %s. Lock will be released.', operation.callback_id) + result = OperationResult.RELEASE + + self.operations.finalize(operation, result) + return RunWithLockOutcome( + status=RunWithLockStatus.EXECUTED, + op_id=operation.op_id, + result=result, + ) + + def acquire_sync_lock(self, timeout: int | None) -> None: + """Acquire the etcd-backed synchronous lock for this unit. + + A dedicated lease is granted and kept alive for the duration of the + lock. The backend then repeatedly attempts to acquire the sync lock + until it succeeds or the timeout expires. + + Args: + timeout: Maximum time in seconds to wait for the lock. + None means wait indefinitely. + + Raises: + TimeoutError: If the lock could not be acquired before the timeout. + RollingOpsSyncLockError: if there was an error obtaining the lock. 
+ """ + self._lease = EtcdLease() + + deadline = None if timeout is None else time.monotonic() + timeout + + try: + self._lease.grant() + while True: + try: + if self._sync_lock.try_acquire(self._lease.id): # type: ignore[reportArgumentType] + logger.info('etcd lock acquired.') + return + except Exception: + logger.exception('Failed while trying to acquire etcd sync lock.') + raise + + if deadline is not None and time.monotonic() >= deadline: + raise TimeoutError(f'Timed out acquiring etcd sync lock after {timeout}s.') + + time.sleep(15) + + except Exception as e: + try: + self._lease.revoke() + except Exception: + logger.exception('Failed to revoke lease %s.', self._lease.id) + raise RollingOpsSyncLockError('Failed to acquire the etcd sync lock') from e + + def release_sync_lock(self) -> None: + """Release the synchronous lock and revoke its lease.""" + self._sync_lock.release() + if self._lease is not None: + self._lease.revoke() + + def get_status(self) -> RollingOpsStatus: + """Return the rolling-ops status for this unit in etcd mode. + + Status is derived from the current etcd-backed lock state and the + unit's queued operation state. + + Returned values: + - INVALID: the peer or etcd relation is missing + - GRANTED: the async lock is currently held by this unit + - WAITING: this unit has queued work but does not hold the lock + - IDLE: this unit has no pending work + + Returns: + The current rolling-ops status for this unit. 
+ """ + if self._peer_relation is None or self._etcd_relation is None: + return RollingOpsStatus.INVALID + + etcdctl.ensure_initialized() + + if self._async_lock.is_held(): + return RollingOpsStatus.GRANTED + + if self.operations.has_pending_work(): + return RollingOpsStatus.WAITING + + return RollingOpsStatus.IDLE diff --git a/rollingops/src/charmlibs/rollingops/etcd/_certificates.py b/rollingops/src/charmlibs/rollingops/etcd/_certificates.py new file mode 100644 index 000000000..e408731ae --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/etcd/_certificates.py @@ -0,0 +1,168 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Manage generation and persistence of TLS certificates for etcd client access. + +This file contains functions responsible for creating and storing a client Certificate +Authority (CA) and a client certificate/key pair used to authenticate +with etcd via TLS. Certificates are generated only once and persisted +under a local directory so they can be reused across charm executions. + +Certificates are valid for 50 years. They are not renewed or rotated. 
+""" + +from datetime import timedelta + +from charmlibs import pathops +from charmlibs.interfaces.tls_certificates import ( + Certificate, + CertificateRequestAttributes, + CertificateSigningRequest, + PrivateKey, + TLSCertificatesError, +) +from charmlibs.rollingops.common._exceptions import RollingOpsFileSystemError +from charmlibs.rollingops.common._utils import with_pebble_retry +from charmlibs.rollingops.etcd._models import SharedCertificate + +BASE_DIR = pathops.LocalPath('/var/lib/rollingops/tls') +CA_CERT_PATH = BASE_DIR / 'client-ca.pem' +CLIENT_KEY_PATH = BASE_DIR / 'client.key' +CLIENT_CERT_PATH = BASE_DIR / 'client.pem' +VALIDITY_DAYS = 365 * 50 +KEY_SIZE = 4096 + + +def persist_client_cert_key_and_ca(shared: SharedCertificate) -> None: + """Persist the provided client certificate, key, and CA to disk. + + Raises: + PebbleConnectionError: if the remote container cannot be reached + RollingOpsFileSystemError: if there is a problem when writing the certificates + """ + if _has_client_cert_key_and_ca(shared): + return + try: + with_pebble_retry(lambda: BASE_DIR.mkdir(parents=True, exist_ok=True)) + shared.write_to_paths(CLIENT_CERT_PATH, CLIENT_KEY_PATH, CA_CERT_PATH) + + except (FileNotFoundError, LookupError, NotADirectoryError, PermissionError) as e: + raise RollingOpsFileSystemError('Failed to persist client certificates and key.') from e + + +def _has_client_cert_key_and_ca(shared: SharedCertificate) -> bool: + """Return whether the provided certificate material matches local files. 
+ + Raises: + PebbleConnectionError: if the remote container cannot be reached + RollingOpsFileSystemError: if there is a problem when writing the certificates + """ + if not _exists(): + return False + try: + stored = SharedCertificate.from_paths( + CLIENT_CERT_PATH, + CLIENT_KEY_PATH, + CA_CERT_PATH, + ) + return stored == shared + + except ( + FileNotFoundError, + IsADirectoryError, + PermissionError, + TLSCertificatesError, + ValueError, + ) as e: + raise RollingOpsFileSystemError('Failed to read certificates and key.') from e + + +def generate(common_name: str) -> SharedCertificate: + """Generate a client CA and client certificate if they do not exist. + + This method creates: + 1. A CA private key and self-signed CA certificate. + 2. A client private key. + 3. A certificate signing request (CSR) using the provided common name. + 4. A client certificate signed by the generated CA. + + The generated files are written to disk and reused in future runs. + If the certificates already exist, this method does nothing. + + Args: + common_name: Common Name (CN) used in the client certificate + subject. This value should not contain slashes. 
+ + Raises: + PebbleConnectionError: if the remote container cannot be reached + RollingOpsFileSystemError: if there is a problem when writing the certificates + """ + if _exists(): + return SharedCertificate.from_paths( + CLIENT_CERT_PATH, + CLIENT_KEY_PATH, + CA_CERT_PATH, + ) + + ca_key = PrivateKey.generate(key_size=KEY_SIZE) + ca_attributes = CertificateRequestAttributes( + common_name=common_name, + is_ca=True, + add_unique_id_to_subject_name=False, + ) + ca_crt = Certificate.generate_self_signed_ca( + attributes=ca_attributes, + private_key=ca_key, + validity=timedelta(days=VALIDITY_DAYS), + ) + + client_key = PrivateKey.generate(key_size=KEY_SIZE) + + csr_attributes = CertificateRequestAttributes( + common_name=common_name, add_unique_id_to_subject_name=False + ) + csr = CertificateSigningRequest.generate( + attributes=csr_attributes, + private_key=client_key, + ) + + client_crt = Certificate.generate( + csr=csr, + ca=ca_crt, + ca_private_key=ca_key, + validity=timedelta(days=VALIDITY_DAYS), + is_ca=False, + ) + + shared = SharedCertificate( + certificate=client_crt, + key=client_key, + ca=ca_crt, + ) + + persist_client_cert_key_and_ca(shared) + return shared + + +def _exists() -> bool: + """Check whether the client certificates and CA certificate already exist. + + Raises: + PebbleConnectionError: if the remote container cannot be reached + """ + return ( + with_pebble_retry(lambda: CA_CERT_PATH.exists()) + and with_pebble_retry(lambda: CLIENT_KEY_PATH.exists()) + and with_pebble_retry(lambda: CLIENT_CERT_PATH.exists()) + ) diff --git a/rollingops/src/charmlibs/rollingops/etcd/_etcd.py b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py new file mode 100644 index 000000000..6a38f949e --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py @@ -0,0 +1,482 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Classes that manage etcd concepts.""" + +import json +import logging +import subprocess +import time + +import charmlibs.rollingops.etcd._etcdctl as etcdctl +from charmlibs.rollingops.common._models import Operation, OperationResult +from charmlibs.rollingops.etcd._models import RollingOpsKeys + +logger = logging.getLogger(__name__) + +LOCK_LEASE_TTL = 60 + + +class EtcdLease: + """Manage the lifecycle of an etcd lease and its keep-alive process.""" + + def __init__(self): + self.id: str | None = None + self.keepalive_proc: subprocess.Popen[str] | None = None + + def grant(self) -> None: + """Create a new lease and start the keep-alive process.""" + res = etcdctl.run('lease', 'grant', str(LOCK_LEASE_TTL)) + # parse: "lease 694d9c9aeca3422a granted with TTL(1800s)" + parts = res.split() + self.id = parts[1] + logger.info('%s', res) + self._start_lease_keepalive() + + def revoke(self) -> None: + """Revoke the current lease and stop the keep-alive process.""" + lease_id = self.id + try: + if self.id is not None: + etcdctl.run('lease', 'revoke', self.id) + except Exception: + logger.exception('Fail to revoke lease %s.', lease_id) + raise + finally: + try: + self._stop_keepalive() + except Exception: + logger.exception('Fail to stop keepalive for lease %s.', lease_id) + finally: + self.id = None + + def _start_lease_keepalive(self) -> None: + """Start the background process that keeps the lease alive.""" + lease_id = self.id + if lease_id is None: + logger.info('Lease ID is None. 
Keepalive for this lease cannot be started.') + return + etcdctl.ensure_initialized() + self.keepalive_proc = subprocess.Popen( + [etcdctl.ETCDCTL_CMD, 'lease', 'keep-alive', lease_id], + env=etcdctl.load_env(), + stdin=subprocess.PIPE, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + text=True, + ) + logger.info('Keepalive started for lease %s.', self.id) + + def _stop_keepalive(self) -> None: + """Terminate the keep-alive subprocess if it is running.""" + if self.keepalive_proc is None: + return + try: + self.keepalive_proc.terminate() + except ProcessLookupError: + # Already dead + return + except Exception: + try: + self.keepalive_proc.wait(timeout=2) + except subprocess.TimeoutExpired: + logger.exception('Fail to stop keepalive for lease %s.') + self.keepalive_proc.kill() + return + finally: + self.keepalive_proc = None + + +class EtcdLock: + """Distributed lock implementation backed by etcd. + + The lock is represented by a key whose value identifies the current owner. + + Lock acquisition and release are performed using transactions to + ensure atomicity. + + The lock is attached to an etcd lease so that it is + automatically released if the owner stops refreshing the lease. + """ + + def __init__(self, lock_key: str, owner: str): + self.lock_key = lock_key + self.owner = owner + + def try_acquire(self, lease_id: str) -> bool: + """Attempt to acquire the lock. + + This method uses an etcd transaction that succeeds only if the + lock key does not yet exist. If successful, the lock key is created with the current + owner as its value and is attached to the provided lease. + + Args: + lease_id: ID of the etcd lease to associate with the lock. + + Returns: + True if the lock was successfully acquired, otherwise False. 
+ """ + txn = f"""\ + version("{self.lock_key}") = "0" + + put "{self.lock_key}" "{self.owner}" --lease={lease_id} + + + """ + return etcdctl.txn(txn) + + def release(self) -> None: + """Release the lock if it is currently held by this owner. + + The lock is removed only if the value of the lock key matches + the current owner. This prevents one process from accidentally + releasing a lock held by another owner. + """ + txn = f"""\ + value("{self.lock_key}") = "{self.owner}" + + del "{self.lock_key}" + + + """ + etcdctl.txn(txn) + + def is_held(self) -> bool: + """Check whether the lock is currently held by this owner.""" + res = etcdctl.run('get', self.lock_key, '--print-value-only') + return res == self.owner + + +class EtcdOperationQueue: + """Queue abstraction for operations stored in etcd. + + This class represents a queue of operations stored under a common + key prefix in etcd. Each operation is stored as a key-value pair + where the key encodes the operation identifier and ordering, and + the value contains the serialized operation data. + """ + + def __init__(self, prefix: str, lock_key: str, owner: str): + self.prefix = prefix + self.lock_key = lock_key + self.owner = owner + + def peek(self) -> Operation | None: + """Return the first operation in the queue without removing it.""" + kv = etcdctl.get_first_key_value_pair(self.prefix) + if kv is None: + return None + return Operation.from_dict(kv.value) + + def _peek_last(self) -> Operation | None: + """Return the last operation in the queue without removing it.""" + kv = etcdctl.get_last_key_value_pair(self.prefix) + if kv is None: + return None + return Operation.from_dict(kv.value) + + def move_head(self, to_queue_prefix: str) -> bool: + """Move the first operation in the queue to another queue. + + This operation is performed atomically using an etcd transaction. + The transaction succeeds only if: + - The lock is currently held by the configured owner. + - The head operation still exists. 
+ + Args: + to_queue_prefix: Destination queue prefix. + + Returns: + True if the operation was moved successfully, otherwise False. + """ + kv = etcdctl.get_first_key_value_pair(self.prefix) + if kv is None: + return False + + op_id = kv.key.split('/')[-1] + new_key = f'{to_queue_prefix}{op_id}' + data = json.dumps(kv.value) + value_escaped = data.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n') + + txn = f"""\ + value("{self.lock_key}") = "{self.owner}" + version("{kv.key}") != "0" + + put "{new_key}" "{value_escaped}" + del "{kv.key}" + + + """ + return etcdctl.txn(txn) + + def move_operation(self, to_queue_prefix: str, operation: Operation) -> bool: + """Move a specific operation from this queue to another queue. + + The operation is identified using its operation ID and moved + atomically via an etcd transaction. + + Args: + to_queue_prefix: Destination queue prefix. + operation: Operation to move. + + Returns: + True if the operation was successfully moved, otherwise False. + """ + old_key = f'{self.prefix}{operation.op_id}' + new_key = f'{to_queue_prefix}{operation.op_id}' + + data = operation.to_string() + value_escaped = data.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n') + + txn = f"""\ + value("{self.lock_key}") = "{self.owner}" + version("{old_key}") != "0" + + put "{new_key}" "{value_escaped}" + del "{old_key}" + + + """ + return etcdctl.txn(txn) + + def watch(self) -> Operation: + """Block until at least one operation exists and return it.""" + while True: + kv = etcdctl.get_first_key_value_pair(self.prefix) + if kv is not None: + return Operation.from_dict(kv.value) + time.sleep(10) + + def dequeue(self) -> bool: + """Remove the first operation from the queue. + + The removal is performed using an etcd transaction that ensures + the lock owner still holds the lock and the operation exists. + + Returns: + True if the operation was removed successfully, otherwise False. 
+ """ + kv = etcdctl.get_first_key_value_pair(self.prefix) + if kv is None: + return False + + txn = f"""\ + value("{self.lock_key}") = "{self.owner}" + version("{kv.key}") != "0" + + del "{kv.key}" + + + """ + return etcdctl.txn(txn) + + def enqueue(self, operation: Operation) -> None: + """Insert a new operation into the queue. + + The method avoids inserting duplicate operations by comparing + the new operation with the last operation currently in the queue. + + Args: + operation: Operation to insert. + """ + old_operation = self._peek_last() + + if old_operation is not None and operation == old_operation: + logger.info( + 'Operation %s not added to the etcd queue. ' + 'It already exists in the back of the queue.', + operation.callback_id, + ) + return + + op_str = operation.to_string() + key = f'{self.prefix}{operation.op_id}' + etcdctl.run('put', key, op_str) + logger.info('Operation %s added to the etcd queue.', operation.callback_id) + + def clear(self) -> None: + etcdctl.run('del', self.prefix, '--prefix') + + +class WorkerOperationStore: + """Background-worker view of etcd-backed rolling operations. + + This class is used by the background process that coordinates lock + ownership and operation execution. 
It manages the lifecycle of queued + operations across the etcd-backed queue prefixes: + + - pending: operations waiting to be claimed + - in-progress: operations currently being executed + - completed: operations that finished execution and await post-processing + + It provides worker-oriented methods to: + - detect pending work + - claim the next operation for execution + - wait for completed operations + - requeue or delete completed operations + """ + + def __init__(self, keys: RollingOpsKeys, owner: str): + self._pending = EtcdOperationQueue(keys.pending, keys.lock_key, owner) + self._inprogress = EtcdOperationQueue(keys.inprogress, keys.lock_key, owner) + self._completed = EtcdOperationQueue(keys.completed, keys.lock_key, owner) + + def has_pending(self) -> bool: + """Check whether there are pending operations. + + Returns: + True if at least one operation exists in the pending queue, + otherwise False. + """ + return self._pending.peek() is not None + + def has_inprogress(self) -> bool: + """Check whether there are in-progress operations. + + Returns: + True if at least one operation exists in the inprogress queue, + otherwise False. + """ + return self._inprogress.peek() is not None + + def has_completed(self) -> bool: + """Check whether there are completed operations. + + Returns: + True if at least one operation exists in the completed queue, + otherwise False. + """ + return self._completed.peek() is not None + + def claim_next(self) -> bool: + """Move the next pending operation to the in-progress queue. + + This operation is performed atomically and only succeeds if: + - the lock is still held by this owner + - the head of the pending queue has not changed + + Returns: + True if the operation was successfully claimed, + otherwise False. 
+ """ + return self._pending.move_head(self._inprogress.prefix) + + def wait_until_completed(self) -> Operation: + """Block until at least one operation appears in the completed queue.""" + return self._completed.watch() + + def requeue_completed(self) -> bool: + """Requeue the head completed operation back to the pending queue. + + This is typically used when an operation needs to be retried + (e.g., RETRY_RELEASE or RETRY_HOLD semantics). + + Returns: + True if the operation was successfully moved back to pending, + otherwise False. + """ + return self._completed.move_head(self._pending.prefix) + + def delete_completed(self) -> bool: + """Remove the head operation from the completed queue. + + This is typically used when an operation has finished successfully + and does not need to be retried. + + Returns: + True if the operation was successfully removed, + otherwise False. + """ + return self._completed.dequeue() + + +class ManagerOperationStore: + """Charm-facing interface for requesting and finalizing etcd-backed operations. + + This class is used by the RollingOps manager running inside the charm. + It provides a narrow interface for interacting with the etcd-backed + operation queues without exposing the full queue topology. + + The manager can use it to: + - request a new operation + - inspect the current in-progress operation + - finalize an operation after execution + + Queue transitions and storage details remain encapsulated behind this API. + """ + + def __init__(self, keys: RollingOpsKeys, owner: str): + self._pending = EtcdOperationQueue(keys.pending, keys.lock_key, owner) + self._inprogress = EtcdOperationQueue(keys.inprogress, keys.lock_key, owner) + self._completed = EtcdOperationQueue(keys.completed, keys.lock_key, owner) + + def request(self, operation: Operation) -> None: + """Add a new operation to the pending queue. + + Duplicate operations (same callback_id and kwargs as the last queued + operation) are not inserted. 
+ + Args: + operation: Operation to enqueue. + """ + self._pending.enqueue(operation) + + def finalize(self, operation: Operation, result: OperationResult) -> bool: + """Move an in-progress operation to the completed queue. + + This should be called after the operation has been executed and its + result has been recorded. + + Args: + operation: The operation currently in the in-progress queue. + result: Result of the executions. + """ + match result: + case OperationResult.RETRY_HOLD: + operation.retry_hold() + case OperationResult.RETRY_RELEASE: + operation.retry_release() + case _: + operation.complete() + + return self._inprogress.move_operation(self._completed.prefix, operation) + + def peek_current(self) -> Operation | None: + """Return the current in-progress operation without modifying state. + + Returns: + The current in-progress operation, or None if no operation is + being processed. + """ + return self._inprogress.peek() + + def has_pending_work(self) -> bool: + """Return whether there is an operation currently being processed. + + Returns: + True if there is a current operation, otherwise False. + """ + return self.peek_current() is not None + + def clean_up(self) -> None: + """Clear all operation queues for this unit. + + This removes all in-progress, pending, and completed operations, + resetting the local etcd-backed state. It is typically used when + recovering from inconsistencies or after switching backends to + ensure a clean starting point. + """ + self._inprogress.clear() + self._pending.clear() + self._completed.clear() diff --git a/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py b/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py new file mode 100644 index 000000000..c191d608c --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py @@ -0,0 +1,393 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functions for interacting with etcd through the etcdctl CLI. + +The functions in this file manage the environment variables required for +connecting to an etcd cluster, including TLS configuration, and provide +convenience functions for executing commands and retrieving structured results. +""" + +import json +import logging +import os +import shutil +import subprocess +from dataclasses import asdict +from functools import lru_cache + +from tenacity import ( + before_sleep_log, + retry, + retry_if_exception_type, + stop_after_attempt, + wait_fixed, +) + +from charmlibs import pathops +from charmlibs.rollingops.common._exceptions import ( + RollingOpsEtcdctlFatalError, + RollingOpsEtcdctlParseError, + RollingOpsEtcdctlRetryableError, + RollingOpsEtcdNotConfiguredError, + RollingOpsFileSystemError, +) +from charmlibs.rollingops.common._utils import with_pebble_retry +from charmlibs.rollingops.etcd._models import CERT_MODE, EtcdConfig, EtcdKV + +logger = logging.getLogger(__name__) + +BASE_DIR = pathops.LocalPath('/var/lib/rollingops/etcd') +SERVER_CA_PATH = BASE_DIR / 'server-ca.pem' +CONFIG_FILE_PATH = BASE_DIR / 'etcdctl.json' +ETCDCTL_CMD = 'etcdctl' +ETCDCTL_TIMEOUT_SECONDS = 15 +ETCDCTL_RETRY_ATTEMPTS = 12 +ETCDCTL_RETRY_WAIT_SECONDS = 5 + + +@lru_cache(maxsize=1) +def is_etcdctl_installed() -> bool: + """Return whether the snap-provided etcdctl command is available.""" + return shutil.which(ETCDCTL_CMD) is not None + + +def write_trusted_server_ca(tls_ca_pem: str) -> None: + """Persist the etcd server CA certificate to disk. 
+
+    Args:
+        tls_ca_pem: PEM-encoded CA certificate.
+
+    Raises:
+        PebbleConnectionError: if the remote container cannot be reached
+        RollingOpsFileSystemError: if there is a problem when writing the certificates
+    """
+    try:
+        with_pebble_retry(lambda: BASE_DIR.mkdir(parents=True, exist_ok=True))
+        with_pebble_retry(lambda: SERVER_CA_PATH.write_text(tls_ca_pem, mode=CERT_MODE))
+    except (FileNotFoundError, LookupError, NotADirectoryError, PermissionError) as e:
+        raise RollingOpsFileSystemError('Failed to persist etcd trusted CA certificate.') from e
+
+
+def write_config_file(
+    endpoints: str,
+    client_cert_path: pathops.LocalPath,
+    client_key_path: pathops.LocalPath,
+) -> None:
+    """Create or update the etcdctl configuration JSON file.
+
+    This function writes a JSON file containing the required ETCDCTL_*
+    variables used by etcdctl to connect to the etcd cluster.
+
+    Args:
+        endpoints: Comma-separated list of etcd endpoints.
+        client_cert_path: Path to the client certificate.
+        client_key_path: Path to the client private key.
+
+    Raises:
+        PebbleConnectionError: if the remote container cannot be reached
+        RollingOpsFileSystemError: if there is a problem when writing the config file
+    """
+    config = EtcdConfig(
+        endpoints=endpoints,
+        cacert_path=str(SERVER_CA_PATH),
+        cert_path=str(client_cert_path),
+        key_path=str(client_key_path),
+    )
+
+    try:
+        with_pebble_retry(lambda: BASE_DIR.mkdir(parents=True, exist_ok=True))
+        with_pebble_retry(
+            lambda: CONFIG_FILE_PATH.write_text(json.dumps(asdict(config), indent=2), mode=0o600)
+        )
+    except (FileNotFoundError, LookupError, NotADirectoryError, PermissionError) as e:
+        raise RollingOpsFileSystemError('Failed to persist etcd config file.') from e
+
+
+def _load_config() -> EtcdConfig:
+    """Load etcd configuration from disk.
+
+    Raises:
+        RollingOpsEtcdNotConfiguredError: If the config file does not exist.
+        RollingOpsFileSystemError: if we fail to read the etcd configuration file or
+            file cannot be deserialized.
+        PebbleConnectionError: if the remote container cannot be reached
+    """
+    if not with_pebble_retry(lambda: CONFIG_FILE_PATH.exists()):
+        raise RollingOpsEtcdNotConfiguredError(
+            f'etcdctl config file does not exist: {CONFIG_FILE_PATH}'
+        )
+
+    try:
+        data = json.loads(CONFIG_FILE_PATH.read_text())
+        return EtcdConfig(**data)
+    except FileNotFoundError as e:
+        raise RollingOpsEtcdNotConfiguredError('etcd configuration file not found.') from e
+    except (IsADirectoryError, PermissionError) as e:
+        raise RollingOpsFileSystemError('Failed to read the etcd config file.') from e
+    except (json.JSONDecodeError, TypeError) as e:
+        raise RollingOpsFileSystemError('Invalid etcd configuration file format.') from e
+
+
+def load_env() -> dict[str, str]:
+    """Return environment variables for etcdctl.
+
+    Returns: A dictionary containing environment variables to pass to subprocess calls.
+
+    Raises:
+        RollingOpsEtcdNotConfiguredError: If the configuration file does not exist.
+        RollingOpsFileSystemError: if we fail to read the etcd configuration file or
+            the file cannot be deserialized.
+        PebbleConnectionError: if the remote container cannot be reached
+    """
+    config = _load_config()
+
+    env = os.environ.copy()
+    env.update({
+        'ETCDCTL_API': '3',
+        'ETCDCTL_ENDPOINTS': config.endpoints,
+        'ETCDCTL_CACERT': config.cacert_path,
+        'ETCDCTL_CERT': config.cert_path,
+        'ETCDCTL_KEY': config.key_path,
+    })
+    return env
+
+
+def ensure_initialized():
+    """Check whether the etcd config file for etcdctl is set up.
+
+    Raises:
+        RollingOpsEtcdNotConfiguredError: if the etcd config file does not exist, etcd
+            server CA does not exist or etcdctl is not installed.
+        PebbleConnectionError: if the remote container cannot be reached.
+ """ + if not with_pebble_retry(lambda: CONFIG_FILE_PATH.exists()): + raise RollingOpsEtcdNotConfiguredError( + f'etcdctl config file does not exist: {CONFIG_FILE_PATH}' + ) + if not with_pebble_retry(lambda: SERVER_CA_PATH.exists()): + raise RollingOpsEtcdNotConfiguredError( + f'etcdctl server CA file does not exist: {SERVER_CA_PATH}' + ) + if not is_etcdctl_installed(): + raise RollingOpsEtcdNotConfiguredError(f'{ETCDCTL_CMD} is not installed.') + + +def cleanup() -> None: + """Removes the etcdctl env file and the trusted etcd server CA. + + Raises: + RollingOpsFileSystemError: if there is a problem when deleting the files. + PebbleConnectionError: if the remote container cannot be reached. + """ + try: + with_pebble_retry(lambda: SERVER_CA_PATH.unlink(missing_ok=True)) + with_pebble_retry(lambda: CONFIG_FILE_PATH.unlink(missing_ok=True)) + except (IsADirectoryError, PermissionError) as e: + raise RollingOpsFileSystemError('Failed to remove etcd config file and CA.') from e + + +def _is_retryable_stderr(stderr: str) -> bool: + """Return whether stderr looks like a transient etcd/client failure.""" + text = stderr.lower() + retryable_markers = ( + 'connection refused', + 'context deadline exceeded', + 'deadline exceeded', + 'temporarily unavailable', + 'transport is closing', + 'connection reset', + 'broken pipe', + 'unavailable', + 'leader changed', + 'etcdserver: request timed out', + ) + return any(marker in text for marker in retryable_markers) + + +@retry( + retry=retry_if_exception_type(RollingOpsEtcdctlRetryableError), + stop=stop_after_attempt(ETCDCTL_RETRY_ATTEMPTS), + wait=wait_fixed(ETCDCTL_RETRY_WAIT_SECONDS), + before_sleep=before_sleep_log(logger, logging.WARNING), + reraise=True, +) +def _run_checked(*args: str, cmd_input: str | None = None) -> subprocess.CompletedProcess[str]: + """Execute etcdctl and return the completed process. + + Raises: + RollingOpsEtcdNotConfiguredError: if etcdctl is not configured. 
+        PebbleConnectionError: if the remote container cannot be reached.
+        RollingOpsEtcdctlRetryableError: for transient command failures.
+        RollingOpsEtcdctlFatalError: for non-retryable command failures.
+    """
+    ensure_initialized()
+
+    cmd = [ETCDCTL_CMD, *args]
+
+    try:
+        res = subprocess.run(
+            cmd,
+            env=load_env(),
+            input=cmd_input,
+            text=True,
+            capture_output=True,
+            check=False,
+            timeout=ETCDCTL_TIMEOUT_SECONDS,
+        )
+    except subprocess.TimeoutExpired as e:
+        logger.warning(
+            'Timed out running etcdctl: cmd=%r stdout=%r stderr=%r', cmd, e.stdout, e.stderr
+        )
+        raise RollingOpsEtcdctlRetryableError(f'Timed out running etcdctl: {cmd!r}') from e
+    except FileNotFoundError as e:
+        logger.exception('etcdctl executable not found: %s', ETCDCTL_CMD)
+        raise RollingOpsEtcdctlFatalError(f'etcdctl executable not found: {ETCDCTL_CMD}') from e
+    except OSError as e:
+        logger.exception('Failed to execute etcdctl: cmd=%r', cmd)
+        raise RollingOpsEtcdctlFatalError(f'Failed to execute etcdctl: {cmd!r}') from e
+
+    if res.returncode != 0:
+        logger.warning(
+            'etcdctl command failed: cmd=%r returncode=%s stdout=%r stderr=%r',
+            cmd,
+            res.returncode,
+            res.stdout,
+            res.stderr,
+        )
+        if _is_retryable_stderr(res.stderr):
+            raise RollingOpsEtcdctlRetryableError(
+                f'Retryable etcdctl failure (rc={res.returncode}): {res.stderr.strip()}'
+            )
+        raise RollingOpsEtcdctlFatalError(
+            f'etcdctl failed (rc={res.returncode}): {res.stderr.strip()}'
+        )
+
+    logger.debug('etcdctl command succeeded: cmd=%r stdout=%r', cmd, res.stdout)
+    return res
+
+
+def run(*args: str) -> str:
+    """Execute an etcdctl command.
+
+    Args:
+        args: List of arguments to pass to etcdctl.
+
+    Returns:
+        The stdout of the command, stripped.
+
+    Raises:
+        RollingOpsEtcdNotConfiguredError: if etcdctl is not configured.
+        RollingOpsFileSystemError: if configuration cannot be read.
+        PebbleConnectionError: if the remote container cannot be reached.
+        RollingOpsEtcdctlError: etcdctl command error.
+    """
+    return _run_checked(*args).stdout.strip()
+
+
+def _get_key_value_pair(key_prefix: str, *extra_args: str) -> EtcdKV | None:
+    """Retrieve the first key and value under a given prefix.
+
+    Args:
+        key_prefix: Key prefix to search for.
+        extra_args: Arguments to the get command
+
+    Returns:
+        An EtcdKV containing:
+        - The key string
+        - The parsed JSON value as a dictionary
+
+        Returns None if no key exists.
+
+    Raises:
+        RollingOpsEtcdctlParseError: if the output is malformed
+
+    """
+    res = run('get', key_prefix, '--prefix', *extra_args)
+    out = res.splitlines()
+    if len(out) < 2:
+        return None
+
+    try:
+        value = json.loads(out[1])
+    except json.JSONDecodeError as e:
+        raise RollingOpsEtcdctlParseError(
+            f'Failed to parse JSON value for key {out[0]}: {out[1]}'
+        ) from e
+
+    return EtcdKV(key=out[0], value=value)
+
+
+def get_first_key_value_pair(key_prefix: str) -> EtcdKV | None:
+    """Retrieve the first key and value under a given prefix.
+
+    Args:
+        key_prefix: Key prefix to search for.
+
+    Returns:
+        An EtcdKV containing:
+        - The key string
+        - The parsed JSON value as a dictionary
+
+        Returns None if no key exists.
+
+    Raises:
+        RollingOpsEtcdctlParseError: if the output is malformed
+    """
+    return _get_key_value_pair(key_prefix, '--limit=1')
+
+
+def get_last_key_value_pair(key_prefix: str) -> EtcdKV | None:
+    """Retrieve the last key and value under a given prefix.
+
+    Args:
+        key_prefix: Key prefix to search for.
+
+    Returns:
+        An EtcdKV containing:
+        - The key string
+        - The parsed JSON value as a dictionary
+
+        Returns None if no key exists.
+
+    Raises:
+        RollingOpsEtcdctlParseError: if the output is malformed
+    """
+    return _get_key_value_pair(
+        key_prefix,
+        '--sort-by=KEY',
+        '--order=DESCEND',
+        '--limit=1',
+    )
+
+
+def txn(txn_input: str) -> bool:
+    """Execute an etcd transaction.
+ + The transaction string should follow the etcdctl transaction format + where comparison statements are followed by operations. + + Args: + txn_input: The transaction specification passed to `etcdctl txn`. + + Returns: + True if the transaction succeeded, otherwise False. + + Raises: + RollingOpsEtcdNotConfiguredError: if etcdctl is not configured. + PebbleConnectionError: if the remote container cannot be reached. + RollingOpsEtcdctlError: etcdctl command error. + """ + res = _run_checked('txn', cmd_input=txn_input) + return 'SUCCESS' in res.stdout diff --git a/rollingops/src/charmlibs/rollingops/etcd/_models.py b/rollingops/src/charmlibs/rollingops/etcd/_models.py new file mode 100644 index 000000000..fa1daaa08 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/etcd/_models.py @@ -0,0 +1,213 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""etcd rolling ops models.""" + +from dataclasses import dataclass +from typing import ClassVar + +from charmlibs.interfaces.tls_certificates import Certificate, PrivateKey +from charmlibs.pathops import LocalPath +from charmlibs.rollingops.common._utils import with_pebble_retry + +CERT_MODE = 0o644 +KEY_MODE = 0o600 + + +@dataclass(frozen=True) +class SharedCertificate: + """Represent the certificates shared within units of an app to connect to etcd.""" + + certificate: Certificate + key: PrivateKey + ca: Certificate + + @classmethod + def from_paths( + cls, cert_path: LocalPath, key_path: LocalPath, ca_path: LocalPath + ) -> 'SharedCertificate': + """Create a SharedCertificate from certificate files on disk. + + This method reads the certificate, private key, and CA certificate + from the provided file paths and converts them into their respective + typed objects. + + Args: + cert_path: Path to the client certificate file (PEM format). + key_path: Path to the private key file (PEM format). + ca_path: Path to the CA certificate file (PEM format). + + Returns: + A SharedCertificate instance containing the loaded certificate material. + + Raises: + TLSCertificatesError: If any certificate cannot be parsed. + ValueError: If the key cannot be parsed + PebbleConnectionError: If the remote container cannot be reached + after retries. + FileNotFoundError: If the file does not exist. + PermissionError: If the file cannot be accessed. + """ + return cls( + certificate=Certificate.from_string(cls._read_text_with_retry(cert_path)), + key=PrivateKey.from_string(cls._read_text_with_retry(key_path)), + ca=Certificate.from_string(cls._read_text_with_retry(ca_path)), + ) + + @classmethod + def from_strings(cls, certificate: str, key: str, ca: str) -> 'SharedCertificate': + """Create a SharedCertificate from PEM-encoded strings. + + Raises: + TLSCertificatesError: If any certificate cannot be parsed. 
+ ValueError: If the key cannot be parsed + """ + return cls( + certificate=Certificate.from_string(certificate), + key=PrivateKey.from_string(key), + ca=Certificate.from_string(ca), + ) + + def write_to_paths( + self, cert_path: LocalPath, key_path: LocalPath, ca_path: LocalPath + ) -> None: + """Write the certificate material to disk. + + This method writes the client certificate, private key, and CA certificate + to the specified file paths using appropriate file permissions. + + - Certificate and CA files are written with mode 0o644. + - Private key is written with mode 0o600. + + Args: + cert_path: Path where the client certificate will be written. + key_path: Path where the private key will be written. + ca_path: Path where the CA certificate will be written. + + Raises: + PebbleConnectionError: If the remote container cannot be reached + after retries. + PermissionError: If the file cannot be written. + NotADirectoryError: If the parent path is invalid. + """ + self._write_text_with_retry(path=cert_path, content=self.certificate.raw, mode=CERT_MODE) + self._write_text_with_retry(path=key_path, content=self.key.raw, mode=KEY_MODE) + self._write_text_with_retry(path=ca_path, content=self.ca.raw, mode=CERT_MODE) + + @classmethod + def _read_text_with_retry(cls, path: LocalPath) -> str: + """Read the content of a file, retrying on transient Pebble errors. + + Args: + path: The file path to read. + + Returns: + The file content as a string. + + Raises: + PebbleConnectionError: If the remote container cannot be reached + after retries. + FileNotFoundError: If the file does not exist. + PermissionError: If the file cannot be accessed. + """ + return with_pebble_retry(lambda: path.read_text()) + + def _write_text_with_retry(self, path: LocalPath, content: str, mode: int) -> None: + """Write text to a file, retrying on transient Pebble errors. + + Args: + path: The file path to write to. + content: The text content to write. 
+ mode: File permission mode to apply (e.g. 0o600). + + Raises: + PebbleConnectionError: If the remote container cannot be reached + after retries. + PermissionError: If the file cannot be written. + NotADirectoryError: If the parent path is invalid. + """ + with_pebble_retry(lambda: path.write_text(content, mode=mode)) + + +@dataclass(frozen=True) +class EtcdConfig: + """Represent the etcd configuration.""" + + endpoints: str + cacert_path: str + cert_path: str + key_path: str + + +@dataclass +class EtcdKV: + """A single etcd key-value entry.""" + + key: str + value: dict[str, str] + + +@dataclass(frozen=True) +class RollingOpsKeys: + """Collection of etcd key prefixes used for rolling operations. + + Layout: + /rollingops/{lock_name}/{cluster_id}/granted-unit/ + /rollingops/{lock_name}/{cluster_id}/{owner}/pending/ + /rollingops/{lock_name}/{cluster_id}/{owner}/inprogress/ + /rollingops/{lock_name}/{cluster_id}/{owner}/completed/ + + The distributed lock key is cluster-scoped + """ + + ROOT: ClassVar[str] = '/rollingops' + + cluster_id: str + owner: str + lock_name: str = 'default' + + @property + def cluster_prefix(self) -> str: + """Etcd prefix corresponding to the cluster namespace.""" + return f'{self.ROOT}/{self.lock_name}/{self.cluster_id}/' + + @property + def _owner_prefix(self) -> str: + """Etcd prefix for all the queues belonging to an owner.""" + return f'{self.cluster_prefix}{self.owner}/' + + @property + def lock_key(self) -> str: + """Etcd key of the lock.""" + return f'{self.cluster_prefix}granted-unit/' + + @property + def pending(self) -> str: + """Prefix for operations waiting to be executed.""" + return f'{self._owner_prefix}pending/' + + @property + def inprogress(self) -> str: + """Prefix for operations currently being executed.""" + return f'{self._owner_prefix}inprogress/' + + @property + def completed(self) -> str: + """Prefix for operations that have finished execution.""" + return f'{self._owner_prefix}completed/' + + @classmethod + def 
for_owner(cls, cluster_id: str, owner: str) -> 'RollingOpsKeys': + """Create a set of keys for a given owner on a cluster.""" + return cls(cluster_id=cluster_id, owner=owner) diff --git a/rollingops/src/charmlibs/rollingops/etcd/_relations.py b/rollingops/src/charmlibs/rollingops/etcd/_relations.py new file mode 100644 index 000000000..7c2ff809b --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/etcd/_relations.py @@ -0,0 +1,284 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging + +from dpcharmlibs.interfaces import ( + RequirerCommonModel, + ResourceCreatedEvent, + ResourceEndpointsChangedEvent, + ResourceProviderModel, + ResourceRequirerEventHandler, +) +from ops import Relation +from ops.charm import ( + CharmBase, + LeaderElectedEvent, + RelationBrokenEvent, + RelationChangedEvent, + SecretChangedEvent, +) +from ops.framework import Object + +from charmlibs.interfaces.tls_certificates import Certificate, TLSCertificatesError +from charmlibs.rollingops.common._exceptions import RollingOpsInvalidSecretContentError +from charmlibs.rollingops.etcd import _certificates as certificates +from charmlibs.rollingops.etcd import _etcdctl as etcdctl +from charmlibs.rollingops.etcd._models import SharedCertificate + +logger = logging.getLogger(__name__) +CERT_SECRET_FIELD = 'rollingops-client-secret-id' # noqa: S105 +CERT_SECRET_LABEL = 'rollingops-client-cert' # noqa: S105 +CLIENT_CERT_FIELD = 'client-cert' +CLIENT_KEY_FIELD = 'client-key' +CLIENT_CA_FIELD = 'client-ca' + + +class SharedClientCertificateManager(Object): + """Manage the shared rollingops client certificate via peer relation secret.""" + + def __init__(self, charm: CharmBase, peer_relation_name: str) -> None: + super().__init__(charm, 'shared-client-certificate') + self.charm = charm + self.peer_relation_name = peer_relation_name + + self.framework.observe(charm.on.leader_elected, self._on_leader_elected) + self.framework.observe( + charm.on[peer_relation_name].relation_changed, + self._on_peer_relation_changed, + ) + self.framework.observe(charm.on.secret_changed, self._on_secret_changed) + + @property + def _peer_relation(self) -> Relation | None: + """Return the peer relation for this charm.""" + return self.model.get_relation(self.peer_relation_name) + + def _on_leader_elected(self, event: LeaderElectedEvent) -> None: + """Handle the leader elected event. 
+ + When this unit becomes the leader, it is responsible for generating + and sharing the client certificate material with other units. + """ + self.create_and_share_certificate() + + def _on_secret_changed(self, event: SecretChangedEvent) -> None: + """Handle updates to secrets. + + This method is triggered when a secret changes. It ensures that + the latest certificate material is synchronized to local files. + """ + if event.secret.label == CERT_SECRET_LABEL: + self.sync_to_local_files() + + def _on_peer_relation_changed(self, event: RelationChangedEvent) -> None: + """React to peer relation changes. + + The leader ensures the shared certificate exists. + All units try to persist the shared certificate locally if available. + """ + self.create_and_share_certificate() + self.sync_to_local_files() + + def create_and_share_certificate(self) -> None: + """Ensure the application client certificate exists. + + Only the leader generates the certificate and writes it to the peer + relation application databag. + + If the secret ID corresponding to the shared certificate already + exists in the peer relation, it is not created again. + """ + relation = self._peer_relation + if relation is None or not self.model.unit.is_leader(): + return + + app_data = relation.data[self.model.app] + + if app_data.get(CERT_SECRET_FIELD): + logger.info( + 'Shared certificate already exists in the databag. No new certificate is created.' 
+ ) + return + + common_name = f'rollingops-{self.model.uuid}-{self.model.app.name}' + shared = certificates.generate(common_name) + + secret = self.model.app.add_secret( + content={ + CLIENT_CERT_FIELD: shared.certificate.raw, + CLIENT_KEY_FIELD: shared.key.raw, + CLIENT_CA_FIELD: shared.ca.raw, + }, + label=CERT_SECRET_LABEL, + ) + + app_data.update({CERT_SECRET_FIELD: secret.id}) # type: ignore[arg-type] + logger.info('Shared certificate added to the databag.') + + def get_shared_certificate_from_peer_relation(self) -> SharedCertificate | None: + """Return the client certificate, key and ca from peer app data. + + Returns: + SharedCertificate or None if not yet available. + + Raises: + RollingOpsInvalidSecretContent: if the content of the secret holding + the certificates does not contain all the fields or they are empty. + """ + if not (relation := self._peer_relation): + logger.debug('Peer relation is not available yet.') + return None + + if not (secret_id := relation.data[self.model.app].get(CERT_SECRET_FIELD)): + logger.info('Shared certificate secret ID does not exist in the databag yet.') + return None + + secret = self.model.get_secret(id=secret_id) + content = secret.get_content(refresh=True) + + certificate = content.get(CLIENT_CERT_FIELD, '') + key = content.get(CLIENT_KEY_FIELD, '') + ca = content.get(CLIENT_CA_FIELD, '') + + if not certificate or not key or not ca: + raise RollingOpsInvalidSecretContentError( + 'Invalid secret content: expected non-empty values for ' + f"'{CLIENT_CERT_FIELD}', '{CLIENT_KEY_FIELD}', and '{CLIENT_CA_FIELD}'. " + 'Missing or empty values are not allowed.' + ) + + try: + return SharedCertificate.from_strings( + certificate=certificate, + key=key, + ca=ca, + ) + except (TLSCertificatesError, ValueError) as e: + raise RollingOpsInvalidSecretContentError( + 'Invalid secret content: certificate material could not be parsed.' 
+ ) from e + + def sync_to_local_files(self) -> None: + """Persist shared certificate locally if available.""" + shared = self.get_shared_certificate_from_peer_relation() + if shared is None: + logger.info('Shared rollingops etcd client certificate is not available yet.') + return + + certificates.persist_client_cert_key_and_ca(shared) + + def get_local_request_cert(self) -> Certificate | None: + """Return the cert to place in relation requests.""" + shared = self.get_shared_certificate_from_peer_relation() + return None if shared is None else shared.certificate + + +class EtcdRequiresV1(Object): + """EtcdRequires implementation for data interfaces version 1.""" + + def __init__( + self, + charm: CharmBase, + relation_name: str, + cluster_id: str, + shared_certificates: SharedClientCertificateManager, + ) -> None: + super().__init__(charm, f'requirer-{relation_name}') + self.charm = charm + self.cluster_id = cluster_id + self.shared_certificates = shared_certificates + + self.etcd_interface = ResourceRequirerEventHandler( + self.charm, + relation_name=relation_name, + requests=self.client_requests(), + response_model=ResourceProviderModel, + ) + + self.framework.observe( + self.etcd_interface.on.endpoints_changed, self._on_endpoints_changed + ) + self.framework.observe(charm.on[relation_name].relation_broken, self._on_relation_broken) + self.framework.observe(self.etcd_interface.on.resource_created, self._on_resource_created) + + @property + def etcd_relation(self) -> Relation | None: + """Return the etcd relation if present.""" + relations = self.etcd_interface.relations + return relations[0] if relations else None + + def _on_relation_broken(self, event: RelationBrokenEvent) -> None: + """Remove the stored information about the etcd server.""" + etcdctl.cleanup() + + def _on_endpoints_changed( + self, event: ResourceEndpointsChangedEvent[ResourceProviderModel] + ) -> None: + """Handle updates to etcd endpoints from the provider. 
+ + The method writes an environment configuration + file used by etcdctl to connect securely to the cluster. + + If no endpoints are provided in the event, the update is skipped. + """ + response = event.response + + if not response.endpoints: + logger.error('Received a endpoints changed event but no etcd endpoints available.') + return + + logger.info('etcd endpoints changed: %s', response.endpoints) + + etcdctl.write_config_file( + endpoints=response.endpoints, + client_cert_path=certificates.CLIENT_CERT_PATH, + client_key_path=certificates.CLIENT_KEY_PATH, + ) + + def _on_resource_created(self, event: ResourceCreatedEvent[ResourceProviderModel]) -> None: + """Handle provisioning of etcd connection resources. + + This method stores the trusted server CA locally and write the etcd client environment + configuration file. + """ + response = event.response + + if not response.tls_ca: + logger.error( + 'Received a resource created event but no etcd server CA chain available.' + ) + return + + etcdctl.write_trusted_server_ca(tls_ca_pem=response.tls_ca) + + if not response.endpoints: + logger.error('Received a resource created event but no etcd endpoints available.') + return + + etcdctl.write_config_file( + endpoints=response.endpoints, + client_cert_path=certificates.CLIENT_CERT_PATH, + client_key_path=certificates.CLIENT_KEY_PATH, + ) + + def client_requests(self) -> list[RequirerCommonModel]: + """Return the client requests for the etcd requirer interface.""" + cert = self.shared_certificates.get_local_request_cert() + return [ + RequirerCommonModel( + resource=self.cluster_id, + mtls_cert=None if cert is None else cert.raw, + ) + ] diff --git a/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py b/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py new file mode 100644 index 000000000..e8121aaa9 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py @@ -0,0 +1,165 @@ +# Copyright 2026 Canonical Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import time + +from charmlibs.rollingops.common._models import OperationResult +from charmlibs.rollingops.common._utils import dispatch_hook, dispatch_lock_granted, setup_logging +from charmlibs.rollingops.etcd._etcd import ( + EtcdLease, + EtcdLock, + WorkerOperationStore, +) +from charmlibs.rollingops.etcd._models import RollingOpsKeys + +logger = logging.getLogger(__name__) + +INITIAL_SLEEP = 10 # Delay before the worker begins processing. +LOCK_ACQUIRE_SLEEP = 15 # Delay between etcd lock acquisition attempts. +NEXT_OP_SLEEP = 30 # Delay between queue polls when idle. + + +class RollingOpsEtcdInconsistencyError(Exception): + """Raised when unexpected or inconsistent etcd operation state is found.""" + + +def _dispatch_etcd_failed(unit_name: str, charm_dir: str) -> None: + """Dispatch the fatal etcd-worker failure hook. + + This notifies the charm that the etcd worker encountered an + unrecoverable error so that higher-level logic can fall back to the + peer backend. + + Args: + unit_name: Name of the unit dispatching the hook. + charm_dir: Path to the charm root directory. + """ + hook_name = 'rollingops_etcd_failed' + dispatch_hook(unit_name, charm_dir, hook_name) + + +def main(): + """Run the etcd rolling-ops worker loop. + + This worker is responsible for processing the current unit's + etcd-backed operation queue. 
It waits for pending work, acquires the + etcd lock, claims the next operation, dispatches the lock-granted + hook, and then waits for the operation result to be written back. + + Processing behavior depends on the final operation result: + + - `RETRY_HOLD`: requeue the operation immediately and keep the lock + - `RETRY_RELEASE`: requeue the operation and release the lock + - any other result: remove the completed operation and release the lock + + If the worker detects invalid etcd queue state or encounters an + unrecoverable error, it dispatches the `rollingops_etcd_failed` + hook so the charm can fall back to peer-based processing. + + The worker always attempts to revoke its lease and release the lock + before exiting. + """ + setup_logging('/var/log/etcd_rollingops_worker.log') + + parser = argparse.ArgumentParser() + parser.add_argument('--unit-name', required=True) + parser.add_argument('--charm-dir', required=True) + parser.add_argument('--owner', required=True) + parser.add_argument('--cluster-id', required=True) + args = parser.parse_args() + + logger.info( + 'Worker starting (unit=%s owner=%s cluster=%s)', + args.unit_name, + args.owner, + args.cluster_id, + ) + + time.sleep(INITIAL_SLEEP) + + keys = RollingOpsKeys.for_owner(args.cluster_id, args.owner) + lock = EtcdLock(keys.lock_key, args.owner) + lease = EtcdLease() + operations = WorkerOperationStore(keys, args.owner) + + try: + while True: + if operations.has_inprogress() or operations.has_completed(): + raise RollingOpsEtcdInconsistencyError('Invalid operations found in etcd queues.') + + if not operations.has_pending(): + time.sleep(NEXT_OP_SLEEP) + continue + + if not lock.is_held(): + if lease.id is None: + lease.grant() + + logger.info('Try to get lock.') + if not lock.try_acquire(lease.id): # pyright: ignore[reportArgumentType] + time.sleep(LOCK_ACQUIRE_SLEEP) + continue + logger.info('Lock granted.') + + if not operations.claim_next(): + raise RollingOpsEtcdInconsistencyError('Failed to get 
next operation.') + + dispatch_lock_granted(args.unit_name, args.charm_dir) + + logger.info('Waiting for operation to be finished.') + operation = operations.wait_until_completed() + + logger.info('Operation %s completed with %s', operation.callback_id, operation.result) + match operation.result: + case OperationResult.RETRY_HOLD: + operations.requeue_completed() + continue + + case OperationResult.RETRY_RELEASE: + operations.requeue_completed() + + case _: + operations.delete_completed() + + lease.revoke() + lock.release() + logger.info('Lease revoked and lock released.') + time.sleep(NEXT_OP_SLEEP) + + except Exception as e: + logger.exception('Fatal etcd worker error: %s', e) + + try: + _dispatch_etcd_failed(args.unit_name, args.charm_dir) + except Exception: + logger.exception('Failed to dispatch rollingops_etcd_failed hook.') + + finally: + try: + lease.revoke() + except Exception: + logger.exception('Failed to revoke lease during worker shutdown.') + + try: + lock.release() + except Exception: + logger.exception('Failed to release lock during worker shutdown.') + + logger.info('Exit.') + + +if __name__ == '__main__': + main() diff --git a/rollingops/src/charmlibs/rollingops/etcd/_worker.py b/rollingops/src/charmlibs/rollingops/etcd/_worker.py new file mode 100644 index 000000000..00f193109 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/etcd/_worker.py @@ -0,0 +1,112 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""etcd rolling ops. Spawns and manages the external rolling-ops worker process.""" + +import logging + +from ops.charm import CharmBase + +from charmlibs import pathops +from charmlibs.rollingops.common._base_worker import BaseRollingOpsAsyncWorker + +logger = logging.getLogger(__name__) + +WORKER_PID_FIELD = 'etcd-rollingops-worker-pid' + + +class EtcdRollingOpsAsyncWorker(BaseRollingOpsAsyncWorker): + """Manage the etcd-backed rolling-ops worker process. + + Unlike the peer backend, each unit runs its own worker process when + using the etcd backend. Worker PID is stored in the unit databag, + ensuring isolation between units and allowing each unit to independently + manage its own worker lifecycle. + """ + + _pid_field = WORKER_PID_FIELD + _log_filename = 'etcd_rollingops_worker' + + def __init__(self, charm: CharmBase, peer_relation_name: str, owner: str, cluster_id: str): + super().__init__(charm, 'etcd-rollingops-async-worker', peer_relation_name) + self._owner = owner + self._cluster_id = cluster_id + + def _worker_script_path(self) -> pathops.LocalPath: + """Return the path to the etcd rolling-ops worker script. + + This script is executed in a background process to handle operation + processing for the etcd backend. + """ + return pathops.LocalPath( + self._venv_site_packages() / 'charmlibs' / 'rollingops' / 'etcd' / '_rollingops.py' + ) + + def _worker_args(self) -> list[str]: + """Return the arguments passed to the etcd worker process. + + Returns: + A list of command-line arguments for the worker process. + """ + return [ + '--owner', + self._owner, + '--cluster-id', + self._cluster_id, + ] + + def _get_pid_str(self) -> str: + """Return the stored worker process PID for this unit. + + The PID is stored in the unit databag because each unit runs its own + independent worker process when using the etcd backend. This ensures + that worker lifecycle management is isolated per unit. 
+ + Returns: + The worker process PID as a string, or an empty string if not set. + """ + if self._relation is None: + return '' + return self._relation.data[self.model.unit].get(self._pid_field, '') + + def _set_pid_str(self, pid: str) -> None: + """Persist the worker process PID in the unit databag. + + The PID is stored per unit to reflect that each unit owns and manages + its own worker process when using the etcd backend. + + Args: + pid: The process identifier to store. + """ + if self._relation is None: + return + self._relation.data[self.model.unit].update({self._pid_field: pid}) + + def _on_existing_worker(self, pid: int) -> bool: + """Executed on detection of an already running worker for this unit. + + Since each unit manages its own worker process, an existing worker is + considered valid and is left running. No restart is performed. + + Args: + pid: The PID of the currently running worker. + + Returns: + False to indicate that no new worker should be started. + """ + logger.info( + 'RollingOps worker already running with PID %s; not starting a new one.', + pid, + ) + return False diff --git a/rollingops/src/charmlibs/rollingops/peer/__init__.py b/rollingops/src/charmlibs/rollingops/peer/__init__.py new file mode 100644 index 000000000..c75a6c654 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/peer/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Rollingops for charms using peer relations.""" diff --git a/rollingops/src/charmlibs/rollingops/peer/_backend.py b/rollingops/src/charmlibs/rollingops/peer/_backend.py new file mode 100644 index 000000000..4f324f710 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/peer/_backend.py @@ -0,0 +1,592 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Rolling Ops v1 — coordinated rolling operations for Juju charms. + +This library provides a reusable mechanism for coordinating rolling operations +across units of a Juju application using a peer-relation distributed lock. + +The library guarantees that at most one unit executes a rolling operation at any +time, while allowing multiple units to enqueue operations and participate +in a coordinated rollout. + +## Data model (peer relation) + +### Unit databag + +Each unit maintains a FIFO queue of operations it wishes to execute. + +Keys: +- `operations`: JSON-encoded list of queued `Operation` objects +- `state`: `"idle"` | `"request"` | `"retry-release"` | `"retry-hold"` +- `executed_at`: UTC timestamp string indicating when the current operation last ran + +Each `Operation` contains: +- `callback_id`: identifier of the callback to execute +- `kwargs`: JSON-serializable arguments for the callback +- `requested_at`: UTC timestamp when the operation was enqueued +- `max_retry (optional)`: maximum retry count. 
`None` means unlimited +- `attempt`: current attempt number + +### Application databag + +The application databag represents the global lock state. + +Keys: +- `granted_unit`: unit identifier (unit name), or empty +- `granted_at`: UTC timestamp indicating when the lock was granted + +## Operation semantics + +- Units enqueue operations instead of overwriting a single pending request. +- Duplicate operations (same `callback_id` and `kwargs`) are ignored if they are + already the last queued operation. +- When granted the lock, a unit executes exactly one operation (the queue head). +- After execution, the lock is released so that other units may proceed. + +## Retry semantics + +- If a callback returns `OperationResult.RETRY_RELEASE` the unit will release the +lock and retry the operation later. +- If a callback returns `OperationResult.RETRY_HOLD` the unit will keep the +lock and retry immediately. +- Retry state (`attempt`) is tracked per operation. +- When `max_retry` is exceeded, the failing operation is dropped and the unit + proceeds to the next queued operation, if any. + +## Scheduling semantics + +- Only the leader schedules lock grants. +- If a valid lock grant exists, no new unit is scheduled. +- Requests are preferred over retries. +- Among requests, the operation with the oldest `requested_at` timestamp is selected. +- Among retries, the operation with the oldest `executed_at` timestamp is selected. +- Stale grants (e.g., pointing to departed units) are automatically released. + +All timestamps are stored in UTC using ISO 8601 format. + +## Using the library in a charm + +### 1. Declare a peer relation + +```yaml +peers: + restart: + interface: rolling_op +``` + +Import this library into src/charm.py, and initialize a PeerRollingOpsBackend in the Charm's +`__init__`. 
The Charm should also define a callback routine, which will be executed when +a unit holds the distributed lock: + +src/charm.py +```python +from charms.rolling_ops.v1.rollingops import PeerRollingOpsBackend, OperationResult + +class SomeCharm(CharmBase): + def __init__(self, *args): + super().__init__(*args) + + self.rolling_ops = PeerRollingOpsBackend( + charm=self, + relation_name="restart", + callback_targets={ + "restart": self._restart, + "failed_restart": self._failed_restart, + "defer_restart": self._defer_restart, + }, + ) + + def _restart(self, force: bool) -> OperationResult: + # perform restart logic + return OperationResult.RELEASE + + def _failed_restart(self) -> OperationResult: + # perform restart logic + return OperationResult.RETRY_RELEASE + + def _defer_restart(self) -> OperationResult: + if not self.some_condition(): + return OperationResult.RETRY_HOLD + # do restart logic + return OperationResult.RELEASE +``` + +Request a rolling operation + +```python + + def _on_restart_action(self, event) -> None: + self.rolling_ops.request_async_lock( + callback_id="restart", + kwargs={"force": True}, + max_retry=3, + ) +``` + +All participating units must enqueue the operation in order to be included +in the rolling execution. + +Units that do not enqueue the operation will be skipped, allowing operators +to recover from partial failures by reissuing requests selectively. + +Do not include sensitive information in the kwargs of the callback. +These values will be stored in the databag. + +Make sure that callback_targets is not dynamic and that the mapping +contains the expected values at the moment of the callback execution. 
+""" + +import logging +from collections.abc import Callable +from typing import Any + +from ops import Object, Relation, Unit +from ops.charm import ( + CharmBase, + RelationChangedEvent, + RelationDepartedEvent, +) +from ops.framework import EventBase + +from charmlibs.rollingops.common._exceptions import ( + RollingOpsDecodingError, + RollingOpsInvalidLockRequestError, + RollingOpsNoRelationError, +) +from charmlibs.rollingops.common._models import ( + Operation, + OperationResult, + RollingOpsStatus, + RunWithLockOutcome, + RunWithLockStatus, +) +from charmlibs.rollingops.peer._models import ( + PeerAppLock, + PeerUnitOperations, + iter_peer_units, + pick_oldest_completed, + pick_oldest_request, +) +from charmlibs.rollingops.peer._worker import PeerRollingOpsAsyncWorker + +logger = logging.getLogger(__name__) + + +class PeerRollingOpsBackend(Object): + """Manage rolling operations using the peer-relation backend. + + This backend stores operation queues in the peer relation and relies + on the leader unit to schedule lock grants across units. Once a unit + is granted the lock, it executes its queued operation locally. + + The peer backend acts as both the primary backend when etcd is not + available and as the durable fallback state used to continue + processing when etcd-backed execution fails. + """ + + def __init__( + self, charm: CharmBase, relation_name: str, callback_targets: dict[str, Callable[..., Any]] + ): + """Initialize the peer-backed rolling-ops backend. + + Args: + charm: The charm instance owning this backend. + relation_name: Name of the peer relation used to store lock and + operation state. + callback_targets: Mapping from callback identifiers to callables + executed when this unit is granted the lock. 
+ """ + super().__init__(charm, 'peer-rolling-ops-manager') + self._charm = charm + self.relation_name = relation_name + self.callback_targets = callback_targets + self.worker = PeerRollingOpsAsyncWorker(charm, relation_name=relation_name) + + self.framework.observe( + charm.on[self.relation_name].relation_changed, self._on_relation_changed + ) + self.framework.observe( + charm.on[self.relation_name].relation_departed, self._on_relation_departed + ) + self.framework.observe(charm.on.leader_elected, self._process_locks) + self.framework.observe(charm.on.update_status, self._on_rollingops_lock_granted) + + @property + def _relation(self) -> Relation | None: + """Return the peer relation used for lock and operation state.""" + return self.model.get_relation(self.relation_name) + + def _lock(self) -> PeerAppLock: + """Return the shared application-level peer lock. + + This lock is stored in the peer relation application databag and is + used by the leader to grant execution rights to one unit at a time. + """ + return PeerAppLock(self.model, self.relation_name) + + def _operations(self, unit: Unit) -> PeerUnitOperations: + """Return the peer-backed operation queue for a unit. + + Args: + unit: The unit whose operation queue should be accessed. + + Returns: + A helper for reading and updating that unit's queued operations. + """ + return PeerUnitOperations(self.model, self.relation_name, unit) + + def enqueue_operation(self, operation: Operation) -> None: + """Persist an operation in the current unit's peer-backed queue. + + Args: + operation: The operation to enqueue. + + Raises: + RollingOpsInvalidLockRequestError: If the operation could not be + persisted due to invalid or undecodable queue state. + RollingOpsNoRelationError: If the peer relation is not available. 
+ """ + try: + self._operations(self.model.unit).request(operation) + except (RollingOpsDecodingError, ValueError) as e: + logger.error('Failed to create operation: %s', e) + raise RollingOpsInvalidLockRequestError('Failed to create the lock request') from e + except RollingOpsNoRelationError as e: + logger.debug('No %s peer relation yet.', self.relation_name) + raise e + + def ensure_processing(self) -> None: + """Trigger peer-based scheduling if the current unit is leader. + + In the peer backend, scheduling decisions are made only by the + leader unit. Non-leader units do not actively process locks. + """ + if self.model.unit.is_leader(): + self._process_locks() + + def has_pending_work(self) -> bool: + """Return whether the current unit has pending peer-managed work.""" + return self._operations(self.model.unit).has_pending_work() + + def _on_rollingops_lock_granted(self, event: EventBase) -> None: + """Handler of the custom hook rollingops_lock_granted. + + The custom hook is triggered by a background process. + """ + if not self._relation: + return + logger.info('Received a rolling-ops lock granted event.') + lock = self._lock() + operations = self._operations(self.model.unit) + if operations.should_run(lock): + self._on_run_with_lock() + self._process_locks() + + def _on_relation_departed(self, event: RelationDepartedEvent) -> None: + """Leader cleanup: if a departing unit was granted a lock, clear the grant. + + This prevents deadlocks when the granted unit leaves the relation. + """ + if not self.model.unit.is_leader(): + return + if unit := event.departing_unit: + lock = self._lock() + if lock.is_granted(unit.name): + lock.release() + self._process_locks() + + def _on_relation_changed(self, _: RelationChangedEvent) -> None: + """React to peer relation changes. + + The leader re-runs scheduling whenever peer relation state changes. + Non-leader units only check whether they should execute an operation + that has already been granted to them. 
+ """ + if self.model.unit.is_leader(): + self._process_locks() + return + + lock = self._lock() + operations = self._operations(self.model.unit) + if operations.should_run(lock): + self._on_run_with_lock() + + def _valid_peer_unit_names(self) -> set[str]: + """Return all unit names currently participating in the peer relation.""" + if not self._relation: + return set() + names = {u.name for u in self._relation.units} + names.add(self.model.unit.name) + return names + + def _release_stale_grant(self) -> None: + """Ensure granted_unit refers to a unit currently on the peer relation.""" + if not self._relation: + return + + lock = self._lock() + granted_unit = lock.granted_unit + if not granted_unit: + return + + valid_units = self._valid_peer_unit_names() + if granted_unit not in valid_units: + logger.warning( + 'granted_unit=%s is not in current peer units; releasing stale grant.', + granted_unit, + ) + lock.release() + + def _process_locks(self, _: EventBase | None = None) -> None: + """Process locks. + + This method is only executed by the leader unit. + It effectively releases the lock and triggers scheduling. + """ + if not self.model.unit.is_leader(): + return + + lock = self._lock() + + for unit in iter_peer_units(self.model, self.relation_name): + operations = self._operations(unit) + if not operations.is_peer_managed(): + continue + if operations.should_release(lock): + lock.release() + break + + self._release_stale_grant() + + if lock.granted_unit: + logger.info( + 'Current granted_unit=%s. No new unit will be scheduled.', + lock.granted_unit, + ) + return + + self._schedule(lock) + + def _schedule(self, lock: PeerAppLock) -> None: + """Select and grant the next lock based on priority and queue state. + + This method iterates over all locks associated with the relation and + determines which unit should receive the lock next. + + Priority order: + 1. Units in RETRY_HOLD state are immediately granted the lock. + 2. 
Units in REQUEST state are considered next (oldest request first). + 3. Units in RETRY_RELEASE state are considered last (oldest completed first). + + If no eligible lock is found, no action is taken. + + Once a lock is selected, it is granted via `_grant_lock`. + """ + logger.info('Starting scheduling.') + + pending_requests: list[PeerUnitOperations] = [] + pending_retries: list[PeerUnitOperations] = [] + + for unit in iter_peer_units(self.model, self.relation_name): + operations = self._operations(unit) + + if not operations.is_peer_managed(): + continue + + if operations.is_retry_hold(): + self._grant_lock(lock, operations.unit.name) + return + + if operations.is_waiting(): + pending_requests.append(operations) + elif operations.is_waiting_retry(): + pending_retries.append(operations) + + selected = None + if pending_requests: + selected = pick_oldest_request(pending_requests) + elif pending_retries: + selected = pick_oldest_completed(pending_retries) + + if selected is None: + logger.info('No pending lock requests. Lock was not granted to any unit.') + return + + self._grant_lock(lock, selected) + + def _grant_lock(self, lock: PeerAppLock, unit_name: str) -> None: + """Grant the lock to the selected unit. + + Once the lock is granted, the selected unit becomes eligible to + execute its next queued operation. If the selected unit is the local + unit (leader), its worker process is started to trigger execution. + + Args: + lock: The peer lock instance to grant. + unit_name: Name of the unit receiving the lock grant. + """ + lock.grant(unit_name) + logger.info('Lock granted to unit=%s.', unit_name) + + if unit_name == self.model.unit.name: + self.worker.start() + + def request_async_lock( + self, + callback_id: str, + kwargs: dict[str, Any] | None = None, + max_retry: int | None = None, + ) -> None: + """Enqueue a rolling operation and request the distributed lock. 
+ + This method appends an operation (identified by callback_id and kwargs) to the + calling unit's FIFO queue stored in the peer relation databag and marks the unit as + requesting the lock. It does not execute the operation directly. + + Args: + callback_id: Identifier for the callback to execute when this unit is granted + the lock. Must be a non-empty string and must exist in the manager's + callback registry. + kwargs: Keyword arguments to pass to the callback when executed. If omitted, + an empty dict is used. Must be JSON-serializable because it is stored + in Juju relation databags. + max_retry: Retry limit for this operation. None means unlimited retries. + 0 means no retries (drop immediately on first failure). Must be >= 0 + when provided. + + Raises: + RollingOpsInvalidLockRequestError: If any input is invalid (e.g. unknown callback_id, + non-dict kwargs, non-serializable kwargs, negative max_retry). + RollingOpsNoRelationError: If the peer relation does not exist. + """ + if callback_id not in self.callback_targets: + raise RollingOpsInvalidLockRequestError(f'Unknown callback_id: {callback_id}') + + try: + if kwargs is None: + kwargs = {} + operation = Operation.create(callback_id, kwargs, max_retry) + operations = self._operations(self.model.unit) + operations.request(operation) + + except (RollingOpsDecodingError, ValueError) as e: + logger.error('Failed to create operation: %s', e) + raise RollingOpsInvalidLockRequestError('Failed to create the lock request') from e + except RollingOpsNoRelationError as e: + logger.debug('No %s peer relation yet.', self.relation_name) + raise e + + if self.model.unit.is_leader(): + self._process_locks() + + def _on_run_with_lock(self) -> None: + """Execute the current head operation if this unit holds the distributed lock. + + - If this unit does not currently hold the lock grant, no operation is run. + - If this unit holds the grant but has no queued operation, lock is released. 
+ - Otherwise, the operation's callback is looked up by `callback_id` and + invoked with the operation kwargs. + """ + lock = self._lock() + operations = self._operations(self.model.unit) + + if not lock.is_granted(self.model.unit.name): + logger.debug('Lock is not granted. Operation will not run.') + return + + if not (operation := operations.get_current()): + logger.debug('There is no operation to run.') + operations.finish(OperationResult.RELEASE) + return + + if not (callback := self.callback_targets.get(operation.callback_id)): + logger.error( + 'Operation %s target was not found. Releasing operation without retry.', + operation.callback_id, + ) + operations.finish(OperationResult.RELEASE) + return + logger.info( + 'Executing callback_id=%s, attempt=%s', operation.callback_id, operation.attempt + ) + try: + result = callback(**operation.kwargs) + except Exception as e: + logger.exception('Operation failed: %s: %s', operation.callback_id, e) + result = OperationResult.RETRY_RELEASE + + logger.info('Operation %s executed with result %s.', operation.callback_id, result) + operations.finish(result) + + def mirror_outcome(self, outcome: RunWithLockOutcome) -> None: + """Apply the execution result to the mirrored peer queue. + + This keeps the peer standby queue aligned with the backend that + actually executed the operation. + + Args: + outcome: The etcd execution outcome to mirror. + + Raises: + RollingOpsDecodingError: If theres is an inconsistency found. + """ + match outcome.status: + case RunWithLockStatus.NOT_GRANTED: + logger.info('Skipping mirror: etcd lock was not granted.') + return + + case RunWithLockStatus.NO_OPERATION: + if not self._operations(self.model.unit).has_pending_work(): + logger.info('Skipping mirror: no operation.') + return + raise RollingOpsDecodingError( + 'Mismatch between the etcd and peer operation queue.' 
+ ) + + case RunWithLockStatus.MISSING_CALLBACK | RunWithLockStatus.EXECUTED: + self._operations(self.model.unit).mirror_result(outcome.op_id, outcome.result) # type: ignore[reportArgumentType] + case _: + raise RollingOpsDecodingError( + f'Unsupported run-with-lock outcome: {outcome.status}' + ) + + def get_status(self) -> RollingOpsStatus: + """Return the current rolling-ops status for this unit in peer mode. + + Status is derived from the local unit's peer-backed operation queue + and from the shared peer lock state. + + Returned values: + - INVALID: the peer relation does not exist + - GRANTED: the current unit holds the peer lock + - WAITING: the current unit has queued work but does not hold the lock + - IDLE: the current unit has no pending work + + Returns: + The current rolling-ops status for this unit. + """ + if self._relation is None: + return RollingOpsStatus.INVALID + + lock = self._lock() + operations = self._operations(self.model.unit) + + if lock.is_granted(self.model.unit.name): + return RollingOpsStatus.GRANTED + + if operations.has_pending_work(): + return RollingOpsStatus.WAITING + + return RollingOpsStatus.IDLE diff --git a/rollingops/src/charmlibs/rollingops/peer/_models.py b/rollingops/src/charmlibs/rollingops/peer/_models.py new file mode 100644 index 000000000..2ccce1271 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/peer/_models.py @@ -0,0 +1,392 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Models for peer-relation rollingops.""" + +import logging +from collections.abc import Iterator +from dataclasses import dataclass +from datetime import datetime +from enum import StrEnum + +from ops import Model, Unit + +from charmlibs.rollingops.common._exceptions import ( + RollingOpsDecodingError, + RollingOpsNoRelationError, +) +from charmlibs.rollingops.common._models import ( + Operation, + OperationQueue, + OperationResult, + UnitBackendState, +) +from charmlibs.rollingops.common._utils import datetime_to_str, now_timestamp, parse_timestamp + +logger = logging.getLogger(__name__) + + +class LockIntent(StrEnum): + """Unit-level lock intents stored in unit databags.""" + + REQUEST = 'request' + RETRY_RELEASE = 'retry-release' + RETRY_HOLD = 'retry-hold' + IDLE = 'idle' + + +@dataclass +class PeerAppData: + """Application-scoped peer relation data.""" + + granted_unit: str = '' + granted_at: str = '' + + @property + def granted_at_dt(self) -> datetime | None: + """Return the grant timestamp as a datetime, if present.""" + return parse_timestamp(self.granted_at) + + @granted_at_dt.setter + def granted_at_dt(self, value: datetime | None) -> None: + """Store the grant timestamp from a datetime.""" + self.granted_at = datetime_to_str(value) if value is not None else '' + + +@dataclass +class PeerUnitData: + """Unit-scoped peer relation data.""" + + state: str = '' + operations: str = '' + executed_at: str = '' + + @property + def intent(self) -> LockIntent: + """Return the unit state as a LockIntent.""" + return LockIntent(self.state) if self.state else LockIntent.IDLE + + @intent.setter + def intent(self, value: LockIntent) -> None: + """Store the unit state from a LockIntent.""" + self.state = value + + @property + def queue(self) -> OperationQueue: + """Return the stored operation queue.""" + return OperationQueue.from_string(self.operations) + + @queue.setter + def queue(self, value: OperationQueue) -> None: + """Store the operation queue.""" + 
self.operations = value.to_string() + + @property + def executed_at_dt(self) -> datetime | None: + """Return the execution timestamp as a datetime, if present.""" + return parse_timestamp(self.executed_at) + + @executed_at_dt.setter + def executed_at_dt(self, value: datetime | None) -> None: + """Store the execution timestamp from a datetime.""" + self.executed_at = datetime_to_str(value) if value is not None else '' + + +class PeerAppLock: + """Application-scoped distributed lock state.""" + + def __init__(self, model: Model, relation_name: str): + relation = model.get_relation(relation_name) + if relation is None: + raise RollingOpsNoRelationError() + + self._relation = relation + self._app = model.app + + def _load(self) -> PeerAppData: + return self._relation.load(PeerAppData, self._app, decoder=lambda s: s) + + def _save(self, data: PeerAppData) -> None: + self._relation.save(data, self._app, encoder=str) + + @property + def granted_unit(self) -> str: + """Return the unit name currently holding the grant, if any.""" + return self._load().granted_unit + + @property + def granted_at(self) -> datetime | None: + """Return the timestamp when the grant was issued, if any.""" + return self._load().granted_at_dt + + def grant(self, unit_name: str) -> None: + """Grant the lock to the provided unit.""" + data = self._load() + data.granted_unit = unit_name + data.granted_at_dt = now_timestamp() + self._save(data) + + def release(self) -> None: + """Clear the current grant.""" + data = self._load() + data.granted_unit = '' + data.granted_at_dt = None + self._save(data) + + def is_granted(self, unit_name: str) -> bool: + """Return whether the provided unit currently holds the grant.""" + return self.granted_unit == unit_name + + +class PeerUnitOperations: + """Unit-scoped queued operations and execution state.""" + + def __init__(self, model: Model, relation_name: str, unit: Unit): + relation = model.get_relation(relation_name) + if relation is None: + raise 
RollingOpsNoRelationError() + + self._relation = relation + self.unit = unit + self._backend_state = UnitBackendState(model, relation_name, unit) + + def _load(self) -> PeerUnitData: + return self._relation.load(PeerUnitData, self.unit, decoder=lambda s: s) + + def _save(self, data: PeerUnitData) -> None: + self._relation.save(data, self.unit, encoder=str) + + def is_peer_managed(self) -> bool: + """Return whether the peer backend should process this unit's queue.""" + return self._backend_state.is_peer_managed() + + @property + def intent(self) -> LockIntent: + """Return the current unit intent.""" + return self._load().intent + + @property + def executed_at(self) -> datetime | None: + """Return the last execution timestamp for this unit.""" + return self._load().executed_at_dt + + @property + def queue(self) -> OperationQueue: + return self._load().queue + + def get_current(self) -> Operation | None: + """Return the head operation, if any.""" + return self._load().queue.peek() + + def has_pending_work(self) -> bool: + """Return whether this unit still has queued work.""" + return self.get_current() is not None + + def request(self, operation: Operation) -> None: + """Enqueue an operation and mark this unit as requesting the lock.""" + data = self._load() + queue = data.queue + + previous_length = len(queue) + queue.enqueue(operation) + added = len(queue) != previous_length + if not added: + logger.info( + 'Operation %s not added to the peer queue. 
' + 'It already exists in the back of the queue.', + operation.callback_id, + ) + return + + data.queue = queue + if len(queue) == 1: + data.intent = LockIntent.REQUEST + self._save(data) + logger.info('Operation %s added to the peer queue.', operation.callback_id) + + def finish(self, result: OperationResult) -> None: + """Persist the result of executing the current operation.""" + data = self._load() + self._apply_result_to_data(data, result) + self._save(data) + + def _apply_result_to_data( + self, + data: PeerUnitData, + result: OperationResult, + ) -> None: + queue = data.queue + operation = queue.peek() + + if operation is None: + data.intent = LockIntent.IDLE + data.executed_at_dt = now_timestamp() + return + + match result: + case OperationResult.RETRY_HOLD: + queue.increase_attempt() + operation = queue.peek() + if operation is None or operation.is_max_retry_reached(): + logger.warning('Operation max retry reached. Dropping.') + queue.dequeue() + data.intent = LockIntent.REQUEST if queue.peek() else LockIntent.IDLE + else: + data.intent = LockIntent.RETRY_HOLD + + case OperationResult.RETRY_RELEASE: + queue.increase_attempt() + operation = queue.peek() + if operation is None or operation.is_max_retry_reached(): + logger.warning('Operation max retry reached. 
Dropping.') + queue.dequeue() + data.intent = LockIntent.REQUEST if queue.peek() else LockIntent.IDLE + else: + data.intent = LockIntent.RETRY_RELEASE + case _: + queue.dequeue() + data.intent = LockIntent.REQUEST if queue.peek() else LockIntent.IDLE + + data.queue = queue + data.executed_at_dt = now_timestamp() + + def should_run(self, lock: PeerAppLock) -> bool: + """Return whether this unit should execute now.""" + return ( + self.is_peer_managed() + and lock.is_granted(self.unit.name) + and not self._executed_after_grant(lock) + ) + + def should_release(self, lock: PeerAppLock) -> bool: + """Return whether this unit should release the lock.""" + return (self.is_peer_managed() and self.is_completed(lock)) or self._executed_after_grant( + lock + ) + + def is_waiting(self) -> bool: + """Return whether this unit is waiting for a fresh grant.""" + return self.is_peer_managed() and self.intent == LockIntent.REQUEST + + def is_waiting_retry(self) -> bool: + """Return whether this unit is waiting for a retry after releasing.""" + return self.is_peer_managed() and self.intent == LockIntent.RETRY_RELEASE + + def is_retry_hold(self) -> bool: + """Return whether this unit wants to retry while keeping priority.""" + return self.is_peer_managed() and self.intent == LockIntent.RETRY_HOLD + + def is_retry(self, lock: PeerAppLock) -> bool: + """Return whether this unit is in a retry state and currently granted.""" + return ( + self.is_peer_managed() + and self.intent + in { + LockIntent.RETRY_RELEASE, + LockIntent.RETRY_HOLD, + } + and lock.is_granted(self.unit.name) + ) + + def is_completed(self, lock: PeerAppLock) -> bool: + """Return whether this unit completed and still holds the grant.""" + return ( + self.is_peer_managed() + and self.intent == LockIntent.IDLE + and lock.is_granted(self.unit.name) + ) + + def requested_at(self) -> datetime | None: + """Return the timestamp of the current operation request, if any.""" + operation = self.get_current() + return 
operation.requested_at if operation is not None else None + + def _executed_after_grant(self, lock: PeerAppLock) -> bool: + """Return whether execution happened after the current grant.""" + granted_at = lock.granted_at + executed_at = self.executed_at + if granted_at is None or executed_at is None: + return False + return executed_at > granted_at + + def mirror_result(self, op_id: str, result: OperationResult) -> None: + """Apply an execution result to the mirrored peer queue. + + This keeps the peer copy aligned with the backend that actually executed + the operation. + + Raises: + RollingOpsDecodingError: if there is an inconsistency found. + """ + data = self._load() + current = data.queue.peek() + + if current is None: + logger.warning('Cannot mirror finalized operation: peer queue is empty.') + raise RollingOpsDecodingError('Inconsistent operation found.') + + if current.op_id != op_id: + logger.warning( + 'Cannot mirror finalized operation: peer head op_id=%s ' + 'does not match finalized op_id=%s.', + current.op_id, + op_id, + ) + raise RollingOpsDecodingError('Inconsistent operation found.') + + self._apply_result_to_data(data, result) + self._save(data) + + +def iter_peer_units(model: Model, relation_name: str) -> Iterator[Unit]: + """Yield all units currently participating in the peer relation, including self.""" + relation = model.get_relation(relation_name) + if relation is None: + raise RollingOpsNoRelationError() + + units = set(relation.units) + units.add(model.unit) + + yield from units + + +def pick_oldest_completed(operations_list: list[PeerUnitOperations]) -> str | None: + """Return the name of the unit with the oldest executed_at timestamp.""" + selected = None + oldest = None + + for operations in operations_list: + timestamp = operations.executed_at + if timestamp is None: + continue + if oldest is None or timestamp < oldest: + oldest = timestamp + selected = operations + + return selected.unit.name if selected is not None else None + + +def 
pick_oldest_request(operations_list: list[PeerUnitOperations]) -> str | None: + """Return the name of the unit with the oldest head operation.""" + selected = None + oldest = None + + for operations in operations_list: + timestamp = operations.requested_at() + if timestamp is None: + continue + if oldest is None or timestamp < oldest: + oldest = timestamp + selected = operations + + return selected.unit.name if selected is not None else None diff --git a/rollingops/src/charmlibs/rollingops/peer/_rollingops.py b/rollingops/src/charmlibs/rollingops/peer/_rollingops.py new file mode 100644 index 000000000..05bbb9081 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/peer/_rollingops.py @@ -0,0 +1,38 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def main():
    """Entry point of the background worker: dispatch a lock-granted Juju hook."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--unit-name', required=True)
    parser.add_argument('--charm-dir', required=True)
    options = parser.parse_args()

    setup_logging('/var/log/peer_rollingops_worker.log')

    # Give the leader unit a moment to leave the current hook before a new
    # one is dispatched.
    time.sleep(10)

    dispatch_lock_granted(options.unit_name, options.charm_dir)
+ + The worker state is coordinated through the peer relation application + databag to ensure that it remains accessible across leadership + changes. This guarantees that a newly elected leader can detect, + stop, or restart an existing worker process as needed. + """ + + _pid_field = 'peer-rollingops-worker-pid' + _log_filename = 'peer_rollingops_worker' + + def __init__(self, charm: CharmBase, relation_name: str): + super().__init__(charm, 'peer-rollingops-async-worker', relation_name) + + @property + def _app_data(self) -> RelationDataContent: + """Return the application databag in the peer relation.""" + return self._relation.data[self.model.app] # type: ignore[reportOptionalMemberAccess] + + def _worker_script_path(self) -> pathops.LocalPath: + """Return the path to the peer rolling-ops worker script. + + This script is executed in a background process to handle operation + processing for the peer backend. + """ + return pathops.LocalPath( + self._venv_site_packages() / 'charmlibs' / 'rollingops' / 'peer' / '_rollingops.py' + ) + + def _get_pid_str(self) -> str: + """Return the stored worker process PID as a string. + + The PID is persisted in the application databag of the peer relation. + If no relation is available or no PID is stored, an empty string is returned. + + Returns: + The worker process PID as a string, or an empty string if not set. + """ + if self._relation is None: + return '' + return self._app_data.get(self._pid_field, '') + + def _set_pid_str(self, pid: str) -> None: + """Persist the worker process PID in the peer relation databag. + + The PID is stored in the application databag because it is used + to trigger rolling operations on the leader and the leader may change. + + Args: + pid: The process identifier to store. + """ + if self._relation is None: + return + self._app_data.update({self._pid_field: pid}) + + def _on_existing_worker(self, pid: int) -> bool: + """Handle the presence of an already running worker process. 
+ + When an existing worker is detected, it is stopped before starting a + new one to ensure a single active worker per application. + + Args: + pid: The PID of the currently running worker. + + Returns: + True to indicate that the existing worker was handled and a new + worker can be started. + """ + logger.info('Stopping existing RollingOps worker PID %s before restart.', pid) + self.stop() + return True diff --git a/rollingops/src/charmlibs/rollingops/py.typed b/rollingops/src/charmlibs/rollingops/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/rollingops/tests/__init__.py b/rollingops/tests/__init__.py new file mode 100644 index 000000000..7ac25a9b2 --- /dev/null +++ b/rollingops/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/rollingops/tests/functional/conftest.py b/rollingops/tests/functional/conftest.py new file mode 100644 index 000000000..048308943 --- /dev/null +++ b/rollingops/tests/functional/conftest.py @@ -0,0 +1,15 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def test_version():
    """The package exposes its version as a string."""
    version = rollingops.__version__
    assert isinstance(version, str)
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/rollingops/tests/integration/charms/actions.yaml b/rollingops/tests/integration/charms/actions.yaml new file mode 100644 index 000000000..7128ad9f2 --- /dev/null +++ b/rollingops/tests/integration/charms/actions.yaml @@ -0,0 +1,44 @@ +# common actions.yaml file symlinked by these charms +# consider adding an action for each thing you want to test + +restart: + description: Restarts the example service + params: + delay: + description: "Introduce an artificial delay (for testing)." + type: integer + default: 0 + +failed-restart: + description: Example restart with a custom callback function. Used in testing + params: + delay: + description: "Introduce an artificial delay (for testing)." + type: integer + default: 0 + max-retry: + description: "Number of times the operation should be retried." + type: integer + +deferred-restart: + description: Example restart with a custom callback function. Used in testing + params: + delay: + description: "Introduce an artificial delay (for testing)." + type: integer + default: 0 + max-retry: + description: "Number of times the operation should be retried." + type: integer + +sync-restart: + description: Example restart with a custom callback function. Used in testing + params: + delay: + description: "Introduce an artificial delay (for testing)." + type: integer + default: 0 + timeout: + description: "Time (seconds) to wait before given up." 
+ type: integer + default: 60 diff --git a/rollingops/tests/integration/charms/common.py b/rollingops/tests/integration/charms/common.py new file mode 100644 index 000000000..a098677c7 --- /dev/null +++ b/rollingops/tests/integration/charms/common.py @@ -0,0 +1,169 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common charm code for integration test charms. + +This file is symlinked alongside src/charm.py by these charms. +""" + +import json +import logging +import time +from datetime import UTC, datetime +from typing import Any + +from ops import ActionEvent, CharmBase, Framework +from ops.model import ActiveStatus, MaintenanceStatus, WaitingStatus + +from charmlibs import pathops +from charmlibs.rollingops import ( + OperationResult, + RollingOpsManager, + SyncLockBackend, +) + +logger = logging.getLogger(__name__) + +TRACE_FILE = pathops.LocalPath('/var/lib/charm-rolling-ops/transitions.log') + + +def _now_timestamp_str() -> str: + """UTC timestamp as a epoch.""" + return str(datetime.now(UTC).timestamp()) + + +class MySyncBackend(SyncLockBackend): + def acquire(self, timeout: int | None) -> None: + logger.info('acquiring sync lock') + + def release(self) -> None: + logger.info('releasing sync lock') + + +class Charm(CharmBase): + """Charm the service.""" + + def __init__(self, framework: Framework): + super().__init__(framework) + callback_targets = { + '_restart': self._restart, + '_failed_restart': self._failed_restart, 
+ '_deferred_restart': self._deferred_restart, + } + + self.restart_manager = RollingOpsManager( + charm=self, + peer_relation_name='restart', + etcd_relation_name='etcd', + cluster_id='cluster-12345', + callback_targets=callback_targets, + sync_lock_targets={ + 'stop': MySyncBackend, + }, + ) + + self.framework.observe(self.on.restart_action, self._on_restart_action) + self.framework.observe(self.on.failed_restart_action, self._on_failed_restart_action) + self.framework.observe(self.on.deferred_restart_action, self._on_deferred_restart_action) + self.framework.observe(self.on.sync_restart_action, self._on_sync_restart_action) + + def _restart(self, delay: int = 0) -> None: + self._record_transition('_restart:start', delay=delay) + logger.info('Starting restart operation') + self.model.unit.status = MaintenanceStatus('Executing _restart operation') + time.sleep(int(delay)) + self.model.unit.status = ActiveStatus() + self._record_transition('_restart:done') + + def _failed_restart(self, delay: int = 0) -> OperationResult: + self._record_transition('_failed_restart:start', delay=delay) + logger.info('Starting failed restart operation') + self.model.unit.status = MaintenanceStatus('Executing _failed_restart operation') + time.sleep(int(delay)) + self.model.unit.status = MaintenanceStatus('Rolling _failed_restart operation failed') + self._record_transition('_failed_restart:retry_release') + return OperationResult.RETRY_RELEASE + + def _deferred_restart(self, delay: int = 0) -> OperationResult: + self._record_transition('_deferred_restart:start', delay=delay) + logger.info('Starting deferred restart operation') + self.model.unit.status = MaintenanceStatus('Executing _deferred_restart operation') + time.sleep(int(delay)) + self.model.unit.status = MaintenanceStatus('Rolling _deferred_restart operation failed') + self._record_transition('_deferred_restart:retry_hold', delay=delay) + return OperationResult.RETRY_HOLD + + def _on_restart_action(self, event: ActionEvent) -> 
None: + delay = event.params.get('delay') + self._record_transition('action:restart', delay=delay) + self.model.unit.status = WaitingStatus('Awaiting _restart operation') + self.restart_manager.request_async_lock(callback_id='_restart', kwargs={'delay': delay}) + + def _on_failed_restart_action(self, event: ActionEvent) -> None: + delay = event.params.get('delay') + max_retry = event.params.get('max-retry', None) + self._record_transition('action:failed-restart', delay=delay, max_retry=max_retry) + self.model.unit.status = WaitingStatus('Awaiting _failed_restart operation') + self.restart_manager.request_async_lock( + callback_id='_failed_restart', + kwargs={'delay': delay}, + max_retry=max_retry, + ) + + def _on_deferred_restart_action(self, event: ActionEvent) -> None: + delay = event.params.get('delay') + max_retry = event.params.get('max-retry', None) + self._record_transition('action:deferred-restart', delay=delay, max_retry=max_retry) + self.model.unit.status = WaitingStatus('Awaiting _deferred_restart operation') + self.restart_manager.request_async_lock( + callback_id='_deferred_restart', + kwargs={'delay': delay}, + max_retry=max_retry, + ) + + def _on_sync_restart_action(self, event: ActionEvent): + self.model.unit.status = WaitingStatus('Awaiting _sync_restart operation') + timeout = event.params.get('timeout', 60) + delay = event.params.get('delay') + self._record_transition('action:sync-restart', delay=delay, timeout=timeout) + + try: + with self.restart_manager.acquire_sync_lock(backend_id='stop', timeout=timeout): + self._record_transition('_sync_restart:start', delay=delay, timeout=timeout) + logger.info('Executing _sync_restart.') + self.model.unit.status = MaintenanceStatus('Executing _sync_restart operation') + time.sleep(int(event.params.get('delay', 0))) + self.model.unit.status = ActiveStatus('') + logger.info('Finished _sync_restart.') + self._record_transition('_sync_restart:done', delay=delay, timeout=timeout) + return + except 
TimeoutError: + self._record_transition('_sync_restart:timeout', delay=delay, timeout=timeout) + event.fail('Timed out acquiring sync lock') + + def _record_transition(self, name: str, **data: Any) -> None: + TRACE_FILE.parent.mkdir(parents=True, exist_ok=True) + state = self.restart_manager.state + payload = { + 'ts': _now_timestamp_str(), + 'unit': self.model.unit.name, + 'event': name, + 'rollingops_status': state.status.value if state.status else None, + 'processing_backend': state.processing_backend.value + if state.processing_backend + else None, + **data, + } + with TRACE_FILE.open('a', encoding='utf-8') as f: + f.write(json.dumps(payload) + '\n') diff --git a/rollingops/tests/integration/charms/k8s/actions.yaml b/rollingops/tests/integration/charms/k8s/actions.yaml new file mode 120000 index 000000000..9adaf92ea --- /dev/null +++ b/rollingops/tests/integration/charms/k8s/actions.yaml @@ -0,0 +1 @@ +../actions.yaml \ No newline at end of file diff --git a/rollingops/tests/integration/charms/k8s/charmcraft.yaml b/rollingops/tests/integration/charms/k8s/charmcraft.yaml new file mode 100644 index 000000000..60b495614 --- /dev/null +++ b/rollingops/tests/integration/charms/k8s/charmcraft.yaml @@ -0,0 +1,35 @@ +# common charmcraft.yaml file symlinked by these charms +# k8s charms can define containers + resources in metadata.yaml + +name: test +type: charm +summary: A small charm for use in integration tests. +description: A small charm for use in integration tests. + +base: ubuntu@24.04 +platforms: + amd64: + +parts: + charm: + source: . + plugin: uv + build-snaps: [astral-uv] + +containers: + workload: + resource: workload + +peers: + restart: + interface: rolling_op + +requires: + etcd: + interface: etcd_client + +resources: + workload: + type: oci-image + description: OCI image for the 'workload' container. 
+ upstream-source: some-repo/some-image:some-tag diff --git a/rollingops/tests/integration/charms/k8s/library/README.md b/rollingops/tests/integration/charms/k8s/library/README.md new file mode 120000 index 000000000..1dfab2425 --- /dev/null +++ b/rollingops/tests/integration/charms/k8s/library/README.md @@ -0,0 +1 @@ +../../../../../README.md \ No newline at end of file diff --git a/rollingops/tests/integration/charms/k8s/library/pyproject.toml b/rollingops/tests/integration/charms/k8s/library/pyproject.toml new file mode 120000 index 000000000..be00ff53f --- /dev/null +++ b/rollingops/tests/integration/charms/k8s/library/pyproject.toml @@ -0,0 +1 @@ +../../../../../pyproject.toml \ No newline at end of file diff --git a/rollingops/tests/integration/charms/k8s/library/src b/rollingops/tests/integration/charms/k8s/library/src new file mode 120000 index 000000000..d753b57a1 --- /dev/null +++ b/rollingops/tests/integration/charms/k8s/library/src @@ -0,0 +1 @@ +../../../../../src \ No newline at end of file diff --git a/rollingops/tests/integration/charms/k8s/pyproject.toml b/rollingops/tests/integration/charms/k8s/pyproject.toml new file mode 120000 index 000000000..1e11d7825 --- /dev/null +++ b/rollingops/tests/integration/charms/k8s/pyproject.toml @@ -0,0 +1 @@ +../pyproject.toml \ No newline at end of file diff --git a/rollingops/tests/integration/charms/k8s/src/charm.py b/rollingops/tests/integration/charms/k8s/src/charm.py new file mode 100644 index 000000000..abb93f656 --- /dev/null +++ b/rollingops/tests/integration/charms/k8s/src/charm.py @@ -0,0 +1,40 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class RollingOpsCharm(common.Charm):
    """Kubernetes variant of the integration-test charm."""

    def __init__(self, framework: ops.Framework):
        super().__init__(framework)
        framework.observe(self.on[CONTAINER].pebble_ready, self._on_pebble_ready)

    def _on_pebble_ready(self, event: ops.PebbleReadyEvent):
        """Mark the unit active once the workload container is ready."""
        self.unit.status = ops.ActiveStatus()
resources in metadata.yaml + +name: test +type: charm +summary: A small charm for use in integration tests. +description: A small charm for use in integration tests. + +base: ubuntu@24.04 +platforms: + amd64: + +parts: + charm: + source: . + plugin: uv + build-snaps: [astral-uv] + +peers: + restart: + interface: rolling_op + +requires: + etcd: + interface: etcd_client diff --git a/rollingops/tests/integration/charms/machine/library/README.md b/rollingops/tests/integration/charms/machine/library/README.md new file mode 120000 index 000000000..1dfab2425 --- /dev/null +++ b/rollingops/tests/integration/charms/machine/library/README.md @@ -0,0 +1 @@ +../../../../../README.md \ No newline at end of file diff --git a/rollingops/tests/integration/charms/machine/library/pyproject.toml b/rollingops/tests/integration/charms/machine/library/pyproject.toml new file mode 120000 index 000000000..be00ff53f --- /dev/null +++ b/rollingops/tests/integration/charms/machine/library/pyproject.toml @@ -0,0 +1 @@ +../../../../../pyproject.toml \ No newline at end of file diff --git a/rollingops/tests/integration/charms/machine/library/src b/rollingops/tests/integration/charms/machine/library/src new file mode 120000 index 000000000..d753b57a1 --- /dev/null +++ b/rollingops/tests/integration/charms/machine/library/src @@ -0,0 +1 @@ +../../../../../src \ No newline at end of file diff --git a/rollingops/tests/integration/charms/machine/pyproject.toml b/rollingops/tests/integration/charms/machine/pyproject.toml new file mode 120000 index 000000000..1e11d7825 --- /dev/null +++ b/rollingops/tests/integration/charms/machine/pyproject.toml @@ -0,0 +1 @@ +../pyproject.toml \ No newline at end of file diff --git a/rollingops/tests/integration/charms/machine/src/charm.py b/rollingops/tests/integration/charms/machine/src/charm.py new file mode 100644 index 000000000..350b11cc3 --- /dev/null +++ b/rollingops/tests/integration/charms/machine/src/charm.py @@ -0,0 +1,49 @@ +# Copyright 2026 Canonical 
class RollingOpsCharm(common.Charm):
    """Machine variant of the integration-test charm."""

    def __init__(self, framework: ops.Framework):
        super().__init__(framework)
        framework.observe(self.on.start, self._on_start)
        framework.observe(self.on.install, self._on_install)

    def _on_start(self, event: ops.StartEvent):
        """Mark the unit active on start."""
        self.unit.status = ops.ActiveStatus()

    def _on_install(self, event: ops.InstallEvent) -> None:
        """Install the etcd client package.

        Installation errors are logged rather than raised: the charm stays up
        and the operator can retry.
        """
        try:
            apt.update()
            apt.add_package('etcd-client')
        except apt.PackageError as e:
            logger.error('could not install package. Reason: %s', e.message)
def pytest_addoption(parser: pytest.Parser):
    """Register the ``--keep-models`` command line option.

    Note: this hook receives a ``pytest.Parser`` — the previous annotation
    (``pytest.OptionGroup``) is the type returned by ``parser.getgroup``,
    not the hook argument.
    """
    parser.addoption(
        '--keep-models',
        action='store_true',
        default=False,
        help='keep temporarily-created models',
    )


@pytest.fixture(scope='session')
def app_name() -> str:
    """Return the default application name."""
    return 'test'  # determined by test charms' charmcraft.yaml


@pytest.fixture(scope='session')
def charm() -> pathlib.Path:
    """Return the packed charm path."""
    substrate = os.environ['CHARMLIBS_SUBSTRATE']
    # tag = os.environ.get('CHARMLIBS_TAG', '')  # get the tag if needed
    return pathlib.Path(__file__).parent / '.packed' / f'{substrate}.charm'  # set by pack.sh


@pytest.fixture(scope='module')
def juju(
    request: pytest.FixtureRequest, charm: pathlib.Path, app_name: str
) -> Iterator[jubilant.Juju]:
    """Pytest fixture that wraps :meth:`jubilant.with_model`.

    This adds command line parameter ``--keep-models`` (see help for details).
    On test failure, the Juju debug log is dumped to stderr before teardown.
    """
    keep_models = typing.cast('bool', request.config.getoption('--keep-models'))
    with jubilant.temp_model(keep=keep_models) as juju:
        juju.model_config({'logging-config': '=INFO;unit=DEBUG'})
        _deploy(juju, charm=charm, app_name=app_name)
        juju.wait(jubilant.all_active, timeout=15 * 60.0)
        yield juju
        if request.session.testsfailed:
            logger.info('Collecting Juju logs ...')
            time.sleep(0.5)  # Wait for Juju to process logs.
            log = juju.debug_log(limit=0)
            print(log, end='', file=sys.stderr)


def _deploy(juju: jubilant.Juju, charm: pathlib.Path, app_name: str, num_units: int = 1) -> None:
    """Deploy the packed test charm, adding the workload OCI image on k8s."""
    substrate = os.environ['CHARMLIBS_SUBSTRATE']
    if substrate == 'k8s':
        juju.deploy(
            charm, app=app_name, num_units=num_units, resources={'workload': 'ubuntu:latest'}
        )
    else:
        juju.deploy(charm, app=app_name, num_units=num_units)
For example: +# just tag=24.04 pack-k8s some extra args +set -xueo pipefail + +TMP_DIR=".tmp" # clean temporary directory where charms will be packed +PACKED_DIR=".packed" # where packed charms will be placed with name expected in conftest.py + +: copy charm files to temporary directory for packing, dereferencing symlinks +rm -rf "$TMP_DIR" +cp --recursive --dereference "charms/$CHARMLIBS_SUBSTRATE/" "$TMP_DIR" + +: pack charm +cd "$TMP_DIR" +uv lock # required by uv charm plugin +charmcraft pack +cd - + +: place packed charm in expected location +mkdir -p "$PACKED_DIR" # -p means create parents and don't complain if dir already exists +mv "$TMP_DIR"/*.charm "$PACKED_DIR/$CHARMLIBS_SUBSTRATE.charm" # read by conftest.py diff --git a/rollingops/tests/integration/test_etcd_rolling_ops.py b/rollingops/tests/integration/test_etcd_rolling_ops.py new file mode 100644 index 000000000..21c84fe35 --- /dev/null +++ b/rollingops/tests/integration/test_etcd_rolling_ops.py @@ -0,0 +1,502 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Integration tests using real Juju and pre-packed charm(s)."""

import logging
import time
from pathlib import Path

import jubilant
import pytest
from tenacity import retry, stop_after_delay, wait_fixed

from tests.integration.utils import (
    get_unit_events,
    is_empty_file,
    parse_ts,
    remove_transition_file,
)

logger = logging.getLogger(__name__)
TIMEOUT = 15 * 60.0
ETCD_PROCESS_LOGS = '/var/log/etcd_rollingops_worker.log'
# NOTE: constant renamed from the misspelled PEER_PROCCES_LOGS; the log path
# itself (which comes from the charm) is unchanged.
PEER_PROCESS_LOGS = '/var/log/peer_rollingops_worker.log'
ETCD_CONFIG_FILE = '/var/lib/rollingops/etcd/etcdctl.json'


def etcdctl_file_exists(juju: jubilant.Juju, unit: str) -> bool:
    """Return True if the etcdctl config file is present on *unit*.

    ``juju.exec`` raises ``jubilant.TaskError`` when the command exits
    non-zero (i.e. when the file is missing), so that case must be caught
    and reported as ``False`` rather than propagating out of the retry loop.
    """
    try:
        task = juju.exec(f'test -f {ETCD_CONFIG_FILE}', unit=unit)
    except jubilant.TaskError:
        return False
    return task.status == 'completed' and task.return_code == 0


@retry(wait=wait_fixed(10), stop=stop_after_delay(60), reraise=True)
def wait_for_etcdctl_config_file(juju: jubilant.Juju, unit: str) -> None:
    """Poll (every 10s, up to 60s) until the etcdctl config file appears on *unit*."""
    if not etcdctl_file_exists(juju, unit):
        raise RuntimeError('etcdctl config file not ready')


def test_deploy(juju: jubilant.Juju, app_name: str):
    """The deployment takes place in the module scoped `juju` fixture."""
    assert app_name in juju.status().apps


@pytest.mark.machine_only
def test_charm_is_integrated_with_etcd(juju: jubilant.Juju, app_name: str):
    """Deploy etcd + TLS and integrate the charm with the etcd client endpoint."""
    juju.deploy(
        'self-signed-certificates',
        app='self-signed-certificates',
        channel='1/stable',
    )
    juju.deploy(
        'charmed-etcd',
        app='etcd',
        channel='3.6/stable',
    )
    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    juju.integrate(
        'etcd:client-certificates',
        'self-signed-certificates:certificates',
    )
    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    juju.integrate(f'{app_name}:etcd', 'etcd:etcd-client')
    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    wait_for_etcdctl_config_file(juju, f'{app_name}/0')


@pytest.mark.machine_only
def test_restart_action_one_unit_single_app(juju: jubilant.Juju, app_name: str):
    """A single restart runs start/done through the etcd backend only."""
    unit = f'{app_name}/0'

    juju.run(unit, 'restart', {'delay': 1}, wait=TIMEOUT)
    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    events = get_unit_events(juju, unit)
    restart_events = [
        (e['event'], e['processing_backend'])
        for e in events
        if not e['event'].startswith('action')
    ]
    expected = [
        ('_restart:start', 'etcd'),
        ('_restart:done', 'etcd'),
    ]

    assert restart_events == expected, f'unexpected event order: {restart_events}'
    assert not is_empty_file(juju, unit, ETCD_PROCESS_LOGS)
    assert is_empty_file(juju, unit, PEER_PROCESS_LOGS)


@pytest.mark.machine_only
def test_failed_restart_retries_one_unit_single_app(juju: jubilant.Juju, app_name: str):
    """A failing restart retries via release, then a plain restart succeeds."""
    unit = f'{app_name}/0'
    remove_transition_file(juju, unit)

    juju.run(unit, 'failed-restart', {'delay': 1, 'max-retry': 1})
    juju.run(unit, 'restart', {'delay': 1})
    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    events = get_unit_events(juju, unit)
    restart_events = [
        (e['event'], e['processing_backend'])
        for e in events
        if not e['event'].startswith('action')
    ]

    expected = [
        ('_failed_restart:start', 'etcd'),  # attempt 0
        ('_failed_restart:retry_release', 'etcd'),
        ('_failed_restart:start', 'etcd'),  # retry 1
        ('_failed_restart:retry_release', 'etcd'),
        ('_restart:start', 'etcd'),
        ('_restart:done', 'etcd'),
    ]
    assert restart_events == expected, f'unexpected event order: {restart_events}'
    assert is_empty_file(juju, unit, PEER_PROCESS_LOGS)


@pytest.mark.machine_only
def test_assert_deferred_restart_retries_one_unit_single_app(juju: jubilant.Juju, app_name: str):
    """A deferred restart retries while holding the lock, then restart succeeds."""
    unit = f'{app_name}/0'
    remove_transition_file(juju, unit)

    juju.run(unit, 'deferred-restart', {'delay': 1, 'max-retry': 1}, wait=TIMEOUT)
    juju.run(unit, 'restart', {'delay': 1})
    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    events = get_unit_events(juju, unit)
    restart_events = [
        (e['event'], e['processing_backend'])
        for e in events
        if not e['event'].startswith('action')
    ]

    expected = [
        ('_deferred_restart:start', 'etcd'),  # attempt 0
        ('_deferred_restart:retry_hold', 'etcd'),
        ('_deferred_restart:start', 'etcd'),  # retry 1
        ('_deferred_restart:retry_hold', 'etcd'),
        ('_restart:start', 'etcd'),
        ('_restart:done', 'etcd'),
    ]
    assert restart_events == expected, f'unexpected event order: {restart_events}'
    assert is_empty_file(juju, unit, PEER_PROCESS_LOGS)


@pytest.mark.machine_only
def test_assert_restart_rolls_one_unit_at_a_time_single_app(juju: jubilant.Juju, app_name: str):
    """With 5 units, restarts execute strictly one start/done pair at a time."""
    juju.add_unit(app=app_name, num_units=4)
    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    status = juju.status()
    units = sorted(status.apps[app_name].units)
    for unit in units:
        remove_transition_file(juju, unit)

    for unit in units:
        juju.run(unit, 'restart', {'delay': 15})
    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    all_events: list[dict[str, str]] = []
    for unit in units:
        events = get_unit_events(juju, unit)
        assert len(events) == 3
        all_events.extend(events)

    restart_events = [e for e in all_events if not e['event'].startswith('action')]
    restart_events.sort(key=parse_ts)

    logger.info(restart_events)

    assert len(restart_events) == len(units) * 2
    for i in range(0, len(restart_events), 2):
        start_event = restart_events[i]
        done_event = restart_events[i + 1]

        assert start_event['event'] == '_restart:start'
        assert done_event['event'] == '_restart:done'
        assert start_event['unit'] == done_event['unit']
        assert start_event['processing_backend'] == 'etcd'
        assert done_event['processing_backend'] == 'etcd'
    for unit in units:
        assert is_empty_file(juju, unit, PEER_PROCESS_LOGS)


@pytest.mark.machine_only
def test_retry_hold_operation_two_units_single_app(juju: jubilant.Juju, app_name: str):
    """retry_hold keeps the lock on the failing unit until its retries exhaust."""
    status = juju.status()
    units = sorted(status.apps[app_name].units)

    for unit in units:
        remove_transition_file(juju, unit)

    unit_a = units[1]
    unit_b = units[3]

    juju.run(unit_a, 'deferred-restart', {'delay': 15, 'max-retry': 2}, wait=TIMEOUT)
    juju.run(unit_b, 'restart', {'delay': 2}, wait=TIMEOUT)

    juju.wait(
        lambda status: status.apps[app_name].units[unit_b].is_active,
        error=jubilant.any_error,
        timeout=TIMEOUT,
    )

    all_events: list[dict[str, str]] = []
    all_events.extend(get_unit_events(juju, unit_a))
    all_events.extend(get_unit_events(juju, unit_b))
    all_events.sort(key=parse_ts)

    logger.info(all_events)

    relevant_events = [e for e in all_events if not e['event'].startswith('action')]
    sequence = [(e['unit'], e['event'], e['processing_backend']) for e in relevant_events]

    logger.info(sequence)

    assert sequence == [
        (unit_a, '_deferred_restart:start', 'etcd'),  # attempt 0
        (unit_a, '_deferred_restart:retry_hold', 'etcd'),
        (unit_a, '_deferred_restart:start', 'etcd'),  # retry 1
        (unit_a, '_deferred_restart:retry_hold', 'etcd'),
        (unit_a, '_deferred_restart:start', 'etcd'),  # retry 2
        (unit_a, '_deferred_restart:retry_hold', 'etcd'),
        (unit_b, '_restart:start', 'etcd'),
        (unit_b, '_restart:done', 'etcd'),
    ], f'unexpected event sequence: {sequence}'

    for unit in units:
        assert is_empty_file(juju, unit, PEER_PROCESS_LOGS)


@pytest.mark.machine_only
def test_retry_release_two_units_single_app(juju: jubilant.Juju, app_name: str):
    """retry_release lets two failing units alternate lock acquisition."""
    status = juju.status()
    units = sorted(status.apps[app_name].units)
    for unit in units:
        remove_transition_file(juju, unit)

    unit_a = units[2]
    unit_b = units[4]

    juju.run(unit_a, 'failed-restart', {'delay': 10, 'max-retry': 2}, wait=TIMEOUT)
    juju.run(unit_b, 'failed-restart', {'delay': 15, 'max-retry': 2}, wait=TIMEOUT)

    time.sleep(
        60 * 3
    )  # wait for operation execution. TODO: in charm use lock state to clear status.

    all_events: list[dict[str, str]] = []
    all_events.extend(get_unit_events(juju, unit_a))
    all_events.extend(get_unit_events(juju, unit_b))
    all_events.sort(key=parse_ts)

    restart_events = [e for e in all_events if not e['event'].startswith('action')]
    restart_events.sort(key=parse_ts)

    logger.info(restart_events)

    assert len(restart_events) == 2 * 2 * 3  # 2 units * 2 events * 3 executions
    for i in range(0, len(restart_events), 2):
        start_event = restart_events[i]
        done_event = restart_events[i + 1]

        assert start_event['event'] == '_failed_restart:start'
        assert done_event['event'] == '_failed_restart:retry_release'
        assert start_event['unit'] == done_event['unit']
        assert start_event['processing_backend'] == 'etcd'
        assert done_event['processing_backend'] == 'etcd'

    for unit in units:
        assert is_empty_file(juju, unit, PEER_PROCESS_LOGS)


@pytest.mark.machine_only
def test_subsequent_lock_request_ops_single_app(juju: jubilant.Juju, app_name: str):
    """Queued lock requests of different operations execute in submission order."""
    status = juju.status()
    units = sorted(status.apps[app_name].units)
    for unit in units:
        remove_transition_file(juju, unit)

    unit_a = units[3]

    juju.run(unit_a, 'deferred-restart', {'delay': 1, 'max-retry': 1})
    for _ in range(3):
        juju.run(unit_a, 'failed-restart', {'delay': 1, 'max-retry': 0})
    juju.run(unit_a, 'restart', {'delay': 1})

    juju.wait(
        lambda status: status.apps[app_name].units[unit_a].is_active,
        error=jubilant.any_error,
        timeout=TIMEOUT,
    )

    unit_a_events = get_unit_events(juju, unit_a)
    relevant_events = [
        (e['event'], e['processing_backend'])
        for e in unit_a_events
        if not e['event'].startswith('action')
    ]
    logger.info('unit_a_events %s', unit_a_events)

    assert relevant_events == [
        ('_deferred_restart:start', 'etcd'),  # attempt 0
        ('_deferred_restart:retry_hold', 'etcd'),
        ('_deferred_restart:start', 'etcd'),  # retry 1
        ('_deferred_restart:retry_hold', 'etcd'),
        ('_failed_restart:start', 'etcd'),  # attempt 0
        ('_failed_restart:retry_release', 'etcd'),
        ('_restart:start', 'etcd'),
        ('_restart:done', 'etcd'),
    ], f'unexpected event sequence: {relevant_events}'

    for unit in units:
        assert is_empty_file(juju, unit, PEER_PROCESS_LOGS)


@pytest.mark.machine_only
def test_rolling_ops_multi_app(juju: jubilant.Juju, charm: Path, app_name: str):
    """Two apps sharing the etcd lock still roll one unit at a time overall."""
    second_app = f'{app_name}-secondary'
    juju.deploy(charm, app=second_app, num_units=3)
    juju.wait(
        lambda status: jubilant.all_active(status, second_app),
        error=jubilant.any_error,
        timeout=TIMEOUT,
    )
    juju.integrate(f'{second_app}:etcd', 'etcd:etcd-client')

    juju.wait(
        lambda status: jubilant.all_active(status, second_app, 'etcd'),
        error=jubilant.any_error,
        timeout=TIMEOUT,
    )

    primary_units = sorted(juju.status().apps[app_name].units.keys())
    secondary_units = sorted(juju.status().apps[second_app].units.keys())
    all_units: list[str] = primary_units + secondary_units

    for unit in all_units:
        remove_transition_file(juju, unit)
        wait_for_etcdctl_config_file(juju, unit)

    for unit in all_units:
        juju.run(unit, 'restart', {'delay': 10}, wait=TIMEOUT)

    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    all_events: list[dict[str, str]] = []

    for unit in all_units:
        events = get_unit_events(juju, unit)
        assert len(events) == 3
        all_events.extend(events)

    restart_events = [e for e in all_events if not e['event'].startswith('action')]
    restart_events.sort(key=parse_ts)

    logger.info(restart_events)

    assert len(restart_events) == len(all_units) * 2
    for i in range(0, len(restart_events), 2):
        start_event = restart_events[i]
        done_event = restart_events[i + 1]

        assert start_event['event'] == '_restart:start'
        assert done_event['event'] == '_restart:done'
        assert start_event['unit'] == done_event['unit']
        assert start_event['processing_backend'] == 'etcd'
        assert done_event['processing_backend'] == 'etcd'

    for unit in all_units:
        assert is_empty_file(juju, unit, PEER_PROCESS_LOGS)


@pytest.mark.machine_only
def test_rolling_ops_sync_lock_multi_app(juju: jubilant.Juju, app_name: str):
    """Synchronous lock requests from two apps serialize through etcd."""
    second_app = f'{app_name}-secondary'
    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    primary_units = sorted(juju.status().apps[app_name].units.keys())
    secondary_units = sorted(juju.status().apps[second_app].units.keys())
    all_units: list[str] = primary_units + secondary_units

    for unit in all_units:
        remove_transition_file(juju, unit)
        wait_for_etcdctl_config_file(juju, unit)

    unit_a = primary_units[1]
    unit_b = secondary_units[1]

    # `--background` so both sync-restart actions overlap in time.
    juju.cli('run', unit_a, 'sync-restart', 'delay=15', '--background')
    time.sleep(2)
    juju.cli('run', unit_b, 'sync-restart', 'delay=15', '--background')

    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    all_events: list[dict[str, str]] = []

    for unit in {unit_a, unit_b}:
        events = get_unit_events(juju, unit)
        assert len(events) == 3
        all_events.extend(events)

    all_events.sort(key=parse_ts)
    restart_events = [
        (e['unit'], e['event'], e['processing_backend'])
        for e in all_events
        if not e['event'].startswith('action')
    ]

    logger.info(restart_events)

    assert restart_events == [
        (unit_a, '_sync_restart:start', 'etcd'),
        (unit_a, '_sync_restart:done', 'etcd'),
        (unit_b, '_sync_restart:start', 'etcd'),
        (unit_b, '_sync_restart:done', 'etcd'),
    ], f'unexpected event sequence: {restart_events}'

    for unit in all_units:
        assert is_empty_file(juju, unit, PEER_PROCESS_LOGS)


@pytest.mark.machine_only
def test_lock_released_when_unit_removed(juju: jubilant.Juju, app_name: str) -> None:
    """Removing the lock-holding unit releases the lock for the waiting unit."""
    units = sorted(juju.status().apps[app_name].units.keys())
    for unit in units:
        remove_transition_file(juju, unit)
    unit_a = units[1]
    unit_b = units[2]

    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    juju.run(unit_a, 'deferred-restart', {'delay': 15})
    time.sleep(5)
    juju.run(unit_b, 'restart', {'delay': 2})

    juju.remove_unit(unit_a)

    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    unit_b_events = get_unit_events(juju, unit_b)
    relevant_events = [
        (e['event'], e['processing_backend'])
        for e in unit_b_events
        if not e['event'].startswith('action')
    ]

    logger.info('unit_b_events %s', unit_b_events)

    assert relevant_events == [
        ('_restart:start', 'etcd'),
        ('_restart:done', 'etcd'),
    ], f'unexpected event sequence: {relevant_events}'


@pytest.mark.machine_only
def test_actions_still_work_after_etcd_relation_removed(
    juju: jubilant.Juju, app_name: str
) -> None:
    """Pending operations fall back to the peer backend once etcd is gone."""
    units = sorted(juju.status().apps[app_name].units.keys())
    for unit in units:
        remove_transition_file(juju, unit)

    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    unit_a = units[3]

    juju.run(unit_a, 'failed-restart', {'delay': 10, 'max-retry': 2})
    for i in range(3):
        juju.run(unit_a, 'restart', {'delay': i})

    juju.remove_relation(f'{app_name}:etcd', 'etcd:etcd-client')

    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    unit_a_events = get_unit_events(juju, unit_a)
    relevant_events = [e['event'] for e in unit_a_events if not e['event'].startswith('action')]

    logger.info('unit_a_events %s', unit_a_events)

    # During fallback if the execution is not fully committed to etcd, it may
    # be re-executed on the peer context.
    assert relevant_events.count('_failed_restart:start') >= 3, relevant_events
    assert relevant_events.count('_failed_restart:retry_release') >= 3, relevant_events
    assert relevant_events.count('_restart:start') >= 3, relevant_events
    assert relevant_events.count('_restart:done') >= 3, relevant_events
"""Integration tests using real Juju and pre-packed charm(s).

These exercise the peer-relation locking backend: no etcd relation exists in
this module, so every event is expected to carry ``processing_backend ==
'peer'``.
"""

import logging
import time

import jubilant

from tests.integration.utils import (
    get_leader_unit_name,
    get_unit_events,
    parse_ts,
    remove_transition_file,
)

logger = logging.getLogger(__name__)
TIMEOUT = 15 * 60.0


def test_deploy(juju: jubilant.Juju, app_name: str):
    """The deployment takes place in the module scoped `juju` fixture."""
    assert app_name in juju.status().apps


def test_restart_action_one_unit(juju: jubilant.Juju, app_name: str):
    """A single restart produces action/start/done, all on the peer backend."""
    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)
    unit = f'{app_name}/0'

    juju.run(unit, 'restart', {'delay': 1}, wait=TIMEOUT)

    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    events = get_unit_events(juju, unit)
    restart_events = [e['event'] for e in events]

    expected = [
        'action:restart',
        '_restart:start',
        '_restart:done',
    ]

    assert restart_events == expected, f'unexpected event order: {restart_events}'
    assert all(e['processing_backend'] == 'peer' for e in events)


def test_failed_restart_retries_one_unit(juju: jubilant.Juju, app_name: str):
    """A failing restart is retried (releasing between attempts) max-retry times."""
    unit = f'{app_name}/0'

    remove_transition_file(juju, unit)
    juju.run(unit, 'failed-restart', {'delay': 1, 'max-retry': 2})
    juju.run(unit, 'restart', {'delay': 1})

    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    events = get_unit_events(juju, unit)
    restart_events = [e['event'] for e in events if not e['event'].startswith('action')]

    expected = [
        '_failed_restart:start',  # attempt 0
        '_failed_restart:retry_release',
        '_failed_restart:start',  # retry 1
        '_failed_restart:retry_release',
        '_failed_restart:start',  # retry 2
        '_failed_restart:retry_release',
        '_restart:start',
        '_restart:done',
    ]

    assert restart_events == expected, f'unexpected event order: {restart_events}'
    assert all(e['processing_backend'] == 'peer' for e in events)


def test_assert_deferred_restart_retries_one_unit(juju: jubilant.Juju, app_name: str):
    """A deferred restart is retried (holding the lock between attempts)."""
    unit = f'{app_name}/0'

    remove_transition_file(juju, unit)
    juju.run(unit, 'deferred-restart', {'delay': 1, 'max-retry': 2}, wait=TIMEOUT)
    juju.run(unit, 'restart', {'delay': 1})

    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    events = get_unit_events(juju, unit)
    restart_events = [e['event'] for e in events if not e['event'].startswith('action')]

    expected = [
        '_deferred_restart:start',  # attempt 0
        '_deferred_restart:retry_hold',
        '_deferred_restart:start',  # retry 1
        '_deferred_restart:retry_hold',
        '_deferred_restart:start',  # retry 2
        '_deferred_restart:retry_hold',
        '_restart:start',
        '_restart:done',
    ]

    assert restart_events == expected, f'unexpected event order: {restart_events}'
    assert all(e['processing_backend'] == 'peer' for e in events)


def test_assert_restart_rolls_one_unit_at_a_time(juju: jubilant.Juju, app_name: str):
    """With 5 units, start/done pairs never interleave across units."""
    juju.add_unit(app=app_name, num_units=4)
    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    status = juju.status()
    units = sorted(status.apps[app_name].units)

    for unit in units:
        remove_transition_file(juju, unit)

    for unit in units:
        juju.run(unit, 'restart', {'delay': 2})

    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    all_events: list[dict[str, str]] = []
    for unit in units:
        events = get_unit_events(juju, unit)
        assert len(events) == 3
        all_events.extend(events)

    restart_events = [e for e in all_events if e['event'] in {'_restart:start', '_restart:done'}]
    restart_events.sort(key=parse_ts)

    logger.info(restart_events)

    for i in range(0, len(restart_events), 2):
        start_event = restart_events[i]
        done_event = restart_events[i + 1]

        assert start_event['event'] == '_restart:start'
        assert done_event['event'] == '_restart:done'
        assert start_event['unit'] == done_event['unit'], (
            f'start/done pair mismatch: {start_event} vs {done_event}'
        )
    assert all(e['processing_backend'] == 'peer' for e in all_events)


def test_retry_hold_keeps_lock_on_same_unit(juju: jubilant.Juju, app_name: str):
    """retry_hold keeps the lock on the deferring unit until retries exhaust."""
    status = juju.status()
    units = sorted(status.apps[app_name].units)

    for unit in units:
        remove_transition_file(juju, unit)

    unit_a = units[1]
    unit_b = units[3]

    juju.run(unit_a, 'deferred-restart', {'delay': 15, 'max-retry': 2}, wait=TIMEOUT)
    juju.run(unit_b, 'restart', {'delay': 2}, wait=TIMEOUT)

    juju.wait(
        lambda status: status.apps[app_name].units[unit_b].is_active,
        error=jubilant.any_error,
        timeout=TIMEOUT,
    )

    all_events: list[dict[str, str]] = []
    all_events.extend(get_unit_events(juju, unit_a))
    all_events.extend(get_unit_events(juju, unit_b))
    all_events.sort(key=parse_ts)

    logger.info(all_events)

    relevant_events = [
        e
        for e in all_events
        if e['event']
        in {
            '_deferred_restart:start',
            '_deferred_restart:retry_hold',
            '_restart:start',
            '_restart:done',
        }
    ]

    sequence = [(e['unit'], e['event']) for e in relevant_events]

    logger.info(sequence)

    assert sequence == [
        (unit_a, '_deferred_restart:start'),  # attempt 0
        (unit_a, '_deferred_restart:retry_hold'),
        (unit_a, '_deferred_restart:start'),  # retry 1
        (unit_a, '_deferred_restart:retry_hold'),
        (unit_a, '_deferred_restart:start'),  # retry 2
        (unit_a, '_deferred_restart:retry_hold'),
        (unit_b, '_restart:start'),
        (unit_b, '_restart:done'),
    ], f'unexpected event sequence: {sequence}'
    assert all(e['processing_backend'] == 'peer' for e in all_events)


def test_retry_release_alternates_execution(juju: jubilant.Juju, app_name: str):
    """retry_release lets two failing units alternate lock ownership."""
    status = juju.status()
    units = sorted(status.apps[app_name].units)
    for unit in units:
        remove_transition_file(juju, unit)

    unit_a = units[2]
    unit_b = units[4]

    juju.run(unit_a, 'failed-restart', {'delay': 10, 'max-retry': 2}, wait=TIMEOUT)
    juju.run(unit_b, 'failed-restart', {'delay': 1, 'max-retry': 2}, wait=TIMEOUT)

    time.sleep(60)  # wait for operation execution. TODO: in charm use lock state to clear status.

    all_events: list[dict[str, str]] = []
    all_events.extend(get_unit_events(juju, unit_a))
    all_events.extend(get_unit_events(juju, unit_b))
    all_events.sort(key=parse_ts)

    logger.info(all_events)

    relevant_events = [
        e
        for e in all_events
        if e['event'] in {'_failed_restart:start', '_failed_restart:retry_release'}
    ]

    sequence = [(e['unit'], e['event']) for e in relevant_events]

    logger.info(sequence)

    assert sequence == [
        (unit_a, '_failed_restart:start'),  # attempt 0
        (unit_a, '_failed_restart:retry_release'),
        (unit_b, '_failed_restart:start'),  # attempt 0
        (unit_b, '_failed_restart:retry_release'),
        (unit_a, '_failed_restart:start'),  # retry 1
        (unit_a, '_failed_restart:retry_release'),
        (unit_b, '_failed_restart:start'),  # retry 1
        (unit_b, '_failed_restart:retry_release'),
        (unit_a, '_failed_restart:start'),  # retry 2
        (unit_a, '_failed_restart:retry_release'),
        (unit_b, '_failed_restart:start'),  # retry 2
        (unit_b, '_failed_restart:retry_release'),
    ], f'unexpected event sequence: {sequence}'
    assert all(e['processing_backend'] == 'peer' for e in all_events)


def test_subsequent_lock_request_of_different_ops(juju: jubilant.Juju, app_name: str):
    """Queued requests for different operations run in submission order."""
    status = juju.status()
    units = sorted(status.apps[app_name].units)
    for unit in units:
        remove_transition_file(juju, unit)

    unit_a = units[3]
    unit_b = units[4]

    juju.run(unit_b, 'deferred-restart', {'delay': 10, 'max-retry': 2})
    juju.run(unit_a, 'failed-restart', {'delay': 1, 'max-retry': 2})
    juju.run(unit_a, 'deferred-restart', {'delay': 1, 'max-retry': 0})
    juju.run(unit_a, 'restart', {'delay': 1})

    juju.wait(
        lambda status: status.apps[app_name].units[unit_a].is_active,
        error=jubilant.any_error,
        timeout=TIMEOUT,
    )

    unit_a_events = get_unit_events(juju, unit_a)
    relevant_events = [e['event'] for e in unit_a_events]

    logger.info('unit_a_events %s', unit_a_events)

    assert relevant_events == [
        'action:failed-restart',
        'action:deferred-restart',
        'action:restart',
        '_failed_restart:start',  # attempt 0
        '_failed_restart:retry_release',
        '_failed_restart:start',  # retry 1
        '_failed_restart:retry_release',
        '_failed_restart:start',  # retry 2
        '_failed_restart:retry_release',
        '_deferred_restart:start',  # attempt 0
        '_deferred_restart:retry_hold',
        '_restart:start',
        '_restart:done',
    ], f'unexpected event sequence: {relevant_events}'
    assert all(e['processing_backend'] == 'peer' for e in unit_a_events)


def test_subsequent_lock_request_of_same_op(juju: jubilant.Juju, app_name: str):
    """Repeated requests of the same operation coalesce into one execution."""
    status = juju.status()
    units = sorted(status.apps[app_name].units)
    for unit in units:
        remove_transition_file(juju, unit)

    unit_a = units[3]
    unit_b = units[4]

    juju.run(unit_b, 'deferred-restart', {'delay': 10, 'max-retry': 1})
    juju.run(unit_a, 'failed-restart', {'delay': 1, 'max-retry': 2})
    for _ in range(3):
        juju.run(unit_a, 'deferred-restart', {'delay': 1, 'max-retry': 0})
    juju.run(unit_a, 'restart', {'delay': 1})

    juju.wait(
        lambda status: status.apps[app_name].units[unit_a].is_active,
        error=jubilant.any_error,
        timeout=TIMEOUT,
    )

    unit_a_events = get_unit_events(juju, unit_a)
    relevant_events = [e['event'] for e in unit_a_events if not e['event'].startswith('action')]

    logger.info('unit_a_events %s', unit_a_events)

    assert relevant_events == [
        '_failed_restart:start',  # attempt 0
        '_failed_restart:retry_release',
        '_failed_restart:start',  # retry 1
        '_failed_restart:retry_release',
        '_failed_restart:start',  # retry 2
        '_failed_restart:retry_release',
        '_deferred_restart:start',  # attempt 0
        '_deferred_restart:retry_hold',
        '_restart:start',
        '_restart:done',
    ], f'unexpected event sequence: {relevant_events}'
    assert all(e['processing_backend'] == 'peer' for e in unit_a_events)


def test_sync_lock_is_executed(juju: jubilant.Juju, app_name: str):
    """Every unit's sync-restart executes action/start/done on the peer backend."""
    status = juju.status()
    units = sorted(status.apps[app_name].units)
    for unit in units:
        remove_transition_file(juju, unit)

    for unit in units:
        juju.run(unit, 'sync-restart', {'delay': 1})

    juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT)

    expected_events = [
        'action:sync-restart',
        '_sync_restart:start',
        '_sync_restart:done',
    ]

    # mutually exclusive execution is not guaranteed
    for unit in units:
        events = get_unit_events(juju, unit)
        relevant_events = [e['event'] for e in events]

        assert expected_events == relevant_events, f'unexpected event sequence: {relevant_events}'
        assert all(e['processing_backend'] == 'peer' for e in events)


def test_retry_on_leader_unit_leaves_the_hook(juju: jubilant.Juju, app_name: str):
    """A failing restart on the leader does not block a non-leader's restart."""
    status = juju.status()
    units = sorted(status.apps[app_name].units)
    for unit in units:
        remove_transition_file(juju, unit)

    leader = get_leader_unit_name(juju, app_name)
    non_leader = next(unit for unit in units if unit != leader)

    juju.run(leader, 'failed-restart', {'delay': 5})
    juju.run(non_leader, 'restart', {'delay': 3})

    juju.wait(
        lambda status: status.apps[app_name].units[non_leader].is_active,
        error=jubilant.any_error,
        timeout=TIMEOUT,
    )

    non_leader_events = get_unit_events(juju, non_leader)
    relevant_events = [e['event'] for e in non_leader_events]

    assert relevant_events == [
        'action:restart',
        '_restart:start',
        '_restart:done',
    ], f'unexpected event sequence: {relevant_events}'
    assert all(e['processing_backend'] == 'peer' for e in non_leader_events)
# Copyright 2026 Canonical Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utils for integration tests."""

import json
from datetime import UTC, datetime

import jubilant

TRACE_FILE = '/var/lib/charm-rolling-ops/transitions.log'


def get_unit_events(juju: jubilant.Juju, unit: str) -> list[dict[str, str]]:
    """Return the transition events recorded on *unit*, one dict per JSON line.

    NOTE(review): ``cat`` on a missing trace file makes ``juju.exec`` raise
    ``jubilant.TaskError`` — callers are expected to have triggered at least
    one action first; confirm if that assumption ever changes.
    """
    task = juju.exec(f'cat {TRACE_FILE}', unit=unit)

    if not task.stdout.strip():
        return []

    return [json.loads(line) for line in task.stdout.strip().splitlines()]


def parse_ts(event: dict[str, str]) -> datetime:
    """Convert an event's epoch ``ts`` field to a timezone-aware UTC datetime."""
    return datetime.fromtimestamp(float(event['ts']), tz=UTC)


def get_leader_unit_name(juju: jubilant.Juju, app: str) -> str:
    """Retrieve the leader unit's name.

    Raises:
        RuntimeError: if no leader unit is found.
    """
    for name, unit in juju.status().get_units(app).items():
        if unit.leader:
            return name

    raise RuntimeError(f'No leader unit found for app {app}')


def remove_transition_file(juju: jubilant.Juju, unit: str):
    """Delete the transition trace file on *unit* (``-f``: no-op if absent)."""
    juju.exec(f'rm -f {TRACE_FILE}', unit=unit)


def is_empty_file(juju: jubilant.Juju, unit: str, path: str) -> bool:
    """Return True if *path* on *unit* is missing or has zero size.

    ``test ! -s`` exits non-zero for a non-empty file, which makes
    ``juju.exec`` raise ``jubilant.TaskError``; only that exception is
    caught (the original bare ``except Exception`` could mask unrelated
    failures such as connection errors). The previous ``pathops.LocalPath``
    wrapper was dropped: the path was only ever interpolated back into a
    shell string, so the conversion had no effect.
    """
    try:
        task = juju.exec(f'test ! -s {path}', unit=unit)
    except jubilant.TaskError:
        return False

    return task.status == 'completed' and task.return_code == 0
+ NjVmMWI3LTE2OWEtNDE5YS1iNmQyLTc3OWJkOGM4NzIwNjCCASIwDQYJKoZIhvcN + AQEBBQADggEPADCCAQoCggEBAK42ixoklDH5K5i1NxXo/AFACDa956pE5RA57wlC + BfgUYaIDRmv7TUVJh6zoMZSD6wjSZl3QgP7UTTZeHbvs3QE9HUwEkH1Lo3a8vD3z + eqsE2vSnOkpWWnPbfxiQyrTm77/LAWBt7lRLRLdfL6WcucD3wsGqm58sWXM3HG0f + SN7PHCZUFqU6MpkHw8DiKmht5hBgWG+Vq3Zw8MNaqpwb/NgST3yYdcZwb58G2FTS + ZvDSdUfRmD/mY7TpciYV8EFylXNNFkth8oGNLunR9adgZ+9IunfRKj1a7S5GSwXU + AZDaojw+8k5i3ikztsWH11wAVCiLj/3euIqq95z8xGycnKcCAwEAATANBgkqhkiG + 9w0BAQsFAAOCAQEAWMvcaozgBrZ/MAxzTJmp5gZyLxmMNV6iT9dcqbwzDtDtBvA/ + 46ux6ytAQ+A7Bd3AubvozwCr1Id6g66ae0blWYRRZmF8fDdX/SBjIUkv7u9A3NVQ + XN9gsEvK9pdpfN4ZiflfGSLdhM1STHycLmhG6H5s7HklbukMRhQi+ejbSzm/wiw1 + ipcxuKhSUIVNkTLusN5b+HE2gwF1fn0K0z5jWABy08huLgbaEKXJEx5/FKLZGJga + fpIzAdf25kMTu3gggseaAmzyX3AtT1i8A8nqYfe8fnnVMkvud89kq5jErv/hlMC9 + 49g5yWQR2jilYYM3j9BHDuB+Rs+YS5BCep1JnQ== + -----END CERTIFICATE-----""" + +VALID_CLIENT_CERT_PEM = """-----BEGIN CERTIFICATE----- + MIIC6DCCAdCgAwIBAgIUdiBwE/CtaBXJl3MArjZen6Y8kigwDQYJKoZIhvcNAQEL + BQAwIDELMAkGA1UEBhMCVVMxETAPBgNVBAMMCHdoYXRldmVyMB4XDTIzMDMyNDE4 + NDg1OVoXDTI0MDMyMzE4NDg1OVowPDELMAkGA1UEAwwCb2sxLTArBgNVBC0MJDEw + MDdjNDBhLWUwYzMtNDVlOS05YTAxLTVlYjY0NWQ0ZmEyZDCCASIwDQYJKoZIhvcN + AQEBBQADggEPADCCAQoCggEBANOnUl6JDlXpLMRr/PxgtfE/E5Yk6E/TkPkPL/Kk + tUGjEi42XZDg9zn3U6cjTDYu+rfKY2jiitfsduW6DQIkEpz3AvbuCMbbgnFpcjsB + YysLSMTmuz/AVPrfnea/tQTALcONCSy1VhAjGSr81ZRSMB4khl9StSauZrbkpJ1P + shqkFSUyAi31mKrnXz0Es/v0Yi0FzAlgWrZ4u1Ld+Bo2Xz7oK4mHf7/93Jc+tEaM + IqG6ocD0q8bjPp0tlSxftVADNUzWlZfM6fue5EXzOsKqyDrxYOSchfU9dNzKsaBX + kxbHEeSUPJeYYj7aVPEfAs/tlUGsoXQvwWfRie8grp2BoLECAwEAATANBgkqhkiG + 9w0BAQsFAAOCAQEACZARBpHYH6Gr2a1ka0mCWfBmOZqfDVan9rsI5TCThoylmaXW + quEiZ2LObI+5faPzxSBhr9TjJlQamsd4ywout7pHKN8ZGqrCMRJ1jJbUfobu1n2k + UOsY4+jzV1IRBXJzj64fLal4QhUNv341lAer6Vz3cAyRk7CK89b/DEY0x+jVpyZT + 1osx9JtsOmkDTgvdStGzq5kPKWOfjwHkmKQaZXliCgqbhzcCERppp1s/sX6K7nIh + 4lWiEmzUSD3Hngk51KGWlpZszO5KQ4cSZ3HUt/prg+tt0ROC3pY61k+m5dDUa9M8 + RtMI6iTjzSj/UV8DiAx0yeM+bKoy4jGeXmaL3g== + -----END 
CERTIFICATE-----""" + +VALID_CLIENT_KEY_PEM = """-----BEGIN RSA PRIVATE KEY----- + MIIEpAIBAAKCAQEAqk3eP5GA+m9xeAaP8TzcPVQPXdkDYWFENB2P3qPv+nSF/KGK + BxmADFR43tCT69rv44BQvYt38MB8cvyMSPBfQqJmE2ff3UnBISfhebS0A3WC7qWy + yPLjpHcznHxcxYLmqVjcCBO40TVvWTbcjmKNtQbDc5lnEeWyv1Vv5ceXGQD/dId7 + tfbGgeG1kqB02ysAYLxeoMuHGoL77+8DEuQY7PlFCCQMNTLwB4isft9OkhTpCQad + xJNzc5mGYc9nMofLl/tZIi7Kn3mw4LmwNoyuxeoP1eklK+g8FvPyWYYaLug08wCR + Sf/YKpmZgj6LfRFnXvxYiw1tGQLZ4uqiuQpBLwIDAQABAoIBACWfr1Zu4EYzgadp + F7rNXcCkxgJPM7p7QRScZVDj+dvki0dNLs+zuADBVrSu8sb75txlWDEP008aT0Qd + /CYPCJSRiSiHXcMnDKY1B9CZ9dz/xI3RiIZxdo46kWnkZaBy8199VJrqNH3vpqpY + fvBr4G+aT2rF/KnNC6jOiLqEViK9I1CDIEBM+Hc9VfNlSd1yKCMH5FCge95GqALP + rbjA2YxQNql20fZqs3QRbwUZ7LCvb7DIKr4puxOFyxfe5tgHtDnHc/mdzt0BhLXb + 2ZwioPtqfgolFoAwSQ1rpTjK9fiSCrvIb0CaVUnNyO1wJ/i90uztYVXswEeWMket + cwRj0BkCgYEA6ozA07DY4q6XmQPUZZS9H7qk2TwD7cuGoNpYCYtTrMFkajLT5F4E + C82Sfd94+hqNBix11h2FPY8ng/De6O37k6jHNnep/N80/90cyvwwdHInGwIAbWF7 + wpRZEk6/ftlYji/zbAK0Dz9AncQVBGVjqu/rlOUeEbrqBprMBFdMBFcCgYEAueEB + TQPiTIfkBlHBuGS505Q05sPGyR1KsmwQ6fHRtu7gaqsQUXnd4vLXGwNsoNJyPxDw + uj6GCrpEyY6nMPUEGALM4WTd5EoNej1FVDJWaJk9s4uv6fTu/pFdjbf4ezcmH54I + SpyyFRsjm6Y84a9V5pu9rv/wyRVdcLgS+Ne3YukCgYEAs5pkbbWV3r7ixwDvu3lR + +OHrKY2TVJvs029eyrAturO8OLYDG3QClSctbcWZ1apPItMYyISCaskb8SSZDLRv + WHp9UXAAcupYozSlv6mtUP24hC3cNeXX5v/B1QsICBJWhUqik6reRm6hBC4KCfu5 + fkOJmdJ4XAtM+RG/9/MA+rECgYEApo2Bn+OiC1ccL7lkLng6teWvvTKhVSWk/9ir + EyS1+Ad1GL8tAQSEmE1mBvN7i2LmMbJZMVjCvKwI5N2o28o/n9Aqiq/Zzyu3hdeO + 3pG4MUNWMSIyPx1UZNAWFt1IjgdtZpkw7sIXI6hMsLQ1CzgTbW4RedQlidhWAKE/ + hq+rx7kCgYAS7uXayl5+8+QAHlTCt7FJInFIHE3yy1g5pxK0zp0rNE7T4SmGzZv9 + QccUFun5Tk0AgoeKEQQpvDHv3ACODl3PUyiuoFDYOeEB57dIEmM9FW85MIBGK5RI + 5Zv3x7N0WSyCf6w51sT2UsaI5Ybqnfo7zCThvUkmVM1yfxyfjcKKnQ== + -----END RSA PRIVATE KEY-----""" + + +@pytest.fixture +def temp_certificates(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> types.ModuleType: + base_dir = LocalPath(str(tmp_path)) / 'tls' + ca_cert = base_dir / 'client-ca.pem' + client_key = 
@pytest.fixture
def temp_certificates(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> types.ModuleType:
    """Redirect the certificates module's file paths into a pytest tmp dir.

    Returns the patched module so tests can call its functions directly.
    """
    base_dir = LocalPath(str(tmp_path)) / 'tls'
    ca_cert = base_dir / 'client-ca.pem'
    client_key = base_dir / 'client.key'
    client_cert = base_dir / 'client.pem'

    monkeypatch.setattr(certificates, 'BASE_DIR', base_dir)
    monkeypatch.setattr(certificates, 'CA_CERT_PATH', ca_cert)
    monkeypatch.setattr(certificates, 'CLIENT_KEY_PATH', client_key)
    monkeypatch.setattr(certificates, 'CLIENT_CERT_PATH', client_cert)

    base_dir.mkdir(parents=True, exist_ok=True)
    return certificates


@pytest.fixture
def temp_etcdctl(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> types.ModuleType:
    """Redirect the etcdctl module's file paths into a pytest tmp dir.

    Returns the patched module so tests can call its functions directly.
    """
    base_dir = LocalPath(str(tmp_path)) / 'etcd'
    server_ca = base_dir / 'server-ca.pem'
    env_file = base_dir / 'etcdctl.json'

    monkeypatch.setattr(etcdctl, 'BASE_DIR', base_dir)
    monkeypatch.setattr(etcdctl, 'SERVER_CA_PATH', server_ca)
    monkeypatch.setattr(etcdctl, 'CONFIG_FILE_PATH', env_file)

    base_dir.mkdir(parents=True, exist_ok=True)
    return etcdctl


@pytest.fixture
def etcdctl_patch() -> Generator[MagicMock, None, None]:
    """Mock out the etcd certificate handling for charm-level tests.

    NOTE(review): despite the fixture name, this patches the `_certificates`
    module, not `_etcdctl` — confirm the intended patch target.
    """
    with patch('charmlibs.rollingops.etcd._certificates') as mock_etcdctl:
        yield mock_etcdctl


@pytest.fixture
def certificates_manager_patches() -> Generator[dict[str, MagicMock], None, None]:
    """Patch certificate generation/persistence with fixed, valid test material.

    Yields the `generate` and `persist` mocks so tests can assert call counts.
    """
    with (
        patch(
            'charmlibs.rollingops.etcd._certificates._exists',
            return_value=False,
        ),
        patch(
            'charmlibs.rollingops.etcd._certificates.generate',
            return_value=SharedCertificate(
                certificate=Certificate.from_string(VALID_CLIENT_CERT_PEM),
                key=PrivateKey.from_string(VALID_CLIENT_KEY_PEM),
                ca=Certificate.from_string(VALID_CA_CERT_PEM),
            ),
        ) as mock_generate,
        patch(
            'charmlibs.rollingops.etcd._certificates.persist_client_cert_key_and_ca',
            return_value=None,
        ) as mock_persist,  # fixed typo: was `mock_persit`
    ):
        yield {
            'generate': mock_generate,
            'persist': mock_persist,
        }


class BaseRollingOpsTestCharm(ops.CharmBase):
    """Minimal test charm wiring restart actions to a rolling-ops manager.

    Subclasses choose the backend by implementing `_make_restart_manager`.
    """

    def __init__(self, framework: ops.Framework):
        super().__init__(framework)

        callback_targets = {
            '_restart': self._restart,
            '_failed_restart': self._failed_restart,
            '_deferred_restart': self._deferred_restart,
        }

        self.restart_manager = self._make_restart_manager(callback_targets)
        self.framework.observe(self.on.restart_action, self._on_restart_action)
        self.framework.observe(self.on.failed_restart_action, self._on_failed_restart_action)
        self.framework.observe(self.on.deferred_restart_action, self._on_deferred_restart_action)

    def _make_restart_manager(
        self, callback_targets: dict[str, Any]
    ) -> PeerRollingOpsBackend | RollingOpsManager:
        """Build the backend under test; implemented by subclasses."""
        raise NotImplementedError

    def _on_restart_action(self, event: ActionEvent) -> None:
        delay = event.params.get('delay')
        self.restart_manager.request_async_lock(callback_id='_restart', kwargs={'delay': delay})

    def _on_failed_restart_action(self, event: ActionEvent) -> None:
        delay = event.params.get('delay')
        max_retry = event.params.get('max-retry', None)
        self.restart_manager.request_async_lock(
            callback_id='_failed_restart',
            kwargs={'delay': delay},
            max_retry=max_retry,
        )

    def _on_deferred_restart_action(self, event: ActionEvent) -> None:
        delay = event.params.get('delay')
        max_retry = event.params.get('max-retry', None)
        self.restart_manager.request_async_lock(
            callback_id='_deferred_restart',
            kwargs={'delay': delay},
            max_retry=max_retry,
        )

    def _restart(self) -> None:
        # Callback that always succeeds (implicit RELEASE).
        pass

    def _failed_restart(self, delay: int = 0) -> OperationResult:
        # Callback that always asks to retry while releasing the lock.
        return OperationResult.RETRY_RELEASE

    def _deferred_restart(self, delay: int = 0) -> OperationResult:
        # Callback that always asks to retry while holding the lock.
        return OperationResult.RETRY_HOLD


class PeerRollingOpsCharm(BaseRollingOpsTestCharm):
    """Test charm backed by the peer-relation rolling-ops backend."""

    def _make_restart_manager(self, callback_targets: dict[str, Any]) -> PeerRollingOpsBackend:
        return PeerRollingOpsBackend(
            charm=self,
            relation_name='restart',
            callback_targets=callback_targets,
        )


class RollingOpsCharm(BaseRollingOpsTestCharm):
    """Test charm backed by the full (etcd-capable) RollingOpsManager."""

    def _make_restart_manager(self, callback_targets: dict[str, Any]) -> RollingOpsManager:
        return RollingOpsManager(
            charm=self,
            peer_relation_name='restart',
            etcd_relation_name='etcd',
            cluster_id='cluster-12345',
            callback_targets=callback_targets,
        )
etcd_relation_name='etcd', + cluster_id='cluster-12345', + callback_targets=callback_targets, + ) + + +@pytest.fixture +def peer_charm_test() -> type[PeerRollingOpsCharm]: + return PeerRollingOpsCharm + + +@pytest.fixture +def charm_test() -> type[RollingOpsCharm]: + return RollingOpsCharm + + +meta: dict[str, Any] = { + 'name': 'charm', + 'peers': { + 'restart': { + 'interface': 'rolling_op', + }, + }, + 'requires': { + 'etcd': { + 'interface': 'etcd_client', + }, + }, +} + +actions: dict[str, Any] = { + 'restart': { + 'description': 'Restarts the example service', + 'params': { + 'delay': { + 'description': 'Introduce an artificial delay (for testing).', + 'type': 'integer', + 'default': 0, + }, + }, + }, + 'failed-restart': { + 'description': 'Example restart with a custom callback function. Used in testing', + 'params': { + 'delay': { + 'description': 'Introduce an artificial delay (for testing).', + 'type': 'integer', + 'default': 0, + }, + 'max-retry': { + 'description': 'Number of times the operation should be retried.', + 'type': 'integer', + }, + }, + }, + 'deferred-restart': { + 'description': 'Example restart with a custom callback function. 
Used in testing', + 'params': { + 'delay': { + 'description': 'Introduce an artificial delay (for testing).', + 'type': 'integer', + 'default': 0, + }, + 'max-retry': { + 'description': 'Number of times the operation should be retried.', + 'type': 'integer', + }, + }, + }, +} + + +@pytest.fixture +def ctx(charm_test: type[RollingOpsCharm]) -> Context[RollingOpsCharm]: + return Context(charm_test, meta=meta, actions=actions) + + +@pytest.fixture +def peer_ctx(peer_charm_test: type[PeerRollingOpsCharm]) -> Context[PeerRollingOpsCharm]: + return Context(peer_charm_test, meta=meta, actions=actions) diff --git a/rollingops/tests/unit/test_common_models.py b/rollingops/tests/unit/test_common_models.py new file mode 100644 index 000000000..8ae9a1644 --- /dev/null +++ b/rollingops/tests/unit/test_common_models.py @@ -0,0 +1,379 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#
# Learn more about testing at: https://juju.is/docs/sdk/testing

import json
from datetime import UTC, datetime
from typing import Any

import pytest

from charmlibs.rollingops.common._exceptions import RollingOpsDecodingError
from charmlibs.rollingops.common._models import (
    Operation,
    OperationResult,
)


def test_operation_create_sets_fields():
    """`Operation.create` populates all fields from its arguments."""
    operation = Operation.create('restart', {'b': 2, 'a': 1}, max_retry=3)

    assert operation.callback_id == 'restart'
    assert operation.kwargs == {'b': 2, 'a': 1}
    assert operation.max_retry == 3
    assert isinstance(operation.requested_at, datetime)


def test_operation_to_string_contains_string_values_only():
    """Serialization stringifies every value; a None max_retry becomes ''."""
    requested = datetime(2026, 2, 23, 12, 0, 0, 123456, tzinfo=UTC)
    operation = Operation(
        callback_id='cb',
        kwargs={'b': 2, 'a': 1},
        requested_at=requested,
        max_retry=None,
        attempt=0,
        result=None,
    )

    payload = json.loads(operation.to_string())

    assert payload['callback_id'] == 'cb'
    assert payload['kwargs'] == '{"a":1,"b":2}'
    assert payload['requested_at'] == str(requested.timestamp())
    assert payload.get('max_retry', '') == ''


def test_operation_to_string_contains_string_values_only_zero_max_retry():
    """A max_retry of 0 serializes as the string '0', not as empty."""
    requested = datetime(2026, 2, 23, 12, 0, 0, 123456, tzinfo=UTC)
    operation = Operation(
        callback_id='cb',
        kwargs={'b': 2, 'a': 1},
        requested_at=requested,
        max_retry=0,
        attempt=0,
        result=None,
    )

    payload = json.loads(operation.to_string())

    assert payload['callback_id'] == 'cb'
    assert payload['kwargs'] == '{"a":1,"b":2}'
    assert payload['requested_at'] == str(requested.timestamp())
    assert payload.get('max_retry', '') == '0'


def test_operation_is_max_retry_reached_on_zero_max_retry():
    """With max_retry=0 the limit is reached after the first attempt."""
    operation = Operation.create('restart', {'a': 1, 'b': 2}, max_retry=0)

    assert not operation.is_max_retry_reached()
    operation.increase_attempt()
    assert operation.is_max_retry_reached()


def test_operation_equality_and_hash_ignore_timestamp_and_max_retry():
    """Equality only depends on (callback_id, kwargs)."""
    first = Operation.create('restart', {'a': 1, 'b': 2}, max_retry=0)
    second = Operation.create('restart', {'b': 2, 'a': 1}, max_retry=999)

    assert first == second
    assert hash(first) == hash(second)

    different = Operation.create('restart', {'a': 2}, max_retry=0)
    assert first != different
Operation.create('restart', {'b': 2, 'a': 1}, max_retry=999) + + assert op1 == op2 + assert hash(op1) == hash(op2) + + op3 = Operation.create('restart', {'a': 2}, max_retry=0) + assert op1 != op3 + + +def test_operation_equality_and_hash_empty_arguments(): + # Equality only depends on (callback_id, kwargs) + op1 = Operation.create('restart', {}, max_retry=0) + op2 = Operation.create('restart', {}, max_retry=999) + + assert op1 == op2 + assert hash(op1) == hash(op2) + + op3 = Operation.create('restart', {'a': 2}, max_retry=0) + assert op1 != op3 + + +def test_operation_to_string_and_from_string(): + ts = datetime(2026, 2, 23, 12, 0, 0, 0, tzinfo=UTC) + op1 = Operation( + callback_id='cb', + kwargs={'x': 1, 'y': 'z'}, + requested_at=ts, + max_retry=5, + attempt=0, + result=None, + ) + + s = op1.to_string() + op2 = Operation.from_string(s) + + assert op2.callback_id == op1.callback_id + assert op2.kwargs == op1.kwargs + assert op2.requested_at == op1.requested_at + assert op2.max_retry == op1.max_retry + assert op2.attempt == op1.attempt + + +def test_operation_from_string_valid_payload(): + requested_at = datetime(2026, 3, 12, 10, 30, 45, 123456, tzinfo=UTC) + payload = json.dumps({ + 'callback_id': 'cb-123', + 'kwargs': json.dumps({'b': 2, 'a': 'x'}), + 'requested_at': str(requested_at.timestamp()), + 'max_retry': '5', + 'attempt': '2', + }) + + op = Operation.from_string(payload) + + assert op is not None + assert op.callback_id == 'cb-123' + assert op.kwargs == {'b': 2, 'a': 'x'} + assert op.requested_at == requested_at + assert op.max_retry == 5 + assert op.attempt == 2 + + +def test_from_string_valid_payload_with_empty_kwargs_and_no_max_retry(): + requested_at = datetime(2026, 3, 12, 10, 30, 45, 123456, tzinfo=UTC) + payload = json.dumps({ + 'callback_id': 'cb-123', + 'kwargs': '', + 'requested_at': str(requested_at.timestamp()), + 'max_retry': '', + 'attempt': '0', + }) + + op = Operation.from_string(payload) + + assert op is not None + assert op.callback_id == 
'cb-123' + assert op.kwargs == {} + assert op.requested_at == requested_at + assert op.max_retry is None + assert op.attempt == 0 + + +def test_from_string_valid_payload_with_empty_kwargs_and_0_max_retry(): + requested_at = datetime(2026, 3, 12, 10, 30, 45, 123456, tzinfo=UTC) + payload = json.dumps({ + 'callback_id': 'cb-123', + 'kwargs': '{}', + 'requested_at': str(requested_at.timestamp()), + 'max_retry': '0', + 'attempt': '0', + }) + + op = Operation.from_string(payload) + + assert op is not None + assert op.callback_id == 'cb-123' + assert op.kwargs == {} + assert op.requested_at == requested_at + assert op.max_retry == 0 + assert op.attempt == 0 + + +@pytest.mark.parametrize( + 'payload', + [ + '{not valid json', + json.dumps( # invalid requested_at + { + 'callback_id': 'cb-123', + 'kwargs': json.dumps({'x': 1}), + 'requested_at': 'bad-ts', + 'max_retry': '3', + 'attempt': '1', + } + ), + json.dumps( # invalid kwargs + { + 'callback_id': 'cb-123', + 'kwargs': '{bad kwargs json', + 'requested_at': str(datetime.now(UTC).timestamp()), + 'max_retry': '3', + 'attempt': '1', + } + ), + json.dumps( # missing callback_id + { + 'kwargs': json.dumps({'x': 1}), + 'requested_at': str(datetime.now(UTC).timestamp()), + 'max_retry': '3', + 'attempt': '1', + } + ), + json.dumps( # invalid kwargs + { + 'callback_id': 'cb-123', + 'kwargs': '[]', + 'requested_at': str(datetime.now(UTC).timestamp()), + 'max_retry': '3', + 'attempt': '1', + } + ), + json.dumps( # missing requested_at + { + 'callback_id': 'cb-123', + 'kwargs': '{}', + 'requested_at': '', + 'max_retry': '3', + 'attempt': '1', + } + ), + json.dumps( # result + { + 'callback_id': 'cb-123', + 'kwargs': '{}', + 'requested_at': 'bad-ts', + 'max_retry': '3', + 'attempt': '1', + 'result': 'something', + } + ), + ], +) +def test_operation_from_string_invalid_inputs_return_none(payload: Any): + with pytest.raises(RollingOpsDecodingError, match='Failed to deserialize'): + Operation.from_string(payload) + + +def 
test_op_id_returns_timestamp_and_callback_id() -> None: + requested_at = datetime(2025, 1, 2, 3, 4, 5) + operation = Operation( + callback_id='restart', + kwargs={'delay': 2}, + requested_at=requested_at, + max_retry=3, + attempt=0, + result=None, + ) + + assert operation.op_id == f'{requested_at.timestamp()}-restart' + + +def test_complete_increments_attempt_and_sets_release() -> None: + operation = Operation( + callback_id='restart', + kwargs={}, + requested_at=datetime(2025, 1, 1, 0, 0, 0), + max_retry=3, + attempt=0, + result=None, + ) + + operation.complete() + + assert operation.attempt == 1 + assert operation.result == OperationResult.RELEASE + + +def test_retry_hold_sets_retry_hold_when_max_retry_not_reached() -> None: + operation = Operation( + callback_id='restart', + kwargs={}, + requested_at=datetime(2025, 1, 1, 0, 0, 0), + max_retry=3, + attempt=0, + result=None, + ) + + operation.retry_hold() + + assert operation.attempt == 1 + assert operation.result == OperationResult.RETRY_HOLD + + +def test_retry_hold_sets_release_when_max_retry_reached() -> None: + operation = Operation( + callback_id='restart', + kwargs={}, + requested_at=datetime(2025, 1, 1, 0, 0, 0), + max_retry=0, + attempt=0, + result=None, + ) + + operation.retry_hold() + + assert operation.attempt == 1 + assert operation.result == OperationResult.RELEASE + + +def test_retry_release_sets_retry_release_when_max_retry_not_reached() -> None: + operation = Operation( + callback_id='restart', + kwargs={}, + requested_at=datetime(2025, 1, 1, 0, 0, 0), + max_retry=3, + attempt=0, + result=None, + ) + + operation.retry_release() + + assert operation.attempt == 1 + assert operation.result == OperationResult.RETRY_RELEASE + + +def test_retry_release_sets_release_when_max_retry_reached() -> None: + operation = Operation( + callback_id='restart', + kwargs={}, + requested_at=datetime(2025, 1, 1, 0, 0, 0), + max_retry=0, + attempt=0, + result=None, + ) + + operation.retry_release() + + assert 
operation.attempt == 1 + assert operation.result == OperationResult.RELEASE + + +def test_retry_hold_with_no_max_retry_sets_retry_hold() -> None: + operation = Operation( + callback_id='restart', + kwargs={}, + requested_at=datetime(2025, 1, 1, 0, 0, 0), + max_retry=None, + attempt=5, + result=None, + ) + + operation.retry_hold() + + assert operation.attempt == 6 + assert operation.result == OperationResult.RETRY_HOLD + + +def test_retry_release_with_no_max_retry_sets_retry_release() -> None: + operation = Operation( + callback_id='restart', + kwargs={}, + requested_at=datetime(2025, 1, 1, 0, 0, 0), + max_retry=None, + attempt=5, + result=None, + ) + + operation.retry_release() + + assert operation.attempt == 6 + assert operation.result == OperationResult.RETRY_RELEASE diff --git a/rollingops/tests/unit/test_etcd_certificates.py b/rollingops/tests/unit/test_etcd_certificates.py new file mode 100644 index 000000000..00b21e4cb --- /dev/null +++ b/rollingops/tests/unit/test_etcd_certificates.py @@ -0,0 +1,152 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Learn more about testing at: https://juju.is/docs/sdk/testing + +from typing import Any + +from tests.unit.conftest import VALID_CA_CERT_PEM, VALID_CLIENT_CERT_PEM, VALID_CLIENT_KEY_PEM + +from charmlibs.interfaces.tls_certificates import ( + Certificate, + PrivateKey, +) +from charmlibs.rollingops.etcd._models import SharedCertificate + + +def make_shared_certificate() -> SharedCertificate: + return SharedCertificate( + certificate=Certificate.from_string(VALID_CLIENT_CERT_PEM), + key=PrivateKey.from_string(VALID_CLIENT_KEY_PEM), + ca=Certificate.from_string(VALID_CA_CERT_PEM), + ) + + +def test_make_shared_certificate_is_valid(): + Certificate.from_string(VALID_CA_CERT_PEM) + PrivateKey.from_string(VALID_CLIENT_KEY_PEM) + Certificate.from_string(VALID_CLIENT_CERT_PEM) + + +def test_certificates_manager_exists_returns_false_when_no_files( + temp_certificates: Any, +) -> None: + assert temp_certificates._exists() is False + + +def test_certificates_manager_exists_returns_false_when_cert_does_not_exist( + temp_certificates: Any, +) -> None: + temp_certificates.CLIENT_KEY_PATH.write_text('client-key') + + assert temp_certificates._exists() is False + + +def test_certificates_manager_exists_returns_false_when_key_does_not_exist( + temp_certificates: Any, +) -> None: + temp_certificates.CLIENT_CERT_PATH.write_text('client-cert') + + assert temp_certificates._exists() is False + + +def test_certificates_manager_exists_returns_true_when_all_files_exist( + temp_certificates: Any, +) -> None: + temp_certificates.CLIENT_KEY_PATH.write_text('client-key') + temp_certificates.CLIENT_CERT_PATH.write_text('client-cert') + temp_certificates.CA_CERT_PATH.write_text('ca-cert') + + assert temp_certificates._exists() is True + + +def test_certificates_manager_persist_client_cert_and_key_writes_files( + temp_certificates: Any, +) -> None: + shared_certificate = make_shared_certificate() + temp_certificates.persist_client_cert_key_and_ca(shared_certificate) + + assert 
temp_certificates.CLIENT_CERT_PATH.read_text() == shared_certificate.certificate.raw + assert temp_certificates.CLIENT_KEY_PATH.read_text() == shared_certificate.key.raw + assert temp_certificates.CA_CERT_PATH.read_text() == shared_certificate.ca.raw + + +def test_certificates_manager_has_client_cert_and_key_returns_false_when_files_missing( + temp_certificates: Any, +) -> None: + shared_certificate = make_shared_certificate() + assert temp_certificates._has_client_cert_key_and_ca(shared_certificate) is False + + +def test_certificates_manager_has_client_cert_and_key_returns_true_when_material_matches( + temp_certificates: Any, +) -> None: + temp_certificates.CLIENT_CERT_PATH.write_text(VALID_CLIENT_CERT_PEM) + temp_certificates.CLIENT_KEY_PATH.write_text(VALID_CLIENT_KEY_PEM) + temp_certificates.CA_CERT_PATH.write_text(VALID_CA_CERT_PEM) + + shared_certificate = make_shared_certificate() + assert temp_certificates._has_client_cert_key_and_ca(shared_certificate) is True + + +def test_certificates_manager_has_client_cert_and_key_returns_false_when_material_differs( + temp_certificates: Any, +) -> None: + temp_certificates.CLIENT_CERT_PATH.write_text(VALID_CLIENT_CERT_PEM) + temp_certificates.CLIENT_KEY_PATH.write_text(VALID_CLIENT_KEY_PEM) + temp_certificates.CA_CERT_PATH.write_text(VALID_CA_CERT_PEM) + + other_shared_certificate = SharedCertificate( + certificate=Certificate.from_string(VALID_CA_CERT_PEM), + key=PrivateKey.from_string(VALID_CLIENT_KEY_PEM), + ca=Certificate.from_string(VALID_CLIENT_CERT_PEM), + ) + assert temp_certificates._has_client_cert_key_and_ca(other_shared_certificate) is False + + +def test_certificates_manager_generate_does_nothing_when_files_already_exist( + temp_certificates: Any, +) -> None: + temp_certificates.CLIENT_CERT_PATH.write_text(VALID_CLIENT_CERT_PEM) + temp_certificates.CLIENT_KEY_PATH.write_text(VALID_CLIENT_KEY_PEM) + temp_certificates.CA_CERT_PATH.write_text(VALID_CA_CERT_PEM) + old_certificates = make_shared_certificate() 
+ + new_certificates = temp_certificates.generate(common_name='unit-1') + + written = SharedCertificate.from_strings( + certificate=temp_certificates.CLIENT_CERT_PATH.read_text(), + key=temp_certificates.CLIENT_KEY_PATH.read_text(), + ca=temp_certificates.CA_CERT_PATH.read_text(), + ) + assert written == old_certificates + + assert new_certificates == old_certificates + + +def test_certificates_manager_generate_creates_all_files( + temp_certificates: Any, +) -> None: + shared = temp_certificates.generate(common_name='unit-1') + assert temp_certificates._exists() is True + + assert temp_certificates.CA_CERT_PATH.read_text().startswith('-----BEGIN CERTIFICATE-----') + assert temp_certificates.CLIENT_KEY_PATH.read_text().startswith( + '-----BEGIN RSA PRIVATE KEY-----' + ) + assert temp_certificates.CLIENT_CERT_PATH.read_text().startswith('-----BEGIN CERTIFICATE-----') + + assert temp_certificates.CA_CERT_PATH.read_text() == shared.ca.raw + assert temp_certificates.CLIENT_KEY_PATH.read_text() == shared.key.raw + assert temp_certificates.CLIENT_CERT_PATH.read_text() == shared.certificate.raw diff --git a/rollingops/tests/unit/test_etcd_etcdctl.py b/rollingops/tests/unit/test_etcd_etcdctl.py new file mode 100644 index 000000000..26497fa1e --- /dev/null +++ b/rollingops/tests/unit/test_etcd_etcdctl.py @@ -0,0 +1,94 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Learn more about testing at: https://juju.is/docs/sdk/testing + +import json +from typing import Any +from unittest.mock import patch + +import pytest + +from charmlibs.pathops import LocalPath +from charmlibs.rollingops.common._exceptions import RollingOpsEtcdNotConfiguredError + + +def test_etcdctl_write_env(temp_etcdctl: Any) -> None: + temp_etcdctl.write_config_file( + endpoints='https://10.0.0.1:2379,https://10.0.0.2:2379', + client_cert_path=LocalPath('PATH1'), + client_key_path=LocalPath('PATH2'), + ) + + assert temp_etcdctl.BASE_DIR.exists() + + config = json.loads(temp_etcdctl.CONFIG_FILE_PATH.read_text()) + assert config == { + 'endpoints': 'https://10.0.0.1:2379,https://10.0.0.2:2379', + 'cacert_path': str(temp_etcdctl.SERVER_CA_PATH), + 'cert_path': 'PATH1', + 'key_path': 'PATH2', + } + + +def test_etcdctl_ensure_initialized_raises_when_env_missing(temp_etcdctl: Any) -> None: + with pytest.raises(RollingOpsEtcdNotConfiguredError): + temp_etcdctl.ensure_initialized() + + +def test_etcdctl_cleanup_removes_env_file_and_server_ca(temp_etcdctl: Any) -> None: + temp_etcdctl.BASE_DIR.mkdir(parents=True, exist_ok=True) + temp_etcdctl.CONFIG_FILE_PATH.write_text('env') + temp_etcdctl.SERVER_CA_PATH.write_text('ca') + + assert temp_etcdctl.CONFIG_FILE_PATH.exists() + assert temp_etcdctl.SERVER_CA_PATH.exists() + + temp_etcdctl.cleanup() + + assert not temp_etcdctl.CONFIG_FILE_PATH.exists() + assert not temp_etcdctl.SERVER_CA_PATH.exists() + + +def test_etcdctl_cleanup_is_noop_when_files_do_not_exist(temp_etcdctl: Any) -> None: + assert not temp_etcdctl.CONFIG_FILE_PATH.exists() + assert not temp_etcdctl.SERVER_CA_PATH.exists() + + temp_etcdctl.cleanup() + + assert not temp_etcdctl.CONFIG_FILE_PATH.exists() + assert not temp_etcdctl.SERVER_CA_PATH.exists() + + +def test_etcdctl_load_env_parses_exported_vars(temp_etcdctl: Any) -> None: + temp_etcdctl.BASE_DIR.mkdir(parents=True, exist_ok=True) + temp_etcdctl.SERVER_CA_PATH.write_text('SERVER CA') + 
temp_etcdctl.CONFIG_FILE_PATH.write_text( + json.dumps({ + 'endpoints': 'https://10.0.0.1:2379', + 'cacert_path': '/a-path/server-ca.pem', + 'cert_path': '/a-path/client.pem', + 'key_path': '/a-path/client.key', + }) + ) + + with patch.dict('os.environ', {'EXISTING_VAR': 'present'}, clear=True): + env = temp_etcdctl.load_env() + + assert env['EXISTING_VAR'] == 'present' + assert env['ETCDCTL_API'] == '3' + assert env['ETCDCTL_ENDPOINTS'] == 'https://10.0.0.1:2379' + assert env['ETCDCTL_CERT'] == '/a-path/client.pem' + assert env['ETCDCTL_KEY'] == '/a-path/client.key' + assert env['ETCDCTL_CACERT'] == '/a-path/server-ca.pem' diff --git a/rollingops/tests/unit/test_etcd_models.py b/rollingops/tests/unit/test_etcd_models.py new file mode 100644 index 000000000..e20ce39b5 --- /dev/null +++ b/rollingops/tests/unit/test_etcd_models.py @@ -0,0 +1,39 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#
# Learn more about testing at: https://juju.is/docs/sdk/testing


from charmlibs.rollingops.etcd._models import RollingOpsKeys


def test_rollingopskeys_paths() -> None:
    """Key paths embed the cluster and owner and follow the expected layout."""
    keys = RollingOpsKeys.for_owner('cluster-a', 'unit-1')

    assert keys.cluster_prefix == '/rollingops/default/cluster-a/'
    assert keys._owner_prefix == '/rollingops/default/cluster-a/unit-1/'
    assert keys.lock_key == '/rollingops/default/cluster-a/granted-unit/'
    assert keys.pending == '/rollingops/default/cluster-a/unit-1/pending/'
    assert keys.inprogress == '/rollingops/default/cluster-a/unit-1/inprogress/'
    assert keys.completed == '/rollingops/default/cluster-a/unit-1/completed/'


def test_rollingopskeys_lock_key_is_shared_within_cluster() -> None:
    """Units in one cluster share the lock key but own their state keys."""
    unit_one = RollingOpsKeys.for_owner('cluster-a', 'unit-1')
    unit_two = RollingOpsKeys.for_owner('cluster-a', 'unit-2')

    assert unit_one.lock_key == unit_two.lock_key
    assert unit_one.pending != unit_two.pending
    assert unit_one.inprogress != unit_two.inprogress
    assert unit_one.completed != unit_two.completed


# --- new file: rollingops/tests/unit/test_etcd_rollingops_in_charm.py ---
# Copyright 2026 Canonical Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# +# Learn more about testing at: https://juju.is/docs/sdk/testing + +from unittest.mock import MagicMock, patch + +import pytest +from ops.testing import Context, PeerRelation, Secret, State +from scenario import RawDataBagContents +from scenario.errors import UncaughtCharmError +from tests.unit.conftest import ( + VALID_CA_CERT_PEM, + VALID_CLIENT_CERT_PEM, + VALID_CLIENT_KEY_PEM, + RollingOpsCharm, +) + +from charmlibs.interfaces.tls_certificates import ( + Certificate, + PrivateKey, +) +from charmlibs.rollingops.common._exceptions import ( + RollingOpsEtcdNotConfiguredError, + RollingOpsInvalidSecretContentError, +) +from charmlibs.rollingops.common._models import ( + Operation, + OperationQueue, + ProcessingBackend, + RollingOpsState, + RollingOpsStatus, +) +from charmlibs.rollingops.etcd._models import SharedCertificate +from charmlibs.rollingops.etcd._relations import CERT_SECRET_FIELD +from charmlibs.rollingops.peer._models import LockIntent + + +def _unit_databag(state: State, peer: PeerRelation) -> RawDataBagContents: + return state.get_relation(peer.id).local_unit_data + + +def test_leader_elected_creates_shared_secret_and_stores_id( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): + peer_relation = PeerRelation(endpoint='restart') + + state_in = State(leader=True, relations={peer_relation}) + state_out = ctx.run(ctx.on.leader_elected(), state_in) + + peer_out = next(r for r in state_out.relations if r.endpoint == 'restart') + assert CERT_SECRET_FIELD in peer_out.local_app_data + assert peer_out.local_app_data[CERT_SECRET_FIELD].startswith('secret:') + + certificates_manager_patches['generate'].assert_called_once() + + +def test_leader_elected_does_not_regenerate_when_secret_already_exists( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): + peer_relation = PeerRelation( + endpoint='restart', 
local_app_data={CERT_SECRET_FIELD: 'secret:existing'} + ) + secret = Secret( + id='secret:existing', + owner='app', + tracked_content={ + 'client-cert': VALID_CLIENT_CERT_PEM, + 'client-key': VALID_CLIENT_KEY_PEM, + 'client-ca': VALID_CA_CERT_PEM, + }, + ) + + state_in = State(leader=True, relations={peer_relation}, secrets=[secret]) + + state_out = ctx.run(ctx.on.leader_elected(), state_in) + + peer_out = next(r for r in state_out.relations if r.endpoint == 'restart') + assert peer_out.local_app_data[CERT_SECRET_FIELD] == 'secret:existing' + certificates_manager_patches['generate'].assert_not_called() + + +def test_non_leader_does_not_create_shared_secret( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): + peer_relation = PeerRelation(endpoint='restart') + state_in = State(leader=False, relations={peer_relation}) + + state_out = ctx.run(ctx.on.relation_changed(peer_relation, remote_unit=1), state_in) + + peer_out = next(r for r in state_out.relations if r.endpoint == 'restart') + assert CERT_SECRET_FIELD not in peer_out.local_app_data + certificates_manager_patches['generate'].assert_not_called() + + +def test_relation_changed_syncs_local_certificate_from_secret( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): + peer_relation = PeerRelation( + endpoint='restart', local_app_data={CERT_SECRET_FIELD: 'secret:rollingops-cert'} + ) + + secret = Secret( + id='secret:rollingops-cert', + tracked_content={ + 'client-cert': VALID_CLIENT_CERT_PEM, + 'client-key': VALID_CLIENT_KEY_PEM, + 'client-ca': VALID_CA_CERT_PEM, + }, + ) + + state_in = State(leader=False, relations={peer_relation}, secrets=[secret]) + expected_shared = SharedCertificate( + certificate=Certificate.from_string(VALID_CLIENT_CERT_PEM), + key=PrivateKey.from_string(VALID_CLIENT_KEY_PEM), + ca=Certificate.from_string(VALID_CA_CERT_PEM), + ) + 
ctx.run(ctx.on.relation_changed(peer_relation, remote_unit=1), state_in) + certificates_manager_patches['persist'].assert_called_once_with(expected_shared) + + +def test_invalid_certificate_secret_content_raises( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): + peer_relation = PeerRelation( + endpoint='restart', local_app_data={CERT_SECRET_FIELD: 'secret:rollingops-cert'} + ) + + secret = Secret( + id='secret:rollingops-cert', + tracked_content={ + 'client-cert': '', + 'client-key': 'KEY_PEM', + 'client-ca': 'CA_PEM', + }, + ) + + state_in = State(leader=False, relations={peer_relation}, secrets=[secret]) + with pytest.raises(UncaughtCharmError) as exc_info: + ctx.run(ctx.on.relation_changed(peer_relation, remote_unit=1), state_in) + assert isinstance(exc_info.value.__cause__, RollingOpsInvalidSecretContentError) + + +def test_on_restart_action_lock_fallbacks_to_peer( + ctx: Context[RollingOpsCharm], +): + peer = PeerRelation(endpoint='restart') + state_in = State(leader=False, relations={peer}) + + state_out = ctx.run( + ctx.on.action('restart', params={'delay': 10}), + state_in, + ) + + databag = _unit_databag(state_out, peer) + assert databag['state'] == LockIntent.REQUEST + assert databag['operations'] + assert databag['processing_backend'] == ProcessingBackend.PEER + assert databag['etcd_cleanup_needed'] == 'true' + + q = OperationQueue.from_string(databag['operations']) + assert len(q) == 1 + operation = q.peek() + assert operation is not None + assert operation.callback_id == '_restart' + assert operation.kwargs == {'delay': 10} + assert operation.max_retry is None + assert operation.requested_at is not None + + +def test_state_not_initialized(ctx: Context[RollingOpsCharm]): + state = State(leader=True) + + with ctx(ctx.on.start(), state) as mgr: + rolling_state = mgr.charm.restart_manager.state # type: ignore[reportUnknownVariableType] + assert isinstance(rolling_state, RollingOpsState) 
+ assert rolling_state.status == RollingOpsStatus.INVALID + assert rolling_state.processing_backend is None + assert len(rolling_state.operations) == 0 + + +def test_state_peer_idle(ctx: Context[RollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + local_unit_data={ + 'state': '', + 'operations': '', + 'executed_at': '', + 'processing_backend': 'peer', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with ctx(ctx.on.update_status(), state) as mgr: + rolling_state = mgr.charm.restart_manager.state # type: ignore[reportUnknownVariableType] + assert isinstance(rolling_state, RollingOpsState) + assert rolling_state.status == RollingOpsStatus.IDLE + assert rolling_state.processing_backend == ProcessingBackend.PEER + assert len(rolling_state.operations) == 0 + + +def test_state_peer_waiting(ctx: Context[RollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + local_unit_data={ + 'state': 'request', + 'operations': OperationQueue([ + Operation.create('restart', {'delay': 1}, max_retry=2) + ]).to_string(), + 'executed_at': '', + 'processing_backend': 'peer', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with ctx(ctx.on.update_status(), state) as mgr: + rolling_state = mgr.charm.restart_manager.state # type: ignore[reportUnknownVariableType] + assert isinstance(rolling_state, RollingOpsState) + assert rolling_state.status == RollingOpsStatus.WAITING + assert rolling_state.processing_backend == ProcessingBackend.PEER + assert len(rolling_state.operations) == 1 + + +def test_state_peer_is_granted(ctx: Context[RollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + local_app_data={ + 'granted_unit': f'{ctx.app_name}/0', + }, + local_unit_data={ + 'state': 'retry-release', + 'operations': OperationQueue([ + Operation.create('restart', {'delay': 1}, max_retry=2) + ]).to_string(), + 'executed_at': '2026-04-09T10:01:00+00:00', + 
'processing_backend': 'peer', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with ctx(ctx.on.update_status(), state) as mgr: + rolling_state = mgr.charm.restart_manager.state # type: ignore[reportUnknownVariableType] + assert isinstance(rolling_state, RollingOpsState) + assert rolling_state.status == RollingOpsStatus.GRANTED + assert rolling_state.processing_backend == ProcessingBackend.PEER + assert len(rolling_state.operations) == 1 + + +def test_state_peer_waiting_retry(ctx: Context[RollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + local_app_data={ + 'granted_unit': 'myapp/0', + }, + local_unit_data={ + 'state': 'retry-release', + 'operations': OperationQueue([ + Operation.create('restart', {'delay': 1}, max_retry=2) + ]).to_string(), + 'executed_at': '2026-04-09T10:01:00+00:00', + 'processing_backend': 'peer', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with ctx(ctx.on.update_status(), state) as mgr: + rolling_state = mgr.charm.restart_manager.state # type: ignore[reportUnknownVariableType] + assert isinstance(rolling_state, RollingOpsState) + assert rolling_state.status == RollingOpsStatus.WAITING + assert rolling_state.processing_backend == ProcessingBackend.PEER + assert len(rolling_state.operations) == 1 + + +def test_state_etcd_status(ctx: Context[RollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + interface='rollingops', + local_app_data={}, + local_unit_data={ + 'state': '', + 'operations': OperationQueue([ + Operation.create('restart', {'delay': 1}, max_retry=2) + ]).to_string(), + 'executed_at': '', + 'processing_backend': 'etcd', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with patch( + 'charmlibs.rollingops.etcd._backend.EtcdRollingOpsBackend.get_status', + return_value=RollingOpsStatus.GRANTED, + ): + with ctx(ctx.on.update_status(), state) as mgr: + 
rolling_state = mgr.charm.restart_manager.state # type: ignore[reportUnknownVariableType] + assert isinstance(rolling_state, RollingOpsState) + assert rolling_state.status == RollingOpsStatus.GRANTED + assert rolling_state.processing_backend == ProcessingBackend.ETCD + assert len(rolling_state.operations) == 1 + + +def test_state_falls_back_to_peer_if_etcd_status_fails(ctx: Context[RollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + interface='rollingops', + local_app_data={}, + local_unit_data={ + 'state': 'request', + 'operations': OperationQueue([Operation.create('restart', {'delay': 1})]).to_string(), + 'executed_at': '', + 'processing_backend': 'etcd', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with patch( + 'charmlibs.rollingops._rollingops_manager.EtcdRollingOpsBackend.get_status', + side_effect=RollingOpsEtcdNotConfiguredError('boom'), + ): + with ctx(ctx.on.update_status(), state) as mgr: + rolling_state = mgr.charm.restart_manager.state # type: ignore[reportUnknownVariableType] + assert isinstance(rolling_state, RollingOpsState) + assert rolling_state.status == RollingOpsStatus.WAITING + assert rolling_state.processing_backend == ProcessingBackend.PEER + assert len(rolling_state.operations) == 1 diff --git a/rollingops/tests/unit/test_peer_models.py b/rollingops/tests/unit/test_peer_models.py new file mode 100644 index 000000000..709a38867 --- /dev/null +++ b/rollingops/tests/unit/test_peer_models.py @@ -0,0 +1,144 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Learn more about testing at: https://juju.is/docs/sdk/testing + +import json + +import pytest + +from charmlibs.rollingops.common._exceptions import RollingOpsDecodingError +from charmlibs.rollingops.common._models import Operation, OperationQueue + + +def _decode_queue_string(queue_str: str) -> list[dict[str, str]]: + """Helper: decode OperationQueue.to_string() -> list of dicts.""" + items = json.loads(queue_str) + assert isinstance(items, list) + return [json.loads(s) for s in items] # type: ignore[reportUnknownArgumentType] + + +def test_queue_empty_behaviour(): + q = OperationQueue() + + assert len(q) == 0 + assert q.empty is True + assert q.peek() is None + assert q.dequeue() is None + + assert json.loads(q.to_string()) == [] + + +def test_queue_enqueue_and_fifo_order(): + q = OperationQueue() + op1 = Operation.create('a', {'x': 2}) + op2 = Operation.create('b', {'i': 2}) + q.enqueue(op1) + q.enqueue(op2) + + assert len(q) == 2 + op = q.peek() + assert op is not None + assert op == op1 + + first = q.dequeue() + assert first is not None + assert first == op1 + assert len(q) == 1 + op = q.peek() + assert op is not None + assert op == op2 + + second = q.dequeue() + assert second is not None + assert second == op2 + assert q.empty is True + + +def test_queue_deduplicates_only_against_last_item(): + q = OperationQueue() + op1 = Operation.create('a', {'x': 2}) + op2 = Operation.create('a', {'x': 2}) + op3 = Operation.create('a', {'x': 4}) + + q.enqueue(op1) + assert len(q) == 1 + + q.enqueue(op2) + assert len(q) == 1 + + q.enqueue(op3) + assert len(q) 
== 2 + + q.enqueue(op2) + assert len(q) == 3 + + +def test_queue_to_string_and_from_string(): + q1 = OperationQueue() + op1 = Operation.create('a', {'x': 1}, max_retry=5) + op2 = Operation.create('b', {'y': 'z'}, max_retry=None) + q1.enqueue(op1) + q1.enqueue(op2) + + encoded = q1.to_string() + q2 = OperationQueue.from_string(encoded) + + assert len(q2) == 2 + op = q2.peek() + assert op is not None + assert op == op1 + + op = q2.dequeue() + assert op is not None + assert op == op1 + + op = q2.dequeue() + assert op is not None + assert op == op2 + assert q2.empty + + +def test_queue_from_string_empty_string_is_empty_queue(): + q = OperationQueue.from_string('') + assert q.empty + assert q.peek() is None + + +def test_queue_from_string_rejects_non_list_json(): + with pytest.raises(RollingOpsDecodingError, match='OperationQueue string'): + OperationQueue.from_string(json.dumps({'not': 'a list'})) + + +def test_queue_from_string_rejects_invalid_jason(): + with pytest.raises(RollingOpsDecodingError, match='Failed to deserialize data'): + OperationQueue.from_string('{invalid') + + +def test_queue_encoding_is_list_of_operation_strings(): + q = OperationQueue() + op1 = Operation.create('a', {'x': 1}) + q.enqueue(op1) + s = q.to_string() + + decoded = json.loads(s) + assert isinstance(decoded, list) + assert len(decoded) == 1 # type: ignore[reportUnknownArgumentType] + assert isinstance(decoded[0], str) + + op_dicts = _decode_queue_string(s) + assert op_dicts[0]['callback_id'] == 'a' + assert op_dicts[0]['kwargs'] == '{"x":1}' + assert op_dicts[0].get('max_retry', '') == '' + assert 'requested_at' in op_dicts[0] diff --git a/rollingops/tests/unit/test_peer_rollingops_in_charm.py b/rollingops/tests/unit/test_peer_rollingops_in_charm.py new file mode 100644 index 000000000..9c18745c0 --- /dev/null +++ b/rollingops/tests/unit/test_peer_rollingops_in_charm.py @@ -0,0 +1,529 @@ +# Copyright 2026 Canonical Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Learn more about testing at: https://juju.is/docs/sdk/testing + + +from typing import Any + +import pytest +from ops.testing import Context, PeerRelation, State +from scenario import RawDataBagContents +from tests.unit.conftest import PeerRollingOpsCharm + +from charmlibs.rollingops.common._exceptions import RollingOpsInvalidLockRequestError +from charmlibs.rollingops.common._models import Operation, OperationQueue +from charmlibs.rollingops.common._utils import now_timestamp +from charmlibs.rollingops.peer._models import LockIntent + + +def _unit_databag(state: State, peer: PeerRelation) -> RawDataBagContents: + return state.get_relation(peer.id).local_unit_data + + +def _app_databag(state: State, peer: PeerRelation) -> RawDataBagContents: + return state.get_relation(peer.id).local_app_data + + +def _make_operation_queue( + callback_id: str, kwargs: dict[str, Any], max_retry: int | None +) -> OperationQueue: + q = OperationQueue() + op1 = Operation.create(callback_id=callback_id, kwargs=kwargs, max_retry=max_retry) + q.enqueue(op1) + return q + + +def test_lock_request_enqueues_and_sets_request( + peer_ctx: Context[PeerRollingOpsCharm], +): + peer = PeerRelation(endpoint='restart') + state_in = State(leader=False, relations={peer}) + + state_out = peer_ctx.run( + peer_ctx.on.action('restart', params={'delay': 10}), + state_in, + ) + + databag = _unit_databag(state_out, peer) + assert databag['state'] == 
LockIntent.REQUEST + assert databag['operations'] + + q = OperationQueue.from_string(databag['operations']) + assert len(q) == 1 + operation = q.peek() + assert operation is not None + assert operation.callback_id == '_restart' + assert operation.kwargs == {'delay': 10} + assert operation.max_retry is None + assert operation.requested_at is not None + + +@pytest.mark.parametrize( + 'max_retry', + [ + (-5), + (-1), + ('3'), + ], +) +def test_lock_request_invalid_inputs(peer_ctx: Context[PeerRollingOpsCharm], max_retry: Any): + peer = PeerRelation(endpoint='restart') + state_in = State(leader=False, relations={peer}) + + with peer_ctx(peer_ctx.on.update_status(), state_in) as mgr: + with pytest.raises(RollingOpsInvalidLockRequestError): + mgr.charm.restart_manager.request_async_lock( + callback_id='_restart', + kwargs={}, + max_retry=max_retry, + ) + + +@pytest.mark.parametrize( + 'callback_id', + [ + ('',), + (' ',), + ('unknown',), + ], +) +def test_lock_request_invalid_callback_id( + peer_ctx: Context[PeerRollingOpsCharm], callback_id: str +): + peer = PeerRelation(endpoint='restart') + state_in = State(leader=False, relations={peer}) + + with peer_ctx(peer_ctx.on.update_status(), state_in) as mgr: + with pytest.raises(RollingOpsInvalidLockRequestError, match='Unknown callback_id'): + mgr.charm.restart_manager.request_async_lock( + callback_id=callback_id, + kwargs={}, + max_retry=0, + ) + + +@pytest.mark.parametrize( + 'kwargs', + [ + ('nope'), + ([]), + ({'x': OperationQueue()}), + ], +) +def test_lock_request_invalid_kwargs(peer_ctx: Context[PeerRollingOpsCharm], kwargs: Any): + peer = PeerRelation(endpoint='restart') + state_in = State(leader=False, relations={peer}) + + with peer_ctx(peer_ctx.on.update_status(), state_in) as mgr: + with pytest.raises( + RollingOpsInvalidLockRequestError, match='Failed to create the lock request' + ): + mgr.charm.restart_manager.request_async_lock( + callback_id='_restart', + kwargs=kwargs, + max_retry=0, + ) + + +def 
test_existing_operation_then_new_request(peer_ctx: Context[PeerRollingOpsCharm]): + queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) + peer = PeerRelation( + endpoint='restart', + local_unit_data={'state': LockIntent.REQUEST, 'operations': queue.to_string()}, + ) + + state_in = State(leader=False, relations={peer}) + + state_out = peer_ctx.run(peer_ctx.on.action('restart', params={'delay': 10}), state_in) + + databag = _unit_databag(state_out, peer) + assert databag['state'] == LockIntent.REQUEST + result = OperationQueue.from_string(databag['operations']) + + assert len(result) == 2 + assert result.operations[0].callback_id == '_failed_restart' + assert result.operations[1].callback_id == '_restart' + + +def test_new_request_does_not_overwrite_state_if_queue_not_empty( + peer_ctx: Context[PeerRollingOpsCharm], +): + queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) + executed_at = str(now_timestamp().timestamp()) + peer = PeerRelation( + endpoint='restart', + local_unit_data={ + 'state': LockIntent.RETRY_RELEASE, + 'executed_at': executed_at, + 'operations': queue.to_string(), + }, + ) + state_in = State(leader=False, relations={peer}) + + state_out = peer_ctx.run(peer_ctx.on.action('restart', params={'delay': 10}), state_in) + + databag = _unit_databag(state_out, peer) + assert databag['state'] == LockIntent.RETRY_RELEASE + assert databag['executed_at'] == executed_at + result = OperationQueue.from_string(databag['operations']) + assert len(result) == 2 + assert result.operations[0].callback_id == '_failed_restart' + assert result.operations[1].callback_id == '_restart' + + +def test_relation_changed_without_grant_does_not_run_operation( + peer_ctx: Context[PeerRollingOpsCharm], +): + remote_unit_name = f'{peer_ctx.app_name}/1' + queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) + peer = PeerRelation( + endpoint='restart', + local_unit_data={'state': 
LockIntent.REQUEST, 'operations': queue.to_string()}, + local_app_data={ + 'granted_unit': remote_unit_name, + 'granted_at': str(now_timestamp().timestamp()), + }, + ) + + state_in = State(leader=False, relations={peer}) + + state_out = peer_ctx.run( + peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in + ) + + databag = _unit_databag(state_out, peer) + assert databag['state'] == LockIntent.REQUEST + result = OperationQueue.from_string(databag['operations']) + assert len(result) == 1 + assert databag.get('executed_at', '') == '' + + +def test_lock_complete_pops_head(peer_ctx: Context[PeerRollingOpsCharm]): + remote_unit_name = f'{peer_ctx.app_name}/1' + local_unit_name = f'{peer_ctx.app_name}/0' + queue = _make_operation_queue(callback_id='_restart', kwargs={}, max_retry=0) + peer = PeerRelation( + endpoint='restart', + local_unit_data={'state': LockIntent.REQUEST, 'operations': queue.to_string()}, + local_app_data={ + 'granted_unit': local_unit_name, + 'granted_at': str(now_timestamp().timestamp()), + }, + ) + state_in = State(leader=False, relations={peer}) + + state_out = peer_ctx.run( + peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in + ) + + databag = _unit_databag(state_out, peer) + assert databag['state'] == LockIntent.IDLE + assert databag['executed_at'] is not None + assert databag.get('operations', None) == '[]' + + q = OperationQueue.from_string(databag['operations']) + assert len(q) == 0 + + +def test_successful_operation_leaves_state_request_when_more_ops_remain( + peer_ctx: Context[PeerRollingOpsCharm], +): + local_unit_name = f'{peer_ctx.app_name}/0' + remote_unit_name = f'{peer_ctx.app_name}/1' + queue = OperationQueue() + op1 = Operation.create(callback_id='_restart', kwargs={}, max_retry=None) + op2 = Operation.create(callback_id='_failed_restart', kwargs={}, max_retry=None) + + queue.enqueue(op1) + queue.enqueue(op2) + + peer = PeerRelation( + endpoint='restart', + local_unit_data={'state': 
LockIntent.REQUEST, 'operations': queue.to_string()}, + local_app_data={ + 'granted_unit': local_unit_name, + 'granted_at': str(now_timestamp().timestamp()), + }, + ) + + state_in = State(leader=False, relations={peer}) + + state_out = peer_ctx.run( + peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in + ) + + databag = _unit_databag(state_out, peer) + assert databag['state'] == LockIntent.REQUEST + q = OperationQueue.from_string(databag['operations']) + assert len(q) == 1 + current_operation = q.peek() + assert current_operation is not None + assert current_operation.callback_id == '_failed_restart' + + +@pytest.mark.parametrize( + 'callback_id, lock_intent', + [ + ('_failed_restart', LockIntent.RETRY_RELEASE), + ('_deferred_restart', LockIntent.RETRY_HOLD), + ], +) +def test_lock_retry_marks_retry( + peer_ctx: Context[PeerRollingOpsCharm], + callback_id: str, + lock_intent: LockIntent, +): + remote_unit_name = f'{peer_ctx.app_name}/1' + local_unit_name = f'{peer_ctx.app_name}/0' + queue = _make_operation_queue(callback_id=callback_id, kwargs={}, max_retry=3) + peer = PeerRelation( + endpoint='restart', + local_unit_data={'state': LockIntent.REQUEST, 'operations': queue.to_string()}, + local_app_data={ + 'granted_unit': local_unit_name, + 'granted_at': str(now_timestamp().timestamp()), + }, + ) + state_in = State(leader=False, relations={peer}) + + state_out = peer_ctx.run( + peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in + ) + + databag = _unit_databag(state_out, peer) + assert databag['state'] == lock_intent + assert databag['executed_at'] is not None + + q = OperationQueue.from_string(databag['operations']) + assert len(q) == 1 + current_operation = q.peek() + initial_operation = queue.peek() + assert current_operation is not None + assert initial_operation is not None + assert current_operation.callback_id == initial_operation.callback_id + assert current_operation.kwargs == initial_operation.kwargs + assert 
current_operation.max_retry == initial_operation.max_retry + assert current_operation.requested_at == initial_operation.requested_at + assert current_operation.attempt == 1 + + +@pytest.mark.parametrize( + 'callback_id', + [ + ('_failed_restart'), + ('_deferred_restart'), + ], +) +def test_lock_retry_drops_when_max_retry_reached( + peer_ctx: Context[PeerRollingOpsCharm], + callback_id: str, +): + remote_unit_name = f'{peer_ctx.app_name}/1' + local_unit_name = f'{peer_ctx.app_name}/0' + + queue = OperationQueue() + op1 = Operation.create(callback_id=callback_id, kwargs={}, max_retry=3) + queue.enqueue(op1) + op = queue.peek() + assert op is not None + op.increase_attempt() + op.increase_attempt() + op.increase_attempt() + + peer = PeerRelation( + endpoint='restart', + local_unit_data={'state': LockIntent.REQUEST, 'operations': queue.to_string()}, + local_app_data={ + 'granted_unit': local_unit_name, + 'granted_at': str(now_timestamp().timestamp()), + }, + ) + state_in = State(leader=False, relations={peer}) + + state_out = peer_ctx.run( + peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in + ) + + databag = _unit_databag(state_out, peer) + assert databag['state'] == LockIntent.IDLE + assert databag['executed_at'] is not None + + q = OperationQueue.from_string(databag['operations']) + assert len(q) == 0 + + +def test_lock_grant_and_release(peer_ctx: Context[PeerRollingOpsCharm]): + queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) + peer = PeerRelation( + endpoint='restart', + peers_data={1: {'state': LockIntent.REQUEST, 'operations': queue.to_string()}}, + ) + state_in = State(leader=True, relations={peer}) + + state = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + databag = _app_databag(state, peer) + + unit_name = f'{peer_ctx.app_name}/1' + assert unit_name in databag['granted_unit'] + assert databag['granted_at'] is not None + + +def test_scheduling_does_nothing_if_lock_already_granted(peer_ctx: 
Context[PeerRollingOpsCharm]): + queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) + remote_unit_name = f'{peer_ctx.app_name}/1' + now_timestamp_str = str(now_timestamp().timestamp()) + peer = PeerRelation( + endpoint='restart', + peers_data={ + 1: {'state': LockIntent.REQUEST, 'operations': queue.to_string()}, + 2: {'state': LockIntent.REQUEST, 'operations': queue.to_string()}, + }, + local_app_data={'granted_unit': remote_unit_name, 'granted_at': now_timestamp_str}, + ) + state_in = State(leader=True, relations={peer}) + + state_out = peer_ctx.run( + peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in + ) + + databag = _app_databag(state_out, peer) + assert databag['granted_unit'] == remote_unit_name + assert databag['granted_at'] == now_timestamp_str + + +def test_schedule_picks_retry_hold(peer_ctx: Context[PeerRollingOpsCharm]): + old_operation = str(now_timestamp().timestamp()) + queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) + new_operation = str(now_timestamp().timestamp()) + + peer = PeerRelation( + endpoint='restart', + peers_data={ + 1: { + 'state': LockIntent.RETRY_RELEASE, + 'operations': queue.to_string(), + 'executed_at': new_operation, + }, + 2: { + 'state': LockIntent.REQUEST, + 'operations': queue.to_string(), + 'executed_at': old_operation, + }, + 3: { + 'state': LockIntent.RETRY_HOLD, + 'operations': queue.to_string(), + 'executed_at': new_operation, + }, + }, + ) + state_in = State(leader=True, relations={peer}) + + state_out = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + + databag = _app_databag(state_out, peer) + remote_unit_name = f'{peer_ctx.app_name}/3' + assert databag['granted_unit'] == remote_unit_name + + +def test_schedule_picks_oldest_requested_at_among_requests(peer_ctx: Context[PeerRollingOpsCharm]): + old_queue = OperationQueue() + old_op = Operation.create(callback_id='restart', kwargs={}, max_retry=2) + 
old_queue.enqueue(old_op) + + new_queue = OperationQueue() + new_op = Operation.create(callback_id='restart', kwargs={}, max_retry=2) + new_queue.enqueue(new_op) + + peer = PeerRelation( + endpoint='restart', + peers_data={ + 1: {'state': LockIntent.REQUEST, 'operations': new_queue.to_string()}, + 2: {'state': LockIntent.REQUEST, 'operations': old_queue.to_string()}, + }, + ) + state_in = State(leader=True, relations={peer}) + + state_out = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + databag = _app_databag(state_out, peer) + remote_unit_name = f'{peer_ctx.app_name}/2' + assert databag['granted_unit'] == remote_unit_name + + +def test_schedule_picks_oldest_executed_at_among_retries_when_no_requests( + peer_ctx: Context[PeerRollingOpsCharm], +): + old_operation = str(now_timestamp().timestamp()) + queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) + new_operation = str(now_timestamp().timestamp()) + + peer = PeerRelation( + endpoint='restart', + peers_data={ + 1: { + 'state': LockIntent.RETRY_RELEASE, + 'operations': queue.to_string(), + 'executed_at': new_operation, + }, + 2: { + 'state': LockIntent.RETRY_RELEASE, + 'operations': queue.to_string(), + 'executed_at': old_operation, + }, + }, + ) + state_in = State(leader=True, relations={peer}) + + state_out = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + + databag = _app_databag(state_out, peer) + remote_unit_name = f'{peer_ctx.app_name}/2' + assert databag['granted_unit'] == remote_unit_name + + +def test_schedule_prioritizes_requests_over_retries(peer_ctx: Context[PeerRollingOpsCharm]): + queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) + + peer = PeerRelation( + endpoint='restart', + peers_data={ + 1: { + 'state': LockIntent.RETRY_RELEASE, + 'operations': queue.to_string(), + 'executed_at': str(now_timestamp().timestamp()), + }, + 2: {'state': LockIntent.REQUEST, 'operations': queue.to_string()}, + }, + ) + state_in = 
State(leader=True, relations={peer}) + + state_out = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + + databag = _app_databag(state_out, peer) + remote_unit_name = f'{peer_ctx.app_name}/2' + assert databag['granted_unit'] == remote_unit_name + + +def test_no_unit_is_granted_if_there_are_no_requests(peer_ctx: Context[PeerRollingOpsCharm]): + peer = PeerRelation( + endpoint='restart', + peers_data={1: {'state': LockIntent.IDLE}, 2: {'state': LockIntent.IDLE}}, + ) + state_in = State(leader=True, relations={peer}) + + state_out = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + + databag = _app_databag(state_out, peer) + assert databag.get('granted_unit', '') == '' + assert databag.get('granted_at', '') == '' diff --git a/rollingops/tests/unit/test_version.py b/rollingops/tests/unit/test_version.py new file mode 100644 index 000000000..23f6c4f6b --- /dev/null +++ b/rollingops/tests/unit/test_version.py @@ -0,0 +1,21 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for library code, not involving charm code.""" + +from charmlibs import rollingops + + +def test_version(): + assert isinstance(rollingops.__version__, str) diff --git a/rollingops/tests/unit/test_version_in_charm.py b/rollingops/tests/unit/test_version_in_charm.py new file mode 100644 index 000000000..3ef98de07 --- /dev/null +++ b/rollingops/tests/unit/test_version_in_charm.py @@ -0,0 +1,38 @@ +# Copyright 2026 Canonical Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Light weight state-transition tests of the library in a charming context.""" + +import ops +import ops.testing + +from charmlibs import rollingops + + +class Charm(ops.CharmBase): + package_version: str + + def __init__(self, framework: ops.Framework): + super().__init__(framework) + framework.observe(self.on.start, self._on_start) + + def _on_start(self, event: ops.StartEvent): + self.package_version = rollingops.__version__ + + +def test_version(): + ctx = ops.testing.Context(Charm, meta={'name': 'charm'}) + with ctx(ctx.on.start(), ops.testing.State()) as manager: + manager.run() + assert isinstance(manager.charm.package_version, str) diff --git a/rollingops/uv.lock b/rollingops/uv.lock new file mode 100644 index 000000000..e6833c191 --- /dev/null +++ b/rollingops/uv.lock @@ -0,0 +1,472 @@ +version = 1 +revision = 3 +requires-python = ">=3.12" + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", 
hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, +] + +[[package]] +name = "cffi" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pycparser", marker = "implementation_name != 'PyPy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ea/47/4f61023ea636104d4f16ab488e268b93008c3d0bb76893b1b31db1f96802/cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d", size = 185271, upload-time = "2025-09-08T23:22:44.795Z" }, + { url = "https://files.pythonhosted.org/packages/df/a2/781b623f57358e360d62cdd7a8c681f074a71d445418a776eef0aadb4ab4/cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c", size = 181048, upload-time = "2025-09-08T23:22:45.938Z" }, + { url = "https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529, upload-time = "2025-09-08T23:22:47.349Z" }, + { url = "https://files.pythonhosted.org/packages/d5/72/12b5f8d3865bf0f87cf1404d8c374e7487dcf097a1c91c436e72e6badd83/cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062", size = 220097, upload-time = "2025-09-08T23:22:48.677Z" }, + { url = 
"https://files.pythonhosted.org/packages/c2/95/7a135d52a50dfa7c882ab0ac17e8dc11cec9d55d2c18dda414c051c5e69e/cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e", size = 207983, upload-time = "2025-09-08T23:22:50.06Z" }, + { url = "https://files.pythonhosted.org/packages/3a/c8/15cb9ada8895957ea171c62dc78ff3e99159ee7adb13c0123c001a2546c1/cffi-2.0.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037", size = 206519, upload-time = "2025-09-08T23:22:51.364Z" }, + { url = "https://files.pythonhosted.org/packages/78/2d/7fa73dfa841b5ac06c7b8855cfc18622132e365f5b81d02230333ff26e9e/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba", size = 219572, upload-time = "2025-09-08T23:22:52.902Z" }, + { url = "https://files.pythonhosted.org/packages/07/e0/267e57e387b4ca276b90f0434ff88b2c2241ad72b16d31836adddfd6031b/cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94", size = 222963, upload-time = "2025-09-08T23:22:54.518Z" }, + { url = "https://files.pythonhosted.org/packages/b6/75/1f2747525e06f53efbd878f4d03bac5b859cbc11c633d0fb81432d98a795/cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187", size = 221361, upload-time = "2025-09-08T23:22:55.867Z" }, + { url = "https://files.pythonhosted.org/packages/7b/2b/2b6435f76bfeb6bbf055596976da087377ede68df465419d192acf00c437/cffi-2.0.0-cp312-cp312-win32.whl", hash = "sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18", size = 172932, upload-time = "2025-09-08T23:22:57.188Z" }, + { url = 
"https://files.pythonhosted.org/packages/f8/ed/13bd4418627013bec4ed6e54283b1959cf6db888048c7cf4b4c3b5b36002/cffi-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5", size = 183557, upload-time = "2025-09-08T23:22:58.351Z" }, + { url = "https://files.pythonhosted.org/packages/95/31/9f7f93ad2f8eff1dbc1c3656d7ca5bfd8fb52c9d786b4dcf19b2d02217fa/cffi-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6", size = 177762, upload-time = "2025-09-08T23:22:59.668Z" }, + { url = "https://files.pythonhosted.org/packages/4b/8d/a0a47a0c9e413a658623d014e91e74a50cdd2c423f7ccfd44086ef767f90/cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb", size = 185230, upload-time = "2025-09-08T23:23:00.879Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d2/a6c0296814556c68ee32009d9c2ad4f85f2707cdecfd7727951ec228005d/cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca", size = 181043, upload-time = "2025-09-08T23:23:02.231Z" }, + { url = "https://files.pythonhosted.org/packages/b0/1e/d22cc63332bd59b06481ceaac49d6c507598642e2230f201649058a7e704/cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b", size = 212446, upload-time = "2025-09-08T23:23:03.472Z" }, + { url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" }, + { url = 
"https://files.pythonhosted.org/packages/f2/7f/e6647792fc5850d634695bc0e6ab4111ae88e89981d35ac269956605feba/cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2", size = 207948, upload-time = "2025-09-08T23:23:06.127Z" }, + { url = "https://files.pythonhosted.org/packages/cb/1e/a5a1bd6f1fb30f22573f76533de12a00bf274abcdc55c8edab639078abb6/cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3", size = 206422, upload-time = "2025-09-08T23:23:07.753Z" }, + { url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" }, + { url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" }, + { url = "https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" }, + { url = "https://files.pythonhosted.org/packages/eb/6d/bf9bda840d5f1dfdbf0feca87fbdb64a918a69bca42cfa0ba7b137c48cb8/cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27", size = 172909, upload-time = "2025-09-08T23:23:14.32Z" }, + { url = 
"https://files.pythonhosted.org/packages/37/18/6519e1ee6f5a1e579e04b9ddb6f1676c17368a7aba48299c3759bbc3c8b3/cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75", size = 183402, upload-time = "2025-09-08T23:23:15.535Z" }, + { url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" }, + { url = "https://files.pythonhosted.org/packages/92/c4/3ce07396253a83250ee98564f8d7e9789fab8e58858f35d07a9a2c78de9f/cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5", size = 185320, upload-time = "2025-09-08T23:23:18.087Z" }, + { url = "https://files.pythonhosted.org/packages/59/dd/27e9fa567a23931c838c6b02d0764611c62290062a6d4e8ff7863daf9730/cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13", size = 181487, upload-time = "2025-09-08T23:23:19.622Z" }, + { url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" }, + { url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" }, + { url = 
"https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" }, + { url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" }, + { url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" }, + { url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" }, + { url = "https://files.pythonhosted.org/packages/3e/aa/df335faa45b395396fcbc03de2dfcab242cd61a9900e914fe682a59170b1/cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f", size = 175328, upload-time = "2025-09-08T23:23:44.61Z" }, + { url = "https://files.pythonhosted.org/packages/bb/92/882c2d30831744296ce713f0feb4c1cd30f346ef747b530b5318715cc367/cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25", size = 185650, upload-time = "2025-09-08T23:23:45.848Z" }, + { url = 
"https://files.pythonhosted.org/packages/9f/2c/98ece204b9d35a7366b5b2c6539c350313ca13932143e79dc133ba757104/cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad", size = 180687, upload-time = "2025-09-08T23:23:47.105Z" }, + { url = "https://files.pythonhosted.org/packages/3e/61/c768e4d548bfa607abcda77423448df8c471f25dbe64fb2ef6d555eae006/cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9", size = 188773, upload-time = "2025-09-08T23:23:29.347Z" }, + { url = "https://files.pythonhosted.org/packages/2c/ea/5f76bce7cf6fcd0ab1a1058b5af899bfbef198bea4d5686da88471ea0336/cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d", size = 185013, upload-time = "2025-09-08T23:23:30.63Z" }, + { url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" }, + { url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" }, + { url = "https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" }, + { url = 
"https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" }, + { url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" }, + { url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" }, + { url = "https://files.pythonhosted.org/packages/a0/1d/ec1a60bd1a10daa292d3cd6bb0b359a81607154fb8165f3ec95fe003b85c/cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e", size = 180487, upload-time = "2025-09-08T23:23:40.423Z" }, + { url = "https://files.pythonhosted.org/packages/bf/41/4c1168c74fac325c0c8156f04b6749c8b6a8f405bbf91413ba088359f60d/cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6", size = 191726, upload-time = "2025-09-08T23:23:41.742Z" }, + { url = "https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" }, +] + +[[package]] +name = "charmlibs-apt" +version = "1.0.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = 
"opentelemetry-api" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fd/58/33e87779fdbcf62a3b34e3444d7175c1168b4b2726cc29c98849c09ac086/charmlibs_apt-1.0.0.post0.tar.gz", hash = "sha256:9c2e0b3c1f553ebcaae99c9aad72e15383aec56677a8dd3f6479dc6f084189a6", size = 31942, upload-time = "2025-10-15T02:40:29.521Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dd/92/4db19cd8bc94db51a115f7a2e3c46d96b991ca7ebe27207beac9a6570bc6/charmlibs_apt-1.0.0.post0-py3-none-any.whl", hash = "sha256:958e84719eb1feff539f058dc6c7af648c53c88b9ebe7c6157ec8d2bdf5fbfc6", size = 19287, upload-time = "2025-10-15T02:40:27.756Z" }, +] + +[[package]] +name = "charmlibs-interfaces-tls-certificates" +version = "1.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography" }, + { name = "ops" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/78/7e/166af1e71f2bf96482845a1806dc345cbc5507134a99ccbbae297f174e4b/charmlibs_interfaces_tls_certificates-1.8.1.tar.gz", hash = "sha256:f2bfabf3a3b4c18034941771733177b30e4742c06d7742d4bb30da6ead953f43", size = 148059, upload-time = "2026-02-27T13:46:50.086Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/17/1d1b0083800f4cc20f42e5d2763521d93975376499565c62da5276a80629/charmlibs_interfaces_tls_certificates-1.8.1-py3-none-any.whl", hash = "sha256:8e8fe047e02515d76f57a1d019056d72ce8c859c2ffb39a1e379cfc11fc048e6", size = 28208, upload-time = "2026-02-27T13:46:48.959Z" }, +] + +[[package]] +name = "charmlibs-pathops" +version = "1.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ops" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/05/18/120f0635f449af7ef9e1a431b7f1c21eb4e09a39870f3862350853ba92a1/charmlibs_pathops-1.2.1.tar.gz", hash = "sha256:00fa50f95bb7fbfbe3d5507de94e583f7333f63ec6ef42a49600b641aabbcfd3", size = 21892, upload-time = "2026-02-06T00:34:03.882Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/8d/57/b54e4fcb9c53085fde0bc6016232e0bb5f88cbf7a03ef78176848793b648/charmlibs_pathops-1.2.1-py3-none-any.whl", hash = "sha256:36dc4e5f76ae3eb89020df916c14e169f71ba856f71a430dab94b0b7948e9b10", size = 27005, upload-time = "2026-02-06T00:34:02.669Z" }, +] + +[[package]] +name = "charmlibs-rollingops" +source = { editable = "." } +dependencies = [ + { name = "charmlibs-interfaces-tls-certificates" }, + { name = "charmlibs-pathops" }, + { name = "dpcharmlibs-interfaces" }, + { name = "ops" }, + { name = "tenacity" }, +] + +[package.dev-dependencies] +integration = [ + { name = "charmlibs-apt" }, + { name = "jubilant" }, + { name = "tenacity" }, +] +unit = [ + { name = "ops", extra = ["testing"] }, +] + +[package.metadata] +requires-dist = [ + { name = "charmlibs-interfaces-tls-certificates", specifier = ">=1.8.1" }, + { name = "charmlibs-pathops", specifier = ">=1.2.1" }, + { name = "dpcharmlibs-interfaces", specifier = "==1.0.0" }, + { name = "ops" }, + { name = "tenacity" }, +] + +[package.metadata.requires-dev] +functional = [] +integration = [ + { name = "charmlibs-apt" }, + { name = "jubilant" }, + { name = "tenacity" }, +] +lint = [] +unit = [{ name = "ops", extras = ["testing"] }] + +[[package]] +name = "cryptography" +version = "46.0.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/60/04/ee2a9e8542e4fa2773b81771ff8349ff19cdd56b7258a0cc442639052edb/cryptography-46.0.5.tar.gz", hash = "sha256:abace499247268e3757271b2f1e244b36b06f8515cf27c4d49468fc9eb16e93d", size = 750064, upload-time = "2026-02-10T19:18:38.255Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/81/b0bb27f2ba931a65409c6b8a8b358a7f03c0e46eceacddff55f7c84b1f3b/cryptography-46.0.5-cp311-abi3-macosx_10_9_universal2.whl", hash = 
"sha256:351695ada9ea9618b3500b490ad54c739860883df6c1f555e088eaf25b1bbaad", size = 7176289, upload-time = "2026-02-10T19:17:08.274Z" }, + { url = "https://files.pythonhosted.org/packages/ff/9e/6b4397a3e3d15123de3b1806ef342522393d50736c13b20ec4c9ea6693a6/cryptography-46.0.5-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c18ff11e86df2e28854939acde2d003f7984f721eba450b56a200ad90eeb0e6b", size = 4275637, upload-time = "2026-02-10T19:17:10.53Z" }, + { url = "https://files.pythonhosted.org/packages/63/e7/471ab61099a3920b0c77852ea3f0ea611c9702f651600397ac567848b897/cryptography-46.0.5-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d7e3d356b8cd4ea5aff04f129d5f66ebdc7b6f8eae802b93739ed520c47c79b", size = 4424742, upload-time = "2026-02-10T19:17:12.388Z" }, + { url = "https://files.pythonhosted.org/packages/37/53/a18500f270342d66bf7e4d9f091114e31e5ee9e7375a5aba2e85a91e0044/cryptography-46.0.5-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:50bfb6925eff619c9c023b967d5b77a54e04256c4281b0e21336a130cd7fc263", size = 4277528, upload-time = "2026-02-10T19:17:13.853Z" }, + { url = "https://files.pythonhosted.org/packages/22/29/c2e812ebc38c57b40e7c583895e73c8c5adb4d1e4a0cc4c5a4fdab2b1acc/cryptography-46.0.5-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:803812e111e75d1aa73690d2facc295eaefd4439be1023fefc4995eaea2af90d", size = 4947993, upload-time = "2026-02-10T19:17:15.618Z" }, + { url = "https://files.pythonhosted.org/packages/6b/e7/237155ae19a9023de7e30ec64e5d99a9431a567407ac21170a046d22a5a3/cryptography-46.0.5-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3ee190460e2fbe447175cda91b88b84ae8322a104fc27766ad09428754a618ed", size = 4456855, upload-time = "2026-02-10T19:17:17.221Z" }, + { url = "https://files.pythonhosted.org/packages/2d/87/fc628a7ad85b81206738abbd213b07702bcbdada1dd43f72236ef3cffbb5/cryptography-46.0.5-cp311-abi3-manylinux_2_31_armv7l.whl", hash = 
"sha256:f145bba11b878005c496e93e257c1e88f154d278d2638e6450d17e0f31e558d2", size = 3984635, upload-time = "2026-02-10T19:17:18.792Z" }, + { url = "https://files.pythonhosted.org/packages/84/29/65b55622bde135aedf4565dc509d99b560ee4095e56989e815f8fd2aa910/cryptography-46.0.5-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:e9251e3be159d1020c4030bd2e5f84d6a43fe54b6c19c12f51cde9542a2817b2", size = 4277038, upload-time = "2026-02-10T19:17:20.256Z" }, + { url = "https://files.pythonhosted.org/packages/bc/36/45e76c68d7311432741faf1fbf7fac8a196a0a735ca21f504c75d37e2558/cryptography-46.0.5-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:47fb8a66058b80e509c47118ef8a75d14c455e81ac369050f20ba0d23e77fee0", size = 4912181, upload-time = "2026-02-10T19:17:21.825Z" }, + { url = "https://files.pythonhosted.org/packages/6d/1a/c1ba8fead184d6e3d5afcf03d569acac5ad063f3ac9fb7258af158f7e378/cryptography-46.0.5-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:4c3341037c136030cb46e4b1e17b7418ea4cbd9dd207e4a6f3b2b24e0d4ac731", size = 4456482, upload-time = "2026-02-10T19:17:25.133Z" }, + { url = "https://files.pythonhosted.org/packages/f9/e5/3fb22e37f66827ced3b902cf895e6a6bc1d095b5b26be26bd13c441fdf19/cryptography-46.0.5-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:890bcb4abd5a2d3f852196437129eb3667d62630333aacc13dfd470fad3aaa82", size = 4405497, upload-time = "2026-02-10T19:17:26.66Z" }, + { url = "https://files.pythonhosted.org/packages/1a/df/9d58bb32b1121a8a2f27383fabae4d63080c7ca60b9b5c88be742be04ee7/cryptography-46.0.5-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:80a8d7bfdf38f87ca30a5391c0c9ce4ed2926918e017c29ddf643d0ed2778ea1", size = 4667819, upload-time = "2026-02-10T19:17:28.569Z" }, + { url = "https://files.pythonhosted.org/packages/ea/ed/325d2a490c5e94038cdb0117da9397ece1f11201f425c4e9c57fe5b9f08b/cryptography-46.0.5-cp311-abi3-win32.whl", hash = "sha256:60ee7e19e95104d4c03871d7d7dfb3d22ef8a9b9c6778c94e1c8fcc8365afd48", size = 3028230, 
upload-time = "2026-02-10T19:17:30.518Z" }, + { url = "https://files.pythonhosted.org/packages/e9/5a/ac0f49e48063ab4255d9e3b79f5def51697fce1a95ea1370f03dc9db76f6/cryptography-46.0.5-cp311-abi3-win_amd64.whl", hash = "sha256:38946c54b16c885c72c4f59846be9743d699eee2b69b6988e0a00a01f46a61a4", size = 3480909, upload-time = "2026-02-10T19:17:32.083Z" }, + { url = "https://files.pythonhosted.org/packages/00/13/3d278bfa7a15a96b9dc22db5a12ad1e48a9eb3d40e1827ef66a5df75d0d0/cryptography-46.0.5-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:94a76daa32eb78d61339aff7952ea819b1734b46f73646a07decb40e5b3448e2", size = 7119287, upload-time = "2026-02-10T19:17:33.801Z" }, + { url = "https://files.pythonhosted.org/packages/67/c8/581a6702e14f0898a0848105cbefd20c058099e2c2d22ef4e476dfec75d7/cryptography-46.0.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5be7bf2fb40769e05739dd0046e7b26f9d4670badc7b032d6ce4db64dddc0678", size = 4265728, upload-time = "2026-02-10T19:17:35.569Z" }, + { url = "https://files.pythonhosted.org/packages/dd/4a/ba1a65ce8fc65435e5a849558379896c957870dd64fecea97b1ad5f46a37/cryptography-46.0.5-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe346b143ff9685e40192a4960938545c699054ba11d4f9029f94751e3f71d87", size = 4408287, upload-time = "2026-02-10T19:17:36.938Z" }, + { url = "https://files.pythonhosted.org/packages/f8/67/8ffdbf7b65ed1ac224d1c2df3943553766914a8ca718747ee3871da6107e/cryptography-46.0.5-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:c69fd885df7d089548a42d5ec05be26050ebcd2283d89b3d30676eb32ff87dee", size = 4270291, upload-time = "2026-02-10T19:17:38.748Z" }, + { url = "https://files.pythonhosted.org/packages/f8/e5/f52377ee93bc2f2bba55a41a886fd208c15276ffbd2569f2ddc89d50e2c5/cryptography-46.0.5-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:8293f3dea7fc929ef7240796ba231413afa7b68ce38fd21da2995549f5961981", size = 4927539, upload-time = "2026-02-10T19:17:40.241Z" 
}, + { url = "https://files.pythonhosted.org/packages/3b/02/cfe39181b02419bbbbcf3abdd16c1c5c8541f03ca8bda240debc467d5a12/cryptography-46.0.5-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:1abfdb89b41c3be0365328a410baa9df3ff8a9110fb75e7b52e66803ddabc9a9", size = 4442199, upload-time = "2026-02-10T19:17:41.789Z" }, + { url = "https://files.pythonhosted.org/packages/c0/96/2fcaeb4873e536cf71421a388a6c11b5bc846e986b2b069c79363dc1648e/cryptography-46.0.5-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:d66e421495fdb797610a08f43b05269e0a5ea7f5e652a89bfd5a7d3c1dee3648", size = 3960131, upload-time = "2026-02-10T19:17:43.379Z" }, + { url = "https://files.pythonhosted.org/packages/d8/d2/b27631f401ddd644e94c5cf33c9a4069f72011821cf3dc7309546b0642a0/cryptography-46.0.5-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:4e817a8920bfbcff8940ecfd60f23d01836408242b30f1a708d93198393a80b4", size = 4270072, upload-time = "2026-02-10T19:17:45.481Z" }, + { url = "https://files.pythonhosted.org/packages/f4/a7/60d32b0370dae0b4ebe55ffa10e8599a2a59935b5ece1b9f06edb73abdeb/cryptography-46.0.5-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:68f68d13f2e1cb95163fa3b4db4bf9a159a418f5f6e7242564fc75fcae667fd0", size = 4892170, upload-time = "2026-02-10T19:17:46.997Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b9/cf73ddf8ef1164330eb0b199a589103c363afa0cf794218c24d524a58eab/cryptography-46.0.5-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:a3d1fae9863299076f05cb8a778c467578262fae09f9dc0ee9b12eb4268ce663", size = 4441741, upload-time = "2026-02-10T19:17:48.661Z" }, + { url = "https://files.pythonhosted.org/packages/5f/eb/eee00b28c84c726fe8fa0158c65afe312d9c3b78d9d01daf700f1f6e37ff/cryptography-46.0.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c4143987a42a2397f2fc3b4d7e3a7d313fbe684f67ff443999e803dd75a76826", size = 4396728, upload-time = "2026-02-10T19:17:50.058Z" }, + { url = 
"https://files.pythonhosted.org/packages/65/f4/6bc1a9ed5aef7145045114b75b77c2a8261b4d38717bd8dea111a63c3442/cryptography-46.0.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:7d731d4b107030987fd61a7f8ab512b25b53cef8f233a97379ede116f30eb67d", size = 4652001, upload-time = "2026-02-10T19:17:51.54Z" }, + { url = "https://files.pythonhosted.org/packages/86/ef/5d00ef966ddd71ac2e6951d278884a84a40ffbd88948ef0e294b214ae9e4/cryptography-46.0.5-cp314-cp314t-win32.whl", hash = "sha256:c3bcce8521d785d510b2aad26ae2c966092b7daa8f45dd8f44734a104dc0bc1a", size = 3003637, upload-time = "2026-02-10T19:17:52.997Z" }, + { url = "https://files.pythonhosted.org/packages/b7/57/f3f4160123da6d098db78350fdfd9705057aad21de7388eacb2401dceab9/cryptography-46.0.5-cp314-cp314t-win_amd64.whl", hash = "sha256:4d8ae8659ab18c65ced284993c2265910f6c9e650189d4e3f68445ef82a810e4", size = 3469487, upload-time = "2026-02-10T19:17:54.549Z" }, + { url = "https://files.pythonhosted.org/packages/e2/fa/a66aa722105ad6a458bebd64086ca2b72cdd361fed31763d20390f6f1389/cryptography-46.0.5-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:4108d4c09fbbf2789d0c926eb4152ae1760d5a2d97612b92d508d96c861e4d31", size = 7170514, upload-time = "2026-02-10T19:17:56.267Z" }, + { url = "https://files.pythonhosted.org/packages/0f/04/c85bdeab78c8bc77b701bf0d9bdcf514c044e18a46dcff330df5448631b0/cryptography-46.0.5-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1f30a86d2757199cb2d56e48cce14deddf1f9c95f1ef1b64ee91ea43fe2e18", size = 4275349, upload-time = "2026-02-10T19:17:58.419Z" }, + { url = "https://files.pythonhosted.org/packages/5c/32/9b87132a2f91ee7f5223b091dc963055503e9b442c98fc0b8a5ca765fab0/cryptography-46.0.5-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:039917b0dc418bb9f6edce8a906572d69e74bd330b0b3fea4f79dab7f8ddd235", size = 4420667, upload-time = "2026-02-10T19:18:00.619Z" }, + { url = 
"https://files.pythonhosted.org/packages/a1/a6/a7cb7010bec4b7c5692ca6f024150371b295ee1c108bdc1c400e4c44562b/cryptography-46.0.5-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ba2a27ff02f48193fc4daeadf8ad2590516fa3d0adeeb34336b96f7fa64c1e3a", size = 4276980, upload-time = "2026-02-10T19:18:02.379Z" }, + { url = "https://files.pythonhosted.org/packages/8e/7c/c4f45e0eeff9b91e3f12dbd0e165fcf2a38847288fcfd889deea99fb7b6d/cryptography-46.0.5-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:61aa400dce22cb001a98014f647dc21cda08f7915ceb95df0c9eaf84b4b6af76", size = 4939143, upload-time = "2026-02-10T19:18:03.964Z" }, + { url = "https://files.pythonhosted.org/packages/37/19/e1b8f964a834eddb44fa1b9a9976f4e414cbb7aa62809b6760c8803d22d1/cryptography-46.0.5-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3ce58ba46e1bc2aac4f7d9290223cead56743fa6ab94a5d53292ffaac6a91614", size = 4453674, upload-time = "2026-02-10T19:18:05.588Z" }, + { url = "https://files.pythonhosted.org/packages/db/ed/db15d3956f65264ca204625597c410d420e26530c4e2943e05a0d2f24d51/cryptography-46.0.5-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:420d0e909050490d04359e7fdb5ed7e667ca5c3c402b809ae2563d7e66a92229", size = 3978801, upload-time = "2026-02-10T19:18:07.167Z" }, + { url = "https://files.pythonhosted.org/packages/41/e2/df40a31d82df0a70a0daf69791f91dbb70e47644c58581d654879b382d11/cryptography-46.0.5-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:582f5fcd2afa31622f317f80426a027f30dc792e9c80ffee87b993200ea115f1", size = 4276755, upload-time = "2026-02-10T19:18:09.813Z" }, + { url = "https://files.pythonhosted.org/packages/33/45/726809d1176959f4a896b86907b98ff4391a8aa29c0aaaf9450a8a10630e/cryptography-46.0.5-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:bfd56bb4b37ed4f330b82402f6f435845a5f5648edf1ad497da51a8452d5d62d", size = 4901539, upload-time = "2026-02-10T19:18:11.263Z" }, + { url = 
"https://files.pythonhosted.org/packages/99/0f/a3076874e9c88ecb2ecc31382f6e7c21b428ede6f55aafa1aa272613e3cd/cryptography-46.0.5-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a3d507bb6a513ca96ba84443226af944b0f7f47dcc9a399d110cd6146481d24c", size = 4452794, upload-time = "2026-02-10T19:18:12.914Z" }, + { url = "https://files.pythonhosted.org/packages/02/ef/ffeb542d3683d24194a38f66ca17c0a4b8bf10631feef44a7ef64e631b1a/cryptography-46.0.5-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9f16fbdf4da055efb21c22d81b89f155f02ba420558db21288b3d0035bafd5f4", size = 4404160, upload-time = "2026-02-10T19:18:14.375Z" }, + { url = "https://files.pythonhosted.org/packages/96/93/682d2b43c1d5f1406ed048f377c0fc9fc8f7b0447a478d5c65ab3d3a66eb/cryptography-46.0.5-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ced80795227d70549a411a4ab66e8ce307899fad2220ce5ab2f296e687eacde9", size = 4667123, upload-time = "2026-02-10T19:18:15.886Z" }, + { url = "https://files.pythonhosted.org/packages/45/2d/9c5f2926cb5300a8eefc3f4f0b3f3df39db7f7ce40c8365444c49363cbda/cryptography-46.0.5-cp38-abi3-win32.whl", hash = "sha256:02f547fce831f5096c9a567fd41bc12ca8f11df260959ecc7c3202555cc47a72", size = 3010220, upload-time = "2026-02-10T19:18:17.361Z" }, + { url = "https://files.pythonhosted.org/packages/48/ef/0c2f4a8e31018a986949d34a01115dd057bf536905dca38897bacd21fac3/cryptography-46.0.5-cp38-abi3-win_amd64.whl", hash = "sha256:556e106ee01aa13484ce9b0239bca667be5004efb0aabbed28d353df86445595", size = 3467050, upload-time = "2026-02-10T19:18:18.899Z" }, +] + +[[package]] +name = "dpcharmlibs-interfaces" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ops" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8c/41/4cfa520d62c0aa53804599bf74506184eadc7c1e4803616383ec6bd3ab39/dpcharmlibs_interfaces-1.0.0.tar.gz", hash = "sha256:a177653019781a7a165be52cf4293134ee58912cd93686082cdd82da347a3bed", size = 203809, 
upload-time = "2026-03-24T17:38:47.136Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/88/9dbe3ceeb571dfd5e203fa4382376e686e0781a047bf2fb91f755962e4d5/dpcharmlibs_interfaces-1.0.0-py3-none-any.whl", hash = "sha256:afff80be30e3ff8c31f68557946e9d5959247f1973cbc00e96da55b095f95a21", size = 36982, upload-time = "2026-03-24T17:38:45.81Z" }, +] + +[[package]] +name = "importlib-metadata" +version = "8.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, +] + +[[package]] +name = "jubilant" +version = "1.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cc/0b/275edac8b57b0aac34f84073997660ebf536f97d2fa0d85a2cc3321047b6/jubilant-1.7.0.tar.gz", hash = "sha256:46b7c29a4f3336ab16d77d88418dbf8c9d0746e3f80ef42ee4c2d103eff79650", size = 32455, upload-time = "2026-01-29T02:40:10.335Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/86/d5/5b95ae9ab5abf283e33c802d286045abda7d826396ba417d5d3a20201b24/jubilant-1.7.0-py3-none-any.whl", hash = "sha256:1dcd70eb10299a95ae9fab405a3ce5f01a15513776b7f8eb4cf7b02808c93cdf", size = 33396, upload-time = "2026-01-29T02:40:09.222Z" }, +] + +[[package]] +name = "opentelemetry-api" +version = "1.40.0" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "importlib-metadata" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2c/1d/4049a9e8698361cc1a1aa03a6c59e4fa4c71e0c0f94a30f988a6876a2ae6/opentelemetry_api-1.40.0.tar.gz", hash = "sha256:159be641c0b04d11e9ecd576906462773eb97ae1b657730f0ecf64d32071569f", size = 70851, upload-time = "2026-03-04T14:17:21.555Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5f/bf/93795954016c522008da367da292adceed71cca6ee1717e1d64c83089099/opentelemetry_api-1.40.0-py3-none-any.whl", hash = "sha256:82dd69331ae74b06f6a874704be0cfaa49a1650e1537d4a813b86ecef7d0ecf9", size = 68676, upload-time = "2026-03-04T14:17:01.24Z" }, +] + +[[package]] +name = "ops" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "pyyaml" }, + { name = "websocket-client" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d3/bb/79b7efdb1243cbad11b6568c51ba4fb7358cd2c4d13bfd971a77c0aa7440/ops-3.6.0.tar.gz", hash = "sha256:a1c3361049c66759840a436143b07c74c2a46dcc44cbfd1177a9051f849c7971", size = 579236, upload-time = "2026-02-26T04:19:12.689Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8c/b6/d7daab4f841566d3cb0402d3463f7c1a00626724d6d7c02d7bf934ae6c86/ops-3.6.0-py3-none-any.whl", hash = "sha256:341c6688684446cc4b42860738898683feb271175bb9c4775ae68c81e4e0976a", size = 211856, upload-time = "2026-02-26T04:19:08.012Z" }, +] + +[package.optional-dependencies] +testing = [ + { name = "ops-scenario" }, +] + +[[package]] +name = "ops-scenario" +version = "8.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ops" }, + { name = "pyyaml" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/63/c8/15d9f91eafa46d1dfa7f580be3274c22399f941724b74e274334de9468bb/ops_scenario-8.6.0.tar.gz", hash = 
"sha256:5a40a91fd5e9b6c8249933944dfc6e807ad2ddbd36a68c800746b9bb8a0eabfb", size = 71728, upload-time = "2026-02-26T04:19:15.34Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ea/d2/fb3176805339d3aa95b9d6e43478d0e34355c6c46f27723249f46bb8d19d/ops_scenario-8.6.0-py3-none-any.whl", hash = "sha256:469490a042dc45eca24eef7aa1b9214704d97d67503ad8465414ab68dc989d30", size = 64241, upload-time = "2026-02-26T04:19:09.579Z" }, +] + +[[package]] +name = "pycparser" +version = "3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" }, +] + +[[package]] +name = "pydantic" +version = "2.12.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, +] + 
+[[package]] +name = "pydantic-core" +version = "2.41.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5f/5d/5f6c63eebb5afee93bcaae4ce9a898f3373ca23df3ccaef086d0233a35a7/pydantic_core-2.41.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7", size = 2110990, upload-time = "2025-11-04T13:39:58.079Z" }, + { url = "https://files.pythonhosted.org/packages/aa/32/9c2e8ccb57c01111e0fd091f236c7b371c1bccea0fa85247ac55b1e2b6b6/pydantic_core-2.41.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0", size = 1896003, upload-time = "2025-11-04T13:39:59.956Z" }, + { url = "https://files.pythonhosted.org/packages/68/b8/a01b53cb0e59139fbc9e4fda3e9724ede8de279097179be4ff31f1abb65a/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69", size = 1919200, upload-time = "2025-11-04T13:40:02.241Z" }, + { url = "https://files.pythonhosted.org/packages/38/de/8c36b5198a29bdaade07b5985e80a233a5ac27137846f3bc2d3b40a47360/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75", size = 2052578, upload-time = "2025-11-04T13:40:04.401Z" }, + { url = "https://files.pythonhosted.org/packages/00/b5/0e8e4b5b081eac6cb3dbb7e60a65907549a1ce035a724368c330112adfdd/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash 
= "sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05", size = 2208504, upload-time = "2025-11-04T13:40:06.072Z" }, + { url = "https://files.pythonhosted.org/packages/77/56/87a61aad59c7c5b9dc8caad5a41a5545cba3810c3e828708b3d7404f6cef/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc", size = 2335816, upload-time = "2025-11-04T13:40:07.835Z" }, + { url = "https://files.pythonhosted.org/packages/0d/76/941cc9f73529988688a665a5c0ecff1112b3d95ab48f81db5f7606f522d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c", size = 2075366, upload-time = "2025-11-04T13:40:09.804Z" }, + { url = "https://files.pythonhosted.org/packages/d3/43/ebef01f69baa07a482844faaa0a591bad1ef129253ffd0cdaa9d8a7f72d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5", size = 2171698, upload-time = "2025-11-04T13:40:12.004Z" }, + { url = "https://files.pythonhosted.org/packages/b1/87/41f3202e4193e3bacfc2c065fab7706ebe81af46a83d3e27605029c1f5a6/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c", size = 2132603, upload-time = "2025-11-04T13:40:13.868Z" }, + { url = "https://files.pythonhosted.org/packages/49/7d/4c00df99cb12070b6bccdef4a195255e6020a550d572768d92cc54dba91a/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294", size = 2329591, upload-time = "2025-11-04T13:40:15.672Z" }, + { url = "https://files.pythonhosted.org/packages/cc/6a/ebf4b1d65d458f3cda6a7335d141305dfa19bdc61140a884d165a8a1bbc7/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1", size = 2319068, upload-time = "2025-11-04T13:40:17.532Z" }, + { url = "https://files.pythonhosted.org/packages/49/3b/774f2b5cd4192d5ab75870ce4381fd89cf218af999515baf07e7206753f0/pydantic_core-2.41.5-cp312-cp312-win32.whl", hash = "sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d", size = 1985908, upload-time = "2025-11-04T13:40:19.309Z" }, + { url = "https://files.pythonhosted.org/packages/86/45/00173a033c801cacf67c190fef088789394feaf88a98a7035b0e40d53dc9/pydantic_core-2.41.5-cp312-cp312-win_amd64.whl", hash = "sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815", size = 2020145, upload-time = "2025-11-04T13:40:21.548Z" }, + { url = "https://files.pythonhosted.org/packages/f9/22/91fbc821fa6d261b376a3f73809f907cec5ca6025642c463d3488aad22fb/pydantic_core-2.41.5-cp312-cp312-win_arm64.whl", hash = "sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3", size = 1976179, upload-time = "2025-11-04T13:40:23.393Z" }, + { url = "https://files.pythonhosted.org/packages/87/06/8806241ff1f70d9939f9af039c6c35f2360cf16e93c2ca76f184e76b1564/pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9", size = 2120403, upload-time = "2025-11-04T13:40:25.248Z" }, + { url = "https://files.pythonhosted.org/packages/94/02/abfa0e0bda67faa65fef1c84971c7e45928e108fe24333c81f3bfe35d5f5/pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34", size = 1896206, upload-time = "2025-11-04T13:40:27.099Z" }, + { url = "https://files.pythonhosted.org/packages/15/df/a4c740c0943e93e6500f9eb23f4ca7ec9bf71b19e608ae5b579678c8d02f/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0", size = 1919307, 
upload-time = "2025-11-04T13:40:29.806Z" }, + { url = "https://files.pythonhosted.org/packages/9a/e3/6324802931ae1d123528988e0e86587c2072ac2e5394b4bc2bc34b61ff6e/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33", size = 2063258, upload-time = "2025-11-04T13:40:33.544Z" }, + { url = "https://files.pythonhosted.org/packages/c9/d4/2230d7151d4957dd79c3044ea26346c148c98fbf0ee6ebd41056f2d62ab5/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e", size = 2214917, upload-time = "2025-11-04T13:40:35.479Z" }, + { url = "https://files.pythonhosted.org/packages/e6/9f/eaac5df17a3672fef0081b6c1bb0b82b33ee89aa5cec0d7b05f52fd4a1fa/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2", size = 2332186, upload-time = "2025-11-04T13:40:37.436Z" }, + { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" }, + { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, upload-time = "2025-11-04T13:40:42.809Z" }, + { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = 
"sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = "2025-11-04T13:40:44.752Z" }, + { url = "https://files.pythonhosted.org/packages/3a/8d/34318ef985c45196e004bc46c6eab2eda437e744c124ef0dbe1ff2c9d06b/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" }, + { url = "https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" }, + { url = "https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" }, + { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" }, + { url = "https://files.pythonhosted.org/packages/73/7d/f2f9db34af103bea3e09735bb40b021788a5e834c81eedb541991badf8f5/pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd", size = 1981005, upload-time = "2025-11-04T13:40:54.734Z" }, + { url = "https://files.pythonhosted.org/packages/ea/28/46b7c5c9635ae96ea0fbb779e271a38129df2550f763937659ee6c5dbc65/pydantic_core-2.41.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a", size = 2119622, upload-time = 
"2025-11-04T13:40:56.68Z" }, + { url = "https://files.pythonhosted.org/packages/74/1a/145646e5687e8d9a1e8d09acb278c8535ebe9e972e1f162ed338a622f193/pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14", size = 1891725, upload-time = "2025-11-04T13:40:58.807Z" }, + { url = "https://files.pythonhosted.org/packages/23/04/e89c29e267b8060b40dca97bfc64a19b2a3cf99018167ea1677d96368273/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1", size = 1915040, upload-time = "2025-11-04T13:41:00.853Z" }, + { url = "https://files.pythonhosted.org/packages/84/a3/15a82ac7bd97992a82257f777b3583d3e84bdb06ba6858f745daa2ec8a85/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66", size = 2063691, upload-time = "2025-11-04T13:41:03.504Z" }, + { url = "https://files.pythonhosted.org/packages/74/9b/0046701313c6ef08c0c1cf0e028c67c770a4e1275ca73131563c5f2a310a/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869", size = 2213897, upload-time = "2025-11-04T13:41:05.804Z" }, + { url = "https://files.pythonhosted.org/packages/8a/cd/6bac76ecd1b27e75a95ca3a9a559c643b3afcd2dd62086d4b7a32a18b169/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" }, + { url = "https://files.pythonhosted.org/packages/4c/d2/ef2074dc020dd6e109611a8be4449b98cd25e1b9b8a303c2f0fca2f2bcf7/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375", 
size = 2064877, upload-time = "2025-11-04T13:41:09.827Z" }, + { url = "https://files.pythonhosted.org/packages/18/66/e9db17a9a763d72f03de903883c057b2592c09509ccfe468187f2a2eef29/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553", size = 2180680, upload-time = "2025-11-04T13:41:12.379Z" }, + { url = "https://files.pythonhosted.org/packages/d3/9e/3ce66cebb929f3ced22be85d4c2399b8e85b622db77dad36b73c5387f8f8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90", size = 2138960, upload-time = "2025-11-04T13:41:14.627Z" }, + { url = "https://files.pythonhosted.org/packages/a6/62/205a998f4327d2079326b01abee48e502ea739d174f0a89295c481a2272e/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07", size = 2339102, upload-time = "2025-11-04T13:41:16.868Z" }, + { url = "https://files.pythonhosted.org/packages/3c/0d/f05e79471e889d74d3d88f5bd20d0ed189ad94c2423d81ff8d0000aab4ff/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb", size = 2326039, upload-time = "2025-11-04T13:41:18.934Z" }, + { url = "https://files.pythonhosted.org/packages/ec/e1/e08a6208bb100da7e0c4b288eed624a703f4d129bde2da475721a80cab32/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23", size = 1995126, upload-time = "2025-11-04T13:41:21.418Z" }, + { url = "https://files.pythonhosted.org/packages/48/5d/56ba7b24e9557f99c9237e29f5c09913c81eeb2f3217e40e922353668092/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf", size = 2015489, upload-time = "2025-11-04T13:41:24.076Z" }, + { url = 
"https://files.pythonhosted.org/packages/4e/bb/f7a190991ec9e3e0ba22e4993d8755bbc4a32925c0b5b42775c03e8148f9/pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0", size = 1977288, upload-time = "2025-11-04T13:41:26.33Z" }, + { url = "https://files.pythonhosted.org/packages/92/ed/77542d0c51538e32e15afe7899d79efce4b81eee631d99850edc2f5e9349/pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a", size = 2120255, upload-time = "2025-11-04T13:41:28.569Z" }, + { url = "https://files.pythonhosted.org/packages/bb/3d/6913dde84d5be21e284439676168b28d8bbba5600d838b9dca99de0fad71/pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3", size = 1863760, upload-time = "2025-11-04T13:41:31.055Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f0/e5e6b99d4191da102f2b0eb9687aaa7f5bea5d9964071a84effc3e40f997/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c", size = 1878092, upload-time = "2025-11-04T13:41:33.21Z" }, + { url = "https://files.pythonhosted.org/packages/71/48/36fb760642d568925953bcc8116455513d6e34c4beaa37544118c36aba6d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612", size = 2053385, upload-time = "2025-11-04T13:41:35.508Z" }, + { url = "https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:37.732Z" }, + { url = 
"https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9", size = 2327585, upload-time = "2025-11-04T13:41:40Z" }, + { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = "2025-11-04T13:41:42.323Z" }, + { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:45.221Z" }, + { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:47.474Z" }, + { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:49.992Z" }, + { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" }, + { url = 
"https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, + { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, + { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" }, + { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495, upload-time = "2025-11-04T13:42:49.689Z" }, + { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388, upload-time = "2025-11-04T13:42:52.215Z" }, + { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879, upload-time = "2025-11-04T13:42:56.483Z" }, + { url = 
"https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" }, + { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" }, + { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" }, + { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" }, + { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" }, + { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" }, + { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" }, + { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" }, + { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = 
"2025-09-25T21:32:22.617Z" }, + { url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" }, + { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" }, + { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" }, + { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" }, + { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" }, + { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = 
"2025-09-25T21:32:30.178Z" }, + { url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" }, + { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" }, + { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" }, + { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" }, + { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, + { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, + { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, + { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, + { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, + { url = 
"https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, + { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, + { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, + { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" }, + { url = 
"https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, + { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, + { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, + { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, +] + +[[package]] +name = "tenacity" +version = "9.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/47/c6/ee486fd809e357697ee8a44d3d69222b344920433d3b6666ccd9b374630c/tenacity-9.1.4.tar.gz", hash = "sha256:adb31d4c263f2bd041081ab33b498309a57c77f9acf2db65aadf0898179cf93a", size = 49413, upload-time = "2026-02-07T10:45:33.841Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d7/c1/eb8f9debc45d3b7918a32ab756658a0904732f75e555402972246b0b8e71/tenacity-9.1.4-py3-none-any.whl", hash = "sha256:6095a360c919085f28c6527de529e76a06ad89b23659fa881ae0649b867a9d55", size = 28926, upload-time = "2026-02-07T10:45:32.24Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.15.0" +source = { 
registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, +] + +[[package]] +name = "typing-inspection" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, +] + +[[package]] +name = "websocket-client" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/41/aa4bf9664e4cda14c3b39865b12251e8e7d239f4cd0e3cc1b6c2ccde25c1/websocket_client-1.9.0.tar.gz", hash = "sha256:9e813624b6eb619999a97dc7958469217c3176312b3a16a4bd1bc7e08a46ec98", size = 70576, upload-time = "2025-10-07T21:16:36.495Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/34/db/b10e48aa8fff7407e67470363eac595018441cf32d5e1001567a7aeba5d2/websocket_client-1.9.0-py3-none-any.whl", hash = "sha256:af248a825037ef591efbf6ed20cc5faa03d3b47b9e5a2230a529eeee1c1fc3ef", size = 82616, upload-time = "2025-10-07T21:16:34.951Z" }, +] + +[[package]] +name = "zipp" +version = "3.23.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, +] From c877e3d56de5965e6a9823167eb2def95ac3f33d Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Mon, 13 Apr 2026 19:21:36 +0200 Subject: [PATCH 02/15] fix merge --- .../src/charmlibs/rollingops/_base_manager.py | 93 ---- .../src/charmlibs/rollingops/_certificates.py | 170 ------ .../charmlibs/rollingops/_etcd_rollingops.py | 40 -- .../src/charmlibs/rollingops/_etcdctl.py | 215 -------- .../src/charmlibs/rollingops/_manager.py | 201 ------- .../src/charmlibs/rollingops/_models.py | 262 --------- .../src/charmlibs/rollingops/_peer_manager.py | 449 --------------- .../src/charmlibs/rollingops/_peer_models.py | 521 ------------------ .../charmlibs/rollingops/_peer_rollingops.py | 40 -- .../src/charmlibs/rollingops/_peer_worker.py | 128 ----- .../src/charmlibs/rollingops/_relations.py | 283 ---------- .../src/charmlibs/rollingops/_worker.py | 173 ------ rollingops/tests/unit/test_certificates.py | 152 ----- rollingops/tests/unit/test_etcdctl.py | 94 ---- rollingops/tests/unit/test_models.py | 39 -- 15 
files changed, 2860 deletions(-) delete mode 100644 rollingops/src/charmlibs/rollingops/_base_manager.py delete mode 100644 rollingops/src/charmlibs/rollingops/_certificates.py delete mode 100644 rollingops/src/charmlibs/rollingops/_etcd_rollingops.py delete mode 100644 rollingops/src/charmlibs/rollingops/_etcdctl.py delete mode 100644 rollingops/src/charmlibs/rollingops/_manager.py delete mode 100644 rollingops/src/charmlibs/rollingops/_models.py delete mode 100644 rollingops/src/charmlibs/rollingops/_peer_manager.py delete mode 100644 rollingops/src/charmlibs/rollingops/_peer_models.py delete mode 100644 rollingops/src/charmlibs/rollingops/_peer_rollingops.py delete mode 100644 rollingops/src/charmlibs/rollingops/_peer_worker.py delete mode 100644 rollingops/src/charmlibs/rollingops/_relations.py delete mode 100644 rollingops/src/charmlibs/rollingops/_worker.py delete mode 100644 rollingops/tests/unit/test_certificates.py delete mode 100644 rollingops/tests/unit/test_etcdctl.py delete mode 100644 rollingops/tests/unit/test_models.py diff --git a/rollingops/src/charmlibs/rollingops/_base_manager.py b/rollingops/src/charmlibs/rollingops/_base_manager.py deleted file mode 100644 index 3ac9d50e1..000000000 --- a/rollingops/src/charmlibs/rollingops/_base_manager.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""etcd rolling ops. 
Spawns and manages the external rolling-ops worker process.""" - -import logging -from typing import Any - -from ops import CharmBase, Object -from ops.framework import EventBase - -from charmlibs.rollingops._manager import EtcdRollingOpsManager -from charmlibs.rollingops._peer_manager import PeerRollingOpsManager - -logger = logging.getLogger(__name__) - - -class RollingOpsLockGrantedEvent(EventBase): - """Custom event emitted when the background worker grants the lock.""" - - -class RollingOpsManager(Object): - def __init__( - self, - charm: CharmBase, - peer_relation_name: str, - etcd_relation_name: str, - cluster_id: str, - callback_targets: dict[str, Any], - ): - super().__init__(charm, 'rolling-ops-manager') - - self.charm = charm - self.peer_relation_name = peer_relation_name - self.etcd_relation_name = etcd_relation_name - charm.on.define_event('rollingops_lock_granted', RollingOpsLockGrantedEvent) - - self.peer_manager = PeerRollingOpsManager( - charm=charm, - relation_name=peer_relation_name, - callback_targets=callback_targets, - ) - self.etcd_manager = EtcdRollingOpsManager( - charm=charm, - peer_relation_name=peer_relation_name, - etcd_relation_name=etcd_relation_name, - cluster_id=cluster_id, - callback_targets=callback_targets, - ) - - self.framework.observe(charm.on.rollingops_lock_granted, self._on_rollingops_lock_granted) - - def _has_relation(self, relation_name: str) -> bool: - return self.model.get_relation(relation_name) is not None - - def _get_active_manager(self) -> Any: - has_etcd = self._has_relation(self.etcd_relation_name) - has_peer = self._has_relation(self.peer_relation_name) - - if has_etcd: - return self.etcd_manager - - if has_peer: - return self.peer_manager - - raise RuntimeError('No active rollingops relation found.') - - def request_async_lock( - self, callback_id: str, kwargs: dict[str, Any] | None = None, max_retry: int | None = None - ) -> None: - manager = self._get_active_manager() - return manager.request_async_lock( - 
callback_id=callback_id, kwargs=kwargs, max_retry=max_retry - ) - - def _on_rollingops_lock_granted(self, event: RollingOpsLockGrantedEvent) -> None: - """Handler of the custom hook rollingops_lock_granted. - - The custom hook is triggered by a background process. - """ - manager = self._get_active_manager() - manager._on_rollingops_lock_granted(event) diff --git a/rollingops/src/charmlibs/rollingops/_certificates.py b/rollingops/src/charmlibs/rollingops/_certificates.py deleted file mode 100644 index 34f4b9e71..000000000 --- a/rollingops/src/charmlibs/rollingops/_certificates.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Manage generation and persistence of TLS certificates for etcd client access. - -This file contains functions responsible for creating and storing a client Certificate -Authority (CA) and a client certificate/key pair used to authenticate -with etcd via TLS. Certificates are generated only once and persisted -under a local directory so they can be reused across charm executions. - -Certificates are valid for 20 years. They are not renewed or rotated. 
-""" - -from datetime import timedelta - -from charmlibs import pathops -from charmlibs.interfaces.tls_certificates import ( - Certificate, - CertificateRequestAttributes, - CertificateSigningRequest, - PrivateKey, - TLSCertificatesError, -) -from charmlibs.rollingops._models import ( - RollingOpsFileSystemError, - SharedCertificate, - with_pebble_retry, -) - -BASE_DIR = pathops.LocalPath('/var/lib/rollingops/tls') -CA_CERT_PATH = BASE_DIR / 'client-ca.pem' -CLIENT_KEY_PATH = BASE_DIR / 'client.key' -CLIENT_CERT_PATH = BASE_DIR / 'client.pem' -VALIDITY_DAYS = 365 * 50 -KEY_SIZE = 4096 - - -def persist_client_cert_key_and_ca(shared: SharedCertificate) -> None: - """Persist the provided client certificate, key, and CA to disk. - - Raises: - PebbleConnectionError: if the remote container cannot be reached - RollingOpsFileSystemError: if there is a problem when writing the certificates - """ - if _has_client_cert_key_and_ca(shared): - return - try: - with_pebble_retry(lambda: BASE_DIR.mkdir(parents=True, exist_ok=True)) - shared.write_to_paths(CLIENT_CERT_PATH, CLIENT_KEY_PATH, CA_CERT_PATH) - - except (FileNotFoundError, LookupError, NotADirectoryError, PermissionError) as e: - raise RollingOpsFileSystemError('Failed to persist client certificates and key.') from e - - -def _has_client_cert_key_and_ca(shared: SharedCertificate) -> bool: - """Return whether the provided certificate material matches local files. 
- - Raises: - PebbleConnectionError: if the remote container cannot be reached - RollingOpsFileSystemError: if there is a problem when writing the certificates - """ - if not _exists(): - return False - try: - stored = SharedCertificate.from_paths( - CLIENT_CERT_PATH, - CLIENT_KEY_PATH, - CA_CERT_PATH, - ) - return stored == shared - - except ( - FileNotFoundError, - IsADirectoryError, - PermissionError, - TLSCertificatesError, - ValueError, - ) as e: - raise RollingOpsFileSystemError('Failed to read certificates and key.') from e - - -def generate(common_name: str) -> SharedCertificate: - """Generate a client CA and client certificate if they do not exist. - - This method creates: - 1. A CA private key and self-signed CA certificate. - 2. A client private key. - 3. A certificate signing request (CSR) using the provided common name. - 4. A client certificate signed by the generated CA. - - The generated files are written to disk and reused in future runs. - If the certificates already exist, this method does nothing. - - Args: - common_name: Common Name (CN) used in the client certificate - subject. This value should not contain slashes. 
- - Raises: - PebbleConnectionError: if the remote container cannot be reached - RollingOpsFileSystemError: if there is a problem when writing the certificates - """ - if _exists(): - return SharedCertificate.from_paths( - CLIENT_CERT_PATH, - CLIENT_KEY_PATH, - CA_CERT_PATH, - ) - - ca_key = PrivateKey.generate(key_size=KEY_SIZE) - ca_attributes = CertificateRequestAttributes( - common_name=common_name, - is_ca=True, - add_unique_id_to_subject_name=False, - ) - ca_crt = Certificate.generate_self_signed_ca( - attributes=ca_attributes, - private_key=ca_key, - validity=timedelta(days=VALIDITY_DAYS), - ) - - client_key = PrivateKey.generate(key_size=KEY_SIZE) - - csr_attributes = CertificateRequestAttributes( - common_name=common_name, add_unique_id_to_subject_name=False - ) - csr = CertificateSigningRequest.generate( - attributes=csr_attributes, - private_key=client_key, - ) - - client_crt = Certificate.generate( - csr=csr, - ca=ca_crt, - ca_private_key=ca_key, - validity=timedelta(days=VALIDITY_DAYS), - is_ca=False, - ) - - shared = SharedCertificate( - certificate=client_crt, - key=client_key, - ca=ca_crt, - ) - - persist_client_cert_key_and_ca(shared) - return shared - - -def _exists() -> bool: - """Check whether the client certificates and CA certificate already exist. - - Raises: - PebbleConnectionError: if the remote container cannot be reached - """ - return ( - with_pebble_retry(lambda: CA_CERT_PATH.exists()) - and with_pebble_retry(lambda: CLIENT_KEY_PATH.exists()) - and with_pebble_retry(lambda: CLIENT_CERT_PATH.exists()) - ) diff --git a/rollingops/src/charmlibs/rollingops/_etcd_rollingops.py b/rollingops/src/charmlibs/rollingops/_etcd_rollingops.py deleted file mode 100644 index 816d7659d..000000000 --- a/rollingops/src/charmlibs/rollingops/_etcd_rollingops.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2026 Canonical Ltd. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import argparse -import subprocess -import time - - -def main(): - """Juju hook event dispatcher.""" - parser = argparse.ArgumentParser() - parser.add_argument('--run-cmd', required=True) - parser.add_argument('--unit-name', required=True) - parser.add_argument('--charm-dir', required=True) - parser.add_argument('--owner', required=True) - args = parser.parse_args() - - time.sleep(10) - - dispatch_sub_cmd = ( - f'JUJU_DISPATCH_PATH=hooks/rollingops_lock_granted {args.charm_dir}/dispatch' - ) - res = subprocess.run([args.run_cmd, '-u', args.unit_name, dispatch_sub_cmd]) - res.check_returncode() - - -if __name__ == '__main__': - main() diff --git a/rollingops/src/charmlibs/rollingops/_etcdctl.py b/rollingops/src/charmlibs/rollingops/_etcdctl.py deleted file mode 100644 index 4befb143a..000000000 --- a/rollingops/src/charmlibs/rollingops/_etcdctl.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Functions for interacting with etcd through the etcdctl CLI. - -The functions in this file manage the environment variables required for -connecting to an etcd cluster, including TLS configuration, and provide -convenience functions for executing commands and retrieving structured results. -""" - -import json -import logging -import os -import shutil -import subprocess -from dataclasses import asdict -from functools import lru_cache - -from charmlibs import pathops -from charmlibs.rollingops._models import ( - CERT_MODE, - EtcdConfig, - RollingOpsEtcdNotConfiguredError, - RollingOpsFileSystemError, - with_pebble_retry, -) - -logger = logging.getLogger(__name__) - -BASE_DIR = pathops.LocalPath('/var/lib/rollingops/etcd') -SERVER_CA_PATH = BASE_DIR / 'server-ca.pem' -CONFIG_FILE_PATH = BASE_DIR / 'etcdctl.json' -ETCDCTL_CMD = 'etcdctl' - - -@lru_cache(maxsize=1) -def is_etcdctl_installed() -> bool: - """Return whether the snap-provided etcdctl command is available.""" - return shutil.which(ETCDCTL_CMD) is not None - - -def write_trusted_server_ca(tls_ca_pem: str) -> None: - """Persist the etcd server CA certificate to disk. - - Args: - tls_ca_pem: PEM-encoded CA certificate. 
- - Raises: - PebbleConnectionError: if the remote container cannot be reached - RollingOpsFileSystemError: if there is a problem when writing the certificates - """ - try: - with_pebble_retry(lambda: BASE_DIR.mkdir(parents=True, exist_ok=True)) - with_pebble_retry(lambda: SERVER_CA_PATH.write_text(tls_ca_pem, mode=CERT_MODE)) - except (FileNotFoundError, LookupError, NotADirectoryError, PermissionError) as e: - raise RollingOpsFileSystemError('Failed to persist etcd trusted CA certificate.') from e - - -def write_config_file( - endpoints: str, - client_cert_path: pathops.LocalPath, - client_key_path: pathops.LocalPath, -) -> None: - """Create or update the etcdctl configuration JSON file. - - This function writes a JSON file containing the required ETCDCTL_* - variables used by etcdctl to connect to the etcd cluster. - - Args: - endpoints: Comma-separated list of etcd endpoints. - client_cert_path: Path to the client certificate. - client_key_path: Path to the client private key. - - Raises: - PebbleConnectionError: if the remote container cannot be reached - RollingOpsFileSystemError: if there is a problem when writing the certificates - """ - config = EtcdConfig( - endpoints=endpoints, - cacert_path=str(SERVER_CA_PATH), - cert_path=str(client_cert_path), - key_path=str(client_key_path), - ) - - try: - with_pebble_retry(lambda: BASE_DIR.mkdir(parents=True, exist_ok=True)) - with_pebble_retry( - lambda: CONFIG_FILE_PATH.write_text(json.dumps(asdict(config), indent=2), mode=0o600) - ) - except (FileNotFoundError, LookupError, NotADirectoryError, PermissionError) as e: - raise RollingOpsFileSystemError('Failed to persist etcd config file.') from e - - -def _load_config() -> EtcdConfig: - """Load etcd configuration from disk. - - Raises: - RollingOpsEtcdNotConfiguredError: If the config file does not exist. - RollingOpsFileSystemError: if we faile to read the etcd configuration file or - file cannot be deserialized. 
- PebbleConnectionError: if the remote container cannot be reached - """ - if not with_pebble_retry(lambda: CONFIG_FILE_PATH.exists()): - raise RollingOpsEtcdNotConfiguredError( - f'etcdctl config file does not exist: {CONFIG_FILE_PATH}' - ) - - try: - data = json.loads(CONFIG_FILE_PATH.read_text()) - return EtcdConfig(**data) - except FileNotFoundError as e: - raise RollingOpsEtcdNotConfiguredError('etcd configuration file not found.') from e - except (IsADirectoryError, PermissionError) as e: - raise RollingOpsFileSystemError('Failed to read the etcd config file.') from e - except (json.JSONDecodeError, TypeError) as e: - raise RollingOpsFileSystemError('Invalid etcd configuration file format.') from e - - -def load_env() -> dict[str, str]: - """Return environment variables for etcdctl. - - Returns: A dictionary containing environment variables to pass to subprocess calls. - - Raises: - RollingOpsEtcdNotConfiguredError: If the environment file does not exist. - RollingOpsFileSystemError: if we fail to read the etcd configuration file or - the file cannot be deserialized. - PebbleConnectionError: if the remote container cannot be reached - """ - config = _load_config() - - env = os.environ.copy() - env.update({ - 'ETCDCTL_API': '3', - 'ETCDCTL_ENDPOINTS': config.endpoints, - 'ETCDCTL_CACERT': config.cacert_path, - 'ETCDCTL_CERT': config.cert_path, - 'ETCDCTL_KEY': config.key_path, - }) - return env - - -def ensure_initialized(): - """Checks whether the etcd config file for etcdctl is setup. - - Raises: - RollingOpsEtcdNotConfiguredError: if the etcd config file does not exist, etcd - server CA does not exist or etcdctl is not installed. - PebbleConnectionError: if the remote container cannot be reached. 
- """ - if not with_pebble_retry(lambda: CONFIG_FILE_PATH.exists()): - raise RollingOpsEtcdNotConfiguredError( - f'etcdctl config file does not exist: {CONFIG_FILE_PATH}' - ) - if not with_pebble_retry(lambda: SERVER_CA_PATH.exists()): - raise RollingOpsEtcdNotConfiguredError( - f'etcdctl server CA file does not exist: {SERVER_CA_PATH}' - ) - if not is_etcdctl_installed(): - raise RollingOpsEtcdNotConfiguredError(f'{ETCDCTL_CMD} is not installed.') - - -def cleanup() -> None: - """Removes the etcdctl env file and the trusted etcd server CA. - - Raises: - RollingOpsFileSystemError: if there is a problem when deleting the files. - PebbleConnectionError: if the remote container cannot be reached. - """ - try: - with_pebble_retry(lambda: SERVER_CA_PATH.unlink(missing_ok=True)) - with_pebble_retry(lambda: CONFIG_FILE_PATH.unlink(missing_ok=True)) - except (IsADirectoryError, PermissionError) as e: - raise RollingOpsFileSystemError('Failed to remove etcd config file and CA.') from e - - -def run(*args: str) -> str | None: - """Execute an etcdctl command. - - Args: - args: List of arguments to pass to etcdctl. - - Returns: - The stdout of the command, stripped, or None if execution failed. - - Raises: - RollingOpsEtcdNotConfiguredError: if the etcd config file does not exist. - PebbleConnectionError: if the remote container cannot be reached. 
- """ - ensure_initialized() - cmd = [ETCDCTL_CMD, *args] - - try: - result = subprocess.run( - cmd, env=load_env(), check=True, text=True, capture_output=True - ).stdout.strip() - except subprocess.CalledProcessError as e: - logger.error('etcdctl command failed: returncode: %s, error: %s', e.returncode, e.stderr) - return None - except subprocess.TimeoutExpired as e: - logger.error('Timed out running etcdctl: %s', e.stderr) - return None - - return result diff --git a/rollingops/src/charmlibs/rollingops/_manager.py b/rollingops/src/charmlibs/rollingops/_manager.py deleted file mode 100644 index 6497db79f..000000000 --- a/rollingops/src/charmlibs/rollingops/_manager.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from typing import Any - -from ops import Relation -from ops.charm import ( - CharmBase, - RelationBrokenEvent, - RelationCreatedEvent, - RelationDepartedEvent, -) -from ops.framework import EventBase, Object - -from charmlibs.rollingops import _etcdctl as etcdctl -from charmlibs.rollingops._models import ( - RollingOpsEtcdNotConfiguredError, - RollingOpsInvalidLockRequestError, - RollingOpsKeys, - RollingOpsNoEtcdRelationError, -) -from charmlibs.rollingops._relations import EtcdRequiresV1, SharedClientCertificateManager -from charmlibs.rollingops._worker import EtcdRollingOpsAsyncWorker - -logger = logging.getLogger(__name__) - - -class EtcdRollingOpsManager(Object): - """Rolling ops manager for clusters.""" - - def __init__( - self, - charm: CharmBase, - peer_relation_name: str, - etcd_relation_name: str, - cluster_id: str, - callback_targets: dict[str, Any], - ): - """Register our custom events. - - params: - charm: the charm we are attaching this to. - peer_relation_name: peer relation used for rolling ops. - etcd_relation_name: the relation to integrate with etcd. - cluster_id: unique identifier for the cluster - callback_targets: mapping from callback_id -> callable. 
- """ - super().__init__(charm, 'etcd-rolling-ops-manager') - self._charm = charm - self.peer_relation_name = peer_relation_name - self.etcd_relation_name = etcd_relation_name - self.callback_targets = callback_targets - self.charm_dir = charm.charm_dir - - owner = f'{self.model.uuid}-{self.model.unit.name}'.replace('/', '-') - self.worker = EtcdRollingOpsAsyncWorker( - charm, peer_relation_name=peer_relation_name, owner=owner - ) - self.keys = RollingOpsKeys.for_owner(cluster_id, owner) - - self.shared_certificates = SharedClientCertificateManager( - charm, - peer_relation_name=peer_relation_name, - ) - - self.etcd = EtcdRequiresV1( - charm, - relation_name=etcd_relation_name, - cluster_id=self.keys.cluster_prefix, - shared_certificates=self.shared_certificates, - ) - - self.framework.observe( - charm.on[self.peer_relation_name].relation_departed, self._on_peer_relation_departed - ) - self.framework.observe( - charm.on[self.etcd_relation_name].relation_broken, self._on_etcd_relation_broken - ) - self.framework.observe( - charm.on[self.etcd_relation_name].relation_created, self._on_etcd_relation_created - ) - - @property - def _peer_relation(self) -> Relation | None: - """Return the peer relation for this charm.""" - return self.model.get_relation(self.peer_relation_name) - - @property - def _etcd_relation(self) -> Relation | None: - """Return the etcd relation for this charm.""" - return self.model.get_relation(self.etcd_relation_name) - - def _on_etcd_relation_created(self, event: RelationCreatedEvent) -> None: - """Check whether the snap-provided etcdctl command is available.""" - if not etcdctl.is_etcdctl_installed(): - logger.error('%s is not installed', etcdctl.ETCDCTL_CMD) - # TODO: fallback to peer relation implementation. - - def _on_rollingops_lock_granted(self, event: EventBase) -> None: - """Handle the event when a rolling operation lock is granted. - - If etcd is not yet configured, the operation is skipped. 
- """ - if not self._peer_relation or not self._etcd_relation: - # TODO: handle this case. Fallback to peer relation. - return - try: - etcdctl.ensure_initialized() - except RollingOpsEtcdNotConfiguredError: - # TODO: handle this case. Fallback to peer relation. - return - logger.info('Received a rolling-op lock granted event.') - self._on_run_with_lock() - - def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None: - """Handle a unit departing from the peer relation. - - If the current unit is the one departing, stop the etcd worker - process to ensure a clean shutdown. - """ - unit = event.departing_unit - if unit == self.model.unit: - self.worker.stop() - - def _on_etcd_relation_broken(self, event: RelationBrokenEvent) -> None: - """Handle the etcd relation being fully removed. - - This method stops the etcd worker process since the required - relation is no longer available. - """ - self.worker.stop() - - def request_async_lock( - self, - callback_id: str, - kwargs: dict[str, Any] | None = None, - max_retry: int | None = None, - ) -> None: - """This is a dummy function. - - Here we spawn a new process that will trigger a Juju hook. - This function will be completely remade in the next PR. - - Args: - callback_id: Identifier of the registered callback to execute when - the lock is granted. - kwargs: Optional keyword arguments passed to the callback when - executed. Must be JSON-serializable. - max_retry: Maximum number of retries for the operation. - - None: retry indefinitely - - 0: do not retry on failure - - Raises: - RollingOpsInvalidLockRequestError: If the callback_id is not registered or - invalid parameters were provided. - RollingOpsNoEtcdRelationError: if the etcd relation does not exist - RollingOpsEtcdNotConfiguredError: if etcd client has not been configured yet - PebbleConnectionError: if the remote container cannot be reached. - RollingOpsCharmLibMissingError: if the charm libs cannot be found. 
- """ - if callback_id not in self.callback_targets: - raise RollingOpsInvalidLockRequestError(f'Unknown callback_id: {callback_id}') - - if not self._etcd_relation: - raise RollingOpsNoEtcdRelationError - - etcdctl.ensure_initialized() - - # TODO: implement actual lock request - - self.worker.start() - - def _on_run_with_lock(self) -> None: - """This is a dummy function. - - Here we try to reach etcd from each unit. - This function will be completely remade in the next PR. - """ - # TODO: implement the actual execution under lock - etcdctl.run('put', self.keys.lock_key, self.keys.owner) - - result = etcdctl.run('get', self.keys.lock_key, '--print-value-only') - - if result is None: - logger.error('Unexpected response from etcd.') - return - - callback = self.callback_targets.get('_restart', '') - callback(delay=1) diff --git a/rollingops/src/charmlibs/rollingops/_models.py b/rollingops/src/charmlibs/rollingops/_models.py deleted file mode 100644 index 5e653bc2e..000000000 --- a/rollingops/src/charmlibs/rollingops/_models.py +++ /dev/null @@ -1,262 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""etcd rolling ops models.""" - -from collections.abc import Callable -from dataclasses import dataclass -from enum import StrEnum -from typing import ClassVar, TypeVar - -from ops import pebble -from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed - -from charmlibs.interfaces.tls_certificates import Certificate, PrivateKey -from charmlibs.pathops import LocalPath, PebbleConnectionError - -T = TypeVar('T') - - -class RollingOpsNoEtcdRelationError(Exception): - """Raised if we are trying to process a lock, but do not appear to have a relation yet.""" - - -class RollingOpsEtcdUnreachableError(Exception): - """Raised if etcd server is unreachable.""" - - -class RollingOpsEtcdNotConfiguredError(Exception): - """Raised if etcd client has not been configured yet (env file does not exist).""" - - -class RollingOpsFileSystemError(Exception): - """Raised if there is a problem when interacting with the filesystem.""" - - -class RollingOpsInvalidLockRequestError(Exception): - """Raised if the lock request is invalid.""" - - -class RollingOpsDecodingError(Exception): - """Raised if json content cannot be processed.""" - - -class RollingOpsInvalidSecretContentError(Exception): - """Raised if the content of a secret is invalid.""" - - -class RollingOpsCharmLibMissingError(Exception): - """Raised if the path to the libraries cannot be resolved.""" - - -CERT_MODE = 0o644 -KEY_MODE = 0o600 - - -@retry( - retry=retry_if_exception_type((PebbleConnectionError, pebble.APIError, pebble.ChangeError)), - stop=stop_after_attempt(3), - wait=wait_fixed(10), - reraise=True, -) -def with_pebble_retry[T](func: Callable[[], T]) -> T: - return func() - - -class OperationResult(StrEnum): - """Callback return values.""" - - RELEASE = 'release' - RETRY_RELEASE = 'retry-release' - RETRY_HOLD = 'retry-hold' - - -@dataclass(frozen=True) -class SharedCertificate: - """Represent the certificates shared within units of an app to connect to etcd.""" - - certificate: 
Certificate - key: PrivateKey - ca: Certificate - - @classmethod - def from_paths( - cls, cert_path: LocalPath, key_path: LocalPath, ca_path: LocalPath - ) -> 'SharedCertificate': - """Create a SharedCertificate from certificate files on disk. - - This method reads the certificate, private key, and CA certificate - from the provided file paths and converts them into their respective - typed objects. - - Args: - cert_path: Path to the client certificate file (PEM format). - key_path: Path to the private key file (PEM format). - ca_path: Path to the CA certificate file (PEM format). - - Returns: - A SharedCertificate instance containing the loaded certificate material. - - Raises: - TLSCertificatesError: If any certificate cannot be parsed. - ValueError: If the key cannot be parsed - PebbleConnectionError: If the remote container cannot be reached - after retries. - FileNotFoundError: If the file does not exist. - PermissionError: If the file cannot be accessed. - """ - return cls( - certificate=Certificate.from_string(cls._read_text_with_retry(cert_path)), - key=PrivateKey.from_string(cls._read_text_with_retry(key_path)), - ca=Certificate.from_string(cls._read_text_with_retry(ca_path)), - ) - - @classmethod - def from_strings(cls, certificate: str, key: str, ca: str) -> 'SharedCertificate': - """Create a SharedCertificate from PEM-encoded strings. - - Raises: - TLSCertificatesError: If any certificate cannot be parsed. - ValueError: If the key cannot be parsed - """ - return cls( - certificate=Certificate.from_string(certificate), - key=PrivateKey.from_string(key), - ca=Certificate.from_string(ca), - ) - - def write_to_paths( - self, cert_path: LocalPath, key_path: LocalPath, ca_path: LocalPath - ) -> None: - """Write the certificate material to disk. - - This method writes the client certificate, private key, and CA certificate - to the specified file paths using appropriate file permissions. - - - Certificate and CA files are written with mode 0o644. 
- - Private key is written with mode 0o600. - - Args: - cert_path: Path where the client certificate will be written. - key_path: Path where the private key will be written. - ca_path: Path where the CA certificate will be written. - - Raises: - PebbleConnectionError: If the remote container cannot be reached - after retries. - PermissionError: If the file cannot be written. - NotADirectoryError: If the parent path is invalid. - """ - self._write_text_with_retry(path=cert_path, content=self.certificate.raw, mode=CERT_MODE) - self._write_text_with_retry(path=key_path, content=self.key.raw, mode=KEY_MODE) - self._write_text_with_retry(path=ca_path, content=self.ca.raw, mode=CERT_MODE) - - @classmethod - def _read_text_with_retry(cls, path: LocalPath) -> str: - """Read the content of a file, retrying on transient Pebble errors. - - Args: - path: The file path to read. - - Returns: - The file content as a string. - - Raises: - PebbleConnectionError: If the remote container cannot be reached - after retries. - FileNotFoundError: If the file does not exist. - PermissionError: If the file cannot be accessed. - """ - return with_pebble_retry(lambda: path.read_text()) - - def _write_text_with_retry(self, path: LocalPath, content: str, mode: int) -> None: - """Write text to a file, retrying on transient Pebble errors. - - Args: - path: The file path to write to. - content: The text content to write. - mode: File permission mode to apply (e.g. 0o600). - - Raises: - PebbleConnectionError: If the remote container cannot be reached - after retries. - PermissionError: If the file cannot be written. - NotADirectoryError: If the parent path is invalid. 
- """ - with_pebble_retry(lambda: path.write_text(content, mode=mode)) - - -@dataclass(frozen=True) -class EtcdConfig: - """Represent the etcd configuration.""" - - endpoints: str - cacert_path: str - cert_path: str - key_path: str - - -@dataclass(frozen=True) -class RollingOpsKeys: - """Collection of etcd key prefixes used for rolling operations. - - Layout: - /rollingops/{lock_name}/{cluster_id}/granted-unit/ - /rollingops/{lock_name}/{cluster_id}/{owner}/pending/ - /rollingops/{lock_name}/{cluster_id}/{owner}/inprogress/ - /rollingops/{lock_name}/{cluster_id}/{owner}/completed/ - - The distributed lock key is cluster-scoped - """ - - ROOT: ClassVar[str] = '/rollingops' - - cluster_id: str - owner: str - lock_name: str = 'default' - - @property - def cluster_prefix(self) -> str: - """Etcd prefix corresponding to the cluster namespace.""" - return f'{self.ROOT}/{self.lock_name}/{self.cluster_id}/' - - @property - def _owner_prefix(self) -> str: - """Etcd prefix for all the queues belonging to an owner.""" - return f'{self.cluster_prefix}{self.owner}/' - - @property - def lock_key(self) -> str: - """Etcd key of the lock.""" - return f'{self.cluster_prefix}granted-unit/' - - @property - def pending(self) -> str: - """Prefix for operations waiting to be executed.""" - return f'{self._owner_prefix}pending/' - - @property - def inprogress(self) -> str: - """Prefix for operations currently being executed.""" - return f'{self._owner_prefix}inprogress/' - - @property - def completed(self) -> str: - """Prefix for operations that have finished execution.""" - return f'{self._owner_prefix}completed/' - - @classmethod - def for_owner(cls, cluster_id: str, owner: str) -> 'RollingOpsKeys': - """Create a set of keys for a given owner on a cluster.""" - return cls(cluster_id=cluster_id, owner=owner) diff --git a/rollingops/src/charmlibs/rollingops/_peer_manager.py b/rollingops/src/charmlibs/rollingops/_peer_manager.py deleted file mode 100644 index e75bdcb78..000000000 --- 
a/rollingops/src/charmlibs/rollingops/_peer_manager.py +++ /dev/null @@ -1,449 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Rolling Ops v1 — coordinated rolling operations for Juju charms. - -This library provides a reusable mechanism for coordinating rolling operations -across units of a Juju application using a peer-relation distributed lock. - -The library guarantees that at most one unit executes a rolling operation at any -time, while allowing multiple units to enqueue operations and participate -in a coordinated rollout. - -## Data model (peer relation) - -### Unit databag - -Each unit maintains a FIFO queue of operations it wishes to execute. - -Keys: -- `operations`: JSON-encoded list of queued `Operation` objects -- `state`: `"idle"` | `"request"` | `"retry-release"` | `"retry-hold"` -- `executed_at`: UTC timestamp string indicating when the current operation last ran - -Each `Operation` contains: -- `callback_id`: identifier of the callback to execute -- `kwargs`: JSON-serializable arguments for the callback -- `requested_at`: UTC timestamp when the operation was enqueued -- `max_retry (optional)`: maximum retry count. `None` means unlimited -- `attempt`: current attempt number - -### Application databag - -The application databag represents the global lock state. 
- -Keys: -- `granted_unit`: unit identifier (unit name), or empty -- `granted_at`: UTC timestamp indicating when the lock was granted - -## Operation semantics - -- Units enqueue operations instead of overwriting a single pending request. -- Duplicate operations (same `callback_id` and `kwargs`) are ignored if they are - already the last queued operation. -- When granted the lock, a unit executes exactly one operation (the queue head). -- After execution, the lock is released so that other units may proceed. - -## Retry semantics - -- If a callback returns `OperationResult.RETRY_RELEASE` the unit will release the -lock and retry the operation later. -- If a callback returns `OperationResult.RETRY_HOLD` the unit will keep the -lock and retry immediately. -- Retry state (`attempt`) is tracked per operation. -- When `max_retry` is exceeded, the failing operation is dropped and the unit - proceeds to the next queued operation, if any. - -## Scheduling semantics - -- Only the leader schedules lock grants. -- If a valid lock grant exists, no new unit is scheduled. -- Requests are preferred over retries. -- Among requests, the operation with the oldest `requested_at` timestamp is selected. -- Among retries, the operation with the oldest `executed_at` timestamp is selected. -- Stale grants (e.g., pointing to departed units) are automatically released. - -All timestamps are stored in UTC using ISO 8601 format. - -## Using the library in a charm - -### 1. Declare a peer relation - -```yaml -peers: - restart: - interface: rolling_op -``` - -Import this library into src/charm.py, and initialize a PeerRollingOpsManager in the Charm's -`__init__`. 
The Charm should also define a callback routine, which will be executed when -a unit holds the distributed lock: - -src/charm.py -```python -from charms.rolling_ops.v1.rollingops import PeerRollingOpsManager, OperationResult - -class SomeCharm(CharmBase): - def __init__(self, *args): - super().__init__(*args) - - self.rolling_ops = PeerRollingOpsManager( - charm=self, - relation_name="restart", - callback_targets={ - "restart": self._restart, - "failed_restart": self._failed_restart, - "defer_restart": self._defer_restart, - }, - ) - - def _restart(self, force: bool) -> OperationResult: - # perform restart logic - return OperationResult.RELEASE - - def _failed_restart(self) -> OperationResult: - # perform restart logic - return OperationResult.RETRY_RELEASE - - def _defer_restart(self) -> OperationResult: - if not self.some_condition(): - return OperationResult.RETRY_HOLD - # do restart logic - return OperationResult.RELEASE -``` - -Request a rolling operation - -```python - - def _on_restart_action(self, event) -> None: - self.rolling_ops.request_async_lock( - callback_id="restart", - kwargs={"force": True}, - max_retry=3, - ) -``` - -All participating units must enqueue the operation in order to be included -in the rolling execution. - -Units that do not enqueue the operation will be skipped, allowing operators -to recover from partial failures by reissuing requests selectively. - -Do not include sensitive information in the kwargs of the callback. -These values will be stored in the databag. - -Make sure that callback_targets is not dynamic and that the mapping -contains the expected values at the moment of the callback execution. 
-""" - -import logging -from collections.abc import Callable -from typing import Any - -from ops import Relation -from ops.charm import ( - CharmBase, - RelationChangedEvent, - RelationDepartedEvent, -) -from ops.framework import EventBase, Object - -from charmlibs.rollingops._peer_models import ( - Lock, - LockIterator, - OperationResult, - RollingOpsDecodingError, - RollingOpsInvalidLockRequestError, - RollingOpsNoRelationError, - pick_oldest_completed, - pick_oldest_request, -) -from charmlibs.rollingops._peer_worker import PeerRollingOpsAsyncWorker - -logger = logging.getLogger(__name__) - - -class PeerRollingOpsManager(Object): - """Emitters and handlers for rolling ops.""" - - def __init__( - self, charm: CharmBase, relation_name: str, callback_targets: dict[str, Callable[..., Any]] - ): - """Register our custom events. - - params: - charm: the charm we are attaching this to. - relation_name: the peer relation name from metadata.yaml. - callback_targets: mapping from callback_id -> callable. - """ - super().__init__(charm, 'peer-rolling-ops-manager') - self._charm = charm - self.relation_name = relation_name - self.callback_targets = callback_targets - self.charm_dir = charm.charm_dir - self.worker = PeerRollingOpsAsyncWorker(charm, relation_name=relation_name) - - self.framework.observe( - charm.on[self.relation_name].relation_changed, self._on_relation_changed - ) - self.framework.observe( - charm.on[self.relation_name].relation_departed, self._on_relation_departed - ) - self.framework.observe(charm.on.leader_elected, self._process_locks) - self.framework.observe(charm.on.update_status, self._on_rollingops_lock_granted) - - @property - def _relation(self) -> Relation | None: - """Returns the peer relation used to manage locks.""" - return self.model.get_relation(self.relation_name) - - def _on_rollingops_lock_granted(self, event: EventBase) -> None: - """Handler of the custom hook rollingops_lock_granted. 
- - The custom hook is triggered by a background process. - """ - if not self._relation: - return - logger.info('Received a rolling-ops lock granted event.') - lock = Lock(self.model, self.relation_name, self.model.unit) - if lock.should_run(): - self._on_run_with_lock() - self._process_locks() - - def _on_relation_departed(self, event: RelationDepartedEvent) -> None: - """Leader cleanup: if a departing unit was granted a lock, clear the grant. - - This prevents deadlocks when the granted unit leaves the relation. - """ - if not self.model.unit.is_leader(): - return - if unit := event.departing_unit: - lock = Lock(self.model, self.relation_name, unit) - if lock.is_granted(): - lock.release() - self._process_locks() - - def _on_relation_changed(self, _: RelationChangedEvent) -> None: - """Process relation changed.""" - if self.model.unit.is_leader(): - self._process_locks() - return - - lock = Lock(self.model, self.relation_name, self.model.unit) - if lock.should_run(): - self._on_run_with_lock() - - def _valid_peer_unit_names(self) -> set[str]: - """Return all unit names currently participating in the peer relation.""" - if not self._relation: - return set() - names = {u.name for u in self._relation.units} - names.add(self.model.unit.name) - return names - - def _release_stale_grant(self) -> None: - """Ensure granted_unit refers to a unit currently on the peer relation.""" - if not self._relation: - return - - if not (granted_unit := self._relation.data[self.model.app].get('granted_unit', '')): - return - - valid_units = self._valid_peer_unit_names() - if granted_unit not in valid_units: - logger.warning( - 'granted_unit=%s is not in current peer units; releasing stale grant.', - granted_unit, - ) - self._relation.data[self.model.app].update({'granted_unit': '', 'granted_at': ''}) - - def _process_locks(self, _: EventBase | None = None) -> None: - """Process locks. - - This method is only executed by the leader unit. 
- It effectively releases the lock and triggers scheduling. - """ - if not self.model.unit.is_leader(): - return - - for lock in LockIterator(self.model, self.relation_name): - if lock.should_release(): - lock.release() - break - - self._release_stale_grant() - granted_unit = self._relation.data[self.model.app].get('granted_unit', '') # type: ignore[reportOptionalMemberAccess] - - if granted_unit: - logger.info('Current granted_unit=%s. No new unit will be scheduled.', granted_unit) - return - - self._schedule() - - def _schedule(self) -> None: - """Select and grant the next lock based on priority and queue state. - - This method iterates over all locks associated with the relation and - determines which unit should receive the lock next. - - Priority order: - 1. Units in RETRY_HOLD state are immediately granted the lock. - 2. Units in REQUEST state are considered next (oldest request first). - 3. Units in RETRY_RELEASE state are considered last (oldest completed first). - - If no eligible lock is found, no action is taken. - - Once a lock is selected, it is granted via `_grant_lock`. - """ - logger.info('Starting scheduling.') - - pending_requests: list[Lock] = [] - pending_retries: list[Lock] = [] - - for lock in LockIterator(self.model, self.relation_name): - if lock.is_retry_hold(): - self._grant_lock(lock) - return - if lock.is_waiting(): - pending_requests.append(lock) - elif lock.is_waiting_retry(): - pending_retries.append(lock) - - selected = None - if pending_requests: - selected = pick_oldest_request(pending_requests) - elif pending_retries: - selected = pick_oldest_completed(pending_retries) - - if not selected: - logger.info('No pending lock requests. Lock was not granted to any unit.') - return - - self._grant_lock(selected) - - def _grant_lock(self, selected: Lock) -> None: - """Grant the lock to the selected unit. - - If the lock is granted to the leader unit: - - If it is a retry, starts the worker to break the loop before next execution. 
- - Otherwise, the callback is run immediately - - Args: - selected: The lock instance to grant. - """ - selected.grant() - logger.info('Lock granted to unit=%s.', selected.unit.name) - if selected.unit == self.model.unit: - if selected.is_retry(): - self.worker.start() - return - self._on_run_with_lock() - self._process_locks() - - def request_async_lock( - self, - callback_id: str, - kwargs: dict[str, Any] | None = None, - max_retry: int | None = None, - ) -> None: - """Enqueue a rolling operation and request the distributed lock. - - This method appends an operation (identified by callback_id and kwargs) to the - calling unit's FIFO queue stored in the peer relation databag and marks the unit as - requesting the lock. It does not execute the operation directly. - - Args: - callback_id: Identifier for the callback to execute when this unit is granted - the lock. Must be a non-empty string and must exist in the manager's - callback registry. - kwargs: Keyword arguments to pass to the callback when executed. If omitted, - an empty dict is used. Must be JSON-serializable because it is stored - in Juju relation databags. - max_retry: Retry limit for this operation. None means unlimited retries. - 0 means no retries (drop immediately on first failure). Must be >= 0 - when provided. - - Raises: - RollingOpsInvalidLockRequestError: If any input is invalid (e.g. unknown callback_id, - non-dict kwargs, non-serializable kwargs, negative max_retry). - RollingOpsNoRelationError: If the peer relation does not exist. 
- """ - if callback_id not in self.callback_targets: - raise RollingOpsInvalidLockRequestError(f'Unknown callback_id: {callback_id}') - - try: - if kwargs is None: - kwargs = {} - lock = Lock(self.model, self.relation_name, self.model.unit) - lock.request(callback_id, kwargs, max_retry) - - except (RollingOpsDecodingError, ValueError) as e: - logger.error('Failed operation: %s', e) - raise RollingOpsInvalidLockRequestError('Failed to create the lock request') from e - except RollingOpsNoRelationError as e: - logger.debug('No %s peer relation yet.', self.relation_name) - raise e - - if self.model.unit.is_leader(): - self._process_locks() - - def _on_run_with_lock(self) -> None: - """Execute the current head operation if this unit holds the distributed lock. - - - If this unit does not currently hold the lock grant, no operation is run. - - If this unit holds the grant but has no queued operation, lock is released. - - Otherwise, the operation's callback is looked up by `callback_id` and - invoked with the operation kwargs. - """ - lock = Lock(self.model, self.relation_name, self.model.unit) - - if not lock.is_granted(): - logger.debug('Lock is not granted. Operation will not run.') - return - - if not (operation := lock.get_current_operation()): - logger.debug('There is no operation to run.') - lock.complete() - return - - if not (callback := self.callback_targets.get(operation.callback_id)): - logger.warning( - 'Operation %s target was not found. It cannot be executed.', - operation.callback_id, - ) - return - logger.info( - 'Executing callback_id=%s, attempt=%s', operation.callback_id, operation.attempt - ) - try: - result = callback(**operation.kwargs) - except Exception as e: - logger.exception('Operation failed: %s: %s', operation.callback_id, e) - result = OperationResult.RETRY_RELEASE - - match result: - case OperationResult.RETRY_HOLD: - logger.info( - 'Finished %s. 
Operation will be retried immediately.', operation.callback_id - ) - lock.retry_hold() - - case OperationResult.RETRY_RELEASE: - logger.info('Finished %s. Operation will be retried later.', operation.callback_id) - lock.retry_release() - - case _: - logger.info('Finished %s. Lock will be released.', operation.callback_id) - lock.complete() diff --git a/rollingops/src/charmlibs/rollingops/_peer_models.py b/rollingops/src/charmlibs/rollingops/_peer_models.py deleted file mode 100644 index b81ff4b3a..000000000 --- a/rollingops/src/charmlibs/rollingops/_peer_models.py +++ /dev/null @@ -1,521 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""peer rolling ops models.""" - -import json -import logging -from collections.abc import Iterator -from dataclasses import dataclass, field -from datetime import UTC, datetime -from enum import StrEnum -from typing import Any - -from ops import Model, RelationDataContent, Unit - -logger = logging.getLogger(__name__) - - -def _now_timestamp_str() -> str: - """UTC timestamp as a string using ISO 8601 format.""" - return datetime.now(UTC).isoformat() - - -def _now_timestamp() -> datetime: - """UTC timestamp.""" - return datetime.now(UTC) - - -def _parse_timestamp(timestamp: str) -> datetime | None: - """Parse timestamp string. 
Return None on errors to avoid selecting invalid timestamps.""" - try: - return datetime.fromisoformat(timestamp) - except Exception: - return None - - -class RollingOpsNoRelationError(Exception): - """Raised if we are trying to process a lock, but do not appear to have a relation yet.""" - - -class RollingOpsDecodingError(Exception): - """Raised if the content of the databag cannot be processed.""" - - -class RollingOpsInvalidLockRequestError(Exception): - """Raised if the lock request is invalid.""" - - -@dataclass -class Operation: - """A single queued operation.""" - - callback_id: str - requested_at: datetime - max_retry: int | None - attempt: int - kwargs: dict[str, Any] = field(default_factory=dict[str, Any]) - - @classmethod - def _validate_fields( - cls, callback_id: Any, kwargs: Any, requested_at: Any, max_retry: Any, attempt: Any - ) -> None: - """Validate the class attributes.""" - if not isinstance(callback_id, str) or not callback_id.strip(): - raise ValueError('callback_id must be a non-empty string') - - if not isinstance(kwargs, dict): - raise ValueError('kwargs must be a dict') - try: - json.dumps(kwargs) - except TypeError as e: - raise ValueError(f'kwargs must be JSON-serializable: {e}') from e - - if not isinstance(requested_at, datetime): - raise ValueError('requested_at must be a datetime') - - if max_retry is not None: - if not isinstance(max_retry, int): - raise ValueError('max_retry must be an int') - if max_retry < 0: - raise ValueError('max_retry must be >= 0') - - if not isinstance(attempt, int): - raise ValueError('attempt must be an int') - if attempt < 0: - raise ValueError('attempt must be >= 0') - - def __post_init__(self) -> None: - """Validate the class attributes.""" - self._validate_fields( - self.callback_id, - self.kwargs, - self.requested_at, - self.max_retry, - self.attempt, - ) - - @classmethod - def create( - cls, - callback_id: str, - kwargs: dict[str, Any], - max_retry: int | None = None, - ) -> 'Operation': - """Create 
a new operation from a callback id and kwargs.""" - return cls( - callback_id=callback_id, - kwargs=kwargs, - requested_at=_now_timestamp(), - max_retry=max_retry, - attempt=0, - ) - - def _to_dict(self) -> dict[str, str]: - """Dict form (string-only values).""" - return { - 'callback_id': self.callback_id, - 'kwargs': self._kwargs_to_json(), - 'requested_at': self.requested_at.isoformat(), - 'max_retry': '' if self.max_retry is None else str(self.max_retry), - 'attempt': str(self.attempt), - } - - def to_string(self) -> str: - """Serialize to a string suitable for a Juju databag.""" - return json.dumps(self._to_dict(), separators=(',', ':')) - - def increase_attempt(self) -> None: - """Increment the attempt counter.""" - self.attempt += 1 - - def is_max_retry_reached(self) -> bool: - """Return True if attempt exceeds max_retry (unless max_retry is None).""" - if self.max_retry is None: - return False - return self.attempt > self.max_retry - - @classmethod - def from_string(cls, data: str) -> 'Operation': - """Deserialize from a Juju databag string. - - Raises: - RollingOpsDecodingError: if data cannot be deserialized. 
- """ - try: - obj = json.loads(data) - - return cls( - callback_id=obj['callback_id'], - requested_at=_parse_timestamp(obj['requested_at']), # type: ignore[reportArgumentType] - max_retry=int(obj['max_retry']) if obj.get('max_retry') else None, - attempt=int(obj['attempt']), - kwargs=json.loads(obj['kwargs']) if obj.get('kwargs') else {}, - ) - - except (json.JSONDecodeError, KeyError, TypeError, ValueError) as e: - logger.error('Failed to deserialize Operation from %s: %s', data, e) - raise RollingOpsDecodingError( - 'Failed to deserialize data to create an Operation' - ) from e - - def _kwargs_to_json(self) -> str: - """Deterministic JSON serialization for kwargs.""" - return json.dumps(self.kwargs, sort_keys=True, separators=(',', ':')) - - def __eq__(self, other: object) -> bool: - """Equal for the operation.""" - if not isinstance(other, Operation): - return False - return self.callback_id == other.callback_id and self.kwargs == other.kwargs - - def __hash__(self) -> int: - """Hash for the operation.""" - return hash((self.callback_id, self._kwargs_to_json())) - - -class OperationQueue: - """In-memory FIFO queue of Operations with encode/decode helpers for storing in a databag.""" - - def __init__(self, operations: list[Operation] | None = None): - self.operations: list[Operation] = list(operations or []) - - def __len__(self) -> int: - """Return the number of operations in the queue.""" - return len(self.operations) - - @property - def empty(self) -> bool: - """Return True if there are no queued operations.""" - return not self.operations - - def peek(self) -> Operation | None: - """Return the first operation in the queue if it exists.""" - return self.operations[0] if self.operations else None - - def _peek_last(self) -> Operation | None: - """Return the last operation in the queue if it exists.""" - return self.operations[-1] if self.operations else None - - def dequeue(self) -> Operation | None: - """Drop the first operation in the queue if it exists and 
return it.""" - return self.operations.pop(0) if self.operations else None - - def increase_attempt(self) -> None: - """Increment the attempt counter for the head operation and persist it.""" - if self.empty: - return - self.operations[0].increase_attempt() - - def enqueue_lock_request( - self, callback_id: str, kwargs: dict[str, Any], max_retry: int | None = None - ) -> None: - """Append operation only if it is not equal to the last enqueued operation.""" - operation = Operation.create(callback_id, kwargs, max_retry=max_retry) - - last_operation = self._peek_last() - if last_operation is not None and last_operation == operation: - return - self.operations.append(operation) - - def to_string(self) -> str: - """Encode entire queue to a single string.""" - items = [op.to_string() for op in self.operations] - return json.dumps(items, separators=(',', ':')) - - @classmethod - def from_string(cls, data: str) -> 'OperationQueue': - """Decode queue from a string. - - Raises: - RollingOpsDecodingError: if data cannot be deserialized. - """ - if not data: - return cls() - - try: - items = json.loads(data) - except json.JSONDecodeError as e: - logger.error( - 'Failed to deserialize data to create an OperationQueue from %s: %s', data, e - ) - raise RollingOpsDecodingError( - 'Failed to deserialize data to create an OperationQueue.' - ) from e - if not isinstance(items, list) or not all(isinstance(s, str) for s in items): # type: ignore[reportUnknownVariableType] - raise RollingOpsDecodingError( - 'OperationQueue string must decode to a JSON list of strings.' 
- ) - - operations = [Operation.from_string(s) for s in items] # type: ignore[reportUnknownVariableType] - return cls(operations) - - -class LockIntent(StrEnum): - """Unit-level lock intents stored in unit databags.""" - - REQUEST = 'request' - RETRY_RELEASE = 'retry-release' - RETRY_HOLD = 'retry-hold' - IDLE = 'idle' - - -class OperationResult(StrEnum): - """Callback return values.""" - - RELEASE = 'release' - RETRY_RELEASE = 'retry-release' - RETRY_HOLD = 'retry-hold' - - -class Lock: - """State machine view over peer relation databags for a single unit. - - This class is the only component that should directly read/write the peer relation - databags for lock state, queue state, and grant state. - - Important: - - All relation databag values are strings. - - This class updates both unit databags and app databags, which triggers - relation-changed events. - """ - - def __init__(self, model: Model, relation_name: str, unit: Unit): - if not model.get_relation(relation_name): - # TODO: defer caller in this case (probably just fired too soon). - raise RollingOpsNoRelationError() - self.relation = model.get_relation(relation_name) - self.unit = unit - self.app = model.app - - @property - def _app_data(self) -> RelationDataContent: - return self.relation.data[self.app] # type: ignore[reportOptionalMemberAccess] - - @property - def _unit_data(self) -> RelationDataContent: - return self.relation.data[self.unit] # type: ignore[reportOptionalMemberAccess] - - @property - def _operations(self) -> OperationQueue: - return OperationQueue.from_string(self._unit_data.get('operations', '')) - - @property - def _state(self) -> str: - return self._unit_data.get('state', '') - - def request( - self, callback_id: str, kwargs: dict[str, Any], max_retry: int | None = None - ) -> None: - """Enqueue an operation and mark this unit as requesting the lock. - - Args: - callback_id: identifies which callback to execute. - kwargs: dict of callback kwargs. 
- max_retry: None -> unlimited retries, else explicit integer. - """ - queue = self._operations - - previous_length = len(queue) - queue.enqueue_lock_request(callback_id, kwargs, max_retry) - if previous_length == len(queue): - logger.info( - 'Operation %s not added to the queue. It already exists in the back of the queue.', - callback_id, - ) - return - - if len(queue) == 1: - self._unit_data.update({'state': LockIntent.REQUEST}) - - self._unit_data.update({'operations': queue.to_string()}) - logger.info('Operation %s added to the queue.', callback_id) - - def _set_retry(self, intent: LockIntent) -> None: - """Mark the given retry intent on the head operation. - - If max_retry is reached, the head operation is dropped via complete(). - """ - self._increase_attempt() - if self._is_max_retry_reached(): - logger.warning('Operation max retry reached. Dropping.') - self.complete() - return - self._unit_data.update({ - 'executed_at': _now_timestamp_str(), - 'state': intent, - }) - - def retry_release(self) -> None: - """Indicate that the operation should be retried but the lock should be released.""" - self._set_retry(LockIntent.RETRY_RELEASE) - - def retry_hold(self) -> None: - """Indicate that the operation should be retried but the lock should be kept.""" - self._set_retry(LockIntent.RETRY_HOLD) - - def complete(self) -> None: - """Mark the head operation as completed successfully, pop it from the queue. - - Update unit state depending on whether more operations remain. 
- """ - queue = self._operations - queue.dequeue() - next_state = LockIntent.REQUEST if queue.peek() else LockIntent.IDLE - - self._unit_data.update({ - 'state': next_state, - 'operations': queue.to_string(), - 'executed_at': _now_timestamp_str(), - }) - - def release(self) -> None: - """Clear the application-level grant.""" - self._app_data.update({'granted_unit': '', 'granted_at': ''}) - - def grant(self) -> None: - """Grant a lock to a unit.""" - self._app_data.update({ - 'granted_unit': str(self.unit.name), - 'granted_at': _now_timestamp_str(), - }) - - def is_granted(self) -> bool: - """Return True if the unit holds the lock.""" - granted_unit = self._app_data.get('granted_unit', '') - return granted_unit == str(self.unit.name) - - def should_run(self) -> bool: - """Return True if the lock has been granted to the unit and it is time to run.""" - return self.is_granted() and not self._unit_executed_after_grant() - - def should_release(self) -> bool: - """Return True if the unit finished executing the callback and should be released.""" - return self.is_completed() or self._unit_executed_after_grant() - - def is_waiting(self) -> bool: - """Return True if this unit is waiting for a lock to be granted.""" - return self._state == LockIntent.REQUEST and not self.is_granted() - - def is_completed(self) -> bool: - """Return True if this unit is completed callback but still has the grant. - - Transitional state in which the unit is waiting for the leader to release the lock. - """ - return self._state == LockIntent.IDLE and self.is_granted() - - def is_retry(self) -> bool: - """Return True if this unit requested retry but still has the grant. - - Transitional state in which the unit is waiting for the leader to release the lock. 
- """ - unit_intent = self._state - return ( - unit_intent == LockIntent.RETRY_RELEASE or unit_intent == LockIntent.RETRY_HOLD - ) and self.is_granted() - - def is_waiting_retry(self) -> bool: - """Return True if the unit requested retry and is waiting for lock to be granted.""" - return self._state == LockIntent.RETRY_RELEASE and not self.is_granted() - - def is_retry_hold(self) -> bool: - """Return True if the unit requested retry and wants to keep the lock.""" - return self._state == LockIntent.RETRY_HOLD and not self.is_granted() - - def get_current_operation(self) -> Operation | None: - """Return the head operation for this unit, if any.""" - return self._operations.peek() - - def _is_max_retry_reached(self) -> bool: - """Return True if the head operation exceeded its max_retry (unless max_retry is None).""" - if not (operation := self.get_current_operation()): - return True - return operation.is_max_retry_reached() - - def _increase_attempt(self) -> None: - """Increment the attempt counter for the head operation and persist it.""" - q = self._operations - q.increase_attempt() - self._unit_data.update({'operations': q.to_string()}) - - def get_last_completed(self) -> datetime | None: - """Get the time the unit requested a retry of the head operation.""" - if timestamp_str := self._unit_data.get('executed_at', ''): - return _parse_timestamp(timestamp_str) - return None - - def get_requested_at(self) -> datetime | None: - """Get the time the head operation was requested at.""" - if not (operation := self.get_current_operation()): - return None - return operation.requested_at - - def _unit_executed_after_grant(self) -> bool: - """Returns True if the unit executed its callback after the lock was granted.""" - granted_at = _parse_timestamp(self._app_data.get('granted_at', '')) - executed_at = _parse_timestamp(self._unit_data.get('executed_at', '')) - - if granted_at is None or executed_at is None: - return False - return executed_at > granted_at - - -def 
pick_oldest_completed(locks: list[Lock]) -> Lock | None: - """Choose the retry lock with the oldest executed_at timestamp.""" - selected = None - oldest_timestamp = None - - for lock in locks: - timestamp = lock.get_last_completed() - if not timestamp: - continue - - if oldest_timestamp is None or timestamp < oldest_timestamp: - oldest_timestamp = timestamp - selected = lock - - return selected - - -def pick_oldest_request(locks: list[Lock]) -> Lock | None: - """Choose the lock with the oldest head operation.""" - selected = None - oldest_request = None - - for lock in locks: - timestamp = lock.get_requested_at() - if not timestamp: - continue - - if oldest_request is None or timestamp < oldest_request: - oldest_request = timestamp - selected = lock - - return selected - - -class LockIterator: - """Iterator over Lock objects for each unit present on the peer relation.""" - - def __init__(self, model: Model, relation_name: str): - relation = model.relations[relation_name][0] - units = relation.units - units.add(model.unit) - self._model = model - self._units = units - self._relation_name = relation_name - - def __iter__(self) -> Iterator[Lock]: - """Yields a lock for each unit we can find on the relation.""" - for unit in self._units: - yield Lock(self._model, self._relation_name, unit=unit) diff --git a/rollingops/src/charmlibs/rollingops/_peer_rollingops.py b/rollingops/src/charmlibs/rollingops/_peer_rollingops.py deleted file mode 100644 index 0933ce867..000000000 --- a/rollingops/src/charmlibs/rollingops/_peer_rollingops.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Background process.""" - -import argparse -import subprocess -import time - - -def main(): - """Juju hook event dispatcher.""" - parser = argparse.ArgumentParser() - parser.add_argument('--run-cmd', required=True) - parser.add_argument('--unit-name', required=True) - parser.add_argument('--charm-dir', required=True) - args = parser.parse_args() - - # Sleep so that the leader unit can properly leave the hook and start a new one - time.sleep(10) - dispatch_sub_cmd = ( - f'JUJU_DISPATCH_PATH=hooks/rollingops_lock_granted {args.charm_dir}/dispatch' - ) - res = subprocess.run([args.run_cmd, '-u', args.unit_name, dispatch_sub_cmd]) - res.check_returncode() - - -if __name__ == '__main__': - main() diff --git a/rollingops/src/charmlibs/rollingops/_peer_worker.py b/rollingops/src/charmlibs/rollingops/_peer_worker.py deleted file mode 100644 index 4a7a54860..000000000 --- a/rollingops/src/charmlibs/rollingops/_peer_worker.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""etcd rolling ops. 
Spawns and manages the external rolling-ops worker process.""" - -import logging -import os -import signal -import subprocess -from pathlib import Path -from sys import version_info - -from ops import Relation, RelationDataContent -from ops.charm import ( - CharmBase, -) -from ops.framework import Object - -logger = logging.getLogger(__name__) - - -class PeerRollingOpsAsyncWorker(Object): - """Spawns and manages the external rolling-ops worker process.""" - - def __init__(self, charm: CharmBase, relation_name: str): - super().__init__(charm, 'peer-rollingops-async-worker') - self._charm = charm - self._peers_name = relation_name - self._run_cmd = '/usr/bin/juju-exec' - self._charm_dir = charm.charm_dir - - @property - def _relation(self) -> Relation | None: - """Returns the peer relation.""" - return self._charm.model.get_relation(self._peers_name) - - @property - def _app_data(self) -> RelationDataContent: - """Returns the application databag in the peer relation.""" - return self._relation.data[self.model.app] # type: ignore[reportOptionalMemberAccess] - - def start(self) -> None: - """Start a new worker process.""" - if self._relation is None: - return - self.stop() - - # Remove JUJU_CONTEXT_ID so juju-run works from the spawned process - new_env = os.environ.copy() - new_env.pop('JUJU_CONTEXT_ID', None) - - for loc in new_env.get('PYTHONPATH', '').split(':'): - path = Path(loc) - venv_path = ( - path - / '..' - / 'venv' - / 'lib' - / f'python{version_info.major}.{version_info.minor}' - / 'site-packages' - ) - if path.stem == 'lib': - new_env['PYTHONPATH'] = f'{venv_path.resolve()}:{new_env["PYTHONPATH"]}' - break - - worker = ( - self._charm_dir - / 'venv' - / 'lib' - / f'python{version_info.major}.{version_info.minor}' - / 'site-packages' - / 'charmlibs' - / 'rollingops' - / '_peer_rollingops.py' - ) - - # These files must stay open for the lifetime of the worker process. 
- log_out = open('/var/log/peer_rollingops_worker.log', 'a') # noqa: SIM115 - log_err = open('/var/log/peer_rollingops_worker.err', 'a') # noqa: SIM115 - - pid = subprocess.Popen( - [ - '/usr/bin/python3', - '-u', - str(worker), - '--run-cmd', - self._run_cmd, - '--unit-name', - self._charm.model.unit.name, - '--charm-dir', - str(self._charm_dir), - ], - cwd=str(self._charm_dir), - stdout=log_out, - stderr=log_err, - env=new_env, - ).pid - - self._app_data.update({'rollingops-worker-pid': str(pid)}) - logger.info('Started RollingOps worker process with PID %s', pid) - - def stop(self) -> None: - """Stop the running worker process if it exists.""" - if self._relation is None: - return - - if not (pid_str := self._app_data.get('rollingops-worker-pid', '')): - return - - pid = int(pid_str) - try: - os.kill(pid, signal.SIGINT) - logger.info('Stopped RollingOps worker process PID %s', pid) - except OSError: - logger.info('Failed to stop RollingOps worker process PID %s', pid) - - self._app_data.update({'rollingops-worker-pid': ''}) diff --git a/rollingops/src/charmlibs/rollingops/_relations.py b/rollingops/src/charmlibs/rollingops/_relations.py deleted file mode 100644 index 7189f1ef1..000000000 --- a/rollingops/src/charmlibs/rollingops/_relations.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging - -from dpcharmlibs.interfaces import ( - RequirerCommonModel, - ResourceCreatedEvent, - ResourceEndpointsChangedEvent, - ResourceProviderModel, - ResourceRequirerEventHandler, -) -from ops import Relation -from ops.charm import ( - CharmBase, - LeaderElectedEvent, - RelationBrokenEvent, - RelationChangedEvent, - SecretChangedEvent, -) -from ops.framework import Object - -from charmlibs.interfaces.tls_certificates import Certificate, TLSCertificatesError -from charmlibs.rollingops import _certificates as certificates -from charmlibs.rollingops import _etcdctl as etcdctl -from charmlibs.rollingops._models import RollingOpsInvalidSecretContentError, SharedCertificate - -logger = logging.getLogger(__name__) -CERT_SECRET_FIELD = 'rollingops-client-secret-id' # noqa: S105 -CERT_SECRET_LABEL = 'rollingops-client-cert' # noqa: S105 -CLIENT_CERT_FIELD = 'client-cert' -CLIENT_KEY_FIELD = 'client-key' -CLIENT_CA_FIELD = 'client-ca' - - -class SharedClientCertificateManager(Object): - """Manage the shared rollingops client certificate via peer relation secret.""" - - def __init__(self, charm: CharmBase, peer_relation_name: str) -> None: - super().__init__(charm, 'shared-client-certificate') - self.charm = charm - self.peer_relation_name = peer_relation_name - - self.framework.observe(charm.on.leader_elected, self._on_leader_elected) - self.framework.observe( - charm.on[peer_relation_name].relation_changed, - self._on_peer_relation_changed, - ) - self.framework.observe(charm.on.secret_changed, self._on_secret_changed) - - @property - def _peer_relation(self) -> Relation | None: - """Return the peer relation for this charm.""" - return self.model.get_relation(self.peer_relation_name) - - def _on_leader_elected(self, event: LeaderElectedEvent) -> None: - """Handle the leader elected event. - - When this unit becomes the leader, it is responsible for generating - and sharing the client certificate material with other units. 
- """ - self.create_and_share_certificate() - - def _on_secret_changed(self, event: SecretChangedEvent) -> None: - """Handle updates to secrets. - - This method is triggered when a secret changes. It ensures that - the latest certificate material is synchronized to local files. - """ - if event.secret.label == CERT_SECRET_LABEL: - self.sync_to_local_files() - - def _on_peer_relation_changed(self, event: RelationChangedEvent) -> None: - """React to peer relation changes. - - The leader ensures the shared certificate exists. - All units try to persist the shared certificate locally if available. - """ - self.create_and_share_certificate() - self.sync_to_local_files() - - def create_and_share_certificate(self) -> None: - """Ensure the application client certificate exists. - - Only the leader generates the certificate and writes it to the peer - relation application databag. - - If the secret ID corresponding to the shared certificate already - exists in the peer relation, it is not created again. - """ - relation = self._peer_relation - if relation is None or not self.model.unit.is_leader(): - return - - app_data = relation.data[self.model.app] - - if app_data.get(CERT_SECRET_FIELD): - logger.info( - 'Shared certificate already exists in the databag. No new certificate is created.' - ) - return - - common_name = f'rollingops-{self.model.uuid}-{self.model.app.name}' - shared = certificates.generate(common_name) - - secret = self.model.app.add_secret( - content={ - CLIENT_CERT_FIELD: shared.certificate.raw, - CLIENT_KEY_FIELD: shared.key.raw, - CLIENT_CA_FIELD: shared.ca.raw, - }, - label=CERT_SECRET_LABEL, - ) - - app_data.update({CERT_SECRET_FIELD: secret.id}) # type: ignore[arg-type] - logger.info('Shared certificate added to the databag.') - - def get_shared_certificate_from_peer_relation(self) -> SharedCertificate | None: - """Return the client certificate, key and ca from peer app data. - - Returns: - SharedCertificate or None if not yet available. 
- - Raises: - RollingOpsInvalidSecretContent: if the content of the secret holding - the certificates does not contain all the fields or they are empty. - """ - if not (relation := self._peer_relation): - logger.debug('Peer relation is not available yet.') - return None - - if not (secret_id := relation.data[self.model.app].get(CERT_SECRET_FIELD)): - logger.info('Shared certificate secret ID does not exist in the databag yet.') - return None - - secret = self.model.get_secret(id=secret_id) - content = secret.get_content(refresh=True) - - certificate = content.get(CLIENT_CERT_FIELD, '') - key = content.get(CLIENT_KEY_FIELD, '') - ca = content.get(CLIENT_CA_FIELD, '') - - if not certificate or not key or not ca: - raise RollingOpsInvalidSecretContentError( - 'Invalid secret content: expected non-empty values for ' - f"'{CLIENT_CERT_FIELD}', '{CLIENT_KEY_FIELD}', and '{CLIENT_CA_FIELD}'. " - 'Missing or empty values are not allowed.' - ) - - try: - return SharedCertificate.from_strings( - certificate=certificate, - key=key, - ca=ca, - ) - except (TLSCertificatesError, ValueError) as e: - raise RollingOpsInvalidSecretContentError( - 'Invalid secret content: certificate material could not be parsed.' 
- ) from e - - def sync_to_local_files(self) -> None: - """Persist shared certificate locally if available.""" - shared = self.get_shared_certificate_from_peer_relation() - if shared is None: - logger.info('Shared rollingops etcd client certificate is not available yet.') - return - - certificates.persist_client_cert_key_and_ca(shared) - - def get_local_request_cert(self) -> Certificate | None: - """Return the cert to place in relation requests.""" - shared = self.get_shared_certificate_from_peer_relation() - return None if shared is None else shared.certificate - - -class EtcdRequiresV1(Object): - """EtcdRequires implementation for data interfaces version 1.""" - - def __init__( - self, - charm: CharmBase, - relation_name: str, - cluster_id: str, - shared_certificates: SharedClientCertificateManager, - ) -> None: - super().__init__(charm, f'requirer-{relation_name}') - self.charm = charm - self.cluster_id = cluster_id - self.shared_certificates = shared_certificates - - self.etcd_interface = ResourceRequirerEventHandler( - self.charm, - relation_name=relation_name, - requests=self.client_requests(), - response_model=ResourceProviderModel, - ) - - self.framework.observe( - self.etcd_interface.on.endpoints_changed, self._on_endpoints_changed - ) - self.framework.observe(charm.on[relation_name].relation_broken, self._on_relation_broken) - self.framework.observe(self.etcd_interface.on.resource_created, self._on_resource_created) - - @property - def etcd_relation(self) -> Relation | None: - """Return the etcd relation if present.""" - relations = self.etcd_interface.relations - return relations[0] if relations else None - - def _on_relation_broken(self, event: RelationBrokenEvent) -> None: - """Remove the stored information about the etcd server.""" - etcdctl.cleanup() - - def _on_endpoints_changed( - self, event: ResourceEndpointsChangedEvent[ResourceProviderModel] - ) -> None: - """Handle updates to etcd endpoints from the provider. 
- - The method writes an environment configuration - file used by etcdctl to connect securely to the cluster. - - If no endpoints are provided in the event, the update is skipped. - """ - response = event.response - - if not response.endpoints: - logger.error('Received a endpoints changed event but no etcd endpoints available.') - return - - logger.info('etcd endpoints changed: %s', response.endpoints) - - etcdctl.write_config_file( - endpoints=response.endpoints, - client_cert_path=certificates.CLIENT_CERT_PATH, - client_key_path=certificates.CLIENT_KEY_PATH, - ) - - def _on_resource_created(self, event: ResourceCreatedEvent[ResourceProviderModel]) -> None: - """Handle provisioning of etcd connection resources. - - This method stores the trusted server CA locally and write the etcd client environment - configuration file. - """ - response = event.response - - if not response.tls_ca: - logger.error( - 'Received a resource created event but no etcd server CA chain available.' - ) - return - - etcdctl.write_trusted_server_ca(tls_ca_pem=response.tls_ca) - - if not response.endpoints: - logger.error('Received a resource created event but no etcd endpoints available.') - return - - etcdctl.write_config_file( - endpoints=response.endpoints, - client_cert_path=certificates.CLIENT_CERT_PATH, - client_key_path=certificates.CLIENT_KEY_PATH, - ) - - def client_requests(self) -> list[RequirerCommonModel]: - """Return the client requests for the etcd requirer interface.""" - cert = self.shared_certificates.get_local_request_cert() - return [ - RequirerCommonModel( - resource=self.cluster_id, - mtls_cert=None if cert is None else cert.raw, - ) - ] diff --git a/rollingops/src/charmlibs/rollingops/_worker.py b/rollingops/src/charmlibs/rollingops/_worker.py deleted file mode 100644 index 9b5430280..000000000 --- a/rollingops/src/charmlibs/rollingops/_worker.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright 2026 Canonical Ltd. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""etcd rolling ops. Spawns and manages the external rolling-ops worker process.""" - -import logging -import os -import signal -import subprocess -from sys import version_info - -from ops import Relation -from ops.charm import CharmBase -from ops.framework import Object - -from charmlibs import pathops -from charmlibs.rollingops._models import RollingOpsCharmLibMissingError, with_pebble_retry - -logger = logging.getLogger(__name__) - -WORKER_PID_FIELD = 'etcd-rollingops-worker-pid' - - -class EtcdRollingOpsAsyncWorker(Object): - """Spawns and manages the external rolling-ops worker process.""" - - def __init__(self, charm: CharmBase, peer_relation_name: str, owner: str): - super().__init__(charm, 'etcd-rollingops-async-worker') - self._charm = charm - self._peer_relation_name = peer_relation_name - self._run_cmd = '/usr/bin/juju-exec' - self._owner = owner - self._charm_dir = charm.charm_dir - - @property - def _relation(self) -> Relation | None: - return self.model.get_relation(self._peer_relation_name) - - def start(self) -> None: - """Start a new worker process. - - Raises: - RollingOpsCharmLibMissingError: if the lib files cannot be found. 
- """ - if self._relation is None: - return - - if pid_str := self._relation.data[self.model.unit].get(WORKER_PID_FIELD): - try: - pid = int(pid_str) - except (ValueError, TypeError): - pid = None - - if pid is not None and self._is_pid_alive(pid): - logger.info( - 'RollingOps worker already running with PID %s; not starting a new one.', pid - ) - return - - # Remove JUJU_CONTEXT_ID so juju-run works from the spawned process - new_env = os.environ.copy() - new_env.pop('JUJU_CONTEXT_ID', None) - - venv_path = ( - self._charm_dir - / 'venv' - / 'lib' - / f'python{version_info.major}.{version_info.minor}' - / 'site-packages' - ) - if not with_pebble_retry(lambda: venv_path.exists()): - raise RollingOpsCharmLibMissingError( - f'Expected virtualenv site-packages not found: {venv_path}' - ) - - for loc in new_env.get('PYTHONPATH', '').split(':'): - path = pathops.LocalPath(loc) - - if path.stem != 'lib': - continue - new_env['PYTHONPATH'] = f'{venv_path.resolve()}:{new_env["PYTHONPATH"]}' - break - - worker = venv_path / 'charmlibs' / 'rollingops' / '_etcd_rollingops.py' - if not with_pebble_retry(lambda: worker.exists()): - raise RollingOpsCharmLibMissingError(f'Worker script not found: {worker}') - - # These files must stay open for the lifetime of the worker process. 
- log_out = open('/var/log/etcd_rollingops_worker.log', 'a') # noqa: SIM115 - log_err = open('/var/log/etcd_rollingops_worker.err', 'a') # noqa: SIM115 - - pid = subprocess.Popen( - [ - '/usr/bin/python3', - '-u', - str(worker), - '--run-cmd', - self._run_cmd, - '--unit-name', - self.model.unit.name, - '--charm-dir', - str(self._charm_dir), - '--owner', - self._owner, - ], - cwd=str(self._charm_dir), - stdout=log_out, - stderr=log_err, - env=new_env, - ).pid - - self._relation.data[self.model.unit].update({WORKER_PID_FIELD: str(pid)}) - logger.info('Started etcd rollingops worker process with PID %s', pid) - - def _is_pid_alive(self, pid: int) -> bool: - if pid <= 0: - return False - try: - os.kill(pid, 0) - return True - except ProcessLookupError: - return False - except PermissionError: - return True - - def stop(self) -> None: - """Stop the running worker process if it exists.""" - if self._relation is None: - return - - pid_str = self._relation.data[self.model.unit].get(WORKER_PID_FIELD, '') - - try: - pid = int(pid_str) - except (TypeError, ValueError): - logger.info('Missing PID or invalid PID found in the databag.') - self._relation.data[self.model.unit].update({WORKER_PID_FIELD: ''}) - return - - try: - os.kill(pid, signal.SIGTERM) - logger.info('Sent SIGTERM to etcd rollingops worker process PID %s.', pid) - except ProcessLookupError: - logger.info('Process PID %s is already gone.', pid) - except PermissionError: - logger.warning('No permission to stop etcd rollingops worker process PID %s.', pid) - return - except OSError: - logger.warning('SIGTERM failed for PID %s, attempting SIGKILL', pid) - try: - os.kill(pid, signal.SIGKILL) - logger.info('Sent SIGKILL to etcd rollingops worker process PID %s', pid) - except ProcessLookupError: - logger.info('Process PID %s exited before SIGKILL', pid) - except PermissionError: - logger.warning('No permission to SIGKILL process PID %s', pid) - return - except OSError: - logger.warning('Failed to SIGKILL process PID 
%s', pid) - return - - self._relation.data[self.model.unit].update({WORKER_PID_FIELD: ''}) diff --git a/rollingops/tests/unit/test_certificates.py b/rollingops/tests/unit/test_certificates.py deleted file mode 100644 index 19dd62853..000000000 --- a/rollingops/tests/unit/test_certificates.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Learn more about testing at: https://juju.is/docs/sdk/testing - -from typing import Any - -from tests.unit.conftest import VALID_CA_CERT_PEM, VALID_CLIENT_CERT_PEM, VALID_CLIENT_KEY_PEM - -from charmlibs.interfaces.tls_certificates import ( - Certificate, - PrivateKey, -) -from charmlibs.rollingops._models import SharedCertificate - - -def make_shared_certificate() -> SharedCertificate: - return SharedCertificate( - certificate=Certificate.from_string(VALID_CLIENT_CERT_PEM), - key=PrivateKey.from_string(VALID_CLIENT_KEY_PEM), - ca=Certificate.from_string(VALID_CA_CERT_PEM), - ) - - -def test_certs(): - Certificate.from_string(VALID_CA_CERT_PEM) - PrivateKey.from_string(VALID_CLIENT_KEY_PEM) - Certificate.from_string(VALID_CLIENT_CERT_PEM) - - -def test_certificates_manager_exists_returns_false_when_no_files( - temp_certificates: Any, -) -> None: - assert temp_certificates._exists() is False - - -def test_certificates_manager_exists_returns_false_when_cert_does_not_exist( - temp_certificates: Any, -) -> None: - 
temp_certificates.CLIENT_KEY_PATH.write_text('client-key') - - assert temp_certificates._exists() is False - - -def test_certificates_manager_exists_returns_false_when_key_does_not_exist( - temp_certificates: Any, -) -> None: - temp_certificates.CLIENT_CERT_PATH.write_text('client-cert') - - assert temp_certificates._exists() is False - - -def test_certificates_manager_exists_returns_true_when_all_files_exist( - temp_certificates: Any, -) -> None: - temp_certificates.CLIENT_KEY_PATH.write_text('client-key') - temp_certificates.CLIENT_CERT_PATH.write_text('client-cert') - temp_certificates.CA_CERT_PATH.write_text('ca-cert') - - assert temp_certificates._exists() is True - - -def test_certificates_manager_persist_client_cert_and_key_writes_files( - temp_certificates: Any, -) -> None: - shared_certificate = make_shared_certificate() - temp_certificates.persist_client_cert_key_and_ca(shared_certificate) - - assert temp_certificates.CLIENT_CERT_PATH.read_text() == shared_certificate.certificate.raw - assert temp_certificates.CLIENT_KEY_PATH.read_text() == shared_certificate.key.raw - assert temp_certificates.CA_CERT_PATH.read_text() == shared_certificate.ca.raw - - -def test_certificates_manager_has_client_cert_and_key_returns_false_when_files_missing( - temp_certificates: Any, -) -> None: - shared_certificate = make_shared_certificate() - assert temp_certificates._has_client_cert_key_and_ca(shared_certificate) is False - - -def test_certificates_manager_has_client_cert_and_key_returns_true_when_material_matches( - temp_certificates: Any, -) -> None: - temp_certificates.CLIENT_CERT_PATH.write_text(VALID_CLIENT_CERT_PEM) - temp_certificates.CLIENT_KEY_PATH.write_text(VALID_CLIENT_KEY_PEM) - temp_certificates.CA_CERT_PATH.write_text(VALID_CA_CERT_PEM) - - shared_certificate = make_shared_certificate() - assert temp_certificates._has_client_cert_key_and_ca(shared_certificate) is True - - -def 
test_certificates_manager_has_client_cert_and_key_returns_false_when_material_differs( - temp_certificates: Any, -) -> None: - temp_certificates.CLIENT_CERT_PATH.write_text(VALID_CLIENT_CERT_PEM) - temp_certificates.CLIENT_KEY_PATH.write_text(VALID_CLIENT_KEY_PEM) - temp_certificates.CA_CERT_PATH.write_text(VALID_CA_CERT_PEM) - - other_shared_certificate = SharedCertificate( - certificate=Certificate.from_string(VALID_CA_CERT_PEM), - key=PrivateKey.from_string(VALID_CLIENT_KEY_PEM), - ca=Certificate.from_string(VALID_CLIENT_CERT_PEM), - ) - assert temp_certificates._has_client_cert_key_and_ca(other_shared_certificate) is False - - -def test_certificates_manager_generate_does_nothing_when_files_already_exist( - temp_certificates: Any, -) -> None: - temp_certificates.CLIENT_CERT_PATH.write_text(VALID_CLIENT_CERT_PEM) - temp_certificates.CLIENT_KEY_PATH.write_text(VALID_CLIENT_KEY_PEM) - temp_certificates.CA_CERT_PATH.write_text(VALID_CA_CERT_PEM) - old_certificates = make_shared_certificate() - - new_certificates = temp_certificates.generate(common_name='unit-1') - - written = SharedCertificate.from_strings( - certificate=temp_certificates.CLIENT_CERT_PATH.read_text(), - key=temp_certificates.CLIENT_KEY_PATH.read_text(), - ca=temp_certificates.CA_CERT_PATH.read_text(), - ) - assert written == old_certificates - - assert new_certificates == old_certificates - - -def test_certificates_manager_generate_creates_all_files( - temp_certificates: Any, -) -> None: - shared = temp_certificates.generate(common_name='unit-1') - assert temp_certificates._exists() is True - - assert temp_certificates.CA_CERT_PATH.read_text().startswith('-----BEGIN CERTIFICATE-----') - assert temp_certificates.CLIENT_KEY_PATH.read_text().startswith( - '-----BEGIN RSA PRIVATE KEY-----' - ) - assert temp_certificates.CLIENT_CERT_PATH.read_text().startswith('-----BEGIN CERTIFICATE-----') - - assert temp_certificates.CA_CERT_PATH.read_text() == shared.ca.raw - assert 
temp_certificates.CLIENT_KEY_PATH.read_text() == shared.key.raw - assert temp_certificates.CLIENT_CERT_PATH.read_text() == shared.certificate.raw diff --git a/rollingops/tests/unit/test_etcdctl.py b/rollingops/tests/unit/test_etcdctl.py deleted file mode 100644 index 051103f2f..000000000 --- a/rollingops/tests/unit/test_etcdctl.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Learn more about testing at: https://juju.is/docs/sdk/testing - -import json -from typing import Any -from unittest.mock import patch - -import pytest - -from charmlibs.pathops import LocalPath -from charmlibs.rollingops import RollingOpsEtcdNotConfiguredError - - -def test_etcdctl_write_env(temp_etcdctl: Any) -> None: - temp_etcdctl.write_config_file( - endpoints='https://10.0.0.1:2379,https://10.0.0.2:2379', - client_cert_path=LocalPath('PATH1'), - client_key_path=LocalPath('PATH2'), - ) - - assert temp_etcdctl.BASE_DIR.exists() - - config = json.loads(temp_etcdctl.CONFIG_FILE_PATH.read_text()) - assert config == { - 'endpoints': 'https://10.0.0.1:2379,https://10.0.0.2:2379', - 'cacert_path': str(temp_etcdctl.SERVER_CA_PATH), - 'cert_path': 'PATH1', - 'key_path': 'PATH2', - } - - -def test_etcdctl_ensure_initialized_raises_when_env_missing(temp_etcdctl: Any) -> None: - with pytest.raises(RollingOpsEtcdNotConfiguredError): - temp_etcdctl.ensure_initialized() - - -def 
test_etcdctl_cleanup_removes_env_file_and_server_ca(temp_etcdctl: Any) -> None: - temp_etcdctl.BASE_DIR.mkdir(parents=True, exist_ok=True) - temp_etcdctl.CONFIG_FILE_PATH.write_text('env') - temp_etcdctl.SERVER_CA_PATH.write_text('ca') - - assert temp_etcdctl.CONFIG_FILE_PATH.exists() - assert temp_etcdctl.SERVER_CA_PATH.exists() - - temp_etcdctl.cleanup() - - assert not temp_etcdctl.CONFIG_FILE_PATH.exists() - assert not temp_etcdctl.SERVER_CA_PATH.exists() - - -def test_etcdctl_cleanup_is_noop_when_files_do_not_exist(temp_etcdctl: Any) -> None: - assert not temp_etcdctl.CONFIG_FILE_PATH.exists() - assert not temp_etcdctl.SERVER_CA_PATH.exists() - - temp_etcdctl.cleanup() - - assert not temp_etcdctl.CONFIG_FILE_PATH.exists() - assert not temp_etcdctl.SERVER_CA_PATH.exists() - - -def test_etcdctl_load_env_parses_exported_vars(temp_etcdctl: Any) -> None: - temp_etcdctl.BASE_DIR.mkdir(parents=True, exist_ok=True) - temp_etcdctl.SERVER_CA_PATH.write_text('SERVER CA') - temp_etcdctl.CONFIG_FILE_PATH.write_text( - json.dumps({ - 'endpoints': 'https://10.0.0.1:2379', - 'cacert_path': '/a-path/server-ca.pem', - 'cert_path': '/a-path/client.pem', - 'key_path': '/a-path/client.key', - }) - ) - - with patch.dict('os.environ', {'EXISTING_VAR': 'present'}, clear=True): - env = temp_etcdctl.load_env() - - assert env['EXISTING_VAR'] == 'present' - assert env['ETCDCTL_API'] == '3' - assert env['ETCDCTL_ENDPOINTS'] == 'https://10.0.0.1:2379' - assert env['ETCDCTL_CERT'] == '/a-path/client.pem' - assert env['ETCDCTL_KEY'] == '/a-path/client.key' - assert env['ETCDCTL_CACERT'] == '/a-path/server-ca.pem' diff --git a/rollingops/tests/unit/test_models.py b/rollingops/tests/unit/test_models.py deleted file mode 100644 index 2820dfea0..000000000 --- a/rollingops/tests/unit/test_models.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Learn more about testing at: https://juju.is/docs/sdk/testing - - -from charmlibs.rollingops._models import RollingOpsKeys - - -def test_rollingopskeys_paths() -> None: - keys = RollingOpsKeys.for_owner('cluster-a', 'unit-1') - - assert keys.cluster_prefix == '/rollingops/default/cluster-a/' - assert keys._owner_prefix == '/rollingops/default/cluster-a/unit-1/' - assert keys.lock_key == '/rollingops/default/cluster-a/granted-unit/' - assert keys.pending == '/rollingops/default/cluster-a/unit-1/pending/' - assert keys.inprogress == '/rollingops/default/cluster-a/unit-1/inprogress/' - assert keys.completed == '/rollingops/default/cluster-a/unit-1/completed/' - - -def test_rollingopskeys_lock_key_is_shared_within_cluster() -> None: - k1 = RollingOpsKeys.for_owner('cluster-a', 'unit-1') - k2 = RollingOpsKeys.for_owner('cluster-a', 'unit-2') - - assert k1.lock_key == k2.lock_key - assert k1.pending != k2.pending - assert k1.inprogress != k2.inprogress - assert k1.completed != k2.completed From f24a115911b3b8f6f544db67265d6c462abfdeb2 Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Tue, 14 Apr 2026 09:30:27 +0200 Subject: [PATCH 03/15] raise of failed transactions --- .../rollingops/common/_exceptions.py | 4 ++ .../src/charmlibs/rollingops/etcd/_backend.py | 4 ++ .../src/charmlibs/rollingops/etcd/_etcd.py | 40 +++++++++++-------- .../charmlibs/rollingops/etcd/_rollingops.py | 3 +- 4 files changed, 32 insertions(+), 19 deletions(-) diff --git a/rollingops/src/charmlibs/rollingops/common/_exceptions.py 
b/rollingops/src/charmlibs/rollingops/common/_exceptions.py index 209f0c67e..9dd97c9d2 100644 --- a/rollingops/src/charmlibs/rollingops/common/_exceptions.py +++ b/rollingops/src/charmlibs/rollingops/common/_exceptions.py @@ -69,3 +69,7 @@ class RollingOpsEtcdctlParseError(RollingOpsEtcdctlError): class RollingOpsSyncLockError(RollingOpsError): """Raised when there is an error during sync lock execution.""" + + +class RollingOpsEtcdTransactionError(RollingOpsError): + """Raised when an etcd transaction fails.""" diff --git a/rollingops/src/charmlibs/rollingops/etcd/_backend.py b/rollingops/src/charmlibs/rollingops/etcd/_backend.py index 7fede9add..c413e0100 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_backend.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_backend.py @@ -260,6 +260,10 @@ def _on_run_with_lock(self) -> RunWithLockOutcome: Returns: A structured outcome describing whether an operation was executed and, if so, which operation was finalized and with what result. + + Raises: + RollingOpsEtcdTransactionError: if the operation cannot be marked + as completed. """ if not self._async_lock.is_held(): logger.info('Lock is not granted. Operation will not run.') diff --git a/rollingops/src/charmlibs/rollingops/etcd/_etcd.py b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py index 6a38f949e..23ea9cfdc 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_etcd.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py @@ -20,6 +20,7 @@ import time import charmlibs.rollingops.etcd._etcdctl as etcdctl +from charmlibs.rollingops.common._exceptions import RollingOpsEtcdTransactionError from charmlibs.rollingops.common._models import Operation, OperationResult from charmlibs.rollingops.etcd._models import RollingOpsKeys @@ -359,46 +360,46 @@ def has_completed(self) -> bool: """ return self._completed.peek() is not None - def claim_next(self) -> bool: + def claim_next(self) -> None: """Move the next pending operation to the in-progress queue. 
This operation is performed atomically and only succeeds if: - the lock is still held by this owner - the head of the pending queue has not changed - Returns: - True if the operation was successfully claimed, - otherwise False. + Raises: + RollingOpsEtcdTransactionError: if the transaction failed. """ - return self._pending.move_head(self._inprogress.prefix) + if not self._pending.move_head(self._inprogress.prefix): + raise RollingOpsEtcdTransactionError('Failed to move operation to in progress.') def wait_until_completed(self) -> Operation: """Block until at least one operation appears in the completed queue.""" return self._completed.watch() - def requeue_completed(self) -> bool: + def requeue_completed(self) -> None: """Requeue the head completed operation back to the pending queue. This is typically used when an operation needs to be retried (e.g., RETRY_RELEASE or RETRY_HOLD semantics). - Returns: - True if the operation was successfully moved back to pending, - otherwise False. + Raises: + RollingOpsEtcdTransactionError: if the transaction failed. """ - return self._completed.move_head(self._pending.prefix) + if not self._completed.move_head(self._pending.prefix): + raise RollingOpsEtcdTransactionError('Failed to move operation to pending.') - def delete_completed(self) -> bool: + def delete_completed(self) -> None: """Remove the head operation from the completed queue. This is typically used when an operation has finished successfully and does not need to be retried. - Returns: - True if the operation was successfully removed, - otherwise False. + Raises: + RollingOpsEtcdTransactionError: if the transaction failed. 
""" - return self._completed.dequeue() + if not self._completed.dequeue(): + raise RollingOpsEtcdTransactionError('Failed finalize operation.') class ManagerOperationStore: @@ -432,7 +433,7 @@ def request(self, operation: Operation) -> None: """ self._pending.enqueue(operation) - def finalize(self, operation: Operation, result: OperationResult) -> bool: + def finalize(self, operation: Operation, result: OperationResult) -> None: """Move an in-progress operation to the completed queue. This should be called after the operation has been executed and its @@ -441,6 +442,10 @@ def finalize(self, operation: Operation, result: OperationResult) -> bool: Args: operation: The operation currently in the in-progress queue. result: Result of the executions. + + Raises: + RollingOpsEtcdTransactionError: if the operation cannot be marked + as completed. """ match result: case OperationResult.RETRY_HOLD: @@ -450,7 +455,8 @@ def finalize(self, operation: Operation, result: OperationResult) -> bool: case _: operation.complete() - return self._inprogress.move_operation(self._completed.prefix, operation) + if not self._inprogress.move_operation(self._completed.prefix, operation): + raise RollingOpsEtcdTransactionError('Failed to set the operation as completed.') def peek_current(self) -> Operation | None: """Return the current in-progress operation without modifying state. 
diff --git a/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py b/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py index e8121aaa9..8e6ad2190 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py @@ -114,8 +114,7 @@ def main(): continue logger.info('Lock granted.') - if not operations.claim_next(): - raise RollingOpsEtcdInconsistencyError('Failed to get next operation.') + operations.claim_next() dispatch_lock_granted(args.unit_name, args.charm_dir) From f4080beb6e55ecf61f28696a26f25696ab91e3fc Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Fri, 17 Apr 2026 17:22:07 +0200 Subject: [PATCH 04/15] address review feedback Signed-off-by: Patricia Reinoso --- rollingops/pyproject.toml | 3 +- .../src/charmlibs/rollingops/__init__.py | 4 +- .../rollingops/_rollingops_manager.py | 68 ++-- .../rollingops/common/_base_worker.py | 86 ++--- .../charmlibs/rollingops/common/_models.py | 301 ++++++++++-------- .../src/charmlibs/rollingops/common/_utils.py | 56 +++- .../src/charmlibs/rollingops/etcd/_backend.py | 47 +-- .../src/charmlibs/rollingops/etcd/_etcd.py | 56 +++- .../src/charmlibs/rollingops/etcd/_etcdctl.py | 20 +- .../charmlibs/rollingops/etcd/_rollingops.py | 93 +++--- .../src/charmlibs/rollingops/etcd/_worker.py | 32 +- .../src/charmlibs/rollingops/peer/_backend.py | 12 +- .../src/charmlibs/rollingops/peer/_models.py | 44 ++- .../charmlibs/rollingops/peer/_rollingops.py | 19 +- .../src/charmlibs/rollingops/peer/_worker.py | 30 +- .../tests/integration/charms/actions.yaml | 2 +- .../integration/test_etcd_rolling_ops.py | 43 ++- rollingops/tests/unit/conftest.py | 46 +-- rollingops/tests/unit/test_common_models.py | 218 +++++++++++-- .../unit/test_etcd_rollingops_in_charm.py | 29 +- rollingops/tests/unit/test_peer_models.py | 144 --------- .../unit/test_peer_rollingops_in_charm.py | 148 +++++---- rollingops/uv.lock | 2 + 23 files changed, 856 insertions(+), 647 deletions(-) 
delete mode 100644 rollingops/tests/unit/test_peer_models.py diff --git a/rollingops/pyproject.toml b/rollingops/pyproject.toml index bd096196b..9522b1f28 100644 --- a/rollingops/pyproject.toml +++ b/rollingops/pyproject.toml @@ -19,7 +19,8 @@ dependencies = [ "charmlibs-interfaces-tls-certificates>=1.8.1", "charmlibs-pathops>=1.2.1", "dpcharmlibs-interfaces==1.0.0", - "tenacity" + "tenacity", + "pydantic>=2.12.5", ] [dependency-groups] diff --git a/rollingops/src/charmlibs/rollingops/__init__.py b/rollingops/src/charmlibs/rollingops/__init__.py index d70339b58..cc631ad82 100644 --- a/rollingops/src/charmlibs/rollingops/__init__.py +++ b/rollingops/src/charmlibs/rollingops/__init__.py @@ -27,7 +27,7 @@ RollingOpsNoRelationError, RollingOpsSyncLockError, ) -from .common._models import OperationResult, SyncLockBackend +from .common._models import OperationResult, RollingOpsState, RollingOpsStatus, SyncLockBackend __all__ = ( 'OperationResult', @@ -40,6 +40,8 @@ 'RollingOpsLibMissingError', 'RollingOpsManager', 'RollingOpsNoRelationError', + 'RollingOpsState', + 'RollingOpsStatus', 'RollingOpsSyncLockError', 'SyncLockBackend', ) diff --git a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py index dfc33a52f..75f88a105 100644 --- a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py +++ b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py @@ -33,9 +33,11 @@ ProcessingBackend, RollingOpsState, RollingOpsStatus, + RunWithLockStatus, SyncLockBackend, UnitBackendState, ) +from charmlibs.rollingops.common._utils import ETCD_FAILED_HOOK_NAME, LOCK_GRANTED_HOOK_NAME from charmlibs.rollingops.etcd._backend import EtcdRollingOpsBackend from charmlibs.rollingops.peer._backend import PeerRollingOpsBackend from charmlibs.rollingops.peer._models import PeerUnitOperations @@ -101,8 +103,8 @@ def __init__( self.peer_relation_name = peer_relation_name self.etcd_relation_name = etcd_relation_name 
self._sync_lock_targets = sync_lock_targets or {} - charm.on.define_event('rollingops_lock_granted', RollingOpsLockGrantedEvent) - charm.on.define_event('rollingops_etcd_failed', RollingOpsEtcdFailedEvent) + charm.on.define_event(LOCK_GRANTED_HOOK_NAME, RollingOpsLockGrantedEvent) + charm.on.define_event(ETCD_FAILED_HOOK_NAME, RollingOpsEtcdFailedEvent) self.peer_backend = PeerRollingOpsBackend( charm=charm, @@ -260,18 +262,24 @@ def _on_rollingops_lock_granted(self, event: RollingOpsLockGrantedEvent) -> None the peer backend. If the current unit is etcd-managed, the operation is executed through - the etcd backend. On successful execution, the result is mirrored back + the etcd backend. + """ + if self._backend_state.is_peer_managed(): + logger.info('Executing rollingop on peer backend.') + self.peer_backend._on_rollingops_lock_granted(event) + return + self._run_etcd_and_mirror_or_fallback() + + def _run_etcd_and_mirror_or_fallback(self) -> None: + """Run the etcd execution path and mirror its outcome to peer. + + On successful execution, the result is mirrored back to the peer relation so that peer state remains consistent and can be used for fallback. If etcd execution fails or mirrored state becomes inconsistent, the manager falls back to the peer backend and resumes processing there. 
""" - if self._backend_state.is_peer_managed(): - logger.info('Executing rollingop on peer backend.') - self.peer_backend._on_rollingops_lock_granted(event) - return - outcome = None try: logger.info('Executing rollingop on etcd backend.') outcome = self.etcd_backend._on_run_with_lock() @@ -294,11 +302,17 @@ def _on_rollingops_lock_granted(self, event: RollingOpsLockGrantedEvent) -> None self._fallback_current_unit_to_peer() return logger.info('Execution mirrored to peer relation.') + if outcome.status == RunWithLockStatus.EXECUTED_NOT_COMMITTED: + self._fallback_current_unit_to_peer() def _on_rollingops_etcd_failed(self, event: RollingOpsEtcdFailedEvent) -> None: """Fall back to peer when the etcd worker reports a fatal failure.""" - logger.warning('Received rollingops_etcd_failed; falling back to peer backend.') - self._fallback_current_unit_to_peer() + logger.warning('Received %s.', ETCD_FAILED_HOOK_NAME) + if self._backend_state.is_etcd_managed(): + # No need to stop the background process. This hook means that it stopped. + self._backend_state.fallback_to_peer() + self.peer_backend.ensure_processing() + logger.info('Fell back to peer backend.') def _get_sync_lock_backend(self, backend_id: str) -> SyncLockBackend: """Instantiate the configured peer sync lock backend. @@ -408,17 +422,16 @@ def state(self) -> RollingOpsState: """ if self._peer_relation is None: return RollingOpsState( - status=RollingOpsStatus.INVALID, - processing_backend=None, + status=RollingOpsStatus.UNAVAILABLE, + processing_backend=ProcessingBackend.PEER, operations=OperationQueue(), ) status = self.peer_backend.get_status() if self._backend_state.is_etcd_managed(): - try: - status = self.etcd_backend.get_status() - except Exception as e: - logger.exception('Failed to get status: %s', e) + status = self.etcd_backend.get_status() + if status == RollingOpsStatus.UNAVAILABLE: + logger.info('etcd backend is not available. 
Falling back to peer backend.') self._fallback_current_unit_to_peer() status = self.peer_backend.get_status() @@ -429,24 +442,23 @@ def state(self) -> RollingOpsState: operations=operations.queue, ) - def _on_update_status(self, _: EventBase) -> None: - """Periodic reconciliation of rolling-ops state. - - Ensures the correct backend is active, workers are running, - and fallback is triggered if etcd becomes unhealthy. - """ + def _on_update_status(self, event: EventBase) -> None: + """Periodic reconciliation of rolling-ops state.""" + logger.info('Received a update-status event.') if self._backend_state.is_etcd_managed(): if not self.etcd_backend.is_available(): logger.warning('etcd unavailable during update_status; falling back.') self._fallback_current_unit_to_peer() return - try: - self.etcd_backend.ensure_processing() - except Exception as e: - logger.warning('etcd worker failed: %s; falling back.', e) + if not self.etcd_backend.is_processing(): + logger.warning( + 'etcd backend is selected but no worker process is running; falling back.' + ) self._fallback_current_unit_to_peer() return - else: - self.peer_backend.ensure_processing() + self._run_etcd_and_mirror_or_fallback() + return + + self.peer_backend._on_rollingops_lock_granted(event) diff --git a/rollingops/src/charmlibs/rollingops/common/_base_worker.py b/rollingops/src/charmlibs/rollingops/common/_base_worker.py index 7dc1e46c2..c444b344d 100644 --- a/rollingops/src/charmlibs/rollingops/common/_base_worker.py +++ b/rollingops/src/charmlibs/rollingops/common/_base_worker.py @@ -122,22 +122,24 @@ def _worker_args(self) -> list[str]: """ return [] - def _get_pid_str(self) -> str: - """Return the stored worker PID string. + @property + def _pid(self) -> int | None: + """Return the stored worker PID. Returns: - The stored PID as a string, or an empty string if no PID is stored. + The stored PID, None if no PID is stored. Raises: NotImplementedError: If not implemented by a subclass. 
""" raise NotImplementedError - def _set_pid_str(self, pid: str) -> None: + @_pid.setter + def _pid(self, value: int | None) -> None: """Persist the worker PID string. Args: - pid: The PID string to persist. An empty string clears the stored PID. + value: The PID string to persist. An empty string clears the stored PID. Raises: NotImplementedError: If not implemented by a subclass. @@ -193,40 +195,37 @@ def start(self) -> None: required to start the worker is missing. OSError: If the worker subprocess cannot be started. """ - pid_str = self._get_pid_str() - if pid_str: - try: - pid = int(pid_str) - except (ValueError, TypeError): - pid = None - - if pid is not None and self._is_pid_alive(pid) and not self._on_existing_worker(pid): - return + if self._relation is None: + logger.info('Peer relation does not exist. Worker cannot start.') + return + pid = self._pid + if pid is not None and self._is_pid_alive(pid) and not self._on_existing_worker(pid): + return self._validate_startup_paths() worker = self._worker_script_path() env = self._build_env() - log_out = open(f'/var/log/{self._log_filename}.log', 'a') # noqa: SIM115 - pid = subprocess.Popen( - [ - '/usr/bin/python3', - '-u', - str(worker), - '--unit-name', - self.model.unit.name, - '--charm-dir', - str(self._charm_dir), - *self._worker_args(), - ], - cwd=str(self._charm_dir), - stdout=log_out, - stderr=log_out, - env=env, - ).pid - - self._set_pid_str(str(pid)) + with open(f'{self._log_filename}', 'a') as log_out: + pid = subprocess.Popen( + [ + '/usr/bin/python3', + '-u', + str(worker), + '--unit-name', + self.model.unit.name, + '--charm-dir', + str(self._charm_dir), + *self._worker_args(), + ], + cwd=str(self._charm_dir), + stdout=log_out, + stderr=log_out, + env=env, + ).pid + + self._pid = pid logger.info('Started %s process with PID %s', self._handle_name, pid) def stop(self) -> None: @@ -239,13 +238,13 @@ def stop(self) -> None: The stored PID is cleared when the worker is successfully considered stopped 
or no longer present. """ - pid_str = self._get_pid_str() + if self._relation is None: + logger.info('Peer relation not found. Worker cannot be stopped.') + return - try: - pid = int(pid_str) - except (TypeError, ValueError): - logger.info('Missing PID or invalid PID found in worker state.') - self._set_pid_str('') + pid = self._pid + if pid is None or pid <= 0: + logger.info('Invalid PID found. Worker cannot be stopped.') return try: @@ -270,4 +269,11 @@ def stop(self) -> None: logger.warning('Failed to SIGKILL process PID %s', pid) return - self._set_pid_str('') + self._pid = None + + def is_running(self) -> bool: + """Return whether the recorded worker process appears to be alive.""" + pid = self._pid + if pid is None: + return False + return self._is_pid_alive(pid) diff --git a/rollingops/src/charmlibs/rollingops/common/_models.py b/rollingops/src/charmlibs/rollingops/common/_models.py index 162d9f825..7b15a1fd0 100644 --- a/rollingops/src/charmlibs/rollingops/common/_models.py +++ b/rollingops/src/charmlibs/rollingops/common/_models.py @@ -17,12 +17,20 @@ import json import logging from abc import ABC, abstractmethod -from dataclasses import dataclass, field +from dataclasses import dataclass from datetime import datetime from enum import StrEnum from typing import Any from ops import Model, Unit +from pydantic import ( + BaseModel, + ConfigDict, + Field, + RootModel, + field_serializer, + field_validator, +) from charmlibs.rollingops.common._exceptions import ( RollingOpsDecodingError, @@ -34,7 +42,25 @@ class OperationResult(StrEnum): - """Callback return values.""" + """Result values returned by rolling-ops callbacks on async locks. + + These values control how the rolling-ops manager updates the operation + state and whether the distributed lock is released or retained. + + - RELEASE: + The operation completed successfully and no retry is required. + The lock is released and the next unit may be scheduled. 
+ + - RETRY_RELEASE: + The operation failed or timed out and should be retried later. + The operation is re-queued and the lock is released so that + other units may proceed before this operation is retried. + + - RETRY_HOLD: + The operation failed or timed out and should be retried immediately. + The operation is re-queued and the lock is kept by the current + unit, allowing it to retry immediately. + """ RELEASE = 'release' RETRY_RELEASE = 'retry-release' @@ -59,22 +85,40 @@ class RunWithLockStatus(StrEnum): NO_OPERATION = 'no_operation' MISSING_CALLBACK = 'missing_callback' EXECUTED = 'executed' + EXECUTED_NOT_COMMITTED = 'executed_not_committed' class RollingOpsStatus(StrEnum): - """High-level rolling-ops state for a unit. + """High-level rolling-ops status for a unit. + + It reflects whether the unit is currently executing work, waiting + for execution, idle, or unable to participate. + + States: + + - UNAVAILABLE: + Rolling-ops cannot be used on this unit. This typically occurs when + required relations are missing or the selected backend is not reachable. + * peer backend: peer relation does not exist + * etcd backend: peer or etcd relation missing, or etcd not reachable - This status reflects whether the unit is currently executing, waiting, - or idle with respect to rolling operations. + - WAITING: + The unit has pending operations but does not currently hold the lock. + + - GRANTED: + The unit currently holds the lock and may execute operations. + + - IDLE: + The unit has no pending operations and is not holding the lock. """ - INVALID = 'invalid' + UNAVAILABLE = 'unavailable' WAITING = 'waiting' GRANTED = 'granted' IDLE = 'idle' -@dataclass +@dataclass(frozen=True) class RunWithLockOutcome: """Result of attempting to execute an operation under a distributed lock. 
@@ -129,8 +173,7 @@ def __init__(self, model: Model, relation_name: str, unit: Unit): self._relation = relation self.unit = unit - def _load(self) -> BackendState: - return self._relation.load(BackendState, self.unit, decoder=lambda s: s) + self._backend_state = self._relation.load(BackendState, self.unit, decoder=lambda s: s) def _save(self, data: BackendState) -> None: self._relation.save(data, self.unit, encoder=str) @@ -138,26 +181,24 @@ def _save(self, data: BackendState) -> None: @property def backend(self) -> ProcessingBackend: """Return which backend owns execution for this unit's queue.""" - return self._load().backend + return self._backend_state.backend @property def cleanup_needed(self) -> bool: """Return whether etcd cleanup is required before etcd can be reused.""" - return self._load().cleanup_needed + return self._backend_state.cleanup_needed def fallback_to_peer(self) -> None: """Switch this unit's queue to peer processing and mark etcd cleanup needed.""" - data = self._load() - data.backend = ProcessingBackend.PEER - data.cleanup_needed = True - self._save(data) + self._backend_state.backend = ProcessingBackend.PEER + self._backend_state.cleanup_needed = True + self._save(self._backend_state) def clear_fallback(self) -> None: """Clear the etcd cleanup-needed flag and set the backend to ETCD.""" - data = self._load() - data.backend = ProcessingBackend.ETCD - data.cleanup_needed = False - self._save(data) + self._backend_state.backend = ProcessingBackend.ETCD + self._backend_state.cleanup_needed = False + self._save(self._backend_state) def is_peer_managed(self) -> bool: """Return whether the peer backend should process this unit's queue.""" @@ -168,55 +209,63 @@ def is_etcd_managed(self) -> bool: return self.backend == ProcessingBackend.ETCD -@dataclass -class Operation: +class Operation(BaseModel): """A single queued operation.""" + model_config = ConfigDict(use_enum_values=True) + callback_id: str requested_at: datetime - max_retry: int | None - 
attempt: int - result: OperationResult | None - kwargs: dict[str, Any] = field(default_factory=dict[str, Any]) + max_retry: int | None = None + attempt: int = 0 + result: OperationResult | None = None + kwargs: dict[str, Any] = Field(default_factory=dict) + @field_validator('callback_id') @classmethod - def _validate_fields( - cls, callback_id: Any, kwargs: Any, requested_at: Any, max_retry: Any, attempt: Any - ) -> None: - """Validate the class attributes.""" - if not isinstance(callback_id, str) or not callback_id.strip(): + def validate_callback_id(cls, value: str) -> str: + if not value.strip(): raise ValueError('callback_id must be a non-empty string') + return value - if not isinstance(kwargs, dict): - raise ValueError('kwargs must be a dict') + @field_validator('kwargs') + @classmethod + def validate_kwargs(cls, value: dict[str, Any]) -> dict[str, Any]: try: - json.dumps(kwargs) + json.dumps(value) except TypeError as e: raise ValueError(f'kwargs must be JSON-serializable: {e}') from e + return value - if not isinstance(requested_at, datetime): - raise ValueError('requested_at must be a datetime') + @field_serializer('kwargs') + def serialize_kwargs(self, value: dict[str, Any]) -> dict[str, Any]: + """Ensure deterministic ordering of kwargs.""" + return dict(sorted(value.items())) - if max_retry is not None: - if not isinstance(max_retry, int): - raise ValueError('max_retry must be an int') - if max_retry < 0: - raise ValueError('max_retry must be >= 0') + @field_validator('max_retry') + @classmethod + def validate_max_retry(cls, value: int | None) -> int | None: + if value is not None and value < 0: + raise ValueError('max_retry must be >= 0') + return value - if not isinstance(attempt, int): - raise ValueError('attempt must be an int') - if attempt < 0: + @field_validator('attempt') + @classmethod + def validate_attempt(cls, value: int) -> int: + if value < 0: raise ValueError('attempt must be >= 0') + return value - def __post_init__(self) -> None: - 
"""Validate the class attributes.""" - self._validate_fields( - self.callback_id, - self.kwargs, - self.requested_at, - self.max_retry, - self.attempt, - ) + @field_validator('requested_at', mode='before') + @classmethod + def validate_requested_at(cls, value: Any) -> Any: + if isinstance(value, str): + return parse_timestamp(value) + return value + + @field_serializer('requested_at') + def serialize_requested_at(self, value: datetime) -> str: + return datetime_to_str(value) @classmethod def create( @@ -235,20 +284,20 @@ def create( result=None, ) - def _to_dict(self) -> dict[str, str]: - """Dict form (string-only values).""" - return { - 'callback_id': self.callback_id, - 'kwargs': self._kwargs_to_json(), - 'requested_at': datetime_to_str(self.requested_at), - 'max_retry': '' if self.max_retry is None else str(self.max_retry), - 'attempt': str(self.attempt), - 'result': '' if self.result is None else self.result, - } - def to_string(self) -> str: - """Serialize to a string suitable for a Juju databag.""" - return json.dumps(self._to_dict(), separators=(',', ':')) + """Serialize to a single JSON object string.""" + return self.model_dump_json() + + @classmethod + def from_string(cls, data: str) -> 'Operation': + """Deserialize from a JSON string.""" + try: + return cls.model_validate_json(data) + except Exception as e: + logger.error('Failed to deserialize Operation from %s: %s', data, e) + raise RollingOpsDecodingError( + 'Failed to deserialize data to create an Operation' + ) from e def increase_attempt(self) -> None: """Increment the attempt counter.""" @@ -266,7 +315,11 @@ def complete(self) -> None: self.result = OperationResult.RELEASE def retry_release(self) -> None: - """Mark the operation for retry if it has not reached the max retry.""" + """Mark the operation to be retried later, releasing the lock. + + If the maximum retry count is reached, the operation is marked as + ``RELEASE`` and will not be retried further. 
+ """ self.increase_attempt() if self.is_max_retry_reached(): logger.warning('Operation max retry reached. Dropping.') @@ -275,7 +328,11 @@ def retry_release(self) -> None: self.result = OperationResult.RETRY_RELEASE def retry_hold(self) -> None: - """Mark the operation for retry if it has not reached the max retry.""" + """Mark the operation to be retried immediately, retaining the lock. + + If the maximum retry count is reached, the operation is marked as + ``RELEASE`` and will not be retried further. + """ self.increase_attempt() if self.is_max_retry_reached(): self.result = OperationResult.RELEASE @@ -288,41 +345,6 @@ def op_id(self) -> str: """Return the unique identifier for this operation.""" return f'{datetime_to_str(self.requested_at)}-{self.callback_id}' - @classmethod - def from_string(cls, data: str) -> 'Operation': - """Deserialize from a Juju databag string. - - Raises: - RollingOpsDecodingError: if data cannot be deserialized. - """ - try: - obj = json.loads(data) - except json.JSONDecodeError as e: - logger.error('Failed to deserialize Operation from %s: %s', data, e) - raise RollingOpsDecodingError( - 'Failed to deserialize data to create an Operation' - ) from e - return cls.from_dict(obj) - - @classmethod - def from_dict(cls, data: dict[str, str]) -> 'Operation': - """Create an Operation from its dict (etcd) representation.""" - try: - return cls( - callback_id=data['callback_id'], - requested_at=parse_timestamp(data['requested_at']), # type: ignore[reportArgumentType] - max_retry=int(data['max_retry']) if data.get('max_retry') else None, - attempt=int(data['attempt']), - kwargs=json.loads(data['kwargs']) if data.get('kwargs') else {}, - result=OperationResult(data['result']) if data.get('result') else None, - ) - - except (json.JSONDecodeError, KeyError, TypeError, ValueError) as e: - logger.error('Failed to deserialize Operation from %s: %s', data, e) - raise RollingOpsDecodingError( - 'Failed to deserialize data to create an Operation' - ) 
from e - def _kwargs_to_json(self) -> str: """Deterministic JSON serialization for kwargs.""" return json.dumps(self.kwargs, sort_keys=True, separators=(',', ':')) @@ -338,20 +360,25 @@ def __hash__(self) -> int: return hash((self.callback_id, self._kwargs_to_json())) -class OperationQueue: +class OperationQueue(RootModel[list[Operation]]): """In-memory FIFO queue of Operations with encode/decode helpers for storing in a databag.""" - def __init__(self, operations: list[Operation] | None = None): - self.operations: list[Operation] = list(operations or []) + def __init__(self, operations: list[Operation] | None = None) -> None: + super().__init__(root=operations or []) # pyright: ignore[reportUnknownMemberType] + + @property + def operations(self) -> list[Operation]: + """Return the underlying list of operations.""" + return self.root def __len__(self) -> int: """Return the number of operations in the queue.""" - return len(self.operations) + return len(self.root) @property def empty(self) -> bool: """Return True if there are no queued operations.""" - return not self.operations + return not self.root def peek(self) -> Operation | None: """Return the first operation in the queue if it exists.""" @@ -379,48 +406,64 @@ def enqueue(self, operation: Operation) -> None: self.operations.append(operation) def to_string(self) -> str: - """Encode entire queue to a single string.""" - items = [op.to_string() for op in self.operations] - return json.dumps(items, separators=(',', ':')) + """Encode entire queue to a single JSON string.""" + return self.model_dump_json() @classmethod def from_string(cls, data: str) -> 'OperationQueue': - """Decode queue from a string. + """Decode a queue from a JSON string. + + Args: + data: Serialized queue as a JSON array of operation objects. + + Returns: + The decoded operation queue. Raises: - RollingOpsDecodingError: if data cannot be deserialized. + RollingOpsDecodingError: If the queue cannot be deserialized. 
""" if not data: - return cls() + return cls([]) try: - items = json.loads(data) - except json.JSONDecodeError as e: + return cls.model_validate_json(data) + except Exception as e: logger.error( - 'Failed to deserialize data to create an OperationQueue from %s: %s', data, e + 'Failed to deserialize data to create an OperationQueue from %s: %s', + data, + e, ) raise RollingOpsDecodingError( 'Failed to deserialize data to create an OperationQueue.' ) from e - if not isinstance(items, list) or not all(isinstance(s, str) for s in items): # type: ignore[reportUnknownVariableType] - raise RollingOpsDecodingError( - 'OperationQueue string must decode to a JSON list of strings.' - ) - - operations = [Operation.from_string(s) for s in items] # type: ignore[reportUnknownVariableType] - return cls(operations) -@dataclass +@dataclass(frozen=True) class RollingOpsState: """Snapshot of the rolling-ops state for a unit. - This aggregates the current status, the backend responsible for - processing operations, and the unit's operation queue. + This object provides a view of the rolling-ops system from the perspective + of a single unit. + + This state is intended for decision-making in charm logic + + The `processing_backend` reflects the backend currently selected + for execution. It may change dynamically (e.g. fallback from etcd + to peer). + The `operations` queue always reflects the peer-backed state, which + acts as the source of truth and fallback mechanism. + When `status` is UNAVAILABLE, the unit cannot currently participate + in rolling operations due to missing relations or backend failures. + + Attributes: + status: High-level rolling-ops status for the unit. + processing_backend: Backend currently responsible for executing + operations (e.g. ETCD or PEER). + operations: The unit's operation queue. 
""" status: RollingOpsStatus - processing_backend: ProcessingBackend | None + processing_backend: ProcessingBackend operations: OperationQueue @@ -444,7 +487,7 @@ def acquire(self, timeout: int | None) -> None: Raises: TimeoutError: If the lock could not be acquired within the timeout. """ - pass + raise NotImplementedError @abstractmethod def release(self) -> None: @@ -453,4 +496,4 @@ def release(self) -> None: Implementations must ensure that only the lock owner can release the lock and that any associated resources are cleaned up. """ - pass + raise NotImplementedError diff --git a/rollingops/src/charmlibs/rollingops/common/_utils.py b/rollingops/src/charmlibs/rollingops/common/_utils.py index fbf819bc0..dfdddb221 100644 --- a/rollingops/src/charmlibs/rollingops/common/_utils.py +++ b/rollingops/src/charmlibs/rollingops/common/_utils.py @@ -29,6 +29,9 @@ logger = logging.getLogger(__name__) T = TypeVar('T') +LOCK_GRANTED_HOOK_NAME = 'rollingops_lock_granted' +ETCD_FAILED_HOOK_NAME = 'rollingops_etcd_failed' + @retry( retry=retry_if_exception_type((PebbleConnectionError, pebble.APIError, pebble.ChangeError)), @@ -53,39 +56,59 @@ def parse_timestamp(timestamp: str) -> datetime | None: return None -def datetime_to_str(datetime: datetime) -> str: - return str(datetime.timestamp()) +def datetime_to_str(dt: datetime) -> str: + return str(dt.timestamp()) -def setup_logging(log_file: str) -> None: +def setup_logging( + log_file: str, + *, + unit_name: str, + cluster_id: str | None = None, + owner: str | None = None, +) -> None: """Configure logging with file rotation. This sets up the root logger to write INFO-level (and above) logs to a rotating file handler. Log files are capped at 10 MB each, - with up to 3 backup files retained. + with up to 10 backup files retained. This functions is used in the context of the background process. Args: log_file: Path to the log file where logs should be written. + unit_name: Juju unit name associated with the background process. 
+ cluster_id: Optional etcd cluster identifier. + owner: Optional worker owner identifier. """ handler = RotatingFileHandler( log_file, maxBytes=10 * 1024 * 1024, # 10 MB - backupCount=3, + backupCount=10, ) formatter = logging.Formatter( - '%(asctime)s [%(levelname)s] [%(process)d] %(name)s: %(message)s' + '%(asctime)s [%(levelname)s] [%(process)d] ' + '[unit=%(unit_name)s cluster=%(cluster_id)s owner=%(owner)s] ' + '%(name)s: %(message)s' ) handler.setFormatter(formatter) + def add_context(record: logging.LogRecord) -> bool: + record.unit_name = unit_name + record.cluster_id = cluster_id or '-' + record.owner = owner or '-' + return True + + handler.addFilter(add_context) + root = logging.getLogger() root.setLevel(logging.INFO) + root.handlers.clear() root.addHandler(handler) -def dispatch_hook(unit_name: str, charm_dir: str, hook_name: str) -> None: +def _dispatch_hook(unit_name: str, charm_dir: str, hook_name: str) -> None: """Execute a Juju hook on a specific unit via juju-exec. This function triggers a charm hook by invoking the charm's `dispatch` @@ -107,7 +130,7 @@ def dispatch_hook(unit_name: str, charm_dir: str, hook_name: str) -> None: def dispatch_lock_granted(unit_name: str, charm_dir: str) -> None: - """Dispatch the 'rollingops_lock_granted' hook on a unit. + """Dispatch the LOCK_GRANTED_HOOK_NAME hook on a unit. Args: unit_name: The Juju unit name (e.g., "app/0"). @@ -116,5 +139,18 @@ def dispatch_lock_granted(unit_name: str, charm_dir: str) -> None: Raises: subprocess.CalledProcessError: If the hook execution fails. """ - hook_name = 'rollingops_lock_granted' - dispatch_hook(unit_name, charm_dir, hook_name) + _dispatch_hook(unit_name, charm_dir, LOCK_GRANTED_HOOK_NAME) + + +def dispatch_etcd_failed(unit_name: str, charm_dir: str) -> None: + """Dispatch the fatal etcd-worker failure hook. + + This notifies the charm that the etcd worker encountered an + unrecoverable error so that higher-level logic can fall back to the + peer backend. 
+ + Args: + unit_name: Name of the unit dispatching the hook. + charm_dir: Path to the charm root directory. + """ + _dispatch_hook(unit_name, charm_dir, ETCD_FAILED_HOOK_NAME) diff --git a/rollingops/src/charmlibs/rollingops/etcd/_backend.py b/rollingops/src/charmlibs/rollingops/etcd/_backend.py index c413e0100..ebc616d97 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_backend.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_backend.py @@ -86,7 +86,7 @@ def __init__( self.worker = EtcdRollingOpsAsyncWorker( charm, peer_relation_name=peer_relation_name, owner=owner, cluster_id=cluster_id ) - self.keys = RollingOpsKeys.for_owner(cluster_id, owner) + self.keys = RollingOpsKeys.for_owner(cluster_id=cluster_id, owner=owner) self.shared_certificates = SharedClientCertificateManager( charm, @@ -99,12 +99,10 @@ def __init__( cluster_id=self.keys.cluster_prefix, shared_certificates=self.shared_certificates, ) - - self.keys = RollingOpsKeys.for_owner(cluster_id=cluster_id, owner=owner) self._async_lock = EtcdLock(lock_key=self.keys.lock_key, owner=owner) self._sync_lock = EtcdLock(lock_key=self.keys.lock_key, owner=f'{owner}:sync') - self.operations = ManagerOperationStore(self.keys, owner) - self._lease = None + self._lease: EtcdLease | None = None + self.operations_store = ManagerOperationStore(self.keys, owner) self.framework.observe( charm.on[self.peer_relation_name].relation_departed, self._on_peer_relation_departed @@ -165,10 +163,10 @@ def enqueue_operation(self, operation: Operation) -> None: backend_state = UnitBackendState(self.model, self.peer_relation_name, self.model.unit) if backend_state.cleanup_needed: - self.operations.clean_up() + self.operations_store.clean_up() backend_state.clear_fallback() - self.operations.request(operation) + self.operations_store.request(operation) def ensure_processing(self): """Ensure that the etcd worker process is running. 
@@ -178,6 +176,10 @@ def ensure_processing(self): """ self.worker.start() + def is_processing(self) -> bool: + """Return whether the etcd worker process is currently running.""" + return self.worker.is_running() + def _on_etcd_relation_created(self, event: RelationCreatedEvent) -> None: """Validate that the etcdctl command is available when etcd is related. @@ -244,7 +246,7 @@ def request_async_lock( kwargs = {} operation = Operation.create(callback_id, kwargs, max_retry) - self.operations.request(operation) + self.operations_store.request(operation) self.worker.start() def _on_run_with_lock(self) -> RunWithLockOutcome: @@ -269,7 +271,7 @@ def _on_run_with_lock(self) -> RunWithLockOutcome: logger.info('Lock is not granted. Operation will not run.') return RunWithLockOutcome(status=RunWithLockStatus.NOT_GRANTED) - if not (operation := self.operations.peek_current()): + if not (operation := self.operations_store.peek_current()): logger.info('Lock granted but there is no operation to run.') return RunWithLockOutcome(status=RunWithLockStatus.NO_OPERATION) @@ -278,7 +280,7 @@ def _on_run_with_lock(self) -> RunWithLockOutcome: 'Operation %s target was not found. Releasing operation without retry.', operation.callback_id, ) - self.operations.finalize(operation, OperationResult.RELEASE) + self.operations_store.finalize(operation, OperationResult.RELEASE) return RunWithLockOutcome( status=RunWithLockStatus.MISSING_CALLBACK, op_id=operation.op_id, @@ -305,7 +307,15 @@ def _on_run_with_lock(self) -> RunWithLockOutcome: logger.info('Finished %s. 
Lock will be released.', operation.callback_id) result = OperationResult.RELEASE - self.operations.finalize(operation, result) + try: + self.operations_store.finalize(operation, result) + except Exception: + logger.exception('Failed to commit operation %s to etcd.', operation.callback_id) + return RunWithLockOutcome( + status=RunWithLockStatus.EXECUTED_NOT_COMMITTED, + op_id=operation.op_id, + result=result, + ) return RunWithLockOutcome( status=RunWithLockStatus.EXECUTED, op_id=operation.op_id, @@ -333,9 +343,12 @@ def acquire_sync_lock(self, timeout: int | None) -> None: try: self._lease.grant() + + if self._lease.id is None: + raise RollingOpsSyncLockError('Failed to grant an etcd lease.') while True: try: - if self._sync_lock.try_acquire(self._lease.id): # type: ignore[reportArgumentType] + if self._sync_lock.try_acquire(self._lease.id): logger.info('etcd lock acquired.') return except Exception: @@ -367,7 +380,7 @@ def get_status(self) -> RollingOpsStatus: unit's queued operation state. Returned values: - - INVALID: the peer or etcd relation is missing + - UNAVAILABLE: etcd backend is not available - GRANTED: the async lock is currently held by this unit - WAITING: this unit has queued work but does not hold the lock - IDLE: this unit has no pending work @@ -375,15 +388,13 @@ def get_status(self) -> RollingOpsStatus: Returns: The current rolling-ops status for this unit. 
""" - if self._peer_relation is None or self._etcd_relation is None: - return RollingOpsStatus.INVALID - - etcdctl.ensure_initialized() + if self._peer_relation is None or self._etcd_relation is None or not self.is_available(): + return RollingOpsStatus.UNAVAILABLE if self._async_lock.is_held(): return RollingOpsStatus.GRANTED - if self.operations.has_pending_work(): + if self.operations_store.has_pending_work(): return RollingOpsStatus.WAITING return RollingOpsStatus.IDLE diff --git a/rollingops/src/charmlibs/rollingops/etcd/_etcd.py b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py index 23ea9cfdc..b1278fe4c 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_etcd.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py @@ -14,19 +14,22 @@ """Classes that manage etcd concepts.""" -import json import logging import subprocess import time import charmlibs.rollingops.etcd._etcdctl as etcdctl -from charmlibs.rollingops.common._exceptions import RollingOpsEtcdTransactionError +from charmlibs.rollingops.common._exceptions import ( + RollingOpsEtcdctlFatalError, + RollingOpsEtcdctlParseError, + RollingOpsEtcdTransactionError, +) from charmlibs.rollingops.common._models import Operation, OperationResult from charmlibs.rollingops.etcd._models import RollingOpsKeys logger = logging.getLogger(__name__) -LOCK_LEASE_TTL = 60 +LOCK_LEASE_TTL = '60' class EtcdLease: @@ -38,9 +41,15 @@ def __init__(self): def grant(self) -> None: """Create a new lease and start the keep-alive process.""" - res = etcdctl.run('lease', 'grant', str(LOCK_LEASE_TTL)) - # parse: "lease 694d9c9aeca3422a granted with TTL(1800s)" + res = etcdctl.run('lease', 'grant', LOCK_LEASE_TTL) + # parse: "lease 694d9c9aeca3422a granted with TTL(60s)" parts = res.split() + try: + lease_id = parts[1] + int(lease_id, 16) + except (IndexError, ValueError) as e: + raise RollingOpsEtcdctlParseError(f'Invalid lease output: {res}') from e + self.id = parts[1] logger.info('%s', res) self._start_lease_keepalive() @@ 
-128,6 +137,9 @@ def try_acquire(self, lease_id: str) -> bool: Returns: True if the lock was successfully acquired, otherwise False. """ + if not self.lock_key or not self.owner or not lease_id: + raise RollingOpsEtcdctlFatalError('Invalid input for lock acquire transaction.') + txn = f"""\ version("{self.lock_key}") = "0" @@ -144,6 +156,9 @@ def release(self) -> None: the current owner. This prevents one process from accidentally releasing a lock held by another owner. """ + if not self.lock_key or not self.owner: + raise RollingOpsEtcdctlFatalError('Invalid input for lock release transaction.') + txn = f"""\ value("{self.lock_key}") = "{self.owner}" @@ -154,7 +169,9 @@ def release(self) -> None: etcdctl.txn(txn) def is_held(self) -> bool: - """Check whether the lock is currently held by this owner.""" + """Check whether the lock is currently held by the owner.""" + if not self.lock_key or not self.owner: + raise RollingOpsEtcdctlFatalError('Invalid input for check lock ownership operation.') res = etcdctl.run('get', self.lock_key, '--print-value-only') return res == self.owner @@ -178,14 +195,14 @@ def peek(self) -> Operation | None: kv = etcdctl.get_first_key_value_pair(self.prefix) if kv is None: return None - return Operation.from_dict(kv.value) + return Operation.model_validate(kv.value) def _peek_last(self) -> Operation | None: """Return the last operation in the queue without removing it.""" kv = etcdctl.get_last_key_value_pair(self.prefix) if kv is None: return None - return Operation.from_dict(kv.value) + return Operation.model_validate(kv.value) def move_head(self, to_queue_prefix: str) -> bool: """Move the first operation in the queue to another queue. 
@@ -207,14 +224,14 @@ def move_head(self, to_queue_prefix: str) -> bool: op_id = kv.key.split('/')[-1] new_key = f'{to_queue_prefix}{op_id}' - data = json.dumps(kv.value) - value_escaped = data.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n') + op = Operation.model_validate(kv.value) + data = op.to_string() txn = f"""\ value("{self.lock_key}") = "{self.owner}" version("{kv.key}") != "0" - put "{new_key}" "{value_escaped}" + put "{new_key}" {data} del "{kv.key}" @@ -238,13 +255,12 @@ def move_operation(self, to_queue_prefix: str, operation: Operation) -> bool: new_key = f'{to_queue_prefix}{operation.op_id}' data = operation.to_string() - value_escaped = data.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n') txn = f"""\ value("{self.lock_key}") = "{self.owner}" version("{old_key}") != "0" - put "{new_key}" "{value_escaped}" + put "{new_key}" {data} del "{old_key}" @@ -256,7 +272,7 @@ def watch(self) -> Operation: while True: kv = etcdctl.get_first_key_value_pair(self.prefix) if kv is not None: - return Operation.from_dict(kv.value) + return Operation.model_validate(kv.value) time.sleep(10) def dequeue(self) -> bool: @@ -303,7 +319,7 @@ def enqueue(self, operation: Operation) -> None: op_str = operation.to_string() key = f'{self.prefix}{operation.op_id}' - etcdctl.run('put', key, op_str) + etcdctl.run('put', key, cmd_input=op_str) logger.info('Operation %s added to the etcd queue.', operation.callback_id) def clear(self) -> None: @@ -360,19 +376,27 @@ def has_completed(self) -> bool: """ return self._completed.peek() is not None - def claim_next(self) -> None: + def claim_next(self) -> str: """Move the next pending operation to the in-progress queue. This operation is performed atomically and only succeeds if: - the lock is still held by this owner - the head of the pending queue has not changed + Returns: + The operation ID of the operation + Raises: RollingOpsEtcdTransactionError: if the transaction failed. 
""" if not self._pending.move_head(self._inprogress.prefix): raise RollingOpsEtcdTransactionError('Failed to move operation to in progress.') + operation = self._inprogress.peek() + if operation is None: + raise RollingOpsEtcdTransactionError('Failed to get the ID of the next operation.') + return operation.op_id + def wait_until_completed(self) -> Operation: """Block until at least one operation appears in the completed queue.""" return self._completed.watch() diff --git a/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py b/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py index c191d608c..e5eb4f1c3 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py @@ -278,11 +278,12 @@ def _run_checked(*args: str, cmd_input: str | None = None) -> subprocess.Complet return res -def run(*args: str) -> str: +def run(*args: str, cmd_input: str | None = None) -> str: """Execute an etcdctl command. Args: args: List of arguments to pass to etcdctl. + cmd_input: value to use as input when running the command. Returns: The stdout of the command, stripped, or None if execution failed. @@ -293,7 +294,7 @@ def run(*args: str) -> str: PebbleConnectionError: if the remote container cannot be reached. RollingOpsEtcdctlError: etcdctl command error. """ - return _run_checked(*args).stdout.strip() + return _run_checked(*args, cmd_input=cmd_input).stdout.strip() def _get_key_value_pair(key_prefix: str, *extra_args: str) -> EtcdKV | None: @@ -388,6 +389,19 @@ def txn(txn_input: str) -> bool: RollingOpsEtcdNotConfiguredError: if etcdctl is not configured. PebbleConnectionError: if the remote container cannot be reached. RollingOpsEtcdctlError: etcdctl command error. 
+ RollingOpsEtcdctlParseError: if invalid response is found """ res = _run_checked('txn', cmd_input=txn_input) - return 'SUCCESS' in res.stdout + + lines = res.stdout.splitlines() + if not lines: + raise RollingOpsEtcdctlParseError('Empty txn response') + + first_line = lines[0].strip() + + if first_line == 'SUCCESS': + return True + if first_line == 'FAILURE': + return False + + raise RollingOpsEtcdctlParseError(f'Unexpected txn response: {res.stdout}') diff --git a/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py b/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py index 8e6ad2190..9d45b183a 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py @@ -17,13 +17,19 @@ import time from charmlibs.rollingops.common._models import OperationResult -from charmlibs.rollingops.common._utils import dispatch_hook, dispatch_lock_granted, setup_logging +from charmlibs.rollingops.common._utils import ( + ETCD_FAILED_HOOK_NAME, + dispatch_etcd_failed, + dispatch_lock_granted, + setup_logging, +) from charmlibs.rollingops.etcd._etcd import ( EtcdLease, EtcdLock, WorkerOperationStore, ) from charmlibs.rollingops.etcd._models import RollingOpsKeys +from charmlibs.rollingops.etcd._worker import ETCD_LOG_FILENAME logger = logging.getLogger(__name__) @@ -36,21 +42,6 @@ class RollingOpsEtcdInconsistencyError(Exception): """Raised when unexpected or inconsistent etcd operation state is found.""" -def _dispatch_etcd_failed(unit_name: str, charm_dir: str) -> None: - """Dispatch the fatal etcd-worker failure hook. - - This notifies the charm that the etcd worker encountered an - unrecoverable error so that higher-level logic can fall back to the - peer backend. - - Args: - unit_name: Name of the unit dispatching the hook. - charm_dir: Path to the charm root directory. 
- """ - hook_name = 'rollingops_etcd_failed' - dispatch_hook(unit_name, charm_dir, hook_name) - - def main(): """Run the etcd rolling-ops worker loop. @@ -66,27 +57,44 @@ def main(): - any other result: remove the completed operation and release the lock If the worker detects invalid etcd queue state or encounters an - unrecoverable error, it dispatches the `rollingops_etcd_failed` + unrecoverable error, it dispatches the ETCD_FAILED_HOOK_NAME hook so the charm can fall back to peer-based processing. The worker always attempts to revoke its lease and release the lock before exiting. """ - setup_logging('/var/log/etcd_rollingops_worker.log') + parser = argparse.ArgumentParser(description='RollingOps etcd worker') + parser.add_argument( + '--unit-name', + type=str, + required=True, + help='Juju unit name (e.g. app/0)', + ) + parser.add_argument( + '--charm-dir', + type=str, + required=True, + help='Path to the charm directory', + ) - parser = argparse.ArgumentParser() - parser.add_argument('--unit-name', required=True) - parser.add_argument('--charm-dir', required=True) - parser.add_argument('--owner', required=True) - parser.add_argument('--cluster-id', required=True) + parser.add_argument( + '--owner', + type=str, + required=True, + help='Unique owner identifier for the unit', + ) + parser.add_argument( + '--cluster-id', + type=str, + required=True, + help='Cluster identifier', + ) args = parser.parse_args() - logger.info( - 'Worker starting (unit=%s owner=%s cluster=%s)', - args.unit_name, - args.owner, - args.cluster_id, + setup_logging( + ETCD_LOG_FILENAME, unit_name=args.unit_name, owner=args.owner, cluster_id=args.cluster_id ) + logger.info('Starting worker.') time.sleep(INITIAL_SLEEP) @@ -104,24 +112,29 @@ def main(): time.sleep(NEXT_OP_SLEEP) continue + logger.info('Operation found in the pending queue.') + if not lock.is_held(): if lease.id is None: lease.grant() - logger.info('Try to get lock.') - if not lock.try_acquire(lease.id): # pyright: 
ignore[reportArgumentType] + if lease.id is None: + raise RollingOpsEtcdInconsistencyError('Invalid lease ID found.') + + logger.info('Try to get lock using lease %s.', lease.id) + while not lock.try_acquire(lease.id): time.sleep(LOCK_ACQUIRE_SLEEP) continue - logger.info('Lock granted.') + logger.info('Lock granted using lease %s.', lease.id) - operations.claim_next() + op_id = operations.claim_next() dispatch_lock_granted(args.unit_name, args.charm_dir) - logger.info('Waiting for operation to be finished.') + logger.info('Waiting for operation %s to be finished.', op_id) operation = operations.wait_until_completed() - logger.info('Operation %s completed with %s', operation.callback_id, operation.result) + logger.info('Operation %s completed with %s', operation.op_id, operation.result) match operation.result: case OperationResult.RETRY_HOLD: operations.requeue_completed() @@ -133,27 +146,31 @@ def main(): case _: operations.delete_completed() + lease_id = lease.id lease.revoke() lock.release() - logger.info('Lease revoked and lock released.') + logger.info('Lease %s revoked and lock released.', lease_id) time.sleep(NEXT_OP_SLEEP) except Exception as e: logger.exception('Fatal etcd worker error: %s', e) try: - _dispatch_etcd_failed(args.unit_name, args.charm_dir) + dispatch_etcd_failed(args.unit_name, args.charm_dir) except Exception: - logger.exception('Failed to dispatch rollingops_etcd_failed hook.') + logger.exception('Failed to dispatch %s hook.', ETCD_FAILED_HOOK_NAME) finally: + lease_id = lease.id try: lease.revoke() + logger.info('Lease %s revoked.', lease_id) except Exception: - logger.exception('Failed to revoke lease during worker shutdown.') + logger.exception('Failed to revoke lease %s during worker shutdown.', lease_id) try: lock.release() + logger.info('Lock released.') except Exception: logger.exception('Failed to release lock during worker shutdown.') diff --git a/rollingops/src/charmlibs/rollingops/etcd/_worker.py 
b/rollingops/src/charmlibs/rollingops/etcd/_worker.py index 00f193109..8240b4378 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_worker.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_worker.py @@ -23,7 +23,7 @@ logger = logging.getLogger(__name__) -WORKER_PID_FIELD = 'etcd-rollingops-worker-pid' +ETCD_LOG_FILENAME = '/var/log/etcd_rollingops_worker.log' class EtcdRollingOpsAsyncWorker(BaseRollingOpsAsyncWorker): @@ -35,8 +35,8 @@ class EtcdRollingOpsAsyncWorker(BaseRollingOpsAsyncWorker): manage its own worker lifecycle. """ - _pid_field = WORKER_PID_FIELD - _log_filename = 'etcd_rollingops_worker' + _pid_field = 'etcd-rollingops-worker-pid' + _log_filename = ETCD_LOG_FILENAME def __init__(self, charm: CharmBase, peer_relation_name: str, owner: str, cluster_id: str): super().__init__(charm, 'etcd-rollingops-async-worker', peer_relation_name) @@ -66,7 +66,8 @@ def _worker_args(self) -> list[str]: self._cluster_id, ] - def _get_pid_str(self) -> str: + @property + def _pid(self) -> int | None: """Return the stored worker process PID for this unit. The PID is stored in the unit databag because each unit runs its own @@ -74,24 +75,35 @@ def _get_pid_str(self) -> str: that worker lifecycle management is isolated per unit. Returns: - The worker process PID as a string, or an empty string if not set. + The worker process PID, or None if not set. """ if self._relation is None: - return '' - return self._relation.data[self.model.unit].get(self._pid_field, '') + return None + pid = self._relation.data[self.model.unit].get(self._pid_field, '') - def _set_pid_str(self, pid: str) -> None: + try: + pid = int(pid) + except (ValueError, TypeError): + logger.info('Missing PID or invalid PID found in etcd worker state.') + pid = None + + return pid + + @_pid.setter + def _pid(self, value: int | None) -> None: """Persist the worker process PID in the unit databag. 
The PID is stored per unit to reflect that each unit owns and manages its own worker process when using the etcd backend. Args: - pid: The process identifier to store. + value: The process identifier to store. """ if self._relation is None: return - self._relation.data[self.model.unit].update({self._pid_field: pid}) + self._relation.data[self.model.unit].update({ + self._pid_field: '' if value is None else str(value) + }) def _on_existing_worker(self, pid: int) -> bool: """Executed on detection of an already running worker for this unit. diff --git a/rollingops/src/charmlibs/rollingops/peer/_backend.py b/rollingops/src/charmlibs/rollingops/peer/_backend.py index 4f324f710..310285c95 100644 --- a/rollingops/src/charmlibs/rollingops/peer/_backend.py +++ b/rollingops/src/charmlibs/rollingops/peer/_backend.py @@ -221,7 +221,6 @@ def __init__( charm.on[self.relation_name].relation_departed, self._on_relation_departed ) self.framework.observe(charm.on.leader_elected, self._process_locks) - self.framework.observe(charm.on.update_status, self._on_rollingops_lock_granted) @property def _relation(self) -> Relation | None: @@ -287,7 +286,6 @@ def _on_rollingops_lock_granted(self, event: EventBase) -> None: """ if not self._relation: return - logger.info('Received a rolling-ops lock granted event.') lock = self._lock() operations = self._operations(self.model.unit) if operations.should_run(lock): @@ -555,7 +553,11 @@ def mirror_outcome(self, outcome: RunWithLockOutcome) -> None: 'Mismatch between the etcd and peer operation queue.' ) - case RunWithLockStatus.MISSING_CALLBACK | RunWithLockStatus.EXECUTED: + case ( + RunWithLockStatus.MISSING_CALLBACK + | RunWithLockStatus.EXECUTED + | RunWithLockStatus.EXECUTED_NOT_COMMITTED + ): self._operations(self.model.unit).mirror_result(outcome.op_id, outcome.result) # type: ignore[reportArgumentType] case _: raise RollingOpsDecodingError( @@ -569,7 +571,7 @@ def get_status(self) -> RollingOpsStatus: and from the shared peer lock state. 
Returned values: - - INVALID: the peer relation does not exist + - UNAVAILABLE: the peer relation does not exist - GRANTED: the current unit holds the peer lock - WAITING: the current unit has queued work but does not hold the lock - IDLE: the current unit has no pending work @@ -578,7 +580,7 @@ def get_status(self) -> RollingOpsStatus: The current rolling-ops status for this unit. """ if self._relation is None: - return RollingOpsStatus.INVALID + return RollingOpsStatus.UNAVAILABLE lock = self._lock() operations = self._operations(self.model.unit) diff --git a/rollingops/src/charmlibs/rollingops/peer/_models.py b/rollingops/src/charmlibs/rollingops/peer/_models.py index 2ccce1271..d11a3334d 100644 --- a/rollingops/src/charmlibs/rollingops/peer/_models.py +++ b/rollingops/src/charmlibs/rollingops/peer/_models.py @@ -112,9 +112,7 @@ def __init__(self, model: Model, relation_name: str): self._relation = relation self._app = model.app - - def _load(self) -> PeerAppData: - return self._relation.load(PeerAppData, self._app, decoder=lambda s: s) + self._app_data = self._relation.load(PeerAppData, self._app, decoder=lambda s: s) def _save(self, data: PeerAppData) -> None: self._relation.save(data, self._app, encoder=str) @@ -122,26 +120,24 @@ def _save(self, data: PeerAppData) -> None: @property def granted_unit(self) -> str: """Return the unit name currently holding the grant, if any.""" - return self._load().granted_unit + return self._app_data.granted_unit @property def granted_at(self) -> datetime | None: """Return the timestamp when the grant was issued, if any.""" - return self._load().granted_at_dt + return self._app_data.granted_at_dt def grant(self, unit_name: str) -> None: """Grant the lock to the provided unit.""" - data = self._load() - data.granted_unit = unit_name - data.granted_at_dt = now_timestamp() - self._save(data) + self._app_data.granted_unit = unit_name + self._app_data.granted_at_dt = now_timestamp() + self._save(self._app_data) def release(self) 
-> None: """Clear the current grant.""" - data = self._load() - data.granted_unit = '' - data.granted_at_dt = None - self._save(data) + self._app_data.granted_unit = '' + self._app_data.granted_at_dt = None + self._save(self._app_data) def is_granted(self, unit_name: str) -> bool: """Return whether the provided unit currently holds the grant.""" @@ -159,9 +155,7 @@ def __init__(self, model: Model, relation_name: str, unit: Unit): self._relation = relation self.unit = unit self._backend_state = UnitBackendState(model, relation_name, unit) - - def _load(self) -> PeerUnitData: - return self._relation.load(PeerUnitData, self.unit, decoder=lambda s: s) + self._unit_data = self._relation.load(PeerUnitData, self.unit, decoder=lambda s: s) def _save(self, data: PeerUnitData) -> None: self._relation.save(data, self.unit, encoder=str) @@ -173,20 +167,20 @@ def is_peer_managed(self) -> bool: @property def intent(self) -> LockIntent: """Return the current unit intent.""" - return self._load().intent + return self._unit_data.intent @property def executed_at(self) -> datetime | None: """Return the last execution timestamp for this unit.""" - return self._load().executed_at_dt + return self._unit_data.executed_at_dt @property def queue(self) -> OperationQueue: - return self._load().queue + return self._unit_data.queue def get_current(self) -> Operation | None: """Return the head operation, if any.""" - return self._load().queue.peek() + return self._unit_data.queue.peek() def has_pending_work(self) -> bool: """Return whether this unit still has queued work.""" @@ -194,7 +188,7 @@ def has_pending_work(self) -> bool: def request(self, operation: Operation) -> None: """Enqueue an operation and mark this unit as requesting the lock.""" - data = self._load() + data = self._unit_data queue = data.queue previous_length = len(queue) @@ -211,14 +205,14 @@ def request(self, operation: Operation) -> None: data.queue = queue if len(queue) == 1: data.intent = LockIntent.REQUEST + 
self._unit_data = data self._save(data) logger.info('Operation %s added to the peer queue.', operation.callback_id) def finish(self, result: OperationResult) -> None: """Persist the result of executing the current operation.""" - data = self._load() - self._apply_result_to_data(data, result) - self._save(data) + self._apply_result_to_data(self._unit_data, result) + self._save(self._unit_data) def _apply_result_to_data( self, @@ -328,7 +322,7 @@ def mirror_result(self, op_id: str, result: OperationResult) -> None: Raises: RollingOpsDecodingError: if there is an inconsistency found. """ - data = self._load() + data = self._unit_data current = data.queue.peek() if current is None: diff --git a/rollingops/src/charmlibs/rollingops/peer/_rollingops.py b/rollingops/src/charmlibs/rollingops/peer/_rollingops.py index 05bbb9081..8dae3c0f1 100644 --- a/rollingops/src/charmlibs/rollingops/peer/_rollingops.py +++ b/rollingops/src/charmlibs/rollingops/peer/_rollingops.py @@ -18,15 +18,26 @@ import time from charmlibs.rollingops.common._utils import dispatch_lock_granted, setup_logging +from charmlibs.rollingops.peer._worker import PEER_LOG_FILENAME def main(): """Juju hook event dispatcher.""" - parser = argparse.ArgumentParser() - parser.add_argument('--unit-name', required=True) - parser.add_argument('--charm-dir', required=True) + parser = argparse.ArgumentParser(description='RollingOps peer worker') + parser.add_argument( + '--unit-name', + type=str, + required=True, + help='Juju unit name (e.g. 
app/0)', + ) + parser.add_argument( + '--charm-dir', + type=str, + required=True, + help='Path to the charm directory', + ) args = parser.parse_args() - setup_logging('/var/log/peer_rollingops_worker.log') + setup_logging(PEER_LOG_FILENAME, unit_name=args.unit_name) # Sleep so that the leader unit can properly leave the hook and start a new one time.sleep(10) diff --git a/rollingops/src/charmlibs/rollingops/peer/_worker.py b/rollingops/src/charmlibs/rollingops/peer/_worker.py index 349154b8c..ca4da358c 100644 --- a/rollingops/src/charmlibs/rollingops/peer/_worker.py +++ b/rollingops/src/charmlibs/rollingops/peer/_worker.py @@ -26,6 +26,8 @@ logger = logging.getLogger(__name__) +PEER_LOG_FILENAME = '/var/log/peer_rollingops_worker.log' + class PeerRollingOpsAsyncWorker(BaseRollingOpsAsyncWorker): """Manage the peer-backed rolling-ops worker process. @@ -37,7 +39,7 @@ class PeerRollingOpsAsyncWorker(BaseRollingOpsAsyncWorker): """ _pid_field = 'peer-rollingops-worker-pid' - _log_filename = 'peer_rollingops_worker' + _log_filename = PEER_LOG_FILENAME def __init__(self, charm: CharmBase, relation_name: str): super().__init__(charm, 'peer-rollingops-async-worker', relation_name) @@ -57,31 +59,39 @@ def _worker_script_path(self) -> pathops.LocalPath: self._venv_site_packages() / 'charmlibs' / 'rollingops' / 'peer' / '_rollingops.py' ) - def _get_pid_str(self) -> str: - """Return the stored worker process PID as a string. + @property + def _pid(self) -> int | None: + """Return the stored worker process PID. The PID is persisted in the application databag of the peer relation. - If no relation is available or no PID is stored, an empty string is returned. Returns: - The worker process PID as a string, or an empty string if not set. + The worker process PID, or None if not set. 
""" if self._relation is None: - return '' - return self._app_data.get(self._pid_field, '') + return None + pid = self._app_data.get(self._pid_field, '') + + try: + pid = int(pid) + except (ValueError, TypeError): + pid = None + + return pid - def _set_pid_str(self, pid: str) -> None: + @_pid.setter + def _pid(self, value: int | None) -> None: """Persist the worker process PID in the peer relation databag. The PID is stored in the application databag because it is used to trigger rolling operations on the leader and the leader may change. Args: - pid: The process identifier to store. + value: The process identifier to store. """ if self._relation is None: return - self._app_data.update({self._pid_field: pid}) + self._app_data.update({self._pid_field: '' if value is None else str(value)}) def _on_existing_worker(self, pid: int) -> bool: """Handle the presence of an already running worker process. diff --git a/rollingops/tests/integration/charms/actions.yaml b/rollingops/tests/integration/charms/actions.yaml index 7128ad9f2..196a181ce 100644 --- a/rollingops/tests/integration/charms/actions.yaml +++ b/rollingops/tests/integration/charms/actions.yaml @@ -39,6 +39,6 @@ sync-restart: type: integer default: 0 timeout: - description: "Time (seconds) to wait before given up." + description: "Time (seconds) to wait before giving up." 
type: integer default: 60 diff --git a/rollingops/tests/integration/test_etcd_rolling_ops.py b/rollingops/tests/integration/test_etcd_rolling_ops.py index 21c84fe35..4172d416e 100644 --- a/rollingops/tests/integration/test_etcd_rolling_ops.py +++ b/rollingops/tests/integration/test_etcd_rolling_ops.py @@ -473,20 +473,28 @@ def test_lock_released_when_unit_removed(juju: jubilant.Juju, app_name: str) -> def test_actions_still_work_after_etcd_relation_removed( juju: jubilant.Juju, app_name: str ) -> None: - units = sorted(juju.status().apps[app_name].units.keys()) - for unit in units: + second_app = f'{app_name}-secondary' + primary_units = sorted(juju.status().apps[app_name].units.keys()) + secondary_units = sorted(juju.status().apps[second_app].units.keys()) + all_units: list[str] = primary_units + secondary_units + + for unit in all_units: remove_transition_file(juju, unit) + wait_for_etcdctl_config_file(juju, unit) juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) - unit_a = units[3] + unit_a = primary_units[3] - juju.run(unit_a, 'failed-restart', {'delay': 10, 'max-retry': 2}) - for i in range(3): - juju.run(unit_a, 'restart', {'delay': i}) + juju.run(unit_a, 'failed-restart', {'delay': 10, 'max-retry': 1}) + juju.run(unit_a, 'restart', {'delay': 1}) + juju.run(unit_a, 'restart', {'delay': 2}) juju.remove_relation(f'{app_name}:etcd', 'etcd:etcd-client') + unit_b = secondary_units[1] + juju.run(unit_b, 'restart', {'delay': 1}) + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) unit_a_events = get_unit_events(juju, unit_a) @@ -494,9 +502,20 @@ def test_actions_still_work_after_etcd_relation_removed( logger.info('unit_a_events %s', unit_a_events) - # During fallback if the execution is not fully committed to etcd, it may - # be re-executed on the peer context. 
- assert relevant_events.count('_failed_restart:start') >= 3, relevant_events - assert relevant_events.count('_failed_restart:retry_release') >= 3, relevant_events - assert relevant_events.count('_restart:start') >= 3, relevant_events - assert relevant_events.count('_restart:done') >= 3, relevant_events + assert relevant_events.count('_failed_restart:start') == 2, relevant_events + assert relevant_events.count('_failed_restart:retry_release') == 2, relevant_events + assert relevant_events.count('_restart:start') == 2, relevant_events + assert relevant_events.count('_restart:done') == 2, relevant_events + + unit_b_events = get_unit_events(juju, unit_b) + assert len(unit_b_events) == 3 + restart_events = [ + (e['event'], e['processing_backend']) + for e in unit_b_events + if not e['event'].startswith('action') + ] + + assert restart_events == [ + ('_restart:start', 'etcd'), + ('_restart:done', 'etcd'), + ], f'unexpected event sequence: {restart_events}' diff --git a/rollingops/tests/unit/conftest.py b/rollingops/tests/unit/conftest.py index bf96787fe..bb3c46f61 100644 --- a/rollingops/tests/unit/conftest.py +++ b/rollingops/tests/unit/conftest.py @@ -35,7 +35,6 @@ from charmlibs.rollingops import RollingOpsManager from charmlibs.rollingops.common._models import OperationResult from charmlibs.rollingops.etcd._models import SharedCertificate -from charmlibs.rollingops.peer._backend import PeerRollingOpsBackend VALID_CA_CERT_PEM = """-----BEGIN CERTIFICATE----- MIIC6DCCAdCgAwIBAgIUW42TU9LSjEZLMCclWrvSwAsgRtcwDQYJKoZIhvcNAQEL @@ -166,7 +165,7 @@ def certificates_manager_patches() -> Generator[dict[str, MagicMock], None, None } -class BaseRollingOpsTestCharm(ops.CharmBase): +class RollingOpsCharm(ops.CharmBase): def __init__(self, framework: ops.Framework): super().__init__(framework) @@ -176,16 +175,17 @@ def __init__(self, framework: ops.Framework): '_deferred_restart': self._deferred_restart, } - self.restart_manager = self._make_restart_manager(callback_targets) + 
self.restart_manager = RollingOpsManager( + charm=self, + peer_relation_name='restart', + etcd_relation_name='etcd', + cluster_id='cluster-12345', + callback_targets=callback_targets, + ) self.framework.observe(self.on.restart_action, self._on_restart_action) self.framework.observe(self.on.failed_restart_action, self._on_failed_restart_action) self.framework.observe(self.on.deferred_restart_action, self._on_deferred_restart_action) - def _make_restart_manager( - self, callback_targets: dict[str, Any] - ) -> PeerRollingOpsBackend | RollingOpsManager: - raise NotImplementedError - def _on_restart_action(self, event: ActionEvent) -> None: delay = event.params.get('delay') self.restart_manager.request_async_lock(callback_id='_restart', kwargs={'delay': delay}) @@ -218,31 +218,6 @@ def _deferred_restart(self, delay: int = 0) -> OperationResult: return OperationResult.RETRY_HOLD -class PeerRollingOpsCharm(BaseRollingOpsTestCharm): - def _make_restart_manager(self, callback_targets: dict[str, Any]) -> PeerRollingOpsBackend: - return PeerRollingOpsBackend( - charm=self, - relation_name='restart', - callback_targets=callback_targets, - ) - - -class RollingOpsCharm(BaseRollingOpsTestCharm): - def _make_restart_manager(self, callback_targets: dict[str, Any]) -> RollingOpsManager: - return RollingOpsManager( - charm=self, - peer_relation_name='restart', - etcd_relation_name='etcd', - cluster_id='cluster-12345', - callback_targets=callback_targets, - ) - - -@pytest.fixture -def peer_charm_test() -> type[PeerRollingOpsCharm]: - return PeerRollingOpsCharm - - @pytest.fixture def charm_test() -> type[RollingOpsCharm]: return RollingOpsCharm @@ -307,8 +282,3 @@ def charm_test() -> type[RollingOpsCharm]: @pytest.fixture def ctx(charm_test: type[RollingOpsCharm]) -> Context[RollingOpsCharm]: return Context(charm_test, meta=meta, actions=actions) - - -@pytest.fixture -def peer_ctx(peer_charm_test: type[PeerRollingOpsCharm]) -> Context[PeerRollingOpsCharm]: - return 
Context(peer_charm_test, meta=meta, actions=actions) diff --git a/rollingops/tests/unit/test_common_models.py b/rollingops/tests/unit/test_common_models.py index 8ae9a1644..fc4072ff6 100644 --- a/rollingops/tests/unit/test_common_models.py +++ b/rollingops/tests/unit/test_common_models.py @@ -23,6 +23,7 @@ from charmlibs.rollingops.common._exceptions import RollingOpsDecodingError from charmlibs.rollingops.common._models import ( Operation, + OperationQueue, OperationResult, ) @@ -36,7 +37,7 @@ def test_operation_create_sets_fields(): assert isinstance(op.requested_at, datetime) -def test_operation_to_string_contains_string_values_only(): +def test_operation_to_string(): ts = datetime(2026, 2, 23, 12, 0, 0, 123456, tzinfo=UTC) op = Operation( callback_id='cb', @@ -48,16 +49,20 @@ def test_operation_to_string_contains_string_values_only(): ) s = op.to_string() - obj = json.loads(s) + expected = ( + '{"callback_id":"cb",' + '"requested_at":"1771848000.123456",' + '"max_retry":null,' + '"attempt":0,' + '"result":null,' + '"kwargs":{"a":1,"b":2}}' + ) - assert obj['callback_id'] == 'cb' - assert obj['kwargs'] == '{"a":1,"b":2}' - assert obj['requested_at'] == str(ts.timestamp()) - assert obj.get('max_retry', '') == '' + assert s == expected -def test_operation_to_string_contains_string_values_only_zero_max_retry(): - ts = datetime(2026, 2, 23, 12, 0, 0, 123456, tzinfo=UTC) +def test_operation_to_string_zero_max_retry(): + ts = datetime(2026, 2, 23, 4, 0, 0, 123456, tzinfo=UTC) op = Operation( callback_id='cb', kwargs={'b': 2, 'a': 1}, @@ -68,12 +73,39 @@ def test_operation_to_string_contains_string_values_only_zero_max_retry(): ) s = op.to_string() - obj = json.loads(s) + expected = ( + '{"callback_id":"cb",' + '"requested_at":"1771819200.123456",' + '"max_retry":0,' + '"attempt":0,' + '"result":null,' + '"kwargs":{"a":1,"b":2}}' + ) + assert s == expected + + +def test_operation_to_string_none_max_retry(): + ts = datetime(2026, 2, 23, 4, 0, 0, 123456, tzinfo=UTC) + op 
= Operation( + callback_id='cb', + kwargs={'b': 2, 'a': 1}, + requested_at=ts, + max_retry=None, + attempt=0, + result=None, + ) + + s = op.to_string() + expected = ( + '{"callback_id":"cb",' + '"requested_at":"1771819200.123456",' + '"max_retry":null,' + '"attempt":0,' + '"result":null,' + '"kwargs":{"a":1,"b":2}}' + ) - assert obj['callback_id'] == 'cb' - assert obj['kwargs'] == '{"a":1,"b":2}' - assert obj['requested_at'] == str(ts.timestamp()) - assert obj.get('max_retry', '') == '0' + assert s == expected def test_operation_is_max_retry_reached_on_zero_max_retry(): @@ -132,8 +164,8 @@ def test_operation_from_string_valid_payload(): requested_at = datetime(2026, 3, 12, 10, 30, 45, 123456, tzinfo=UTC) payload = json.dumps({ 'callback_id': 'cb-123', - 'kwargs': json.dumps({'b': 2, 'a': 'x'}), - 'requested_at': str(requested_at.timestamp()), + 'kwargs': {'b': 2, 'a': 'x'}, + 'requested_at': '1773311445.123456', 'max_retry': '5', 'attempt': '2', }) @@ -152,9 +184,7 @@ def test_from_string_valid_payload_with_empty_kwargs_and_no_max_retry(): requested_at = datetime(2026, 3, 12, 10, 30, 45, 123456, tzinfo=UTC) payload = json.dumps({ 'callback_id': 'cb-123', - 'kwargs': '', - 'requested_at': str(requested_at.timestamp()), - 'max_retry': '', + 'requested_at': '1773311445.123456', 'attempt': '0', }) @@ -172,8 +202,8 @@ def test_from_string_valid_payload_with_empty_kwargs_and_0_max_retry(): requested_at = datetime(2026, 3, 12, 10, 30, 45, 123456, tzinfo=UTC) payload = json.dumps({ 'callback_id': 'cb-123', - 'kwargs': '{}', - 'requested_at': str(requested_at.timestamp()), + 'kwargs': {}, + 'requested_at': '1773311445.123456', 'max_retry': '0', 'attempt': '0', }) @@ -195,7 +225,7 @@ def test_from_string_valid_payload_with_empty_kwargs_and_0_max_retry(): json.dumps( # invalid requested_at { 'callback_id': 'cb-123', - 'kwargs': json.dumps({'x': 1}), + 'kwargs': {'x': 1}, 'requested_at': 'bad-ts', 'max_retry': '3', 'attempt': '1', @@ -205,15 +235,15 @@ def 
test_from_string_valid_payload_with_empty_kwargs_and_0_max_retry(): { 'callback_id': 'cb-123', 'kwargs': '{bad kwargs json', - 'requested_at': str(datetime.now(UTC).timestamp()), + 'requested_at': '1773311445.123456', 'max_retry': '3', 'attempt': '1', } ), json.dumps( # missing callback_id { - 'kwargs': json.dumps({'x': 1}), - 'requested_at': str(datetime.now(UTC).timestamp()), + 'kwargs': {'x': 1}, + 'requested_at': '1773311445.123456', 'max_retry': '3', 'attempt': '1', } @@ -222,7 +252,7 @@ def test_from_string_valid_payload_with_empty_kwargs_and_0_max_retry(): { 'callback_id': 'cb-123', 'kwargs': '[]', - 'requested_at': str(datetime.now(UTC).timestamp()), + 'requested_at': '1773311445.123456', 'max_retry': '3', 'attempt': '1', } @@ -230,7 +260,7 @@ def test_from_string_valid_payload_with_empty_kwargs_and_0_max_retry(): json.dumps( # missing requested_at { 'callback_id': 'cb-123', - 'kwargs': '{}', + 'kwargs': {}, 'requested_at': '', 'max_retry': '3', 'attempt': '1', @@ -239,7 +269,7 @@ def test_from_string_valid_payload_with_empty_kwargs_and_0_max_retry(): json.dumps( # result { 'callback_id': 'cb-123', - 'kwargs': '{}', + 'kwargs': {}, 'requested_at': 'bad-ts', 'max_retry': '3', 'attempt': '1', @@ -377,3 +407,137 @@ def test_retry_release_with_no_max_retry_sets_retry_release() -> None: assert operation.attempt == 6 assert operation.result == OperationResult.RETRY_RELEASE + + +def test_queue_empty_behaviour(): + q = OperationQueue() + + assert len(q) == 0 + assert q.empty is True + assert q.peek() is None + assert q.dequeue() is None + + assert q.to_string() == '[]' + + +def test_queue_enqueue_and_fifo_order(): + q = OperationQueue() + op1 = Operation.create('a', {'x': 2}) + op2 = Operation.create('b', {'i': 2}) + q.enqueue(op1) + q.enqueue(op2) + + assert len(q) == 2 + op = q.peek() + assert op is not None + assert op == op1 + + first = q.dequeue() + assert first is not None + assert first == op1 + assert len(q) == 1 + op = q.peek() + assert op is not None + 
assert op == op2 + + second = q.dequeue() + assert second is not None + assert second == op2 + assert q.empty is True + + +def test_queue_deduplicates_only_against_last_item(): + q = OperationQueue() + op1 = Operation.create('a', {'x': 2}) + op2 = Operation.create('a', {'x': 2}) + op3 = Operation.create('a', {'x': 4}) + + q.enqueue(op1) + assert len(q) == 1 + + q.enqueue(op2) + assert len(q) == 1 + + q.enqueue(op3) + assert len(q) == 2 + + q.enqueue(op2) + assert len(q) == 3 + + +def test_queue_to_string_and_from_string(): + q1 = OperationQueue() + ts1 = datetime(2026, 2, 23, 12, 0, 0, 123456, tzinfo=UTC) + op1 = Operation( + callback_id='a', + kwargs={'x': 1}, + requested_at=ts1, + max_retry=5, + attempt=0, + result=None, + ) + ts2 = datetime(2026, 2, 20, 12, 0, 0, 123456, tzinfo=UTC) + op2 = Operation( + callback_id='b', + kwargs={'y': 'z'}, + requested_at=ts2, + max_retry=None, + attempt=0, + result=None, + ) + q1.enqueue(op1) + q1.enqueue(op2) + + encoded = q1.to_string() + expected = ( + '[{"callback_id":"a",' + '"requested_at":"1771848000.123456",' + '"max_retry":5,' + '"attempt":0,' + '"result":null,' + '"kwargs":{"x":1}},' + '{"callback_id":"b",' + '"requested_at":"1771588800.123456",' + '"max_retry":null,' + '"attempt":0,' + '"result":null,' + '"kwargs":{"y":"z"}}]' + ) + + assert encoded == expected + + q2 = OperationQueue.from_string(encoded) + + assert len(q2) == 2 + op = q2.peek() + assert op is not None + assert op == op1 + + op = q2.dequeue() + assert op is not None + assert op == op1 + + op = q2.dequeue() + assert op is not None + assert op == op2 + assert q2.empty + + +def test_queue_from_string_empty_string_is_empty_queue(): + q = OperationQueue.from_string('') + assert q.empty + assert q.peek() is None + + +def test_queue_from_string_rejects_non_list_json(): + with pytest.raises( + RollingOpsDecodingError, match='Failed to deserialize data to create an OperationQueue' + ): + OperationQueue.from_string('{"not": "a list"}') + + +def 
test_queue_from_string_rejects_invalid_json(): + with pytest.raises( + RollingOpsDecodingError, match='Failed to deserialize data to create an OperationQueue' + ): + OperationQueue.from_string('{invalid') diff --git a/rollingops/tests/unit/test_etcd_rollingops_in_charm.py b/rollingops/tests/unit/test_etcd_rollingops_in_charm.py index 828b6fff6..e668412fd 100644 --- a/rollingops/tests/unit/test_etcd_rollingops_in_charm.py +++ b/rollingops/tests/unit/test_etcd_rollingops_in_charm.py @@ -32,14 +32,12 @@ PrivateKey, ) from charmlibs.rollingops.common._exceptions import ( - RollingOpsEtcdNotConfiguredError, RollingOpsInvalidSecretContentError, ) from charmlibs.rollingops.common._models import ( Operation, OperationQueue, ProcessingBackend, - RollingOpsState, RollingOpsStatus, ) from charmlibs.rollingops.etcd._models import SharedCertificate @@ -193,10 +191,9 @@ def test_state_not_initialized(ctx: Context[RollingOpsCharm]): state = State(leader=True) with ctx(ctx.on.start(), state) as mgr: - rolling_state = mgr.charm.restart_manager.state # type: ignore[reportUnknownVariableType] - assert isinstance(rolling_state, RollingOpsState) - assert rolling_state.status == RollingOpsStatus.INVALID - assert rolling_state.processing_backend is None + rolling_state = mgr.charm.restart_manager.state + assert rolling_state.status == RollingOpsStatus.UNAVAILABLE + assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 0 @@ -214,8 +211,7 @@ def test_state_peer_idle(ctx: Context[RollingOpsCharm]): state = State(leader=False, relations={peer_rel}) with ctx(ctx.on.update_status(), state) as mgr: - rolling_state = mgr.charm.restart_manager.state # type: ignore[reportUnknownVariableType] - assert isinstance(rolling_state, RollingOpsState) + rolling_state = mgr.charm.restart_manager.state assert rolling_state.status == RollingOpsStatus.IDLE assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) 
== 0 @@ -237,8 +233,7 @@ def test_state_peer_waiting(ctx: Context[RollingOpsCharm]): state = State(leader=False, relations={peer_rel}) with ctx(ctx.on.update_status(), state) as mgr: - rolling_state = mgr.charm.restart_manager.state # type: ignore[reportUnknownVariableType] - assert isinstance(rolling_state, RollingOpsState) + rolling_state = mgr.charm.restart_manager.state assert rolling_state.status == RollingOpsStatus.WAITING assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 1 @@ -263,8 +258,7 @@ def test_state_peer_is_granted(ctx: Context[RollingOpsCharm]): state = State(leader=False, relations={peer_rel}) with ctx(ctx.on.update_status(), state) as mgr: - rolling_state = mgr.charm.restart_manager.state # type: ignore[reportUnknownVariableType] - assert isinstance(rolling_state, RollingOpsState) + rolling_state = mgr.charm.restart_manager.state assert rolling_state.status == RollingOpsStatus.GRANTED assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 1 @@ -289,8 +283,7 @@ def test_state_peer_waiting_retry(ctx: Context[RollingOpsCharm]): state = State(leader=False, relations={peer_rel}) with ctx(ctx.on.update_status(), state) as mgr: - rolling_state = mgr.charm.restart_manager.state # type: ignore[reportUnknownVariableType] - assert isinstance(rolling_state, RollingOpsState) + rolling_state = mgr.charm.restart_manager.state assert rolling_state.status == RollingOpsStatus.WAITING assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 1 @@ -318,8 +311,7 @@ def test_state_etcd_status(ctx: Context[RollingOpsCharm]): return_value=RollingOpsStatus.GRANTED, ): with ctx(ctx.on.update_status(), state) as mgr: - rolling_state = mgr.charm.restart_manager.state # type: ignore[reportUnknownVariableType] - assert isinstance(rolling_state, RollingOpsState) + rolling_state = mgr.charm.restart_manager.state assert 
rolling_state.status == RollingOpsStatus.GRANTED assert rolling_state.processing_backend == ProcessingBackend.ETCD assert len(rolling_state.operations) == 1 @@ -342,11 +334,10 @@ def test_state_falls_back_to_peer_if_etcd_status_fails(ctx: Context[RollingOpsCh with patch( 'charmlibs.rollingops._rollingops_manager.EtcdRollingOpsBackend.get_status', - side_effect=RollingOpsEtcdNotConfiguredError('boom'), + return_value=RollingOpsStatus.UNAVAILABLE, ): with ctx(ctx.on.update_status(), state) as mgr: - rolling_state = mgr.charm.restart_manager.state # type: ignore[reportUnknownVariableType] - assert isinstance(rolling_state, RollingOpsState) + rolling_state = mgr.charm.restart_manager.state assert rolling_state.status == RollingOpsStatus.WAITING assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 1 diff --git a/rollingops/tests/unit/test_peer_models.py b/rollingops/tests/unit/test_peer_models.py deleted file mode 100644 index 709a38867..000000000 --- a/rollingops/tests/unit/test_peer_models.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# Learn more about testing at: https://juju.is/docs/sdk/testing - -import json - -import pytest - -from charmlibs.rollingops.common._exceptions import RollingOpsDecodingError -from charmlibs.rollingops.common._models import Operation, OperationQueue - - -def _decode_queue_string(queue_str: str) -> list[dict[str, str]]: - """Helper: decode OperationQueue.to_string() -> list of dicts.""" - items = json.loads(queue_str) - assert isinstance(items, list) - return [json.loads(s) for s in items] # type: ignore[reportUnknownArgumentType] - - -def test_queue_empty_behaviour(): - q = OperationQueue() - - assert len(q) == 0 - assert q.empty is True - assert q.peek() is None - assert q.dequeue() is None - - assert json.loads(q.to_string()) == [] - - -def test_queue_enqueue_and_fifo_order(): - q = OperationQueue() - op1 = Operation.create('a', {'x': 2}) - op2 = Operation.create('b', {'i': 2}) - q.enqueue(op1) - q.enqueue(op2) - - assert len(q) == 2 - op = q.peek() - assert op is not None - assert op == op1 - - first = q.dequeue() - assert first is not None - assert first == op1 - assert len(q) == 1 - op = q.peek() - assert op is not None - assert op == op2 - - second = q.dequeue() - assert second is not None - assert second == op2 - assert q.empty is True - - -def test_queue_deduplicates_only_against_last_item(): - q = OperationQueue() - op1 = Operation.create('a', {'x': 2}) - op2 = Operation.create('a', {'x': 2}) - op3 = Operation.create('a', {'x': 4}) - - q.enqueue(op1) - assert len(q) == 1 - - q.enqueue(op2) - assert len(q) == 1 - - q.enqueue(op3) - assert len(q) == 2 - - q.enqueue(op2) - assert len(q) == 3 - - -def test_queue_to_string_and_from_string(): - q1 = OperationQueue() - op1 = Operation.create('a', {'x': 1}, max_retry=5) - op2 = Operation.create('b', {'y': 'z'}, max_retry=None) - q1.enqueue(op1) - q1.enqueue(op2) - - encoded = q1.to_string() - q2 = OperationQueue.from_string(encoded) - - assert len(q2) == 2 - op = q2.peek() - assert op is not None - assert op 
== op1 - - op = q2.dequeue() - assert op is not None - assert op == op1 - - op = q2.dequeue() - assert op is not None - assert op == op2 - assert q2.empty - - -def test_queue_from_string_empty_string_is_empty_queue(): - q = OperationQueue.from_string('') - assert q.empty - assert q.peek() is None - - -def test_queue_from_string_rejects_non_list_json(): - with pytest.raises(RollingOpsDecodingError, match='OperationQueue string'): - OperationQueue.from_string(json.dumps({'not': 'a list'})) - - -def test_queue_from_string_rejects_invalid_jason(): - with pytest.raises(RollingOpsDecodingError, match='Failed to deserialize data'): - OperationQueue.from_string('{invalid') - - -def test_queue_encoding_is_list_of_operation_strings(): - q = OperationQueue() - op1 = Operation.create('a', {'x': 1}) - q.enqueue(op1) - s = q.to_string() - - decoded = json.loads(s) - assert isinstance(decoded, list) - assert len(decoded) == 1 # type: ignore[reportUnknownArgumentType] - assert isinstance(decoded[0], str) - - op_dicts = _decode_queue_string(s) - assert op_dicts[0]['callback_id'] == 'a' - assert op_dicts[0]['kwargs'] == '{"x":1}' - assert op_dicts[0].get('max_retry', '') == '' - assert 'requested_at' in op_dicts[0] diff --git a/rollingops/tests/unit/test_peer_rollingops_in_charm.py b/rollingops/tests/unit/test_peer_rollingops_in_charm.py index 9c18745c0..11389de47 100644 --- a/rollingops/tests/unit/test_peer_rollingops_in_charm.py +++ b/rollingops/tests/unit/test_peer_rollingops_in_charm.py @@ -16,11 +16,12 @@ from typing import Any +from unittest.mock import MagicMock import pytest from ops.testing import Context, PeerRelation, State from scenario import RawDataBagContents -from tests.unit.conftest import PeerRollingOpsCharm +from tests.unit.conftest import RollingOpsCharm from charmlibs.rollingops.common._exceptions import RollingOpsInvalidLockRequestError from charmlibs.rollingops.common._models import Operation, OperationQueue @@ -46,13 +47,13 @@ def _make_operation_queue( def 
test_lock_request_enqueues_and_sets_request( - peer_ctx: Context[PeerRollingOpsCharm], + ctx: Context[RollingOpsCharm], ): peer = PeerRelation(endpoint='restart') state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run( - peer_ctx.on.action('restart', params={'delay': 10}), + state_out = ctx.run( + ctx.on.action('restart', params={'delay': 10}), state_in, ) @@ -75,14 +76,13 @@ def test_lock_request_enqueues_and_sets_request( [ (-5), (-1), - ('3'), ], ) -def test_lock_request_invalid_inputs(peer_ctx: Context[PeerRollingOpsCharm], max_retry: Any): +def test_lock_request_invalid_inputs(ctx: Context[RollingOpsCharm], max_retry: Any): peer = PeerRelation(endpoint='restart') state_in = State(leader=False, relations={peer}) - with peer_ctx(peer_ctx.on.update_status(), state_in) as mgr: + with ctx(ctx.on.update_status(), state_in) as mgr: with pytest.raises(RollingOpsInvalidLockRequestError): mgr.charm.restart_manager.request_async_lock( callback_id='_restart', @@ -99,13 +99,11 @@ def test_lock_request_invalid_inputs(peer_ctx: Context[PeerRollingOpsCharm], max ('unknown',), ], ) -def test_lock_request_invalid_callback_id( - peer_ctx: Context[PeerRollingOpsCharm], callback_id: str -): +def test_lock_request_invalid_callback_id(ctx: Context[RollingOpsCharm], callback_id: str): peer = PeerRelation(endpoint='restart') state_in = State(leader=False, relations={peer}) - with peer_ctx(peer_ctx.on.update_status(), state_in) as mgr: + with ctx(ctx.on.update_status(), state_in) as mgr: with pytest.raises(RollingOpsInvalidLockRequestError, match='Unknown callback_id'): mgr.charm.restart_manager.request_async_lock( callback_id=callback_id, @@ -122,11 +120,11 @@ def test_lock_request_invalid_callback_id( ({'x': OperationQueue()}), ], ) -def test_lock_request_invalid_kwargs(peer_ctx: Context[PeerRollingOpsCharm], kwargs: Any): +def test_lock_request_invalid_kwargs(ctx: Context[RollingOpsCharm], kwargs: Any): peer = PeerRelation(endpoint='restart') state_in = 
State(leader=False, relations={peer}) - with peer_ctx(peer_ctx.on.update_status(), state_in) as mgr: + with ctx(ctx.on.update_status(), state_in) as mgr: with pytest.raises( RollingOpsInvalidLockRequestError, match='Failed to create the lock request' ): @@ -137,7 +135,7 @@ def test_lock_request_invalid_kwargs(peer_ctx: Context[PeerRollingOpsCharm], kwa ) -def test_existing_operation_then_new_request(peer_ctx: Context[PeerRollingOpsCharm]): +def test_existing_operation_then_new_request(ctx: Context[RollingOpsCharm]): queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) peer = PeerRelation( endpoint='restart', @@ -146,7 +144,7 @@ def test_existing_operation_then_new_request(peer_ctx: Context[PeerRollingOpsCha state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run(peer_ctx.on.action('restart', params={'delay': 10}), state_in) + state_out = ctx.run(ctx.on.action('restart', params={'delay': 10}), state_in) databag = _unit_databag(state_out, peer) assert databag['state'] == LockIntent.REQUEST @@ -158,7 +156,7 @@ def test_existing_operation_then_new_request(peer_ctx: Context[PeerRollingOpsCha def test_new_request_does_not_overwrite_state_if_queue_not_empty( - peer_ctx: Context[PeerRollingOpsCharm], + ctx: Context[RollingOpsCharm], ): queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) executed_at = str(now_timestamp().timestamp()) @@ -172,7 +170,7 @@ def test_new_request_does_not_overwrite_state_if_queue_not_empty( ) state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run(peer_ctx.on.action('restart', params={'delay': 10}), state_in) + state_out = ctx.run(ctx.on.action('restart', params={'delay': 10}), state_in) databag = _unit_databag(state_out, peer) assert databag['state'] == LockIntent.RETRY_RELEASE @@ -184,9 +182,9 @@ def test_new_request_does_not_overwrite_state_if_queue_not_empty( def test_relation_changed_without_grant_does_not_run_operation( - peer_ctx: 
Context[PeerRollingOpsCharm], + ctx: Context[RollingOpsCharm], ): - remote_unit_name = f'{peer_ctx.app_name}/1' + remote_unit_name = f'{ctx.app_name}/1' queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) peer = PeerRelation( endpoint='restart', @@ -199,9 +197,7 @@ def test_relation_changed_without_grant_does_not_run_operation( state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run( - peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in - ) + state_out = ctx.run(ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in) databag = _unit_databag(state_out, peer) assert databag['state'] == LockIntent.REQUEST @@ -210,9 +206,9 @@ def test_relation_changed_without_grant_does_not_run_operation( assert databag.get('executed_at', '') == '' -def test_lock_complete_pops_head(peer_ctx: Context[PeerRollingOpsCharm]): - remote_unit_name = f'{peer_ctx.app_name}/1' - local_unit_name = f'{peer_ctx.app_name}/0' +def test_lock_complete_pops_head(ctx: Context[RollingOpsCharm]): + remote_unit_name = f'{ctx.app_name}/1' + local_unit_name = f'{ctx.app_name}/0' queue = _make_operation_queue(callback_id='_restart', kwargs={}, max_retry=0) peer = PeerRelation( endpoint='restart', @@ -224,9 +220,7 @@ def test_lock_complete_pops_head(peer_ctx: Context[PeerRollingOpsCharm]): ) state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run( - peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in - ) + state_out = ctx.run(ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in) databag = _unit_databag(state_out, peer) assert databag['state'] == LockIntent.IDLE @@ -238,10 +232,10 @@ def test_lock_complete_pops_head(peer_ctx: Context[PeerRollingOpsCharm]): def test_successful_operation_leaves_state_request_when_more_ops_remain( - peer_ctx: Context[PeerRollingOpsCharm], + ctx: Context[RollingOpsCharm], ): - local_unit_name = f'{peer_ctx.app_name}/0' - 
remote_unit_name = f'{peer_ctx.app_name}/1' + local_unit_name = f'{ctx.app_name}/0' + remote_unit_name = f'{ctx.app_name}/1' queue = OperationQueue() op1 = Operation.create(callback_id='_restart', kwargs={}, max_retry=None) op2 = Operation.create(callback_id='_failed_restart', kwargs={}, max_retry=None) @@ -260,9 +254,7 @@ def test_successful_operation_leaves_state_request_when_more_ops_remain( state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run( - peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in - ) + state_out = ctx.run(ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in) databag = _unit_databag(state_out, peer) assert databag['state'] == LockIntent.REQUEST @@ -281,12 +273,12 @@ def test_successful_operation_leaves_state_request_when_more_ops_remain( ], ) def test_lock_retry_marks_retry( - peer_ctx: Context[PeerRollingOpsCharm], + ctx: Context[RollingOpsCharm], callback_id: str, lock_intent: LockIntent, ): - remote_unit_name = f'{peer_ctx.app_name}/1' - local_unit_name = f'{peer_ctx.app_name}/0' + remote_unit_name = f'{ctx.app_name}/1' + local_unit_name = f'{ctx.app_name}/0' queue = _make_operation_queue(callback_id=callback_id, kwargs={}, max_retry=3) peer = PeerRelation( endpoint='restart', @@ -298,9 +290,7 @@ def test_lock_retry_marks_retry( ) state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run( - peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in - ) + state_out = ctx.run(ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in) databag = _unit_databag(state_out, peer) assert databag['state'] == lock_intent @@ -327,11 +317,11 @@ def test_lock_retry_marks_retry( ], ) def test_lock_retry_drops_when_max_retry_reached( - peer_ctx: Context[PeerRollingOpsCharm], + ctx: Context[RollingOpsCharm], callback_id: str, ): - remote_unit_name = f'{peer_ctx.app_name}/1' - local_unit_name = f'{peer_ctx.app_name}/0' + remote_unit_name = 
f'{ctx.app_name}/1' + local_unit_name = f'{ctx.app_name}/0' queue = OperationQueue() op1 = Operation.create(callback_id=callback_id, kwargs={}, max_retry=3) @@ -352,9 +342,7 @@ def test_lock_retry_drops_when_max_retry_reached( ) state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run( - peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in - ) + state_out = ctx.run(ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in) databag = _unit_databag(state_out, peer) assert databag['state'] == LockIntent.IDLE @@ -364,7 +352,11 @@ def test_lock_retry_drops_when_max_retry_reached( assert len(q) == 0 -def test_lock_grant_and_release(peer_ctx: Context[PeerRollingOpsCharm]): +def test_lock_grant_and_release( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) peer = PeerRelation( endpoint='restart', @@ -372,17 +364,21 @@ def test_lock_grant_and_release(peer_ctx: Context[PeerRollingOpsCharm]): ) state_in = State(leader=True, relations={peer}) - state = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + state = ctx.run(ctx.on.leader_elected(), state_in) databag = _app_databag(state, peer) - unit_name = f'{peer_ctx.app_name}/1' + unit_name = f'{ctx.app_name}/1' assert unit_name in databag['granted_unit'] assert databag['granted_at'] is not None -def test_scheduling_does_nothing_if_lock_already_granted(peer_ctx: Context[PeerRollingOpsCharm]): +def test_scheduling_does_nothing_if_lock_already_granted( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) - remote_unit_name = f'{peer_ctx.app_name}/1' + remote_unit_name = f'{ctx.app_name}/1' now_timestamp_str = str(now_timestamp().timestamp()) peer = PeerRelation( 
endpoint='restart', @@ -394,16 +390,18 @@ def test_scheduling_does_nothing_if_lock_already_granted(peer_ctx: Context[PeerR ) state_in = State(leader=True, relations={peer}) - state_out = peer_ctx.run( - peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in - ) + state_out = ctx.run(ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in) databag = _app_databag(state_out, peer) assert databag['granted_unit'] == remote_unit_name assert databag['granted_at'] == now_timestamp_str -def test_schedule_picks_retry_hold(peer_ctx: Context[PeerRollingOpsCharm]): +def test_schedule_picks_retry_hold( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): old_operation = str(now_timestamp().timestamp()) queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) new_operation = str(now_timestamp().timestamp()) @@ -430,14 +428,18 @@ def test_schedule_picks_retry_hold(peer_ctx: Context[PeerRollingOpsCharm]): ) state_in = State(leader=True, relations={peer}) - state_out = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + state_out = ctx.run(ctx.on.leader_elected(), state_in) databag = _app_databag(state_out, peer) - remote_unit_name = f'{peer_ctx.app_name}/3' + remote_unit_name = f'{ctx.app_name}/3' assert databag['granted_unit'] == remote_unit_name -def test_schedule_picks_oldest_requested_at_among_requests(peer_ctx: Context[PeerRollingOpsCharm]): +def test_schedule_picks_oldest_requested_at_among_requests( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): old_queue = OperationQueue() old_op = Operation.create(callback_id='restart', kwargs={}, max_retry=2) old_queue.enqueue(old_op) @@ -455,14 +457,16 @@ def test_schedule_picks_oldest_requested_at_among_requests(peer_ctx: Context[Pee ) state_in = State(leader=True, relations={peer}) - state_out = 
peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + state_out = ctx.run(ctx.on.leader_elected(), state_in) databag = _app_databag(state_out, peer) - remote_unit_name = f'{peer_ctx.app_name}/2' + remote_unit_name = f'{ctx.app_name}/2' assert databag['granted_unit'] == remote_unit_name def test_schedule_picks_oldest_executed_at_among_retries_when_no_requests( - peer_ctx: Context[PeerRollingOpsCharm], + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], ): old_operation = str(now_timestamp().timestamp()) queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) @@ -485,14 +489,18 @@ def test_schedule_picks_oldest_executed_at_among_retries_when_no_requests( ) state_in = State(leader=True, relations={peer}) - state_out = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + state_out = ctx.run(ctx.on.leader_elected(), state_in) databag = _app_databag(state_out, peer) - remote_unit_name = f'{peer_ctx.app_name}/2' + remote_unit_name = f'{ctx.app_name}/2' assert databag['granted_unit'] == remote_unit_name -def test_schedule_prioritizes_requests_over_retries(peer_ctx: Context[PeerRollingOpsCharm]): +def test_schedule_prioritizes_requests_over_retries( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) peer = PeerRelation( @@ -508,21 +516,25 @@ def test_schedule_prioritizes_requests_over_retries(peer_ctx: Context[PeerRollin ) state_in = State(leader=True, relations={peer}) - state_out = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + state_out = ctx.run(ctx.on.leader_elected(), state_in) databag = _app_databag(state_out, peer) - remote_unit_name = f'{peer_ctx.app_name}/2' + remote_unit_name = f'{ctx.app_name}/2' assert databag['granted_unit'] == remote_unit_name -def test_no_unit_is_granted_if_there_are_no_requests(peer_ctx: 
Context[PeerRollingOpsCharm]): +def test_no_unit_is_granted_if_there_are_no_requests( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): peer = PeerRelation( endpoint='restart', peers_data={1: {'state': LockIntent.IDLE}, 2: {'state': LockIntent.IDLE}}, ) state_in = State(leader=True, relations={peer}) - state_out = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + state_out = ctx.run(ctx.on.leader_elected(), state_in) databag = _app_databag(state_out, peer) assert databag.get('granted_unit', '') == '' diff --git a/rollingops/uv.lock b/rollingops/uv.lock index e6833c191..16c3c97a8 100644 --- a/rollingops/uv.lock +++ b/rollingops/uv.lock @@ -114,6 +114,7 @@ dependencies = [ { name = "charmlibs-pathops" }, { name = "dpcharmlibs-interfaces" }, { name = "ops" }, + { name = "pydantic" }, { name = "tenacity" }, ] @@ -133,6 +134,7 @@ requires-dist = [ { name = "charmlibs-pathops", specifier = ">=1.2.1" }, { name = "dpcharmlibs-interfaces", specifier = "==1.0.0" }, { name = "ops" }, + { name = "pydantic", specifier = ">=2.12.5" }, { name = "tenacity" }, ] From 9bf857bc6dac84bc837a1ede8dc61a7d56d25c41 Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Tue, 21 Apr 2026 20:30:04 +0200 Subject: [PATCH 05/15] short uuid and subprocess attach to parent --- rollingops/pyproject.toml | 1 + rollingops/src/charmlibs/rollingops/__init__.py | 13 ++++++++++++- .../charmlibs/rollingops/_rollingops_manager.py | 1 + .../src/charmlibs/rollingops/etcd/_certificates.py | 11 ++++++++--- rollingops/src/charmlibs/rollingops/etcd/_etcd.py | 14 +++++++++++--- .../src/charmlibs/rollingops/etcd/_relations.py | 3 +-- rollingops/tests/unit/test_etcd_certificates.py | 4 ++-- rollingops/uv.lock | 11 +++++++++++ 8 files changed, 47 insertions(+), 11 deletions(-) diff --git a/rollingops/pyproject.toml b/rollingops/pyproject.toml index 9522b1f28..701b0684b 100644 --- a/rollingops/pyproject.toml +++ b/rollingops/pyproject.toml @@ 
-21,6 +21,7 @@ dependencies = [ "dpcharmlibs-interfaces==1.0.0", "tenacity", "pydantic>=2.12.5", + "shortuuid>=1.0.13", ] [dependency-groups] diff --git a/rollingops/src/charmlibs/rollingops/__init__.py b/rollingops/src/charmlibs/rollingops/__init__.py index cc631ad82..181969351 100644 --- a/rollingops/src/charmlibs/rollingops/__init__.py +++ b/rollingops/src/charmlibs/rollingops/__init__.py @@ -27,10 +27,21 @@ RollingOpsNoRelationError, RollingOpsSyncLockError, ) -from .common._models import OperationResult, RollingOpsState, RollingOpsStatus, SyncLockBackend +from .common._models import ( + Operation, + OperationQueue, + OperationResult, + ProcessingBackend, + RollingOpsState, + RollingOpsStatus, + SyncLockBackend, +) __all__ = ( + 'Operation', + 'OperationQueue', 'OperationResult', + 'ProcessingBackend', 'RollingOpsDecodingError', 'RollingOpsEtcdNotConfiguredError', 'RollingOpsEtcdctlError', diff --git a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py index 75f88a105..dbd34d1a7 100644 --- a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py +++ b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py @@ -304,6 +304,7 @@ def _run_etcd_and_mirror_or_fallback(self) -> None: logger.info('Execution mirrored to peer relation.') if outcome.status == RunWithLockStatus.EXECUTED_NOT_COMMITTED: self._fallback_current_unit_to_peer() + logger.info('Fell back to peer backend.') def _on_rollingops_etcd_failed(self, event: RollingOpsEtcdFailedEvent) -> None: """Fall back to peer when the etcd worker reports a fatal failure.""" diff --git a/rollingops/src/charmlibs/rollingops/etcd/_certificates.py b/rollingops/src/charmlibs/rollingops/etcd/_certificates.py index e408731ae..3c53bd939 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_certificates.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_certificates.py @@ -24,6 +24,8 @@ from datetime import timedelta +import shortuuid + from charmlibs 
import pathops from charmlibs.interfaces.tls_certificates import ( Certificate, @@ -88,7 +90,7 @@ def _has_client_cert_key_and_ca(shared: SharedCertificate) -> bool: raise RollingOpsFileSystemError('Failed to read certificates and key.') from e -def generate(common_name: str) -> SharedCertificate: +def generate(model_uuid: str, app_name: str) -> SharedCertificate: """Generate a client CA and client certificate if they do not exist. This method creates: @@ -101,8 +103,8 @@ def generate(common_name: str) -> SharedCertificate: If the certificates already exist, this method does nothing. Args: - common_name: Common Name (CN) used in the client certificate - subject. This value should not contain slashes. + model_uuid: string used to build the common name. + app_name: string used to build the common name. Raises: PebbleConnectionError: if the remote container cannot be reached @@ -115,6 +117,9 @@ def generate(common_name: str) -> SharedCertificate: CA_CERT_PATH, ) + # Produce a unique <=64-character string + raw = f'{model_uuid}-{app_name}' + common_name = shortuuid.uuid(name=raw) ca_key = PrivateKey.generate(key_size=KEY_SIZE) ca_attributes = CertificateRequestAttributes( common_name=common_name, diff --git a/rollingops/src/charmlibs/rollingops/etcd/_etcd.py b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py index b1278fe4c..0a1664809 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_etcd.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py @@ -14,7 +14,9 @@ """Classes that manage etcd concepts.""" +import ctypes import logging +import signal import subprocess import time @@ -35,6 +37,8 @@ class EtcdLease: """Manage the lifecycle of an etcd lease and its keep-alive process.""" + PR_SET_PDEATHSIG = 1 + def __init__(self): self.id: str | None = None self.keepalive_proc: subprocess.Popen[str] | None = None @@ -71,6 +75,12 @@ def revoke(self) -> None: finally: self.id = None + @staticmethod + def _set_parent_death_signal() -> None: + """Ask the kernel to send 
SIGTERM to the child if its parent dies.""" + libc = ctypes.CDLL('libc.so.6') + libc.prctl(EtcdLease.PR_SET_PDEATHSIG, signal.SIGTERM) + def _start_lease_keepalive(self) -> None: """Start the background process that keeps the lease alive.""" lease_id = self.id @@ -81,10 +91,8 @@ def _start_lease_keepalive(self) -> None: self.keepalive_proc = subprocess.Popen( [etcdctl.ETCDCTL_CMD, 'lease', 'keep-alive', lease_id], env=etcdctl.load_env(), - stdin=subprocess.PIPE, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, text=True, + preexec_fn=self._set_parent_death_signal, ) logger.info('Keepalive started for lease %s.', self.id) diff --git a/rollingops/src/charmlibs/rollingops/etcd/_relations.py b/rollingops/src/charmlibs/rollingops/etcd/_relations.py index 7c2ff809b..8d37f92d8 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_relations.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_relations.py @@ -112,8 +112,7 @@ def create_and_share_certificate(self) -> None: ) return - common_name = f'rollingops-{self.model.uuid}-{self.model.app.name}' - shared = certificates.generate(common_name) + shared = certificates.generate(self.model.uuid, self.model.app.name) secret = self.model.app.add_secret( content={ diff --git a/rollingops/tests/unit/test_etcd_certificates.py b/rollingops/tests/unit/test_etcd_certificates.py index 00b21e4cb..4984bb6db 100644 --- a/rollingops/tests/unit/test_etcd_certificates.py +++ b/rollingops/tests/unit/test_etcd_certificates.py @@ -123,7 +123,7 @@ def test_certificates_manager_generate_does_nothing_when_files_already_exist( temp_certificates.CA_CERT_PATH.write_text(VALID_CA_CERT_PEM) old_certificates = make_shared_certificate() - new_certificates = temp_certificates.generate(common_name='unit-1') + new_certificates = temp_certificates.generate(model_uuid='model', app_name='unit-1') written = SharedCertificate.from_strings( certificate=temp_certificates.CLIENT_CERT_PATH.read_text(), @@ -138,7 +138,7 @@ def 
test_certificates_manager_generate_does_nothing_when_files_already_exist( def test_certificates_manager_generate_creates_all_files( temp_certificates: Any, ) -> None: - shared = temp_certificates.generate(common_name='unit-1') + shared = temp_certificates.generate(model_uuid='model', app_name='unit-1') assert temp_certificates._exists() is True assert temp_certificates.CA_CERT_PATH.read_text().startswith('-----BEGIN CERTIFICATE-----') diff --git a/rollingops/uv.lock b/rollingops/uv.lock index 16c3c97a8..b5d0d3651 100644 --- a/rollingops/uv.lock +++ b/rollingops/uv.lock @@ -115,6 +115,7 @@ dependencies = [ { name = "dpcharmlibs-interfaces" }, { name = "ops" }, { name = "pydantic" }, + { name = "shortuuid" }, { name = "tenacity" }, ] @@ -135,6 +136,7 @@ requires-dist = [ { name = "dpcharmlibs-interfaces", specifier = "==1.0.0" }, { name = "ops" }, { name = "pydantic", specifier = ">=2.12.5" }, + { name = "shortuuid", specifier = ">=1.0.13" }, { name = "tenacity" }, ] @@ -425,6 +427,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, ] +[[package]] +name = "shortuuid" +version = "1.0.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8c/e2/bcf761f3bff95856203f9559baf3741c416071dd200c0fc19fad7f078f86/shortuuid-1.0.13.tar.gz", hash = "sha256:3bb9cf07f606260584b1df46399c0b87dd84773e7b25912b7e391e30797c5e72", size = 9662, upload-time = "2024-03-11T20:11:06.879Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/44/21d6bf170bf40b41396480d8d49ad640bca3f2b02139cd52aa1e272830a5/shortuuid-1.0.13-py3-none-any.whl", hash = "sha256:a482a497300b49b4953e15108a7913244e1bb0d41f9d332f5e9925dba33a3c5a", size = 10529, upload-time = 
"2024-03-11T20:11:04.807Z" }, +] + [[package]] name = "tenacity" version = "9.1.4" From bd6b2083aac86fe47a91720db6b388d61ce0028e Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Wed, 22 Apr 2026 19:59:53 +0200 Subject: [PATCH 06/15] use pipes to end refresh lease process --- .../src/charmlibs/rollingops/etcd/_etcd.py | 58 ++++++++++++++----- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/rollingops/src/charmlibs/rollingops/etcd/_etcd.py b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py index 0a1664809..5ce1aef7f 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_etcd.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py @@ -14,9 +14,8 @@ """Classes that manage etcd concepts.""" -import ctypes import logging -import signal +import os import subprocess import time @@ -37,11 +36,10 @@ class EtcdLease: """Manage the lifecycle of an etcd lease and its keep-alive process.""" - PR_SET_PDEATHSIG = 1 - def __init__(self): self.id: str | None = None self.keepalive_proc: subprocess.Popen[str] | None = None + self._pipe_write_fd: int | None = None def grant(self) -> None: """Create a new lease and start the keep-alive process.""" @@ -75,12 +73,6 @@ def revoke(self) -> None: finally: self.id = None - @staticmethod - def _set_parent_death_signal() -> None: - """Ask the kernel to send SIGTERM to the child if its parent dies.""" - libc = ctypes.CDLL('libc.so.6') - libc.prctl(EtcdLease.PR_SET_PDEATHSIG, signal.SIGTERM) - def _start_lease_keepalive(self) -> None: """Start the background process that keeps the lease alive.""" lease_id = self.id @@ -88,18 +80,52 @@ def _start_lease_keepalive(self) -> None: logger.info('Lease ID is None. 
Keepalive for this lease cannot be started.') return etcdctl.ensure_initialized() - self.keepalive_proc = subprocess.Popen( - [etcdctl.ETCDCTL_CMD, 'lease', 'keep-alive', lease_id], - env=etcdctl.load_env(), - text=True, - preexec_fn=self._set_parent_death_signal, - ) + + pipe_read_fd, pipe_write_fd = os.pipe() + self._pipe_write_fd = pipe_write_fd + + keep_alive_cmd = f'{etcdctl.ETCDCTL_CMD} lease keep-alive {lease_id} /dev/null; wait' # noqa: E501 + try: + self.keepalive_proc = subprocess.Popen( + ['bash', '-c', keep_alive_cmd], + # The pipe read side becomes the child's stdin + # so when the parent closes its write side, this stdin gets EOF + stdin=pipe_read_fd, + env=etcdctl.load_env(), + text=True, + close_fds=True, + preexec_fn=self._close_write_side_in_child, + ) + except Exception: # OSError perhaps? + os.close(pipe_read_fd) + os.close(pipe_write_fd) + self._pipe_write_fd = None + raise + + os.close(pipe_read_fd) logger.info('Keepalive started for lease %s.', self.id) + def _close_write_side_in_child(self) -> None: + if self._pipe_write_fd is None: + return + os.close(self._pipe_write_fd) + def _stop_keepalive(self) -> None: """Terminate the keep-alive subprocess if it is running.""" + # Close the write side of the pipe to set EOF to the child's stdin + # and trigger the `read -r _` + if self._pipe_write_fd is not None: + try: + os.close(self._pipe_write_fd) + except OSError: + pass + finally: + self._pipe_write_fd = None + if self.keepalive_proc is None: return + + # Additional safeguard try: self.keepalive_proc.terminate() except ProcessLookupError: From bc29d8667643b2a0a3610e2739cbce2abd535908 Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Thu, 23 Apr 2026 20:44:50 +0200 Subject: [PATCH 07/15] feat: parameterize logs location --- .../rollingops/_rollingops_manager.py | 83 ++- .../rollingops/common/_base_worker.py | 15 +- .../src/charmlibs/rollingops/common/_utils.py | 8 +- .../src/charmlibs/rollingops/etcd/_backend.py | 37 +- 
.../rollingops/etcd/_certificates.py | 254 +++---- .../src/charmlibs/rollingops/etcd/_etcd.py | 71 +- .../src/charmlibs/rollingops/etcd/_etcdctl.py | 627 +++++++++--------- .../charmlibs/rollingops/etcd/_relations.py | 30 +- .../charmlibs/rollingops/etcd/_rollingops.py | 19 +- .../src/charmlibs/rollingops/etcd/_worker.py | 16 +- .../src/charmlibs/rollingops/peer/_backend.py | 11 +- .../charmlibs/rollingops/peer/_rollingops.py | 9 +- .../src/charmlibs/rollingops/peer/_worker.py | 14 +- .../integration/test_etcd_rolling_ops.py | 4 +- rollingops/tests/unit/conftest.py | 73 +- .../tests/unit/test_etcd_certificates.py | 54 +- rollingops/tests/unit/test_etcd_etcdctl.py | 34 +- .../unit/test_etcd_rollingops_in_charm.py | 113 +++- .../unit/test_peer_rollingops_in_charm.py | 47 +- 19 files changed, 887 insertions(+), 632 deletions(-) diff --git a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py index dbd34d1a7..66a8d22fc 100644 --- a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py +++ b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py @@ -20,6 +20,7 @@ from ops import CharmBase, Object, Relation, RelationBrokenEvent from ops.framework import EventBase +from pydantic import ValidationError from charmlibs.rollingops.common._exceptions import ( RollingOpsDecodingError, @@ -66,11 +67,12 @@ class RollingOpsManager(Object): def __init__( self, charm: CharmBase, - peer_relation_name: str, - etcd_relation_name: str, - cluster_id: str, callback_targets: dict[str, Any], + peer_relation_name: str, + etcd_relation_name: str | None = None, + cluster_id: str | None = None, sync_lock_targets: dict[str, type[SyncLockBackend]] | None = None, + base_dir: str = '/var/lib/rollingops', ): """Create a rolling operations manager with etcd and peer backends. @@ -86,16 +88,18 @@ def __init__( Args: charm: The charm instance owning this manager. 
+ callback_targets: Mapping of callback identifiers to callables + executed when queued operations are granted the lock. peer_relation_name: Name of the peer relation used for fallback state and operation mirroring. etcd_relation_name: Name of the relation providing etcd access. + If not provided, only peer backend is used. cluster_id: Identifier used to scope etcd-backed state for this rolling-ops instance. - callback_targets: Mapping of callback identifiers to callables - executed when queued operations are granted the lock. sync_lock_targets: Optional mapping of sync lock backend identifiers to backend implementations used when acquiring synchronous locks through the peer fallback path. + base_dir: base directory where all files related to rollingops will be written. """ super().__init__(charm, 'rolling-ops-manager') @@ -110,17 +114,23 @@ def __init__( charm=charm, relation_name=peer_relation_name, callback_targets=callback_targets, + base_dir=base_dir, ) - self.etcd_backend = EtcdRollingOpsBackend( - charm=charm, - peer_relation_name=peer_relation_name, - etcd_relation_name=etcd_relation_name, - cluster_id=cluster_id, - callback_targets=callback_targets, - ) - self.framework.observe( - charm.on[self.etcd_relation_name].relation_broken, self._on_etcd_relation_broken - ) + self.etcd_backend: EtcdRollingOpsBackend | None = None + if etcd_relation_name and cluster_id: + self.etcd_backend = EtcdRollingOpsBackend( + charm=charm, + peer_relation_name=peer_relation_name, + etcd_relation_name=etcd_relation_name, + cluster_id=cluster_id, + callback_targets=callback_targets, + base_dir=base_dir, + ) + self.framework.observe( + charm.on[etcd_relation_name].relation_broken, + self._on_etcd_relation_broken, + ) + self.framework.observe(charm.on.rollingops_lock_granted, self._on_rollingops_lock_granted) self.framework.observe(charm.on.rollingops_etcd_failed, self._on_rollingops_etcd_failed) self.framework.observe(charm.on.update_status, self._on_update_status) @@ -158,6 
+168,10 @@ def _select_processing_backend(self) -> ProcessingBackend: Returns: The selected processing backend. """ + if self.etcd_backend is None: + logger.info('etcd backend not configured; selecting peer backend.') + return ProcessingBackend.PEER + if not self.etcd_backend.is_available(): logger.info('etcd backend unavailable; selecting peer backend.') return ProcessingBackend.PEER @@ -183,7 +197,8 @@ def _fallback_current_unit_to_peer(self) -> None: so that queued operations can continue without being lost. """ self._backend_state.fallback_to_peer() - self.etcd_backend.worker.stop() + if self.etcd_backend is not None: + self.etcd_backend.worker.stop() self.peer_backend.ensure_processing() def request_async_lock( @@ -240,7 +255,7 @@ def request_async_lock( 'Failed to persists operation in peer backend.' ) from e - if backend == ProcessingBackend.ETCD: + if backend == ProcessingBackend.ETCD and self.etcd_backend is not None: try: self.etcd_backend.enqueue_operation(operation) except Exception as e: @@ -250,7 +265,7 @@ def request_async_lock( ) backend = ProcessingBackend.PEER - if backend == ProcessingBackend.ETCD: + if backend == ProcessingBackend.ETCD and self.etcd_backend is not None: self.etcd_backend.ensure_processing() else: self._fallback_current_unit_to_peer() @@ -280,6 +295,11 @@ def _run_etcd_and_mirror_or_fallback(self) -> None: If etcd execution fails or mirrored state becomes inconsistent, the manager falls back to the peer backend and resumes processing there. """ + if self.etcd_backend is None: + logger.info('etcd backend not configured; using peer backend.') + self._fallback_current_unit_to_peer() + return + try: logger.info('Executing rollingop on etcd backend.') outcome = self.etcd_backend._on_run_with_lock() @@ -364,7 +384,7 @@ def acquire_sync_lock(self, backend_id: str, timeout: int): times out. RollingOpsSyncLockError: if there is an error when acquiring the lock. 
""" - if self.etcd_backend.is_available(): + if self.etcd_backend is not None and self.etcd_backend.is_available(): logger.info('Acquiring sync lock on etcd.') try: self.etcd_backend.acquire_sync_lock(timeout) @@ -429,7 +449,7 @@ def state(self) -> RollingOpsState: ) status = self.peer_backend.get_status() - if self._backend_state.is_etcd_managed(): + if self.etcd_backend is not None and self._backend_state.is_etcd_managed(): status = self.etcd_backend.get_status() if status == RollingOpsStatus.UNAVAILABLE: logger.info('etcd backend is not available. Falling back to peer backend.') @@ -443,11 +463,32 @@ def state(self) -> RollingOpsState: operations=operations.queue, ) + def is_waiting(self, callback_id: str, kwargs: dict[str, Any] | None = None) -> bool: + """Return whether the current unit has a pending operation matching callback and kwargs.""" + if self._peer_relation is None: + return False + + operations = PeerUnitOperations( + self.model, + self.peer_relation_name, + self.model.unit, + ).queue.operations + + kwargs = kwargs or {} + + try: + check_operation = Operation.create(callback_id=callback_id, kwargs=kwargs) + print(operations) + except ValidationError: + return False + + return any(op == check_operation for op in operations) + def _on_update_status(self, event: EventBase) -> None: """Periodic reconciliation of rolling-ops state.""" logger.info('Received a update-status event.') if self._backend_state.is_etcd_managed(): - if not self.etcd_backend.is_available(): + if self.etcd_backend is None or not self.etcd_backend.is_available(): logger.warning('etcd unavailable during update_status; falling back.') self._fallback_current_unit_to_peer() return diff --git a/rollingops/src/charmlibs/rollingops/common/_base_worker.py b/rollingops/src/charmlibs/rollingops/common/_base_worker.py index c444b344d..9c6b925f3 100644 --- a/rollingops/src/charmlibs/rollingops/common/_base_worker.py +++ b/rollingops/src/charmlibs/rollingops/common/_base_worker.py @@ -48,7 
+48,13 @@ class BaseRollingOpsAsyncWorker(Object): _pid_field: str _log_filename: str - def __init__(self, charm: CharmBase, handle_name: str, peer_relation_name: str): + def __init__( + self, + charm: CharmBase, + handle_name: str, + peer_relation_name: str, + base_dir: str, + ): """Initialize the base rolling-ops worker helper. Args: @@ -56,12 +62,14 @@ def __init__(self, charm: CharmBase, handle_name: str, peer_relation_name: str): handle_name: Framework handle name used for this worker object. peer_relation_name: Name of the peer relation used by subclasses to store and retrieve worker state. + base_dir: base directory used for logs in the background process. """ super().__init__(charm, handle_name) self._charm = charm self._charm_dir = charm.charm_dir self._peer_relation_name = peer_relation_name self._handle_name = handle_name + self._base_dir = base_dir @property def _relation(self) -> Relation | None: @@ -207,12 +215,15 @@ def start(self) -> None: worker = self._worker_script_path() env = self._build_env() - with open(f'{self._log_filename}', 'a') as log_out: + log_filename = pathops.LocalPath(self._base_dir) / self._log_filename + with open(log_filename, 'a') as log_out: pid = subprocess.Popen( [ '/usr/bin/python3', '-u', str(worker), + '--base-dir', + self._base_dir, '--unit-name', self.model.unit.name, '--charm-dir', diff --git a/rollingops/src/charmlibs/rollingops/common/_utils.py b/rollingops/src/charmlibs/rollingops/common/_utils.py index dfdddb221..ebfb58809 100644 --- a/rollingops/src/charmlibs/rollingops/common/_utils.py +++ b/rollingops/src/charmlibs/rollingops/common/_utils.py @@ -24,6 +24,7 @@ from ops import pebble from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed +from charmlibs import pathops from charmlibs.pathops import PebbleConnectionError logger = logging.getLogger(__name__) @@ -61,7 +62,8 @@ def datetime_to_str(dt: datetime) -> str: def setup_logging( - log_file: str, + base_dir: str, + log_filename: 
str, *, unit_name: str, cluster_id: str | None = None, @@ -76,11 +78,13 @@ def setup_logging( This functions is used in the context of the background process. Args: - log_file: Path to the log file where logs should be written. + base_dir: base directory used to write the rollingops files + log_filename: name of the file where logs should be written. unit_name: Juju unit name associated with the background process. cluster_id: Optional etcd cluster identifier. owner: Optional worker owner identifier. """ + log_file = pathops.LocalPath(base_dir) / log_filename handler = RotatingFileHandler( log_file, maxBytes=10 * 1024 * 1024, # 10 MB diff --git a/rollingops/src/charmlibs/rollingops/etcd/_backend.py b/rollingops/src/charmlibs/rollingops/etcd/_backend.py index ebc616d97..7299e22e4 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_backend.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_backend.py @@ -36,8 +36,8 @@ RunWithLockStatus, UnitBackendState, ) -from charmlibs.rollingops.etcd import _etcdctl as etcdctl from charmlibs.rollingops.etcd._etcd import EtcdLease, EtcdLock, ManagerOperationStore +from charmlibs.rollingops.etcd._etcdctl import ETCDCTL_CMD, Etcdctl from charmlibs.rollingops.etcd._models import RollingOpsKeys from charmlibs.rollingops.etcd._relations import EtcdRequiresV1, SharedClientCertificateManager from charmlibs.rollingops.etcd._worker import EtcdRollingOpsAsyncWorker @@ -63,6 +63,7 @@ def __init__( etcd_relation_name: str, cluster_id: str, callback_targets: dict[str, Any], + base_dir: str, ): """Initialize the etcd-backed rolling-ops backend. @@ -75,22 +76,31 @@ def __init__( instance. callback_targets: Mapping from callback identifiers to callables executed when an operation is granted the asynchronous lock. + base_dir: base directory where all files related to rollingops will be written. 
""" super().__init__(charm, 'etcd-rolling-ops-manager') self._charm = charm self.peer_relation_name = peer_relation_name self.etcd_relation_name = etcd_relation_name self.callback_targets = callback_targets + self._base_dir = base_dir + + self.etcdctl = Etcdctl(self._base_dir) owner = f'{self.model.uuid}-{self.model.unit.name}'.replace('/', '-') self.worker = EtcdRollingOpsAsyncWorker( - charm, peer_relation_name=peer_relation_name, owner=owner, cluster_id=cluster_id + charm, + peer_relation_name=peer_relation_name, + owner=owner, + cluster_id=cluster_id, + base_dir=self._base_dir, ) self.keys = RollingOpsKeys.for_owner(cluster_id=cluster_id, owner=owner) self.shared_certificates = SharedClientCertificateManager( charm, peer_relation_name=peer_relation_name, + base_dir=self._base_dir, ) self.etcd = EtcdRequiresV1( @@ -98,11 +108,16 @@ def __init__( relation_name=etcd_relation_name, cluster_id=self.keys.cluster_prefix, shared_certificates=self.shared_certificates, + base_dir=self._base_dir, + ) + self._async_lock = EtcdLock( + lock_key=self.keys.lock_key, owner=owner, base_dir=self._base_dir + ) + self._sync_lock = EtcdLock( + lock_key=self.keys.lock_key, owner=f'{owner}:sync', base_dir=self._base_dir ) - self._async_lock = EtcdLock(lock_key=self.keys.lock_key, owner=owner) - self._sync_lock = EtcdLock(lock_key=self.keys.lock_key, owner=f'{owner}:sync') self._lease: EtcdLease | None = None - self.operations_store = ManagerOperationStore(self.keys, owner) + self.operations_store = ManagerOperationStore(self.keys, owner, base_dir=self._base_dir) self.framework.observe( charm.on[self.peer_relation_name].relation_departed, self._on_peer_relation_departed @@ -133,7 +148,7 @@ def is_available(self) -> bool: if self._etcd_relation is None: return False try: - etcdctl.ensure_initialized() + self.etcdctl.ensure_initialized() except Exception: return False return True @@ -159,7 +174,7 @@ def enqueue_operation(self, operation: Operation) -> None: if self._etcd_relation is 
None: raise RollingOpsNoEtcdRelationError - etcdctl.ensure_initialized() + self.etcdctl.ensure_initialized() backend_state = UnitBackendState(self.model, self.peer_relation_name, self.model.unit) if backend_state.cleanup_needed: @@ -186,8 +201,8 @@ def _on_etcd_relation_created(self, event: RelationCreatedEvent) -> None: Args: event: The relation-created event for the etcd relation. """ - if not etcdctl.is_etcdctl_installed(): - logger.error('%s is not installed.', etcdctl.ETCDCTL_CMD) + if not self.etcdctl.is_etcdctl_installed(): + logger.error('%s is not installed.', ETCDCTL_CMD) def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None: """Handle removal of a unit from the peer relation. @@ -240,7 +255,7 @@ def request_async_lock( if not self._etcd_relation: raise RollingOpsNoEtcdRelationError - etcdctl.ensure_initialized() + self.etcdctl.ensure_initialized() if kwargs is None: kwargs = {} @@ -337,7 +352,7 @@ def acquire_sync_lock(self, timeout: int | None) -> None: TimeoutError: If the lock could not be acquired before the timeout. RollingOpsSyncLockError: if there was an error obtaining the lock. 
""" - self._lease = EtcdLease() + self._lease = EtcdLease(self._base_dir) deadline = None if timeout is None else time.monotonic() + timeout diff --git a/rollingops/src/charmlibs/rollingops/etcd/_certificates.py b/rollingops/src/charmlibs/rollingops/etcd/_certificates.py index 3c53bd939..0ff310242 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_certificates.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_certificates.py @@ -38,136 +38,138 @@ from charmlibs.rollingops.common._utils import with_pebble_retry from charmlibs.rollingops.etcd._models import SharedCertificate -BASE_DIR = pathops.LocalPath('/var/lib/rollingops/tls') -CA_CERT_PATH = BASE_DIR / 'client-ca.pem' -CLIENT_KEY_PATH = BASE_DIR / 'client.key' -CLIENT_CERT_PATH = BASE_DIR / 'client.pem' VALIDITY_DAYS = 365 * 50 KEY_SIZE = 4096 -def persist_client_cert_key_and_ca(shared: SharedCertificate) -> None: - """Persist the provided client certificate, key, and CA to disk. - - Raises: - PebbleConnectionError: if the remote container cannot be reached - RollingOpsFileSystemError: if there is a problem when writing the certificates - """ - if _has_client_cert_key_and_ca(shared): - return - try: - with_pebble_retry(lambda: BASE_DIR.mkdir(parents=True, exist_ok=True)) - shared.write_to_paths(CLIENT_CERT_PATH, CLIENT_KEY_PATH, CA_CERT_PATH) - - except (FileNotFoundError, LookupError, NotADirectoryError, PermissionError) as e: - raise RollingOpsFileSystemError('Failed to persist client certificates and key.') from e - - -def _has_client_cert_key_and_ca(shared: SharedCertificate) -> bool: - """Return whether the provided certificate material matches local files. 
- - Raises: - PebbleConnectionError: if the remote container cannot be reached - RollingOpsFileSystemError: if there is a problem when writing the certificates - """ - if not _exists(): - return False - try: - stored = SharedCertificate.from_paths( - CLIENT_CERT_PATH, - CLIENT_KEY_PATH, - CA_CERT_PATH, +class CertificateStore: + def __init__(self, base_dir: str): + self.base_dir = pathops.LocalPath(base_dir) / 'tls' + self.cert_path = self.base_dir / 'client.pem' + self.key_path = self.base_dir / 'client.key' + self.ca_path = self.base_dir / 'client-ca.pem' + + def persist_client_cert_key_and_ca(self, shared: SharedCertificate) -> None: + """Persist the provided client certificate, key, and CA to disk. + + Raises: + PebbleConnectionError: if the remote container cannot be reached + RollingOpsFileSystemError: if there is a problem when writing the certificates + """ + if self._has_client_cert_key_and_ca(shared): + return + try: + with_pebble_retry(lambda: self.base_dir.mkdir(parents=True, exist_ok=True)) + shared.write_to_paths(self.cert_path, self.key_path, self.ca_path) + + except (FileNotFoundError, LookupError, NotADirectoryError, PermissionError) as e: + raise RollingOpsFileSystemError( + 'Failed to persist client certificates and key.' + ) from e + + def _has_client_cert_key_and_ca(self, shared: SharedCertificate) -> bool: + """Return whether the provided certificate material matches local files. 
+ + Raises: + PebbleConnectionError: if the remote container cannot be reached + RollingOpsFileSystemError: if there is a problem when writing the certificates + """ + if not self._exists(): + return False + try: + stored = SharedCertificate.from_paths( + self.cert_path, + self.key_path, + self.ca_path, + ) + return stored == shared + + except ( + FileNotFoundError, + IsADirectoryError, + PermissionError, + TLSCertificatesError, + ValueError, + ) as e: + raise RollingOpsFileSystemError('Failed to read certificates and key.') from e + + def generate(self, model_uuid: str, app_name: str) -> SharedCertificate: + """Generate a client CA and client certificate if they do not exist. + + This method creates: + 1. A CA private key and self-signed CA certificate. + 2. A client private key. + 3. A certificate signing request (CSR) using the provided common name. + 4. A client certificate signed by the generated CA. + + The generated files are written to disk and reused in future runs. + If the certificates already exist, this method does nothing. + + Args: + model_uuid: string used to build the common name. + app_name: string used to build the common name. 
+ + Raises: + PebbleConnectionError: if the remote container cannot be reached + RollingOpsFileSystemError: if there is a problem when writing the certificates + """ + if self._exists(): + return SharedCertificate.from_paths( + self.cert_path, + self.key_path, + self.ca_path, + ) + + # Produce a unique <=64-character string + raw = f'{model_uuid}-{app_name}' + common_name = shortuuid.uuid(name=raw) + ca_key = PrivateKey.generate(key_size=KEY_SIZE) + ca_attributes = CertificateRequestAttributes( + common_name=common_name, + is_ca=True, + add_unique_id_to_subject_name=False, ) - return stored == shared - - except ( - FileNotFoundError, - IsADirectoryError, - PermissionError, - TLSCertificatesError, - ValueError, - ) as e: - raise RollingOpsFileSystemError('Failed to read certificates and key.') from e - - -def generate(model_uuid: str, app_name: str) -> SharedCertificate: - """Generate a client CA and client certificate if they do not exist. - - This method creates: - 1. A CA private key and self-signed CA certificate. - 2. A client private key. - 3. A certificate signing request (CSR) using the provided common name. - 4. A client certificate signed by the generated CA. - - The generated files are written to disk and reused in future runs. - If the certificates already exist, this method does nothing. - - Args: - model_uuid: string used to build the common name. - app_name: string used to build the common name. 
- - Raises: - PebbleConnectionError: if the remote container cannot be reached - RollingOpsFileSystemError: if there is a problem when writing the certificates - """ - if _exists(): - return SharedCertificate.from_paths( - CLIENT_CERT_PATH, - CLIENT_KEY_PATH, - CA_CERT_PATH, + ca_crt = Certificate.generate_self_signed_ca( + attributes=ca_attributes, + private_key=ca_key, + validity=timedelta(days=VALIDITY_DAYS), ) - # Produce a unique <=64-character string - raw = f'{model_uuid}-{app_name}' - common_name = shortuuid.uuid(name=raw) - ca_key = PrivateKey.generate(key_size=KEY_SIZE) - ca_attributes = CertificateRequestAttributes( - common_name=common_name, - is_ca=True, - add_unique_id_to_subject_name=False, - ) - ca_crt = Certificate.generate_self_signed_ca( - attributes=ca_attributes, - private_key=ca_key, - validity=timedelta(days=VALIDITY_DAYS), - ) - - client_key = PrivateKey.generate(key_size=KEY_SIZE) - - csr_attributes = CertificateRequestAttributes( - common_name=common_name, add_unique_id_to_subject_name=False - ) - csr = CertificateSigningRequest.generate( - attributes=csr_attributes, - private_key=client_key, - ) - - client_crt = Certificate.generate( - csr=csr, - ca=ca_crt, - ca_private_key=ca_key, - validity=timedelta(days=VALIDITY_DAYS), - is_ca=False, - ) - - shared = SharedCertificate( - certificate=client_crt, - key=client_key, - ca=ca_crt, - ) - - persist_client_cert_key_and_ca(shared) - return shared - - -def _exists() -> bool: - """Check whether the client certificates and CA certificate already exist. 
- - Raises: - PebbleConnectionError: if the remote container cannot be reached - """ - return ( - with_pebble_retry(lambda: CA_CERT_PATH.exists()) - and with_pebble_retry(lambda: CLIENT_KEY_PATH.exists()) - and with_pebble_retry(lambda: CLIENT_CERT_PATH.exists()) - ) + client_key = PrivateKey.generate(key_size=KEY_SIZE) + + csr_attributes = CertificateRequestAttributes( + common_name=common_name, add_unique_id_to_subject_name=False + ) + csr = CertificateSigningRequest.generate( + attributes=csr_attributes, + private_key=client_key, + ) + + client_crt = Certificate.generate( + csr=csr, + ca=ca_crt, + ca_private_key=ca_key, + validity=timedelta(days=VALIDITY_DAYS), + is_ca=False, + ) + + shared = SharedCertificate( + certificate=client_crt, + key=client_key, + ca=ca_crt, + ) + + self.persist_client_cert_key_and_ca(shared) + return shared + + def _exists(self) -> bool: + """Check whether the client certificates and CA certificate already exist. + + Raises: + PebbleConnectionError: if the remote container cannot be reached + """ + return ( + with_pebble_retry(lambda: self.ca_path.exists()) + and with_pebble_retry(lambda: self.key_path.exists()) + and with_pebble_retry(lambda: self.cert_path.exists()) + ) diff --git a/rollingops/src/charmlibs/rollingops/etcd/_etcd.py b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py index 5ce1aef7f..8ce7ff124 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_etcd.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py @@ -19,13 +19,13 @@ import subprocess import time -import charmlibs.rollingops.etcd._etcdctl as etcdctl from charmlibs.rollingops.common._exceptions import ( RollingOpsEtcdctlFatalError, RollingOpsEtcdctlParseError, RollingOpsEtcdTransactionError, ) from charmlibs.rollingops.common._models import Operation, OperationResult +from charmlibs.rollingops.etcd._etcdctl import ETCDCTL_CMD, Etcdctl from charmlibs.rollingops.etcd._models import RollingOpsKeys logger = logging.getLogger(__name__) @@ -36,14 +36,15 @@ class 
EtcdLease: """Manage the lifecycle of an etcd lease and its keep-alive process.""" - def __init__(self): + def __init__(self, base_dir: str): self.id: str | None = None self.keepalive_proc: subprocess.Popen[str] | None = None self._pipe_write_fd: int | None = None + self._etcdctl = Etcdctl(base_dir) def grant(self) -> None: """Create a new lease and start the keep-alive process.""" - res = etcdctl.run('lease', 'grant', LOCK_LEASE_TTL) + res = self._etcdctl.run('lease', 'grant', LOCK_LEASE_TTL) # parse: "lease 694d9c9aeca3422a granted with TTL(60s)" parts = res.split() try: @@ -61,7 +62,7 @@ def revoke(self) -> None: lease_id = self.id try: if self.id is not None: - etcdctl.run('lease', 'revoke', self.id) + self._etcdctl.run('lease', 'revoke', self.id) except Exception: logger.exception('Fail to revoke lease %s.', lease_id) raise @@ -79,19 +80,19 @@ def _start_lease_keepalive(self) -> None: if lease_id is None: logger.info('Lease ID is None. Keepalive for this lease cannot be started.') return - etcdctl.ensure_initialized() + self._etcdctl.ensure_initialized() pipe_read_fd, pipe_write_fd = os.pipe() self._pipe_write_fd = pipe_write_fd - keep_alive_cmd = f'{etcdctl.ETCDCTL_CMD} lease keep-alive {lease_id} /dev/null; wait' # noqa: E501 + keep_alive_cmd = f'{ETCDCTL_CMD} lease keep-alive {lease_id} /dev/null; wait' # noqa: E501 try: self.keepalive_proc = subprocess.Popen( ['bash', '-c', keep_alive_cmd], # The pipe read side becomes the child's stdin # so when the parent closes its write side, this stdin gets EOF stdin=pipe_read_fd, - env=etcdctl.load_env(), + env=self._etcdctl.load_env(), text=True, close_fds=True, preexec_fn=self._close_write_side_in_child, @@ -154,9 +155,10 @@ class EtcdLock: automatically released if the owner stops refreshing the lease. 
""" - def __init__(self, lock_key: str, owner: str): + def __init__(self, lock_key: str, owner: str, base_dir: str): self.lock_key = lock_key self.owner = owner + self._etcdctl = Etcdctl(base_dir) def try_acquire(self, lease_id: str) -> bool: """Attempt to acquire the lock. @@ -181,7 +183,7 @@ def try_acquire(self, lease_id: str) -> bool: """ - return etcdctl.txn(txn) + return self._etcdctl.txn(txn) def release(self) -> None: """Release the lock if it is currently held by this owner. @@ -200,13 +202,13 @@ def release(self) -> None: """ - etcdctl.txn(txn) + self._etcdctl.txn(txn) def is_held(self) -> bool: """Check whether the lock is currently held by the owner.""" if not self.lock_key or not self.owner: raise RollingOpsEtcdctlFatalError('Invalid input for check lock ownership operation.') - res = etcdctl.run('get', self.lock_key, '--print-value-only') + res = self._etcdctl.run('get', self.lock_key, '--print-value-only') return res == self.owner @@ -219,21 +221,22 @@ class EtcdOperationQueue: the value contains the serialized operation data. 
""" - def __init__(self, prefix: str, lock_key: str, owner: str): + def __init__(self, prefix: str, lock_key: str, owner: str, base_dir: str): self.prefix = prefix self.lock_key = lock_key self.owner = owner + self._etcdctl = Etcdctl(base_dir) def peek(self) -> Operation | None: """Return the first operation in the queue without removing it.""" - kv = etcdctl.get_first_key_value_pair(self.prefix) + kv = self._etcdctl.get_first_key_value_pair(self.prefix) if kv is None: return None return Operation.model_validate(kv.value) def _peek_last(self) -> Operation | None: """Return the last operation in the queue without removing it.""" - kv = etcdctl.get_last_key_value_pair(self.prefix) + kv = self._etcdctl.get_last_key_value_pair(self.prefix) if kv is None: return None return Operation.model_validate(kv.value) @@ -252,7 +255,7 @@ def move_head(self, to_queue_prefix: str) -> bool: Returns: True if the operation was moved successfully, otherwise False. """ - kv = etcdctl.get_first_key_value_pair(self.prefix) + kv = self._etcdctl.get_first_key_value_pair(self.prefix) if kv is None: return False @@ -270,7 +273,7 @@ def move_head(self, to_queue_prefix: str) -> bool: """ - return etcdctl.txn(txn) + return self._etcdctl.txn(txn) def move_operation(self, to_queue_prefix: str, operation: Operation) -> bool: """Move a specific operation from this queue to another queue. @@ -299,12 +302,12 @@ def move_operation(self, to_queue_prefix: str, operation: Operation) -> bool: """ - return etcdctl.txn(txn) + return self._etcdctl.txn(txn) def watch(self) -> Operation: """Block until at least one operation exists and return it.""" while True: - kv = etcdctl.get_first_key_value_pair(self.prefix) + kv = self._etcdctl.get_first_key_value_pair(self.prefix) if kv is not None: return Operation.model_validate(kv.value) time.sleep(10) @@ -318,7 +321,7 @@ def dequeue(self) -> bool: Returns: True if the operation was removed successfully, otherwise False. 
""" - kv = etcdctl.get_first_key_value_pair(self.prefix) + kv = self._etcdctl.get_first_key_value_pair(self.prefix) if kv is None: return False @@ -330,7 +333,7 @@ def dequeue(self) -> bool: """ - return etcdctl.txn(txn) + return self._etcdctl.txn(txn) def enqueue(self, operation: Operation) -> None: """Insert a new operation into the queue. @@ -353,11 +356,11 @@ def enqueue(self, operation: Operation) -> None: op_str = operation.to_string() key = f'{self.prefix}{operation.op_id}' - etcdctl.run('put', key, cmd_input=op_str) + self._etcdctl.run('put', key, cmd_input=op_str) logger.info('Operation %s added to the etcd queue.', operation.callback_id) def clear(self) -> None: - etcdctl.run('del', self.prefix, '--prefix') + self._etcdctl.run('del', self.prefix, '--prefix') class WorkerOperationStore: @@ -378,10 +381,14 @@ class WorkerOperationStore: - requeue or delete completed operations """ - def __init__(self, keys: RollingOpsKeys, owner: str): - self._pending = EtcdOperationQueue(keys.pending, keys.lock_key, owner) - self._inprogress = EtcdOperationQueue(keys.inprogress, keys.lock_key, owner) - self._completed = EtcdOperationQueue(keys.completed, keys.lock_key, owner) + def __init__(self, keys: RollingOpsKeys, owner: str, base_dir: str): + self._pending = EtcdOperationQueue(keys.pending, keys.lock_key, owner, base_dir=base_dir) + self._inprogress = EtcdOperationQueue( + keys.inprogress, keys.lock_key, owner, base_dir=base_dir + ) + self._completed = EtcdOperationQueue( + keys.completed, keys.lock_key, owner, base_dir=base_dir + ) def has_pending(self) -> bool: """Check whether there are pending operations. @@ -475,10 +482,14 @@ class ManagerOperationStore: Queue transitions and storage details remain encapsulated behind this API. 
""" - def __init__(self, keys: RollingOpsKeys, owner: str): - self._pending = EtcdOperationQueue(keys.pending, keys.lock_key, owner) - self._inprogress = EtcdOperationQueue(keys.inprogress, keys.lock_key, owner) - self._completed = EtcdOperationQueue(keys.completed, keys.lock_key, owner) + def __init__(self, keys: RollingOpsKeys, owner: str, base_dir: str): + self._pending = EtcdOperationQueue(keys.pending, keys.lock_key, owner, base_dir=base_dir) + self._inprogress = EtcdOperationQueue( + keys.inprogress, keys.lock_key, owner, base_dir=base_dir + ) + self._completed = EtcdOperationQueue( + keys.completed, keys.lock_key, owner, base_dir=base_dir + ) def request(self, operation: Operation) -> None: """Add a new operation to the pending queue. diff --git a/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py b/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py index e5eb4f1c3..e4ee4b334 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py @@ -25,7 +25,6 @@ import shutil import subprocess from dataclasses import asdict -from functools import lru_cache from tenacity import ( before_sleep_log, @@ -48,360 +47,358 @@ logger = logging.getLogger(__name__) -BASE_DIR = pathops.LocalPath('/var/lib/rollingops/etcd') -SERVER_CA_PATH = BASE_DIR / 'server-ca.pem' -CONFIG_FILE_PATH = BASE_DIR / 'etcdctl.json' ETCDCTL_CMD = 'etcdctl' ETCDCTL_TIMEOUT_SECONDS = 15 ETCDCTL_RETRY_ATTEMPTS = 12 ETCDCTL_RETRY_WAIT_SECONDS = 5 -@lru_cache(maxsize=1) -def is_etcdctl_installed() -> bool: - """Return whether the snap-provided etcdctl command is available.""" - return shutil.which(ETCDCTL_CMD) is not None - - -def write_trusted_server_ca(tls_ca_pem: str) -> None: - """Persist the etcd server CA certificate to disk. - - Args: - tls_ca_pem: PEM-encoded CA certificate. 
- - Raises: - PebbleConnectionError: if the remote container cannot be reached - RollingOpsFileSystemError: if there is a problem when writing the certificates - """ - try: - with_pebble_retry(lambda: BASE_DIR.mkdir(parents=True, exist_ok=True)) - with_pebble_retry(lambda: SERVER_CA_PATH.write_text(tls_ca_pem, mode=CERT_MODE)) - except (FileNotFoundError, LookupError, NotADirectoryError, PermissionError) as e: - raise RollingOpsFileSystemError('Failed to persist etcd trusted CA certificate.') from e - - -def write_config_file( - endpoints: str, - client_cert_path: pathops.LocalPath, - client_key_path: pathops.LocalPath, -) -> None: - """Create or update the etcdctl configuration JSON file. - - This function writes a JSON file containing the required ETCDCTL_* - variables used by etcdctl to connect to the etcd cluster. - - Args: - endpoints: Comma-separated list of etcd endpoints. - client_cert_path: Path to the client certificate. - client_key_path: Path to the client private key. - - Raises: - PebbleConnectionError: if the remote container cannot be reached - RollingOpsFileSystemError: if there is a problem when writing the certificates - """ - config = EtcdConfig( - endpoints=endpoints, - cacert_path=str(SERVER_CA_PATH), - cert_path=str(client_cert_path), - key_path=str(client_key_path), - ) - - try: - with_pebble_retry(lambda: BASE_DIR.mkdir(parents=True, exist_ok=True)) - with_pebble_retry( - lambda: CONFIG_FILE_PATH.write_text(json.dumps(asdict(config), indent=2), mode=0o600) +class Etcdctl: + def __init__(self, base_dir: str): + self.base_dir = pathops.LocalPath(base_dir) / 'etcd' + self.server_ca_path = self.base_dir / 'server-ca.pem' + self.config_file_path = self.base_dir / 'etcdctl.json' + + def is_etcdctl_installed(self) -> bool: + """Return whether the snap-provided etcdctl command is available.""" + return shutil.which(ETCDCTL_CMD) is not None + + def write_trusted_server_ca(self, tls_ca_pem: str) -> None: + """Persist the etcd server CA certificate to 
disk. + + Args: + tls_ca_pem: PEM-encoded CA certificate. + + Raises: + PebbleConnectionError: if the remote container cannot be reached + RollingOpsFileSystemError: if there is a problem when writing the certificates + """ + try: + with_pebble_retry(lambda: self.base_dir.mkdir(parents=True, exist_ok=True)) + with_pebble_retry(lambda: self.server_ca_path.write_text(tls_ca_pem, mode=CERT_MODE)) + except (FileNotFoundError, LookupError, NotADirectoryError, PermissionError) as e: + raise RollingOpsFileSystemError( + 'Failed to persist etcd trusted CA certificate.' + ) from e + + def write_config_file( + self, + endpoints: str, + client_cert_path: pathops.LocalPath, + client_key_path: pathops.LocalPath, + ) -> None: + """Create or update the etcdctl configuration JSON file. + + This function writes a JSON file containing the required ETCDCTL_* + variables used by etcdctl to connect to the etcd cluster. + + Args: + endpoints: Comma-separated list of etcd endpoints. + client_cert_path: Path to the client certificate. + client_key_path: Path to the client private key. + + Raises: + PebbleConnectionError: if the remote container cannot be reached + RollingOpsFileSystemError: if there is a problem when writing the certificates + """ + config = EtcdConfig( + endpoints=endpoints, + cacert_path=str(self.server_ca_path), + cert_path=str(client_cert_path), + key_path=str(client_key_path), ) - except (FileNotFoundError, LookupError, NotADirectoryError, PermissionError) as e: - raise RollingOpsFileSystemError('Failed to persist etcd config file.') from e - - -def _load_config() -> EtcdConfig: - """Load etcd configuration from disk. - - Raises: - RollingOpsEtcdNotConfiguredError: If the config file does not exist. - RollingOpsFileSystemError: if we faile to read the etcd configuration file or - file cannot be deserialized. 
- PebbleConnectionError: if the remote container cannot be reached - """ - if not with_pebble_retry(lambda: CONFIG_FILE_PATH.exists()): - raise RollingOpsEtcdNotConfiguredError( - f'etcdctl config file does not exist: {CONFIG_FILE_PATH}' - ) - - try: - data = json.loads(CONFIG_FILE_PATH.read_text()) - return EtcdConfig(**data) - except FileNotFoundError as e: - raise RollingOpsEtcdNotConfiguredError('etcd configuration file not found.') from e - except (IsADirectoryError, PermissionError) as e: - raise RollingOpsFileSystemError('Failed to read the etcd config file.') from e - except (json.JSONDecodeError, TypeError) as e: - raise RollingOpsFileSystemError('Invalid etcd configuration file format.') from e - - -def load_env() -> dict[str, str]: - """Return environment variables for etcdctl. - - Returns: A dictionary containing environment variables to pass to subprocess calls. - - Raises: - RollingOpsEtcdNotConfiguredError: If the environment file does not exist. - RollingOpsFileSystemError: if we fail to read the etcd configuration file or - the file cannot be deserialized. - PebbleConnectionError: if the remote container cannot be reached - """ - config = _load_config() - - env = os.environ.copy() - env.update({ - 'ETCDCTL_API': '3', - 'ETCDCTL_ENDPOINTS': config.endpoints, - 'ETCDCTL_CACERT': config.cacert_path, - 'ETCDCTL_CERT': config.cert_path, - 'ETCDCTL_KEY': config.key_path, - }) - return env - - -def ensure_initialized(): - """Checks whether the etcd config file for etcdctl is setup. - - Raises: - RollingOpsEtcdNotConfiguredError: if the etcd config file does not exist, etcd - server CA does not exist or etcdctl is not installed. - PebbleConnectionError: if the remote container cannot be reached. 
- """ - if not with_pebble_retry(lambda: CONFIG_FILE_PATH.exists()): - raise RollingOpsEtcdNotConfiguredError( - f'etcdctl config file does not exist: {CONFIG_FILE_PATH}' - ) - if not with_pebble_retry(lambda: SERVER_CA_PATH.exists()): - raise RollingOpsEtcdNotConfiguredError( - f'etcdctl server CA file does not exist: {SERVER_CA_PATH}' - ) - if not is_etcdctl_installed(): - raise RollingOpsEtcdNotConfiguredError(f'{ETCDCTL_CMD} is not installed.') - - -def cleanup() -> None: - """Removes the etcdctl env file and the trusted etcd server CA. - - Raises: - RollingOpsFileSystemError: if there is a problem when deleting the files. - PebbleConnectionError: if the remote container cannot be reached. - """ - try: - with_pebble_retry(lambda: SERVER_CA_PATH.unlink(missing_ok=True)) - with_pebble_retry(lambda: CONFIG_FILE_PATH.unlink(missing_ok=True)) - except (IsADirectoryError, PermissionError) as e: - raise RollingOpsFileSystemError('Failed to remove etcd config file and CA.') from e - - -def _is_retryable_stderr(stderr: str) -> bool: - """Return whether stderr looks like a transient etcd/client failure.""" - text = stderr.lower() - retryable_markers = ( - 'connection refused', - 'context deadline exceeded', - 'deadline exceeded', - 'temporarily unavailable', - 'transport is closing', - 'connection reset', - 'broken pipe', - 'unavailable', - 'leader changed', - 'etcdserver: request timed out', - ) - return any(marker in text for marker in retryable_markers) + try: + with_pebble_retry(lambda: self.base_dir.mkdir(parents=True, exist_ok=True)) + with_pebble_retry( + lambda: self.config_file_path.write_text( + json.dumps(asdict(config), indent=2), mode=0o600 + ) + ) + except (FileNotFoundError, LookupError, NotADirectoryError, PermissionError) as e: + raise RollingOpsFileSystemError('Failed to persist etcd config file.') from e + + def _load_config(self) -> EtcdConfig: + """Load etcd configuration from disk. 
+
+        Raises:
+            RollingOpsEtcdNotConfiguredError: If the config file does not exist.
+            RollingOpsFileSystemError: if we fail to read the etcd configuration file or
+                file cannot be deserialized.
+            PebbleConnectionError: if the remote container cannot be reached
+        """
+        if not with_pebble_retry(lambda: self.config_file_path.exists()):
+            raise RollingOpsEtcdNotConfiguredError(
+                f'etcdctl config file does not exist: {self.config_file_path}'
+            )
 
-@retry(
-    retry=retry_if_exception_type(RollingOpsEtcdctlRetryableError),
-    stop=stop_after_attempt(ETCDCTL_RETRY_ATTEMPTS),
-    wait=wait_fixed(ETCDCTL_RETRY_WAIT_SECONDS),
-    before_sleep=before_sleep_log(logger, logging.WARNING),
-    reraise=True,
-)
-def _run_checked(*args: str, cmd_input: str | None = None) -> subprocess.CompletedProcess[str]:
-    """Execute etcdctl and return the completed process.
-
-    Raises:
-        RollingOpsEtcdNotConfiguredError: if etcdctl is not configured.
-        PebbleConnectionError: if the remote container cannot be reached.
-        RollingOpsEtcdctlRetryableError: for transient command failures.
-        RollingOpsEtcdctlFatalError: for non-retryable command failures.
- """ - ensure_initialized() - - cmd = [ETCDCTL_CMD, *args] - - try: - res = subprocess.run( - cmd, - env=load_env(), - input=cmd_input, - text=True, - capture_output=True, - check=False, - timeout=ETCDCTL_TIMEOUT_SECONDS, - ) - except subprocess.TimeoutExpired as e: - logger.warning( - 'Timed out running etcdctl: cmd=%r stdout=%r stderr=%r', cmd, e.stdout, e.stderr - ) - raise RollingOpsEtcdctlRetryableError(f'Timed out running etcdctl: {cmd!r}') from e - except FileNotFoundError as e: - logger.exception('etcdctl executable not found: %s', ETCDCTL_CMD) - raise RollingOpsEtcdctlFatalError(f'etcdctl executable not found: {ETCDCTL_CMD}') from e - except OSError as e: - logger.exception('Failed to execute etcdctl: cmd=%r', cmd) - raise RollingOpsEtcdctlFatalError(f'Failed to execute etcdctl: {cmd!r}') from e - - if res.returncode != 0: - logger.warning( - 'etcdctl command failed: cmd=%r returncode=%s stdout=%r stderr=%r', - cmd, - res.returncode, - res.stdout, - res.stderr, - ) - if _is_retryable_stderr(res.stderr): - raise RollingOpsEtcdctlRetryableError( - f'Retryable etcdctl failure (rc={res.returncode}): {res.stderr.strip()}' + try: + data = json.loads(self.config_file_path.read_text()) + return EtcdConfig(**data) + except FileNotFoundError as e: + raise RollingOpsEtcdNotConfiguredError('etcd configuration file not found.') from e + except (IsADirectoryError, PermissionError) as e: + raise RollingOpsFileSystemError('Failed to read the etcd config file.') from e + except (json.JSONDecodeError, TypeError) as e: + raise RollingOpsFileSystemError('Invalid etcd configuration file format.') from e + + def load_env(self) -> dict[str, str]: + """Return environment variables for etcdctl. + + Returns: A dictionary containing environment variables to pass to subprocess calls. + + Raises: + RollingOpsEtcdNotConfiguredError: If the environment file does not exist. + RollingOpsFileSystemError: if we fail to read the etcd configuration file or + the file cannot be deserialized. 
+ PebbleConnectionError: if the remote container cannot be reached + """ + config = self._load_config() + + env = os.environ.copy() + env.update({ + 'ETCDCTL_API': '3', + 'ETCDCTL_ENDPOINTS': config.endpoints, + 'ETCDCTL_CACERT': config.cacert_path, + 'ETCDCTL_CERT': config.cert_path, + 'ETCDCTL_KEY': config.key_path, + }) + return env + + def ensure_initialized(self): + """Checks whether the etcd config file for etcdctl is setup. + + Raises: + RollingOpsEtcdNotConfiguredError: if the etcd config file does not exist, etcd + server CA does not exist or etcdctl is not installed. + PebbleConnectionError: if the remote container cannot be reached. + """ + if not with_pebble_retry(lambda: self.config_file_path.exists()): + raise RollingOpsEtcdNotConfiguredError( + f'etcdctl config file does not exist: {self.config_file_path}' ) - raise RollingOpsEtcdctlFatalError( - f'etcdctl failed (rc={res.returncode}): {res.stderr.strip()}' + if not with_pebble_retry(lambda: self.server_ca_path.exists()): + raise RollingOpsEtcdNotConfiguredError( + f'etcdctl server CA file does not exist: {self.server_ca_path}' + ) + if not self.is_etcdctl_installed(): + raise RollingOpsEtcdNotConfiguredError(f'{ETCDCTL_CMD} is not installed.') + + def cleanup(self) -> None: + """Removes the etcdctl env file and the trusted etcd server CA. + + Raises: + RollingOpsFileSystemError: if there is a problem when deleting the files. + PebbleConnectionError: if the remote container cannot be reached. 
+ """ + try: + with_pebble_retry(lambda: self.server_ca_path.unlink(missing_ok=True)) + with_pebble_retry(lambda: self.config_file_path.unlink(missing_ok=True)) + except (IsADirectoryError, PermissionError) as e: + raise RollingOpsFileSystemError('Failed to remove etcd config file and CA.') from e + + def _is_retryable_stderr(self, stderr: str) -> bool: + """Return whether stderr looks like a transient etcd/client failure.""" + text = stderr.lower() + retryable_markers = ( + 'connection refused', + 'context deadline exceeded', + 'deadline exceeded', + 'temporarily unavailable', + 'transport is closing', + 'connection reset', + 'broken pipe', + 'unavailable', + 'leader changed', + 'etcdserver: request timed out', ) + return any(marker in text for marker in retryable_markers) + + @retry( + retry=retry_if_exception_type(RollingOpsEtcdctlRetryableError), + stop=stop_after_attempt(ETCDCTL_RETRY_ATTEMPTS), + wait=wait_fixed(ETCDCTL_RETRY_WAIT_SECONDS), + before_sleep=before_sleep_log(logger, logging.WARNING), + reraise=True, + ) + def _run_checked( + self, *args: str, cmd_input: str | None = None + ) -> subprocess.CompletedProcess[str]: + """Execute etcdctl and return the completed process. + + Raises: + RollingOpsEtcdNotConfiguredError: if etcdctl is not configured. + PebbleConnectionError: if the remote container cannot be reached. + RollingOpsEtcdctlRetryableError: for transient command failures. + RollingOpsEtcdctlFatalError: for non-retryable command failures. 
+ """ + self.ensure_initialized() + + cmd = [ETCDCTL_CMD, *args] + + try: + res = subprocess.run( + cmd, + env=self.load_env(), + input=cmd_input, + text=True, + capture_output=True, + check=False, + timeout=ETCDCTL_TIMEOUT_SECONDS, + ) + except subprocess.TimeoutExpired as e: + logger.warning( + 'Timed out running etcdctl: cmd=%r stdout=%r stderr=%r', cmd, e.stdout, e.stderr + ) + raise RollingOpsEtcdctlRetryableError(f'Timed out running etcdctl: {cmd!r}') from e + except FileNotFoundError as e: + logger.exception('etcdctl executable not found: %s', ETCDCTL_CMD) + raise RollingOpsEtcdctlFatalError( + f'etcdctl executable not found: {ETCDCTL_CMD}' + ) from e + except OSError as e: + logger.exception('Failed to execute etcdctl: cmd=%r', cmd) + raise RollingOpsEtcdctlFatalError(f'Failed to execute etcdctl: {cmd!r}') from e + + if res.returncode != 0: + logger.warning( + 'etcdctl command failed: cmd=%r returncode=%s stdout=%r stderr=%r', + cmd, + res.returncode, + res.stdout, + res.stderr, + ) + if self._is_retryable_stderr(res.stderr): + raise RollingOpsEtcdctlRetryableError( + f'Retryable etcdctl failure (rc={res.returncode}): {res.stderr.strip()}' + ) + raise RollingOpsEtcdctlFatalError( + f'etcdctl failed (rc={res.returncode}): {res.stderr.strip()}' + ) - logger.debug('etcdctl command succeeded: cmd=%r stdout=%r', cmd, res.stdout) - return res - - -def run(*args: str, cmd_input: str | None = None) -> str: - """Execute an etcdctl command. - - Args: - args: List of arguments to pass to etcdctl. - cmd_input: value to use as input when running the command. - - Returns: - The stdout of the command, stripped, or None if execution failed. - - Raises: - RollingOpsEtcdNotConfiguredError: if etcdctl is not configured. - RollingOpsFileSystemError: if configuration cannot be read. - PebbleConnectionError: if the remote container cannot be reached. - RollingOpsEtcdctlError: etcdctl command error. 
- """ - return _run_checked(*args, cmd_input=cmd_input).stdout.strip() + logger.debug('etcdctl command succeeded: cmd=%r stdout=%r', cmd, res.stdout) + return res + def run(self, *args: str, cmd_input: str | None = None) -> str: + """Execute an etcdctl command. -def _get_key_value_pair(key_prefix: str, *extra_args: str) -> EtcdKV | None: - """Retrieve the first key and value under a given prefix. + Args: + args: List of arguments to pass to etcdctl. + cmd_input: value to use as input when running the command. - Args: - key_prefix: Key prefix to search for. - extra_args: Arguments to the get command + Returns: + The stdout of the command, stripped, or None if execution failed. - Returns: - A EtcdKV containing: - - The key string - - The parsed JSON value as a dictionary + Raises: + RollingOpsEtcdNotConfiguredError: if etcdctl is not configured. + RollingOpsFileSystemError: if configuration cannot be read. + PebbleConnectionError: if the remote container cannot be reached. + RollingOpsEtcdctlError: etcdctl command error. + """ + return self._run_checked(*args, cmd_input=cmd_input).stdout.strip() - Returns None if no key exists. + def _get_key_value_pair(self, key_prefix: str, *extra_args: str) -> EtcdKV | None: + """Retrieve the first key and value under a given prefix. - Raises: - RollingOpsEtcdctlParseError: if the output is malformed + Args: + key_prefix: Key prefix to search for. + extra_args: Arguments to the get command - """ - res = run('get', key_prefix, '--prefix', *extra_args) - out = res.splitlines() - if len(out) < 2: - return None + Returns: + A EtcdKV containing: + - The key string + - The parsed JSON value as a dictionary - try: - value = json.loads(out[1]) - except json.JSONDecodeError as e: - raise RollingOpsEtcdctlParseError( - f'Failed to parse JSON value for key {out[0]}: {out[1]}' - ) from e + Returns None if no key exists. 
- return EtcdKV(key=out[0], value=value) + Raises: + RollingOpsEtcdctlParseError: if the output is malformed + """ + res = self.run('get', key_prefix, '--prefix', *extra_args) + out = res.splitlines() + if len(out) < 2: + return None -def get_first_key_value_pair(key_prefix: str) -> EtcdKV | None: - """Retrieve the first key and value under a given prefix. + try: + value = json.loads(out[1]) + except json.JSONDecodeError as e: + raise RollingOpsEtcdctlParseError( + f'Failed to parse JSON value for key {out[0]}: {out[1]}' + ) from e - Args: - key_prefix: Key prefix to search for. + return EtcdKV(key=out[0], value=value) - Returns: - A tuple containing: - - The key string - - The parsed JSON value as a dictionary + def get_first_key_value_pair(self, key_prefix: str) -> EtcdKV | None: + """Retrieve the first key and value under a given prefix. - Returns None if no key exists or the command fails. + Args: + key_prefix: Key prefix to search for. - Raises: - RollingOpsEtcdctlParseError: if the output is malformed - """ - return _get_key_value_pair(key_prefix, '--limit=1') + Returns: + A tuple containing: + - The key string + - The parsed JSON value as a dictionary + Returns None if no key exists or the command fails. -def get_last_key_value_pair(key_prefix: str) -> EtcdKV | None: - """Retrieve the last key and value under a given prefix. + Raises: + RollingOpsEtcdctlParseError: if the output is malformed + """ + return self._get_key_value_pair(key_prefix, '--limit=1') - Args: - key_prefix: Key prefix to search for. + def get_last_key_value_pair(self, key_prefix: str) -> EtcdKV | None: + """Retrieve the last key and value under a given prefix. - Returns: - A tuple containing: - - The key string - - The parsed JSON value as a dictionary + Args: + key_prefix: Key prefix to search for. - Returns None if no key exists or the command fails. 
+ Returns: + A tuple containing: + - The key string + - The parsed JSON value as a dictionary - Raises: - RollingOpsEtcdctlParseError: if the output is malformed - """ - return _get_key_value_pair( - key_prefix, - '--sort-by=KEY', - '--order=DESCEND', - '--limit=1', - ) + Returns None if no key exists or the command fails. + Raises: + RollingOpsEtcdctlParseError: if the output is malformed + """ + return self._get_key_value_pair( + key_prefix, + '--sort-by=KEY', + '--order=DESCEND', + '--limit=1', + ) -def txn(txn_input: str) -> bool: - """Execute an etcd transaction. + def txn(self, txn_input: str) -> bool: + """Execute an etcd transaction. - The transaction string should follow the etcdctl transaction format - where comparison statements are followed by operations. + The transaction string should follow the etcdctl transaction format + where comparison statements are followed by operations. - Args: - txn_input: The transaction specification passed to `etcdctl txn`. + Args: + txn_input: The transaction specification passed to `etcdctl txn`. - Returns: - True if the transaction succeeded, otherwise False. + Returns: + True if the transaction succeeded, otherwise False. - Raises: - RollingOpsEtcdNotConfiguredError: if etcdctl is not configured. - PebbleConnectionError: if the remote container cannot be reached. - RollingOpsEtcdctlError: etcdctl command error. - RollingOpsEtcdctlParseError: if invalid response is found - """ - res = _run_checked('txn', cmd_input=txn_input) + Raises: + RollingOpsEtcdNotConfiguredError: if etcdctl is not configured. + PebbleConnectionError: if the remote container cannot be reached. + RollingOpsEtcdctlError: etcdctl command error. 
+ RollingOpsEtcdctlParseError: if invalid response is found + """ + res = self._run_checked('txn', cmd_input=txn_input) - lines = res.stdout.splitlines() - if not lines: - raise RollingOpsEtcdctlParseError('Empty txn response') + lines = res.stdout.splitlines() + if not lines: + raise RollingOpsEtcdctlParseError('Empty txn response') - first_line = lines[0].strip() + first_line = lines[0].strip() - if first_line == 'SUCCESS': - return True - if first_line == 'FAILURE': - return False + if first_line == 'SUCCESS': + return True + if first_line == 'FAILURE': + return False - raise RollingOpsEtcdctlParseError(f'Unexpected txn response: {res.stdout}') + raise RollingOpsEtcdctlParseError(f'Unexpected txn response: {res.stdout}') diff --git a/rollingops/src/charmlibs/rollingops/etcd/_relations.py b/rollingops/src/charmlibs/rollingops/etcd/_relations.py index 8d37f92d8..191bd51b1 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_relations.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_relations.py @@ -33,8 +33,8 @@ from charmlibs.interfaces.tls_certificates import Certificate, TLSCertificatesError from charmlibs.rollingops.common._exceptions import RollingOpsInvalidSecretContentError -from charmlibs.rollingops.etcd import _certificates as certificates -from charmlibs.rollingops.etcd import _etcdctl as etcdctl +from charmlibs.rollingops.etcd._certificates import CertificateStore +from charmlibs.rollingops.etcd._etcdctl import Etcdctl from charmlibs.rollingops.etcd._models import SharedCertificate logger = logging.getLogger(__name__) @@ -48,10 +48,11 @@ class SharedClientCertificateManager(Object): """Manage the shared rollingops client certificate via peer relation secret.""" - def __init__(self, charm: CharmBase, peer_relation_name: str) -> None: + def __init__(self, charm: CharmBase, peer_relation_name: str, base_dir: str) -> None: super().__init__(charm, 'shared-client-certificate') self.charm = charm self.peer_relation_name = peer_relation_name + 
self.certificates_store = CertificateStore(base_dir) self.framework.observe(charm.on.leader_elected, self._on_leader_elected) self.framework.observe( @@ -112,7 +113,7 @@ def create_and_share_certificate(self) -> None: ) return - shared = certificates.generate(self.model.uuid, self.model.app.name) + shared = self.certificates_store.generate(self.model.uuid, self.model.app.name) secret = self.model.app.add_secret( content={ @@ -176,7 +177,7 @@ def sync_to_local_files(self) -> None: logger.info('Shared rollingops etcd client certificate is not available yet.') return - certificates.persist_client_cert_key_and_ca(shared) + self.certificates_store.persist_client_cert_key_and_ca(shared) def get_local_request_cert(self) -> Certificate | None: """Return the cert to place in relation requests.""" @@ -193,11 +194,14 @@ def __init__( relation_name: str, cluster_id: str, shared_certificates: SharedClientCertificateManager, + base_dir: str, ) -> None: super().__init__(charm, f'requirer-{relation_name}') self.charm = charm self.cluster_id = cluster_id self.shared_certificates = shared_certificates + self.certificates_store = CertificateStore(base_dir) + self.etcdctl = Etcdctl(base_dir) self.etcd_interface = ResourceRequirerEventHandler( self.charm, @@ -220,7 +224,7 @@ def etcd_relation(self) -> Relation | None: def _on_relation_broken(self, event: RelationBrokenEvent) -> None: """Remove the stored information about the etcd server.""" - etcdctl.cleanup() + self.etcdctl.cleanup() def _on_endpoints_changed( self, event: ResourceEndpointsChangedEvent[ResourceProviderModel] @@ -240,10 +244,10 @@ def _on_endpoints_changed( logger.info('etcd endpoints changed: %s', response.endpoints) - etcdctl.write_config_file( + self.etcdctl.write_config_file( endpoints=response.endpoints, - client_cert_path=certificates.CLIENT_CERT_PATH, - client_key_path=certificates.CLIENT_KEY_PATH, + client_cert_path=self.certificates_store.cert_path, + client_key_path=self.certificates_store.key_path, ) def 
_on_resource_created(self, event: ResourceCreatedEvent[ResourceProviderModel]) -> None: @@ -260,16 +264,16 @@ def _on_resource_created(self, event: ResourceCreatedEvent[ResourceProviderModel ) return - etcdctl.write_trusted_server_ca(tls_ca_pem=response.tls_ca) + self.etcdctl.write_trusted_server_ca(tls_ca_pem=response.tls_ca) if not response.endpoints: logger.error('Received a resource created event but no etcd endpoints available.') return - etcdctl.write_config_file( + self.etcdctl.write_config_file( endpoints=response.endpoints, - client_cert_path=certificates.CLIENT_CERT_PATH, - client_key_path=certificates.CLIENT_KEY_PATH, + client_cert_path=self.certificates_store.cert_path, + client_key_path=self.certificates_store.key_path, ) def client_requests(self) -> list[RequirerCommonModel]: diff --git a/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py b/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py index 9d45b183a..228f7d457 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py @@ -64,6 +64,12 @@ def main(): before exiting. 
""" parser = argparse.ArgumentParser(description='RollingOps etcd worker') + parser.add_argument( + '--base-dir', + type=str, + required=True, + help='Base directory used to store all rollingops files.', + ) parser.add_argument( '--unit-name', type=str, @@ -91,17 +97,22 @@ def main(): ) args = parser.parse_args() + base_dir = args.base_dir setup_logging( - ETCD_LOG_FILENAME, unit_name=args.unit_name, owner=args.owner, cluster_id=args.cluster_id + base_dir=base_dir, + log_filename=ETCD_LOG_FILENAME, + unit_name=args.unit_name, + owner=args.owner, + cluster_id=args.cluster_id, ) logger.info('Starting worker.') time.sleep(INITIAL_SLEEP) keys = RollingOpsKeys.for_owner(args.cluster_id, args.owner) - lock = EtcdLock(keys.lock_key, args.owner) - lease = EtcdLease() - operations = WorkerOperationStore(keys, args.owner) + lock = EtcdLock(keys.lock_key, args.owner, base_dir) + lease = EtcdLease(base_dir) + operations = WorkerOperationStore(keys, args.owner, base_dir) try: while True: diff --git a/rollingops/src/charmlibs/rollingops/etcd/_worker.py b/rollingops/src/charmlibs/rollingops/etcd/_worker.py index 8240b4378..d5535687e 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_worker.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_worker.py @@ -23,7 +23,8 @@ logger = logging.getLogger(__name__) -ETCD_LOG_FILENAME = '/var/log/etcd_rollingops_worker.log' +ETCD_LOG_FILENAME = 'etcd_rollingops_worker.log' +WORKER_PID_FIELD = 'etcd-rollingops-worker-pid' class EtcdRollingOpsAsyncWorker(BaseRollingOpsAsyncWorker): @@ -35,11 +36,18 @@ class EtcdRollingOpsAsyncWorker(BaseRollingOpsAsyncWorker): manage its own worker lifecycle. 
""" - _pid_field = 'etcd-rollingops-worker-pid' + _pid_field = WORKER_PID_FIELD _log_filename = ETCD_LOG_FILENAME - def __init__(self, charm: CharmBase, peer_relation_name: str, owner: str, cluster_id: str): - super().__init__(charm, 'etcd-rollingops-async-worker', peer_relation_name) + def __init__( + self, charm: CharmBase, peer_relation_name: str, owner: str, cluster_id: str, base_dir: str + ): + super().__init__( + charm, + 'etcd-rollingops-async-worker', + peer_relation_name, + base_dir=base_dir, + ) self._owner = owner self._cluster_id = cluster_id diff --git a/rollingops/src/charmlibs/rollingops/peer/_backend.py b/rollingops/src/charmlibs/rollingops/peer/_backend.py index 310285c95..a6fa89825 100644 --- a/rollingops/src/charmlibs/rollingops/peer/_backend.py +++ b/rollingops/src/charmlibs/rollingops/peer/_backend.py @@ -197,7 +197,11 @@ class PeerRollingOpsBackend(Object): """ def __init__( - self, charm: CharmBase, relation_name: str, callback_targets: dict[str, Callable[..., Any]] + self, + charm: CharmBase, + relation_name: str, + callback_targets: dict[str, Callable[..., Any]], + base_dir: str, ): """Initialize the peer-backed rolling-ops backend. @@ -207,12 +211,15 @@ def __init__( operation state. callback_targets: Mapping from callback identifiers to callables executed when this unit is granted the lock. + base_dir: base directory where all files related to rollingops will be written. 
""" super().__init__(charm, 'peer-rolling-ops-manager') self._charm = charm self.relation_name = relation_name self.callback_targets = callback_targets - self.worker = PeerRollingOpsAsyncWorker(charm, relation_name=relation_name) + self.worker = PeerRollingOpsAsyncWorker( + charm, relation_name=relation_name, base_dir=base_dir + ) self.framework.observe( charm.on[self.relation_name].relation_changed, self._on_relation_changed diff --git a/rollingops/src/charmlibs/rollingops/peer/_rollingops.py b/rollingops/src/charmlibs/rollingops/peer/_rollingops.py index 8dae3c0f1..b8c89b236 100644 --- a/rollingops/src/charmlibs/rollingops/peer/_rollingops.py +++ b/rollingops/src/charmlibs/rollingops/peer/_rollingops.py @@ -24,6 +24,12 @@ def main(): """Juju hook event dispatcher.""" parser = argparse.ArgumentParser(description='RollingOps peer worker') + parser.add_argument( + '--base-dir', + type=str, + required=True, + help='Base directory used to store all rollingops files.', + ) parser.add_argument( '--unit-name', type=str, @@ -37,7 +43,8 @@ def main(): help='Path to the charm directory', ) args = parser.parse_args() - setup_logging(PEER_LOG_FILENAME, unit_name=args.unit_name) + base_dir = args.base_dir + setup_logging(base_dir=base_dir, log_filename=PEER_LOG_FILENAME, unit_name=args.unit_name) # Sleep so that the leader unit can properly leave the hook and start a new one time.sleep(10) diff --git a/rollingops/src/charmlibs/rollingops/peer/_worker.py b/rollingops/src/charmlibs/rollingops/peer/_worker.py index ca4da358c..410a760f4 100644 --- a/rollingops/src/charmlibs/rollingops/peer/_worker.py +++ b/rollingops/src/charmlibs/rollingops/peer/_worker.py @@ -26,7 +26,8 @@ logger = logging.getLogger(__name__) -PEER_LOG_FILENAME = '/var/log/peer_rollingops_worker.log' +PEER_LOG_FILENAME = 'peer_rollingops_worker.log' +WORKER_PID_FIELD = 'peer-rollingops-worker-pid' class PeerRollingOpsAsyncWorker(BaseRollingOpsAsyncWorker): @@ -38,11 +39,16 @@ class 
PeerRollingOpsAsyncWorker(BaseRollingOpsAsyncWorker): stop, or restart an existing worker process as needed. """ - _pid_field = 'peer-rollingops-worker-pid' + _pid_field = WORKER_PID_FIELD _log_filename = PEER_LOG_FILENAME - def __init__(self, charm: CharmBase, relation_name: str): - super().__init__(charm, 'peer-rollingops-async-worker', relation_name) + def __init__(self, charm: CharmBase, relation_name: str, base_dir: str): + super().__init__( + charm, + 'peer-rollingops-async-worker', + relation_name, + base_dir=base_dir, + ) @property def _app_data(self) -> RelationDataContent: diff --git a/rollingops/tests/integration/test_etcd_rolling_ops.py b/rollingops/tests/integration/test_etcd_rolling_ops.py index 4172d416e..662918c97 100644 --- a/rollingops/tests/integration/test_etcd_rolling_ops.py +++ b/rollingops/tests/integration/test_etcd_rolling_ops.py @@ -31,8 +31,8 @@ logger = logging.getLogger(__name__) TIMEOUT = 15 * 60.0 -ETCD_PROCESS_LOGS = '/var/log/etcd_rollingops_worker.log' -PEER_PROCCES_LOGS = '/var/log/peer_rollingops_worker.log' +ETCD_PROCESS_LOGS = '/var/lib/rollingops/etcd_rollingops_worker.log' +PEER_PROCCES_LOGS = '/var/lib/rollingops/peer_rollingops_worker.log' ETCD_CONFIG_FILE = '/var/lib/rollingops/etcd/etcdctl.json' diff --git a/rollingops/tests/unit/conftest.py b/rollingops/tests/unit/conftest.py index bb3c46f61..4ccffbe2b 100644 --- a/rollingops/tests/unit/conftest.py +++ b/rollingops/tests/unit/conftest.py @@ -14,7 +14,6 @@ """Fixtures for unit tests, typically mocking out parts of the external system.""" -import types from collections.abc import Generator from pathlib import Path from typing import Any @@ -31,7 +30,6 @@ Certificate, PrivateKey, ) -from charmlibs.pathops import LocalPath from charmlibs.rollingops import RollingOpsManager from charmlibs.rollingops.common._models import OperationResult from charmlibs.rollingops.etcd._models import SharedCertificate @@ -104,50 +102,28 @@ @pytest.fixture -def temp_certificates(tmp_path: Path, 
monkeypatch: pytest.MonkeyPatch) -> types.ModuleType: - base_dir = LocalPath(str(tmp_path)) / 'tls' - ca_cert = base_dir / 'client-ca.pem' - client_key = base_dir / 'client.key' - client_cert = base_dir / 'client.pem' - - monkeypatch.setattr(certificates, 'BASE_DIR', base_dir) - monkeypatch.setattr(certificates, 'CA_CERT_PATH', ca_cert) - monkeypatch.setattr(certificates, 'CLIENT_KEY_PATH', client_key) - monkeypatch.setattr(certificates, 'CLIENT_CERT_PATH', client_cert) - - base_dir.mkdir(parents=True, exist_ok=True) - return certificates - - -@pytest.fixture -def temp_etcdctl(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> types.ModuleType: - base_dir = LocalPath(str(tmp_path)) / 'etcd' - server_ca = base_dir / 'server-ca.pem' - env_file = base_dir / 'etcdctl.json' - - monkeypatch.setattr(etcdctl, 'BASE_DIR', base_dir) - monkeypatch.setattr(etcdctl, 'SERVER_CA_PATH', server_ca) - monkeypatch.setattr(etcdctl, 'CONFIG_FILE_PATH', env_file) - - base_dir.mkdir(parents=True, exist_ok=True) - return etcdctl +def temp_certificates(tmp_path: Path) -> certificates.CertificateStore: + client = certificates.CertificateStore(str(tmp_path)) + client.base_dir.mkdir(parents=True, exist_ok=True) + return client @pytest.fixture -def etcdctl_patch() -> Generator[MagicMock, None, None]: - with patch('charmlibs.rollingops.etcd._certificates') as mock_etcdctl: - yield mock_etcdctl +def temp_etcdctl(tmp_path: Path) -> etcdctl.Etcdctl: + client = etcdctl.Etcdctl(str(tmp_path)) + client.base_dir.mkdir(parents=True, exist_ok=True) + return client @pytest.fixture def certificates_manager_patches() -> Generator[dict[str, MagicMock], None, None]: with ( patch( - 'charmlibs.rollingops.etcd._certificates._exists', + 'charmlibs.rollingops.etcd._certificates.CertificateStore._exists', return_value=False, ), patch( - 'charmlibs.rollingops.etcd._certificates.generate', + 'charmlibs.rollingops.etcd._certificates.CertificateStore.generate', return_value=SharedCertificate( 
certificate=Certificate.from_string(VALID_CLIENT_CERT_PEM), key=PrivateKey.from_string(VALID_CLIENT_KEY_PEM), @@ -155,13 +131,13 @@ def certificates_manager_patches() -> Generator[dict[str, MagicMock], None, None ), ) as mock_generate, patch( - 'charmlibs.rollingops.etcd._certificates.persist_client_cert_key_and_ca', + 'charmlibs.rollingops.etcd._certificates.CertificateStore.persist_client_cert_key_and_ca', return_value=None, - ) as mock_persit, + ) as mock_persist, ): yield { 'generate': mock_generate, - 'persist': mock_persit, + 'persist': mock_persist, } @@ -282,3 +258,26 @@ def charm_test() -> type[RollingOpsCharm]: @pytest.fixture def ctx(charm_test: type[RollingOpsCharm]) -> Context[RollingOpsCharm]: return Context(charm_test, meta=meta, actions=actions) + + +class StrictPeerRollingOpsCharm(ops.CharmBase): + def __init__(self, framework: ops.Framework): + super().__init__(framework) + + self.restart_manager = RollingOpsManager( + charm=self, + peer_relation_name='restart', + callback_targets={}, + ) + + +@pytest.fixture +def strict_peer_charm_test() -> type[StrictPeerRollingOpsCharm]: + return StrictPeerRollingOpsCharm + + +@pytest.fixture +def strict_peer_ctx( + charm_test: type[StrictPeerRollingOpsCharm], +) -> Context[StrictPeerRollingOpsCharm]: + return Context(charm_test, meta=meta, actions=actions) diff --git a/rollingops/tests/unit/test_etcd_certificates.py b/rollingops/tests/unit/test_etcd_certificates.py index 4984bb6db..e0218baa0 100644 --- a/rollingops/tests/unit/test_etcd_certificates.py +++ b/rollingops/tests/unit/test_etcd_certificates.py @@ -48,7 +48,7 @@ def test_certificates_manager_exists_returns_false_when_no_files( def test_certificates_manager_exists_returns_false_when_cert_does_not_exist( temp_certificates: Any, ) -> None: - temp_certificates.CLIENT_KEY_PATH.write_text('client-key') + temp_certificates.key_path.write_text('client-key') assert temp_certificates._exists() is False @@ -56,7 +56,7 @@ def 
test_certificates_manager_exists_returns_false_when_cert_does_not_exist( def test_certificates_manager_exists_returns_false_when_key_does_not_exist( temp_certificates: Any, ) -> None: - temp_certificates.CLIENT_CERT_PATH.write_text('client-cert') + temp_certificates.cert_path.write_text('client-cert') assert temp_certificates._exists() is False @@ -64,9 +64,9 @@ def test_certificates_manager_exists_returns_false_when_key_does_not_exist( def test_certificates_manager_exists_returns_true_when_all_files_exist( temp_certificates: Any, ) -> None: - temp_certificates.CLIENT_KEY_PATH.write_text('client-key') - temp_certificates.CLIENT_CERT_PATH.write_text('client-cert') - temp_certificates.CA_CERT_PATH.write_text('ca-cert') + temp_certificates.key_path.write_text('client-key') + temp_certificates.cert_path.write_text('client-cert') + temp_certificates.ca_path.write_text('ca-cert') assert temp_certificates._exists() is True @@ -77,9 +77,9 @@ def test_certificates_manager_persist_client_cert_and_key_writes_files( shared_certificate = make_shared_certificate() temp_certificates.persist_client_cert_key_and_ca(shared_certificate) - assert temp_certificates.CLIENT_CERT_PATH.read_text() == shared_certificate.certificate.raw - assert temp_certificates.CLIENT_KEY_PATH.read_text() == shared_certificate.key.raw - assert temp_certificates.CA_CERT_PATH.read_text() == shared_certificate.ca.raw + assert temp_certificates.cert_path.read_text() == shared_certificate.certificate.raw + assert temp_certificates.key_path.read_text() == shared_certificate.key.raw + assert temp_certificates.ca_path.read_text() == shared_certificate.ca.raw def test_certificates_manager_has_client_cert_and_key_returns_false_when_files_missing( @@ -92,9 +92,9 @@ def test_certificates_manager_has_client_cert_and_key_returns_false_when_files_m def test_certificates_manager_has_client_cert_and_key_returns_true_when_material_matches( temp_certificates: Any, ) -> None: - 
temp_certificates.CLIENT_CERT_PATH.write_text(VALID_CLIENT_CERT_PEM) - temp_certificates.CLIENT_KEY_PATH.write_text(VALID_CLIENT_KEY_PEM) - temp_certificates.CA_CERT_PATH.write_text(VALID_CA_CERT_PEM) + temp_certificates.cert_path.write_text(VALID_CLIENT_CERT_PEM) + temp_certificates.key_path.write_text(VALID_CLIENT_KEY_PEM) + temp_certificates.ca_path.write_text(VALID_CA_CERT_PEM) shared_certificate = make_shared_certificate() assert temp_certificates._has_client_cert_key_and_ca(shared_certificate) is True @@ -103,9 +103,9 @@ def test_certificates_manager_has_client_cert_and_key_returns_true_when_material def test_certificates_manager_has_client_cert_and_key_returns_false_when_material_differs( temp_certificates: Any, ) -> None: - temp_certificates.CLIENT_CERT_PATH.write_text(VALID_CLIENT_CERT_PEM) - temp_certificates.CLIENT_KEY_PATH.write_text(VALID_CLIENT_KEY_PEM) - temp_certificates.CA_CERT_PATH.write_text(VALID_CA_CERT_PEM) + temp_certificates.cert_path.write_text(VALID_CLIENT_CERT_PEM) + temp_certificates.key_path.write_text(VALID_CLIENT_KEY_PEM) + temp_certificates.ca_path.write_text(VALID_CA_CERT_PEM) other_shared_certificate = SharedCertificate( certificate=Certificate.from_string(VALID_CA_CERT_PEM), @@ -118,17 +118,17 @@ def test_certificates_manager_has_client_cert_and_key_returns_false_when_materia def test_certificates_manager_generate_does_nothing_when_files_already_exist( temp_certificates: Any, ) -> None: - temp_certificates.CLIENT_CERT_PATH.write_text(VALID_CLIENT_CERT_PEM) - temp_certificates.CLIENT_KEY_PATH.write_text(VALID_CLIENT_KEY_PEM) - temp_certificates.CA_CERT_PATH.write_text(VALID_CA_CERT_PEM) + temp_certificates.cert_path.write_text(VALID_CLIENT_CERT_PEM) + temp_certificates.key_path.write_text(VALID_CLIENT_KEY_PEM) + temp_certificates.ca_path.write_text(VALID_CA_CERT_PEM) old_certificates = make_shared_certificate() new_certificates = temp_certificates.generate(model_uuid='model', app_name='unit-1') written = 
SharedCertificate.from_strings( - certificate=temp_certificates.CLIENT_CERT_PATH.read_text(), - key=temp_certificates.CLIENT_KEY_PATH.read_text(), - ca=temp_certificates.CA_CERT_PATH.read_text(), + certificate=temp_certificates.cert_path.read_text(), + key=temp_certificates.key_path.read_text(), + ca=temp_certificates.ca_path.read_text(), ) assert written == old_certificates @@ -141,12 +141,10 @@ def test_certificates_manager_generate_creates_all_files( shared = temp_certificates.generate(model_uuid='model', app_name='unit-1') assert temp_certificates._exists() is True - assert temp_certificates.CA_CERT_PATH.read_text().startswith('-----BEGIN CERTIFICATE-----') - assert temp_certificates.CLIENT_KEY_PATH.read_text().startswith( - '-----BEGIN RSA PRIVATE KEY-----' - ) - assert temp_certificates.CLIENT_CERT_PATH.read_text().startswith('-----BEGIN CERTIFICATE-----') + assert temp_certificates.ca_path.read_text().startswith('-----BEGIN CERTIFICATE-----') + assert temp_certificates.key_path.read_text().startswith('-----BEGIN RSA PRIVATE KEY-----') + assert temp_certificates.cert_path.read_text().startswith('-----BEGIN CERTIFICATE-----') - assert temp_certificates.CA_CERT_PATH.read_text() == shared.ca.raw - assert temp_certificates.CLIENT_KEY_PATH.read_text() == shared.key.raw - assert temp_certificates.CLIENT_CERT_PATH.read_text() == shared.certificate.raw + assert temp_certificates.ca_path.read_text() == shared.ca.raw + assert temp_certificates.key_path.read_text() == shared.key.raw + assert temp_certificates.cert_path.read_text() == shared.certificate.raw diff --git a/rollingops/tests/unit/test_etcd_etcdctl.py b/rollingops/tests/unit/test_etcd_etcdctl.py index 26497fa1e..df8b85ca5 100644 --- a/rollingops/tests/unit/test_etcd_etcdctl.py +++ b/rollingops/tests/unit/test_etcd_etcdctl.py @@ -31,12 +31,12 @@ def test_etcdctl_write_env(temp_etcdctl: Any) -> None: client_key_path=LocalPath('PATH2'), ) - assert temp_etcdctl.BASE_DIR.exists() + assert 
temp_etcdctl.base_dir.exists() - config = json.loads(temp_etcdctl.CONFIG_FILE_PATH.read_text()) + config = json.loads(temp_etcdctl.config_file_path.read_text()) assert config == { 'endpoints': 'https://10.0.0.1:2379,https://10.0.0.2:2379', - 'cacert_path': str(temp_etcdctl.SERVER_CA_PATH), + 'cacert_path': str(temp_etcdctl.server_ca_path), 'cert_path': 'PATH1', 'key_path': 'PATH2', } @@ -48,33 +48,33 @@ def test_etcdctl_ensure_initialized_raises_when_env_missing(temp_etcdctl: Any) - def test_etcdctl_cleanup_removes_env_file_and_server_ca(temp_etcdctl: Any) -> None: - temp_etcdctl.BASE_DIR.mkdir(parents=True, exist_ok=True) - temp_etcdctl.CONFIG_FILE_PATH.write_text('env') - temp_etcdctl.SERVER_CA_PATH.write_text('ca') + temp_etcdctl.base_dir.mkdir(parents=True, exist_ok=True) + temp_etcdctl.config_file_path.write_text('env') + temp_etcdctl.server_ca_path.write_text('ca') - assert temp_etcdctl.CONFIG_FILE_PATH.exists() - assert temp_etcdctl.SERVER_CA_PATH.exists() + assert temp_etcdctl.config_file_path.exists() + assert temp_etcdctl.server_ca_path.exists() temp_etcdctl.cleanup() - assert not temp_etcdctl.CONFIG_FILE_PATH.exists() - assert not temp_etcdctl.SERVER_CA_PATH.exists() + assert not temp_etcdctl.config_file_path.exists() + assert not temp_etcdctl.server_ca_path.exists() def test_etcdctl_cleanup_is_noop_when_files_do_not_exist(temp_etcdctl: Any) -> None: - assert not temp_etcdctl.CONFIG_FILE_PATH.exists() - assert not temp_etcdctl.SERVER_CA_PATH.exists() + assert not temp_etcdctl.config_file_path.exists() + assert not temp_etcdctl.server_ca_path.exists() temp_etcdctl.cleanup() - assert not temp_etcdctl.CONFIG_FILE_PATH.exists() - assert not temp_etcdctl.SERVER_CA_PATH.exists() + assert not temp_etcdctl.config_file_path.exists() + assert not temp_etcdctl.server_ca_path.exists() def test_etcdctl_load_env_parses_exported_vars(temp_etcdctl: Any) -> None: - temp_etcdctl.BASE_DIR.mkdir(parents=True, exist_ok=True) - temp_etcdctl.SERVER_CA_PATH.write_text('SERVER 
CA') - temp_etcdctl.CONFIG_FILE_PATH.write_text( + temp_etcdctl.base_dir.mkdir(parents=True, exist_ok=True) + temp_etcdctl.server_ca_path.write_text('SERVER CA') + temp_etcdctl.config_file_path.write_text( json.dumps({ 'endpoints': 'https://10.0.0.1:2379', 'cacert_path': '/a-path/server-ca.pem', diff --git a/rollingops/tests/unit/test_etcd_rollingops_in_charm.py b/rollingops/tests/unit/test_etcd_rollingops_in_charm.py index e668412fd..a2a9b625f 100644 --- a/rollingops/tests/unit/test_etcd_rollingops_in_charm.py +++ b/rollingops/tests/unit/test_etcd_rollingops_in_charm.py @@ -51,7 +51,6 @@ def _unit_databag(state: State, peer: PeerRelation) -> RawDataBagContents: def test_leader_elected_creates_shared_secret_and_stores_id( certificates_manager_patches: dict[str, MagicMock], - etcdctl_patch: MagicMock, ctx: Context[RollingOpsCharm], ): peer_relation = PeerRelation(endpoint='restart') @@ -68,7 +67,6 @@ def test_leader_elected_creates_shared_secret_and_stores_id( def test_leader_elected_does_not_regenerate_when_secret_already_exists( certificates_manager_patches: dict[str, MagicMock], - etcdctl_patch: MagicMock, ctx: Context[RollingOpsCharm], ): peer_relation = PeerRelation( @@ -95,7 +93,6 @@ def test_leader_elected_does_not_regenerate_when_secret_already_exists( def test_non_leader_does_not_create_shared_secret( certificates_manager_patches: dict[str, MagicMock], - etcdctl_patch: MagicMock, ctx: Context[RollingOpsCharm], ): peer_relation = PeerRelation(endpoint='restart') @@ -110,7 +107,6 @@ def test_non_leader_does_not_create_shared_secret( def test_relation_changed_syncs_local_certificate_from_secret( certificates_manager_patches: dict[str, MagicMock], - etcdctl_patch: MagicMock, ctx: Context[RollingOpsCharm], ): peer_relation = PeerRelation( @@ -138,7 +134,6 @@ def test_relation_changed_syncs_local_certificate_from_secret( def test_invalid_certificate_secret_content_raises( certificates_manager_patches: dict[str, MagicMock], - etcdctl_patch: MagicMock, ctx: 
Context[RollingOpsCharm], ): peer_relation = PeerRelation( @@ -341,3 +336,111 @@ def test_state_falls_back_to_peer_if_etcd_status_fails(ctx: Context[RollingOpsCh assert rolling_state.status == RollingOpsStatus.WAITING assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 1 + + +def test_is_waiting_returns_true_when_matching_operation_exists(ctx: Context[RollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + interface='rollingops', + local_app_data={}, + local_unit_data={ + 'state': 'request', + 'operations': OperationQueue([ + Operation.create('restart', {'delay': 1}), + Operation.create('restart', {'delay': 2}), + ]).to_string(), + 'executed_at': '', + 'processing_backend': 'peer', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with ctx(ctx.on.update_status(), state) as mgr: + assert mgr.charm.restart_manager.is_waiting('restart', {'delay': 1}) is True + + +def test_is_waiting_returns_false_when_callback_matches_but_kwargs_do_not( + ctx: Context[RollingOpsCharm], +): + peer_rel = PeerRelation( + endpoint='restart', + interface='rollingops', + local_app_data={}, + local_unit_data={ + 'state': 'request', + 'operations': OperationQueue([ + Operation.create('restart', {'delay': 1}), + ]).to_string(), + 'executed_at': '', + 'processing_backend': 'peer', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with ctx(ctx.on.update_status(), state) as mgr: + assert mgr.charm.restart_manager.is_waiting('restart', {'delay': 2}) is False + + +def test_is_waiting_returns_false_when_callback_does_not_match(ctx: Context[RollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + interface='rollingops', + local_app_data={}, + local_unit_data={ + 'state': 'request', + 'operations': OperationQueue([ + Operation.create('restart', {'delay': 1}), + ]).to_string(), + 'executed_at': '', + 'processing_backend': 
'peer', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with ctx(ctx.on.update_status(), state) as mgr: + assert mgr.charm.restart_manager.is_waiting('other-callback', {'delay': 1}) is False + + +def test_is_waiting_returns_true_when_kwargs_is_none_and_matching_operation_has_empty_kwargs( + ctx: Context[RollingOpsCharm], +): + peer_rel = PeerRelation( + endpoint='restart', + interface='rollingops', + local_app_data={}, + local_unit_data={ + 'state': 'request', + 'operations': OperationQueue([ + Operation.create('restart', {}), + ]).to_string(), + 'executed_at': '', + 'processing_backend': 'peer', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with ctx(ctx.on.update_status(), state) as mgr: + assert mgr.charm.restart_manager.is_waiting('restart') is True + + +def test_is_waiting_returns_false_when_operation_validation_fails(ctx: Context[RollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + interface='rollingops', + local_app_data={}, + local_unit_data={ + 'state': 'request', + 'operations': OperationQueue([]).to_string(), + 'executed_at': '', + 'processing_backend': 'peer', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with ctx(ctx.on.update_status(), state) as mgr: + assert mgr.charm.restart_manager.is_waiting('restart', 'a') is False # type: ignore[reportArgumentType] diff --git a/rollingops/tests/unit/test_peer_rollingops_in_charm.py b/rollingops/tests/unit/test_peer_rollingops_in_charm.py index 11389de47..7ad6cae6f 100644 --- a/rollingops/tests/unit/test_peer_rollingops_in_charm.py +++ b/rollingops/tests/unit/test_peer_rollingops_in_charm.py @@ -21,8 +21,9 @@ import pytest from ops.testing import Context, PeerRelation, State from scenario import RawDataBagContents -from tests.unit.conftest import RollingOpsCharm +from tests.unit.conftest import RollingOpsCharm, StrictPeerRollingOpsCharm +from 
charmlibs.rollingops import ProcessingBackend, RollingOpsStatus from charmlibs.rollingops.common._exceptions import RollingOpsInvalidLockRequestError from charmlibs.rollingops.common._models import Operation, OperationQueue from charmlibs.rollingops.common._utils import now_timestamp @@ -354,7 +355,6 @@ def test_lock_retry_drops_when_max_retry_reached( def test_lock_grant_and_release( certificates_manager_patches: dict[str, MagicMock], - etcdctl_patch: MagicMock, ctx: Context[RollingOpsCharm], ): queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) @@ -374,7 +374,6 @@ def test_lock_grant_and_release( def test_scheduling_does_nothing_if_lock_already_granted( certificates_manager_patches: dict[str, MagicMock], - etcdctl_patch: MagicMock, ctx: Context[RollingOpsCharm], ): queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) @@ -399,7 +398,6 @@ def test_scheduling_does_nothing_if_lock_already_granted( def test_schedule_picks_retry_hold( certificates_manager_patches: dict[str, MagicMock], - etcdctl_patch: MagicMock, ctx: Context[RollingOpsCharm], ): old_operation = str(now_timestamp().timestamp()) @@ -437,7 +435,6 @@ def test_schedule_picks_retry_hold( def test_schedule_picks_oldest_requested_at_among_requests( certificates_manager_patches: dict[str, MagicMock], - etcdctl_patch: MagicMock, ctx: Context[RollingOpsCharm], ): old_queue = OperationQueue() @@ -465,7 +462,6 @@ def test_schedule_picks_oldest_requested_at_among_requests( def test_schedule_picks_oldest_executed_at_among_retries_when_no_requests( certificates_manager_patches: dict[str, MagicMock], - etcdctl_patch: MagicMock, ctx: Context[RollingOpsCharm], ): old_operation = str(now_timestamp().timestamp()) @@ -498,7 +494,6 @@ def test_schedule_picks_oldest_executed_at_among_retries_when_no_requests( def test_schedule_prioritizes_requests_over_retries( certificates_manager_patches: dict[str, MagicMock], - etcdctl_patch: MagicMock, ctx: 
Context[RollingOpsCharm], ): queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) @@ -525,7 +520,6 @@ def test_schedule_prioritizes_requests_over_retries( def test_no_unit_is_granted_if_there_are_no_requests( certificates_manager_patches: dict[str, MagicMock], - etcdctl_patch: MagicMock, ctx: Context[RollingOpsCharm], ): peer = PeerRelation( @@ -539,3 +533,40 @@ def test_no_unit_is_granted_if_there_are_no_requests( databag = _app_databag(state_out, peer) assert databag.get('granted_unit', '') == '' assert databag.get('granted_at', '') == '' + + +def test_strict_peer_no_unit_is_granted_if_there_are_no_requests( + certificates_manager_patches: dict[str, MagicMock], + strict_peer_ctx: Context[StrictPeerRollingOpsCharm], +): + peer = PeerRelation( + endpoint='restart', + peers_data={1: {'state': LockIntent.IDLE}, 2: {'state': LockIntent.IDLE}}, + ) + state_in = State(leader=True, relations={peer}) + + state_out = strict_peer_ctx.run(strict_peer_ctx.on.leader_elected(), state_in) + + databag = _app_databag(state_out, peer) + assert databag.get('granted_unit', '') == '' + assert databag.get('granted_at', '') == '' + + +def test_state_peer_idle(strict_peer_ctx: Context[StrictPeerRollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + local_unit_data={ + 'state': '', + 'operations': '', + 'executed_at': '', + 'processing_backend': 'peer', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with strict_peer_ctx(strict_peer_ctx.on.update_status(), state) as mgr: + rolling_state = mgr.charm.restart_manager.state + assert rolling_state.status == RollingOpsStatus.IDLE + assert rolling_state.processing_backend == ProcessingBackend.PEER + assert len(rolling_state.operations) == 0 From c6a6c24ba0fc7dfaa604071310be95324ab9871f Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Thu, 23 Apr 2026 20:57:50 +0200 Subject: [PATCH 08/15] remove print --- 
rollingops/src/charmlibs/rollingops/_rollingops_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py index 66a8d22fc..9db90cf3c 100644 --- a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py +++ b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py @@ -478,7 +478,6 @@ def is_waiting(self, callback_id: str, kwargs: dict[str, Any] | None = None) -> try: check_operation = Operation.create(callback_id=callback_id, kwargs=kwargs) - print(operations) except ValidationError: return False From 8d321dca33f34342e5f0f5281da4235f17c05ed8 Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Thu, 23 Apr 2026 21:26:10 +0200 Subject: [PATCH 09/15] fix docs --- .../charmlibs/rollingops/common/_models.py | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/rollingops/src/charmlibs/rollingops/common/_models.py b/rollingops/src/charmlibs/rollingops/common/_models.py index 7b15a1fd0..59378b840 100644 --- a/rollingops/src/charmlibs/rollingops/common/_models.py +++ b/rollingops/src/charmlibs/rollingops/common/_models.py @@ -99,17 +99,14 @@ class RollingOpsStatus(StrEnum): - UNAVAILABLE: Rolling-ops cannot be used on this unit. This typically occurs when required relations are missing or the selected backend is not reachable. - * peer backend: peer relation does not exist - * etcd backend: peer or etcd relation missing, or etcd not reachable + * peer backend: peer relation does not exist + * etcd backend: peer or etcd relation missing, or etcd not reachable - - WAITING: - The unit has pending operations but does not currently hold the lock. + - WAITING: The unit has pending operations but does not currently hold the lock. - - GRANTED: - The unit currently holds the lock and may execute operations. + - GRANTED: The unit currently holds the lock and may execute operations. 
- - IDLE: - The unit has no pending operations and is not holding the lock. + - IDLE: The unit has no pending operations and is not holding the lock. """ UNAVAILABLE = 'unavailable' @@ -455,11 +452,9 @@ class RollingOpsState: When `status` is UNAVAILABLE, the unit cannot currently participate in rolling operations due to missing relations or backend failures. - Attributes: - status: High-level rolling-ops status for the unit. - processing_backend: Backend currently responsible for executing - operations (e.g. ETCD or PEER). - operations: The unit's operation queue. + status: High-level rolling-ops status for the unit. + processing_backend: Backend currently responsible for executing operations (e.g. ETCD or PEER). + operations: The unit's operation queue. """ status: RollingOpsStatus From 4a01b73227dfd6b607c12eed58ddc00c5f9324da Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Fri, 24 Apr 2026 10:11:37 +0200 Subject: [PATCH 10/15] add is_ready --- .../rollingops/_rollingops_manager.py | 12 +++++++++--- .../src/charmlibs/rollingops/common/_models.py | 8 ++++---- .../src/charmlibs/rollingops/etcd/_backend.py | 4 ++-- .../src/charmlibs/rollingops/peer/_backend.py | 18 +++++++++++++++--- .../tests/integration/test_peer_rolling_ops.py | 2 +- .../unit/test_etcd_rollingops_in_charm.py | 16 ++++++++++++++-- 6 files changed, 45 insertions(+), 15 deletions(-) diff --git a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py index 9db90cf3c..f3144357c 100644 --- a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py +++ b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py @@ -135,6 +135,8 @@ def __init__( self.framework.observe(charm.on.rollingops_etcd_failed, self._on_rollingops_etcd_failed) self.framework.observe(charm.on.update_status, self._on_update_status) + self.is_ready() + @property def _peer_relation(self) -> Relation | None: """Return the peer relation for this charm.""" @@ -441,9 
+443,9 @@ def state(self) -> RollingOpsState: A snapshot of the current rolling-ops status, backend selection, and queued operations for this unit. """ - if self._peer_relation is None: + if not self.peer_backend.is_ready(): return RollingOpsState( - status=RollingOpsStatus.UNAVAILABLE, + status=RollingOpsStatus.NOT_READY, processing_backend=ProcessingBackend.PEER, operations=OperationQueue(), ) @@ -451,7 +453,7 @@ def state(self) -> RollingOpsState: status = self.peer_backend.get_status() if self.etcd_backend is not None and self._backend_state.is_etcd_managed(): status = self.etcd_backend.get_status() - if status == RollingOpsStatus.UNAVAILABLE: + if status == RollingOpsStatus.NOT_READY: logger.info('etcd backend is not available. Falling back to peer backend.') self._fallback_current_unit_to_peer() status = self.peer_backend.get_status() @@ -463,6 +465,10 @@ def state(self) -> RollingOpsState: operations=operations.queue, ) + def is_ready(self) -> bool: + """Return whether the rollingops manager can be used on this unit.""" + return self.state.status != RollingOpsStatus.NOT_READY + def is_waiting(self, callback_id: str, kwargs: dict[str, Any] | None = None) -> bool: """Return whether the current unit has a pending operation matching callback and kwargs.""" if self._peer_relation is None: diff --git a/rollingops/src/charmlibs/rollingops/common/_models.py b/rollingops/src/charmlibs/rollingops/common/_models.py index 59378b840..c8cf3df4b 100644 --- a/rollingops/src/charmlibs/rollingops/common/_models.py +++ b/rollingops/src/charmlibs/rollingops/common/_models.py @@ -96,10 +96,10 @@ class RollingOpsStatus(StrEnum): States: - - UNAVAILABLE: + - NOT_READY: Rolling-ops cannot be used on this unit. This typically occurs when required relations are missing or the selected backend is not reachable. 
- * peer backend: peer relation does not exist + * peer backend: peer relation does not exist or it is waiting for all the units to join * etcd backend: peer or etcd relation missing, or etcd not reachable - WAITING: The unit has pending operations but does not currently hold the lock. @@ -109,7 +109,7 @@ class RollingOpsStatus(StrEnum): - IDLE: The unit has no pending operations and is not holding the lock. """ - UNAVAILABLE = 'unavailable' + NOT_READY = 'not-ready' WAITING = 'waiting' GRANTED = 'granted' IDLE = 'idle' @@ -449,7 +449,7 @@ class RollingOpsState: to peer). The `operations` queue always reflects the peer-backed state, which acts as the source of truth and fallback mechanism. - When `status` is UNAVAILABLE, the unit cannot currently participate + When `status` is NOT_READY, the unit cannot currently participate in rolling operations due to missing relations or backend failures. status: High-level rolling-ops status for the unit. diff --git a/rollingops/src/charmlibs/rollingops/etcd/_backend.py b/rollingops/src/charmlibs/rollingops/etcd/_backend.py index 7299e22e4..c2ddfbb0b 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_backend.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_backend.py @@ -395,7 +395,7 @@ def get_status(self) -> RollingOpsStatus: unit's queued operation state. Returned values: - - UNAVAILABLE: etcd backend is not available + - NOT_READY: etcd backend is not available - GRANTED: the async lock is currently held by this unit - WAITING: this unit has queued work but does not hold the lock - IDLE: this unit has no pending work @@ -404,7 +404,7 @@ def get_status(self) -> RollingOpsStatus: The current rolling-ops status for this unit. 
""" if self._peer_relation is None or self._etcd_relation is None or not self.is_available(): - return RollingOpsStatus.UNAVAILABLE + return RollingOpsStatus.NOT_READY if self._async_lock.is_held(): return RollingOpsStatus.GRANTED diff --git a/rollingops/src/charmlibs/rollingops/peer/_backend.py b/rollingops/src/charmlibs/rollingops/peer/_backend.py index a6fa89825..172b7d760 100644 --- a/rollingops/src/charmlibs/rollingops/peer/_backend.py +++ b/rollingops/src/charmlibs/rollingops/peer/_backend.py @@ -571,6 +571,18 @@ def mirror_outcome(self, outcome: RunWithLockOutcome) -> None: f'Unsupported run-with-lock outcome: {outcome.status}' ) + def is_ready(self) -> bool: + """Return whether the peer backend is ready. + + It is ready if all the units in the relation have started and joined + the relation. + """ + if self._relation is None: + return False + planned_units = self.model.app.planned_units() + units_in_relation = len(self._relation.units) + return planned_units == (units_in_relation + 1) + def get_status(self) -> RollingOpsStatus: """Return the current rolling-ops status for this unit in peer mode. @@ -578,7 +590,7 @@ def get_status(self) -> RollingOpsStatus: and from the shared peer lock state. Returned values: - - UNAVAILABLE: the peer relation does not exist + - NOT_READY: the peer relation does not exist - GRANTED: the current unit holds the peer lock - WAITING: the current unit has queued work but does not hold the lock - IDLE: the current unit has no pending work @@ -586,8 +598,8 @@ def get_status(self) -> RollingOpsStatus: Returns: The current rolling-ops status for this unit. 
""" - if self._relation is None: - return RollingOpsStatus.UNAVAILABLE + if not self.is_ready(): + return RollingOpsStatus.NOT_READY lock = self._lock() operations = self._operations(self.model.unit) diff --git a/rollingops/tests/integration/test_peer_rolling_ops.py b/rollingops/tests/integration/test_peer_rolling_ops.py index dcd527b61..bc032b724 100644 --- a/rollingops/tests/integration/test_peer_rolling_ops.py +++ b/rollingops/tests/integration/test_peer_rolling_ops.py @@ -215,7 +215,7 @@ def test_retry_release_alternates_execution(juju: jubilant.Juju, app_name: str): juju.run(unit_a, 'failed-restart', {'delay': 10, 'max-retry': 2}, wait=TIMEOUT) juju.run(unit_b, 'failed-restart', {'delay': 1, 'max-retry': 2}, wait=TIMEOUT) - time.sleep(60) # wait for operation execution. TODO: in charm use lock state to clear status. + time.sleep(90) # wait for operation execution. TODO: in charm use lock state to clear status. all_events: list[dict[str, str]] = [] all_events.extend(get_unit_events(juju, unit_a)) diff --git a/rollingops/tests/unit/test_etcd_rollingops_in_charm.py b/rollingops/tests/unit/test_etcd_rollingops_in_charm.py index a2a9b625f..e260432af 100644 --- a/rollingops/tests/unit/test_etcd_rollingops_in_charm.py +++ b/rollingops/tests/unit/test_etcd_rollingops_in_charm.py @@ -187,9 +187,10 @@ def test_state_not_initialized(ctx: Context[RollingOpsCharm]): with ctx(ctx.on.start(), state) as mgr: rolling_state = mgr.charm.restart_manager.state - assert rolling_state.status == RollingOpsStatus.UNAVAILABLE + assert rolling_state.status == RollingOpsStatus.NOT_READY assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 0 + assert mgr.charm.restart_manager.is_ready() is False def test_state_peer_idle(ctx: Context[RollingOpsCharm]): @@ -210,6 +211,7 @@ def test_state_peer_idle(ctx: Context[RollingOpsCharm]): assert rolling_state.status == RollingOpsStatus.IDLE assert rolling_state.processing_backend == 
ProcessingBackend.PEER assert len(rolling_state.operations) == 0 + assert mgr.charm.restart_manager.is_ready() is True def test_state_peer_waiting(ctx: Context[RollingOpsCharm]): @@ -232,6 +234,7 @@ def test_state_peer_waiting(ctx: Context[RollingOpsCharm]): assert rolling_state.status == RollingOpsStatus.WAITING assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 1 + assert mgr.charm.restart_manager.is_ready() is True def test_state_peer_is_granted(ctx: Context[RollingOpsCharm]): @@ -257,6 +260,7 @@ def test_state_peer_is_granted(ctx: Context[RollingOpsCharm]): assert rolling_state.status == RollingOpsStatus.GRANTED assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 1 + assert mgr.charm.restart_manager.is_ready() is True def test_state_peer_waiting_retry(ctx: Context[RollingOpsCharm]): @@ -282,6 +286,7 @@ def test_state_peer_waiting_retry(ctx: Context[RollingOpsCharm]): assert rolling_state.status == RollingOpsStatus.WAITING assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 1 + assert mgr.charm.restart_manager.is_ready() is True def test_state_etcd_status(ctx: Context[RollingOpsCharm]): @@ -310,6 +315,7 @@ def test_state_etcd_status(ctx: Context[RollingOpsCharm]): assert rolling_state.status == RollingOpsStatus.GRANTED assert rolling_state.processing_backend == ProcessingBackend.ETCD assert len(rolling_state.operations) == 1 + assert mgr.charm.restart_manager.is_ready() is True def test_state_falls_back_to_peer_if_etcd_status_fails(ctx: Context[RollingOpsCharm]): @@ -329,13 +335,14 @@ def test_state_falls_back_to_peer_if_etcd_status_fails(ctx: Context[RollingOpsCh with patch( 'charmlibs.rollingops._rollingops_manager.EtcdRollingOpsBackend.get_status', - return_value=RollingOpsStatus.UNAVAILABLE, + return_value=RollingOpsStatus.NOT_READY, ): with ctx(ctx.on.update_status(), state) as mgr: rolling_state 
= mgr.charm.restart_manager.state assert rolling_state.status == RollingOpsStatus.WAITING assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 1 + assert mgr.charm.restart_manager.is_ready() is True def test_is_waiting_returns_true_when_matching_operation_exists(ctx: Context[RollingOpsCharm]): @@ -358,6 +365,7 @@ def test_is_waiting_returns_true_when_matching_operation_exists(ctx: Context[Rol with ctx(ctx.on.update_status(), state) as mgr: assert mgr.charm.restart_manager.is_waiting('restart', {'delay': 1}) is True + assert mgr.charm.restart_manager.is_ready() is True def test_is_waiting_returns_false_when_callback_matches_but_kwargs_do_not( @@ -381,6 +389,7 @@ def test_is_waiting_returns_false_when_callback_matches_but_kwargs_do_not( with ctx(ctx.on.update_status(), state) as mgr: assert mgr.charm.restart_manager.is_waiting('restart', {'delay': 2}) is False + assert mgr.charm.restart_manager.is_ready() is True def test_is_waiting_returns_false_when_callback_does_not_match(ctx: Context[RollingOpsCharm]): @@ -402,6 +411,7 @@ def test_is_waiting_returns_false_when_callback_does_not_match(ctx: Context[Roll with ctx(ctx.on.update_status(), state) as mgr: assert mgr.charm.restart_manager.is_waiting('other-callback', {'delay': 1}) is False + assert mgr.charm.restart_manager.is_ready() is True def test_is_waiting_returns_true_when_kwargs_is_none_and_matching_operation_has_empty_kwargs( @@ -425,6 +435,7 @@ def test_is_waiting_returns_true_when_kwargs_is_none_and_matching_operation_has_ with ctx(ctx.on.update_status(), state) as mgr: assert mgr.charm.restart_manager.is_waiting('restart') is True + assert mgr.charm.restart_manager.is_ready() is True def test_is_waiting_returns_false_when_operation_validation_fails(ctx: Context[RollingOpsCharm]): @@ -444,3 +455,4 @@ def test_is_waiting_returns_false_when_operation_validation_fails(ctx: Context[R with ctx(ctx.on.update_status(), state) as mgr: assert 
mgr.charm.restart_manager.is_waiting('restart', 'a') is False # type: ignore[reportArgumentType] + assert mgr.charm.restart_manager.is_ready() is True From 5a90c025c43cfda5ae234a9ab06f5934f1a6bb94 Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Fri, 24 Apr 2026 10:30:33 +0200 Subject: [PATCH 11/15] fix raise during tear down --- .../charmlibs/rollingops/_rollingops_manager.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py index f3144357c..1334fbe48 100644 --- a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py +++ b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py @@ -198,6 +198,9 @@ def _fallback_current_unit_to_peer(self) -> None: It is used when etcd becomes unavailable, unhealthy, or inconsistent, so that queued operations can continue without being lost. """ + if self._peer_relation is None: + logger.info('Peer relation does not exists. Cannot fallback.') + return self._backend_state.fallback_to_peer() if self.etcd_backend is not None: self.etcd_backend.worker.stop() @@ -235,7 +238,7 @@ def request_async_lock( if callback_id not in self.peer_backend.callback_targets: raise RollingOpsInvalidLockRequestError(f'Unknown callback_id: {callback_id}') - if not self._peer_relation: + if self._peer_relation is None: raise RollingOpsNoRelationError('No %s peer relation yet.', self.peer_relation_name) if kwargs is None: @@ -281,6 +284,9 @@ def _on_rollingops_lock_granted(self, event: RollingOpsLockGrantedEvent) -> None If the current unit is etcd-managed, the operation is executed through the etcd backend. """ + if self._peer_relation is None: + logger.error('Peer relation does not exists. 
Cannot run lock granted.') + return if self._backend_state.is_peer_managed(): logger.info('Executing rollingop on peer backend.') self.peer_backend._on_rollingops_lock_granted(event) @@ -331,6 +337,9 @@ def _run_etcd_and_mirror_or_fallback(self) -> None: def _on_rollingops_etcd_failed(self, event: RollingOpsEtcdFailedEvent) -> None: """Fall back to peer when the etcd worker reports a fatal failure.""" logger.warning('Received %s.', ETCD_FAILED_HOOK_NAME) + if self._peer_relation is None: + logger.info('Peer relation does not exists. Cannot fallback.') + return if self._backend_state.is_etcd_managed(): # No need to stop the background process. This hook means that it stopped. self._backend_state.fallback_to_peer() @@ -492,6 +501,9 @@ def is_waiting(self, callback_id: str, kwargs: dict[str, Any] | None = None) -> def _on_update_status(self, event: EventBase) -> None: """Periodic reconciliation of rolling-ops state.""" logger.info('Received a update-status event.') + if self._peer_relation is None: + logger.info('Peer relation does not exists. 
Cannot update status.') + return if self._backend_state.is_etcd_managed(): if self.etcd_backend is None or not self.etcd_backend.is_available(): logger.warning('etcd unavailable during update_status; falling back.') From 7af838fe31631fdd16a03e8a5a964e7baf7adc72 Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Fri, 24 Apr 2026 11:18:24 +0200 Subject: [PATCH 12/15] fix: create parent dir --- rollingops/src/charmlibs/rollingops/common/_base_worker.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rollingops/src/charmlibs/rollingops/common/_base_worker.py b/rollingops/src/charmlibs/rollingops/common/_base_worker.py index 9c6b925f3..276023f58 100644 --- a/rollingops/src/charmlibs/rollingops/common/_base_worker.py +++ b/rollingops/src/charmlibs/rollingops/common/_base_worker.py @@ -215,8 +215,11 @@ def start(self) -> None: worker = self._worker_script_path() env = self._build_env() - log_filename = pathops.LocalPath(self._base_dir) / self._log_filename - with open(log_filename, 'a') as log_out: + base_dir_path = pathops.LocalPath(self._base_dir) + with_pebble_retry(lambda: base_dir_path.mkdir(parents=True, exist_ok=True)) + + log_file = base_dir_path / self._log_filename + with open(log_file, 'a') as log_out: pid = subprocess.Popen( [ '/usr/bin/python3', From 220fd0f9ce4ddbcec9b82de3e3f7ecec6b8fb166 Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Fri, 24 Apr 2026 13:00:51 +0200 Subject: [PATCH 13/15] remove is ready --- .../charmlibs/rollingops/_rollingops_manager.py | 8 +------- .../src/charmlibs/rollingops/common/_models.py | 2 +- .../src/charmlibs/rollingops/peer/_backend.py | 14 +------------- .../tests/unit/test_etcd_rollingops_in_charm.py | 12 ------------ 4 files changed, 3 insertions(+), 33 deletions(-) diff --git a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py index 1334fbe48..193f5225f 100644 --- 
a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py +++ b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py @@ -135,8 +135,6 @@ def __init__( self.framework.observe(charm.on.rollingops_etcd_failed, self._on_rollingops_etcd_failed) self.framework.observe(charm.on.update_status, self._on_update_status) - self.is_ready() - @property def _peer_relation(self) -> Relation | None: """Return the peer relation for this charm.""" @@ -452,7 +450,7 @@ def state(self) -> RollingOpsState: A snapshot of the current rolling-ops status, backend selection, and queued operations for this unit. """ - if not self.peer_backend.is_ready(): + if self._peer_relation is None: return RollingOpsState( status=RollingOpsStatus.NOT_READY, processing_backend=ProcessingBackend.PEER, @@ -474,10 +472,6 @@ def state(self) -> RollingOpsState: operations=operations.queue, ) - def is_ready(self) -> bool: - """Return whether the rollingops manager can be used on this unit.""" - return self.state.status != RollingOpsStatus.NOT_READY - def is_waiting(self, callback_id: str, kwargs: dict[str, Any] | None = None) -> bool: """Return whether the current unit has a pending operation matching callback and kwargs.""" if self._peer_relation is None: diff --git a/rollingops/src/charmlibs/rollingops/common/_models.py b/rollingops/src/charmlibs/rollingops/common/_models.py index c8cf3df4b..cdb760d26 100644 --- a/rollingops/src/charmlibs/rollingops/common/_models.py +++ b/rollingops/src/charmlibs/rollingops/common/_models.py @@ -99,7 +99,7 @@ class RollingOpsStatus(StrEnum): - NOT_READY: Rolling-ops cannot be used on this unit. This typically occurs when required relations are missing or the selected backend is not reachable. 
- * peer backend: peer relation does not exist or it is waiting for all the units to join + * peer backend: peer relation does not exist * etcd backend: peer or etcd relation missing, or etcd not reachable - WAITING: The unit has pending operations but does not currently hold the lock. diff --git a/rollingops/src/charmlibs/rollingops/peer/_backend.py b/rollingops/src/charmlibs/rollingops/peer/_backend.py index 172b7d760..3c5a83b3d 100644 --- a/rollingops/src/charmlibs/rollingops/peer/_backend.py +++ b/rollingops/src/charmlibs/rollingops/peer/_backend.py @@ -571,18 +571,6 @@ def mirror_outcome(self, outcome: RunWithLockOutcome) -> None: f'Unsupported run-with-lock outcome: {outcome.status}' ) - def is_ready(self) -> bool: - """Return whether the peer backend is ready. - - It is ready if all the units in the relation have started and joined - the relation. - """ - if self._relation is None: - return False - planned_units = self.model.app.planned_units() - units_in_relation = len(self._relation.units) - return planned_units == (units_in_relation + 1) - def get_status(self) -> RollingOpsStatus: """Return the current rolling-ops status for this unit in peer mode. @@ -598,7 +586,7 @@ def get_status(self) -> RollingOpsStatus: Returns: The current rolling-ops status for this unit. 
""" - if not self.is_ready(): + if self._relation is None: return RollingOpsStatus.NOT_READY lock = self._lock() diff --git a/rollingops/tests/unit/test_etcd_rollingops_in_charm.py b/rollingops/tests/unit/test_etcd_rollingops_in_charm.py index e260432af..e89d32d54 100644 --- a/rollingops/tests/unit/test_etcd_rollingops_in_charm.py +++ b/rollingops/tests/unit/test_etcd_rollingops_in_charm.py @@ -190,7 +190,6 @@ def test_state_not_initialized(ctx: Context[RollingOpsCharm]): assert rolling_state.status == RollingOpsStatus.NOT_READY assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 0 - assert mgr.charm.restart_manager.is_ready() is False def test_state_peer_idle(ctx: Context[RollingOpsCharm]): @@ -211,7 +210,6 @@ def test_state_peer_idle(ctx: Context[RollingOpsCharm]): assert rolling_state.status == RollingOpsStatus.IDLE assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 0 - assert mgr.charm.restart_manager.is_ready() is True def test_state_peer_waiting(ctx: Context[RollingOpsCharm]): @@ -234,7 +232,6 @@ def test_state_peer_waiting(ctx: Context[RollingOpsCharm]): assert rolling_state.status == RollingOpsStatus.WAITING assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 1 - assert mgr.charm.restart_manager.is_ready() is True def test_state_peer_is_granted(ctx: Context[RollingOpsCharm]): @@ -260,7 +257,6 @@ def test_state_peer_is_granted(ctx: Context[RollingOpsCharm]): assert rolling_state.status == RollingOpsStatus.GRANTED assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 1 - assert mgr.charm.restart_manager.is_ready() is True def test_state_peer_waiting_retry(ctx: Context[RollingOpsCharm]): @@ -286,7 +282,6 @@ def test_state_peer_waiting_retry(ctx: Context[RollingOpsCharm]): assert rolling_state.status == RollingOpsStatus.WAITING assert 
rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 1 - assert mgr.charm.restart_manager.is_ready() is True def test_state_etcd_status(ctx: Context[RollingOpsCharm]): @@ -315,7 +310,6 @@ def test_state_etcd_status(ctx: Context[RollingOpsCharm]): assert rolling_state.status == RollingOpsStatus.GRANTED assert rolling_state.processing_backend == ProcessingBackend.ETCD assert len(rolling_state.operations) == 1 - assert mgr.charm.restart_manager.is_ready() is True def test_state_falls_back_to_peer_if_etcd_status_fails(ctx: Context[RollingOpsCharm]): @@ -342,7 +336,6 @@ def test_state_falls_back_to_peer_if_etcd_status_fails(ctx: Context[RollingOpsCh assert rolling_state.status == RollingOpsStatus.WAITING assert rolling_state.processing_backend == ProcessingBackend.PEER assert len(rolling_state.operations) == 1 - assert mgr.charm.restart_manager.is_ready() is True def test_is_waiting_returns_true_when_matching_operation_exists(ctx: Context[RollingOpsCharm]): @@ -365,7 +358,6 @@ def test_is_waiting_returns_true_when_matching_operation_exists(ctx: Context[Rol with ctx(ctx.on.update_status(), state) as mgr: assert mgr.charm.restart_manager.is_waiting('restart', {'delay': 1}) is True - assert mgr.charm.restart_manager.is_ready() is True def test_is_waiting_returns_false_when_callback_matches_but_kwargs_do_not( @@ -389,7 +381,6 @@ def test_is_waiting_returns_false_when_callback_matches_but_kwargs_do_not( with ctx(ctx.on.update_status(), state) as mgr: assert mgr.charm.restart_manager.is_waiting('restart', {'delay': 2}) is False - assert mgr.charm.restart_manager.is_ready() is True def test_is_waiting_returns_false_when_callback_does_not_match(ctx: Context[RollingOpsCharm]): @@ -411,7 +402,6 @@ def test_is_waiting_returns_false_when_callback_does_not_match(ctx: Context[Roll with ctx(ctx.on.update_status(), state) as mgr: assert mgr.charm.restart_manager.is_waiting('other-callback', {'delay': 1}) is False - assert 
mgr.charm.restart_manager.is_ready() is True def test_is_waiting_returns_true_when_kwargs_is_none_and_matching_operation_has_empty_kwargs( @@ -435,7 +425,6 @@ def test_is_waiting_returns_true_when_kwargs_is_none_and_matching_operation_has_ with ctx(ctx.on.update_status(), state) as mgr: assert mgr.charm.restart_manager.is_waiting('restart') is True - assert mgr.charm.restart_manager.is_ready() is True def test_is_waiting_returns_false_when_operation_validation_fails(ctx: Context[RollingOpsCharm]): @@ -455,4 +444,3 @@ def test_is_waiting_returns_false_when_operation_validation_fails(ctx: Context[R with ctx(ctx.on.update_status(), state) as mgr: assert mgr.charm.restart_manager.is_waiting('restart', 'a') is False # type: ignore[reportArgumentType] - assert mgr.charm.restart_manager.is_ready() is True From 1fe4bfd1ac2147af28b222aa0228670267c36f3d Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Fri, 24 Apr 2026 15:35:51 +0200 Subject: [PATCH 14/15] use pathops instead of str --- .../src/charmlibs/rollingops/_rollingops_manager.py | 6 +++++- .../src/charmlibs/rollingops/common/_base_worker.py | 7 +++---- rollingops/src/charmlibs/rollingops/common/_utils.py | 4 ++-- rollingops/src/charmlibs/rollingops/etcd/_backend.py | 3 ++- .../src/charmlibs/rollingops/etcd/_certificates.py | 4 ++-- rollingops/src/charmlibs/rollingops/etcd/_etcd.py | 11 ++++++----- rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py | 4 ++-- .../src/charmlibs/rollingops/etcd/_relations.py | 7 +++++-- .../src/charmlibs/rollingops/etcd/_rollingops.py | 3 ++- rollingops/src/charmlibs/rollingops/etcd/_worker.py | 7 ++++++- rollingops/src/charmlibs/rollingops/peer/_backend.py | 3 ++- .../src/charmlibs/rollingops/peer/_rollingops.py | 3 ++- rollingops/src/charmlibs/rollingops/peer/_worker.py | 2 +- rollingops/tests/unit/conftest.py | 7 +++++-- 14 files changed, 45 insertions(+), 26 deletions(-) diff --git a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py 
b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py index 193f5225f..02837b099 100644 --- a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py +++ b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py @@ -22,6 +22,7 @@ from ops.framework import EventBase from pydantic import ValidationError +from charmlibs import pathops from charmlibs.rollingops.common._exceptions import ( RollingOpsDecodingError, RollingOpsInvalidLockRequestError, @@ -72,7 +73,7 @@ def __init__( etcd_relation_name: str | None = None, cluster_id: str | None = None, sync_lock_targets: dict[str, type[SyncLockBackend]] | None = None, - base_dir: str = '/var/lib/rollingops', + base_dir: pathops.LocalPath | None = None, ): """Create a rolling operations manager with etcd and peer backends. @@ -103,6 +104,9 @@ def __init__( """ super().__init__(charm, 'rolling-ops-manager') + if base_dir is None: + base_dir = pathops.LocalPath('/var/lib/rollingops') + self.charm = charm self.peer_relation_name = peer_relation_name self.etcd_relation_name = etcd_relation_name diff --git a/rollingops/src/charmlibs/rollingops/common/_base_worker.py b/rollingops/src/charmlibs/rollingops/common/_base_worker.py index 276023f58..ede0b1262 100644 --- a/rollingops/src/charmlibs/rollingops/common/_base_worker.py +++ b/rollingops/src/charmlibs/rollingops/common/_base_worker.py @@ -53,7 +53,7 @@ def __init__( charm: CharmBase, handle_name: str, peer_relation_name: str, - base_dir: str, + base_dir: pathops.LocalPath, ): """Initialize the base rolling-ops worker helper. 
@@ -215,10 +215,9 @@ def start(self) -> None: worker = self._worker_script_path() env = self._build_env() - base_dir_path = pathops.LocalPath(self._base_dir) - with_pebble_retry(lambda: base_dir_path.mkdir(parents=True, exist_ok=True)) + with_pebble_retry(lambda: self._base_dir.mkdir(parents=True, exist_ok=True)) - log_file = base_dir_path / self._log_filename + log_file = self._base_dir / self._log_filename with open(log_file, 'a') as log_out: pid = subprocess.Popen( [ diff --git a/rollingops/src/charmlibs/rollingops/common/_utils.py b/rollingops/src/charmlibs/rollingops/common/_utils.py index ebfb58809..79cb39b5f 100644 --- a/rollingops/src/charmlibs/rollingops/common/_utils.py +++ b/rollingops/src/charmlibs/rollingops/common/_utils.py @@ -62,7 +62,7 @@ def datetime_to_str(dt: datetime) -> str: def setup_logging( - base_dir: str, + base_dir: pathops.LocalPath, log_filename: str, *, unit_name: str, @@ -84,7 +84,7 @@ def setup_logging( cluster_id: Optional etcd cluster identifier. owner: Optional worker owner identifier. """ - log_file = pathops.LocalPath(base_dir) / log_filename + log_file = base_dir / log_filename handler = RotatingFileHandler( log_file, maxBytes=10 * 1024 * 1024, # 10 MB diff --git a/rollingops/src/charmlibs/rollingops/etcd/_backend.py b/rollingops/src/charmlibs/rollingops/etcd/_backend.py index c2ddfbb0b..ff267e8af 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_backend.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_backend.py @@ -23,6 +23,7 @@ RelationDepartedEvent, ) +from charmlibs import pathops from charmlibs.rollingops.common._exceptions import ( RollingOpsInvalidLockRequestError, RollingOpsNoEtcdRelationError, @@ -63,7 +64,7 @@ def __init__( etcd_relation_name: str, cluster_id: str, callback_targets: dict[str, Any], - base_dir: str, + base_dir: pathops.LocalPath, ): """Initialize the etcd-backed rolling-ops backend. 
diff --git a/rollingops/src/charmlibs/rollingops/etcd/_certificates.py b/rollingops/src/charmlibs/rollingops/etcd/_certificates.py index 0ff310242..2619d22b1 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_certificates.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_certificates.py @@ -43,8 +43,8 @@ class CertificateStore: - def __init__(self, base_dir: str): - self.base_dir = pathops.LocalPath(base_dir) / 'tls' + def __init__(self, base_dir: pathops.LocalPath): + self.base_dir = base_dir / 'tls' self.cert_path = self.base_dir / 'client.pem' self.key_path = self.base_dir / 'client.key' self.ca_path = self.base_dir / 'client-ca.pem' diff --git a/rollingops/src/charmlibs/rollingops/etcd/_etcd.py b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py index 8ce7ff124..97dd308b7 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_etcd.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py @@ -19,6 +19,7 @@ import subprocess import time +from charmlibs import pathops from charmlibs.rollingops.common._exceptions import ( RollingOpsEtcdctlFatalError, RollingOpsEtcdctlParseError, @@ -36,7 +37,7 @@ class EtcdLease: """Manage the lifecycle of an etcd lease and its keep-alive process.""" - def __init__(self, base_dir: str): + def __init__(self, base_dir: pathops.LocalPath): self.id: str | None = None self.keepalive_proc: subprocess.Popen[str] | None = None self._pipe_write_fd: int | None = None @@ -155,7 +156,7 @@ class EtcdLock: automatically released if the owner stops refreshing the lease. """ - def __init__(self, lock_key: str, owner: str, base_dir: str): + def __init__(self, lock_key: str, owner: str, base_dir: pathops.LocalPath): self.lock_key = lock_key self.owner = owner self._etcdctl = Etcdctl(base_dir) @@ -221,7 +222,7 @@ class EtcdOperationQueue: the value contains the serialized operation data. 
""" - def __init__(self, prefix: str, lock_key: str, owner: str, base_dir: str): + def __init__(self, prefix: str, lock_key: str, owner: str, base_dir: pathops.LocalPath): self.prefix = prefix self.lock_key = lock_key self.owner = owner @@ -381,7 +382,7 @@ class WorkerOperationStore: - requeue or delete completed operations """ - def __init__(self, keys: RollingOpsKeys, owner: str, base_dir: str): + def __init__(self, keys: RollingOpsKeys, owner: str, base_dir: pathops.LocalPath): self._pending = EtcdOperationQueue(keys.pending, keys.lock_key, owner, base_dir=base_dir) self._inprogress = EtcdOperationQueue( keys.inprogress, keys.lock_key, owner, base_dir=base_dir @@ -482,7 +483,7 @@ class ManagerOperationStore: Queue transitions and storage details remain encapsulated behind this API. """ - def __init__(self, keys: RollingOpsKeys, owner: str, base_dir: str): + def __init__(self, keys: RollingOpsKeys, owner: str, base_dir: pathops.LocalPath): self._pending = EtcdOperationQueue(keys.pending, keys.lock_key, owner, base_dir=base_dir) self._inprogress = EtcdOperationQueue( keys.inprogress, keys.lock_key, owner, base_dir=base_dir diff --git a/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py b/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py index e4ee4b334..375d7575e 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py @@ -54,8 +54,8 @@ class Etcdctl: - def __init__(self, base_dir: str): - self.base_dir = pathops.LocalPath(base_dir) / 'etcd' + def __init__(self, base_dir: pathops.LocalPath): + self.base_dir = base_dir / 'etcd' self.server_ca_path = self.base_dir / 'server-ca.pem' self.config_file_path = self.base_dir / 'etcdctl.json' diff --git a/rollingops/src/charmlibs/rollingops/etcd/_relations.py b/rollingops/src/charmlibs/rollingops/etcd/_relations.py index 191bd51b1..a41edf92c 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_relations.py +++ 
b/rollingops/src/charmlibs/rollingops/etcd/_relations.py @@ -31,6 +31,7 @@ ) from ops.framework import Object +from charmlibs import pathops from charmlibs.interfaces.tls_certificates import Certificate, TLSCertificatesError from charmlibs.rollingops.common._exceptions import RollingOpsInvalidSecretContentError from charmlibs.rollingops.etcd._certificates import CertificateStore @@ -48,7 +49,9 @@ class SharedClientCertificateManager(Object): """Manage the shared rollingops client certificate via peer relation secret.""" - def __init__(self, charm: CharmBase, peer_relation_name: str, base_dir: str) -> None: + def __init__( + self, charm: CharmBase, peer_relation_name: str, base_dir: pathops.LocalPath + ) -> None: super().__init__(charm, 'shared-client-certificate') self.charm = charm self.peer_relation_name = peer_relation_name @@ -194,7 +197,7 @@ def __init__( relation_name: str, cluster_id: str, shared_certificates: SharedClientCertificateManager, - base_dir: str, + base_dir: pathops.LocalPath, ) -> None: super().__init__(charm, f'requirer-{relation_name}') self.charm = charm diff --git a/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py b/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py index 228f7d457..8237e0a05 100644 --- a/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py @@ -16,6 +16,7 @@ import logging import time +from charmlibs import pathops from charmlibs.rollingops.common._models import OperationResult from charmlibs.rollingops.common._utils import ( ETCD_FAILED_HOOK_NAME, @@ -66,7 +67,7 @@ def main(): parser = argparse.ArgumentParser(description='RollingOps etcd worker') parser.add_argument( '--base-dir', - type=str, + type=pathops.LocalPath, required=True, help='Base directory used to store all rollingops files.', ) diff --git a/rollingops/src/charmlibs/rollingops/etcd/_worker.py b/rollingops/src/charmlibs/rollingops/etcd/_worker.py index d5535687e..0ed52848b 100644 --- 
a/rollingops/src/charmlibs/rollingops/etcd/_worker.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_worker.py @@ -40,7 +40,12 @@ class EtcdRollingOpsAsyncWorker(BaseRollingOpsAsyncWorker): _log_filename = ETCD_LOG_FILENAME def __init__( - self, charm: CharmBase, peer_relation_name: str, owner: str, cluster_id: str, base_dir: str + self, + charm: CharmBase, + peer_relation_name: str, + owner: str, + cluster_id: str, + base_dir: pathops.LocalPath, ): super().__init__( charm, diff --git a/rollingops/src/charmlibs/rollingops/peer/_backend.py b/rollingops/src/charmlibs/rollingops/peer/_backend.py index 3c5a83b3d..23f14def9 100644 --- a/rollingops/src/charmlibs/rollingops/peer/_backend.py +++ b/rollingops/src/charmlibs/rollingops/peer/_backend.py @@ -160,6 +160,7 @@ def _on_restart_action(self, event) -> None: ) from ops.framework import EventBase +from charmlibs import pathops from charmlibs.rollingops.common._exceptions import ( RollingOpsDecodingError, RollingOpsInvalidLockRequestError, @@ -201,7 +202,7 @@ def __init__( charm: CharmBase, relation_name: str, callback_targets: dict[str, Callable[..., Any]], - base_dir: str, + base_dir: pathops.LocalPath, ): """Initialize the peer-backed rolling-ops backend. 
diff --git a/rollingops/src/charmlibs/rollingops/peer/_rollingops.py b/rollingops/src/charmlibs/rollingops/peer/_rollingops.py index b8c89b236..970fcf686 100644 --- a/rollingops/src/charmlibs/rollingops/peer/_rollingops.py +++ b/rollingops/src/charmlibs/rollingops/peer/_rollingops.py @@ -17,6 +17,7 @@ import argparse import time +from charmlibs import pathops from charmlibs.rollingops.common._utils import dispatch_lock_granted, setup_logging from charmlibs.rollingops.peer._worker import PEER_LOG_FILENAME @@ -26,7 +27,7 @@ def main(): parser = argparse.ArgumentParser(description='RollingOps peer worker') parser.add_argument( '--base-dir', - type=str, + type=pathops.LocalPath, required=True, help='Base directory used to store all rollingops files.', ) diff --git a/rollingops/src/charmlibs/rollingops/peer/_worker.py b/rollingops/src/charmlibs/rollingops/peer/_worker.py index 410a760f4..430613627 100644 --- a/rollingops/src/charmlibs/rollingops/peer/_worker.py +++ b/rollingops/src/charmlibs/rollingops/peer/_worker.py @@ -42,7 +42,7 @@ class PeerRollingOpsAsyncWorker(BaseRollingOpsAsyncWorker): _pid_field = WORKER_PID_FIELD _log_filename = PEER_LOG_FILENAME - def __init__(self, charm: CharmBase, relation_name: str, base_dir: str): + def __init__(self, charm: CharmBase, relation_name: str, base_dir: pathops.LocalPath): super().__init__( charm, 'peer-rollingops-async-worker', diff --git a/rollingops/tests/unit/conftest.py b/rollingops/tests/unit/conftest.py index 4ccffbe2b..6be4dcfb5 100644 --- a/rollingops/tests/unit/conftest.py +++ b/rollingops/tests/unit/conftest.py @@ -26,6 +26,7 @@ import charmlibs.rollingops.etcd._certificates as certificates import charmlibs.rollingops.etcd._etcdctl as etcdctl +from charmlibs import pathops from charmlibs.interfaces.tls_certificates import ( Certificate, PrivateKey, @@ -103,14 +104,16 @@ @pytest.fixture def temp_certificates(tmp_path: Path) -> certificates.CertificateStore: - client = certificates.CertificateStore(str(tmp_path)) + 
path = pathops.LocalPath(str(tmp_path)) + client = certificates.CertificateStore(path) client.base_dir.mkdir(parents=True, exist_ok=True) return client @pytest.fixture def temp_etcdctl(tmp_path: Path) -> etcdctl.Etcdctl: - client = etcdctl.Etcdctl(str(tmp_path)) + path = pathops.LocalPath(str(tmp_path)) + client = etcdctl.Etcdctl(path) client.base_dir.mkdir(parents=True, exist_ok=True) return client From e1c2926a396ac8488cf8dc0addcd44e3d7d67643 Mon Sep 17 00:00:00 2001 From: Patricia Reinoso Date: Fri, 24 Apr 2026 20:02:14 +0200 Subject: [PATCH 15/15] scale etcd in integration tests --- rollingops/tests/integration/test_etcd_rolling_ops.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/rollingops/tests/integration/test_etcd_rolling_ops.py b/rollingops/tests/integration/test_etcd_rolling_ops.py index 662918c97..9036ab4cf 100644 --- a/rollingops/tests/integration/test_etcd_rolling_ops.py +++ b/rollingops/tests/integration/test_etcd_rolling_ops.py @@ -61,11 +61,7 @@ def test_charm_is_integrated_with_etcd(juju: jubilant.Juju, app_name: str): app='self-signed-certificates', channel='1/stable', ) - juju.deploy( - 'charmed-etcd', - app='etcd', - channel='3.6/stable', - ) + juju.deploy('charmed-etcd', app='etcd', channel='3.6/stable', num_units=3) juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) juju.integrate(