diff --git a/rollingops/pyproject.toml b/rollingops/pyproject.toml index bd096196b..701b0684b 100644 --- a/rollingops/pyproject.toml +++ b/rollingops/pyproject.toml @@ -19,7 +19,9 @@ dependencies = [ "charmlibs-interfaces-tls-certificates>=1.8.1", "charmlibs-pathops>=1.2.1", "dpcharmlibs-interfaces==1.0.0", - "tenacity" + "tenacity", + "pydantic>=2.12.5", + "shortuuid>=1.0.13", ] [dependency-groups] diff --git a/rollingops/src/charmlibs/rollingops/__init__.py b/rollingops/src/charmlibs/rollingops/__init__.py index 453742a80..181969351 100644 --- a/rollingops/src/charmlibs/rollingops/__init__.py +++ b/rollingops/src/charmlibs/rollingops/__init__.py @@ -14,17 +14,45 @@ """The charmlibs.rollingops package.""" -from ._base_manager import RollingOpsManager -from ._models import ( - OperationResult, +from ._rollingops_manager import RollingOpsManager +from ._version import __version__ as __version__ +from .common._exceptions import ( + RollingOpsDecodingError, + RollingOpsEtcdctlError, RollingOpsEtcdNotConfiguredError, + RollingOpsFileSystemError, RollingOpsInvalidLockRequestError, + RollingOpsInvalidSecretContentError, + RollingOpsLibMissingError, + RollingOpsNoRelationError, + RollingOpsSyncLockError, +) +from .common._models import ( + Operation, + OperationQueue, + OperationResult, + ProcessingBackend, + RollingOpsState, + RollingOpsStatus, + SyncLockBackend, ) -from ._version import __version__ as __version__ __all__ = ( + 'Operation', + 'OperationQueue', 'OperationResult', + 'ProcessingBackend', + 'RollingOpsDecodingError', 'RollingOpsEtcdNotConfiguredError', + 'RollingOpsEtcdctlError', + 'RollingOpsFileSystemError', 'RollingOpsInvalidLockRequestError', + 'RollingOpsInvalidSecretContentError', + 'RollingOpsLibMissingError', 'RollingOpsManager', + 'RollingOpsNoRelationError', + 'RollingOpsState', + 'RollingOpsStatus', + 'RollingOpsSyncLockError', + 'SyncLockBackend', ) diff --git a/rollingops/src/charmlibs/rollingops/_base_manager.py 
b/rollingops/src/charmlibs/rollingops/_base_manager.py deleted file mode 100644 index 3ac9d50e1..000000000 --- a/rollingops/src/charmlibs/rollingops/_base_manager.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""etcd rolling ops. Spawns and manages the external rolling-ops worker process.""" - -import logging -from typing import Any - -from ops import CharmBase, Object -from ops.framework import EventBase - -from charmlibs.rollingops._manager import EtcdRollingOpsManager -from charmlibs.rollingops._peer_manager import PeerRollingOpsManager - -logger = logging.getLogger(__name__) - - -class RollingOpsLockGrantedEvent(EventBase): - """Custom event emitted when the background worker grants the lock.""" - - -class RollingOpsManager(Object): - def __init__( - self, - charm: CharmBase, - peer_relation_name: str, - etcd_relation_name: str, - cluster_id: str, - callback_targets: dict[str, Any], - ): - super().__init__(charm, 'rolling-ops-manager') - - self.charm = charm - self.peer_relation_name = peer_relation_name - self.etcd_relation_name = etcd_relation_name - charm.on.define_event('rollingops_lock_granted', RollingOpsLockGrantedEvent) - - self.peer_manager = PeerRollingOpsManager( - charm=charm, - relation_name=peer_relation_name, - callback_targets=callback_targets, - ) - self.etcd_manager = EtcdRollingOpsManager( - charm=charm, - peer_relation_name=peer_relation_name, - 
etcd_relation_name=etcd_relation_name, - cluster_id=cluster_id, - callback_targets=callback_targets, - ) - - self.framework.observe(charm.on.rollingops_lock_granted, self._on_rollingops_lock_granted) - - def _has_relation(self, relation_name: str) -> bool: - return self.model.get_relation(relation_name) is not None - - def _get_active_manager(self) -> Any: - has_etcd = self._has_relation(self.etcd_relation_name) - has_peer = self._has_relation(self.peer_relation_name) - - if has_etcd: - return self.etcd_manager - - if has_peer: - return self.peer_manager - - raise RuntimeError('No active rollingops relation found.') - - def request_async_lock( - self, callback_id: str, kwargs: dict[str, Any] | None = None, max_retry: int | None = None - ) -> None: - manager = self._get_active_manager() - return manager.request_async_lock( - callback_id=callback_id, kwargs=kwargs, max_retry=max_retry - ) - - def _on_rollingops_lock_granted(self, event: RollingOpsLockGrantedEvent) -> None: - """Handler of the custom hook rollingops_lock_granted. - - The custom hook is triggered by a background process. - """ - manager = self._get_active_manager() - manager._on_rollingops_lock_granted(event) diff --git a/rollingops/src/charmlibs/rollingops/_etcd_rollingops.py b/rollingops/src/charmlibs/rollingops/_etcd_rollingops.py deleted file mode 100644 index 816d7659d..000000000 --- a/rollingops/src/charmlibs/rollingops/_etcd_rollingops.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - - -import argparse -import subprocess -import time - - -def main(): - """Juju hook event dispatcher.""" - parser = argparse.ArgumentParser() - parser.add_argument('--run-cmd', required=True) - parser.add_argument('--unit-name', required=True) - parser.add_argument('--charm-dir', required=True) - parser.add_argument('--owner', required=True) - args = parser.parse_args() - - time.sleep(10) - - dispatch_sub_cmd = ( - f'JUJU_DISPATCH_PATH=hooks/rollingops_lock_granted {args.charm_dir}/dispatch' - ) - res = subprocess.run([args.run_cmd, '-u', args.unit_name, dispatch_sub_cmd]) - res.check_returncode() - - -if __name__ == '__main__': - main() diff --git a/rollingops/src/charmlibs/rollingops/_manager.py b/rollingops/src/charmlibs/rollingops/_manager.py deleted file mode 100644 index 6497db79f..000000000 --- a/rollingops/src/charmlibs/rollingops/_manager.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -from typing import Any - -from ops import Relation -from ops.charm import ( - CharmBase, - RelationBrokenEvent, - RelationCreatedEvent, - RelationDepartedEvent, -) -from ops.framework import EventBase, Object - -from charmlibs.rollingops import _etcdctl as etcdctl -from charmlibs.rollingops._models import ( - RollingOpsEtcdNotConfiguredError, - RollingOpsInvalidLockRequestError, - RollingOpsKeys, - RollingOpsNoEtcdRelationError, -) -from charmlibs.rollingops._relations import EtcdRequiresV1, SharedClientCertificateManager -from charmlibs.rollingops._worker import EtcdRollingOpsAsyncWorker - -logger = logging.getLogger(__name__) - - -class EtcdRollingOpsManager(Object): - """Rolling ops manager for clusters.""" - - def __init__( - self, - charm: CharmBase, - peer_relation_name: str, - etcd_relation_name: str, - cluster_id: str, - callback_targets: dict[str, Any], - ): - """Register our custom events. - - params: - charm: the charm we are attaching this to. - peer_relation_name: peer relation used for rolling ops. - etcd_relation_name: the relation to integrate with etcd. - cluster_id: unique identifier for the cluster - callback_targets: mapping from callback_id -> callable. 
- """ - super().__init__(charm, 'etcd-rolling-ops-manager') - self._charm = charm - self.peer_relation_name = peer_relation_name - self.etcd_relation_name = etcd_relation_name - self.callback_targets = callback_targets - self.charm_dir = charm.charm_dir - - owner = f'{self.model.uuid}-{self.model.unit.name}'.replace('/', '-') - self.worker = EtcdRollingOpsAsyncWorker( - charm, peer_relation_name=peer_relation_name, owner=owner - ) - self.keys = RollingOpsKeys.for_owner(cluster_id, owner) - - self.shared_certificates = SharedClientCertificateManager( - charm, - peer_relation_name=peer_relation_name, - ) - - self.etcd = EtcdRequiresV1( - charm, - relation_name=etcd_relation_name, - cluster_id=self.keys.cluster_prefix, - shared_certificates=self.shared_certificates, - ) - - self.framework.observe( - charm.on[self.peer_relation_name].relation_departed, self._on_peer_relation_departed - ) - self.framework.observe( - charm.on[self.etcd_relation_name].relation_broken, self._on_etcd_relation_broken - ) - self.framework.observe( - charm.on[self.etcd_relation_name].relation_created, self._on_etcd_relation_created - ) - - @property - def _peer_relation(self) -> Relation | None: - """Return the peer relation for this charm.""" - return self.model.get_relation(self.peer_relation_name) - - @property - def _etcd_relation(self) -> Relation | None: - """Return the etcd relation for this charm.""" - return self.model.get_relation(self.etcd_relation_name) - - def _on_etcd_relation_created(self, event: RelationCreatedEvent) -> None: - """Check whether the snap-provided etcdctl command is available.""" - if not etcdctl.is_etcdctl_installed(): - logger.error('%s is not installed', etcdctl.ETCDCTL_CMD) - # TODO: fallback to peer relation implementation. - - def _on_rollingops_lock_granted(self, event: EventBase) -> None: - """Handle the event when a rolling operation lock is granted. - - If etcd is not yet configured, the operation is skipped. 
- """ - if not self._peer_relation or not self._etcd_relation: - # TODO: handle this case. Fallback to peer relation. - return - try: - etcdctl.ensure_initialized() - except RollingOpsEtcdNotConfiguredError: - # TODO: handle this case. Fallback to peer relation. - return - logger.info('Received a rolling-op lock granted event.') - self._on_run_with_lock() - - def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None: - """Handle a unit departing from the peer relation. - - If the current unit is the one departing, stop the etcd worker - process to ensure a clean shutdown. - """ - unit = event.departing_unit - if unit == self.model.unit: - self.worker.stop() - - def _on_etcd_relation_broken(self, event: RelationBrokenEvent) -> None: - """Handle the etcd relation being fully removed. - - This method stops the etcd worker process since the required - relation is no longer available. - """ - self.worker.stop() - - def request_async_lock( - self, - callback_id: str, - kwargs: dict[str, Any] | None = None, - max_retry: int | None = None, - ) -> None: - """This is a dummy function. - - Here we spawn a new process that will trigger a Juju hook. - This function will be completely remade in the next PR. - - Args: - callback_id: Identifier of the registered callback to execute when - the lock is granted. - kwargs: Optional keyword arguments passed to the callback when - executed. Must be JSON-serializable. - max_retry: Maximum number of retries for the operation. - - None: retry indefinitely - - 0: do not retry on failure - - Raises: - RollingOpsInvalidLockRequestError: If the callback_id is not registered or - invalid parameters were provided. - RollingOpsNoEtcdRelationError: if the etcd relation does not exist - RollingOpsEtcdNotConfiguredError: if etcd client has not been configured yet - PebbleConnectionError: if the remote container cannot be reached. - RollingOpsCharmLibMissingError: if the charm libs cannot be found. 
- """ - if callback_id not in self.callback_targets: - raise RollingOpsInvalidLockRequestError(f'Unknown callback_id: {callback_id}') - - if not self._etcd_relation: - raise RollingOpsNoEtcdRelationError - - etcdctl.ensure_initialized() - - # TODO: implement actual lock request - - self.worker.start() - - def _on_run_with_lock(self) -> None: - """This is a dummy function. - - Here we try to reach etcd from each unit. - This function will be completely remade in the next PR. - """ - # TODO: implement the actual execution under lock - etcdctl.run('put', self.keys.lock_key, self.keys.owner) - - result = etcdctl.run('get', self.keys.lock_key, '--print-value-only') - - if result is None: - logger.error('Unexpected response from etcd.') - return - - callback = self.callback_targets.get('_restart', '') - callback(delay=1) diff --git a/rollingops/src/charmlibs/rollingops/_peer_models.py b/rollingops/src/charmlibs/rollingops/_peer_models.py deleted file mode 100644 index b81ff4b3a..000000000 --- a/rollingops/src/charmlibs/rollingops/_peer_models.py +++ /dev/null @@ -1,521 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""peer rolling ops models.""" - -import json -import logging -from collections.abc import Iterator -from dataclasses import dataclass, field -from datetime import UTC, datetime -from enum import StrEnum -from typing import Any - -from ops import Model, RelationDataContent, Unit - -logger = logging.getLogger(__name__) - - -def _now_timestamp_str() -> str: - """UTC timestamp as a string using ISO 8601 format.""" - return datetime.now(UTC).isoformat() - - -def _now_timestamp() -> datetime: - """UTC timestamp.""" - return datetime.now(UTC) - - -def _parse_timestamp(timestamp: str) -> datetime | None: - """Parse timestamp string. Return None on errors to avoid selecting invalid timestamps.""" - try: - return datetime.fromisoformat(timestamp) - except Exception: - return None - - -class RollingOpsNoRelationError(Exception): - """Raised if we are trying to process a lock, but do not appear to have a relation yet.""" - - -class RollingOpsDecodingError(Exception): - """Raised if the content of the databag cannot be processed.""" - - -class RollingOpsInvalidLockRequestError(Exception): - """Raised if the lock request is invalid.""" - - -@dataclass -class Operation: - """A single queued operation.""" - - callback_id: str - requested_at: datetime - max_retry: int | None - attempt: int - kwargs: dict[str, Any] = field(default_factory=dict[str, Any]) - - @classmethod - def _validate_fields( - cls, callback_id: Any, kwargs: Any, requested_at: Any, max_retry: Any, attempt: Any - ) -> None: - """Validate the class attributes.""" - if not isinstance(callback_id, str) or not callback_id.strip(): - raise ValueError('callback_id must be a non-empty string') - - if not isinstance(kwargs, dict): - raise ValueError('kwargs must be a dict') - try: - json.dumps(kwargs) - except TypeError as e: - raise ValueError(f'kwargs must be JSON-serializable: {e}') from e - - if not isinstance(requested_at, datetime): - raise ValueError('requested_at must be a datetime') - - if max_retry is not None: 
- if not isinstance(max_retry, int): - raise ValueError('max_retry must be an int') - if max_retry < 0: - raise ValueError('max_retry must be >= 0') - - if not isinstance(attempt, int): - raise ValueError('attempt must be an int') - if attempt < 0: - raise ValueError('attempt must be >= 0') - - def __post_init__(self) -> None: - """Validate the class attributes.""" - self._validate_fields( - self.callback_id, - self.kwargs, - self.requested_at, - self.max_retry, - self.attempt, - ) - - @classmethod - def create( - cls, - callback_id: str, - kwargs: dict[str, Any], - max_retry: int | None = None, - ) -> 'Operation': - """Create a new operation from a callback id and kwargs.""" - return cls( - callback_id=callback_id, - kwargs=kwargs, - requested_at=_now_timestamp(), - max_retry=max_retry, - attempt=0, - ) - - def _to_dict(self) -> dict[str, str]: - """Dict form (string-only values).""" - return { - 'callback_id': self.callback_id, - 'kwargs': self._kwargs_to_json(), - 'requested_at': self.requested_at.isoformat(), - 'max_retry': '' if self.max_retry is None else str(self.max_retry), - 'attempt': str(self.attempt), - } - - def to_string(self) -> str: - """Serialize to a string suitable for a Juju databag.""" - return json.dumps(self._to_dict(), separators=(',', ':')) - - def increase_attempt(self) -> None: - """Increment the attempt counter.""" - self.attempt += 1 - - def is_max_retry_reached(self) -> bool: - """Return True if attempt exceeds max_retry (unless max_retry is None).""" - if self.max_retry is None: - return False - return self.attempt > self.max_retry - - @classmethod - def from_string(cls, data: str) -> 'Operation': - """Deserialize from a Juju databag string. - - Raises: - RollingOpsDecodingError: if data cannot be deserialized. 
- """ - try: - obj = json.loads(data) - - return cls( - callback_id=obj['callback_id'], - requested_at=_parse_timestamp(obj['requested_at']), # type: ignore[reportArgumentType] - max_retry=int(obj['max_retry']) if obj.get('max_retry') else None, - attempt=int(obj['attempt']), - kwargs=json.loads(obj['kwargs']) if obj.get('kwargs') else {}, - ) - - except (json.JSONDecodeError, KeyError, TypeError, ValueError) as e: - logger.error('Failed to deserialize Operation from %s: %s', data, e) - raise RollingOpsDecodingError( - 'Failed to deserialize data to create an Operation' - ) from e - - def _kwargs_to_json(self) -> str: - """Deterministic JSON serialization for kwargs.""" - return json.dumps(self.kwargs, sort_keys=True, separators=(',', ':')) - - def __eq__(self, other: object) -> bool: - """Equal for the operation.""" - if not isinstance(other, Operation): - return False - return self.callback_id == other.callback_id and self.kwargs == other.kwargs - - def __hash__(self) -> int: - """Hash for the operation.""" - return hash((self.callback_id, self._kwargs_to_json())) - - -class OperationQueue: - """In-memory FIFO queue of Operations with encode/decode helpers for storing in a databag.""" - - def __init__(self, operations: list[Operation] | None = None): - self.operations: list[Operation] = list(operations or []) - - def __len__(self) -> int: - """Return the number of operations in the queue.""" - return len(self.operations) - - @property - def empty(self) -> bool: - """Return True if there are no queued operations.""" - return not self.operations - - def peek(self) -> Operation | None: - """Return the first operation in the queue if it exists.""" - return self.operations[0] if self.operations else None - - def _peek_last(self) -> Operation | None: - """Return the last operation in the queue if it exists.""" - return self.operations[-1] if self.operations else None - - def dequeue(self) -> Operation | None: - """Drop the first operation in the queue if it exists and 
return it.""" - return self.operations.pop(0) if self.operations else None - - def increase_attempt(self) -> None: - """Increment the attempt counter for the head operation and persist it.""" - if self.empty: - return - self.operations[0].increase_attempt() - - def enqueue_lock_request( - self, callback_id: str, kwargs: dict[str, Any], max_retry: int | None = None - ) -> None: - """Append operation only if it is not equal to the last enqueued operation.""" - operation = Operation.create(callback_id, kwargs, max_retry=max_retry) - - last_operation = self._peek_last() - if last_operation is not None and last_operation == operation: - return - self.operations.append(operation) - - def to_string(self) -> str: - """Encode entire queue to a single string.""" - items = [op.to_string() for op in self.operations] - return json.dumps(items, separators=(',', ':')) - - @classmethod - def from_string(cls, data: str) -> 'OperationQueue': - """Decode queue from a string. - - Raises: - RollingOpsDecodingError: if data cannot be deserialized. - """ - if not data: - return cls() - - try: - items = json.loads(data) - except json.JSONDecodeError as e: - logger.error( - 'Failed to deserialize data to create an OperationQueue from %s: %s', data, e - ) - raise RollingOpsDecodingError( - 'Failed to deserialize data to create an OperationQueue.' - ) from e - if not isinstance(items, list) or not all(isinstance(s, str) for s in items): # type: ignore[reportUnknownVariableType] - raise RollingOpsDecodingError( - 'OperationQueue string must decode to a JSON list of strings.' 
- ) - - operations = [Operation.from_string(s) for s in items] # type: ignore[reportUnknownVariableType] - return cls(operations) - - -class LockIntent(StrEnum): - """Unit-level lock intents stored in unit databags.""" - - REQUEST = 'request' - RETRY_RELEASE = 'retry-release' - RETRY_HOLD = 'retry-hold' - IDLE = 'idle' - - -class OperationResult(StrEnum): - """Callback return values.""" - - RELEASE = 'release' - RETRY_RELEASE = 'retry-release' - RETRY_HOLD = 'retry-hold' - - -class Lock: - """State machine view over peer relation databags for a single unit. - - This class is the only component that should directly read/write the peer relation - databags for lock state, queue state, and grant state. - - Important: - - All relation databag values are strings. - - This class updates both unit databags and app databags, which triggers - relation-changed events. - """ - - def __init__(self, model: Model, relation_name: str, unit: Unit): - if not model.get_relation(relation_name): - # TODO: defer caller in this case (probably just fired too soon). - raise RollingOpsNoRelationError() - self.relation = model.get_relation(relation_name) - self.unit = unit - self.app = model.app - - @property - def _app_data(self) -> RelationDataContent: - return self.relation.data[self.app] # type: ignore[reportOptionalMemberAccess] - - @property - def _unit_data(self) -> RelationDataContent: - return self.relation.data[self.unit] # type: ignore[reportOptionalMemberAccess] - - @property - def _operations(self) -> OperationQueue: - return OperationQueue.from_string(self._unit_data.get('operations', '')) - - @property - def _state(self) -> str: - return self._unit_data.get('state', '') - - def request( - self, callback_id: str, kwargs: dict[str, Any], max_retry: int | None = None - ) -> None: - """Enqueue an operation and mark this unit as requesting the lock. - - Args: - callback_id: identifies which callback to execute. - kwargs: dict of callback kwargs. 
- max_retry: None -> unlimited retries, else explicit integer. - """ - queue = self._operations - - previous_length = len(queue) - queue.enqueue_lock_request(callback_id, kwargs, max_retry) - if previous_length == len(queue): - logger.info( - 'Operation %s not added to the queue. It already exists in the back of the queue.', - callback_id, - ) - return - - if len(queue) == 1: - self._unit_data.update({'state': LockIntent.REQUEST}) - - self._unit_data.update({'operations': queue.to_string()}) - logger.info('Operation %s added to the queue.', callback_id) - - def _set_retry(self, intent: LockIntent) -> None: - """Mark the given retry intent on the head operation. - - If max_retry is reached, the head operation is dropped via complete(). - """ - self._increase_attempt() - if self._is_max_retry_reached(): - logger.warning('Operation max retry reached. Dropping.') - self.complete() - return - self._unit_data.update({ - 'executed_at': _now_timestamp_str(), - 'state': intent, - }) - - def retry_release(self) -> None: - """Indicate that the operation should be retried but the lock should be released.""" - self._set_retry(LockIntent.RETRY_RELEASE) - - def retry_hold(self) -> None: - """Indicate that the operation should be retried but the lock should be kept.""" - self._set_retry(LockIntent.RETRY_HOLD) - - def complete(self) -> None: - """Mark the head operation as completed successfully, pop it from the queue. - - Update unit state depending on whether more operations remain. 
- """ - queue = self._operations - queue.dequeue() - next_state = LockIntent.REQUEST if queue.peek() else LockIntent.IDLE - - self._unit_data.update({ - 'state': next_state, - 'operations': queue.to_string(), - 'executed_at': _now_timestamp_str(), - }) - - def release(self) -> None: - """Clear the application-level grant.""" - self._app_data.update({'granted_unit': '', 'granted_at': ''}) - - def grant(self) -> None: - """Grant a lock to a unit.""" - self._app_data.update({ - 'granted_unit': str(self.unit.name), - 'granted_at': _now_timestamp_str(), - }) - - def is_granted(self) -> bool: - """Return True if the unit holds the lock.""" - granted_unit = self._app_data.get('granted_unit', '') - return granted_unit == str(self.unit.name) - - def should_run(self) -> bool: - """Return True if the lock has been granted to the unit and it is time to run.""" - return self.is_granted() and not self._unit_executed_after_grant() - - def should_release(self) -> bool: - """Return True if the unit finished executing the callback and should be released.""" - return self.is_completed() or self._unit_executed_after_grant() - - def is_waiting(self) -> bool: - """Return True if this unit is waiting for a lock to be granted.""" - return self._state == LockIntent.REQUEST and not self.is_granted() - - def is_completed(self) -> bool: - """Return True if this unit is completed callback but still has the grant. - - Transitional state in which the unit is waiting for the leader to release the lock. - """ - return self._state == LockIntent.IDLE and self.is_granted() - - def is_retry(self) -> bool: - """Return True if this unit requested retry but still has the grant. - - Transitional state in which the unit is waiting for the leader to release the lock. 
- """ - unit_intent = self._state - return ( - unit_intent == LockIntent.RETRY_RELEASE or unit_intent == LockIntent.RETRY_HOLD - ) and self.is_granted() - - def is_waiting_retry(self) -> bool: - """Return True if the unit requested retry and is waiting for lock to be granted.""" - return self._state == LockIntent.RETRY_RELEASE and not self.is_granted() - - def is_retry_hold(self) -> bool: - """Return True if the unit requested retry and wants to keep the lock.""" - return self._state == LockIntent.RETRY_HOLD and not self.is_granted() - - def get_current_operation(self) -> Operation | None: - """Return the head operation for this unit, if any.""" - return self._operations.peek() - - def _is_max_retry_reached(self) -> bool: - """Return True if the head operation exceeded its max_retry (unless max_retry is None).""" - if not (operation := self.get_current_operation()): - return True - return operation.is_max_retry_reached() - - def _increase_attempt(self) -> None: - """Increment the attempt counter for the head operation and persist it.""" - q = self._operations - q.increase_attempt() - self._unit_data.update({'operations': q.to_string()}) - - def get_last_completed(self) -> datetime | None: - """Get the time the unit requested a retry of the head operation.""" - if timestamp_str := self._unit_data.get('executed_at', ''): - return _parse_timestamp(timestamp_str) - return None - - def get_requested_at(self) -> datetime | None: - """Get the time the head operation was requested at.""" - if not (operation := self.get_current_operation()): - return None - return operation.requested_at - - def _unit_executed_after_grant(self) -> bool: - """Returns True if the unit executed its callback after the lock was granted.""" - granted_at = _parse_timestamp(self._app_data.get('granted_at', '')) - executed_at = _parse_timestamp(self._unit_data.get('executed_at', '')) - - if granted_at is None or executed_at is None: - return False - return executed_at > granted_at - - -def 
pick_oldest_completed(locks: list[Lock]) -> Lock | None: - """Choose the retry lock with the oldest executed_at timestamp.""" - selected = None - oldest_timestamp = None - - for lock in locks: - timestamp = lock.get_last_completed() - if not timestamp: - continue - - if oldest_timestamp is None or timestamp < oldest_timestamp: - oldest_timestamp = timestamp - selected = lock - - return selected - - -def pick_oldest_request(locks: list[Lock]) -> Lock | None: - """Choose the lock with the oldest head operation.""" - selected = None - oldest_request = None - - for lock in locks: - timestamp = lock.get_requested_at() - if not timestamp: - continue - - if oldest_request is None or timestamp < oldest_request: - oldest_request = timestamp - selected = lock - - return selected - - -class LockIterator: - """Iterator over Lock objects for each unit present on the peer relation.""" - - def __init__(self, model: Model, relation_name: str): - relation = model.relations[relation_name][0] - units = relation.units - units.add(model.unit) - self._model = model - self._units = units - self._relation_name = relation_name - - def __iter__(self) -> Iterator[Lock]: - """Yields a lock for each unit we can find on the relation.""" - for unit in self._units: - yield Lock(self._model, self._relation_name, unit=unit) diff --git a/rollingops/src/charmlibs/rollingops/_peer_worker.py b/rollingops/src/charmlibs/rollingops/_peer_worker.py deleted file mode 100644 index 4a7a54860..000000000 --- a/rollingops/src/charmlibs/rollingops/_peer_worker.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""etcd rolling ops. Spawns and manages the external rolling-ops worker process.""" - -import logging -import os -import signal -import subprocess -from pathlib import Path -from sys import version_info - -from ops import Relation, RelationDataContent -from ops.charm import ( - CharmBase, -) -from ops.framework import Object - -logger = logging.getLogger(__name__) - - -class PeerRollingOpsAsyncWorker(Object): - """Spawns and manages the external rolling-ops worker process.""" - - def __init__(self, charm: CharmBase, relation_name: str): - super().__init__(charm, 'peer-rollingops-async-worker') - self._charm = charm - self._peers_name = relation_name - self._run_cmd = '/usr/bin/juju-exec' - self._charm_dir = charm.charm_dir - - @property - def _relation(self) -> Relation | None: - """Returns the peer relation.""" - return self._charm.model.get_relation(self._peers_name) - - @property - def _app_data(self) -> RelationDataContent: - """Returns the application databag in the peer relation.""" - return self._relation.data[self.model.app] # type: ignore[reportOptionalMemberAccess] - - def start(self) -> None: - """Start a new worker process.""" - if self._relation is None: - return - self.stop() - - # Remove JUJU_CONTEXT_ID so juju-run works from the spawned process - new_env = os.environ.copy() - new_env.pop('JUJU_CONTEXT_ID', None) - - for loc in new_env.get('PYTHONPATH', '').split(':'): - path = Path(loc) - venv_path = ( - path - / '..' 
- / 'venv' - / 'lib' - / f'python{version_info.major}.{version_info.minor}' - / 'site-packages' - ) - if path.stem == 'lib': - new_env['PYTHONPATH'] = f'{venv_path.resolve()}:{new_env["PYTHONPATH"]}' - break - - worker = ( - self._charm_dir - / 'venv' - / 'lib' - / f'python{version_info.major}.{version_info.minor}' - / 'site-packages' - / 'charmlibs' - / 'rollingops' - / '_peer_rollingops.py' - ) - - # These files must stay open for the lifetime of the worker process. - log_out = open('/var/log/peer_rollingops_worker.log', 'a') # noqa: SIM115 - log_err = open('/var/log/peer_rollingops_worker.err', 'a') # noqa: SIM115 - - pid = subprocess.Popen( - [ - '/usr/bin/python3', - '-u', - str(worker), - '--run-cmd', - self._run_cmd, - '--unit-name', - self._charm.model.unit.name, - '--charm-dir', - str(self._charm_dir), - ], - cwd=str(self._charm_dir), - stdout=log_out, - stderr=log_err, - env=new_env, - ).pid - - self._app_data.update({'rollingops-worker-pid': str(pid)}) - logger.info('Started RollingOps worker process with PID %s', pid) - - def stop(self) -> None: - """Stop the running worker process if it exists.""" - if self._relation is None: - return - - if not (pid_str := self._app_data.get('rollingops-worker-pid', '')): - return - - pid = int(pid_str) - try: - os.kill(pid, signal.SIGINT) - logger.info('Stopped RollingOps worker process PID %s', pid) - except OSError: - logger.info('Failed to stop RollingOps worker process PID %s', pid) - - self._app_data.update({'rollingops-worker-pid': ''}) diff --git a/rollingops/src/charmlibs/rollingops/_rollingops_manager.py b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py new file mode 100644 index 000000000..dbd34d1a7 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/_rollingops_manager.py @@ -0,0 +1,465 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common rolling-ops interface coordinating etcd-backed and peer-backed execution.""" + +import logging +from contextlib import contextmanager +from typing import Any + +from ops import CharmBase, Object, Relation, RelationBrokenEvent +from ops.framework import EventBase + +from charmlibs.rollingops.common._exceptions import ( + RollingOpsDecodingError, + RollingOpsInvalidLockRequestError, + RollingOpsNoRelationError, + RollingOpsSyncLockError, +) +from charmlibs.rollingops.common._models import ( + Operation, + OperationQueue, + ProcessingBackend, + RollingOpsState, + RollingOpsStatus, + RunWithLockStatus, + SyncLockBackend, + UnitBackendState, +) +from charmlibs.rollingops.common._utils import ETCD_FAILED_HOOK_NAME, LOCK_GRANTED_HOOK_NAME +from charmlibs.rollingops.etcd._backend import EtcdRollingOpsBackend +from charmlibs.rollingops.peer._backend import PeerRollingOpsBackend +from charmlibs.rollingops.peer._models import PeerUnitOperations + +logger = logging.getLogger(__name__) + + +class RollingOpsLockGrantedEvent(EventBase): + """Custom event emitted when the background worker grants the lock.""" + + +class RollingOpsEtcdFailedEvent(EventBase): + """Custom event emitted when the etcd worker hits a fatal error.""" + + +class RollingOpsManager(Object): + """Coordinate rolling operations across etcd and peer backends. + + This object exposes a common API for queuing asynchronous rolling + operations and acquiring synchronous locks. 
It prefers etcd when + available, mirrors operation state into the peer relation, and falls + back to peer-based processing when etcd becomes unavailable or + inconsistent. + """ + + def __init__( + self, + charm: CharmBase, + peer_relation_name: str, + etcd_relation_name: str, + cluster_id: str, + callback_targets: dict[str, Any], + sync_lock_targets: dict[str, type[SyncLockBackend]] | None = None, + ): + """Create a rolling operations manager with etcd and peer backends. + + This manager coordinates rolling operations across two backends: + + - an etcd-backed backend, used when etcd is available + - a peer-relation-backed backend, used as a fallback + + Operations are always persisted in the peer backend. When etcd is + available, operations are also mirrored to etcd and processed there. + If etcd becomes unavailable or unhealthy, this manager falls back to + the peer backend and continues processing from the mirrored state. + + Args: + charm: The charm instance owning this manager. + peer_relation_name: Name of the peer relation used for fallback + state and operation mirroring. + etcd_relation_name: Name of the relation providing etcd access. + cluster_id: Identifier used to scope etcd-backed state for this + rolling-ops instance. + callback_targets: Mapping of callback identifiers to callables + executed when queued operations are granted the lock. + sync_lock_targets: Optional mapping of sync lock backend + identifiers to backend implementations used when acquiring + synchronous locks through the peer fallback path. 
+ """ + super().__init__(charm, 'rolling-ops-manager') + + self.charm = charm + self.peer_relation_name = peer_relation_name + self.etcd_relation_name = etcd_relation_name + self._sync_lock_targets = sync_lock_targets or {} + charm.on.define_event(LOCK_GRANTED_HOOK_NAME, RollingOpsLockGrantedEvent) + charm.on.define_event(ETCD_FAILED_HOOK_NAME, RollingOpsEtcdFailedEvent) + + self.peer_backend = PeerRollingOpsBackend( + charm=charm, + relation_name=peer_relation_name, + callback_targets=callback_targets, + ) + self.etcd_backend = EtcdRollingOpsBackend( + charm=charm, + peer_relation_name=peer_relation_name, + etcd_relation_name=etcd_relation_name, + cluster_id=cluster_id, + callback_targets=callback_targets, + ) + self.framework.observe( + charm.on[self.etcd_relation_name].relation_broken, self._on_etcd_relation_broken + ) + self.framework.observe(charm.on.rollingops_lock_granted, self._on_rollingops_lock_granted) + self.framework.observe(charm.on.rollingops_etcd_failed, self._on_rollingops_etcd_failed) + self.framework.observe(charm.on.update_status, self._on_update_status) + + @property + def _peer_relation(self) -> Relation | None: + """Return the peer relation for this charm.""" + return self.model.get_relation(self.peer_relation_name) + + @property + def _backend_state(self) -> UnitBackendState: + """Return the backend selection state stored for the current unit. + + This state determines whether the current unit is managed by the etcd + backend or the peer backend, and is used to control fallback and + recovery decisions. + """ + return UnitBackendState(self.model, self.peer_relation_name, self.model.unit) + + def _on_etcd_relation_broken(self, event: RelationBrokenEvent) -> None: + """Handle the etcd relation being fully removed. + + This method stops the etcd worker process since the required + relation is no longer available. 
+ """ + self._fallback_current_unit_to_peer() + + def _select_processing_backend(self) -> ProcessingBackend: + """Choose which backend should handle new operations for this unit. + + Etcd is preferred when available, but a unit that has fallen back to + peer remains peer-managed until its pending peer work is drained. + This ensures backend transitions happen only from a clean state. + + Returns: + The selected processing backend. + """ + if not self.etcd_backend.is_available(): + logger.info('etcd backend unavailable; selecting peer backend.') + return ProcessingBackend.PEER + + if self._backend_state.is_peer_managed() and not self.peer_backend.has_pending_work(): + logger.info('etcd backend is available. Switching to etcd backend.') + return ProcessingBackend.ETCD + + if self._backend_state.is_etcd_managed(): + logger.info('etcd backend selected.') + return ProcessingBackend.ETCD + + logger.info('peer backend selected.') + return ProcessingBackend.PEER + + def _fallback_current_unit_to_peer(self) -> None: + """Move the current unit to the peer backend and resume processing there. + + This method marks the unit as peer-managed, stops the etcd worker, + and ensures that peer-based processing is running. + + It is used when etcd becomes unavailable, unhealthy, or inconsistent, + so that queued operations can continue without being lost. + """ + self._backend_state.fallback_to_peer() + self.etcd_backend.worker.stop() + self.peer_backend.ensure_processing() + + def request_async_lock( + self, + callback_id: str, + kwargs: dict[str, Any] | None = None, + max_retry: int | None = None, + ) -> None: + """Queue a rolling operation and trigger processing on the active backend. + + A new operation is created and always persisted in the peer backend. + If etcd is currently selected as the processing backend, the operation + is also mirrored to etcd and processing is triggered there. + + If persisting to etcd fails, the manager falls back to peer-based + processing. 
This guarantees that operations remain schedulable even + when etcd is unavailable. + + Args: + callback_id: Identifier of the callback to execute when the + operation is granted the rolling lock. + kwargs: Optional keyword arguments passed to the callback target. + max_retry: Optional maximum number of retries allowed for the + operation. None means infinte retries. + + Raises: + RollingOpsInvalidLockRequestError: If the callback identifier is + unknown, the operation cannot be created, or it cannot be + persisted in the peer backend. + RollingOpsNoRelationError: If the peer relation is not available. + """ + if callback_id not in self.peer_backend.callback_targets: + raise RollingOpsInvalidLockRequestError(f'Unknown callback_id: {callback_id}') + + if not self._peer_relation: + raise RollingOpsNoRelationError('No %s peer relation yet.', self.peer_relation_name) + + if kwargs is None: + kwargs = {} + + backend = self._select_processing_backend() + + try: + operation = Operation.create(callback_id, kwargs, max_retry) + except (RollingOpsDecodingError, ValueError) as e: + logger.error('Failed to create operation: %s', e) + raise RollingOpsInvalidLockRequestError('Failed to create the lock request') from e + + try: + self.peer_backend.enqueue_operation(operation) + except (RollingOpsDecodingError, ValueError) as e: + logger.error('Failed to persists operation in peer backend: %s', e) + raise RollingOpsInvalidLockRequestError( + 'Failed to persists operation in peer backend.' 
+ ) from e + + if backend == ProcessingBackend.ETCD: + try: + self.etcd_backend.enqueue_operation(operation) + except Exception as e: + logger.warning( + 'Failed to persist operation in etcd backend; falling back to peer: %s', + e, + ) + backend = ProcessingBackend.PEER + + if backend == ProcessingBackend.ETCD: + self.etcd_backend.ensure_processing() + else: + self._fallback_current_unit_to_peer() + + def _on_rollingops_lock_granted(self, event: RollingOpsLockGrantedEvent) -> None: + """Handle a granted rolling lock and dispatch execution to the active backend. + + If the current unit is peer-managed, the operation is executed through + the peer backend. + + If the current unit is etcd-managed, the operation is executed through + the etcd backend. + """ + if self._backend_state.is_peer_managed(): + logger.info('Executing rollingop on peer backend.') + self.peer_backend._on_rollingops_lock_granted(event) + return + self._run_etcd_and_mirror_or_fallback() + + def _run_etcd_and_mirror_or_fallback(self) -> None: + """Run the etcd execution path and mirror its outcome to peer. + + On successful execution, the result is mirrored back + to the peer relation so that peer state remains consistent and can be + used for fallback. + + If etcd execution fails or mirrored state becomes inconsistent, the + manager falls back to the peer backend and resumes processing there. + """ + try: + logger.info('Executing rollingop on etcd backend.') + outcome = self.etcd_backend._on_run_with_lock() + except Exception as e: + logger.warning( + 'etcd backend failed while handling rollingops_lock_granted; ' + 'falling back to peer: %s', + e, + ) + self._fallback_current_unit_to_peer() + return + + try: + self.peer_backend.mirror_outcome(outcome) + except RollingOpsDecodingError: + logger.info( + 'Inconsistencies found between peer relation and etcd. ' + 'Falling back to peer backend.' 
+ ) + self._fallback_current_unit_to_peer() + return + logger.info('Execution mirrored to peer relation.') + if outcome.status == RunWithLockStatus.EXECUTED_NOT_COMMITTED: + self._fallback_current_unit_to_peer() + logger.info('Fell back to peer backend.') + + def _on_rollingops_etcd_failed(self, event: RollingOpsEtcdFailedEvent) -> None: + """Fall back to peer when the etcd worker reports a fatal failure.""" + logger.warning('Received %s.', ETCD_FAILED_HOOK_NAME) + if self._backend_state.is_etcd_managed(): + # No need to stop the background process. This hook means that it stopped. + self._backend_state.fallback_to_peer() + self.peer_backend.ensure_processing() + logger.info('Fell back to peer backend.') + + def _get_sync_lock_backend(self, backend_id: str) -> SyncLockBackend: + """Instantiate the configured peer sync lock backend. + + Args: + backend_id: Identifier of the configured sync lock backend. + + Returns: + A new sync lock backend instance. + + Raises: + RollingOpsSyncLockError: If no backend is registered for + the given identifier. + """ + backend_cls = self._sync_lock_targets.get(backend_id, None) + if backend_cls is None: + raise RollingOpsSyncLockError(f'Unknown sync lock backend: {backend_id}.') + + return backend_cls() + + @contextmanager + def acquire_sync_lock(self, backend_id: str, timeout: int): + """Acquire a synchronous lock, using etcd when available and peer as fallback. + + This context manager first attempts to acquire the lock through the + etcd backend. If etcd is available and the lock is acquired, the + protected block is executed under the etcd lock. + + If etcd fails due to an operational error, the manager falls back to + the configured peer sync lock backend identified by `backend_id`. + If etcd acquisition times out, the timeout is propagated and no + fallback occurs. + + On context exit, the acquired lock is released through the backend + that granted it. 
+ + Args: + backend_id: Identifier of the peer sync lock backend to use if + etcd acquisition cannot be used. + timeout: Maximum time in seconds to wait for lock acquisition. + None means infinite time. + + Yields: + None. The protected code runs while the lock is held. + + Raises: + TimeoutError: If lock acquisition through etcd or the peer backend + times out. + RollingOpsSyncLockError: if there is an error when acquiring the lock. + """ + if self.etcd_backend.is_available(): + logger.info('Acquiring sync lock on etcd.') + try: + self.etcd_backend.acquire_sync_lock(timeout) + yield + return + except TimeoutError: + raise + except Exception as e: + # etcd is not reachable or unhealthy + logger.exception( + 'Failed to request etcd sync lock; falling back to peer: %s', + e, + ) + finally: + try: + self.etcd_backend.release_sync_lock() + logger.info('etcd lock released.') + except Exception as e: + logger.exception('Failed to release sync lock: %s', e) + + backend = self._get_sync_lock_backend(backend_id) + logger.info('Acquiring sync lock backend %s.', backend_id) + try: + backend.acquire(timeout=timeout) + except Exception as e: + raise RollingOpsSyncLockError( + f'Failed to acquire sync lock backend {backend_id}' + ) from e + + try: + yield + finally: + try: + backend.release() + logger.info('Sync lock backend %s released.', backend_id) + except Exception as e: + raise RollingOpsSyncLockError( + f'Failed to release sync lock backend {backend_id}' + ) from e + + @property + def state(self) -> RollingOpsState: + """Return the current rolling-ops state for this unit. + + The returned state is always based on the peer relation for the + operation queue, since peer state is the durable fallback source of + truth. + + Status is taken from the etcd backend when this unit is currently + etcd-managed. If status retrieval from etcd fails, the unit falls + back to the peer backend and peer status is returned instead. 
+ + Returns: + A snapshot of the current rolling-ops status, backend selection, + and queued operations for this unit. + """ + if self._peer_relation is None: + return RollingOpsState( + status=RollingOpsStatus.UNAVAILABLE, + processing_backend=ProcessingBackend.PEER, + operations=OperationQueue(), + ) + + status = self.peer_backend.get_status() + if self._backend_state.is_etcd_managed(): + status = self.etcd_backend.get_status() + if status == RollingOpsStatus.UNAVAILABLE: + logger.info('etcd backend is not available. Falling back to peer backend.') + self._fallback_current_unit_to_peer() + status = self.peer_backend.get_status() + + operations = PeerUnitOperations(self.model, self.peer_relation_name, self.model.unit) + return RollingOpsState( + status=status, + processing_backend=self._backend_state.backend, + operations=operations.queue, + ) + + def _on_update_status(self, event: EventBase) -> None: + """Periodic reconciliation of rolling-ops state.""" + logger.info('Received a update-status event.') + if self._backend_state.is_etcd_managed(): + if not self.etcd_backend.is_available(): + logger.warning('etcd unavailable during update_status; falling back.') + self._fallback_current_unit_to_peer() + return + + if not self.etcd_backend.is_processing(): + logger.warning( + 'etcd backend is selected but no worker process is running; falling back.' + ) + self._fallback_current_unit_to_peer() + return + + self._run_etcd_and_mirror_or_fallback() + return + + self.peer_backend._on_rollingops_lock_granted(event) diff --git a/rollingops/src/charmlibs/rollingops/_worker.py b/rollingops/src/charmlibs/rollingops/_worker.py deleted file mode 100644 index 9b5430280..000000000 --- a/rollingops/src/charmlibs/rollingops/_worker.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""etcd rolling ops. Spawns and manages the external rolling-ops worker process.""" - -import logging -import os -import signal -import subprocess -from sys import version_info - -from ops import Relation -from ops.charm import CharmBase -from ops.framework import Object - -from charmlibs import pathops -from charmlibs.rollingops._models import RollingOpsCharmLibMissingError, with_pebble_retry - -logger = logging.getLogger(__name__) - -WORKER_PID_FIELD = 'etcd-rollingops-worker-pid' - - -class EtcdRollingOpsAsyncWorker(Object): - """Spawns and manages the external rolling-ops worker process.""" - - def __init__(self, charm: CharmBase, peer_relation_name: str, owner: str): - super().__init__(charm, 'etcd-rollingops-async-worker') - self._charm = charm - self._peer_relation_name = peer_relation_name - self._run_cmd = '/usr/bin/juju-exec' - self._owner = owner - self._charm_dir = charm.charm_dir - - @property - def _relation(self) -> Relation | None: - return self.model.get_relation(self._peer_relation_name) - - def start(self) -> None: - """Start a new worker process. - - Raises: - RollingOpsCharmLibMissingError: if the lib files cannot be found. 
- """ - if self._relation is None: - return - - if pid_str := self._relation.data[self.model.unit].get(WORKER_PID_FIELD): - try: - pid = int(pid_str) - except (ValueError, TypeError): - pid = None - - if pid is not None and self._is_pid_alive(pid): - logger.info( - 'RollingOps worker already running with PID %s; not starting a new one.', pid - ) - return - - # Remove JUJU_CONTEXT_ID so juju-run works from the spawned process - new_env = os.environ.copy() - new_env.pop('JUJU_CONTEXT_ID', None) - - venv_path = ( - self._charm_dir - / 'venv' - / 'lib' - / f'python{version_info.major}.{version_info.minor}' - / 'site-packages' - ) - if not with_pebble_retry(lambda: venv_path.exists()): - raise RollingOpsCharmLibMissingError( - f'Expected virtualenv site-packages not found: {venv_path}' - ) - - for loc in new_env.get('PYTHONPATH', '').split(':'): - path = pathops.LocalPath(loc) - - if path.stem != 'lib': - continue - new_env['PYTHONPATH'] = f'{venv_path.resolve()}:{new_env["PYTHONPATH"]}' - break - - worker = venv_path / 'charmlibs' / 'rollingops' / '_etcd_rollingops.py' - if not with_pebble_retry(lambda: worker.exists()): - raise RollingOpsCharmLibMissingError(f'Worker script not found: {worker}') - - # These files must stay open for the lifetime of the worker process. 
- log_out = open('/var/log/etcd_rollingops_worker.log', 'a') # noqa: SIM115 - log_err = open('/var/log/etcd_rollingops_worker.err', 'a') # noqa: SIM115 - - pid = subprocess.Popen( - [ - '/usr/bin/python3', - '-u', - str(worker), - '--run-cmd', - self._run_cmd, - '--unit-name', - self.model.unit.name, - '--charm-dir', - str(self._charm_dir), - '--owner', - self._owner, - ], - cwd=str(self._charm_dir), - stdout=log_out, - stderr=log_err, - env=new_env, - ).pid - - self._relation.data[self.model.unit].update({WORKER_PID_FIELD: str(pid)}) - logger.info('Started etcd rollingops worker process with PID %s', pid) - - def _is_pid_alive(self, pid: int) -> bool: - if pid <= 0: - return False - try: - os.kill(pid, 0) - return True - except ProcessLookupError: - return False - except PermissionError: - return True - - def stop(self) -> None: - """Stop the running worker process if it exists.""" - if self._relation is None: - return - - pid_str = self._relation.data[self.model.unit].get(WORKER_PID_FIELD, '') - - try: - pid = int(pid_str) - except (TypeError, ValueError): - logger.info('Missing PID or invalid PID found in the databag.') - self._relation.data[self.model.unit].update({WORKER_PID_FIELD: ''}) - return - - try: - os.kill(pid, signal.SIGTERM) - logger.info('Sent SIGTERM to etcd rollingops worker process PID %s.', pid) - except ProcessLookupError: - logger.info('Process PID %s is already gone.', pid) - except PermissionError: - logger.warning('No permission to stop etcd rollingops worker process PID %s.', pid) - return - except OSError: - logger.warning('SIGTERM failed for PID %s, attempting SIGKILL', pid) - try: - os.kill(pid, signal.SIGKILL) - logger.info('Sent SIGKILL to etcd rollingops worker process PID %s', pid) - except ProcessLookupError: - logger.info('Process PID %s exited before SIGKILL', pid) - except PermissionError: - logger.warning('No permission to SIGKILL process PID %s', pid) - return - except OSError: - logger.warning('Failed to SIGKILL process PID 
%s', pid) - return - - self._relation.data[self.model.unit].update({WORKER_PID_FIELD: ''}) diff --git a/rollingops/src/charmlibs/rollingops/common/__init__.py b/rollingops/src/charmlibs/rollingops/common/__init__.py new file mode 100644 index 000000000..33bb77934 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/common/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common code used by peer and etcd rolling ops.""" diff --git a/rollingops/src/charmlibs/rollingops/common/_base_worker.py b/rollingops/src/charmlibs/rollingops/common/_base_worker.py new file mode 100644 index 000000000..c444b344d --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/common/_base_worker.py @@ -0,0 +1,279 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Common class to manager background processes.""" + +import logging +import os +import signal +import subprocess +from sys import version_info + +from ops import CharmBase, Object, Relation + +from charmlibs import pathops +from charmlibs.rollingops.common._exceptions import RollingOpsLibMissingError +from charmlibs.rollingops.common._utils import with_pebble_retry + +logger = logging.getLogger(__name__) + + +class BaseRollingOpsAsyncWorker(Object): + """Base class for external rolling-ops worker processes. + + This class provides the common lifecycle management for background + worker processes used by rolling-ops backends. It is responsible for: + + - locating the worker script inside the charm virtualenv + - building the execution environment for the subprocess + - validating required files before startup + - starting and stopping the worker process + - persisting and retrieving the worker PID through backend-specific storage + + Subclasses define where worker state is stored, how existing workers + should be handled, and which worker script and arguments should be used. + """ + + _pid_field: str + _log_filename: str + + def __init__(self, charm: CharmBase, handle_name: str, peer_relation_name: str): + """Initialize the base rolling-ops worker helper. + + Args: + charm: The charm instance managing the worker process. + handle_name: Framework handle name used for this worker object. + peer_relation_name: Name of the peer relation used by subclasses + to store and retrieve worker state. + """ + super().__init__(charm, handle_name) + self._charm = charm + self._charm_dir = charm.charm_dir + self._peer_relation_name = peer_relation_name + self._handle_name = handle_name + + @property + def _relation(self) -> Relation | None: + """Return the peer relation used for worker state.""" + return self._charm.model.get_relation(self._peer_relation_name) + + def _venv_site_packages(self) -> pathops.LocalPath: + """Return the site-packages path for the charm virtualenv. 
+ + This path is used to locate the rolling-ops worker scripts and ensure + the spawned subprocess can import charm library code. + """ + return pathops.LocalPath( + self._charm_dir + / 'venv' + / 'lib' + / f'python{version_info.major}.{version_info.minor}' + / 'site-packages' + ) + + def _build_env(self) -> dict[str, str]: + """Build the environment used to spawn the worker subprocess. + + The worker runs outside the current Juju hook context, so the Juju + context identifier is removed from the environment. The charm virtualenv + site-packages path is also prepended to ``PYTHONPATH`` so that the + worker can import charm libraries correctly. + + Returns: + A copy of the current environment adjusted for the worker process. + """ + new_env = os.environ.copy() + new_env.pop('JUJU_CONTEXT_ID', None) + + venv_path = self._venv_site_packages() + + for loc in new_env.get('PYTHONPATH', '').split(':'): + path = pathops.LocalPath(loc) + + if path.stem != 'lib': + continue + new_env['PYTHONPATH'] = f'{venv_path.resolve()}:{new_env["PYTHONPATH"]}' + break + return new_env + + def _worker_script_path(self) -> pathops.LocalPath: + """Return the worker script path.""" + raise NotImplementedError + + def _worker_args(self) -> list[str]: + """Return additional backend-specific command-line arguments. + + Subclasses may override this to pass extra arguments required by the + worker process. + + Returns: + A list of command-line arguments to append when starting the worker. + """ + return [] + + @property + def _pid(self) -> int | None: + """Return the stored worker PID. + + Returns: + The stored PID, None if no PID is stored. + + Raises: + NotImplementedError: If not implemented by a subclass. + """ + raise NotImplementedError + + @_pid.setter + def _pid(self, value: int | None) -> None: + """Persist the worker PID string. + + Args: + value: The PID string to persist. An empty string clears the stored PID. + + Raises: + NotImplementedError: If not implemented by a subclass. 
+ """ + raise NotImplementedError + + def _on_existing_worker(self, pid: int) -> bool: + """Handle case where a worker is already running. + + Returns: + True if a new worker should be started, + False if start() should return early. + """ + raise NotImplementedError + + def _validate_startup_paths(self) -> None: + """Validate that the worker runtime files exist before startup. + + This checks that the charm virtualenv site-packages directory exists + and that the backend-specific worker script is present. + + Raises: + RollingOpsLibMissingError: If the virtualenv or worker script + cannot be found. + """ + venv_path = self._venv_site_packages() + if not with_pebble_retry(lambda: venv_path.exists()): + raise RollingOpsLibMissingError( + f'Expected virtualenv site-packages not found: {venv_path}' + ) + + worker = self._worker_script_path() + if not with_pebble_retry(lambda: worker.exists()): + raise RollingOpsLibMissingError(f'Worker script not found: {worker}') + + def _is_pid_alive(self, pid: int) -> bool: + """Return whether the given PID appears to be alive.""" + if pid <= 0: + return False + try: + os.kill(pid, 0) + return True + except ProcessLookupError: + return False + except PermissionError: + return True + + def start(self) -> None: + """Start the worker subprocess if one is not already running. + + Raises: + RollingOpsLibMissingError: If the virtualenv or worker script + required to start the worker is missing. + OSError: If the worker subprocess cannot be started. + """ + if self._relation is None: + logger.info('Peer relation does not exist. 
Worker cannot start.') + return + pid = self._pid + if pid is not None and self._is_pid_alive(pid) and not self._on_existing_worker(pid): + return + + self._validate_startup_paths() + + worker = self._worker_script_path() + env = self._build_env() + + with open(f'{self._log_filename}', 'a') as log_out: + pid = subprocess.Popen( + [ + '/usr/bin/python3', + '-u', + str(worker), + '--unit-name', + self.model.unit.name, + '--charm-dir', + str(self._charm_dir), + *self._worker_args(), + ], + cwd=str(self._charm_dir), + stdout=log_out, + stderr=log_out, + env=env, + ).pid + + self._pid = pid + logger.info('Started %s process with PID %s', self._handle_name, pid) + + def stop(self) -> None: + """Stop the running worker subprocess, if one is recorded. + + This method reads the stored PID, sends ``SIGTERM`` to the process, + and falls back to ``SIGKILL`` if termination fails. If the process is + already gone or the stored PID is invalid, worker state is cleaned up. + + The stored PID is cleared when the worker is successfully considered + stopped or no longer present. + """ + if self._relation is None: + logger.info('Peer relation not found. Worker cannot be stopped.') + return + + pid = self._pid + if pid is None or pid <= 0: + logger.info('Invalid PID found. 
Worker cannot be stopped.') + return + + try: + os.kill(pid, signal.SIGTERM) + logger.info('Sent SIGTERM to rollingops worker process PID %s.', pid) + except ProcessLookupError: + logger.info('Process PID %s is already gone.', pid) + except PermissionError: + logger.warning('No permission to stop rollingops worker process PID %s.', pid) + return + except OSError: + logger.warning('SIGTERM failed for PID %s, attempting SIGKILL', pid) + try: + os.kill(pid, signal.SIGKILL) + logger.info('Sent SIGKILL to rollingops worker process PID %s', pid) + except ProcessLookupError: + logger.info('Process PID %s exited before SIGKILL', pid) + except PermissionError: + logger.warning('No permission to SIGKILL process PID %s', pid) + return + except OSError: + logger.warning('Failed to SIGKILL process PID %s', pid) + return + + self._pid = None + + def is_running(self) -> bool: + """Return whether the recorded worker process appears to be alive.""" + pid = self._pid + if pid is None: + return False + return self._is_pid_alive(pid) diff --git a/rollingops/src/charmlibs/rollingops/common/_exceptions.py b/rollingops/src/charmlibs/rollingops/common/_exceptions.py new file mode 100644 index 000000000..9dd97c9d2 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/common/_exceptions.py @@ -0,0 +1,75 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Exceptions used in rollingops.""" + + +class RollingOpsError(Exception): + """General rollingops error.""" + + +class RollingOpsNoRelationError(RollingOpsError): + """Raised if we are trying to process a lock, but do not appear to have a relation yet.""" + + +class RollingOpsNoEtcdRelationError(RollingOpsNoRelationError): + """Raised if we are trying to process a lock, but the etcd relation does not appear to exist yet.""" + + +class RollingOpsFileSystemError(RollingOpsError): + """Raised if there is a problem when interacting with the filesystem.""" + + +class RollingOpsInvalidLockRequestError(RollingOpsError): + """Raised if the lock request is invalid.""" + + +class RollingOpsDecodingError(RollingOpsError): + """Raised if json content cannot be processed.""" + + +class RollingOpsInvalidSecretContentError(RollingOpsError): + """Raised if the content of a secret is invalid.""" + + +class RollingOpsLibMissingError(RollingOpsError): + """Raised if the path to the libraries cannot be resolved.""" + + +class RollingOpsEtcdctlError(RollingOpsError): + """Base exception for etcdctl command failures.""" + + +class RollingOpsEtcdctlRetryableError(RollingOpsEtcdctlError): + """A transient etcdctl failure that may succeed on retry.""" + + +class RollingOpsEtcdNotConfiguredError(RollingOpsEtcdctlError): + """Raised if etcd client has not been configured yet (env file does not exist).""" + + +class RollingOpsEtcdctlFatalError(RollingOpsEtcdctlError): + """A non-retryable etcdctl failure.""" + + +class RollingOpsEtcdctlParseError(RollingOpsEtcdctlError): + """Raised when etcdctl output cannot be parsed.""" + + +class RollingOpsSyncLockError(RollingOpsError): + """Raised when there is an error during sync lock execution.""" + + +class RollingOpsEtcdTransactionError(RollingOpsError): + """Raised when an etcd transaction fails.""" diff --git a/rollingops/src/charmlibs/rollingops/common/_models.py b/rollingops/src/charmlibs/rollingops/common/_models.py new file mode 100644 index 
000000000..7b15a1fd0 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/common/_models.py @@ -0,0 +1,499 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Rolling ops common models.""" + +import json +import logging +from abc import ABC, abstractmethod +from dataclasses import dataclass +from datetime import datetime +from enum import StrEnum +from typing import Any + +from ops import Model, Unit +from pydantic import ( + BaseModel, + ConfigDict, + Field, + RootModel, + field_serializer, + field_validator, +) + +from charmlibs.rollingops.common._exceptions import ( + RollingOpsDecodingError, + RollingOpsNoRelationError, +) +from charmlibs.rollingops.common._utils import datetime_to_str, now_timestamp, parse_timestamp + +logger = logging.getLogger(__name__) + + +class OperationResult(StrEnum): + """Result values returned by rolling-ops callbacks on async locks. + + These values control how the rolling-ops manager updates the operation + state and whether the distributed lock is released or retained. + + - RELEASE: + The operation completed successfully and no retry is required. + The lock is released and the next unit may be scheduled. + + - RETRY_RELEASE: + The operation failed or timed out and should be retried later. + The operation is re-queued and the lock is released so that + other units may proceed before this operation is retried. 
+ + - RETRY_HOLD: + The operation failed or timed out and should be retried immediately. + The operation is re-queued and the lock is kept by the current + unit, allowing it to retry immediately. + """ + + RELEASE = 'release' + RETRY_RELEASE = 'retry-release' + RETRY_HOLD = 'retry-hold' + + +class ProcessingBackend(StrEnum): + """Backend responsible for processing a unit's queue.""" + + PEER = 'peer' + ETCD = 'etcd' + + +class RunWithLockStatus(StrEnum): + """Status of an attempt to execute an operation under a distributed lock. + + These values describe what happened when a unit tried to run an + operation while interacting with the lock. + """ + + NOT_GRANTED = 'not_granted' + NO_OPERATION = 'no_operation' + MISSING_CALLBACK = 'missing_callback' + EXECUTED = 'executed' + EXECUTED_NOT_COMMITTED = 'executed_not_committed' + + +class RollingOpsStatus(StrEnum): + """High-level rolling-ops status for a unit. + + It reflects whether the unit is currently executing work, waiting + for execution, idle, or unable to participate. + + States: + + - UNAVAILABLE: + Rolling-ops cannot be used on this unit. This typically occurs when + required relations are missing or the selected backend is not reachable. + * peer backend: peer relation does not exist + * etcd backend: peer or etcd relation missing, or etcd not reachable + + - WAITING: + The unit has pending operations but does not currently hold the lock. + + - GRANTED: + The unit currently holds the lock and may execute operations. + + - IDLE: + The unit has no pending operations and is not holding the lock. + """ + + UNAVAILABLE = 'unavailable' + WAITING = 'waiting' + GRANTED = 'granted' + IDLE = 'idle' + + +@dataclass(frozen=True) +class RunWithLockOutcome: + """Result of attempting to execute an operation under a distributed lock. + + This object captures both whether an operation was executed and, if so, + the identity and result of that operation. It is used to propagate + execution outcomes across backends (e.g. 
etcd → peer mirroring). + """ + + status: RunWithLockStatus + op_id: str | None = None + result: OperationResult | None = None + + +@dataclass +class BackendState: + """Unit-scoped backend ownership and recovery state.""" + + processing_backend: str = ProcessingBackend.PEER + etcd_cleanup_needed: str = 'false' + + @property + def cleanup_needed(self) -> bool: + """Return whether stale etcd state must be cleaned before reuse.""" + return self.etcd_cleanup_needed == 'true' + + @cleanup_needed.setter + def cleanup_needed(self, value: bool) -> None: + """Persist whether stale etcd state cleanup is required.""" + self.etcd_cleanup_needed = 'true' if value else 'false' + + @property + def backend(self) -> ProcessingBackend: + """Return which backend owns execution for this unit's queue.""" + if not self.processing_backend: + return ProcessingBackend.PEER + return ProcessingBackend(self.processing_backend) + + @backend.setter + def backend(self, value: ProcessingBackend) -> None: + """Persist the backend owner.""" + self.processing_backend = value + + +class UnitBackendState: + """Manage backend ownership and fallback state for one unit queue.""" + + def __init__(self, model: Model, relation_name: str, unit: Unit): + relation = model.get_relation(relation_name) + if relation is None: + raise RollingOpsNoRelationError() + + self._relation = relation + self.unit = unit + + self._backend_state = self._relation.load(BackendState, self.unit, decoder=lambda s: s) + + def _save(self, data: BackendState) -> None: + self._relation.save(data, self.unit, encoder=str) + + @property + def backend(self) -> ProcessingBackend: + """Return which backend owns execution for this unit's queue.""" + return self._backend_state.backend + + @property + def cleanup_needed(self) -> bool: + """Return whether etcd cleanup is required before etcd can be reused.""" + return self._backend_state.cleanup_needed + + def fallback_to_peer(self) -> None: + """Switch this unit's queue to peer processing and 
mark etcd cleanup needed.""" + self._backend_state.backend = ProcessingBackend.PEER + self._backend_state.cleanup_needed = True + self._save(self._backend_state) + + def clear_fallback(self) -> None: + """Clear the etcd cleanup-needed flag and set the backend to ETCD.""" + self._backend_state.backend = ProcessingBackend.ETCD + self._backend_state.cleanup_needed = False + self._save(self._backend_state) + + def is_peer_managed(self) -> bool: + """Return whether the peer backend should process this unit's queue.""" + return self.backend == ProcessingBackend.PEER + + def is_etcd_managed(self) -> bool: + """Return whether the etcd backend should process this unit's queue.""" + return self.backend == ProcessingBackend.ETCD + + +class Operation(BaseModel): + """A single queued operation.""" + + model_config = ConfigDict(use_enum_values=True) + + callback_id: str + requested_at: datetime + max_retry: int | None = None + attempt: int = 0 + result: OperationResult | None = None + kwargs: dict[str, Any] = Field(default_factory=dict) + + @field_validator('callback_id') + @classmethod + def validate_callback_id(cls, value: str) -> str: + if not value.strip(): + raise ValueError('callback_id must be a non-empty string') + return value + + @field_validator('kwargs') + @classmethod + def validate_kwargs(cls, value: dict[str, Any]) -> dict[str, Any]: + try: + json.dumps(value) + except TypeError as e: + raise ValueError(f'kwargs must be JSON-serializable: {e}') from e + return value + + @field_serializer('kwargs') + def serialize_kwargs(self, value: dict[str, Any]) -> dict[str, Any]: + """Ensure deterministic ordering of kwargs.""" + return dict(sorted(value.items())) + + @field_validator('max_retry') + @classmethod + def validate_max_retry(cls, value: int | None) -> int | None: + if value is not None and value < 0: + raise ValueError('max_retry must be >= 0') + return value + + @field_validator('attempt') + @classmethod + def validate_attempt(cls, value: int) -> int: + if value < 
0: + raise ValueError('attempt must be >= 0') + return value + + @field_validator('requested_at', mode='before') + @classmethod + def validate_requested_at(cls, value: Any) -> Any: + if isinstance(value, str): + return parse_timestamp(value) + return value + + @field_serializer('requested_at') + def serialize_requested_at(self, value: datetime) -> str: + return datetime_to_str(value) + + @classmethod + def create( + cls, + callback_id: str, + kwargs: dict[str, Any], + max_retry: int | None = None, + ) -> 'Operation': + """Create a new operation from a callback id and kwargs.""" + return cls( + callback_id=callback_id, + kwargs=kwargs, + requested_at=now_timestamp(), + max_retry=max_retry, + attempt=0, + result=None, + ) + + def to_string(self) -> str: + """Serialize to a single JSON object string.""" + return self.model_dump_json() + + @classmethod + def from_string(cls, data: str) -> 'Operation': + """Deserialize from a JSON string.""" + try: + return cls.model_validate_json(data) + except Exception as e: + logger.error('Failed to deserialize Operation from %s: %s', data, e) + raise RollingOpsDecodingError( + 'Failed to deserialize data to create an Operation' + ) from e + + def increase_attempt(self) -> None: + """Increment the attempt counter.""" + self.attempt += 1 + + def is_max_retry_reached(self) -> bool: + """Return True if attempt exceeds max_retry (unless max_retry is None).""" + if self.max_retry is None: + return False + return self.attempt > self.max_retry + + def complete(self) -> None: + """Mark the operation as completed to indicate the lock should be released.""" + self.increase_attempt() + self.result = OperationResult.RELEASE + + def retry_release(self) -> None: + """Mark the operation to be retried later, releasing the lock. + + If the maximum retry count is reached, the operation is marked as + ``RELEASE`` and will not be retried further. 
+ """ + self.increase_attempt() + if self.is_max_retry_reached(): + logger.warning('Operation max retry reached. Dropping.') + self.result = OperationResult.RELEASE + else: + self.result = OperationResult.RETRY_RELEASE + + def retry_hold(self) -> None: + """Mark the operation to be retried immediately, retaining the lock. + + If the maximum retry count is reached, the operation is marked as + ``RELEASE`` and will not be retried further. + """ + self.increase_attempt() + if self.is_max_retry_reached(): + self.result = OperationResult.RELEASE + logger.warning('Operation max retry reached. Dropping.') + else: + self.result = OperationResult.RETRY_HOLD + + @property + def op_id(self) -> str: + """Return the unique identifier for this operation.""" + return f'{datetime_to_str(self.requested_at)}-{self.callback_id}' + + def _kwargs_to_json(self) -> str: + """Deterministic JSON serialization for kwargs.""" + return json.dumps(self.kwargs, sort_keys=True, separators=(',', ':')) + + def __eq__(self, other: object) -> bool: + """Equal for the operation.""" + if not isinstance(other, Operation): + return False + return self.callback_id == other.callback_id and self.kwargs == other.kwargs + + def __hash__(self) -> int: + """Hash for the operation.""" + return hash((self.callback_id, self._kwargs_to_json())) + + +class OperationQueue(RootModel[list[Operation]]): + """In-memory FIFO queue of Operations with encode/decode helpers for storing in a databag.""" + + def __init__(self, operations: list[Operation] | None = None) -> None: + super().__init__(root=operations or []) # pyright: ignore[reportUnknownMemberType] + + @property + def operations(self) -> list[Operation]: + """Return the underlying list of operations.""" + return self.root + + def __len__(self) -> int: + """Return the number of operations in the queue.""" + return len(self.root) + + @property + def empty(self) -> bool: + """Return True if there are no queued operations.""" + return not self.root + + def peek(self) 
-> Operation | None: + """Return the first operation in the queue if it exists.""" + return self.operations[0] if self.operations else None + + def _peek_last(self) -> Operation | None: + """Return the last operation in the queue if it exists.""" + return self.operations[-1] if self.operations else None + + def dequeue(self) -> Operation | None: + """Drop the first operation in the queue if it exists and return it.""" + return self.operations.pop(0) if self.operations else None + + def increase_attempt(self) -> None: + """Increment the attempt counter for the head operation and persist it.""" + if self.empty: + return + self.operations[0].increase_attempt() + + def enqueue(self, operation: Operation) -> None: + """Append operation only if it is not equal to the tail operation.""" + last_operation = self._peek_last() + if last_operation is not None and last_operation == operation: + return + self.operations.append(operation) + + def to_string(self) -> str: + """Encode entire queue to a single JSON string.""" + return self.model_dump_json() + + @classmethod + def from_string(cls, data: str) -> 'OperationQueue': + """Decode a queue from a JSON string. + + Args: + data: Serialized queue as a JSON array of operation objects. + + Returns: + The decoded operation queue. + + Raises: + RollingOpsDecodingError: If the queue cannot be deserialized. + """ + if not data: + return cls([]) + + try: + return cls.model_validate_json(data) + except Exception as e: + logger.error( + 'Failed to deserialize data to create an OperationQueue from %s: %s', + data, + e, + ) + raise RollingOpsDecodingError( + 'Failed to deserialize data to create an OperationQueue.' + ) from e + + +@dataclass(frozen=True) +class RollingOpsState: + """Snapshot of the rolling-ops state for a unit. + + This object provides a view of the rolling-ops system from the perspective + of a single unit. 
+ + This state is intended for decision-making in charm logic + + The `processing_backend` reflects the backend currently selected + for execution. It may change dynamically (e.g. fallback from etcd + to peer). + The `operations` queue always reflects the peer-backed state, which + acts as the source of truth and fallback mechanism. + When `status` is UNAVAILABLE, the unit cannot currently participate + in rolling operations due to missing relations or backend failures. + + Attributes: + status: High-level rolling-ops status for the unit. + processing_backend: Backend currently responsible for executing + operations (e.g. ETCD or PEER). + operations: The unit's operation queue. + """ + + status: RollingOpsStatus + processing_backend: ProcessingBackend + operations: OperationQueue + + +class SyncLockBackend(ABC): + """Interface for synchronous lock backends. + + Implementations provide a mechanism to acquire and release a lock + protecting a critical section. These backends are used by the + RollingOpsManager to coordinate synchronous operations within a + single unit when etcd is not available. + """ + + @abstractmethod + def acquire(self, timeout: int | None) -> None: + """Acquire the lock, blocking until it is granted or timeout expires. + + Args: + timeout: Maximum time in seconds to wait for the lock. + None means wait indefinitely. + + Raises: + TimeoutError: If the lock could not be acquired within the timeout. + """ + raise NotImplementedError + + @abstractmethod + def release(self) -> None: + """Release the lock. + + Implementations must ensure that only the lock owner can release + the lock and that any associated resources are cleaned up. 
+ """ + raise NotImplementedError diff --git a/rollingops/src/charmlibs/rollingops/common/_utils.py b/rollingops/src/charmlibs/rollingops/common/_utils.py new file mode 100644 index 000000000..dfdddb221 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/common/_utils.py @@ -0,0 +1,156 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Rolling ops common functions.""" + +import logging +import subprocess +from collections.abc import Callable +from datetime import UTC, datetime +from logging.handlers import RotatingFileHandler +from typing import TypeVar + +from ops import pebble +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed + +from charmlibs.pathops import PebbleConnectionError + +logger = logging.getLogger(__name__) +T = TypeVar('T') + +LOCK_GRANTED_HOOK_NAME = 'rollingops_lock_granted' +ETCD_FAILED_HOOK_NAME = 'rollingops_etcd_failed' + + +@retry( + retry=retry_if_exception_type((PebbleConnectionError, pebble.APIError, pebble.ChangeError)), + stop=stop_after_attempt(3), + wait=wait_fixed(10), + reraise=True, +) +def with_pebble_retry[T](func: Callable[[], T]) -> T: + return func() + + +def now_timestamp() -> datetime: + """UTC timestamp.""" + return datetime.now(UTC) + + +def parse_timestamp(timestamp: str) -> datetime | None: + """Parse epoch timestamp string. 
Return None on errors.""" + try: + return datetime.fromtimestamp(float(timestamp), tz=UTC) + except Exception: + return None + + +def datetime_to_str(dt: datetime) -> str: + return str(dt.timestamp()) + + +def setup_logging( + log_file: str, + *, + unit_name: str, + cluster_id: str | None = None, + owner: str | None = None, +) -> None: + """Configure logging with file rotation. + + This sets up the root logger to write INFO-level (and above) logs + to a rotating file handler. Log files are capped at 10 MB each, + with up to 10 backup files retained. + + This function is used in the context of the background process. + + Args: + log_file: Path to the log file where logs should be written. + unit_name: Juju unit name associated with the background process. + cluster_id: Optional etcd cluster identifier. + owner: Optional worker owner identifier. + """ + handler = RotatingFileHandler( + log_file, + maxBytes=10 * 1024 * 1024, # 10 MB + backupCount=10, + ) + + formatter = logging.Formatter( + '%(asctime)s [%(levelname)s] [%(process)d] ' + '[unit=%(unit_name)s cluster=%(cluster_id)s owner=%(owner)s] ' + '%(name)s: %(message)s' + ) + handler.setFormatter(formatter) + + def add_context(record: logging.LogRecord) -> bool: + record.unit_name = unit_name + record.cluster_id = cluster_id or '-' + record.owner = owner or '-' + return True + + handler.addFilter(add_context) + + root = logging.getLogger() + root.setLevel(logging.INFO) + root.handlers.clear() + root.addHandler(handler) + + +def _dispatch_hook(unit_name: str, charm_dir: str, hook_name: str) -> None: + """Execute a Juju hook on a specific unit via juju-exec. + + This function triggers a charm hook by invoking the charm's `dispatch` + script with the appropriate JUJU_DISPATCH_PATH environment variable. + + Args: + unit_name: The Juju unit name (e.g., "app/0") on which to run the hook. + charm_dir: Filesystem path to the charm directory containing the dispatch script. 
+ hook_name: Name of the hook to dispatch (without the "hooks/" prefix). + + Raises: + subprocess.CalledProcessError: If the juju-exec command fails. + """ + run_cmd = '/usr/bin/juju-exec' + dispatch_sub_cmd = f'JUJU_DISPATCH_PATH=hooks/{hook_name} {charm_dir}/dispatch' + res = subprocess.run([run_cmd, '-u', unit_name, dispatch_sub_cmd], check=False) + res.check_returncode() + logger.info('%s hook dispatched.', hook_name) + + +def dispatch_lock_granted(unit_name: str, charm_dir: str) -> None: + """Dispatch the LOCK_GRANTED_HOOK_NAME hook on a unit. + + Args: + unit_name: The Juju unit name (e.g., "app/0"). + charm_dir: Filesystem path to the charm directory. + + Raises: + subprocess.CalledProcessError: If the hook execution fails. + """ + _dispatch_hook(unit_name, charm_dir, LOCK_GRANTED_HOOK_NAME) + + +def dispatch_etcd_failed(unit_name: str, charm_dir: str) -> None: + """Dispatch the fatal etcd-worker failure hook. + + This notifies the charm that the etcd worker encountered an + unrecoverable error so that higher-level logic can fall back to the + peer backend. + + Args: + unit_name: Name of the unit dispatching the hook. + charm_dir: Path to the charm root directory. + """ + _dispatch_hook(unit_name, charm_dir, ETCD_FAILED_HOOK_NAME) diff --git a/rollingops/src/charmlibs/rollingops/etcd/__init__.py b/rollingops/src/charmlibs/rollingops/etcd/__init__.py new file mode 100644 index 000000000..064b097a3 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/etcd/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Rollingops for charms using etcd.""" diff --git a/rollingops/src/charmlibs/rollingops/etcd/_backend.py b/rollingops/src/charmlibs/rollingops/etcd/_backend.py new file mode 100644 index 000000000..ebc616d97 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/etcd/_backend.py @@ -0,0 +1,400 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import time +from typing import Any + +from ops import Object, Relation +from ops.charm import ( + CharmBase, + RelationCreatedEvent, + RelationDepartedEvent, +) + +from charmlibs.rollingops.common._exceptions import ( + RollingOpsInvalidLockRequestError, + RollingOpsNoEtcdRelationError, + RollingOpsSyncLockError, +) +from charmlibs.rollingops.common._models import ( + Operation, + OperationResult, + RollingOpsStatus, + RunWithLockOutcome, + RunWithLockStatus, + UnitBackendState, +) +from charmlibs.rollingops.etcd import _etcdctl as etcdctl +from charmlibs.rollingops.etcd._etcd import EtcdLease, EtcdLock, ManagerOperationStore +from charmlibs.rollingops.etcd._models import RollingOpsKeys +from charmlibs.rollingops.etcd._relations import EtcdRequiresV1, SharedClientCertificateManager +from charmlibs.rollingops.etcd._worker import EtcdRollingOpsAsyncWorker + +logger = logging.getLogger(__name__) + + +class EtcdRollingOpsBackend(Object): + """Manage rolling operations using 
etcd-backed coordination. + + This backend stores operation state in etcd, coordinates asynchronous + execution through an etcd-backed distributed lock, and exposes a + synchronous lock interface for critical sections. + + Each unit manages its own etcd worker process and operation queues. + Operations are scoped using a cluster identifier and a per-unit owner. + """ + + def __init__( + self, + charm: CharmBase, + peer_relation_name: str, + etcd_relation_name: str, + cluster_id: str, + callback_targets: dict[str, Any], + ): + """Initialize the etcd-backed rolling-ops backend. + + Args: + charm: The charm instance owning this backend. + peer_relation_name: Name of the peer relation used for shared + state and worker coordination. + etcd_relation_name: Name of the relation providing etcd access. + cluster_id: Identifier used to scope etcd keys for this rolling-ops + instance. + callback_targets: Mapping from callback identifiers to callables + executed when an operation is granted the asynchronous lock. 
+ """ + super().__init__(charm, 'etcd-rolling-ops-manager') + self._charm = charm + self.peer_relation_name = peer_relation_name + self.etcd_relation_name = etcd_relation_name + self.callback_targets = callback_targets + + owner = f'{self.model.uuid}-{self.model.unit.name}'.replace('/', '-') + self.worker = EtcdRollingOpsAsyncWorker( + charm, peer_relation_name=peer_relation_name, owner=owner, cluster_id=cluster_id + ) + self.keys = RollingOpsKeys.for_owner(cluster_id=cluster_id, owner=owner) + + self.shared_certificates = SharedClientCertificateManager( + charm, + peer_relation_name=peer_relation_name, + ) + + self.etcd = EtcdRequiresV1( + charm, + relation_name=etcd_relation_name, + cluster_id=self.keys.cluster_prefix, + shared_certificates=self.shared_certificates, + ) + self._async_lock = EtcdLock(lock_key=self.keys.lock_key, owner=owner) + self._sync_lock = EtcdLock(lock_key=self.keys.lock_key, owner=f'{owner}:sync') + self._lease: EtcdLease | None = None + self.operations_store = ManagerOperationStore(self.keys, owner) + + self.framework.observe( + charm.on[self.peer_relation_name].relation_departed, self._on_peer_relation_departed + ) + self.framework.observe( + charm.on[self.etcd_relation_name].relation_created, self._on_etcd_relation_created + ) + + @property + def _peer_relation(self) -> Relation | None: + """Return the peer relation for this backend.""" + return self.model.get_relation(self.peer_relation_name) + + @property + def _etcd_relation(self) -> Relation | None: + """Return the etcd relation for this backend.""" + return self.model.get_relation(self.etcd_relation_name) + + def is_available(self) -> bool: + """Return whether the etcd backend is currently usable. + + The backend is considered available only if the etcd relation exists + and the etcd client has been initialized successfully. + + Returns: + True if etcd can currently be used, otherwise False. 
+ """ + if self._etcd_relation is None: + return False + try: + etcdctl.ensure_initialized() + except Exception: + return False + return True + + def enqueue_operation(self, operation: Operation) -> None: + """Persist an operation in etcd for this unit. + + Before storing the operation, this method clears any pending fallback + state for the current unit. If the unit had previously fallen back + from etcd to peer processing and cleanup is still required, stale etcd + operation state is removed first so processing can resume from a clean + slate. + + Args: + operation: The operation to enqueue. + + Raises: + RollingOpsNoEtcdRelationError: If the etcd relation does not exist. + RollingOpsEtcdNotConfiguredError: If the etcd client has not been + configured yet. + PebbleConnectionError: If the remote container cannot be reached. + """ + if self._etcd_relation is None: + raise RollingOpsNoEtcdRelationError + + etcdctl.ensure_initialized() + + backend_state = UnitBackendState(self.model, self.peer_relation_name, self.model.unit) + if backend_state.cleanup_needed: + self.operations_store.clean_up() + backend_state.clear_fallback() + + self.operations_store.request(operation) + + def ensure_processing(self): + """Ensure that the etcd worker process is running. + + The worker is responsible for acquiring the asynchronous lock and + processing queued operations for this unit. + """ + self.worker.start() + + def is_processing(self) -> bool: + """Return whether the etcd worker process is currently running.""" + return self.worker.is_running() + + def _on_etcd_relation_created(self, event: RelationCreatedEvent) -> None: + """Validate that the etcdctl command is available when etcd is related. + + Args: + event: The relation-created event for the etcd relation. 
+ """ + if not etcdctl.is_etcdctl_installed(): + logger.error('%s is not installed.', etcdctl.ETCDCTL_CMD) + + def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None: + """Handle removal of a unit from the peer relation. + + If the current unit is departing, the etcd worker process is stopped + to ensure a clean shutdown and avoid leaving a stale worker running. + + Args: + event: The peer relation departed event. + """ + unit = event.departing_unit + if unit == self.model.unit: + self.worker.stop() + + def request_async_lock( + self, + callback_id: str, + kwargs: dict[str, Any] | None = None, + max_retry: int | None = None, + ) -> None: + """Queue a rolling operation and trigger asynchronous lock acquisition. + + This method creates a new operation representing a callback to execute + once the distributed lock is granted. The operation is appended to the + unit's pending operation queue stored in etcd. + + If the operation is successfully enqueued, the background worker process + responsible for acquiring the distributed lock and processing operations + is started. + + Args: + callback_id: Identifier of the registered callback to execute when + the lock is granted. + kwargs: Optional keyword arguments passed to the callback when + executed. Must be JSON-serializable. + max_retry: Maximum number of retries for the operation. + - None: retry indefinitely + - 0: do not retry on failure + + Raises: + RollingOpsInvalidLockRequestError: If the callback_id is not registered or + invalid parameters were provided. + RollingOpsNoEtcdRelationError: if the etcd relation does not exist + RollingOpsEtcdNotConfiguredError: if etcd client has not been configured yet + PebbleConnectionError: if the remote container cannot be reached. 
+ """ + if callback_id not in self.callback_targets: + raise RollingOpsInvalidLockRequestError(f'Unknown callback_id: {callback_id}') + + if not self._etcd_relation: + raise RollingOpsNoEtcdRelationError + + etcdctl.ensure_initialized() + + if kwargs is None: + kwargs = {} + + operation = Operation.create(callback_id, kwargs, max_retry) + self.operations_store.request(operation) + self.worker.start() + + def _on_run_with_lock(self) -> RunWithLockOutcome: + """Execute the current operation while holding the distributed lock. + + This method is triggered when the worker determines that the current + unit owns the distributed lock. The method retrieves the head operation + from the in-progress queue and executes its registered callback. + + After execution, the operation is moved to the completed queue and its + updated state is persisted. + + Returns: + A structured outcome describing whether an operation was executed + and, if so, which operation was finalized and with what result. + + Raises: + RollingOpsEtcdTransactionError: if the operation cannot be marked + as completed. + """ + if not self._async_lock.is_held(): + logger.info('Lock is not granted. Operation will not run.') + return RunWithLockOutcome(status=RunWithLockStatus.NOT_GRANTED) + + if not (operation := self.operations_store.peek_current()): + logger.info('Lock granted but there is no operation to run.') + return RunWithLockOutcome(status=RunWithLockStatus.NO_OPERATION) + + if not (callback := self.callback_targets.get(operation.callback_id)): + logger.error( + 'Operation %s target was not found. 
Releasing operation without retry.', + operation.callback_id, + ) + self.operations_store.finalize(operation, OperationResult.RELEASE) + return RunWithLockOutcome( + status=RunWithLockStatus.MISSING_CALLBACK, + op_id=operation.op_id, + result=OperationResult.RELEASE, + ) + logger.info( + 'Executing callback_id=%s, attempt=%s', operation.callback_id, operation.attempt + ) + + try: + result = callback(**operation.kwargs) + except Exception as e: + logger.exception('Operation failed: %s: %s', operation.callback_id, e) + result = OperationResult.RETRY_RELEASE + + match result: + case OperationResult.RETRY_HOLD: + logger.info( + 'Finished %s. Operation will be retried immediately.', operation.callback_id + ) + case OperationResult.RETRY_RELEASE: + logger.info('Finished %s. Operation will be retried later.', operation.callback_id) + case _: + logger.info('Finished %s. Lock will be released.', operation.callback_id) + result = OperationResult.RELEASE + + try: + self.operations_store.finalize(operation, result) + except Exception: + logger.exception('Failed to commit operation %s to etcd.', operation.callback_id) + return RunWithLockOutcome( + status=RunWithLockStatus.EXECUTED_NOT_COMMITTED, + op_id=operation.op_id, + result=result, + ) + return RunWithLockOutcome( + status=RunWithLockStatus.EXECUTED, + op_id=operation.op_id, + result=result, + ) + + def acquire_sync_lock(self, timeout: int | None) -> None: + """Acquire the etcd-backed synchronous lock for this unit. + + A dedicated lease is granted and kept alive for the duration of the + lock. The backend then repeatedly attempts to acquire the sync lock + until it succeeds or the timeout expires. + + Args: + timeout: Maximum time in seconds to wait for the lock. + None means wait indefinitely. + + Raises: + TimeoutError: If the lock could not be acquired before the timeout. + RollingOpsSyncLockError: if there was an error obtaining the lock. 
+ """ + self._lease = EtcdLease() + + deadline = None if timeout is None else time.monotonic() + timeout + + try: + self._lease.grant() + + if self._lease.id is None: + raise RollingOpsSyncLockError('Failed to grant an etcd lease.') + while True: + try: + if self._sync_lock.try_acquire(self._lease.id): + logger.info('etcd lock acquired.') + return + except Exception: + logger.exception('Failed while trying to acquire etcd sync lock.') + raise + + if deadline is not None and time.monotonic() >= deadline: + raise TimeoutError(f'Timed out acquiring etcd sync lock after {timeout}s.') + + time.sleep(15) + + except Exception as e: + try: + self._lease.revoke() + except Exception: + logger.exception('Failed to revoke lease %s.', self._lease.id) + raise RollingOpsSyncLockError('Failed to acquire the etcd sync lock') from e + + def release_sync_lock(self) -> None: + """Release the synchronous lock and revoke its lease.""" + self._sync_lock.release() + if self._lease is not None: + self._lease.revoke() + + def get_status(self) -> RollingOpsStatus: + """Return the rolling-ops status for this unit in etcd mode. + + Status is derived from the current etcd-backed lock state and the + unit's queued operation state. + + Returned values: + - UNAVAILABLE: etcd backend is not available + - GRANTED: the async lock is currently held by this unit + - WAITING: this unit has queued work but does not hold the lock + - IDLE: this unit has no pending work + + Returns: + The current rolling-ops status for this unit. 
+ """ + if self._peer_relation is None or self._etcd_relation is None or not self.is_available(): + return RollingOpsStatus.UNAVAILABLE + + if self._async_lock.is_held(): + return RollingOpsStatus.GRANTED + + if self.operations_store.has_pending_work(): + return RollingOpsStatus.WAITING + + return RollingOpsStatus.IDLE diff --git a/rollingops/src/charmlibs/rollingops/_certificates.py b/rollingops/src/charmlibs/rollingops/etcd/_certificates.py similarity index 89% rename from rollingops/src/charmlibs/rollingops/_certificates.py rename to rollingops/src/charmlibs/rollingops/etcd/_certificates.py index 34f4b9e71..3c53bd939 100644 --- a/rollingops/src/charmlibs/rollingops/_certificates.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_certificates.py @@ -19,11 +19,13 @@ with etcd via TLS. Certificates are generated only once and persisted under a local directory so they can be reused across charm executions. -Certificates are valid for 20 years. They are not renewed or rotated. +Certificates are valid for 50 years. They are not renewed or rotated. 
""" from datetime import timedelta +import shortuuid + from charmlibs import pathops from charmlibs.interfaces.tls_certificates import ( Certificate, @@ -32,11 +34,9 @@ PrivateKey, TLSCertificatesError, ) -from charmlibs.rollingops._models import ( - RollingOpsFileSystemError, - SharedCertificate, - with_pebble_retry, -) +from charmlibs.rollingops.common._exceptions import RollingOpsFileSystemError +from charmlibs.rollingops.common._utils import with_pebble_retry +from charmlibs.rollingops.etcd._models import SharedCertificate BASE_DIR = pathops.LocalPath('/var/lib/rollingops/tls') CA_CERT_PATH = BASE_DIR / 'client-ca.pem' @@ -90,7 +90,7 @@ def _has_client_cert_key_and_ca(shared: SharedCertificate) -> bool: raise RollingOpsFileSystemError('Failed to read certificates and key.') from e -def generate(common_name: str) -> SharedCertificate: +def generate(model_uuid: str, app_name: str) -> SharedCertificate: """Generate a client CA and client certificate if they do not exist. This method creates: @@ -103,8 +103,8 @@ def generate(common_name: str) -> SharedCertificate: If the certificates already exist, this method does nothing. Args: - common_name: Common Name (CN) used in the client certificate - subject. This value should not contain slashes. + model_uuid: string used to build the common name. + app_name: string used to build the common name. 
Raises: PebbleConnectionError: if the remote container cannot be reached @@ -117,6 +117,9 @@ def generate(common_name: str) -> SharedCertificate: CA_CERT_PATH, ) + # Produce a unique <=64-character string + raw = f'{model_uuid}-{app_name}' + common_name = shortuuid.uuid(name=raw) ca_key = PrivateKey.generate(key_size=KEY_SIZE) ca_attributes = CertificateRequestAttributes( common_name=common_name, diff --git a/rollingops/src/charmlibs/rollingops/etcd/_etcd.py b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py new file mode 100644 index 000000000..5ce1aef7f --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/etcd/_etcd.py @@ -0,0 +1,546 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Classes that manage etcd concepts.""" + +import logging +import os +import subprocess +import time + +import charmlibs.rollingops.etcd._etcdctl as etcdctl +from charmlibs.rollingops.common._exceptions import ( + RollingOpsEtcdctlFatalError, + RollingOpsEtcdctlParseError, + RollingOpsEtcdTransactionError, +) +from charmlibs.rollingops.common._models import Operation, OperationResult +from charmlibs.rollingops.etcd._models import RollingOpsKeys + +logger = logging.getLogger(__name__) + +LOCK_LEASE_TTL = '60' + + +class EtcdLease: + """Manage the lifecycle of an etcd lease and its keep-alive process.""" + + def __init__(self): + self.id: str | None = None + self.keepalive_proc: subprocess.Popen[str] | None = None + self._pipe_write_fd: int | None = None + + def grant(self) -> None: + """Create a new lease and start the keep-alive process.""" + res = etcdctl.run('lease', 'grant', LOCK_LEASE_TTL) + # parse: "lease 694d9c9aeca3422a granted with TTL(60s)" + parts = res.split() + try: + lease_id = parts[1] + int(lease_id, 16) + except (IndexError, ValueError) as e: + raise RollingOpsEtcdctlParseError(f'Invalid lease output: {res}') from e + + self.id = parts[1] + logger.info('%s', res) + self._start_lease_keepalive() + + def revoke(self) -> None: + """Revoke the current lease and stop the keep-alive process.""" + lease_id = self.id + try: + if self.id is not None: + etcdctl.run('lease', 'revoke', self.id) + except Exception: + logger.exception('Fail to revoke lease %s.', lease_id) + raise + finally: + try: + self._stop_keepalive() + except Exception: + logger.exception('Fail to stop keepalive for lease %s.', lease_id) + finally: + self.id = None + + def _start_lease_keepalive(self) -> None: + """Start the background process that keeps the lease alive.""" + lease_id = self.id + if lease_id is None: + logger.info('Lease ID is None. 
Keepalive for this lease cannot be started.') + return + etcdctl.ensure_initialized() + + pipe_read_fd, pipe_write_fd = os.pipe() + self._pipe_write_fd = pipe_write_fd + + keep_alive_cmd = f'{etcdctl.ETCDCTL_CMD} lease keep-alive {lease_id} /dev/null; wait' # noqa: E501 + try: + self.keepalive_proc = subprocess.Popen( + ['bash', '-c', keep_alive_cmd], + # The pipe read side becomes the child's stdin + # so when the parent closes its write side, this stdin gets EOF + stdin=pipe_read_fd, + env=etcdctl.load_env(), + text=True, + close_fds=True, + preexec_fn=self._close_write_side_in_child, + ) + except Exception: # OSError perhaps? + os.close(pipe_read_fd) + os.close(pipe_write_fd) + self._pipe_write_fd = None + raise + + os.close(pipe_read_fd) + logger.info('Keepalive started for lease %s.', self.id) + + def _close_write_side_in_child(self) -> None: + if self._pipe_write_fd is None: + return + os.close(self._pipe_write_fd) + + def _stop_keepalive(self) -> None: + """Terminate the keep-alive subprocess if it is running.""" + # Close the write side of the pipe to set EOF to the child's stdin + # and trigger the `read -r _` + if self._pipe_write_fd is not None: + try: + os.close(self._pipe_write_fd) + except OSError: + pass + finally: + self._pipe_write_fd = None + + if self.keepalive_proc is None: + return + + # Additional safeguard + try: + self.keepalive_proc.terminate() + except ProcessLookupError: + # Already dead + return + except Exception: + try: + self.keepalive_proc.wait(timeout=2) + except subprocess.TimeoutExpired: + logger.exception('Fail to stop keepalive for lease %s.') + self.keepalive_proc.kill() + return + finally: + self.keepalive_proc = None + + +class EtcdLock: + """Distributed lock implementation backed by etcd. + + The lock is represented by a key whose value identifies the current owner. + + Lock acquisition and release are performed using transactions to + ensure atomicity. 
+ + The lock is attached to an etcd lease so that it is + automatically released if the owner stops refreshing the lease. + """ + + def __init__(self, lock_key: str, owner: str): + self.lock_key = lock_key + self.owner = owner + + def try_acquire(self, lease_id: str) -> bool: + """Attempt to acquire the lock. + + This method uses an etcd transaction that succeeds only if the + lock key does not yet exist. If successful, the lock key is created with the current + owner as its value and is attached to the provided lease. + + Args: + lease_id: ID of the etcd lease to associate with the lock. + + Returns: + True if the lock was successfully acquired, otherwise False. + """ + if not self.lock_key or not self.owner or not lease_id: + raise RollingOpsEtcdctlFatalError('Invalid input for lock acquire transaction.') + + txn = f"""\ + version("{self.lock_key}") = "0" + + put "{self.lock_key}" "{self.owner}" --lease={lease_id} + + + """ + return etcdctl.txn(txn) + + def release(self) -> None: + """Release the lock if it is currently held by this owner. + + The lock is removed only if the value of the lock key matches + the current owner. This prevents one process from accidentally + releasing a lock held by another owner. + """ + if not self.lock_key or not self.owner: + raise RollingOpsEtcdctlFatalError('Invalid input for lock release transaction.') + + txn = f"""\ + value("{self.lock_key}") = "{self.owner}" + + del "{self.lock_key}" + + + """ + etcdctl.txn(txn) + + def is_held(self) -> bool: + """Check whether the lock is currently held by the owner.""" + if not self.lock_key or not self.owner: + raise RollingOpsEtcdctlFatalError('Invalid input for check lock ownership operation.') + res = etcdctl.run('get', self.lock_key, '--print-value-only') + return res == self.owner + + +class EtcdOperationQueue: + """Queue abstraction for operations stored in etcd. + + This class represents a queue of operations stored under a common + key prefix in etcd. 
Each operation is stored as a key-value pair + where the key encodes the operation identifier and ordering, and + the value contains the serialized operation data. + """ + + def __init__(self, prefix: str, lock_key: str, owner: str): + self.prefix = prefix + self.lock_key = lock_key + self.owner = owner + + def peek(self) -> Operation | None: + """Return the first operation in the queue without removing it.""" + kv = etcdctl.get_first_key_value_pair(self.prefix) + if kv is None: + return None + return Operation.model_validate(kv.value) + + def _peek_last(self) -> Operation | None: + """Return the last operation in the queue without removing it.""" + kv = etcdctl.get_last_key_value_pair(self.prefix) + if kv is None: + return None + return Operation.model_validate(kv.value) + + def move_head(self, to_queue_prefix: str) -> bool: + """Move the first operation in the queue to another queue. + + This operation is performed atomically using an etcd transaction. + The transaction succeeds only if: + - The lock is currently held by the configured owner. + - The head operation still exists. + + Args: + to_queue_prefix: Destination queue prefix. + + Returns: + True if the operation was moved successfully, otherwise False. + """ + kv = etcdctl.get_first_key_value_pair(self.prefix) + if kv is None: + return False + + op_id = kv.key.split('/')[-1] + new_key = f'{to_queue_prefix}{op_id}' + op = Operation.model_validate(kv.value) + data = op.to_string() + + txn = f"""\ + value("{self.lock_key}") = "{self.owner}" + version("{kv.key}") != "0" + + put "{new_key}" {data} + del "{kv.key}" + + + """ + return etcdctl.txn(txn) + + def move_operation(self, to_queue_prefix: str, operation: Operation) -> bool: + """Move a specific operation from this queue to another queue. + + The operation is identified using its operation ID and moved + atomically via an etcd transaction. + + Args: + to_queue_prefix: Destination queue prefix. + operation: Operation to move. 
+ + Returns: + True if the operation was successfully moved, otherwise False. + """ + old_key = f'{self.prefix}{operation.op_id}' + new_key = f'{to_queue_prefix}{operation.op_id}' + + data = operation.to_string() + + txn = f"""\ + value("{self.lock_key}") = "{self.owner}" + version("{old_key}") != "0" + + put "{new_key}" {data} + del "{old_key}" + + + """ + return etcdctl.txn(txn) + + def watch(self) -> Operation: + """Block until at least one operation exists and return it.""" + while True: + kv = etcdctl.get_first_key_value_pair(self.prefix) + if kv is not None: + return Operation.model_validate(kv.value) + time.sleep(10) + + def dequeue(self) -> bool: + """Remove the first operation from the queue. + + The removal is performed using an etcd transaction that ensures + the lock owner still holds the lock and the operation exists. + + Returns: + True if the operation was removed successfully, otherwise False. + """ + kv = etcdctl.get_first_key_value_pair(self.prefix) + if kv is None: + return False + + txn = f"""\ + value("{self.lock_key}") = "{self.owner}" + version("{kv.key}") != "0" + + del "{kv.key}" + + + """ + return etcdctl.txn(txn) + + def enqueue(self, operation: Operation) -> None: + """Insert a new operation into the queue. + + The method avoids inserting duplicate operations by comparing + the new operation with the last operation currently in the queue. + + Args: + operation: Operation to insert. + """ + old_operation = self._peek_last() + + if old_operation is not None and operation == old_operation: + logger.info( + 'Operation %s not added to the etcd queue. 
' + 'It already exists in the back of the queue.', + operation.callback_id, + ) + return + + op_str = operation.to_string() + key = f'{self.prefix}{operation.op_id}' + etcdctl.run('put', key, cmd_input=op_str) + logger.info('Operation %s added to the etcd queue.', operation.callback_id) + + def clear(self) -> None: + etcdctl.run('del', self.prefix, '--prefix') + + +class WorkerOperationStore: + """Background-worker view of etcd-backed rolling operations. + + This class is used by the background process that coordinates lock + ownership and operation execution. It manages the lifecycle of queued + operations across the etcd-backed queue prefixes: + + - pending: operations waiting to be claimed + - in-progress: operations currently being executed + - completed: operations that finished execution and await post-processing + + It provides worker-oriented methods to: + - detect pending work + - claim the next operation for execution + - wait for completed operations + - requeue or delete completed operations + """ + + def __init__(self, keys: RollingOpsKeys, owner: str): + self._pending = EtcdOperationQueue(keys.pending, keys.lock_key, owner) + self._inprogress = EtcdOperationQueue(keys.inprogress, keys.lock_key, owner) + self._completed = EtcdOperationQueue(keys.completed, keys.lock_key, owner) + + def has_pending(self) -> bool: + """Check whether there are pending operations. + + Returns: + True if at least one operation exists in the pending queue, + otherwise False. + """ + return self._pending.peek() is not None + + def has_inprogress(self) -> bool: + """Check whether there are in-progress operations. + + Returns: + True if at least one operation exists in the inprogress queue, + otherwise False. + """ + return self._inprogress.peek() is not None + + def has_completed(self) -> bool: + """Check whether there are completed operations. + + Returns: + True if at least one operation exists in the completed queue, + otherwise False. 
+ """ + return self._completed.peek() is not None + + def claim_next(self) -> str: + """Move the next pending operation to the in-progress queue. + + This operation is performed atomically and only succeeds if: + - the lock is still held by this owner + - the head of the pending queue has not changed + + Returns: + The operation ID of the operation + + Raises: + RollingOpsEtcdTransactionError: if the transaction failed. + """ + if not self._pending.move_head(self._inprogress.prefix): + raise RollingOpsEtcdTransactionError('Failed to move operation to in progress.') + + operation = self._inprogress.peek() + if operation is None: + raise RollingOpsEtcdTransactionError('Failed to get the ID of the next operation.') + return operation.op_id + + def wait_until_completed(self) -> Operation: + """Block until at least one operation appears in the completed queue.""" + return self._completed.watch() + + def requeue_completed(self) -> None: + """Requeue the head completed operation back to the pending queue. + + This is typically used when an operation needs to be retried + (e.g., RETRY_RELEASE or RETRY_HOLD semantics). + + Raises: + RollingOpsEtcdTransactionError: if the transaction failed. + """ + if not self._completed.move_head(self._pending.prefix): + raise RollingOpsEtcdTransactionError('Failed to move operation to pending.') + + def delete_completed(self) -> None: + """Remove the head operation from the completed queue. + + This is typically used when an operation has finished successfully + and does not need to be retried. + + Raises: + RollingOpsEtcdTransactionError: if the transaction failed. + """ + if not self._completed.dequeue(): + raise RollingOpsEtcdTransactionError('Failed finalize operation.') + + +class ManagerOperationStore: + """Charm-facing interface for requesting and finalizing etcd-backed operations. + + This class is used by the RollingOps manager running inside the charm. 
+ It provides a narrow interface for interacting with the etcd-backed + operation queues without exposing the full queue topology. + + The manager can use it to: + - request a new operation + - inspect the current in-progress operation + - finalize an operation after execution + + Queue transitions and storage details remain encapsulated behind this API. + """ + + def __init__(self, keys: RollingOpsKeys, owner: str): + self._pending = EtcdOperationQueue(keys.pending, keys.lock_key, owner) + self._inprogress = EtcdOperationQueue(keys.inprogress, keys.lock_key, owner) + self._completed = EtcdOperationQueue(keys.completed, keys.lock_key, owner) + + def request(self, operation: Operation) -> None: + """Add a new operation to the pending queue. + + Duplicate operations (same callback_id and kwargs as the last queued + operation) are not inserted. + + Args: + operation: Operation to enqueue. + """ + self._pending.enqueue(operation) + + def finalize(self, operation: Operation, result: OperationResult) -> None: + """Move an in-progress operation to the completed queue. + + This should be called after the operation has been executed and its + result has been recorded. + + Args: + operation: The operation currently in the in-progress queue. + result: Result of the executions. + + Raises: + RollingOpsEtcdTransactionError: if the operation cannot be marked + as completed. + """ + match result: + case OperationResult.RETRY_HOLD: + operation.retry_hold() + case OperationResult.RETRY_RELEASE: + operation.retry_release() + case _: + operation.complete() + + if not self._inprogress.move_operation(self._completed.prefix, operation): + raise RollingOpsEtcdTransactionError('Failed to set the operation as completed.') + + def peek_current(self) -> Operation | None: + """Return the current in-progress operation without modifying state. + + Returns: + The current in-progress operation, or None if no operation is + being processed. 
+ """ + return self._inprogress.peek() + + def has_pending_work(self) -> bool: + """Return whether there is an operation currently being processed. + + Returns: + True if there is a current operation, otherwise False. + """ + return self.peek_current() is not None + + def clean_up(self) -> None: + """Clear all operation queues for this unit. + + This removes all in-progress, pending, and completed operations, + resetting the local etcd-backed state. It is typically used when + recovering from inconsistencies or after switching backends to + ensure a clean starting point. + """ + self._inprogress.clear() + self._pending.clear() + self._completed.clear() diff --git a/rollingops/src/charmlibs/rollingops/_etcdctl.py b/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py similarity index 51% rename from rollingops/src/charmlibs/rollingops/_etcdctl.py rename to rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py index 4befb143a..e5eb4f1c3 100644 --- a/rollingops/src/charmlibs/rollingops/_etcdctl.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_etcdctl.py @@ -27,14 +27,24 @@ from dataclasses import asdict from functools import lru_cache +from tenacity import ( + before_sleep_log, + retry, + retry_if_exception_type, + stop_after_attempt, + wait_fixed, +) + from charmlibs import pathops -from charmlibs.rollingops._models import ( - CERT_MODE, - EtcdConfig, +from charmlibs.rollingops.common._exceptions import ( + RollingOpsEtcdctlFatalError, + RollingOpsEtcdctlParseError, + RollingOpsEtcdctlRetryableError, RollingOpsEtcdNotConfiguredError, RollingOpsFileSystemError, - with_pebble_retry, ) +from charmlibs.rollingops.common._utils import with_pebble_retry +from charmlibs.rollingops.etcd._models import CERT_MODE, EtcdConfig, EtcdKV logger = logging.getLogger(__name__) @@ -42,6 +52,9 @@ SERVER_CA_PATH = BASE_DIR / 'server-ca.pem' CONFIG_FILE_PATH = BASE_DIR / 'etcdctl.json' ETCDCTL_CMD = 'etcdctl' +ETCDCTL_TIMEOUT_SECONDS = 15 +ETCDCTL_RETRY_ATTEMPTS = 12 
+ETCDCTL_RETRY_WAIT_SECONDS = 5 @lru_cache(maxsize=1) @@ -185,31 +198,210 @@ def cleanup() -> None: raise RollingOpsFileSystemError('Failed to remove etcd config file and CA.') from e -def run(*args: str) -> str | None: +def _is_retryable_stderr(stderr: str) -> bool: + """Return whether stderr looks like a transient etcd/client failure.""" + text = stderr.lower() + retryable_markers = ( + 'connection refused', + 'context deadline exceeded', + 'deadline exceeded', + 'temporarily unavailable', + 'transport is closing', + 'connection reset', + 'broken pipe', + 'unavailable', + 'leader changed', + 'etcdserver: request timed out', + ) + return any(marker in text for marker in retryable_markers) + + +@retry( + retry=retry_if_exception_type(RollingOpsEtcdctlRetryableError), + stop=stop_after_attempt(ETCDCTL_RETRY_ATTEMPTS), + wait=wait_fixed(ETCDCTL_RETRY_WAIT_SECONDS), + before_sleep=before_sleep_log(logger, logging.WARNING), + reraise=True, +) +def _run_checked(*args: str, cmd_input: str | None = None) -> subprocess.CompletedProcess[str]: + """Execute etcdctl and return the completed process. + + Raises: + RollingOpsEtcdNotConfiguredError: if etcdctl is not configured. + PebbleConnectionError: if the remote container cannot be reached. + RollingOpsEtcdctlRetryableError: for transient command failures. + RollingOpsEtcdctlFatalError: for non-retryable command failures. 
+ """ + ensure_initialized() + + cmd = [ETCDCTL_CMD, *args] + + try: + res = subprocess.run( + cmd, + env=load_env(), + input=cmd_input, + text=True, + capture_output=True, + check=False, + timeout=ETCDCTL_TIMEOUT_SECONDS, + ) + except subprocess.TimeoutExpired as e: + logger.warning( + 'Timed out running etcdctl: cmd=%r stdout=%r stderr=%r', cmd, e.stdout, e.stderr + ) + raise RollingOpsEtcdctlRetryableError(f'Timed out running etcdctl: {cmd!r}') from e + except FileNotFoundError as e: + logger.exception('etcdctl executable not found: %s', ETCDCTL_CMD) + raise RollingOpsEtcdctlFatalError(f'etcdctl executable not found: {ETCDCTL_CMD}') from e + except OSError as e: + logger.exception('Failed to execute etcdctl: cmd=%r', cmd) + raise RollingOpsEtcdctlFatalError(f'Failed to execute etcdctl: {cmd!r}') from e + + if res.returncode != 0: + logger.warning( + 'etcdctl command failed: cmd=%r returncode=%s stdout=%r stderr=%r', + cmd, + res.returncode, + res.stdout, + res.stderr, + ) + if _is_retryable_stderr(res.stderr): + raise RollingOpsEtcdctlRetryableError( + f'Retryable etcdctl failure (rc={res.returncode}): {res.stderr.strip()}' + ) + raise RollingOpsEtcdctlFatalError( + f'etcdctl failed (rc={res.returncode}): {res.stderr.strip()}' + ) + + logger.debug('etcdctl command succeeded: cmd=%r stdout=%r', cmd, res.stdout) + return res + + +def run(*args: str, cmd_input: str | None = None) -> str: """Execute an etcdctl command. Args: args: List of arguments to pass to etcdctl. + cmd_input: value to use as input when running the command. Returns: The stdout of the command, stripped, or None if execution failed. Raises: - RollingOpsEtcdNotConfiguredError: if the etcd config file does not exist. + RollingOpsEtcdNotConfiguredError: if etcdctl is not configured. + RollingOpsFileSystemError: if configuration cannot be read. PebbleConnectionError: if the remote container cannot be reached. + RollingOpsEtcdctlError: etcdctl command error. 
""" - ensure_initialized() - cmd = [ETCDCTL_CMD, *args] + return _run_checked(*args, cmd_input=cmd_input).stdout.strip() - try: - result = subprocess.run( - cmd, env=load_env(), check=True, text=True, capture_output=True - ).stdout.strip() - except subprocess.CalledProcessError as e: - logger.error('etcdctl command failed: returncode: %s, error: %s', e.returncode, e.stderr) - return None - except subprocess.TimeoutExpired as e: - logger.error('Timed out running etcdctl: %s', e.stderr) + +def _get_key_value_pair(key_prefix: str, *extra_args: str) -> EtcdKV | None: + """Retrieve the first key and value under a given prefix. + + Args: + key_prefix: Key prefix to search for. + extra_args: Arguments to the get command + + Returns: + A EtcdKV containing: + - The key string + - The parsed JSON value as a dictionary + + Returns None if no key exists. + + Raises: + RollingOpsEtcdctlParseError: if the output is malformed + + """ + res = run('get', key_prefix, '--prefix', *extra_args) + out = res.splitlines() + if len(out) < 2: return None - return result + try: + value = json.loads(out[1]) + except json.JSONDecodeError as e: + raise RollingOpsEtcdctlParseError( + f'Failed to parse JSON value for key {out[0]}: {out[1]}' + ) from e + + return EtcdKV(key=out[0], value=value) + + +def get_first_key_value_pair(key_prefix: str) -> EtcdKV | None: + """Retrieve the first key and value under a given prefix. + + Args: + key_prefix: Key prefix to search for. + + Returns: + A tuple containing: + - The key string + - The parsed JSON value as a dictionary + + Returns None if no key exists or the command fails. + + Raises: + RollingOpsEtcdctlParseError: if the output is malformed + """ + return _get_key_value_pair(key_prefix, '--limit=1') + + +def get_last_key_value_pair(key_prefix: str) -> EtcdKV | None: + """Retrieve the last key and value under a given prefix. + + Args: + key_prefix: Key prefix to search for. 
+ + Returns: + A tuple containing: + - The key string + - The parsed JSON value as a dictionary + + Returns None if no key exists or the command fails. + + Raises: + RollingOpsEtcdctlParseError: if the output is malformed + """ + return _get_key_value_pair( + key_prefix, + '--sort-by=KEY', + '--order=DESCEND', + '--limit=1', + ) + + +def txn(txn_input: str) -> bool: + """Execute an etcd transaction. + + The transaction string should follow the etcdctl transaction format + where comparison statements are followed by operations. + + Args: + txn_input: The transaction specification passed to `etcdctl txn`. + + Returns: + True if the transaction succeeded, otherwise False. + + Raises: + RollingOpsEtcdNotConfiguredError: if etcdctl is not configured. + PebbleConnectionError: if the remote container cannot be reached. + RollingOpsEtcdctlError: etcdctl command error. + RollingOpsEtcdctlParseError: if invalid response is found + """ + res = _run_checked('txn', cmd_input=txn_input) + + lines = res.stdout.splitlines() + if not lines: + raise RollingOpsEtcdctlParseError('Empty txn response') + + first_line = lines[0].strip() + + if first_line == 'SUCCESS': + return True + if first_line == 'FAILURE': + return False + + raise RollingOpsEtcdctlParseError(f'Unexpected txn response: {res.stdout}') diff --git a/rollingops/src/charmlibs/rollingops/_models.py b/rollingops/src/charmlibs/rollingops/etcd/_models.py similarity index 81% rename from rollingops/src/charmlibs/rollingops/_models.py rename to rollingops/src/charmlibs/rollingops/etcd/_models.py index 5e653bc2e..fa1daaa08 100644 --- a/rollingops/src/charmlibs/rollingops/_models.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_models.py @@ -14,74 +14,17 @@ """etcd rolling ops models.""" -from collections.abc import Callable from dataclasses import dataclass -from enum import StrEnum -from typing import ClassVar, TypeVar - -from ops import pebble -from tenacity import retry, retry_if_exception_type, stop_after_attempt, 
wait_fixed +from typing import ClassVar from charmlibs.interfaces.tls_certificates import Certificate, PrivateKey -from charmlibs.pathops import LocalPath, PebbleConnectionError - -T = TypeVar('T') - - -class RollingOpsNoEtcdRelationError(Exception): - """Raised if we are trying to process a lock, but do not appear to have a relation yet.""" - - -class RollingOpsEtcdUnreachableError(Exception): - """Raised if etcd server is unreachable.""" - - -class RollingOpsEtcdNotConfiguredError(Exception): - """Raised if etcd client has not been configured yet (env file does not exist).""" - - -class RollingOpsFileSystemError(Exception): - """Raised if there is a problem when interacting with the filesystem.""" - - -class RollingOpsInvalidLockRequestError(Exception): - """Raised if the lock request is invalid.""" - - -class RollingOpsDecodingError(Exception): - """Raised if json content cannot be processed.""" - - -class RollingOpsInvalidSecretContentError(Exception): - """Raised if the content of a secret is invalid.""" - - -class RollingOpsCharmLibMissingError(Exception): - """Raised if the path to the libraries cannot be resolved.""" - +from charmlibs.pathops import LocalPath +from charmlibs.rollingops.common._utils import with_pebble_retry CERT_MODE = 0o644 KEY_MODE = 0o600 -@retry( - retry=retry_if_exception_type((PebbleConnectionError, pebble.APIError, pebble.ChangeError)), - stop=stop_after_attempt(3), - wait=wait_fixed(10), - reraise=True, -) -def with_pebble_retry[T](func: Callable[[], T]) -> T: - return func() - - -class OperationResult(StrEnum): - """Callback return values.""" - - RELEASE = 'release' - RETRY_RELEASE = 'retry-release' - RETRY_HOLD = 'retry-hold' - - @dataclass(frozen=True) class SharedCertificate: """Represent the certificates shared within units of an app to connect to etcd.""" @@ -207,6 +150,14 @@ class EtcdConfig: key_path: str +@dataclass +class EtcdKV: + """A single etcd key-value entry.""" + + key: str + value: dict[str, str] + + 
@dataclass(frozen=True) class RollingOpsKeys: """Collection of etcd key prefixes used for rolling operations. diff --git a/rollingops/src/charmlibs/rollingops/_relations.py b/rollingops/src/charmlibs/rollingops/etcd/_relations.py similarity index 96% rename from rollingops/src/charmlibs/rollingops/_relations.py rename to rollingops/src/charmlibs/rollingops/etcd/_relations.py index 7189f1ef1..8d37f92d8 100644 --- a/rollingops/src/charmlibs/rollingops/_relations.py +++ b/rollingops/src/charmlibs/rollingops/etcd/_relations.py @@ -32,9 +32,10 @@ from ops.framework import Object from charmlibs.interfaces.tls_certificates import Certificate, TLSCertificatesError -from charmlibs.rollingops import _certificates as certificates -from charmlibs.rollingops import _etcdctl as etcdctl -from charmlibs.rollingops._models import RollingOpsInvalidSecretContentError, SharedCertificate +from charmlibs.rollingops.common._exceptions import RollingOpsInvalidSecretContentError +from charmlibs.rollingops.etcd import _certificates as certificates +from charmlibs.rollingops.etcd import _etcdctl as etcdctl +from charmlibs.rollingops.etcd._models import SharedCertificate logger = logging.getLogger(__name__) CERT_SECRET_FIELD = 'rollingops-client-secret-id' # noqa: S105 @@ -111,8 +112,7 @@ def create_and_share_certificate(self) -> None: ) return - common_name = f'rollingops-{self.model.uuid}-{self.model.app.name}' - shared = certificates.generate(common_name) + shared = certificates.generate(self.model.uuid, self.model.app.name) secret = self.model.app.add_secret( content={ diff --git a/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py b/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py new file mode 100644 index 000000000..9d45b183a --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/etcd/_rollingops.py @@ -0,0 +1,181 @@ +# Copyright 2026 Canonical Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import time + +from charmlibs.rollingops.common._models import OperationResult +from charmlibs.rollingops.common._utils import ( + ETCD_FAILED_HOOK_NAME, + dispatch_etcd_failed, + dispatch_lock_granted, + setup_logging, +) +from charmlibs.rollingops.etcd._etcd import ( + EtcdLease, + EtcdLock, + WorkerOperationStore, +) +from charmlibs.rollingops.etcd._models import RollingOpsKeys +from charmlibs.rollingops.etcd._worker import ETCD_LOG_FILENAME + +logger = logging.getLogger(__name__) + +INITIAL_SLEEP = 10 # Delay before the worker begins processing. +LOCK_ACQUIRE_SLEEP = 15 # Delay between etcd lock acquisition attempts. +NEXT_OP_SLEEP = 30 # Delay between queue polls when idle. + + +class RollingOpsEtcdInconsistencyError(Exception): + """Raised when unexpected or inconsistent etcd operation state is found.""" + + +def main(): + """Run the etcd rolling-ops worker loop. + + This worker is responsible for processing the current unit's + etcd-backed operation queue. It waits for pending work, acquires the + etcd lock, claims the next operation, dispatches the lock-granted + hook, and then waits for the operation result to be written back. 
+ + Processing behavior depends on the final operation result: + + - `RETRY_HOLD`: requeue the operation immediately and keep the lock + - `RETRY_RELEASE`: requeue the operation and release the lock + - any other result: remove the completed operation and release the lock + + If the worker detects invalid etcd queue state or encounters an + unrecoverable error, it dispatches the ETCD_FAILED_HOOK_NAME + hook so the charm can fall back to peer-based processing. + + The worker always attempts to revoke its lease and release the lock + before exiting. + """ + parser = argparse.ArgumentParser(description='RollingOps etcd worker') + parser.add_argument( + '--unit-name', + type=str, + required=True, + help='Juju unit name (e.g. app/0)', + ) + parser.add_argument( + '--charm-dir', + type=str, + required=True, + help='Path to the charm directory', + ) + + parser.add_argument( + '--owner', + type=str, + required=True, + help='Unique owner identifier for the unit', + ) + parser.add_argument( + '--cluster-id', + type=str, + required=True, + help='Cluster identifier', + ) + args = parser.parse_args() + + setup_logging( + ETCD_LOG_FILENAME, unit_name=args.unit_name, owner=args.owner, cluster_id=args.cluster_id + ) + logger.info('Starting worker.') + + time.sleep(INITIAL_SLEEP) + + keys = RollingOpsKeys.for_owner(args.cluster_id, args.owner) + lock = EtcdLock(keys.lock_key, args.owner) + lease = EtcdLease() + operations = WorkerOperationStore(keys, args.owner) + + try: + while True: + if operations.has_inprogress() or operations.has_completed(): + raise RollingOpsEtcdInconsistencyError('Invalid operations found in etcd queues.') + + if not operations.has_pending(): + time.sleep(NEXT_OP_SLEEP) + continue + + logger.info('Operation found in the pending queue.') + + if not lock.is_held(): + if lease.id is None: + lease.grant() + + if lease.id is None: + raise RollingOpsEtcdInconsistencyError('Invalid lease ID found.') + + logger.info('Try to get lock using lease %s.', lease.id) + 
while not lock.try_acquire(lease.id): + time.sleep(LOCK_ACQUIRE_SLEEP) + continue + logger.info('Lock granted using lease %s.', lease.id) + + op_id = operations.claim_next() + + dispatch_lock_granted(args.unit_name, args.charm_dir) + + logger.info('Waiting for operation %s to be finished.', op_id) + operation = operations.wait_until_completed() + + logger.info('Operation %s completed with %s', operation.op_id, operation.result) + match operation.result: + case OperationResult.RETRY_HOLD: + operations.requeue_completed() + continue + + case OperationResult.RETRY_RELEASE: + operations.requeue_completed() + + case _: + operations.delete_completed() + + lease_id = lease.id + lease.revoke() + lock.release() + logger.info('Lease %s revoked and lock released.', lease_id) + time.sleep(NEXT_OP_SLEEP) + + except Exception as e: + logger.exception('Fatal etcd worker error: %s', e) + + try: + dispatch_etcd_failed(args.unit_name, args.charm_dir) + except Exception: + logger.exception('Failed to dispatch %s hook.', ETCD_FAILED_HOOK_NAME) + + finally: + lease_id = lease.id + try: + lease.revoke() + logger.info('Lease %s revoked.', lease_id) + except Exception: + logger.exception('Failed to revoke lease %s during worker shutdown.', lease_id) + + try: + lock.release() + logger.info('Lock released.') + except Exception: + logger.exception('Failed to release lock during worker shutdown.') + + logger.info('Exit.') + + +if __name__ == '__main__': + main() diff --git a/rollingops/src/charmlibs/rollingops/etcd/_worker.py b/rollingops/src/charmlibs/rollingops/etcd/_worker.py new file mode 100644 index 000000000..8240b4378 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/etcd/_worker.py @@ -0,0 +1,124 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""etcd rolling ops. Spawns and manages the external rolling-ops worker process.""" + +import logging + +from ops.charm import CharmBase + +from charmlibs import pathops +from charmlibs.rollingops.common._base_worker import BaseRollingOpsAsyncWorker + +logger = logging.getLogger(__name__) + +ETCD_LOG_FILENAME = '/var/log/etcd_rollingops_worker.log' + + +class EtcdRollingOpsAsyncWorker(BaseRollingOpsAsyncWorker): + """Manage the etcd-backed rolling-ops worker process. + + Unlike the peer backend, each unit runs its own worker process when + using the etcd backend. Worker PID is stored in the unit databag, + ensuring isolation between units and allowing each unit to independently + manage its own worker lifecycle. + """ + + _pid_field = 'etcd-rollingops-worker-pid' + _log_filename = ETCD_LOG_FILENAME + + def __init__(self, charm: CharmBase, peer_relation_name: str, owner: str, cluster_id: str): + super().__init__(charm, 'etcd-rollingops-async-worker', peer_relation_name) + self._owner = owner + self._cluster_id = cluster_id + + def _worker_script_path(self) -> pathops.LocalPath: + """Return the path to the etcd rolling-ops worker script. + + This script is executed in a background process to handle operation + processing for the etcd backend. + """ + return pathops.LocalPath( + self._venv_site_packages() / 'charmlibs' / 'rollingops' / 'etcd' / '_rollingops.py' + ) + + def _worker_args(self) -> list[str]: + """Return the arguments passed to the etcd worker process. + + Returns: + A list of command-line arguments for the worker process. 
+ """ + return [ + '--owner', + self._owner, + '--cluster-id', + self._cluster_id, + ] + + @property + def _pid(self) -> int | None: + """Return the stored worker process PID for this unit. + + The PID is stored in the unit databag because each unit runs its own + independent worker process when using the etcd backend. This ensures + that worker lifecycle management is isolated per unit. + + Returns: + The worker process PID, or None if not set. + """ + if self._relation is None: + return None + pid = self._relation.data[self.model.unit].get(self._pid_field, '') + + try: + pid = int(pid) + except (ValueError, TypeError): + logger.info('Missing PID or invalid PID found in etcd worker state.') + pid = None + + return pid + + @_pid.setter + def _pid(self, value: int | None) -> None: + """Persist the worker process PID in the unit databag. + + The PID is stored per unit to reflect that each unit owns and manages + its own worker process when using the etcd backend. + + Args: + value: The process identifier to store. + """ + if self._relation is None: + return + self._relation.data[self.model.unit].update({ + self._pid_field: '' if value is None else str(value) + }) + + def _on_existing_worker(self, pid: int) -> bool: + """Executed on detection of an already running worker for this unit. + + Since each unit manages its own worker process, an existing worker is + considered valid and is left running. No restart is performed. + + Args: + pid: The PID of the currently running worker. + + Returns: + False to indicate that no new worker should be started. + """ + logger.info( + 'RollingOps worker already running with PID %s; not starting a new one.', + pid, + ) + return False diff --git a/rollingops/src/charmlibs/rollingops/peer/__init__.py b/rollingops/src/charmlibs/rollingops/peer/__init__.py new file mode 100644 index 000000000..c75a6c654 --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/peer/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2026 Canonical Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Rollingops for charms using peer relations.""" diff --git a/rollingops/src/charmlibs/rollingops/_peer_manager.py b/rollingops/src/charmlibs/rollingops/peer/_backend.py similarity index 57% rename from rollingops/src/charmlibs/rollingops/_peer_manager.py rename to rollingops/src/charmlibs/rollingops/peer/_backend.py index e75bdcb78..310285c95 100644 --- a/rollingops/src/charmlibs/rollingops/_peer_manager.py +++ b/rollingops/src/charmlibs/rollingops/peer/_backend.py @@ -86,19 +86,19 @@ interface: rolling_op ``` -Import this library into src/charm.py, and initialize a PeerRollingOpsManager in the Charm's +Import this library into src/charm.py, and initialize a PeerRollingOpsBackend in the Charm's `__init__`. 
The Charm should also define a callback routine, which will be executed when a unit holds the distributed lock: src/charm.py ```python -from charms.rolling_ops.v1.rollingops import PeerRollingOpsManager, OperationResult +from charms.rolling_ops.v1.rollingops import PeerRollingOpsBackend, OperationResult class SomeCharm(CharmBase): def __init__(self, *args): super().__init__(*args) - self.rolling_ops = PeerRollingOpsManager( + self.rolling_ops = PeerRollingOpsBackend( charm=self, relation_name="restart", callback_targets={ @@ -152,47 +152,66 @@ def _on_restart_action(self, event) -> None: from collections.abc import Callable from typing import Any -from ops import Relation +from ops import Object, Relation, Unit from ops.charm import ( CharmBase, RelationChangedEvent, RelationDepartedEvent, ) -from ops.framework import EventBase, Object +from ops.framework import EventBase -from charmlibs.rollingops._peer_models import ( - Lock, - LockIterator, - OperationResult, +from charmlibs.rollingops.common._exceptions import ( RollingOpsDecodingError, RollingOpsInvalidLockRequestError, RollingOpsNoRelationError, +) +from charmlibs.rollingops.common._models import ( + Operation, + OperationResult, + RollingOpsStatus, + RunWithLockOutcome, + RunWithLockStatus, +) +from charmlibs.rollingops.peer._models import ( + PeerAppLock, + PeerUnitOperations, + iter_peer_units, pick_oldest_completed, pick_oldest_request, ) -from charmlibs.rollingops._peer_worker import PeerRollingOpsAsyncWorker +from charmlibs.rollingops.peer._worker import PeerRollingOpsAsyncWorker logger = logging.getLogger(__name__) -class PeerRollingOpsManager(Object): - """Emitters and handlers for rolling ops.""" +class PeerRollingOpsBackend(Object): + """Manage rolling operations using the peer-relation backend. + + This backend stores operation queues in the peer relation and relies + on the leader unit to schedule lock grants across units. 
Once a unit + is granted the lock, it executes its queued operation locally. + + The peer backend acts as both the primary backend when etcd is not + available and as the durable fallback state used to continue + processing when etcd-backed execution fails. + """ def __init__( self, charm: CharmBase, relation_name: str, callback_targets: dict[str, Callable[..., Any]] ): - """Register our custom events. + """Initialize the peer-backed rolling-ops backend. - params: - charm: the charm we are attaching this to. - relation_name: the peer relation name from metadata.yaml. - callback_targets: mapping from callback_id -> callable. + Args: + charm: The charm instance owning this backend. + relation_name: Name of the peer relation used to store lock and + operation state. + callback_targets: Mapping from callback identifiers to callables + executed when this unit is granted the lock. """ super().__init__(charm, 'peer-rolling-ops-manager') self._charm = charm self.relation_name = relation_name self.callback_targets = callback_targets - self.charm_dir = charm.charm_dir self.worker = PeerRollingOpsAsyncWorker(charm, relation_name=relation_name) self.framework.observe( @@ -202,13 +221,64 @@ def __init__( charm.on[self.relation_name].relation_departed, self._on_relation_departed ) self.framework.observe(charm.on.leader_elected, self._process_locks) - self.framework.observe(charm.on.update_status, self._on_rollingops_lock_granted) @property def _relation(self) -> Relation | None: - """Returns the peer relation used to manage locks.""" + """Return the peer relation used for lock and operation state.""" return self.model.get_relation(self.relation_name) + def _lock(self) -> PeerAppLock: + """Return the shared application-level peer lock. + + This lock is stored in the peer relation application databag and is + used by the leader to grant execution rights to one unit at a time. 
+ """ + return PeerAppLock(self.model, self.relation_name) + + def _operations(self, unit: Unit) -> PeerUnitOperations: + """Return the peer-backed operation queue for a unit. + + Args: + unit: The unit whose operation queue should be accessed. + + Returns: + A helper for reading and updating that unit's queued operations. + """ + return PeerUnitOperations(self.model, self.relation_name, unit) + + def enqueue_operation(self, operation: Operation) -> None: + """Persist an operation in the current unit's peer-backed queue. + + Args: + operation: The operation to enqueue. + + Raises: + RollingOpsInvalidLockRequestError: If the operation could not be + persisted due to invalid or undecodable queue state. + RollingOpsNoRelationError: If the peer relation is not available. + """ + try: + self._operations(self.model.unit).request(operation) + except (RollingOpsDecodingError, ValueError) as e: + logger.error('Failed to create operation: %s', e) + raise RollingOpsInvalidLockRequestError('Failed to create the lock request') from e + except RollingOpsNoRelationError as e: + logger.debug('No %s peer relation yet.', self.relation_name) + raise e + + def ensure_processing(self) -> None: + """Trigger peer-based scheduling if the current unit is leader. + + In the peer backend, scheduling decisions are made only by the + leader unit. Non-leader units do not actively process locks. + """ + if self.model.unit.is_leader(): + self._process_locks() + + def has_pending_work(self) -> bool: + """Return whether the current unit has pending peer-managed work.""" + return self._operations(self.model.unit).has_pending_work() + def _on_rollingops_lock_granted(self, event: EventBase) -> None: """Handler of the custom hook rollingops_lock_granted. 
@@ -216,11 +286,11 @@ def _on_rollingops_lock_granted(self, event: EventBase) -> None: """ if not self._relation: return - logger.info('Received a rolling-ops lock granted event.') - lock = Lock(self.model, self.relation_name, self.model.unit) - if lock.should_run(): + lock = self._lock() + operations = self._operations(self.model.unit) + if operations.should_run(lock): self._on_run_with_lock() - self._process_locks() + self._process_locks() def _on_relation_departed(self, event: RelationDepartedEvent) -> None: """Leader cleanup: if a departing unit was granted a lock, clear the grant. @@ -230,19 +300,25 @@ def _on_relation_departed(self, event: RelationDepartedEvent) -> None: if not self.model.unit.is_leader(): return if unit := event.departing_unit: - lock = Lock(self.model, self.relation_name, unit) - if lock.is_granted(): + lock = self._lock() + if lock.is_granted(unit.name): lock.release() - self._process_locks() + self._process_locks() def _on_relation_changed(self, _: RelationChangedEvent) -> None: - """Process relation changed.""" + """React to peer relation changes. + + The leader re-runs scheduling whenever peer relation state changes. + Non-leader units only check whether they should execute an operation + that has already been granted to them. 
+ """ if self.model.unit.is_leader(): self._process_locks() return - lock = Lock(self.model, self.relation_name, self.model.unit) - if lock.should_run(): + lock = self._lock() + operations = self._operations(self.model.unit) + if operations.should_run(lock): self._on_run_with_lock() def _valid_peer_unit_names(self) -> set[str]: @@ -258,7 +334,9 @@ def _release_stale_grant(self) -> None: if not self._relation: return - if not (granted_unit := self._relation.data[self.model.app].get('granted_unit', '')): + lock = self._lock() + granted_unit = lock.granted_unit + if not granted_unit: return valid_units = self._valid_peer_unit_names() @@ -267,7 +345,7 @@ def _release_stale_grant(self) -> None: 'granted_unit=%s is not in current peer units; releasing stale grant.', granted_unit, ) - self._relation.data[self.model.app].update({'granted_unit': '', 'granted_at': ''}) + lock.release() def _process_locks(self, _: EventBase | None = None) -> None: """Process locks. @@ -278,21 +356,28 @@ def _process_locks(self, _: EventBase | None = None) -> None: if not self.model.unit.is_leader(): return - for lock in LockIterator(self.model, self.relation_name): - if lock.should_release(): + lock = self._lock() + + for unit in iter_peer_units(self.model, self.relation_name): + operations = self._operations(unit) + if not operations.is_peer_managed(): + continue + if operations.should_release(lock): lock.release() break self._release_stale_grant() - granted_unit = self._relation.data[self.model.app].get('granted_unit', '') # type: ignore[reportOptionalMemberAccess] - if granted_unit: - logger.info('Current granted_unit=%s. No new unit will be scheduled.', granted_unit) + if lock.granted_unit: + logger.info( + 'Current granted_unit=%s. No new unit will be scheduled.', + lock.granted_unit, + ) return - self._schedule() + self._schedule(lock) - def _schedule(self) -> None: + def _schedule(self, lock: PeerAppLock) -> None: """Select and grant the next lock based on priority and queue state. 
This method iterates over all locks associated with the relation and @@ -309,17 +394,23 @@ def _schedule(self) -> None: """ logger.info('Starting scheduling.') - pending_requests: list[Lock] = [] - pending_retries: list[Lock] = [] + pending_requests: list[PeerUnitOperations] = [] + pending_retries: list[PeerUnitOperations] = [] - for lock in LockIterator(self.model, self.relation_name): - if lock.is_retry_hold(): - self._grant_lock(lock) + for unit in iter_peer_units(self.model, self.relation_name): + operations = self._operations(unit) + + if not operations.is_peer_managed(): + continue + + if operations.is_retry_hold(): + self._grant_lock(lock, operations.unit.name) return - if lock.is_waiting(): - pending_requests.append(lock) - elif lock.is_waiting_retry(): - pending_retries.append(lock) + + if operations.is_waiting(): + pending_requests.append(operations) + elif operations.is_waiting_retry(): + pending_retries.append(operations) selected = None if pending_requests: @@ -327,30 +418,28 @@ def _schedule(self) -> None: elif pending_retries: selected = pick_oldest_completed(pending_retries) - if not selected: + if selected is None: logger.info('No pending lock requests. Lock was not granted to any unit.') return - self._grant_lock(selected) + self._grant_lock(lock, selected) - def _grant_lock(self, selected: Lock) -> None: + def _grant_lock(self, lock: PeerAppLock, unit_name: str) -> None: """Grant the lock to the selected unit. - If the lock is granted to the leader unit: - - If it is a retry, starts the worker to break the loop before next execution. - - Otherwise, the callback is run immediately + Once the lock is granted, the selected unit becomes eligible to + execute its next queued operation. If the selected unit is the local + unit (leader), its worker process is started to trigger execution. Args: - selected: The lock instance to grant. + lock: The peer lock instance to grant. + unit_name: Name of the unit receiving the lock grant. 
""" - selected.grant() - logger.info('Lock granted to unit=%s.', selected.unit.name) - if selected.unit == self.model.unit: - if selected.is_retry(): - self.worker.start() - return - self._on_run_with_lock() - self._process_locks() + lock.grant(unit_name) + logger.info('Lock granted to unit=%s.', unit_name) + + if unit_name == self.model.unit.name: + self.worker.start() def request_async_lock( self, @@ -386,11 +475,12 @@ def request_async_lock( try: if kwargs is None: kwargs = {} - lock = Lock(self.model, self.relation_name, self.model.unit) - lock.request(callback_id, kwargs, max_retry) + operation = Operation.create(callback_id, kwargs, max_retry) + operations = self._operations(self.model.unit) + operations.request(operation) except (RollingOpsDecodingError, ValueError) as e: - logger.error('Failed operation: %s', e) + logger.error('Failed to create operation: %s', e) raise RollingOpsInvalidLockRequestError('Failed to create the lock request') from e except RollingOpsNoRelationError as e: logger.debug('No %s peer relation yet.', self.relation_name) @@ -407,22 +497,24 @@ def _on_run_with_lock(self) -> None: - Otherwise, the operation's callback is looked up by `callback_id` and invoked with the operation kwargs. """ - lock = Lock(self.model, self.relation_name, self.model.unit) + lock = self._lock() + operations = self._operations(self.model.unit) - if not lock.is_granted(): + if not lock.is_granted(self.model.unit.name): logger.debug('Lock is not granted. Operation will not run.') return - if not (operation := lock.get_current_operation()): + if not (operation := operations.get_current()): logger.debug('There is no operation to run.') - lock.complete() + operations.finish(OperationResult.RELEASE) return if not (callback := self.callback_targets.get(operation.callback_id)): - logger.warning( - 'Operation %s target was not found. It cannot be executed.', + logger.error( + 'Operation %s target was not found. 
Releasing operation without retry.', + operation.callback_id, + ) + operations.finish(OperationResult.RELEASE) + return + logger.info( + 'Executing callback_id=%s, attempt=%s', operation.callback_id, operation.attempt @@ -433,17 +525,70 @@ + except Exception as e: + logger.exception('Operation failed: %s: %s', operation.callback_id, e) + result = OperationResult.RETRY_RELEASE - match result: - case OperationResult.RETRY_HOLD: - logger.info( - 'Finished %s. Operation will be retried immediately.', operation.callback_id - ) - lock.retry_hold() + logger.info('Operation %s executed with result %s.', operation.callback_id, result) + operations.finish(result) + + def mirror_outcome(self, outcome: RunWithLockOutcome) -> None: + """Apply the execution result to the mirrored peer queue. - case OperationResult.RETRY_RELEASE: - logger.info('Finished %s. Operation will be retried later.', operation.callback_id) - lock.retry_release() + This keeps the peer standby queue aligned with the backend that + actually executed the operation. + + Args: + outcome: The etcd execution outcome to mirror. + Raises: + RollingOpsDecodingError: If there is an inconsistency found. + """ + match outcome.status: + case RunWithLockStatus.NOT_GRANTED: + logger.info('Skipping mirror: etcd lock was not granted.') + return + + case RunWithLockStatus.NO_OPERATION: + if not self._operations(self.model.unit).has_pending_work(): + logger.info('Skipping mirror: no operation.') + return + raise RollingOpsDecodingError( + 'Mismatch between the etcd and peer operation queue.' + ) + + case ( + RunWithLockStatus.MISSING_CALLBACK + | RunWithLockStatus.EXECUTED + | RunWithLockStatus.EXECUTED_NOT_COMMITTED + ): + self._operations(self.model.unit).mirror_result(outcome.op_id, outcome.result) # type: ignore[reportArgumentType] case _: - logger.info('Finished %s. 
Lock will be released.', operation.callback_id) - lock.complete() + raise RollingOpsDecodingError( + f'Unsupported run-with-lock outcome: {outcome.status}' + ) + + def get_status(self) -> RollingOpsStatus: + """Return the current rolling-ops status for this unit in peer mode. + + Status is derived from the local unit's peer-backed operation queue + and from the shared peer lock state. + + Returned values: + - UNAVAILABLE: the peer relation does not exist + - GRANTED: the current unit holds the peer lock + - WAITING: the current unit has queued work but does not hold the lock + - IDLE: the current unit has no pending work + + Returns: + The current rolling-ops status for this unit. + """ + if self._relation is None: + return RollingOpsStatus.UNAVAILABLE + + lock = self._lock() + operations = self._operations(self.model.unit) + + if lock.is_granted(self.model.unit.name): + return RollingOpsStatus.GRANTED + + if operations.has_pending_work(): + return RollingOpsStatus.WAITING + + return RollingOpsStatus.IDLE diff --git a/rollingops/src/charmlibs/rollingops/peer/_models.py b/rollingops/src/charmlibs/rollingops/peer/_models.py new file mode 100644 index 000000000..d11a3334d --- /dev/null +++ b/rollingops/src/charmlibs/rollingops/peer/_models.py @@ -0,0 +1,386 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Models for peer-relation rollingops."""

import logging
from collections.abc import Iterator
from dataclasses import dataclass
from datetime import datetime
from enum import StrEnum

from ops import Model, Unit

from charmlibs.rollingops.common._exceptions import (
    RollingOpsDecodingError,
    RollingOpsNoRelationError,
)
from charmlibs.rollingops.common._models import (
    Operation,
    OperationQueue,
    OperationResult,
    UnitBackendState,
)
from charmlibs.rollingops.common._utils import datetime_to_str, now_timestamp, parse_timestamp

logger = logging.getLogger(__name__)


class LockIntent(StrEnum):
    """Unit-level lock intents stored in unit databags."""

    # Unit has queued work and is waiting for the app-level grant.
    REQUEST = 'request'
    # Last attempt failed; the unit released the lock and waits for a new grant.
    RETRY_RELEASE = 'retry-release'
    # Last attempt failed; the unit keeps the grant and retries with priority.
    RETRY_HOLD = 'retry-hold'
    # No pending work.
    IDLE = 'idle'


@dataclass
class PeerAppData:
    """Application-scoped peer relation data.

    Fields are plain strings because relation databags are
    string-to-string mappings; the `*_dt` properties convert.
    """

    granted_unit: str = ''  # unit name currently holding the lock ('' = none)
    granted_at: str = ''  # serialized grant timestamp ('' = none)

    @property
    def granted_at_dt(self) -> datetime | None:
        """Return the grant timestamp as a datetime, if present."""
        return parse_timestamp(self.granted_at)

    @granted_at_dt.setter
    def granted_at_dt(self, value: datetime | None) -> None:
        """Store the grant timestamp from a datetime."""
        self.granted_at = datetime_to_str(value) if value is not None else ''


@dataclass
class PeerUnitData:
    """Unit-scoped peer relation data.

    Fields are plain strings because relation databags are
    string-to-string mappings; the properties convert.
    """

    state: str = ''  # serialized LockIntent ('' is read back as IDLE)
    operations: str = ''  # serialized OperationQueue
    executed_at: str = ''  # serialized timestamp of the last execution ('' = never)

    @property
    def intent(self) -> LockIntent:
        """Return the unit state as a LockIntent."""
        return LockIntent(self.state) if self.state else LockIntent.IDLE

    @intent.setter
    def intent(self, value: LockIntent) -> None:
        """Store the unit state from a LockIntent."""
        # LockIntent is a StrEnum, so assigning it directly keeps `state` str-valued.
        self.state = value

    @property
    def queue(self) -> OperationQueue:
        """Return the stored operation queue."""
        return OperationQueue.from_string(self.operations)

    @queue.setter
    def queue(self, value: OperationQueue) -> None:
        """Store the operation queue."""
        self.operations = value.to_string()

    @property
    def executed_at_dt(self) -> datetime | None:
        """Return the execution timestamp as a datetime, if present."""
        return parse_timestamp(self.executed_at)

    @executed_at_dt.setter
    def executed_at_dt(self, value: datetime | None) -> None:
        """Store the execution timestamp from a datetime."""
        self.executed_at = datetime_to_str(value) if value is not None else ''


class PeerAppLock:
    """Application-scoped distributed lock state.

    Backed by the application databag of the peer relation.
    """

    def __init__(self, model: Model, relation_name: str):
        """Load the application databag of `relation_name`.

        Raises:
            RollingOpsNoRelationError: if the relation does not exist (yet).
        """
        relation = model.get_relation(relation_name)
        if relation is None:
            raise RollingOpsNoRelationError()

        self._relation = relation
        self._app = model.app
        # Databag values are already strings; the identity decoder keeps them as-is.
        self._app_data = self._relation.load(PeerAppData, self._app, decoder=lambda s: s)

    def _save(self, data: PeerAppData) -> None:
        # Persist back to the app databag; encoder=str mirrors the identity decoder.
        self._relation.save(data, self._app, encoder=str)

    @property
    def granted_unit(self) -> str:
        """Return the unit name currently holding the grant, if any."""
        return self._app_data.granted_unit

    @property
    def granted_at(self) -> datetime | None:
        """Return the timestamp when the grant was issued, if any."""
        return self._app_data.granted_at_dt

    def grant(self, unit_name: str) -> None:
        """Grant the lock to the provided unit."""
        self._app_data.granted_unit = unit_name
        self._app_data.granted_at_dt = now_timestamp()
        self._save(self._app_data)

    def release(self) -> None:
        """Clear the current grant."""
        self._app_data.granted_unit = ''
        self._app_data.granted_at_dt = None
        self._save(self._app_data)

    def is_granted(self, unit_name: str) -> bool:
        """Return whether the provided unit currently holds the grant."""
        return self.granted_unit == unit_name


class PeerUnitOperations:
    """Unit-scoped queued operations and execution state."""

    def __init__(self, model: Model, relation_name: str, unit: Unit):
        """Load `unit`'s databag of `relation_name`.

        Raises:
            RollingOpsNoRelationError: if the relation does not exist (yet).
        """
        relation = model.get_relation(relation_name)
        if relation is None:
            raise RollingOpsNoRelationError()

        self._relation = relation
        self.unit = unit
        self._backend_state = UnitBackendState(model, relation_name, unit)
        self._unit_data = self._relation.load(PeerUnitData, self.unit, decoder=lambda s: s)

    def _save(self, data: PeerUnitData) -> None:
        # Persist back to this unit's databag; encoder=str mirrors the identity decoder.
        self._relation.save(data, self.unit, encoder=str)

    def is_peer_managed(self) -> bool:
        """Return whether the peer backend should process this unit's queue."""
        return self._backend_state.is_peer_managed()

    @property
    def intent(self) -> LockIntent:
        """Return the current unit intent."""
        return self._unit_data.intent

    @property
    def executed_at(self) -> datetime | None:
        """Return the last execution timestamp for this unit."""
        return self._unit_data.executed_at_dt

    @property
    def queue(self) -> OperationQueue:
        """Return this unit's stored operation queue."""
        return self._unit_data.queue

    def get_current(self) -> Operation | None:
        """Return the head operation, if any."""
        return self._unit_data.queue.peek()

    def has_pending_work(self) -> bool:
        """Return whether this unit still has queued work."""
        return self.get_current() is not None

    def request(self, operation: Operation) -> None:
        """Enqueue an operation and mark this unit as requesting the lock."""
        data = self._unit_data
        queue = data.queue

        # The queue de-duplicates; detect a no-op enqueue via the length delta.
        previous_length = len(queue)
        queue.enqueue(operation)
        added = len(queue) != previous_length
        if not added:
            logger.info(
                'Operation %s not added to the peer queue. '
                'It already exists in the back of the queue.',
                operation.callback_id,
            )
            return

        data.queue = queue
        # Only flip the intent when this is the first queued item; otherwise the
        # unit is already requesting (or retrying) and its intent must be preserved.
        if len(queue) == 1:
            data.intent = LockIntent.REQUEST
        self._unit_data = data
        self._save(data)
        logger.info('Operation %s added to the peer queue.', operation.callback_id)

    def finish(self, result: OperationResult) -> None:
        """Persist the result of executing the current operation."""
        self._apply_result_to_data(self._unit_data, result)
        self._save(self._unit_data)

    def _apply_result_to_data(
        self,
        data: PeerUnitData,
        result: OperationResult,
    ) -> None:
        """Mutate `data` in place to reflect `result`; callers persist it."""
        queue = data.queue
        operation = queue.peek()

        if operation is None:
            # Nothing queued: record the execution time and go idle.
            data.intent = LockIntent.IDLE
            data.executed_at_dt = now_timestamp()
            return

        # NOTE(review): the RETRY_HOLD and RETRY_RELEASE arms are identical except
        # for the non-exhausted intent; candidate for a shared private helper.
        match result:
            case OperationResult.RETRY_HOLD:
                queue.increase_attempt()
                operation = queue.peek()
                if operation is None or operation.is_max_retry_reached():
                    logger.warning('Operation max retry reached. Dropping.')
                    queue.dequeue()
                    data.intent = LockIntent.REQUEST if queue.peek() else LockIntent.IDLE
                else:
                    data.intent = LockIntent.RETRY_HOLD

            case OperationResult.RETRY_RELEASE:
                queue.increase_attempt()
                operation = queue.peek()
                if operation is None or operation.is_max_retry_reached():
                    logger.warning('Operation max retry reached. Dropping.')
                    queue.dequeue()
                    data.intent = LockIntent.REQUEST if queue.peek() else LockIntent.IDLE
                else:
                    data.intent = LockIntent.RETRY_RELEASE
            case _:
                # Any other result is terminal: drop the head operation and
                # keep requesting if more work is queued.
                queue.dequeue()
                data.intent = LockIntent.REQUEST if queue.peek() else LockIntent.IDLE

        data.queue = queue
        data.executed_at_dt = now_timestamp()

    def should_run(self, lock: PeerAppLock) -> bool:
        """Return whether this unit should execute now."""
        return (
            self.is_peer_managed()
            and lock.is_granted(self.unit.name)
            and not self._executed_after_grant(lock)
        )

    def should_release(self, lock: PeerAppLock) -> bool:
        """Return whether this unit should release the lock."""
        # Either the unit went idle while still holding the grant, or it already
        # executed under the current grant and must not run again before releasing.
        return (self.is_peer_managed() and self.is_completed(lock)) or self._executed_after_grant(
            lock
        )

    def is_waiting(self) -> bool:
        """Return whether this unit is waiting for a fresh grant."""
        return self.is_peer_managed() and self.intent == LockIntent.REQUEST

    def is_waiting_retry(self) -> bool:
        """Return whether this unit is waiting for a retry after releasing."""
        return self.is_peer_managed() and self.intent == LockIntent.RETRY_RELEASE

    def is_retry_hold(self) -> bool:
        """Return whether this unit wants to retry while keeping priority."""
        return self.is_peer_managed() and self.intent == LockIntent.RETRY_HOLD

    def is_retry(self, lock: PeerAppLock) -> bool:
        """Return whether this unit is in a retry state and currently granted."""
        return (
            self.is_peer_managed()
            and self.intent
            in {
                LockIntent.RETRY_RELEASE,
                LockIntent.RETRY_HOLD,
            }
            and lock.is_granted(self.unit.name)
        )

    def is_completed(self, lock: PeerAppLock) -> bool:
        """Return whether this unit completed and still holds the grant."""
        return (
            self.is_peer_managed()
            and self.intent == LockIntent.IDLE
            and lock.is_granted(self.unit.name)
        )

    def requested_at(self) -> datetime | None:
        """Return the timestamp of the current operation request, if any."""
        operation = self.get_current()
        return operation.requested_at if operation is not None else None

    def _executed_after_grant(self, lock: PeerAppLock) -> bool:
        """Return whether execution happened after the current grant."""
        granted_at = lock.granted_at
        executed_at = self.executed_at
        if granted_at is None or executed_at is None:
            return False
        return executed_at > granted_at

    def mirror_result(self, op_id: str, result: OperationResult) -> None:
        """Apply an execution result to the mirrored peer queue.

        This keeps the peer copy aligned with the backend that actually executed
        the operation.

        Args:
            op_id: identifier of the operation that was finalized.
            result: outcome reported by the executing backend.

        Raises:
            RollingOpsDecodingError: if there is an inconsistency found.
        """
        data = self._unit_data
        current = data.queue.peek()

        if current is None:
            logger.warning('Cannot mirror finalized operation: peer queue is empty.')
            raise RollingOpsDecodingError('Inconsistent operation found.')

        if current.op_id != op_id:
            logger.warning(
                'Cannot mirror finalized operation: peer head op_id=%s '
                'does not match finalized op_id=%s.',
                current.op_id,
                op_id,
            )
            raise RollingOpsDecodingError('Inconsistent operation found.')

        self._apply_result_to_data(data, result)
        self._save(data)


def iter_peer_units(model: Model, relation_name: str) -> Iterator[Unit]:
    """Yield all units currently participating in the peer relation, including self.

    Raises:
        RollingOpsNoRelationError: if the relation does not exist (yet).
    """
    relation = model.get_relation(relation_name)
    if relation is None:
        raise RollingOpsNoRelationError()

    # relation.units does not include this unit itself, so add it explicitly.
    units = set(relation.units)
    units.add(model.unit)

    yield from units


def pick_oldest_completed(operations_list: list[PeerUnitOperations]) -> str | None:
    """Return the name of the unit with the oldest executed_at timestamp."""
    selected = None
    oldest = None

    for operations in operations_list:
        timestamp = operations.executed_at
        if timestamp is None:
            continue
        if oldest is None or timestamp < oldest:
            oldest = timestamp
            selected = operations

    return selected.unit.name if selected is not None else None
def pick_oldest_request(operations_list: list[PeerUnitOperations]) -> str | None:
    """Return the name of the unit with the oldest head operation."""
    selected = None
    oldest = None

    for operations in operations_list:
        timestamp = operations.requested_at()
        if timestamp is None:
            continue
        if oldest is None or timestamp < oldest:
            oldest = timestamp
            selected = operations

    return selected.unit.name if selected is not None else None


# --- peer/_rollingops.py: background worker entry point -----------------------

"""Background process."""

import argparse
import time

from charmlibs.rollingops.common._utils import dispatch_lock_granted, setup_logging
from charmlibs.rollingops.peer._worker import PEER_LOG_FILENAME


def main():
    """Juju hook event dispatcher.

    Parses the worker CLI arguments, configures file logging, then triggers
    the lock-granted dispatch for the given unit.
    """
    parser = argparse.ArgumentParser(description='RollingOps peer worker')
    parser.add_argument(
        '--unit-name',
        type=str,
        required=True,
        help='Juju unit name (e.g. app/0)',
    )
    parser.add_argument(
        '--charm-dir',
        type=str,
        required=True,
        help='Path to the charm directory',
    )
    args = parser.parse_args()
    setup_logging(PEER_LOG_FILENAME, unit_name=args.unit_name)

    # Sleep so that the leader unit can properly leave the hook and start a new one
    time.sleep(10)

    dispatch_lock_granted(args.unit_name, args.charm_dir)


if __name__ == '__main__':
    main()


# --- peer/_worker.py: async worker process manager ----------------------------

"""Peer rolling ops. Spawns and manages the external rolling-ops worker process."""

import logging

from ops import RelationDataContent
from ops.charm import (
    CharmBase,
)

from charmlibs import pathops
from charmlibs.rollingops.common._base_worker import BaseRollingOpsAsyncWorker

logger = logging.getLogger(__name__)

PEER_LOG_FILENAME = '/var/log/peer_rollingops_worker.log'


class PeerRollingOpsAsyncWorker(BaseRollingOpsAsyncWorker):
    """Manage the peer-backed rolling-ops worker process.

    The worker state is coordinated through the peer relation application
    databag to ensure that it remains accessible across leadership
    changes. This guarantees that a newly elected leader can detect,
    stop, or restart an existing worker process as needed.
    """

    # Databag key under which the worker PID is stored.
    _pid_field = 'peer-rollingops-worker-pid'
    _log_filename = PEER_LOG_FILENAME

    def __init__(self, charm: CharmBase, relation_name: str):
        """Initialize the worker manager against the given peer relation."""
        super().__init__(charm, 'peer-rollingops-async-worker', relation_name)

    @property
    def _app_data(self) -> RelationDataContent:
        """Return the application databag in the peer relation."""
        return self._relation.data[self.model.app]  # type: ignore[reportOptionalMemberAccess]

    def _worker_script_path(self) -> pathops.LocalPath:
        """Return the path to the peer rolling-ops worker script.

        This script is executed in a background process to handle operation
        processing for the peer backend.
        """
        return pathops.LocalPath(
            self._venv_site_packages() / 'charmlibs' / 'rollingops' / 'peer' / '_rollingops.py'
        )

    @property
    def _pid(self) -> int | None:
        """Return the stored worker process PID.

        The PID is persisted in the application databag of the peer relation.

        Returns:
            The worker process PID, or None if not set.
        """
        if self._relation is None:
            return None
        pid = self._app_data.get(self._pid_field, '')

        # Treat an empty or malformed value as "no worker recorded".
        try:
            pid = int(pid)
        except (ValueError, TypeError):
            pid = None

        return pid

    @_pid.setter
    def _pid(self, value: int | None) -> None:
        """Persist the worker process PID in the peer relation databag.

        The PID is stored in the application databag because it is used
        to trigger rolling operations on the leader and the leader may change.

        Args:
            value: The process identifier to store.
        """
        if self._relation is None:
            return
        self._app_data.update({self._pid_field: '' if value is None else str(value)})

    def _on_existing_worker(self, pid: int) -> bool:
        """Handle the presence of an already running worker process.

        When an existing worker is detected, it is stopped before starting a
        new one to ensure a single active worker per application.

        Args:
            pid: The PID of the currently running worker.

        Returns:
            True to indicate that the existing worker was handled and a new
            worker can be started.
        """
        logger.info('Stopping existing RollingOps worker PID %s before restart.', pid)
        self.stop()
        return True
+ type: integer + default: 60 diff --git a/rollingops/tests/integration/charms/common.py b/rollingops/tests/integration/charms/common.py index aba0cdb40..a098677c7 100644 --- a/rollingops/tests/integration/charms/common.py +++ b/rollingops/tests/integration/charms/common.py @@ -30,6 +30,7 @@ from charmlibs.rollingops import ( OperationResult, RollingOpsManager, + SyncLockBackend, ) logger = logging.getLogger(__name__) @@ -38,8 +39,16 @@ def _now_timestamp_str() -> str: - """UTC timestamp as a string using ISO 8601 format.""" - return datetime.now(UTC).isoformat() + """UTC timestamp as a epoch.""" + return str(datetime.now(UTC).timestamp()) + + +class MySyncBackend(SyncLockBackend): + def acquire(self, timeout: int | None) -> None: + logger.info('acquiring sync lock') + + def release(self) -> None: + logger.info('releasing sync lock') class Charm(CharmBase): @@ -59,11 +68,15 @@ def __init__(self, framework: Framework): etcd_relation_name='etcd', cluster_id='cluster-12345', callback_targets=callback_targets, + sync_lock_targets={ + 'stop': MySyncBackend, + }, ) self.framework.observe(self.on.restart_action, self._on_restart_action) self.framework.observe(self.on.failed_restart_action, self._on_failed_restart_action) self.framework.observe(self.on.deferred_restart_action, self._on_deferred_restart_action) + self.framework.observe(self.on.sync_restart_action, self._on_sync_restart_action) def _restart(self, delay: int = 0) -> None: self._record_transition('_restart:start', delay=delay) @@ -119,12 +132,37 @@ def _on_deferred_restart_action(self, event: ActionEvent) -> None: max_retry=max_retry, ) + def _on_sync_restart_action(self, event: ActionEvent): + self.model.unit.status = WaitingStatus('Awaiting _sync_restart operation') + timeout = event.params.get('timeout', 60) + delay = event.params.get('delay') + self._record_transition('action:sync-restart', delay=delay, timeout=timeout) + + try: + with self.restart_manager.acquire_sync_lock(backend_id='stop', 
timeout=timeout): + self._record_transition('_sync_restart:start', delay=delay, timeout=timeout) + logger.info('Executing _sync_restart.') + self.model.unit.status = MaintenanceStatus('Executing _sync_restart operation') + time.sleep(int(event.params.get('delay', 0))) + self.model.unit.status = ActiveStatus('') + logger.info('Finished _sync_restart.') + self._record_transition('_sync_restart:done', delay=delay, timeout=timeout) + return + except TimeoutError: + self._record_transition('_sync_restart:timeout', delay=delay, timeout=timeout) + event.fail('Timed out acquiring sync lock') + def _record_transition(self, name: str, **data: Any) -> None: TRACE_FILE.parent.mkdir(parents=True, exist_ok=True) + state = self.restart_manager.state payload = { 'ts': _now_timestamp_str(), 'unit': self.model.unit.name, 'event': name, + 'rollingops_status': state.status.value if state.status else None, + 'processing_backend': state.processing_backend.value + if state.processing_backend + else None, **data, } with TRACE_FILE.open('a', encoding='utf-8') as f: diff --git a/rollingops/tests/integration/test_etcd_rolling_ops.py b/rollingops/tests/integration/test_etcd_rolling_ops.py index d17153ca9..4172d416e 100644 --- a/rollingops/tests/integration/test_etcd_rolling_ops.py +++ b/rollingops/tests/integration/test_etcd_rolling_ops.py @@ -15,23 +15,37 @@ """Integration tests using real Juju and pre-packed charm(s).""" import logging +import time from pathlib import Path import jubilant import pytest from tenacity import retry, stop_after_delay, wait_fixed -from tests.integration.utils import get_unit_events, remove_transition_file +from tests.integration.utils import ( + get_unit_events, + is_empty_file, + parse_ts, + remove_transition_file, +) -TRACE_FILE = '/var/lib/charm-rolling-ops/transitions.log' logger = logging.getLogger(__name__) TIMEOUT = 15 * 60.0 +ETCD_PROCESS_LOGS = '/var/log/etcd_rollingops_worker.log' +PEER_PROCCES_LOGS = '/var/log/peer_rollingops_worker.log' 
+ETCD_CONFIG_FILE = '/var/lib/rollingops/etcd/etcdctl.json' -@retry(wait=wait_fixed(10), stop=stop_after_delay(60), reraise=True) -def wait_for_etcdctl_env(juju: jubilant.Juju, unit: str) -> None: - task = juju.exec('test -f /var/lib/rollingops/etcd/etcdctl.json', unit=unit) +def etcdctl_file_exits(juju: jubilant.Juju, unit: str) -> bool: + task = juju.exec(f'test -f {ETCD_CONFIG_FILE}', unit=unit) if task.status != 'completed' or task.return_code != 0: + return False + return True + + +@retry(wait=wait_fixed(10), stop=stop_after_delay(60), reraise=True) +def wait_for_etcdctl_config_file(juju: jubilant.Juju, unit: str) -> None: + if not etcdctl_file_exits(juju, unit): raise RuntimeError('etcdctl config file not ready') @@ -41,9 +55,7 @@ def test_deploy(juju: jubilant.Juju, app_name: str): @pytest.mark.machine_only -def test_restart_action_one_unit(juju: jubilant.Juju, app_name: str): - """Verify that restart action runs through the expected workflow.""" - +def test_charm_is_integrated_with_etcd(juju: jubilant.Juju, app_name: str): juju.deploy( 'self-signed-certificates', app='self-signed-certificates', @@ -65,68 +77,262 @@ def test_restart_action_one_unit(juju: jubilant.Juju, app_name: str): juju.integrate(f'{app_name}:etcd', 'etcd:etcd-client') juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) - wait_for_etcdctl_env(juju, f'{app_name}/0') + wait_for_etcdctl_config_file(juju, f'{app_name}/0') - juju.run(f'{app_name}/0', 'restart', {'delay': 1}, wait=300) - juju.wait( - jubilant.all_active, - error=jubilant.any_error, - timeout=TIMEOUT, - ) +@pytest.mark.machine_only +def test_restart_action_one_unit_single_app(juju: jubilant.Juju, app_name: str): + unit = f'{app_name}/0' - events = get_unit_events(juju, f'{app_name}/0') - restart_events = [e['event'] for e in events] + juju.run(unit, 'restart', {'delay': 1}, wait=TIMEOUT) + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) + events = get_unit_events(juju, unit) + 
restart_events = [ + (e['event'], e['processing_backend']) + for e in events + if not e['event'].startswith('action') + ] expected = [ - 'action:restart', - '_restart:start', - '_restart:done', + ('_restart:start', 'etcd'), + ('_restart:done', 'etcd'), + ] + + assert restart_events == expected, f'unexpected event order: {restart_events}' + assert not is_empty_file(juju, unit, ETCD_PROCESS_LOGS) + assert is_empty_file(juju, unit, PEER_PROCCES_LOGS) + + +@pytest.mark.machine_only +def test_failed_restart_retries_one_unit_single_app(juju: jubilant.Juju, app_name: str): + unit = f'{app_name}/0' + remove_transition_file(juju, unit) + + juju.run(unit, 'failed-restart', {'delay': 1, 'max-retry': 1}) + juju.run(unit, 'restart', {'delay': 1}) + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) + + events = get_unit_events(juju, unit) + restart_events = [ + (e['event'], e['processing_backend']) + for e in events + if not e['event'].startswith('action') ] - assert expected == restart_events + expected = [ + ('_failed_restart:start', 'etcd'), # attempt 0 + ('_failed_restart:retry_release', 'etcd'), + ('_failed_restart:start', 'etcd'), # retry 1 + ('_failed_restart:retry_release', 'etcd'), + ('_restart:start', 'etcd'), + ('_restart:done', 'etcd'), + ] + assert restart_events == expected, f'unexpected event order: {restart_events}' + assert is_empty_file(juju, unit, PEER_PROCCES_LOGS) + + +@pytest.mark.machine_only +def test_assert_deferred_restart_retries_one_unit_single_app(juju: jubilant.Juju, app_name: str): + unit = f'{app_name}/0' + remove_transition_file(juju, unit) + + juju.run(unit, 'deferred-restart', {'delay': 1, 'max-retry': 1}, wait=TIMEOUT) + juju.run(unit, 'restart', {'delay': 1}) + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) + + events = get_unit_events(juju, unit) + restart_events = [ + (e['event'], e['processing_backend']) + for e in events + if not e['event'].startswith('action') + ] + + expected = [ + 
('_deferred_restart:start', 'etcd'), # attempt 0 + ('_deferred_restart:retry_hold', 'etcd'), + ('_deferred_restart:start', 'etcd'), # retry 1 + ('_deferred_restart:retry_hold', 'etcd'), + ('_restart:start', 'etcd'), + ('_restart:done', 'etcd'), + ] + assert restart_events == expected, f'unexpected event order: {restart_events}' + assert is_empty_file(juju, unit, PEER_PROCCES_LOGS) + + +@pytest.mark.machine_only +def test_assert_restart_rolls_one_unit_at_a_time_single_app(juju: jubilant.Juju, app_name: str): + juju.add_unit(app=app_name, num_units=4) + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) + + status = juju.status() + units = sorted(status.apps[app_name].units) + for unit in units: + remove_transition_file(juju, unit) + + for unit in units: + juju.run(unit, 'restart', {'delay': 15}) + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) + + all_events: list[dict[str, str]] = [] + for unit in units: + events = get_unit_events(juju, unit) + assert len(events) == 3 + all_events.extend(events) + + restart_events = [e for e in all_events if not e['event'].startswith('action')] + restart_events.sort(key=parse_ts) + + logger.info(restart_events) + + assert len(restart_events) == len(units) * 2 + for i in range(0, len(restart_events), 2): + start_event = restart_events[i] + done_event = restart_events[i + 1] + + assert start_event['event'] == '_restart:start' + assert done_event['event'] == '_restart:done' + assert start_event['unit'] == done_event['unit'] + assert start_event['processing_backend'] == 'etcd' + assert done_event['processing_backend'] == 'etcd' + for unit in units: + assert is_empty_file(juju, unit, PEER_PROCCES_LOGS) @pytest.mark.machine_only -def test_all_units_can_connect_to_etcd(juju: jubilant.Juju, app_name: str): - juju.add_unit(app_name, num_units=2) +def test_retry_hold_operation_two_units_single_app(juju: jubilant.Juju, app_name: str): + status = juju.status() + units = 
sorted(status.apps[app_name].units) + + for unit in units: + remove_transition_file(juju, unit) + + unit_a = units[1] + unit_b = units[3] + + juju.run(unit_a, 'deferred-restart', {'delay': 15, 'max-retry': 2}, wait=TIMEOUT) + juju.run(unit_b, 'restart', {'delay': 2}, wait=TIMEOUT) + juju.wait( - lambda status: jubilant.all_active(status, app_name), + lambda status: status.apps[app_name].units[unit_b].is_active, error=jubilant.any_error, timeout=TIMEOUT, ) + all_events: list[dict[str, str]] = [] + all_events.extend(get_unit_events(juju, unit_a)) + all_events.extend(get_unit_events(juju, unit_b)) + all_events.sort(key=parse_ts) + + logger.info(all_events) + + relevant_events = [e for e in all_events if not e['event'].startswith('action')] + sequence = [(e['unit'], e['event'], e['processing_backend']) for e in relevant_events] + + logger.info(sequence) + + assert sequence == [ + (unit_a, '_deferred_restart:start', 'etcd'), # attempt 0 + (unit_a, '_deferred_restart:retry_hold', 'etcd'), + (unit_a, '_deferred_restart:start', 'etcd'), # retry 1 + (unit_a, '_deferred_restart:retry_hold', 'etcd'), + (unit_a, '_deferred_restart:start', 'etcd'), # retry 2 + (unit_a, '_deferred_restart:retry_hold', 'etcd'), + (unit_b, '_restart:start', 'etcd'), + (unit_b, '_restart:done', 'etcd'), + ], f'unexpected event sequence: {sequence}' + + for unit in units: + assert is_empty_file(juju, unit, PEER_PROCCES_LOGS) + + +@pytest.mark.machine_only +def test_retry_release_two_units_single_app(juju: jubilant.Juju, app_name: str): status = juju.status() units = sorted(status.apps[app_name].units) - for unit in units: remove_transition_file(juju, unit) + unit_a = units[2] + unit_b = units[4] + + juju.run(unit_a, 'failed-restart', {'delay': 10, 'max-retry': 2}, wait=TIMEOUT) + juju.run(unit_b, 'failed-restart', {'delay': 15, 'max-retry': 2}, wait=TIMEOUT) + + time.sleep( + 60 * 3 + ) # wait for operation execution. TODO: in charm use lock state to clear status. 
+ + all_events: list[dict[str, str]] = [] + all_events.extend(get_unit_events(juju, unit_a)) + all_events.extend(get_unit_events(juju, unit_b)) + all_events.sort(key=parse_ts) + + restart_events = [e for e in all_events if not e['event'].startswith('action')] + restart_events.sort(key=parse_ts) + + logger.info(restart_events) + + assert len(restart_events) == 2 * 2 * 3 # 2 units * 2 events * 3 executions + for i in range(0, len(restart_events), 2): + start_event = restart_events[i] + done_event = restart_events[i + 1] + + assert start_event['event'] == '_failed_restart:start' + assert done_event['event'] == '_failed_restart:retry_release' + assert start_event['unit'] == done_event['unit'] + assert start_event['processing_backend'] == 'etcd' + assert done_event['processing_backend'] == 'etcd' + + for unit in units: + assert is_empty_file(juju, unit, PEER_PROCCES_LOGS) + + +@pytest.mark.machine_only +def test_subsequent_lock_request_ops_single_app(juju: jubilant.Juju, app_name: str): + status = juju.status() + units = sorted(status.apps[app_name].units) for unit in units: - juju.run(unit, 'restart', {'delay': 2}, wait=300) + remove_transition_file(juju, unit) + + unit_a = units[3] + + juju.run(unit_a, 'deferred-restart', {'delay': 1, 'max-retry': 1}) + for _ in range(3): + juju.run(unit_a, 'failed-restart', {'delay': 1, 'max-retry': 0}) + juju.run(unit_a, 'restart', {'delay': 1}) juju.wait( - lambda status: jubilant.all_active(status, app_name, 'etcd', 'self-signed-certificates'), + lambda status: status.apps[app_name].units[unit_a].is_active, error=jubilant.any_error, timeout=TIMEOUT, ) - expected = [ - 'action:restart', - '_restart:start', - '_restart:done', + unit_a_events = get_unit_events(juju, unit_a) + relevant_events = [ + (e['event'], e['processing_backend']) + for e in unit_a_events + if not e['event'].startswith('action') ] + logger.info('unit_a_events %s', unit_a_events) + + assert relevant_events == [ + ('_deferred_restart:start', 'etcd'), # attempt 0 + 
('_deferred_restart:retry_hold', 'etcd'), + ('_deferred_restart:start', 'etcd'), # retry 1 + ('_deferred_restart:retry_hold', 'etcd'), + ('_failed_restart:start', 'etcd'), # attempt 0 + ('_failed_restart:retry_release', 'etcd'), + ('_restart:start', 'etcd'), + ('_restart:done', 'etcd'), + ], f'unexpected event sequence: {relevant_events}' for unit in units: - events = get_unit_events(juju, unit) - restart_events = [e['event'] for e in events] - assert restart_events == expected + assert is_empty_file(juju, unit, PEER_PROCCES_LOGS) @pytest.mark.machine_only -def test_all_units_can_connect_to_etcd_multi_app(juju: jubilant.Juju, charm: Path, app_name: str): +def test_rolling_ops_multi_app(juju: jubilant.Juju, charm: Path, app_name: str): second_app = f'{app_name}-secondary' - juju.deploy(charm, app=second_app, num_units=3) juju.wait( lambda status: jubilant.all_active(status, second_app), @@ -136,9 +342,7 @@ def test_all_units_can_connect_to_etcd_multi_app(juju: jubilant.Juju, charm: Pat juju.integrate(f'{second_app}:etcd', 'etcd:etcd-client') juju.wait( - lambda status: jubilant.all_active( - status, app_name, second_app, 'etcd', 'self-signed-certificates' - ), + lambda status: jubilant.all_active(status, second_app, 'etcd'), error=jubilant.any_error, timeout=TIMEOUT, ) @@ -149,32 +353,169 @@ def test_all_units_can_connect_to_etcd_multi_app(juju: jubilant.Juju, charm: Pat for unit in all_units: remove_transition_file(juju, unit) + wait_for_etcdctl_config_file(juju, unit) for unit in all_units: - wait_for_etcdctl_env(juju, unit) + juju.run(unit, 'restart', {'delay': 10}, wait=TIMEOUT) + + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) + + all_events: list[dict[str, str]] = [] for unit in all_units: - juju.run(unit, 'restart', {'delay': 2}, wait=300) + events = get_unit_events(juju, unit) + assert len(events) == 3 + all_events.extend(events) - juju.wait( - lambda status: jubilant.all_active( - status, - app_name, - second_app, - 'etcd', - 
'self-signed-certificates', - ), - error=jubilant.any_error, - timeout=TIMEOUT, - ) + restart_events = [e for e in all_events if not e['event'].startswith('action')] + restart_events.sort(key=parse_ts) - expected = [ - 'action:restart', - '_restart:start', - '_restart:done', - ] + logger.info(restart_events) + + assert len(restart_events) == len(all_units) * 2 + for i in range(0, len(restart_events), 2): + start_event = restart_events[i] + done_event = restart_events[i + 1] + + assert start_event['event'] == '_restart:start' + assert done_event['event'] == '_restart:done' + assert start_event['unit'] == done_event['unit'] + assert start_event['processing_backend'] == 'etcd' + assert done_event['processing_backend'] == 'etcd' for unit in all_units: + assert is_empty_file(juju, unit, PEER_PROCCES_LOGS) + + +@pytest.mark.machine_only +def test_rolling_ops_sync_lock_multi_app(juju: jubilant.Juju, app_name: str): + second_app = f'{app_name}-secondary' + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) + + primary_units = sorted(juju.status().apps[app_name].units.keys()) + secondary_units = sorted(juju.status().apps[second_app].units.keys()) + all_units: list[str] = primary_units + secondary_units + + for unit in all_units: + remove_transition_file(juju, unit) + wait_for_etcdctl_config_file(juju, unit) + + unit_a = primary_units[1] + unit_b = secondary_units[1] + + juju.cli('run', unit_a, 'sync-restart', 'delay=15', '--background') + time.sleep(2) + juju.cli('run', unit_b, 'sync-restart', 'delay=15', '--background') + + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) + + all_events: list[dict[str, str]] = [] + + for unit in {unit_a, unit_b}: events = get_unit_events(juju, unit) - restart_events = [e['event'] for e in events] - assert restart_events == expected + assert len(events) == 3 + all_events.extend(events) + + all_events.sort(key=parse_ts) + restart_events = [ + (e['unit'], e['event'], e['processing_backend']) + 
for e in all_events + if not e['event'].startswith('action') + ] + + logger.info(restart_events) + + assert restart_events == [ + (unit_a, '_sync_restart:start', 'etcd'), + (unit_a, '_sync_restart:done', 'etcd'), + (unit_b, '_sync_restart:start', 'etcd'), + (unit_b, '_sync_restart:done', 'etcd'), + ], f'unexpected event sequence: {restart_events}' + + for unit in all_units: + assert is_empty_file(juju, unit, PEER_PROCCES_LOGS) + + +@pytest.mark.machine_only +def test_lock_released_when_unit_removed(juju: jubilant.Juju, app_name: str) -> None: + units = sorted(juju.status().apps[app_name].units.keys()) + for unit in units: + remove_transition_file(juju, unit) + unit_a = units[1] + unit_b = units[2] + + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) + + juju.run(unit_a, 'deferred-restart', {'delay': 15}) + time.sleep(5) + juju.run(unit_b, 'restart', {'delay': 2}) + + juju.remove_unit(unit_a) + + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) + + unit_b_events = get_unit_events(juju, unit_b) + relevant_events = [ + (e['event'], e['processing_backend']) + for e in unit_b_events + if not e['event'].startswith('action') + ] + + logger.info('unit_b_events %s', unit_b_events) + + assert relevant_events == [ + ('_restart:start', 'etcd'), + ('_restart:done', 'etcd'), + ], f'unexpected event sequence: {relevant_events}' + + +@pytest.mark.machine_only +def test_actions_still_work_after_etcd_relation_removed( + juju: jubilant.Juju, app_name: str +) -> None: + second_app = f'{app_name}-secondary' + primary_units = sorted(juju.status().apps[app_name].units.keys()) + secondary_units = sorted(juju.status().apps[second_app].units.keys()) + all_units: list[str] = primary_units + secondary_units + + for unit in all_units: + remove_transition_file(juju, unit) + wait_for_etcdctl_config_file(juju, unit) + + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) + + unit_a = primary_units[3] + + juju.run(unit_a, 
'failed-restart', {'delay': 10, 'max-retry': 1}) + juju.run(unit_a, 'restart', {'delay': 1}) + juju.run(unit_a, 'restart', {'delay': 2}) + + juju.remove_relation(f'{app_name}:etcd', 'etcd:etcd-client') + + unit_b = secondary_units[1] + juju.run(unit_b, 'restart', {'delay': 1}) + + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) + + unit_a_events = get_unit_events(juju, unit_a) + relevant_events = [e['event'] for e in unit_a_events if not e['event'].startswith('action')] + + logger.info('unit_a_events %s', unit_a_events) + + assert relevant_events.count('_failed_restart:start') == 2, relevant_events + assert relevant_events.count('_failed_restart:retry_release') == 2, relevant_events + assert relevant_events.count('_restart:start') == 2, relevant_events + assert relevant_events.count('_restart:done') == 2, relevant_events + + unit_b_events = get_unit_events(juju, unit_b) + assert len(unit_b_events) == 3 + restart_events = [ + (e['event'], e['processing_backend']) + for e in unit_b_events + if not e['event'].startswith('action') + ] + + assert restart_events == [ + ('_restart:start', 'etcd'), + ('_restart:done', 'etcd'), + ], f'unexpected event sequence: {restart_events}' diff --git a/rollingops/tests/integration/test_peer_rolling_ops.py b/rollingops/tests/integration/test_peer_rolling_ops.py index 8178cfe92..dcd527b61 100644 --- a/rollingops/tests/integration/test_peer_rolling_ops.py +++ b/rollingops/tests/integration/test_peer_rolling_ops.py @@ -36,8 +36,6 @@ def test_deploy(juju: jubilant.Juju, app_name: str): def test_restart_action_one_unit(juju: jubilant.Juju, app_name: str): - """Verify that restart action runs through the expected workflow.""" - juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) unit = f'{app_name}/0' @@ -55,79 +53,66 @@ def test_restart_action_one_unit(juju: jubilant.Juju, app_name: str): ] assert restart_events == expected, f'unexpected event order: {restart_events}' + assert 
all(e['processing_backend'] == 'peer' for e in events) def test_failed_restart_retries_one_unit(juju: jubilant.Juju, app_name: str): unit = f'{app_name}/0' remove_transition_file(juju, unit) - juju.run(unit, 'failed-restart', {'delay': 1, 'max-retry': 2}, wait=TIMEOUT) - - time.sleep(60) # wait for operation execution. TODO: in charm use lock state to clear status. + juju.run(unit, 'failed-restart', {'delay': 1, 'max-retry': 2}) + juju.run(unit, 'restart', {'delay': 1}) - juju.wait( - lambda status: status.apps[app_name].is_maintenance, - error=jubilant.any_error, - timeout=TIMEOUT, - ) + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) events = get_unit_events(juju, unit) - restart_events = [e['event'] for e in events] + restart_events = [e['event'] for e in events if not e['event'].startswith('action')] expected = [ - 'action:failed-restart', '_failed_restart:start', # attempt 0 '_failed_restart:retry_release', '_failed_restart:start', # retry 1 '_failed_restart:retry_release', '_failed_restart:start', # retry 2 '_failed_restart:retry_release', + '_restart:start', + '_restart:done', ] assert restart_events == expected, f'unexpected event order: {restart_events}' + assert all(e['processing_backend'] == 'peer' for e in events) -def test_deferred_restart_retries_one_unit(juju: jubilant.Juju, app_name: str): +def test_assert_deferred_restart_retries_one_unit(juju: jubilant.Juju, app_name: str): unit = f'{app_name}/0' remove_transition_file(juju, unit) juju.run(unit, 'deferred-restart', {'delay': 1, 'max-retry': 2}, wait=TIMEOUT) + juju.run(unit, 'restart', {'delay': 1}) - time.sleep(60) # wait for operation execution. TODO: in charm use lock state to clear status. 
- - juju.wait( - lambda status: status.apps[app_name].is_maintenance, - error=jubilant.any_error, - timeout=TIMEOUT, - ) + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) events = get_unit_events(juju, unit) - restart_events = [e['event'] for e in events] + restart_events = [e['event'] for e in events if not e['event'].startswith('action')] expected = [ - 'action:deferred-restart', '_deferred_restart:start', # attempt 0 '_deferred_restart:retry_hold', '_deferred_restart:start', # retry 1 '_deferred_restart:retry_hold', '_deferred_restart:start', # retry 2 '_deferred_restart:retry_hold', + '_restart:start', + '_restart:done', ] assert restart_events == expected, f'unexpected event order: {restart_events}' + assert all(e['processing_backend'] == 'peer' for e in events) -def test_restart_rolls_one_unit_at_a_time(juju: jubilant.Juju, app_name: str): +def test_assert_restart_rolls_one_unit_at_a_time(juju: jubilant.Juju, app_name: str): juju.add_unit(app=app_name, num_units=4) - juju.wait( # TODO: wait for 5 units to be active - lambda status: ( - app_name in status.apps - and len(status.apps[app_name].units) == 5 - and sum(1 for u in status.apps[app_name].units.values() if u.is_active) >= 4 - ), - error=jubilant.any_error, - timeout=TIMEOUT, - ) + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) status = juju.status() units = sorted(status.apps[app_name].units) @@ -160,6 +145,7 @@ def test_restart_rolls_one_unit_at_a_time(juju: jubilant.Juju, app_name: str): assert start_event['unit'] == done_event['unit'], ( f'start/done pair mismatch: {start_event} vs {done_event}' ) + assert all(e['processing_backend'] == 'peer' for e in all_events) def test_retry_hold_keeps_lock_on_same_unit(juju: jubilant.Juju, app_name: str): @@ -172,7 +158,7 @@ def test_retry_hold_keeps_lock_on_same_unit(juju: jubilant.Juju, app_name: str): unit_a = units[1] unit_b = units[3] - juju.run(unit_a, 'deferred-restart', {'delay': 10, 'max-retry': 2}, 
wait=TIMEOUT) + juju.run(unit_a, 'deferred-restart', {'delay': 15, 'max-retry': 2}, wait=TIMEOUT) juju.run(unit_b, 'restart', {'delay': 2}, wait=TIMEOUT) juju.wait( @@ -214,6 +200,7 @@ def test_retry_hold_keeps_lock_on_same_unit(juju: jubilant.Juju, app_name: str): (unit_b, '_restart:start'), (unit_b, '_restart:done'), ], f'unexpected event sequence: {sequence}' + assert all(e['processing_backend'] == 'peer' for e in all_events) def test_retry_release_alternates_execution(juju: jubilant.Juju, app_name: str): @@ -261,6 +248,7 @@ def test_retry_release_alternates_execution(juju: jubilant.Juju, app_name: str): (unit_b, '_failed_restart:start'), # retry 2 (unit_b, '_failed_restart:retry_release'), ], f'unexpected event sequence: {sequence}' + assert all(e['processing_backend'] == 'peer' for e in all_events) def test_subsequent_lock_request_of_different_ops(juju: jubilant.Juju, app_name: str): @@ -303,6 +291,7 @@ def test_subsequent_lock_request_of_different_ops(juju: jubilant.Juju, app_name: '_restart:start', '_restart:done', ], f'unexpected event sequence: {relevant_events}' + assert all(e['processing_backend'] == 'peer' for e in unit_a_events) def test_subsequent_lock_request_of_same_op(juju: jubilant.Juju, app_name: str): @@ -317,7 +306,8 @@ def test_subsequent_lock_request_of_same_op(juju: jubilant.Juju, app_name: str): juju.run(unit_b, 'deferred-restart', {'delay': 10, 'max-retry': 1}) juju.run(unit_a, 'failed-restart', {'delay': 1, 'max-retry': 2}) for _ in range(3): - juju.run(unit_a, 'restart', {'delay': 1}) + juju.run(unit_a, 'deferred-restart', {'delay': 1, 'max-retry': 0}) + juju.run(unit_a, 'restart', {'delay': 1}) juju.wait( lambda status: status.apps[app_name].units[unit_a].is_active, @@ -326,24 +316,49 @@ def test_subsequent_lock_request_of_same_op(juju: jubilant.Juju, app_name: str): ) unit_a_events = get_unit_events(juju, unit_a) - relevant_events = [e['event'] for e in unit_a_events] + relevant_events = [e['event'] for e in unit_a_events if not 
e['event'].startswith('action')] logger.info('unit_a_events %s', unit_a_events) assert relevant_events == [ - 'action:failed-restart', - 'action:restart', - 'action:restart', - 'action:restart', '_failed_restart:start', # attempt 0 '_failed_restart:retry_release', '_failed_restart:start', # retry 1 '_failed_restart:retry_release', '_failed_restart:start', # retry 2 '_failed_restart:retry_release', + '_deferred_restart:start', # attemp 0 + '_deferred_restart:retry_hold', '_restart:start', '_restart:done', ], f'unexpected event sequence: {relevant_events}' + assert all(e['processing_backend'] == 'peer' for e in unit_a_events) + + +def test_sync_lock_is_executed(juju: jubilant.Juju, app_name: str): + status = juju.status() + units = sorted(status.apps[app_name].units) + for unit in units: + remove_transition_file(juju, unit) + + for unit in units: + juju.run(unit, 'sync-restart', {'delay': 1}) + + juju.wait(jubilant.all_active, error=jubilant.any_error, timeout=TIMEOUT) + + expected_events = [ + 'action:sync-restart', + '_sync_restart:start', + '_sync_restart:done', + ] + + # mutually exclusive execution is not guarantee + for unit in units: + events = get_unit_events(juju, unit) + relevant_events = [e['event'] for e in events] + + assert expected_events == relevant_events, f'unexpected event sequence: {relevant_events}' + assert all(e['processing_backend'] == 'peer' for e in events) def test_retry_on_leader_unit_leaves_the_hook(juju: jubilant.Juju, app_name: str): @@ -372,3 +387,4 @@ def test_retry_on_leader_unit_leaves_the_hook(juju: jubilant.Juju, app_name: str '_restart:start', '_restart:done', ], f'unexpected event sequence: {relevant_events}' + assert all(e['processing_backend'] == 'peer' for e in non_leader_events) diff --git a/rollingops/tests/integration/utils.py b/rollingops/tests/integration/utils.py index 795d9d514..bed37564a 100644 --- a/rollingops/tests/integration/utils.py +++ b/rollingops/tests/integration/utils.py @@ -12,13 +12,15 @@ # See the License 
for the specific language governing permissions and # limitations under the License. -"""Integration tests using real Juju and pre-packed charm(s).""" +"""Utils for integration tests.""" import json -from datetime import datetime +from datetime import UTC, datetime import jubilant +from charmlibs import pathops + TRACE_FILE = '/var/lib/charm-rolling-ops/transitions.log' @@ -32,7 +34,7 @@ def get_unit_events(juju: jubilant.Juju, unit: str) -> list[dict[str, str]]: def parse_ts(event: dict[str, str]) -> datetime: - return datetime.fromisoformat(event['ts']) + return datetime.fromtimestamp(float(event['ts']), tz=UTC) def get_leader_unit_name(juju: jubilant.Juju, app: str) -> str: @@ -50,3 +52,13 @@ def get_leader_unit_name(juju: jubilant.Juju, app: str) -> str: def remove_transition_file(juju: jubilant.Juju, unit: str): juju.exec(f'rm -f {TRACE_FILE}', unit=unit) + + +def is_empty_file(juju: jubilant.Juju, unit: str, path: str) -> bool: + pathops_path = pathops.LocalPath(path) + try: + task = juju.exec(f'test ! 
-s {pathops_path}', unit=unit) + except Exception: + return False + + return task.status == 'completed' and task.return_code == 0 diff --git a/rollingops/tests/unit/conftest.py b/rollingops/tests/unit/conftest.py index 74a5e4a60..bb3c46f61 100644 --- a/rollingops/tests/unit/conftest.py +++ b/rollingops/tests/unit/conftest.py @@ -25,17 +25,16 @@ from ops import ActionEvent from ops.testing import Context -import charmlibs.rollingops._certificates as certificates -import charmlibs.rollingops._etcdctl as etcdctl +import charmlibs.rollingops.etcd._certificates as certificates +import charmlibs.rollingops.etcd._etcdctl as etcdctl from charmlibs.interfaces.tls_certificates import ( Certificate, PrivateKey, ) from charmlibs.pathops import LocalPath -from charmlibs.rollingops._manager import EtcdRollingOpsManager -from charmlibs.rollingops._models import SharedCertificate -from charmlibs.rollingops._peer_manager import PeerRollingOpsManager -from charmlibs.rollingops._peer_models import OperationResult +from charmlibs.rollingops import RollingOpsManager +from charmlibs.rollingops.common._models import OperationResult +from charmlibs.rollingops.etcd._models import SharedCertificate VALID_CA_CERT_PEM = """-----BEGIN CERTIFICATE----- MIIC6DCCAdCgAwIBAgIUW42TU9LSjEZLMCclWrvSwAsgRtcwDQYJKoZIhvcNAQEL @@ -136,7 +135,7 @@ def temp_etcdctl(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> types.Modul @pytest.fixture def etcdctl_patch() -> Generator[MagicMock, None, None]: - with patch('charmlibs.rollingops._certificates') as mock_etcdctl: + with patch('charmlibs.rollingops.etcd._certificates') as mock_etcdctl: yield mock_etcdctl @@ -144,11 +143,11 @@ def etcdctl_patch() -> Generator[MagicMock, None, None]: def certificates_manager_patches() -> Generator[dict[str, MagicMock], None, None]: with ( patch( - 'charmlibs.rollingops._certificates._exists', + 'charmlibs.rollingops.etcd._certificates._exists', return_value=False, ), patch( - 'charmlibs.rollingops._certificates.generate', + 
'charmlibs.rollingops.etcd._certificates.generate', return_value=SharedCertificate( certificate=Certificate.from_string(VALID_CLIENT_CERT_PEM), key=PrivateKey.from_string(VALID_CLIENT_KEY_PEM), @@ -156,7 +155,7 @@ def certificates_manager_patches() -> Generator[dict[str, MagicMock], None, None ), ) as mock_generate, patch( - 'charmlibs.rollingops._certificates.persist_client_cert_key_and_ca', + 'charmlibs.rollingops.etcd._certificates.persist_client_cert_key_and_ca', return_value=None, ) as mock_persit, ): @@ -167,26 +166,6 @@ def certificates_manager_patches() -> Generator[dict[str, MagicMock], None, None class RollingOpsCharm(ops.CharmBase): - def __init__(self, framework: ops.Framework): - super().__init__(framework) - - callback_targets = { - '_restart': self.restart, - } - - self.restart_manager = EtcdRollingOpsManager( - charm=self, - peer_relation_name='restart', - etcd_relation_name='etcd', - cluster_id='cluster-12345', - callback_targets=callback_targets, - ) - - def restart(self) -> None: - pass - - -class PeerRollingOpsCharm(ops.CharmBase): def __init__(self, framework: ops.Framework): super().__init__(framework) @@ -196,9 +175,11 @@ def __init__(self, framework: ops.Framework): '_deferred_restart': self._deferred_restart, } - self.restart_manager = PeerRollingOpsManager( + self.restart_manager = RollingOpsManager( charm=self, - relation_name='restart', + peer_relation_name='restart', + etcd_relation_name='etcd', + cluster_id='cluster-12345', callback_targets=callback_targets, ) self.framework.observe(self.on.restart_action, self._on_restart_action) @@ -242,11 +223,6 @@ def charm_test() -> type[RollingOpsCharm]: return RollingOpsCharm -@pytest.fixture -def peer_charm_test() -> type[PeerRollingOpsCharm]: - return PeerRollingOpsCharm - - meta: dict[str, Any] = { 'name': 'charm', 'peers': { @@ -305,9 +281,4 @@ def peer_charm_test() -> type[PeerRollingOpsCharm]: @pytest.fixture def ctx(charm_test: type[RollingOpsCharm]) -> Context[RollingOpsCharm]: - return 
Context(charm_test, meta=meta) - - -@pytest.fixture -def peer_ctx(peer_charm_test: type[PeerRollingOpsCharm]) -> Context[PeerRollingOpsCharm]: - return Context(peer_charm_test, meta=meta, actions=actions) + return Context(charm_test, meta=meta, actions=actions) diff --git a/rollingops/tests/unit/test_common_models.py b/rollingops/tests/unit/test_common_models.py new file mode 100644 index 000000000..fc4072ff6 --- /dev/null +++ b/rollingops/tests/unit/test_common_models.py @@ -0,0 +1,543 @@ +# Copyright 2026 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Learn more about testing at: https://juju.is/docs/sdk/testing + +import json +from datetime import UTC, datetime +from typing import Any + +import pytest + +from charmlibs.rollingops.common._exceptions import RollingOpsDecodingError +from charmlibs.rollingops.common._models import ( + Operation, + OperationQueue, + OperationResult, +) + + +def test_operation_create_sets_fields(): + op = Operation.create('restart', {'b': 2, 'a': 1}, max_retry=3) + + assert op.kwargs == {'b': 2, 'a': 1} + assert op.callback_id == 'restart' + assert op.max_retry == 3 + assert isinstance(op.requested_at, datetime) + + +def test_operation_to_string(): + ts = datetime(2026, 2, 23, 12, 0, 0, 123456, tzinfo=UTC) + op = Operation( + callback_id='cb', + kwargs={'b': 2, 'a': 1}, + requested_at=ts, + max_retry=None, + attempt=0, + result=None, + ) + + s = op.to_string() + expected = ( + '{"callback_id":"cb",' + '"requested_at":"1771848000.123456",' + '"max_retry":null,' + '"attempt":0,' + '"result":null,' + '"kwargs":{"a":1,"b":2}}' + ) + + assert s == expected + + +def test_operation_to_string_zero_max_retry(): + ts = datetime(2026, 2, 23, 4, 0, 0, 123456, tzinfo=UTC) + op = Operation( + callback_id='cb', + kwargs={'b': 2, 'a': 1}, + requested_at=ts, + max_retry=0, + attempt=0, + result=None, + ) + + s = op.to_string() + expected = ( + '{"callback_id":"cb",' + '"requested_at":"1771819200.123456",' + '"max_retry":0,' + '"attempt":0,' + '"result":null,' + '"kwargs":{"a":1,"b":2}}' + ) + assert s == expected + + +def test_operation_to_string_none_max_retry(): + ts = datetime(2026, 2, 23, 4, 0, 0, 123456, tzinfo=UTC) + op = Operation( + callback_id='cb', + kwargs={'b': 2, 'a': 1}, + requested_at=ts, + max_retry=None, + attempt=0, + result=None, + ) + + s = op.to_string() + expected = ( + '{"callback_id":"cb",' + '"requested_at":"1771819200.123456",' + '"max_retry":null,' + '"attempt":0,' + '"result":null,' + '"kwargs":{"a":1,"b":2}}' + ) + + assert s == expected + + +def 
test_operation_is_max_retry_reached_on_zero_max_retry(): + op = Operation.create('restart', {'a': 1, 'b': 2}, max_retry=0) + assert not op.is_max_retry_reached() + op.increase_attempt() + assert op.is_max_retry_reached() + + +def test_operation_equality_and_hash_ignore_timestamp_and_max_retry(): + # Equality only depends on (callback_id, kwargs) + op1 = Operation.create('restart', {'a': 1, 'b': 2}, max_retry=0) + op2 = Operation.create('restart', {'b': 2, 'a': 1}, max_retry=999) + + assert op1 == op2 + assert hash(op1) == hash(op2) + + op3 = Operation.create('restart', {'a': 2}, max_retry=0) + assert op1 != op3 + + +def test_operation_equality_and_hash_empty_arguments(): + # Equality only depends on (callback_id, kwargs) + op1 = Operation.create('restart', {}, max_retry=0) + op2 = Operation.create('restart', {}, max_retry=999) + + assert op1 == op2 + assert hash(op1) == hash(op2) + + op3 = Operation.create('restart', {'a': 2}, max_retry=0) + assert op1 != op3 + + +def test_operation_to_string_and_from_string(): + ts = datetime(2026, 2, 23, 12, 0, 0, 0, tzinfo=UTC) + op1 = Operation( + callback_id='cb', + kwargs={'x': 1, 'y': 'z'}, + requested_at=ts, + max_retry=5, + attempt=0, + result=None, + ) + + s = op1.to_string() + op2 = Operation.from_string(s) + + assert op2.callback_id == op1.callback_id + assert op2.kwargs == op1.kwargs + assert op2.requested_at == op1.requested_at + assert op2.max_retry == op1.max_retry + assert op2.attempt == op1.attempt + + +def test_operation_from_string_valid_payload(): + requested_at = datetime(2026, 3, 12, 10, 30, 45, 123456, tzinfo=UTC) + payload = json.dumps({ + 'callback_id': 'cb-123', + 'kwargs': {'b': 2, 'a': 'x'}, + 'requested_at': '1773311445.123456', + 'max_retry': '5', + 'attempt': '2', + }) + + op = Operation.from_string(payload) + + assert op is not None + assert op.callback_id == 'cb-123' + assert op.kwargs == {'b': 2, 'a': 'x'} + assert op.requested_at == requested_at + assert op.max_retry == 5 + assert op.attempt == 2 
+ + +def test_from_string_valid_payload_with_empty_kwargs_and_no_max_retry(): + requested_at = datetime(2026, 3, 12, 10, 30, 45, 123456, tzinfo=UTC) + payload = json.dumps({ + 'callback_id': 'cb-123', + 'requested_at': '1773311445.123456', + 'attempt': '0', + }) + + op = Operation.from_string(payload) + + assert op is not None + assert op.callback_id == 'cb-123' + assert op.kwargs == {} + assert op.requested_at == requested_at + assert op.max_retry is None + assert op.attempt == 0 + + +def test_from_string_valid_payload_with_empty_kwargs_and_0_max_retry(): + requested_at = datetime(2026, 3, 12, 10, 30, 45, 123456, tzinfo=UTC) + payload = json.dumps({ + 'callback_id': 'cb-123', + 'kwargs': {}, + 'requested_at': '1773311445.123456', + 'max_retry': '0', + 'attempt': '0', + }) + + op = Operation.from_string(payload) + + assert op is not None + assert op.callback_id == 'cb-123' + assert op.kwargs == {} + assert op.requested_at == requested_at + assert op.max_retry == 0 + assert op.attempt == 0 + + +@pytest.mark.parametrize( + 'payload', + [ + '{not valid json', + json.dumps( # invalid requested_at + { + 'callback_id': 'cb-123', + 'kwargs': {'x': 1}, + 'requested_at': 'bad-ts', + 'max_retry': '3', + 'attempt': '1', + } + ), + json.dumps( # invalid kwargs + { + 'callback_id': 'cb-123', + 'kwargs': '{bad kwargs json', + 'requested_at': '1773311445.123456', + 'max_retry': '3', + 'attempt': '1', + } + ), + json.dumps( # missing callback_id + { + 'kwargs': {'x': 1}, + 'requested_at': '1773311445.123456', + 'max_retry': '3', + 'attempt': '1', + } + ), + json.dumps( # invalid kwargs + { + 'callback_id': 'cb-123', + 'kwargs': '[]', + 'requested_at': '1773311445.123456', + 'max_retry': '3', + 'attempt': '1', + } + ), + json.dumps( # missing requested_at + { + 'callback_id': 'cb-123', + 'kwargs': {}, + 'requested_at': '', + 'max_retry': '3', + 'attempt': '1', + } + ), + json.dumps( # result + { + 'callback_id': 'cb-123', + 'kwargs': {}, + 'requested_at': 'bad-ts', + 'max_retry': 
'3', + 'attempt': '1', + 'result': 'something', + } + ), + ], +) +def test_operation_from_string_invalid_inputs_return_none(payload: Any): + with pytest.raises(RollingOpsDecodingError, match='Failed to deserialize'): + Operation.from_string(payload) + + +def test_op_id_returns_timestamp_and_callback_id() -> None: + requested_at = datetime(2025, 1, 2, 3, 4, 5) + operation = Operation( + callback_id='restart', + kwargs={'delay': 2}, + requested_at=requested_at, + max_retry=3, + attempt=0, + result=None, + ) + + assert operation.op_id == f'{requested_at.timestamp()}-restart' + + +def test_complete_increments_attempt_and_sets_release() -> None: + operation = Operation( + callback_id='restart', + kwargs={}, + requested_at=datetime(2025, 1, 1, 0, 0, 0), + max_retry=3, + attempt=0, + result=None, + ) + + operation.complete() + + assert operation.attempt == 1 + assert operation.result == OperationResult.RELEASE + + +def test_retry_hold_sets_retry_hold_when_max_retry_not_reached() -> None: + operation = Operation( + callback_id='restart', + kwargs={}, + requested_at=datetime(2025, 1, 1, 0, 0, 0), + max_retry=3, + attempt=0, + result=None, + ) + + operation.retry_hold() + + assert operation.attempt == 1 + assert operation.result == OperationResult.RETRY_HOLD + + +def test_retry_hold_sets_release_when_max_retry_reached() -> None: + operation = Operation( + callback_id='restart', + kwargs={}, + requested_at=datetime(2025, 1, 1, 0, 0, 0), + max_retry=0, + attempt=0, + result=None, + ) + + operation.retry_hold() + + assert operation.attempt == 1 + assert operation.result == OperationResult.RELEASE + + +def test_retry_release_sets_retry_release_when_max_retry_not_reached() -> None: + operation = Operation( + callback_id='restart', + kwargs={}, + requested_at=datetime(2025, 1, 1, 0, 0, 0), + max_retry=3, + attempt=0, + result=None, + ) + + operation.retry_release() + + assert operation.attempt == 1 + assert operation.result == OperationResult.RETRY_RELEASE + + +def 
test_retry_release_sets_release_when_max_retry_reached() -> None: + operation = Operation( + callback_id='restart', + kwargs={}, + requested_at=datetime(2025, 1, 1, 0, 0, 0), + max_retry=0, + attempt=0, + result=None, + ) + + operation.retry_release() + + assert operation.attempt == 1 + assert operation.result == OperationResult.RELEASE + + +def test_retry_hold_with_no_max_retry_sets_retry_hold() -> None: + operation = Operation( + callback_id='restart', + kwargs={}, + requested_at=datetime(2025, 1, 1, 0, 0, 0), + max_retry=None, + attempt=5, + result=None, + ) + + operation.retry_hold() + + assert operation.attempt == 6 + assert operation.result == OperationResult.RETRY_HOLD + + +def test_retry_release_with_no_max_retry_sets_retry_release() -> None: + operation = Operation( + callback_id='restart', + kwargs={}, + requested_at=datetime(2025, 1, 1, 0, 0, 0), + max_retry=None, + attempt=5, + result=None, + ) + + operation.retry_release() + + assert operation.attempt == 6 + assert operation.result == OperationResult.RETRY_RELEASE + + +def test_queue_empty_behaviour(): + q = OperationQueue() + + assert len(q) == 0 + assert q.empty is True + assert q.peek() is None + assert q.dequeue() is None + + assert q.to_string() == '[]' + + +def test_queue_enqueue_and_fifo_order(): + q = OperationQueue() + op1 = Operation.create('a', {'x': 2}) + op2 = Operation.create('b', {'i': 2}) + q.enqueue(op1) + q.enqueue(op2) + + assert len(q) == 2 + op = q.peek() + assert op is not None + assert op == op1 + + first = q.dequeue() + assert first is not None + assert first == op1 + assert len(q) == 1 + op = q.peek() + assert op is not None + assert op == op2 + + second = q.dequeue() + assert second is not None + assert second == op2 + assert q.empty is True + + +def test_queue_deduplicates_only_against_last_item(): + q = OperationQueue() + op1 = Operation.create('a', {'x': 2}) + op2 = Operation.create('a', {'x': 2}) + op3 = Operation.create('a', {'x': 4}) + + q.enqueue(op1) + assert len(q) == 
1 + + q.enqueue(op2) + assert len(q) == 1 + + q.enqueue(op3) + assert len(q) == 2 + + q.enqueue(op2) + assert len(q) == 3 + + +def test_queue_to_string_and_from_string(): + q1 = OperationQueue() + ts1 = datetime(2026, 2, 23, 12, 0, 0, 123456, tzinfo=UTC) + op1 = Operation( + callback_id='a', + kwargs={'x': 1}, + requested_at=ts1, + max_retry=5, + attempt=0, + result=None, + ) + ts2 = datetime(2026, 2, 20, 12, 0, 0, 123456, tzinfo=UTC) + op2 = Operation( + callback_id='b', + kwargs={'y': 'z'}, + requested_at=ts2, + max_retry=None, + attempt=0, + result=None, + ) + q1.enqueue(op1) + q1.enqueue(op2) + + encoded = q1.to_string() + expected = ( + '[{"callback_id":"a",' + '"requested_at":"1771848000.123456",' + '"max_retry":5,' + '"attempt":0,' + '"result":null,' + '"kwargs":{"x":1}},' + '{"callback_id":"b",' + '"requested_at":"1771588800.123456",' + '"max_retry":null,' + '"attempt":0,' + '"result":null,' + '"kwargs":{"y":"z"}}]' + ) + + assert encoded == expected + + q2 = OperationQueue.from_string(encoded) + + assert len(q2) == 2 + op = q2.peek() + assert op is not None + assert op == op1 + + op = q2.dequeue() + assert op is not None + assert op == op1 + + op = q2.dequeue() + assert op is not None + assert op == op2 + assert q2.empty + + +def test_queue_from_string_empty_string_is_empty_queue(): + q = OperationQueue.from_string('') + assert q.empty + assert q.peek() is None + + +def test_queue_from_string_rejects_non_list_json(): + with pytest.raises( + RollingOpsDecodingError, match='Failed to deserialize data to create an OperationQueue' + ): + OperationQueue.from_string('{"not": "a list"}') + + +def test_queue_from_string_rejects_invalid_json(): + with pytest.raises( + RollingOpsDecodingError, match='Failed to deserialize data to create an OperationQueue' + ): + OperationQueue.from_string('{invalid') diff --git a/rollingops/tests/unit/test_certificates.py b/rollingops/tests/unit/test_etcd_certificates.py similarity index 95% rename from 
rollingops/tests/unit/test_certificates.py rename to rollingops/tests/unit/test_etcd_certificates.py index 19dd62853..4984bb6db 100644 --- a/rollingops/tests/unit/test_certificates.py +++ b/rollingops/tests/unit/test_etcd_certificates.py @@ -22,7 +22,7 @@ Certificate, PrivateKey, ) -from charmlibs.rollingops._models import SharedCertificate +from charmlibs.rollingops.etcd._models import SharedCertificate def make_shared_certificate() -> SharedCertificate: @@ -33,7 +33,7 @@ def make_shared_certificate() -> SharedCertificate: ) -def test_certs(): +def test_make_shared_certificate_is_valid(): Certificate.from_string(VALID_CA_CERT_PEM) PrivateKey.from_string(VALID_CLIENT_KEY_PEM) Certificate.from_string(VALID_CLIENT_CERT_PEM) @@ -123,7 +123,7 @@ def test_certificates_manager_generate_does_nothing_when_files_already_exist( temp_certificates.CA_CERT_PATH.write_text(VALID_CA_CERT_PEM) old_certificates = make_shared_certificate() - new_certificates = temp_certificates.generate(common_name='unit-1') + new_certificates = temp_certificates.generate(model_uuid='model', app_name='unit-1') written = SharedCertificate.from_strings( certificate=temp_certificates.CLIENT_CERT_PATH.read_text(), @@ -138,7 +138,7 @@ def test_certificates_manager_generate_does_nothing_when_files_already_exist( def test_certificates_manager_generate_creates_all_files( temp_certificates: Any, ) -> None: - shared = temp_certificates.generate(common_name='unit-1') + shared = temp_certificates.generate(model_uuid='model', app_name='unit-1') assert temp_certificates._exists() is True assert temp_certificates.CA_CERT_PATH.read_text().startswith('-----BEGIN CERTIFICATE-----') diff --git a/rollingops/tests/unit/test_etcdctl.py b/rollingops/tests/unit/test_etcd_etcdctl.py similarity index 97% rename from rollingops/tests/unit/test_etcdctl.py rename to rollingops/tests/unit/test_etcd_etcdctl.py index 051103f2f..26497fa1e 100644 --- a/rollingops/tests/unit/test_etcdctl.py +++ 
b/rollingops/tests/unit/test_etcd_etcdctl.py @@ -21,7 +21,7 @@ import pytest from charmlibs.pathops import LocalPath -from charmlibs.rollingops import RollingOpsEtcdNotConfiguredError +from charmlibs.rollingops.common._exceptions import RollingOpsEtcdNotConfiguredError def test_etcdctl_write_env(temp_etcdctl: Any) -> None: diff --git a/rollingops/tests/unit/test_models.py b/rollingops/tests/unit/test_etcd_models.py similarity index 96% rename from rollingops/tests/unit/test_models.py rename to rollingops/tests/unit/test_etcd_models.py index 2820dfea0..e20ce39b5 100644 --- a/rollingops/tests/unit/test_models.py +++ b/rollingops/tests/unit/test_etcd_models.py @@ -15,7 +15,7 @@ # Learn more about testing at: https://juju.is/docs/sdk/testing -from charmlibs.rollingops._models import RollingOpsKeys +from charmlibs.rollingops.etcd._models import RollingOpsKeys def test_rollingopskeys_paths() -> None: diff --git a/rollingops/tests/unit/test_etcd_rollingops_in_charm.py b/rollingops/tests/unit/test_etcd_rollingops_in_charm.py index de2d0dd56..e668412fd 100644 --- a/rollingops/tests/unit/test_etcd_rollingops_in_charm.py +++ b/rollingops/tests/unit/test_etcd_rollingops_in_charm.py @@ -14,10 +14,11 @@ # # Learn more about testing at: https://juju.is/docs/sdk/testing -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest from ops.testing import Context, PeerRelation, Secret, State +from scenario import RawDataBagContents from scenario.errors import UncaughtCharmError from tests.unit.conftest import ( VALID_CA_CERT_PEM, @@ -30,10 +31,22 @@ Certificate, PrivateKey, ) -from charmlibs.rollingops._models import RollingOpsInvalidSecretContentError, SharedCertificate -from charmlibs.rollingops._relations import ( - CERT_SECRET_FIELD, +from charmlibs.rollingops.common._exceptions import ( + RollingOpsInvalidSecretContentError, ) +from charmlibs.rollingops.common._models import ( + Operation, + OperationQueue, + ProcessingBackend, + 
RollingOpsStatus, +) +from charmlibs.rollingops.etcd._models import SharedCertificate +from charmlibs.rollingops.etcd._relations import CERT_SECRET_FIELD +from charmlibs.rollingops.peer._models import LockIntent + + +def _unit_databag(state: State, peer: PeerRelation) -> RawDataBagContents: + return state.get_relation(peer.id).local_unit_data def test_leader_elected_creates_shared_secret_and_stores_id( @@ -145,3 +158,186 @@ def test_invalid_certificate_secret_content_raises( with pytest.raises(UncaughtCharmError) as exc_info: ctx.run(ctx.on.relation_changed(peer_relation, remote_unit=1), state_in) assert isinstance(exc_info.value.__cause__, RollingOpsInvalidSecretContentError) + + +def test_on_restart_action_lock_fallbacks_to_peer( + ctx: Context[RollingOpsCharm], +): + peer = PeerRelation(endpoint='restart') + state_in = State(leader=False, relations={peer}) + + state_out = ctx.run( + ctx.on.action('restart', params={'delay': 10}), + state_in, + ) + + databag = _unit_databag(state_out, peer) + assert databag['state'] == LockIntent.REQUEST + assert databag['operations'] + assert databag['processing_backend'] == ProcessingBackend.PEER + assert databag['etcd_cleanup_needed'] == 'true' + + q = OperationQueue.from_string(databag['operations']) + assert len(q) == 1 + operation = q.peek() + assert operation is not None + assert operation.callback_id == '_restart' + assert operation.kwargs == {'delay': 10} + assert operation.max_retry is None + assert operation.requested_at is not None + + +def test_state_not_initialized(ctx: Context[RollingOpsCharm]): + state = State(leader=True) + + with ctx(ctx.on.start(), state) as mgr: + rolling_state = mgr.charm.restart_manager.state + assert rolling_state.status == RollingOpsStatus.UNAVAILABLE + assert rolling_state.processing_backend == ProcessingBackend.PEER + assert len(rolling_state.operations) == 0 + + +def test_state_peer_idle(ctx: Context[RollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + local_unit_data={ 
+ 'state': '', + 'operations': '', + 'executed_at': '', + 'processing_backend': 'peer', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with ctx(ctx.on.update_status(), state) as mgr: + rolling_state = mgr.charm.restart_manager.state + assert rolling_state.status == RollingOpsStatus.IDLE + assert rolling_state.processing_backend == ProcessingBackend.PEER + assert len(rolling_state.operations) == 0 + + +def test_state_peer_waiting(ctx: Context[RollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + local_unit_data={ + 'state': 'request', + 'operations': OperationQueue([ + Operation.create('restart', {'delay': 1}, max_retry=2) + ]).to_string(), + 'executed_at': '', + 'processing_backend': 'peer', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with ctx(ctx.on.update_status(), state) as mgr: + rolling_state = mgr.charm.restart_manager.state + assert rolling_state.status == RollingOpsStatus.WAITING + assert rolling_state.processing_backend == ProcessingBackend.PEER + assert len(rolling_state.operations) == 1 + + +def test_state_peer_is_granted(ctx: Context[RollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + local_app_data={ + 'granted_unit': f'{ctx.app_name}/0', + }, + local_unit_data={ + 'state': 'retry-release', + 'operations': OperationQueue([ + Operation.create('restart', {'delay': 1}, max_retry=2) + ]).to_string(), + 'executed_at': '2026-04-09T10:01:00+00:00', + 'processing_backend': 'peer', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with ctx(ctx.on.update_status(), state) as mgr: + rolling_state = mgr.charm.restart_manager.state + assert rolling_state.status == RollingOpsStatus.GRANTED + assert rolling_state.processing_backend == ProcessingBackend.PEER + assert len(rolling_state.operations) == 1 + + +def test_state_peer_waiting_retry(ctx: Context[RollingOpsCharm]): + peer_rel 
= PeerRelation( + endpoint='restart', + local_app_data={ + 'granted_unit': 'myapp/0', + }, + local_unit_data={ + 'state': 'retry-release', + 'operations': OperationQueue([ + Operation.create('restart', {'delay': 1}, max_retry=2) + ]).to_string(), + 'executed_at': '2026-04-09T10:01:00+00:00', + 'processing_backend': 'peer', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with ctx(ctx.on.update_status(), state) as mgr: + rolling_state = mgr.charm.restart_manager.state + assert rolling_state.status == RollingOpsStatus.WAITING + assert rolling_state.processing_backend == ProcessingBackend.PEER + assert len(rolling_state.operations) == 1 + + +def test_state_etcd_status(ctx: Context[RollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + interface='rollingops', + local_app_data={}, + local_unit_data={ + 'state': '', + 'operations': OperationQueue([ + Operation.create('restart', {'delay': 1}, max_retry=2) + ]).to_string(), + 'executed_at': '', + 'processing_backend': 'etcd', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, relations={peer_rel}) + + with patch( + 'charmlibs.rollingops.etcd._backend.EtcdRollingOpsBackend.get_status', + return_value=RollingOpsStatus.GRANTED, + ): + with ctx(ctx.on.update_status(), state) as mgr: + rolling_state = mgr.charm.restart_manager.state + assert rolling_state.status == RollingOpsStatus.GRANTED + assert rolling_state.processing_backend == ProcessingBackend.ETCD + assert len(rolling_state.operations) == 1 + + +def test_state_falls_back_to_peer_if_etcd_status_fails(ctx: Context[RollingOpsCharm]): + peer_rel = PeerRelation( + endpoint='restart', + interface='rollingops', + local_app_data={}, + local_unit_data={ + 'state': 'request', + 'operations': OperationQueue([Operation.create('restart', {'delay': 1})]).to_string(), + 'executed_at': '', + 'processing_backend': 'etcd', + 'etcd_cleanup_needed': 'false', + }, + ) + state = State(leader=False, 
relations={peer_rel}) + + with patch( + 'charmlibs.rollingops._rollingops_manager.EtcdRollingOpsBackend.get_status', + return_value=RollingOpsStatus.UNAVAILABLE, + ): + with ctx(ctx.on.update_status(), state) as mgr: + rolling_state = mgr.charm.restart_manager.state + assert rolling_state.status == RollingOpsStatus.WAITING + assert rolling_state.processing_backend == ProcessingBackend.PEER + assert len(rolling_state.operations) == 1 diff --git a/rollingops/tests/unit/test_peer_models.py b/rollingops/tests/unit/test_peer_models.py deleted file mode 100644 index 201f2a1b5..000000000 --- a/rollingops/tests/unit/test_peer_models.py +++ /dev/null @@ -1,342 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# Learn more about testing at: https://juju.is/docs/sdk/testing - -import json -from datetime import UTC, datetime -from typing import Any - -import pytest - -from charmlibs.rollingops._peer_models import ( - Operation, - OperationQueue, - RollingOpsDecodingError, -) - - -def _decode_queue_string(queue_str: str) -> list[dict[str, str]]: - """Helper: decode OperationQueue.to_string() -> list of dicts.""" - items = json.loads(queue_str) - assert isinstance(items, list) - return [json.loads(s) for s in items] # type: ignore[reportUnknownArgumentType] - - -def test_operation_create_sets_fields(): - op = Operation.create('restart', {'b': 2, 'a': 1}, max_retry=3) - - assert op.kwargs == {'b': 2, 'a': 1} - assert op.callback_id == 'restart' - assert op.max_retry == 3 - assert isinstance(op.requested_at, datetime) - - -def test_operation_to_string_contains_string_values_only(): - ts = datetime(2026, 2, 23, 12, 0, 0, 123456, tzinfo=UTC) - op = Operation( - callback_id='cb', kwargs={'b': 2, 'a': 1}, requested_at=ts, max_retry=None, attempt=0 - ) - - s = op.to_string() - obj = json.loads(s) - - assert obj['callback_id'] == 'cb' - assert obj['kwargs'] == '{"a":1,"b":2}' - assert obj['requested_at'] == ts.isoformat() - assert obj.get('max_retry', '') == '' - - -def test_operation_to_string_contains_string_values_only_zero_max_retry(): - ts = datetime(2026, 2, 23, 12, 0, 0, 123456, tzinfo=UTC) - op = Operation( - callback_id='cb', kwargs={'b': 2, 'a': 1}, requested_at=ts, max_retry=0, attempt=0 - ) - - s = op.to_string() - obj = json.loads(s) - - assert obj['callback_id'] == 'cb' - assert obj['kwargs'] == '{"a":1,"b":2}' - assert obj['requested_at'] == ts.isoformat() - assert obj.get('max_retry', '') == '0' - - -def test_operation_is_max_retry_reached_on_zero_max_retry(): - op = Operation.create('restart', {'a': 1, 'b': 2}, max_retry=0) - assert not op.is_max_retry_reached() - op.increase_attempt() - assert op.is_max_retry_reached() - - -def 
test_operation_equality_and_hash_ignore_timestamp_and_max_retry(): - # Equality only depends on (callback_id, kwargs) - op1 = Operation.create('restart', {'a': 1, 'b': 2}, max_retry=0) - op2 = Operation.create('restart', {'b': 2, 'a': 1}, max_retry=999) - - assert op1 == op2 - assert hash(op1) == hash(op2) - - op3 = Operation.create('restart', {'a': 2}, max_retry=0) - assert op1 != op3 - - -def test_operation_equality_and_hash_empty_arguments(): - # Equality only depends on (callback_id, kwargs) - op1 = Operation.create('restart', {}, max_retry=0) - op2 = Operation.create('restart', {}, max_retry=999) - - assert op1 == op2 - assert hash(op1) == hash(op2) - - op3 = Operation.create('restart', {'a': 2}, max_retry=0) - assert op1 != op3 - - -def test_operation_to_string_and_from_string(): - ts = datetime(2026, 2, 23, 12, 0, 0, 0, tzinfo=UTC) - op1 = Operation( - callback_id='cb', kwargs={'x': 1, 'y': 'z'}, requested_at=ts, max_retry=5, attempt=0 - ) - - s = op1.to_string() - op2 = Operation.from_string(s) - - assert op2.callback_id == op1.callback_id - assert op2.kwargs == op1.kwargs - assert op2.requested_at == op1.requested_at - assert op2.max_retry == op1.max_retry - assert op2.attempt == op1.attempt - - -def test_operation_from_string_valid_payload(): - requested_at = datetime(2026, 3, 12, 10, 30, 45, 123456, tzinfo=UTC) - payload = json.dumps({ - 'callback_id': 'cb-123', - 'kwargs': json.dumps({'b': 2, 'a': 'x'}), - 'requested_at': requested_at.isoformat(), - 'max_retry': '5', - 'attempt': '2', - }) - - op = Operation.from_string(payload) - - assert op is not None - assert op.callback_id == 'cb-123' - assert op.kwargs == {'b': 2, 'a': 'x'} - assert op.requested_at == requested_at - assert op.max_retry == 5 - assert op.attempt == 2 - - -def test_from_string_valid_payload_with_empty_kwargs_and_no_max_retry(): - requested_at = datetime(2026, 3, 12, 10, 30, 45, 123456, tzinfo=UTC) - payload = json.dumps({ - 'callback_id': 'cb-123', - 'kwargs': '', - 'requested_at': 
requested_at.isoformat(), - 'max_retry': '', - 'attempt': '0', - }) - - op = Operation.from_string(payload) - - assert op is not None - assert op.callback_id == 'cb-123' - assert op.kwargs == {} - assert op.requested_at == requested_at - assert op.max_retry is None - assert op.attempt == 0 - - -def test_from_string_valid_payload_with_empty_kwargs_and_0_max_retry(): - requested_at = datetime(2026, 3, 12, 10, 30, 45, 123456, tzinfo=UTC) - payload = json.dumps({ - 'callback_id': 'cb-123', - 'kwargs': '{}', - 'requested_at': requested_at.isoformat(), - 'max_retry': '0', - 'attempt': '0', - }) - - op = Operation.from_string(payload) - - assert op is not None - assert op.callback_id == 'cb-123' - assert op.kwargs == {} - assert op.requested_at == requested_at - assert op.max_retry == 0 - assert op.attempt == 0 - - -@pytest.mark.parametrize( - 'payload', - [ - '{not valid json', - json.dumps( # invalid requested_at - { - 'callback_id': 'cb-123', - 'kwargs': json.dumps({'x': 1}), - 'requested_at': 'bad-ts', - 'max_retry': '3', - 'attempt': '1', - } - ), - json.dumps( # invalid kwargs - { - 'callback_id': 'cb-123', - 'kwargs': '{bad kwargs json', - 'requested_at': datetime.now(UTC).isoformat(), - 'max_retry': '3', - 'attempt': '1', - } - ), - json.dumps( # missing callback_id - { - 'kwargs': json.dumps({'x': 1}), - 'requested_at': datetime.now(UTC).isoformat(), - 'max_retry': '3', - 'attempt': '1', - } - ), - json.dumps( # invalid kwargs - { - 'callback_id': 'cb-123', - 'kwargs': '[]', - 'requested_at': datetime.now(UTC).isoformat(), - 'max_retry': '3', - 'attempt': '1', - } - ), - json.dumps( # missing requested_at - { - 'callback_id': 'cb-123', - 'kwargs': '{}', - 'requested_at': '', - 'max_retry': '3', - 'attempt': '1', - } - ), - ], -) -def test_operation_from_string_invalid_inputs_return_none(payload: Any): - with pytest.raises(RollingOpsDecodingError, match='Failed to deserialize'): - Operation.from_string(payload) - - -def test_queue_empty_behaviour(): - q = 
OperationQueue() - - assert len(q) == 0 - assert q.empty is True - assert q.peek() is None - assert q.dequeue() is None - - assert json.loads(q.to_string()) == [] - - -def test_queue_enqueue_and_fifo_order(): - q = OperationQueue() - q.enqueue_lock_request('a', {'i': 1}) - q.enqueue_lock_request('b', {'i': 2}) - - assert len(q) == 2 - op = q.peek() - assert op is not None - assert op.callback_id == 'a' - - first = q.dequeue() - assert first is not None - assert first.callback_id == 'a' - assert len(q) == 1 - op = q.peek() - assert op is not None - assert op.callback_id == 'b' - - second = q.dequeue() - assert second is not None - assert second.callback_id == 'b' - assert q.empty is True - - -def test_queue_deduplicates_only_against_last_item(): - q = OperationQueue() - - q.enqueue_lock_request('restart', {'x': 1}) - assert len(q) == 1 - - q.enqueue_lock_request('restart', {'x': 1}) - assert len(q) == 1 - - q.enqueue_lock_request('restart', {'x': 2}) - assert len(q) == 2 - - q.enqueue_lock_request('restart', {'x': 1}) - assert len(q) == 3 - - -def test_queue_to_string_and_from_string(): - q1 = OperationQueue() - q1.enqueue_lock_request('a', {'x': 1}, max_retry=5) - q1.enqueue_lock_request('b', {'y': 'z'}, max_retry=None) - - encoded = q1.to_string() - q2 = OperationQueue.from_string(encoded) - - assert len(q2) == 2 - op = q2.peek() - assert op is not None - assert op.callback_id == 'a' - - op = q2.dequeue() - assert op is not None - assert op.callback_id == 'a' - - op = q2.dequeue() - assert op is not None - assert op.callback_id == 'b' - assert q2.empty - - -def test_queue_from_string_empty_string_is_empty_queue(): - q = OperationQueue.from_string('') - assert q.empty - assert q.peek() is None - - -def test_queue_from_string_rejects_non_list_json(): - with pytest.raises(RollingOpsDecodingError, match='OperationQueue string'): - OperationQueue.from_string(json.dumps({'not': 'a list'})) - - -def test_queue_from_string_rejects_invalid_jason(): - with 
pytest.raises(RollingOpsDecodingError, match='Failed to deserialize data'): - OperationQueue.from_string('{invalid') - - -def test_queue_encoding_is_list_of_operation_strings(): - q = OperationQueue() - q.enqueue_lock_request('a', {'x': 1}) - s = q.to_string() - - decoded = json.loads(s) - assert isinstance(decoded, list) - assert len(decoded) == 1 # type: ignore[reportUnknownArgumentType] - assert isinstance(decoded[0], str) - - op_dicts = _decode_queue_string(s) - assert op_dicts[0]['callback_id'] == 'a' - assert op_dicts[0]['kwargs'] == '{"x":1}' - assert op_dicts[0].get('max_retry', '') == '' - assert 'requested_at' in op_dicts[0] diff --git a/rollingops/tests/unit/test_peer_rollingops_in_charm.py b/rollingops/tests/unit/test_peer_rollingops_in_charm.py index fab338e19..11389de47 100644 --- a/rollingops/tests/unit/test_peer_rollingops_in_charm.py +++ b/rollingops/tests/unit/test_peer_rollingops_in_charm.py @@ -15,22 +15,18 @@ # Learn more about testing at: https://juju.is/docs/sdk/testing -import logging from typing import Any +from unittest.mock import MagicMock import pytest from ops.testing import Context, PeerRelation, State from scenario import RawDataBagContents -from tests.unit.conftest import PeerRollingOpsCharm +from tests.unit.conftest import RollingOpsCharm -from charmlibs.rollingops._peer_models import ( - LockIntent, - OperationQueue, - RollingOpsInvalidLockRequestError, - _now_timestamp_str, -) - -logger = logging.getLogger(__name__) +from charmlibs.rollingops.common._exceptions import RollingOpsInvalidLockRequestError +from charmlibs.rollingops.common._models import Operation, OperationQueue +from charmlibs.rollingops.common._utils import now_timestamp +from charmlibs.rollingops.peer._models import LockIntent def _unit_databag(state: State, peer: PeerRelation) -> RawDataBagContents: @@ -45,18 +41,19 @@ def _make_operation_queue( callback_id: str, kwargs: dict[str, Any], max_retry: int | None ) -> OperationQueue: q = OperationQueue() - 
q.enqueue_lock_request(callback_id=callback_id, kwargs=kwargs, max_retry=max_retry) + op1 = Operation.create(callback_id=callback_id, kwargs=kwargs, max_retry=max_retry) + q.enqueue(op1) return q def test_lock_request_enqueues_and_sets_request( - peer_ctx: Context[PeerRollingOpsCharm], + ctx: Context[RollingOpsCharm], ): peer = PeerRelation(endpoint='restart') state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run( - peer_ctx.on.action('restart', params={'delay': 10}), + state_out = ctx.run( + ctx.on.action('restart', params={'delay': 10}), state_in, ) @@ -79,14 +76,13 @@ def test_lock_request_enqueues_and_sets_request( [ (-5), (-1), - ('3'), ], ) -def test_lock_request_invalid_inputs(peer_ctx: Context[PeerRollingOpsCharm], max_retry: Any): +def test_lock_request_invalid_inputs(ctx: Context[RollingOpsCharm], max_retry: Any): peer = PeerRelation(endpoint='restart') state_in = State(leader=False, relations={peer}) - with peer_ctx(peer_ctx.on.update_status(), state_in) as mgr: + with ctx(ctx.on.update_status(), state_in) as mgr: with pytest.raises(RollingOpsInvalidLockRequestError): mgr.charm.restart_manager.request_async_lock( callback_id='_restart', @@ -103,13 +99,11 @@ def test_lock_request_invalid_inputs(peer_ctx: Context[PeerRollingOpsCharm], max ('unknown',), ], ) -def test_lock_request_invalid_callback_id( - peer_ctx: Context[PeerRollingOpsCharm], callback_id: str -): +def test_lock_request_invalid_callback_id(ctx: Context[RollingOpsCharm], callback_id: str): peer = PeerRelation(endpoint='restart') state_in = State(leader=False, relations={peer}) - with peer_ctx(peer_ctx.on.update_status(), state_in) as mgr: + with ctx(ctx.on.update_status(), state_in) as mgr: with pytest.raises(RollingOpsInvalidLockRequestError, match='Unknown callback_id'): mgr.charm.restart_manager.request_async_lock( callback_id=callback_id, @@ -126,11 +120,11 @@ def test_lock_request_invalid_callback_id( ({'x': OperationQueue()}), ], ) -def 
test_lock_request_invalid_kwargs(peer_ctx: Context[PeerRollingOpsCharm], kwargs: Any): +def test_lock_request_invalid_kwargs(ctx: Context[RollingOpsCharm], kwargs: Any): peer = PeerRelation(endpoint='restart') state_in = State(leader=False, relations={peer}) - with peer_ctx(peer_ctx.on.update_status(), state_in) as mgr: + with ctx(ctx.on.update_status(), state_in) as mgr: with pytest.raises( RollingOpsInvalidLockRequestError, match='Failed to create the lock request' ): @@ -141,7 +135,7 @@ def test_lock_request_invalid_kwargs(peer_ctx: Context[PeerRollingOpsCharm], kwa ) -def test_existing_operation_then_new_request(peer_ctx: Context[PeerRollingOpsCharm]): +def test_existing_operation_then_new_request(ctx: Context[RollingOpsCharm]): queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) peer = PeerRelation( endpoint='restart', @@ -150,7 +144,7 @@ def test_existing_operation_then_new_request(peer_ctx: Context[PeerRollingOpsCha state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run(peer_ctx.on.action('restart', params={'delay': 10}), state_in) + state_out = ctx.run(ctx.on.action('restart', params={'delay': 10}), state_in) databag = _unit_databag(state_out, peer) assert databag['state'] == LockIntent.REQUEST @@ -162,10 +156,10 @@ def test_existing_operation_then_new_request(peer_ctx: Context[PeerRollingOpsCha def test_new_request_does_not_overwrite_state_if_queue_not_empty( - peer_ctx: Context[PeerRollingOpsCharm], + ctx: Context[RollingOpsCharm], ): queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) - executed_at = _now_timestamp_str() + executed_at = str(now_timestamp().timestamp()) peer = PeerRelation( endpoint='restart', local_unit_data={ @@ -176,7 +170,7 @@ def test_new_request_does_not_overwrite_state_if_queue_not_empty( ) state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run(peer_ctx.on.action('restart', params={'delay': 10}), state_in) + state_out = 
ctx.run(ctx.on.action('restart', params={'delay': 10}), state_in) databag = _unit_databag(state_out, peer) assert databag['state'] == LockIntent.RETRY_RELEASE @@ -188,21 +182,22 @@ def test_new_request_does_not_overwrite_state_if_queue_not_empty( def test_relation_changed_without_grant_does_not_run_operation( - peer_ctx: Context[PeerRollingOpsCharm], + ctx: Context[RollingOpsCharm], ): - remote_unit_name = f'{peer_ctx.app_name}/1' + remote_unit_name = f'{ctx.app_name}/1' queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) peer = PeerRelation( endpoint='restart', local_unit_data={'state': LockIntent.REQUEST, 'operations': queue.to_string()}, - local_app_data={'granted_unit': remote_unit_name, 'granted_at': _now_timestamp_str()}, + local_app_data={ + 'granted_unit': remote_unit_name, + 'granted_at': str(now_timestamp().timestamp()), + }, ) state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run( - peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in - ) + state_out = ctx.run(ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in) databag = _unit_databag(state_out, peer) assert databag['state'] == LockIntent.REQUEST @@ -211,20 +206,21 @@ def test_relation_changed_without_grant_does_not_run_operation( assert databag.get('executed_at', '') == '' -def test_lock_complete_pops_head(peer_ctx: Context[PeerRollingOpsCharm]): - remote_unit_name = f'{peer_ctx.app_name}/1' - local_unit_name = f'{peer_ctx.app_name}/0' +def test_lock_complete_pops_head(ctx: Context[RollingOpsCharm]): + remote_unit_name = f'{ctx.app_name}/1' + local_unit_name = f'{ctx.app_name}/0' queue = _make_operation_queue(callback_id='_restart', kwargs={}, max_retry=0) peer = PeerRelation( endpoint='restart', local_unit_data={'state': LockIntent.REQUEST, 'operations': queue.to_string()}, - local_app_data={'granted_unit': local_unit_name, 'granted_at': _now_timestamp_str()}, + local_app_data={ + 'granted_unit': 
local_unit_name, + 'granted_at': str(now_timestamp().timestamp()), + }, ) state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run( - peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in - ) + state_out = ctx.run(ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in) databag = _unit_databag(state_out, peer) assert databag['state'] == LockIntent.IDLE @@ -236,25 +232,29 @@ def test_lock_complete_pops_head(peer_ctx: Context[PeerRollingOpsCharm]): def test_successful_operation_leaves_state_request_when_more_ops_remain( - peer_ctx: Context[PeerRollingOpsCharm], + ctx: Context[RollingOpsCharm], ): - local_unit_name = f'{peer_ctx.app_name}/0' - remote_unit_name = f'{peer_ctx.app_name}/1' + local_unit_name = f'{ctx.app_name}/0' + remote_unit_name = f'{ctx.app_name}/1' queue = OperationQueue() - queue.enqueue_lock_request(callback_id='_restart', kwargs={}, max_retry=None) - queue.enqueue_lock_request(callback_id='_failed_restart', kwargs={}, max_retry=None) + op1 = Operation.create(callback_id='_restart', kwargs={}, max_retry=None) + op2 = Operation.create(callback_id='_failed_restart', kwargs={}, max_retry=None) + + queue.enqueue(op1) + queue.enqueue(op2) peer = PeerRelation( endpoint='restart', local_unit_data={'state': LockIntent.REQUEST, 'operations': queue.to_string()}, - local_app_data={'granted_unit': local_unit_name, 'granted_at': _now_timestamp_str()}, + local_app_data={ + 'granted_unit': local_unit_name, + 'granted_at': str(now_timestamp().timestamp()), + }, ) state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run( - peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in - ) + state_out = ctx.run(ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in) databag = _unit_databag(state_out, peer) assert databag['state'] == LockIntent.REQUEST @@ -273,23 +273,24 @@ def test_successful_operation_leaves_state_request_when_more_ops_remain( ], ) def 
test_lock_retry_marks_retry( - peer_ctx: Context[PeerRollingOpsCharm], + ctx: Context[RollingOpsCharm], callback_id: str, lock_intent: LockIntent, ): - remote_unit_name = f'{peer_ctx.app_name}/1' - local_unit_name = f'{peer_ctx.app_name}/0' + remote_unit_name = f'{ctx.app_name}/1' + local_unit_name = f'{ctx.app_name}/0' queue = _make_operation_queue(callback_id=callback_id, kwargs={}, max_retry=3) peer = PeerRelation( endpoint='restart', local_unit_data={'state': LockIntent.REQUEST, 'operations': queue.to_string()}, - local_app_data={'granted_unit': local_unit_name, 'granted_at': _now_timestamp_str()}, + local_app_data={ + 'granted_unit': local_unit_name, + 'granted_at': str(now_timestamp().timestamp()), + }, ) state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run( - peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in - ) + state_out = ctx.run(ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in) databag = _unit_databag(state_out, peer) assert databag['state'] == lock_intent @@ -316,14 +317,15 @@ def test_lock_retry_marks_retry( ], ) def test_lock_retry_drops_when_max_retry_reached( - peer_ctx: Context[PeerRollingOpsCharm], + ctx: Context[RollingOpsCharm], callback_id: str, ): - remote_unit_name = f'{peer_ctx.app_name}/1' - local_unit_name = f'{peer_ctx.app_name}/0' + remote_unit_name = f'{ctx.app_name}/1' + local_unit_name = f'{ctx.app_name}/0' queue = OperationQueue() - queue.enqueue_lock_request(callback_id=callback_id, kwargs={}, max_retry=3) + op1 = Operation.create(callback_id=callback_id, kwargs={}, max_retry=3) + queue.enqueue(op1) op = queue.peek() assert op is not None op.increase_attempt() @@ -333,13 +335,14 @@ def test_lock_retry_drops_when_max_retry_reached( peer = PeerRelation( endpoint='restart', local_unit_data={'state': LockIntent.REQUEST, 'operations': queue.to_string()}, - local_app_data={'granted_unit': local_unit_name, 'granted_at': _now_timestamp_str()}, + local_app_data={ + 
'granted_unit': local_unit_name, + 'granted_at': str(now_timestamp().timestamp()), + }, ) state_in = State(leader=False, relations={peer}) - state_out = peer_ctx.run( - peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in - ) + state_out = ctx.run(ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in) databag = _unit_databag(state_out, peer) assert databag['state'] == LockIntent.IDLE @@ -349,7 +352,11 @@ def test_lock_retry_drops_when_max_retry_reached( assert len(q) == 0 -def test_lock_grant_and_release(peer_ctx: Context[PeerRollingOpsCharm]): +def test_lock_grant_and_release( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) peer = PeerRelation( endpoint='restart', @@ -357,41 +364,47 @@ def test_lock_grant_and_release(peer_ctx: Context[PeerRollingOpsCharm]): ) state_in = State(leader=True, relations={peer}) - state = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + state = ctx.run(ctx.on.leader_elected(), state_in) databag = _app_databag(state, peer) - unit_name = f'{peer_ctx.app_name}/1' + unit_name = f'{ctx.app_name}/1' assert unit_name in databag['granted_unit'] assert databag['granted_at'] is not None -def test_scheduling_does_nothing_if_lock_already_granted(peer_ctx: Context[PeerRollingOpsCharm]): +def test_scheduling_does_nothing_if_lock_already_granted( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) - remote_unit_name = f'{peer_ctx.app_name}/1' - now_timestamp = _now_timestamp_str() + remote_unit_name = f'{ctx.app_name}/1' + now_timestamp_str = str(now_timestamp().timestamp()) peer = PeerRelation( endpoint='restart', peers_data={ 1: {'state': LockIntent.REQUEST, 'operations': queue.to_string()}, 2: {'state': 
LockIntent.REQUEST, 'operations': queue.to_string()}, }, - local_app_data={'granted_unit': remote_unit_name, 'granted_at': now_timestamp}, + local_app_data={'granted_unit': remote_unit_name, 'granted_at': now_timestamp_str}, ) state_in = State(leader=True, relations={peer}) - state_out = peer_ctx.run( - peer_ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in - ) + state_out = ctx.run(ctx.on.relation_changed(peer, remote_unit=remote_unit_name), state_in) databag = _app_databag(state_out, peer) assert databag['granted_unit'] == remote_unit_name - assert databag['granted_at'] == now_timestamp + assert databag['granted_at'] == now_timestamp_str -def test_schedule_picks_retry_hold(peer_ctx: Context[PeerRollingOpsCharm]): - old_operation = _now_timestamp_str() +def test_schedule_picks_retry_hold( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): + old_operation = str(now_timestamp().timestamp()) queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) - new_operation = _now_timestamp_str() + new_operation = str(now_timestamp().timestamp()) peer = PeerRelation( endpoint='restart', @@ -415,19 +428,25 @@ def test_schedule_picks_retry_hold(peer_ctx: Context[PeerRollingOpsCharm]): ) state_in = State(leader=True, relations={peer}) - state_out = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + state_out = ctx.run(ctx.on.leader_elected(), state_in) databag = _app_databag(state_out, peer) - remote_unit_name = f'{peer_ctx.app_name}/3' + remote_unit_name = f'{ctx.app_name}/3' assert databag['granted_unit'] == remote_unit_name -def test_schedule_picks_oldest_requested_at_among_requests(peer_ctx: Context[PeerRollingOpsCharm]): +def test_schedule_picks_oldest_requested_at_among_requests( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): old_queue = OperationQueue() - 
old_queue.enqueue_lock_request(callback_id='restart', kwargs={}, max_retry=2) + old_op = Operation.create(callback_id='restart', kwargs={}, max_retry=2) + old_queue.enqueue(old_op) new_queue = OperationQueue() - new_queue.enqueue_lock_request(callback_id='restart', kwargs={}, max_retry=2) + new_op = Operation.create(callback_id='restart', kwargs={}, max_retry=2) + new_queue.enqueue(new_op) peer = PeerRelation( endpoint='restart', @@ -438,18 +457,20 @@ def test_schedule_picks_oldest_requested_at_among_requests(peer_ctx: Context[Pee ) state_in = State(leader=True, relations={peer}) - state_out = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + state_out = ctx.run(ctx.on.leader_elected(), state_in) databag = _app_databag(state_out, peer) - remote_unit_name = f'{peer_ctx.app_name}/2' + remote_unit_name = f'{ctx.app_name}/2' assert databag['granted_unit'] == remote_unit_name def test_schedule_picks_oldest_executed_at_among_retries_when_no_requests( - peer_ctx: Context[PeerRollingOpsCharm], + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], ): - old_operation = _now_timestamp_str() + old_operation = str(now_timestamp().timestamp()) queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) - new_operation = _now_timestamp_str() + new_operation = str(now_timestamp().timestamp()) peer = PeerRelation( endpoint='restart', @@ -468,14 +489,18 @@ def test_schedule_picks_oldest_executed_at_among_retries_when_no_requests( ) state_in = State(leader=True, relations={peer}) - state_out = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + state_out = ctx.run(ctx.on.leader_elected(), state_in) databag = _app_databag(state_out, peer) - remote_unit_name = f'{peer_ctx.app_name}/2' + remote_unit_name = f'{ctx.app_name}/2' assert databag['granted_unit'] == remote_unit_name -def test_schedule_prioritizes_requests_over_retries(peer_ctx: Context[PeerRollingOpsCharm]): +def 
test_schedule_prioritizes_requests_over_retries( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): queue = _make_operation_queue(callback_id='_failed_restart', kwargs={}, max_retry=3) peer = PeerRelation( @@ -484,28 +509,32 @@ def test_schedule_prioritizes_requests_over_retries(peer_ctx: Context[PeerRollin 1: { 'state': LockIntent.RETRY_RELEASE, 'operations': queue.to_string(), - 'executed_at': _now_timestamp_str(), + 'executed_at': str(now_timestamp().timestamp()), }, 2: {'state': LockIntent.REQUEST, 'operations': queue.to_string()}, }, ) state_in = State(leader=True, relations={peer}) - state_out = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + state_out = ctx.run(ctx.on.leader_elected(), state_in) databag = _app_databag(state_out, peer) - remote_unit_name = f'{peer_ctx.app_name}/2' + remote_unit_name = f'{ctx.app_name}/2' assert databag['granted_unit'] == remote_unit_name -def test_no_unit_is_granted_if_there_are_no_requests(peer_ctx: Context[PeerRollingOpsCharm]): +def test_no_unit_is_granted_if_there_are_no_requests( + certificates_manager_patches: dict[str, MagicMock], + etcdctl_patch: MagicMock, + ctx: Context[RollingOpsCharm], +): peer = PeerRelation( endpoint='restart', peers_data={1: {'state': LockIntent.IDLE}, 2: {'state': LockIntent.IDLE}}, ) state_in = State(leader=True, relations={peer}) - state_out = peer_ctx.run(peer_ctx.on.leader_elected(), state_in) + state_out = ctx.run(ctx.on.leader_elected(), state_in) databag = _app_databag(state_out, peer) assert databag.get('granted_unit', '') == '' diff --git a/rollingops/uv.lock b/rollingops/uv.lock index e6833c191..b5d0d3651 100644 --- a/rollingops/uv.lock +++ b/rollingops/uv.lock @@ -114,6 +114,8 @@ dependencies = [ { name = "charmlibs-pathops" }, { name = "dpcharmlibs-interfaces" }, { name = "ops" }, + { name = "pydantic" }, + { name = "shortuuid" }, { name = "tenacity" }, ] @@ -133,6 +135,8 @@ requires-dist = [ { name = 
"charmlibs-pathops", specifier = ">=1.2.1" }, { name = "dpcharmlibs-interfaces", specifier = "==1.0.0" }, { name = "ops" }, + { name = "pydantic", specifier = ">=2.12.5" }, + { name = "shortuuid", specifier = ">=1.0.13" }, { name = "tenacity" }, ] @@ -423,6 +427,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, ] +[[package]] +name = "shortuuid" +version = "1.0.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8c/e2/bcf761f3bff95856203f9559baf3741c416071dd200c0fc19fad7f078f86/shortuuid-1.0.13.tar.gz", hash = "sha256:3bb9cf07f606260584b1df46399c0b87dd84773e7b25912b7e391e30797c5e72", size = 9662, upload-time = "2024-03-11T20:11:06.879Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/44/21d6bf170bf40b41396480d8d49ad640bca3f2b02139cd52aa1e272830a5/shortuuid-1.0.13-py3-none-any.whl", hash = "sha256:a482a497300b49b4953e15108a7913244e1bb0d41f9d332f5e9925dba33a3c5a", size = 10529, upload-time = "2024-03-11T20:11:04.807Z" }, +] + [[package]] name = "tenacity" version = "9.1.4"