diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index fcc165b..385451d 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -35,7 +35,7 @@ edcloud/ - **Python + boto3 over Terraform:** small resource graph and tag-based ownership make stateful IaC overhead unnecessary here. - **Tailscale-only access:** zero inbound SG rules; access is identity-based over tailnet. - **Durable state volume + disposable root:** host runtime is replaceable; durable data lives under `/opt/edcloud/state`. -- **AWS-native backup lifecycle:** DLM retention + restore drills instead of replication/failover complexity. +- **AWS-native backup lifecycle:** DLM retention (daily/weekly/monthly/quarterly, 1 snapshot per tier) applied automatically on provision. DLM targets EBS volumes by tag and runs independently of instance state — snapshots are taken on schedule whether the instance is up or down. - **SSM-backed runtime secrets:** secrets stay out of git and host bootstrap payloads. - **Cloud-init as baseline contract:** reproducible host/tooling baseline is codified in `cloud-init/user-data.yaml`. - **CLI-first operations model:** commands must remain safe/repeatable from lightweight ARM/Linux operator nodes. @@ -71,7 +71,9 @@ edcloud/ - AWS DLM policy management is implemented in `backup_policy.py`. - Root volume remains disposable; state volume is durable and role-tagged. - Cloud-init runs `loginctl enable-linger ubuntu` so user systemd services start at boot without a login session. Personal services (e.g. `rclone-dropbox.service`) are stored in `~/.config/systemd/user/` on the state volume and therefore survive reprovision automatically. Templates for optional user services live in `templates/operator/systemd-user/`. +- DLM backup policy is applied automatically on `edc provision`. It targets the state volume by tag and runs whether the instance is running or stopped. 
Retention: 1 daily (~1 day old), 1 weekly (~1 week old, Sundays), 1 monthly (~1 month old, 1st of month), 1 quarterly (~3 months old, 1st of Jan/Apr/Jul/Oct). +- `edc status` shows current backup policy state. `edc destroy` warns if no active policy is found. ## Non-goals diff --git a/edcloud/backup_policy.py b/edcloud/backup_policy.py index 713a4f1..4c4c066 100644 --- a/edcloud/backup_policy.py +++ b/edcloud/backup_policy.py @@ -26,14 +26,16 @@ def _target_tags() -> list[dict[str, str]]: ] -def _policy_details(daily_keep: int, weekly_keep: int, monthly_keep: int) -> dict[str, Any]: +def _policy_details( + daily_keep: int, weekly_keep: int, monthly_keep: int, quarterly_keep: int +) -> dict[str, Any]: return { "PolicyType": "EBS_SNAPSHOT_MANAGEMENT", "ResourceTypes": ["VOLUME"], "TargetTags": _target_tags(), "Schedules": [ { - "Name": "daily-7", + "Name": "daily", "CopyTags": True, "CreateRule": { "Interval": 24, @@ -48,7 +50,7 @@ def _policy_details(daily_keep: int, weekly_keep: int, monthly_keep: int) -> dic ], }, { - "Name": "weekly-4", + "Name": "weekly", "CopyTags": True, "CreateRule": { "CronExpression": "cron(0 4 ? * SUN *)", @@ -61,7 +63,7 @@ def _policy_details(daily_keep: int, weekly_keep: int, monthly_keep: int) -> dic ], }, { - "Name": "monthly-2", + "Name": "monthly", "CopyTags": True, "CreateRule": { "CronExpression": "cron(0 5 1 * ? *)", @@ -73,6 +75,19 @@ def _policy_details(daily_keep: int, weekly_keep: int, monthly_keep: int) -> dic {"Key": "edcloud:backup-tier", "Value": "monthly"}, ], }, + { + "Name": "quarterly", + "CopyTags": True, + "CreateRule": { + "CronExpression": "cron(0 6 1 1,4,7,10 ? 
*)", + }, + "RetainRule": {"Count": quarterly_keep}, + "TagsToAdd": [ + {"Key": MANAGER_TAG_KEY, "Value": MANAGER_TAG_VALUE}, + {"Key": "Name", "Value": f"{NAME_TAG}-dlm-quarterly"}, + {"Key": "edcloud:backup-tier", "Value": "quarterly"}, + ], + }, ], } @@ -105,14 +120,25 @@ def policy_status() -> dict[str, Any]: def ensure_policy( *, execution_role_arn: str, - daily_keep: int = 7, - weekly_keep: int = 4, - monthly_keep: int = 2, + daily_keep: int = 1, + weekly_keep: int = 1, + monthly_keep: int = 1, + quarterly_keep: int = 1, enabled: bool = True, ) -> dict[str, Any]: - """Create or update the managed DLM policy with tiered retention.""" - if daily_keep <= 0 or weekly_keep <= 0 or monthly_keep <= 0: - raise ValueError("daily_keep, weekly_keep, and monthly_keep must be > 0") + """Create or update the managed DLM policy with tiered retention. + + Default retention keeps exactly one snapshot per tier: + - daily: 1 snapshot (~1 day old) + - weekly: 1 snapshot (~1 week old, every Sunday) + - monthly: 1 snapshot (~1 month old, 1st of month) + - quarterly: 1 snapshot (~3 months old, 1st of Jan/Apr/Jul/Oct) + + DLM targets EBS volumes by tag and runs independently of instance state, + so snapshots accumulate on schedule whether the instance is running or not. 
+ """ + if daily_keep <= 0 or weekly_keep <= 0 or monthly_keep <= 0 or quarterly_keep <= 0: + raise ValueError("daily_keep, weekly_keep, monthly_keep, and quarterly_keep must be > 0") dlm = _dlm_client() state = "ENABLED" if enabled else "DISABLED" @@ -120,6 +146,7 @@ def ensure_policy( daily_keep=daily_keep, weekly_keep=weekly_keep, monthly_keep=monthly_keep, + quarterly_keep=quarterly_keep, ) summary = _find_policy_summary() @@ -138,6 +165,7 @@ def ensure_policy( "daily_keep": daily_keep, "weekly_keep": weekly_keep, "monthly_keep": monthly_keep, + "quarterly_keep": quarterly_keep, } policy_id = summary["PolicyId"] @@ -155,6 +183,7 @@ def ensure_policy( "daily_keep": daily_keep, "weekly_keep": weekly_keep, "monthly_keep": monthly_keep, + "quarterly_keep": quarterly_keep, } diff --git a/edcloud/cli.py b/edcloud/cli.py index ee20a89..54f390f 100644 --- a/edcloud/cli.py +++ b/edcloud/cli.py @@ -356,6 +356,17 @@ def provision( require_existing_state_volume=require_existing_state_volume, ) _print_audit_summary("post-provision") + + click.echo() + click.echo("Applying DLM backup policy...") + try: + role_arn = iam.ensure_dlm_lifecycle_role({"edcloud:managed": "true", "Name": "edcloud"}) + backup_policy.ensure_policy(execution_role_arn=role_arn) + click.echo("✅ DLM backup policy active (daily/weekly/monthly/quarterly, 1 snapshot each)") + except Exception as exc: + click.echo(f"Warning: backup policy setup failed: {exc}", err=True) + click.echo(" Run 'edc backup-policy apply' manually to enable backups.", err=True) + click.echo() click.echo(json.dumps(result, indent=2)) @@ -740,6 +751,15 @@ def status() -> None: click.echo(f" Storage: ${cost.get('storage_monthly', 0):.2f}") click.echo(f" Total: ${cost.get('total_monthly', 0):.2f}") + # Backup policy + bp = backup_policy.policy_status() + click.echo() + if bp.get("exists"): + bp_state = bp.get("state", "UNKNOWN") + click.echo(f"Backups: DLM policy {bp.get('policy_id')} [{bp_state}]") + else: + click.echo("Backups: no DLM 
policy — run 'edc backup-policy apply' to enable") + # --------------------------------------------------------------------------- # destroy @@ -804,6 +824,17 @@ def destroy( click.echo(str(exc), err=True) raise SystemExit(1) from exc + # Warn if DLM backup policy is absent or disabled + bp = backup_policy.policy_status() + if not bp.get("exists") or bp.get("state") != "ENABLED": + bp_detail = ( + "no DLM policy found" if not bp.get("exists") else f"policy state: {bp.get('state')}" + ) + click.echo(f"Warning: backups are not active ({bp_detail}).", err=True) + click.echo(" The state volume has no recent automated snapshots.", err=True) + click.echo(" Run 'edc backup-policy apply' to enable backups.", err=True) + click.echo() + if info.get("exists") and require_fresh_snapshot: recent = snapshot.find_recent_prechange_snapshot(fresh_snapshot_max_age_minutes) if not recent: @@ -1045,15 +1076,17 @@ def backup_policy_status_cmd() -> None: @backup_policy_group.command("apply") -@click.option("--daily-keep", default=7, type=int, show_default=True) -@click.option("--weekly-keep", default=4, type=int, show_default=True) -@click.option("--monthly-keep", default=2, type=int, show_default=True) +@click.option("--daily-keep", default=1, type=int, show_default=True) +@click.option("--weekly-keep", default=1, type=int, show_default=True) +@click.option("--monthly-keep", default=1, type=int, show_default=True) +@click.option("--quarterly-keep", default=1, type=int, show_default=True) @click.option("--disabled", is_flag=True, help="Create/update policy in DISABLED state.") @require_aws_creds def backup_policy_apply_cmd( daily_keep: int, weekly_keep: int, monthly_keep: int, + quarterly_keep: int, disabled: bool, ) -> None: """Create or update the managed DLM backup policy.""" @@ -1068,6 +1101,7 @@ def backup_policy_apply_cmd( daily_keep=daily_keep, weekly_keep=weekly_keep, monthly_keep=monthly_keep, + quarterly_keep=quarterly_keep, enabled=not disabled, ) 
click.echo(json.dumps(result, indent=2)) diff --git a/tests/test_backup_policy.py b/tests/test_backup_policy.py index 2a028bd..8975d31 100644 --- a/tests/test_backup_policy.py +++ b/tests/test_backup_policy.py @@ -24,7 +24,16 @@ def test_returns_policy_details_when_present(self, mock_dlm_client): ] } mock_dlm.get_lifecycle_policy.return_value = { - "Policy": {"PolicyDetails": {"Schedules": [{"Name": "daily-7"}]}} + "Policy": { + "PolicyDetails": { + "Schedules": [ + {"Name": "daily"}, + {"Name": "weekly"}, + {"Name": "monthly"}, + {"Name": "quarterly"}, + ] + } + } } mock_dlm_client.return_value = mock_dlm @@ -43,13 +52,28 @@ def test_creates_when_missing(self, mock_dlm_client): result = ensure_policy( execution_role_arn="arn:aws:iam::123:role/edcloud-dlm", - daily_keep=7, - weekly_keep=4, - monthly_keep=2, + daily_keep=1, + weekly_keep=1, + monthly_keep=1, + quarterly_keep=1, ) assert result["action"] == "created" assert result["policy_id"] == "policy-new" - assert result["monthly_keep"] == 2 + assert result["quarterly_keep"] == 1 + + @patch("edcloud.backup_policy._dlm_client") + def test_creates_includes_quarterly_schedule(self, mock_dlm_client): + mock_dlm = MagicMock() + mock_dlm.get_lifecycle_policies.return_value = {"Policies": []} + mock_dlm.create_lifecycle_policy.return_value = {"PolicyId": "policy-new"} + mock_dlm_client.return_value = mock_dlm + + ensure_policy(execution_role_arn="arn:aws:iam::123:role/edcloud-dlm") + + call_kwargs = mock_dlm.create_lifecycle_policy.call_args[1] + schedule_names = [s["Name"] for s in call_kwargs["PolicyDetails"]["Schedules"]] + assert "quarterly" in schedule_names + assert len(schedule_names) == 4 @patch("edcloud.backup_policy._dlm_client") def test_updates_when_existing(self, mock_dlm_client): @@ -63,9 +87,10 @@ def test_updates_when_existing(self, mock_dlm_client): result = ensure_policy( execution_role_arn="arn:aws:iam::123:role/edcloud-dlm", - daily_keep=7, - weekly_keep=4, - monthly_keep=2, + daily_keep=1, + 
weekly_keep=1, + monthly_keep=1, + quarterly_keep=1, ) assert result["action"] == "updated" mock_dlm.update_lifecycle_policy.assert_called_once() diff --git a/tests/test_cli.py b/tests/test_cli.py index 48fb341..78df3d0 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -419,9 +419,10 @@ def test_backup_policy_apply_uses_defaults( assert result.exit_code == 0 mock_ensure_policy.assert_called_once_with( execution_role_arn="arn:aws:iam::123:role/edcloud-dlm-lifecycle-role", - daily_keep=7, - weekly_keep=4, - monthly_keep=2, + daily_keep=1, + weekly_keep=1, + monthly_keep=1, + quarterly_keep=1, enabled=True, )