diff --git a/cloud-init/user-data.yaml b/cloud-init/user-data.yaml index a7be364..a4bd3cc 100644 --- a/cloud-init/user-data.yaml +++ b/cloud-init/user-data.yaml @@ -243,6 +243,15 @@ runcmd: systemctl enable docker usermod -aG docker ubuntu + # --- containerd data-root → state volume --- + - | + mkdir -p /opt/edcloud/state/containerd + # Write containerd config before any images are pulled so data lands on state volume + containerd config default > /etc/containerd/config.toml + sed -i 's|^root = .*|root = "/opt/edcloud/state/containerd"|' /etc/containerd/config.toml + systemctl restart containerd + systemctl restart docker + # --- Tailscale --- - | set -eu diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 385451d..239af72 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -35,7 +35,7 @@ edcloud/ - **Python + boto3 over Terraform:** small resource graph and tag-based ownership make stateful IaC overhead unnecessary here. - **Tailscale-only access:** zero inbound SG rules; access is identity-based over tailnet. - **Durable state volume + disposable root:** host runtime is replaceable; durable data lives under `/opt/edcloud/state`. -- **AWS-native backup lifecycle:** DLM retention (daily/weekly/monthly/quarterly, 1 snapshot each tier) applied automatically on provision. DLM targets EBS volumes by tag and runs independently of instance state — snapshots accumulate whether the instance is up or down. +- **CLI-managed snapshot queue:** a single flat pool capped at 3 snapshots, enforced by the CLI. Every snapshot trigger runs `prune(3) → snapshot → prune(3)` so drift self-heals within one cycle. Triggers: `edc up` (on-start, fire-and-forget), `edc provision`/`edc reprovision`/`edc destroy` (blocking, pre-destructive-op). DLM (`backup-policy`) remains available but is not wired automatically. - **SSM-backed runtime secrets:** secrets stay out of git and host bootstrap payloads. - **Cloud-init as baseline contract:** reproducible host/tooling baseline is codified in `cloud-init/user-data.yaml`. - **CLI-first operations model:** commands must remain safe/repeatable from lightweight ARM/Linux operator nodes. @@ -71,10 +71,9 @@ edcloud/ - AWS DLM policy management is implemented in `backup_policy.py`. - Root volume remains disposable; state volume is durable and role-tagged. -<<<<<<< HEAD - Cloud-init runs `loginctl enable-linger ubuntu` so user systemd services start at boot without a login session. Personal services (e.g. `rclone-dropbox.service`) are stored in `~/.config/systemd/user/` on the state volume and therefore survive reprovision automatically. Templates for optional user services live in `templates/operator/systemd-user/`. -- DLM backup policy is applied automatically on `edc provision`. It targets the state volume by tag and runs whether the instance is running or stopped. Retention: 1 daily (~1 day old), 1 weekly (~1 week old, Sundays), 1 monthly (~1 month old, 1st of month), 1 quarterly (~3 months old, 1st of Jan/Apr/Jul/Oct). -- `edc status` shows current backup policy state. `edc destroy` warns if no active policy is found. +- Snapshot cap is 3 (`DEFAULT_SNAPSHOT_KEEP_LAST`). Each CLI trigger runs pre-prune + create + post-prune. Worst-case drift is +1, self-healing on next trigger. +- `edc status` shows snapshot count. `edc snapshot --list` shows full inventory. `edc backup-policy apply` can optionally wire DLM on top. ## Non-goals diff --git a/edcloud/backup_policy.py b/edcloud/backup_policy.py index 4c4c066..62c5463 100644 --- a/edcloud/backup_policy.py +++ b/edcloud/backup_policy.py @@ -26,9 +26,7 @@ def _target_tags() -> list[dict[str, str]]: ] -def _policy_details( - daily_keep: int, weekly_keep: int, monthly_keep: int, quarterly_keep: int -) -> dict[str, Any]: +def _policy_details(daily_keep: int, weekly_keep: int, monthly_keep: int) -> dict[str, Any]: return { "PolicyType": "EBS_SNAPSHOT_MANAGEMENT", "ResourceTypes": ["VOLUME"], @@ -75,19 +73,6 @@ def _policy_details( {"Key": "edcloud:backup-tier", "Value": "monthly"}, ], }, - { - "Name": "quarterly", - "CopyTags": True, - "CreateRule": { - "CronExpression": "cron(0 6 1 1,4,7,10 ? *)", - }, - "RetainRule": {"Count": quarterly_keep}, - "TagsToAdd": [ - {"Key": MANAGER_TAG_KEY, "Value": MANAGER_TAG_VALUE}, - {"Key": "Name", "Value": f"{NAME_TAG}-dlm-quarterly"}, - {"Key": "edcloud:backup-tier", "Value": "quarterly"}, - ], - }, ], } @@ -123,22 +108,20 @@ def ensure_policy( daily_keep: int = 1, weekly_keep: int = 1, monthly_keep: int = 1, - quarterly_keep: int = 1, enabled: bool = True, ) -> dict[str, Any]: """Create or update the managed DLM policy with tiered retention. Default retention keeps exactly one snapshot per tier: - - daily: 1 snapshot (~1 day old) - - weekly: 1 snapshot (~1 week old, every Sunday) - - monthly: 1 snapshot (~1 month old, 1st of month) - - quarterly: 1 snapshot (~3 months old, 1st of Jan/Apr/Jul/Oct) + - daily: 1 snapshot (~1 day old) + - weekly: 1 snapshot (~1 week old, every Sunday) + - monthly: 1 snapshot (~1 month old, 1st of month) DLM targets EBS volumes by tag and runs independently of instance state, so snapshots accumulate on schedule whether the instance is running or not. """ - if daily_keep <= 0 or weekly_keep <= 0 or monthly_keep <= 0 or quarterly_keep <= 0: - raise ValueError("daily_keep, weekly_keep, monthly_keep, and quarterly_keep must be > 0") + if daily_keep <= 0 or weekly_keep <= 0 or monthly_keep <= 0: + raise ValueError("daily_keep, weekly_keep, and monthly_keep must be > 0") dlm = _dlm_client() state = "ENABLED" if enabled else "DISABLED" @@ -146,7 +129,6 @@ def ensure_policy( daily_keep=daily_keep, weekly_keep=weekly_keep, monthly_keep=monthly_keep, - quarterly_keep=quarterly_keep, ) summary = _find_policy_summary() @@ -165,7 +147,6 @@ def ensure_policy( "daily_keep": daily_keep, "weekly_keep": weekly_keep, "monthly_keep": monthly_keep, - "quarterly_keep": quarterly_keep, } policy_id = summary["PolicyId"] @@ -183,7 +164,6 @@ def ensure_policy( "daily_keep": daily_keep, "weekly_keep": weekly_keep, "monthly_keep": monthly_keep, - "quarterly_keep": quarterly_keep, } diff --git a/edcloud/cli.py b/edcloud/cli.py index 54f390f..7451282 100644 --- a/edcloud/cli.py +++ b/edcloud/cli.py @@ -286,7 +286,7 @@ def provision( # Auto-snapshot if existing instance (unless --skip-snapshot) if not skip_snapshot: click.echo("Checking for existing instance to snapshot...") - snap_ids = snapshot.auto_snapshot_before_destroy() + snap_ids = snapshot.snapshot_and_prune("pre-provision", wait=True) if snap_ids: click.echo(f"✅ Created pre-provision snapshot(s): {', '.join(snap_ids)}") click.echo() @@ -358,15 +358,8 @@ def provision( _print_audit_summary("post-provision") click.echo() - click.echo("Applying DLM backup policy...") - try: - role_arn = iam.ensure_dlm_lifecycle_role({"edcloud:managed": "true", "Name": "edcloud"}) - backup_policy.ensure_policy(execution_role_arn=role_arn) - click.echo("✅ DLM backup policy active (daily/weekly/monthly/quarterly, 1 snapshot each)") - except Exception as exc: - click.echo(f"Warning: backup policy setup failed: {exc}", err=True) - click.echo(" Run 'edc backup-policy apply' manually to enable backups.", err=True) - + click.echo("Snapshots are managed by the CLI (max 3, pruned on each trigger).") + click.echo(" Use 'edc snapshot --list' to view, 'edc backup-policy apply' for DLM.") click.echo() click.echo(json.dumps(result, indent=2)) @@ -664,6 +657,15 @@ def up(allow_tailscale_name_conflicts: bool) -> None: """Start the edcloud instance.""" if not allow_tailscale_name_conflicts: _ensure_no_tailscale_name_conflicts() + + # On-start snapshot (fire-and-forget; prune enforces 3-snapshot cap) + try: + snap_ids = snapshot.snapshot_and_prune("on-start", wait=False) + if snap_ids: + click.echo(f"On-start snapshot queued: {', '.join(snap_ids)}") + except Exception as exc: + click.echo(f"Warning: on-start snapshot skipped ({exc})", err=True) + ec2.start() ts_ip = tailscale.get_tailscale_ip(DEFAULT_TAILSCALE_HOSTNAME) if ts_ip: @@ -751,14 +753,13 @@ def status() -> None: click.echo(f" Storage: ${cost.get('storage_monthly', 0):.2f}") click.echo(f" Total: ${cost.get('total_monthly', 0):.2f}") - # Backup policy - bp = backup_policy.policy_status() + # Snapshots + snaps = snapshot.list_snapshots() + completed = [s for s in snaps if s["state"] == "completed"] click.echo() - if bp.get("exists"): - bp_state = bp.get("state", "UNKNOWN") - click.echo(f"Backups: DLM policy {bp.get('policy_id')} [{bp_state}]") - else: - click.echo("Backups: no DLM policy — run 'edc backup-policy apply' to enable") + click.echo( + f"Snapshots: {len(snaps)} managed ({len(completed)} completed) — use 'edc snapshot --list'" + ) # --------------------------------------------------------------------------- @@ -824,17 +825,6 @@ def destroy( click.echo(str(exc), err=True) raise SystemExit(1) from exc - # Warn if DLM backup policy is absent or disabled - bp = backup_policy.policy_status() - if not bp.get("exists") or bp.get("state") != "ENABLED": - bp_detail = ( - "no DLM policy found" if not bp.get("exists") else f"policy state: {bp.get('state')}" - ) - click.echo(f"Warning: backups are not active ({bp_detail}).", err=True) - click.echo(" The state volume has no recent automated snapshots.", err=True) - click.echo(" Run 'edc backup-policy apply' to enable backups.", err=True) - click.echo() - if info.get("exists") and require_fresh_snapshot: recent = snapshot.find_recent_prechange_snapshot(fresh_snapshot_max_age_minutes) if not recent: @@ -855,7 +845,7 @@ def destroy( run_optional_auto_snapshot( skip_snapshot=skip_snapshot, - auto_snapshot=snapshot.auto_snapshot_before_destroy, + auto_snapshot=lambda: snapshot.snapshot_and_prune("pre-destroy", wait=True), echo=click.echo, echo_err=lambda msg: click.echo(msg, err=True), confirm_continue=lambda msg: click.confirm(msg), @@ -1079,14 +1069,12 @@ def backup_policy_status_cmd() -> None: @click.option("--daily-keep", default=1, type=int, show_default=True) @click.option("--weekly-keep", default=1, type=int, show_default=True) @click.option("--monthly-keep", default=1, type=int, show_default=True) -@click.option("--quarterly-keep", default=1, type=int, show_default=True) @click.option("--disabled", is_flag=True, help="Create/update policy in DISABLED state.") @require_aws_creds def backup_policy_apply_cmd( daily_keep: int, weekly_keep: int, monthly_keep: int, - quarterly_keep: int, disabled: bool, ) -> None: """Create or update the managed DLM backup policy.""" @@ -1101,7 +1089,6 @@ def backup_policy_apply_cmd( daily_keep=daily_keep, weekly_keep=weekly_keep, monthly_keep=monthly_keep, - quarterly_keep=quarterly_keep, enabled=not disabled, ) click.echo(json.dumps(result, indent=2)) @@ -1473,7 +1460,7 @@ def reprovision( snap_ids, result = run_reprovision_flow( info=info, skip_snapshot=skip_snapshot, - auto_snapshot=snapshot.auto_snapshot_before_destroy, + auto_snapshot=lambda: snapshot.snapshot_and_prune("pre-reprovision", wait=True), destroy_instance=lambda: ec2.destroy(force=True), cleanup_orphaned_volumes=lambda: cleanup_module.cleanup_orphaned_volumes( mode="delete", allow_delete_state=False diff --git a/edcloud/config.py b/edcloud/config.py index 243954e..bf97f00 100644 --- a/edcloud/config.py +++ b/edcloud/config.py @@ -24,9 +24,9 @@ # EC2 defaults # --------------------------------------------------------------------------- DEFAULT_INSTANCE_TYPE = "t3a.small" -DEFAULT_VOLUME_SIZE_GB = 16 # Root: OS + Docker + dev tools (~6GB used baseline) +DEFAULT_VOLUME_SIZE_GB = 30 # Root: OS + dev tools (containerd data lives on state volume) DEFAULT_VOLUME_TYPE = "gp3" -DEFAULT_STATE_VOLUME_SIZE_GB = 20 # State: home + Docker data (starts ~1GB, grows with use) +DEFAULT_STATE_VOLUME_SIZE_GB = 30 # State: home + Docker + containerd data DEFAULT_STATE_VOLUME_TYPE = "gp3" DEFAULT_STATE_VOLUME_DEVICE_NAME = "/dev/sdf" @@ -69,7 +69,7 @@ EBS_MONTHLY_RATE_PER_GB = 0.08 SNAPSHOT_MONTHLY_RATE_PER_GB = 0.05 EIP_UNATTACHED_MONTHLY_RATE = 3.60 -DEFAULT_HOURS_PER_DAY = 4 +DEFAULT_HOURS_PER_DAY = 8 DEFAULT_SNAPSHOT_KEEP_LAST = 3 diff --git a/edcloud/snapshot.py b/edcloud/snapshot.py index 126cc19..44dbeb4 100644 --- a/edcloud/snapshot.py +++ b/edcloud/snapshot.py @@ -8,6 +8,7 @@ from typing import Any from edcloud.config import ( + DEFAULT_SNAPSHOT_KEEP_LAST, MANAGER_TAG_KEY, MANAGER_TAG_VALUE, NAME_TAG, @@ -285,6 +286,47 @@ def auto_snapshot_before_destroy() -> list[str]: return snap_ids +def snapshot_and_prune( + description: str, + keep: int = DEFAULT_SNAPSHOT_KEEP_LAST, + wait: bool = True, +) -> list[str]: + """Prune → snapshot → prune, enforcing a hard cap of *keep* snapshots. + + The pre-prune heals drift from a previously failed post-prune. + The post-prune enforces the cap after the new snapshot is created. + Worst-case drift is +1 (snapshot succeeded, both prunes failed) — self-heals + on the next trigger. + + Args: + description: Snapshot description. + keep: Maximum snapshots to retain after the operation. + wait: If ``True``, wait for snapshots to reach completed state before returning. + + Returns: + List of created snapshot IDs, or empty list if no instance exists. + """ + ec2 = get_ec2_client() + inst = find_instance(ec2) + if not inst: + return [] + + # Pre-prune: heal any drift from a previously failed post-prune + prune_snapshots(keep_last=keep, dry_run=False) + + snap_ids = create_snapshot(description) + + # Post-prune: enforce the cap + prune_snapshots(keep_last=keep, dry_run=False) + + if wait and snap_ids: + log.info("Waiting for snapshot(s) to complete...") + wait_for_snapshot_completion(snap_ids) + log.info("Snapshot(s) completed.") + + return snap_ids + + def create_snapshot(description: str | None = None) -> list[str]: """Snapshot every EBS volume attached to the edcloud instance. diff --git a/tests/test_backup_policy.py b/tests/test_backup_policy.py index 8975d31..7759fef 100644 --- a/tests/test_backup_policy.py +++ b/tests/test_backup_policy.py @@ -30,7 +30,6 @@ def test_returns_policy_details_when_present(self, mock_dlm_client): {"Name": "daily"}, {"Name": "weekly"}, {"Name": "monthly"}, - {"Name": "quarterly"}, ] } } @@ -55,14 +54,12 @@ def test_creates_when_missing(self, mock_dlm_client): daily_keep=1, weekly_keep=1, monthly_keep=1, - quarterly_keep=1, ) assert result["action"] == "created" assert result["policy_id"] == "policy-new" - assert result["quarterly_keep"] == 1 @patch("edcloud.backup_policy._dlm_client") - def test_creates_includes_quarterly_schedule(self, mock_dlm_client): + def test_creates_includes_daily_weekly_monthly_schedules(self, mock_dlm_client): mock_dlm = MagicMock() mock_dlm.get_lifecycle_policies.return_value = {"Policies": []} mock_dlm.create_lifecycle_policy.return_value = {"PolicyId": "policy-new"} @@ -72,8 +69,10 @@ def test_creates_includes_quarterly_schedule(self, mock_dlm_client): call_kwargs = mock_dlm.create_lifecycle_policy.call_args[1] schedule_names = [s["Name"] for s in call_kwargs["PolicyDetails"]["Schedules"]] - assert "quarterly" in schedule_names - assert len(schedule_names) == 4 + assert "daily" in schedule_names + assert "weekly" in schedule_names + assert "monthly" in schedule_names + assert len(schedule_names) == 3 @patch("edcloud.backup_policy._dlm_client") def test_updates_when_existing(self, mock_dlm_client): @@ -90,7 +89,6 @@ def test_updates_when_existing(self, mock_dlm_client): daily_keep=1, weekly_keep=1, monthly_keep=1, - quarterly_keep=1, ) assert result["action"] == "updated" mock_dlm.update_lifecycle_policy.assert_called_once() diff --git a/tests/test_cli.py b/tests/test_cli.py index 78df3d0..60be547 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -422,7 +422,6 @@ def test_backup_policy_apply_uses_defaults( daily_keep=1, weekly_keep=1, monthly_keep=1, - quarterly_keep=1, enabled=True, ) @@ -973,7 +972,7 @@ def test_tailscale_check_logs_warning_when_cli_not_found( @patch("edcloud.cli.ec2.provision") @patch("edcloud.cli.ec2.destroy") @patch("edcloud.cli.ec2.status") -@patch("edcloud.cli.snapshot.auto_snapshot_before_destroy") +@patch("edcloud.cli.snapshot.snapshot_and_prune") @patch("edcloud.cli.tailscale.edcloud_name_conflicts", return_value=[]) @patch("edcloud.cli.tailscale.tailscale_available", return_value=True) @patch("edcloud.cli.get_region", return_value="us-east-1") @@ -1021,7 +1020,7 @@ def test_reprovision_snapshots_destroys_and_provisions( @patch("edcloud.cli.ec2.provision") @patch("edcloud.cli.ec2.destroy") @patch("edcloud.cli.ec2.status") -@patch("edcloud.cli.snapshot.auto_snapshot_before_destroy") +@patch("edcloud.cli.snapshot.snapshot_and_prune") @patch("edcloud.cli.tailscale.edcloud_name_conflicts", return_value=[]) @patch("edcloud.cli.tailscale.tailscale_available", return_value=True) @patch("edcloud.cli.get_region", return_value="us-east-1") @@ -1067,7 +1066,7 @@ def test_reprovision_skip_snapshot_skips_snapshot( @patch("edcloud.cli.ec2.provision") @patch("edcloud.cli.ec2.destroy") @patch("edcloud.cli.ec2.status") -@patch("edcloud.cli.snapshot.auto_snapshot_before_destroy") +@patch("edcloud.cli.snapshot.snapshot_and_prune") @patch("edcloud.cli.tailscale.edcloud_name_conflicts", return_value=[]) @patch("edcloud.cli.tailscale.tailscale_available", return_value=True) @patch("edcloud.cli.get_region", return_value="us-east-1") diff --git a/tests/test_config.py b/tests/test_config.py index dd093a9..6ec4afd 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -16,9 +16,9 @@ def test_default_config() -> None: cfg = InstanceConfig() assert cfg.instance_type == "t3a.small" - assert cfg.volume_size_gb == 16 + assert cfg.volume_size_gb == 30 assert cfg.volume_type == "gp3" - assert cfg.state_volume_size_gb == 20 + assert cfg.state_volume_size_gb == 30 assert cfg.state_volume_type == "gp3" assert cfg.state_volume_device_name == "/dev/sdf" assert cfg.tailscale_hostname == "edcloud" diff --git a/tests/test_ec2.py b/tests/test_ec2.py index 3b78733..b691caa 100644 --- a/tests/test_ec2.py +++ b/tests/test_ec2.py @@ -260,7 +260,7 @@ def test_includes_persistent_state_volume( assert block_mappings[0]["DeviceName"] == "/dev/sda1" assert block_mappings[0]["Ebs"]["DeleteOnTermination"] is True assert block_mappings[1]["DeviceName"] == "/dev/sdf" - assert block_mappings[1]["Ebs"]["VolumeSize"] == 20 + assert block_mappings[1]["Ebs"]["VolumeSize"] == 30 assert block_mappings[1]["Ebs"]["DeleteOnTermination"] is False # Verify IMDS settings metadata_opts = kwargs["MetadataOptions"]