From 0818ecf8b83552fa4227a3c7d3370e6260707eb2 Mon Sep 17 00:00:00 2001 From: brfid Date: Sun, 22 Feb 2026 17:08:59 -0500 Subject: [PATCH 1/4] chore(config): update default uptime estimate to 8hrs/day Co-Authored-By: Claude Sonnet 4.6 --- edcloud/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edcloud/config.py b/edcloud/config.py index 243954e..e710f15 100644 --- a/edcloud/config.py +++ b/edcloud/config.py @@ -69,7 +69,7 @@ EBS_MONTHLY_RATE_PER_GB = 0.08 SNAPSHOT_MONTHLY_RATE_PER_GB = 0.05 EIP_UNATTACHED_MONTHLY_RATE = 3.60 -DEFAULT_HOURS_PER_DAY = 4 +DEFAULT_HOURS_PER_DAY = 8 DEFAULT_SNAPSHOT_KEEP_LAST = 3 From 29209f53448c80f934f11556d4a36299b7c73a65 Mon Sep 17 00:00:00 2001 From: brfid Date: Sun, 22 Feb 2026 21:29:16 -0500 Subject: [PATCH 2/4] feat(bootstrap): move containerd data-root to state volume Redirect containerd's storage root to /opt/edcloud/state/containerd so Docker image layers and container data survive reprovision alongside the rest of the state volume. Grow both EBS volume defaults to 30 GB to accommodate the combined footprint (root: OS + dev tools only, state: home + Docker + containerd data). Co-Authored-By: Claude Sonnet 4.6 --- cloud-init/user-data.yaml | 9 +++++++++ edcloud/config.py | 4 ++-- tests/test_config.py | 4 ++-- tests/test_ec2.py | 2 +- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/cloud-init/user-data.yaml b/cloud-init/user-data.yaml index a7be364..a4bd3cc 100644 --- a/cloud-init/user-data.yaml +++ b/cloud-init/user-data.yaml @@ -243,6 +243,15 @@ runcmd: systemctl enable docker usermod -aG docker ubuntu + # --- containerd data-root → state volume --- + - | + mkdir -p /opt/edcloud/state/containerd + # Write containerd config before any images are pulled so data lands on state volume + containerd config default > /etc/containerd/config.toml + sed -i 's|^root = .*|root = "/opt/edcloud/state/containerd"|' /etc/containerd/config.toml + systemctl restart containerd + systemctl restart docker + # --- Tailscale --- - | set -eu diff --git a/edcloud/config.py b/edcloud/config.py index e710f15..bf97f00 100644 --- a/edcloud/config.py +++ b/edcloud/config.py @@ -24,9 +24,9 @@ # EC2 defaults # --------------------------------------------------------------------------- DEFAULT_INSTANCE_TYPE = "t3a.small" -DEFAULT_VOLUME_SIZE_GB = 16 # Root: OS + Docker + dev tools (~6GB used baseline) +DEFAULT_VOLUME_SIZE_GB = 30 # Root: OS + dev tools (containerd data lives on state volume) DEFAULT_VOLUME_TYPE = "gp3" -DEFAULT_STATE_VOLUME_SIZE_GB = 20 # State: home + Docker data (starts ~1GB, grows with use) +DEFAULT_STATE_VOLUME_SIZE_GB = 30 # State: home + Docker + containerd data DEFAULT_STATE_VOLUME_TYPE = "gp3" DEFAULT_STATE_VOLUME_DEVICE_NAME = "/dev/sdf" diff --git a/tests/test_config.py b/tests/test_config.py index dd093a9..6ec4afd 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -16,9 +16,9 @@ def test_default_config() -> None: cfg = InstanceConfig() assert cfg.instance_type == "t3a.small" - assert cfg.volume_size_gb == 16 + assert cfg.volume_size_gb == 30 assert cfg.volume_type == "gp3" - assert cfg.state_volume_size_gb == 20 + assert cfg.state_volume_size_gb == 30 assert cfg.state_volume_type == "gp3" assert cfg.state_volume_device_name == "/dev/sdf" assert cfg.tailscale_hostname == "edcloud" diff --git a/tests/test_ec2.py b/tests/test_ec2.py index 3b78733..b691caa 100644 --- a/tests/test_ec2.py +++ b/tests/test_ec2.py @@ -260,7 +260,7 @@ def test_includes_persistent_state_volume( assert block_mappings[0]["DeviceName"] == "/dev/sda1" assert block_mappings[0]["Ebs"]["DeleteOnTermination"] is True assert block_mappings[1]["DeviceName"] == "/dev/sdf" - assert block_mappings[1]["Ebs"]["VolumeSize"] == 20 + assert block_mappings[1]["Ebs"]["VolumeSize"] == 30 assert block_mappings[1]["Ebs"]["DeleteOnTermination"] is False # Verify IMDS settings metadata_opts = kwargs["MetadataOptions"] From 557479219f24c6b7f4953e102803fe9c952f68a2 Mon Sep 17 00:00:00 2001 From: brfid Date: Sun, 22 Feb 2026 21:29:28 -0500 Subject: [PATCH 3/4] feat(backup): simplify DLM policy to daily/weekly/monthly tiers Remove the quarterly schedule from the DLM backup policy. With the CLI-managed snapshot queue as the primary safety net, the quarterly tier adds cost and complexity without meaningful benefit for a personal lab. Co-Authored-By: Claude Sonnet 4.6 --- edcloud/backup_policy.py | 32 ++++++-------------------------- tests/test_backup_policy.py | 12 +++++------- tests/test_cli.py | 7 +++---- 3 files changed, 14 insertions(+), 37 deletions(-) diff --git a/edcloud/backup_policy.py b/edcloud/backup_policy.py index 4c4c066..62c5463 100644 --- a/edcloud/backup_policy.py +++ b/edcloud/backup_policy.py @@ -26,9 +26,7 @@ def _target_tags() -> list[dict[str, str]]: ] -def _policy_details( - daily_keep: int, weekly_keep: int, monthly_keep: int, quarterly_keep: int -) -> dict[str, Any]: +def _policy_details(daily_keep: int, weekly_keep: int, monthly_keep: int) -> dict[str, Any]: return { "PolicyType": "EBS_SNAPSHOT_MANAGEMENT", "ResourceTypes": ["VOLUME"], @@ -75,19 +73,6 @@ def _policy_details( {"Key": "edcloud:backup-tier", "Value": "monthly"}, ], }, - { - "Name": "quarterly", - "CopyTags": True, - "CreateRule": { - "CronExpression": "cron(0 6 1 1,4,7,10 ? *)", - }, - "RetainRule": {"Count": quarterly_keep}, - "TagsToAdd": [ - {"Key": MANAGER_TAG_KEY, "Value": MANAGER_TAG_VALUE}, - {"Key": "Name", "Value": f"{NAME_TAG}-dlm-quarterly"}, - {"Key": "edcloud:backup-tier", "Value": "quarterly"}, - ], - }, ], } @@ -123,22 +108,20 @@ def ensure_policy( daily_keep: int = 1, weekly_keep: int = 1, monthly_keep: int = 1, - quarterly_keep: int = 1, enabled: bool = True, ) -> dict[str, Any]: """Create or update the managed DLM policy with tiered retention. Default retention keeps exactly one snapshot per tier: - - daily: 1 snapshot (~1 day old) - - weekly: 1 snapshot (~1 week old, every Sunday) - - monthly: 1 snapshot (~1 month old, 1st of month) - - quarterly: 1 snapshot (~3 months old, 1st of Jan/Apr/Jul/Oct) + - daily: 1 snapshot (~1 day old) + - weekly: 1 snapshot (~1 week old, every Sunday) + - monthly: 1 snapshot (~1 month old, 1st of month) DLM targets EBS volumes by tag and runs independently of instance state, so snapshots accumulate on schedule whether the instance is running or not. """ - if daily_keep <= 0 or weekly_keep <= 0 or monthly_keep <= 0 or quarterly_keep <= 0: - raise ValueError("daily_keep, weekly_keep, monthly_keep, and quarterly_keep must be > 0") + if daily_keep <= 0 or weekly_keep <= 0 or monthly_keep <= 0: + raise ValueError("daily_keep, weekly_keep, and monthly_keep must be > 0") dlm = _dlm_client() state = "ENABLED" if enabled else "DISABLED" @@ -146,7 +129,6 @@ def ensure_policy( daily_keep=daily_keep, weekly_keep=weekly_keep, monthly_keep=monthly_keep, - quarterly_keep=quarterly_keep, ) summary = _find_policy_summary() @@ -165,7 +147,6 @@ def ensure_policy( "daily_keep": daily_keep, "weekly_keep": weekly_keep, "monthly_keep": monthly_keep, - "quarterly_keep": quarterly_keep, } policy_id = summary["PolicyId"] @@ -183,7 +164,6 @@ def ensure_policy( "daily_keep": daily_keep, "weekly_keep": weekly_keep, "monthly_keep": monthly_keep, - "quarterly_keep": quarterly_keep, } diff --git a/tests/test_backup_policy.py b/tests/test_backup_policy.py index 8975d31..7759fef 100644 --- a/tests/test_backup_policy.py +++ b/tests/test_backup_policy.py @@ -30,7 +30,6 @@ def test_returns_policy_details_when_present(self, mock_dlm_client): {"Name": "daily"}, {"Name": "weekly"}, {"Name": "monthly"}, - {"Name": "quarterly"}, ] } } @@ -55,14 +54,12 @@ def test_creates_when_missing(self, mock_dlm_client): daily_keep=1, weekly_keep=1, monthly_keep=1, - quarterly_keep=1, ) assert result["action"] == "created" assert result["policy_id"] == "policy-new" - assert result["quarterly_keep"] == 1 @patch("edcloud.backup_policy._dlm_client") - def test_creates_includes_quarterly_schedule(self, mock_dlm_client): + def test_creates_includes_daily_weekly_monthly_schedules(self, mock_dlm_client): mock_dlm = MagicMock() mock_dlm.get_lifecycle_policies.return_value = {"Policies": []} mock_dlm.create_lifecycle_policy.return_value = {"PolicyId": "policy-new"} @@ -72,8 +69,10 @@ def test_creates_includes_quarterly_schedule(self, mock_dlm_client): call_kwargs = mock_dlm.create_lifecycle_policy.call_args[1] schedule_names = [s["Name"] for s in call_kwargs["PolicyDetails"]["Schedules"]] - assert "quarterly" in schedule_names - assert len(schedule_names) == 4 + assert "daily" in schedule_names + assert "weekly" in schedule_names + assert "monthly" in schedule_names + assert len(schedule_names) == 3 @patch("edcloud.backup_policy._dlm_client") def test_updates_when_existing(self, mock_dlm_client): @@ -90,7 +89,6 @@ def test_updates_when_existing(self, mock_dlm_client): daily_keep=1, weekly_keep=1, monthly_keep=1, - quarterly_keep=1, ) assert result["action"] == "updated" mock_dlm.update_lifecycle_policy.assert_called_once() diff --git a/tests/test_cli.py b/tests/test_cli.py index 78df3d0..60be547 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -422,7 +422,6 @@ def test_backup_policy_apply_uses_defaults( daily_keep=1, weekly_keep=1, monthly_keep=1, - quarterly_keep=1, enabled=True, ) @@ -973,7 +972,7 @@ def test_tailscale_check_logs_warning_when_cli_not_found( @patch("edcloud.cli.ec2.provision") @patch("edcloud.cli.ec2.destroy") @patch("edcloud.cli.ec2.status") -@patch("edcloud.cli.snapshot.auto_snapshot_before_destroy") +@patch("edcloud.cli.snapshot.snapshot_and_prune") @patch("edcloud.cli.tailscale.edcloud_name_conflicts", return_value=[]) @patch("edcloud.cli.tailscale.tailscale_available", return_value=True) @patch("edcloud.cli.get_region", return_value="us-east-1") @@ -1021,7 +1020,7 @@ def test_reprovision_snapshots_destroys_and_provisions( @patch("edcloud.cli.ec2.provision") @patch("edcloud.cli.ec2.destroy") @patch("edcloud.cli.ec2.status") -@patch("edcloud.cli.snapshot.auto_snapshot_before_destroy") +@patch("edcloud.cli.snapshot.snapshot_and_prune") @patch("edcloud.cli.tailscale.edcloud_name_conflicts", return_value=[]) @patch("edcloud.cli.tailscale.tailscale_available", return_value=True) @patch("edcloud.cli.get_region", return_value="us-east-1") @@ -1067,7 +1066,7 @@ def test_reprovision_skip_snapshot_skips_snapshot( @patch("edcloud.cli.ec2.provision") @patch("edcloud.cli.ec2.destroy") @patch("edcloud.cli.ec2.status") -@patch("edcloud.cli.snapshot.auto_snapshot_before_destroy") +@patch("edcloud.cli.snapshot.snapshot_and_prune") @patch("edcloud.cli.tailscale.edcloud_name_conflicts", return_value=[]) @patch("edcloud.cli.tailscale.tailscale_available", return_value=True) @patch("edcloud.cli.get_region", return_value="us-east-1") From b30ef8544927e6dfd222e70ef55d7cccbba3aa3d Mon Sep 17 00:00:00 2001 From: brfid Date: Sun, 22 Feb 2026 21:29:46 -0500 Subject: [PATCH 4/4] feat(snapshot): unified CLI-managed snapshot queue (max 3, self-healing) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace dual snapshot systems (DLM auto-wired + CLI manual) with a single flat pool managed entirely by the CLI. Each trigger runs prune(3) → snapshot → prune(3), so the cap enforces itself and drift from a failed prune heals on the next trigger (worst-case +1). Snapshot triggers: - edc up: on-start, fire-and-forget - edc provision: pre-provision, blocking (cleanup path) - edc reprovision: pre-reprovision, blocking - edc destroy: pre-destroy, blocking DLM auto-wiring removed from edc provision and edc destroy; backup-policy commands remain available for opt-in DLM on top. Co-Authored-By: Claude Sonnet 4.6 --- docs/ARCHITECTURE.md | 7 +++--- edcloud/cli.py | 53 +++++++++++++++++--------------------------- edcloud/snapshot.py | 42 +++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 37 deletions(-) diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 385451d..239af72 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -35,7 +35,7 @@ edcloud/ - **Python + boto3 over Terraform:** small resource graph and tag-based ownership make stateful IaC overhead unnecessary here. - **Tailscale-only access:** zero inbound SG rules; access is identity-based over tailnet. - **Durable state volume + disposable root:** host runtime is replaceable; durable data lives under `/opt/edcloud/state`. -- **AWS-native backup lifecycle:** DLM retention (daily/weekly/monthly/quarterly, 1 snapshot each tier) applied automatically on provision. DLM targets EBS volumes by tag and runs independently of instance state — snapshots accumulate whether the instance is up or down. +- **CLI-managed snapshot queue:** a single flat pool capped at 3 snapshots, enforced by the CLI. Every snapshot trigger runs `prune(3) → snapshot → prune(3)` so drift self-heals within one cycle. Triggers: `edc up` (on-start, fire-and-forget), `edc provision`/`edc reprovision`/`edc destroy` (blocking, pre-destructive-op). DLM (`backup-policy`) remains available but is not wired automatically. - **SSM-backed runtime secrets:** secrets stay out of git and host bootstrap payloads. - **Cloud-init as baseline contract:** reproducible host/tooling baseline is codified in `cloud-init/user-data.yaml`. - **CLI-first operations model:** commands must remain safe/repeatable from lightweight ARM/Linux operator nodes. @@ -71,10 +71,9 @@ edcloud/ - AWS DLM policy management is implemented in `backup_policy.py`. - Root volume remains disposable; state volume is durable and role-tagged. -<<<<<<< HEAD - Cloud-init runs `loginctl enable-linger ubuntu` so user systemd services start at boot without a login session. Personal services (e.g. `rclone-dropbox.service`) are stored in `~/.config/systemd/user/` on the state volume and therefore survive reprovision automatically. Templates for optional user services live in `templates/operator/systemd-user/`. -- DLM backup policy is applied automatically on `edc provision`. It targets the state volume by tag and runs whether the instance is running or stopped. Retention: 1 daily (~1 day old), 1 weekly (~1 week old, Sundays), 1 monthly (~1 month old, 1st of month), 1 quarterly (~3 months old, 1st of Jan/Apr/Jul/Oct). -- `edc status` shows current backup policy state. `edc destroy` warns if no active policy is found. +- Snapshot cap is 3 (`DEFAULT_SNAPSHOT_KEEP_LAST`). Each CLI trigger runs pre-prune + create + post-prune. Worst-case drift is +1, self-healing on next trigger. +- `edc status` shows snapshot count. `edc snapshot --list` shows full inventory. `edc backup-policy apply` can optionally wire DLM on top. ## Non-goals diff --git a/edcloud/cli.py b/edcloud/cli.py index 54f390f..7451282 100644 --- a/edcloud/cli.py +++ b/edcloud/cli.py @@ -286,7 +286,7 @@ def provision( # Auto-snapshot if existing instance (unless --skip-snapshot) if not skip_snapshot: click.echo("Checking for existing instance to snapshot...") - snap_ids = snapshot.auto_snapshot_before_destroy() + snap_ids = snapshot.snapshot_and_prune("pre-provision", wait=True) if snap_ids: click.echo(f"✅ Created pre-provision snapshot(s): {', '.join(snap_ids)}") click.echo() @@ -358,15 +358,8 @@ def provision( _print_audit_summary("post-provision") click.echo() - click.echo("Applying DLM backup policy...") - try: - role_arn = iam.ensure_dlm_lifecycle_role({"edcloud:managed": "true", "Name": "edcloud"}) - backup_policy.ensure_policy(execution_role_arn=role_arn) - click.echo("✅ DLM backup policy active (daily/weekly/monthly/quarterly, 1 snapshot each)") - except Exception as exc: - click.echo(f"Warning: backup policy setup failed: {exc}", err=True) - click.echo(" Run 'edc backup-policy apply' manually to enable backups.", err=True) - + click.echo("Snapshots are managed by the CLI (max 3, pruned on each trigger).") + click.echo(" Use 'edc snapshot --list' to view, 'edc backup-policy apply' for DLM.") click.echo() click.echo(json.dumps(result, indent=2)) @@ -664,6 +657,15 @@ def up(allow_tailscale_name_conflicts: bool) -> None: """Start the edcloud instance.""" if not allow_tailscale_name_conflicts: _ensure_no_tailscale_name_conflicts() + + # On-start snapshot (fire-and-forget; prune enforces 3-snapshot cap) + try: + snap_ids = snapshot.snapshot_and_prune("on-start", wait=False) + if snap_ids: + click.echo(f"On-start snapshot queued: {', '.join(snap_ids)}") + except Exception as exc: + click.echo(f"Warning: on-start snapshot skipped ({exc})", err=True) + ec2.start() ts_ip = tailscale.get_tailscale_ip(DEFAULT_TAILSCALE_HOSTNAME) if ts_ip: @@ -751,14 +753,13 @@ def status() -> None: click.echo(f" Storage: ${cost.get('storage_monthly', 0):.2f}") click.echo(f" Total: ${cost.get('total_monthly', 0):.2f}") - # Backup policy - bp = backup_policy.policy_status() + # Snapshots + snaps = snapshot.list_snapshots() + completed = [s for s in snaps if s["state"] == "completed"] click.echo() - if bp.get("exists"): - bp_state = bp.get("state", "UNKNOWN") - click.echo(f"Backups: DLM policy {bp.get('policy_id')} [{bp_state}]") - else: - click.echo("Backups: no DLM policy — run 'edc backup-policy apply' to enable") + click.echo( + f"Snapshots: {len(snaps)} managed ({len(completed)} completed) — use 'edc snapshot --list'" + ) # --------------------------------------------------------------------------- @@ -824,17 +825,6 @@ def destroy( click.echo(str(exc), err=True) raise SystemExit(1) from exc - # Warn if DLM backup policy is absent or disabled - bp = backup_policy.policy_status() - if not bp.get("exists") or bp.get("state") != "ENABLED": - bp_detail = ( - "no DLM policy found" if not bp.get("exists") else f"policy state: {bp.get('state')}" - ) - click.echo(f"Warning: backups are not active ({bp_detail}).", err=True) - click.echo(" The state volume has no recent automated snapshots.", err=True) - click.echo(" Run 'edc backup-policy apply' to enable backups.", err=True) - click.echo() - if info.get("exists") and require_fresh_snapshot: recent = snapshot.find_recent_prechange_snapshot(fresh_snapshot_max_age_minutes) if not recent: @@ -855,7 +845,7 @@ def destroy( run_optional_auto_snapshot( skip_snapshot=skip_snapshot, - auto_snapshot=snapshot.auto_snapshot_before_destroy, + auto_snapshot=lambda: snapshot.snapshot_and_prune("pre-destroy", wait=True), echo=click.echo, echo_err=lambda msg: click.echo(msg, err=True), confirm_continue=lambda msg: click.confirm(msg), @@ -1079,14 +1069,12 @@ def backup_policy_status_cmd() -> None: @click.option("--daily-keep", default=1, type=int, show_default=True) @click.option("--weekly-keep", default=1, type=int, show_default=True) @click.option("--monthly-keep", default=1, type=int, show_default=True) -@click.option("--quarterly-keep", default=1, type=int, show_default=True) @click.option("--disabled", is_flag=True, help="Create/update policy in DISABLED state.") @require_aws_creds def backup_policy_apply_cmd( daily_keep: int, weekly_keep: int, monthly_keep: int, - quarterly_keep: int, disabled: bool, ) -> None: """Create or update the managed DLM backup policy.""" @@ -1101,7 +1089,6 @@ def backup_policy_apply_cmd( daily_keep=daily_keep, weekly_keep=weekly_keep, monthly_keep=monthly_keep, - quarterly_keep=quarterly_keep, enabled=not disabled, ) click.echo(json.dumps(result, indent=2)) @@ -1473,7 +1460,7 @@ def reprovision( snap_ids, result = run_reprovision_flow( info=info, skip_snapshot=skip_snapshot, - auto_snapshot=snapshot.auto_snapshot_before_destroy, + auto_snapshot=lambda: snapshot.snapshot_and_prune("pre-reprovision", wait=True), destroy_instance=lambda: ec2.destroy(force=True), cleanup_orphaned_volumes=lambda: cleanup_module.cleanup_orphaned_volumes( mode="delete", allow_delete_state=False diff --git a/edcloud/snapshot.py b/edcloud/snapshot.py index 126cc19..44dbeb4 100644 --- a/edcloud/snapshot.py +++ b/edcloud/snapshot.py @@ -8,6 +8,7 @@ from typing import Any from edcloud.config import ( + DEFAULT_SNAPSHOT_KEEP_LAST, MANAGER_TAG_KEY, MANAGER_TAG_VALUE, NAME_TAG, @@ -285,6 +286,47 @@ def auto_snapshot_before_destroy() -> list[str]: return snap_ids +def snapshot_and_prune( + description: str, + keep: int = DEFAULT_SNAPSHOT_KEEP_LAST, + wait: bool = True, +) -> list[str]: + """Prune → snapshot → prune, enforcing a hard cap of *keep* snapshots. + + The pre-prune heals drift from a previously failed post-prune. + The post-prune enforces the cap after the new snapshot is created. + Worst-case drift is +1 (snapshot succeeded, both prunes failed) — self-heals + on the next trigger. + + Args: + description: Snapshot description. + keep: Maximum snapshots to retain after the operation. + wait: If ``True``, wait for snapshots to reach completed state before returning. + + Returns: + List of created snapshot IDs, or empty list if no instance exists. + """ + ec2 = get_ec2_client() + inst = find_instance(ec2) + if not inst: + return [] + + # Pre-prune: heal any drift from a previously failed post-prune + prune_snapshots(keep_last=keep, dry_run=False) + + snap_ids = create_snapshot(description) + + # Post-prune: enforce the cap + prune_snapshots(keep_last=keep, dry_run=False) + + if wait and snap_ids: + log.info("Waiting for snapshot(s) to complete...") + wait_for_snapshot_completion(snap_ids) + log.info("Snapshot(s) completed.") + + return snap_ids + + def create_snapshot(description: str | None = None) -> list[str]: """Snapshot every EBS volume attached to the edcloud instance.