Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docs/ARCHITECTURE.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ edcloud/
- **Python + boto3 over Terraform:** small resource graph and tag-based ownership make stateful IaC overhead unnecessary here.
- **Tailscale-only access:** zero inbound SG rules; access is identity-based over tailnet.
- **Durable state volume + disposable root:** host runtime is replaceable; durable data lives under `/opt/edcloud/state`.
- **AWS-native backup lifecycle:** DLM retention + restore drills instead of replication/failover complexity.
- **AWS-native backup lifecycle:** DLM retention (daily/weekly/monthly/quarterly, 1 snapshot per tier) is applied automatically on provision. DLM targets EBS volumes by tag and runs independently of instance state — snapshots are taken on schedule whether the instance is running or stopped, and each tier keeps only its most recent snapshot.
- **SSM-backed runtime secrets:** secrets stay out of git and host bootstrap payloads.
- **Cloud-init as baseline contract:** reproducible host/tooling baseline is codified in `cloud-init/user-data.yaml`.
- **CLI-first operations model:** commands must remain safe/repeatable from lightweight ARM/Linux operator nodes.
Expand Down Expand Up @@ -71,7 +71,10 @@ edcloud/

- AWS DLM policy management is implemented in `backup_policy.py`.
- Root volume remains disposable; state volume is durable and role-tagged.
- Cloud-init runs `loginctl enable-linger ubuntu` so user systemd services start at boot without a login session. Personal services (e.g. `rclone-dropbox.service`) are stored in `~/.config/systemd/user/` on the state volume and therefore survive reprovision automatically. Templates for optional user services live in `templates/operator/systemd-user/`.
- DLM backup policy is applied automatically on `edc provision`. It targets the state volume by tag and runs whether the instance is running or stopped. Retention: 1 daily (~1 day old), 1 weekly (~1 week old, Sundays), 1 monthly (~1 month old, 1st of month), 1 quarterly (~3 months old, 1st of Jan/Apr/Jul/Oct).
- `edc status` shows current backup policy state. `edc destroy` warns if no active policy is found.

## Non-goals

Expand Down
49 changes: 39 additions & 10 deletions edcloud/backup_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,16 @@ def _target_tags() -> list[dict[str, str]]:
]


def _policy_details(daily_keep: int, weekly_keep: int, monthly_keep: int) -> dict[str, Any]:
def _policy_details(
daily_keep: int, weekly_keep: int, monthly_keep: int, quarterly_keep: int
) -> dict[str, Any]:
return {
"PolicyType": "EBS_SNAPSHOT_MANAGEMENT",
"ResourceTypes": ["VOLUME"],
"TargetTags": _target_tags(),
"Schedules": [
{
"Name": "daily-7",
"Name": "daily",
"CopyTags": True,
"CreateRule": {
"Interval": 24,
Expand All @@ -48,7 +50,7 @@ def _policy_details(daily_keep: int, weekly_keep: int, monthly_keep: int) -> dic
],
},
{
"Name": "weekly-4",
"Name": "weekly",
"CopyTags": True,
"CreateRule": {
"CronExpression": "cron(0 4 ? * SUN *)",
Expand All @@ -61,7 +63,7 @@ def _policy_details(daily_keep: int, weekly_keep: int, monthly_keep: int) -> dic
],
},
{
"Name": "monthly-2",
"Name": "monthly",
"CopyTags": True,
"CreateRule": {
"CronExpression": "cron(0 5 1 * ? *)",
Expand All @@ -73,6 +75,19 @@ def _policy_details(daily_keep: int, weekly_keep: int, monthly_keep: int) -> dic
{"Key": "edcloud:backup-tier", "Value": "monthly"},
],
},
{
"Name": "quarterly",
"CopyTags": True,
"CreateRule": {
"CronExpression": "cron(0 6 1 1,4,7,10 ? *)",
},
"RetainRule": {"Count": quarterly_keep},
"TagsToAdd": [
{"Key": MANAGER_TAG_KEY, "Value": MANAGER_TAG_VALUE},
{"Key": "Name", "Value": f"{NAME_TAG}-dlm-quarterly"},
{"Key": "edcloud:backup-tier", "Value": "quarterly"},
],
},
],
}

Expand Down Expand Up @@ -105,21 +120,33 @@ def policy_status() -> dict[str, Any]:
def ensure_policy(
*,
execution_role_arn: str,
daily_keep: int = 7,
weekly_keep: int = 4,
monthly_keep: int = 2,
daily_keep: int = 1,
weekly_keep: int = 1,
monthly_keep: int = 1,
quarterly_keep: int = 1,
enabled: bool = True,
) -> dict[str, Any]:
"""Create or update the managed DLM policy with tiered retention."""
if daily_keep <= 0 or weekly_keep <= 0 or monthly_keep <= 0:
raise ValueError("daily_keep, weekly_keep, and monthly_keep must be > 0")
"""Create or update the managed DLM policy with tiered retention.

Default retention keeps exactly one snapshot per tier:
- daily: 1 snapshot (~1 day old)
- weekly: 1 snapshot (~1 week old, every Sunday)
- monthly: 1 snapshot (~1 month old, 1st of month)
- quarterly: 1 snapshot (~3 months old, 1st of Jan/Apr/Jul/Oct)

DLM targets EBS volumes by tag and runs independently of instance state,
so snapshots accumulate on schedule whether the instance is running or not.
"""
if daily_keep <= 0 or weekly_keep <= 0 or monthly_keep <= 0 or quarterly_keep <= 0:
raise ValueError("daily_keep, weekly_keep, monthly_keep, and quarterly_keep must be > 0")

dlm = _dlm_client()
state = "ENABLED" if enabled else "DISABLED"
details = _policy_details(
daily_keep=daily_keep,
weekly_keep=weekly_keep,
monthly_keep=monthly_keep,
quarterly_keep=quarterly_keep,
)
summary = _find_policy_summary()

Expand All @@ -138,6 +165,7 @@ def ensure_policy(
"daily_keep": daily_keep,
"weekly_keep": weekly_keep,
"monthly_keep": monthly_keep,
"quarterly_keep": quarterly_keep,
}

policy_id = summary["PolicyId"]
Expand All @@ -155,6 +183,7 @@ def ensure_policy(
"daily_keep": daily_keep,
"weekly_keep": weekly_keep,
"monthly_keep": monthly_keep,
"quarterly_keep": quarterly_keep,
}


Expand Down
40 changes: 37 additions & 3 deletions edcloud/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,17 @@ def provision(
require_existing_state_volume=require_existing_state_volume,
)
_print_audit_summary("post-provision")

click.echo()
click.echo("Applying DLM backup policy...")
try:
role_arn = iam.ensure_dlm_lifecycle_role({"edcloud:managed": "true", "Name": "edcloud"})
backup_policy.ensure_policy(execution_role_arn=role_arn)
click.echo("✅ DLM backup policy active (daily/weekly/monthly/quarterly, 1 snapshot each)")
except Exception as exc:
click.echo(f"Warning: backup policy setup failed: {exc}", err=True)
click.echo(" Run 'edc backup-policy apply' manually to enable backups.", err=True)

click.echo()
click.echo(json.dumps(result, indent=2))

Expand Down Expand Up @@ -740,6 +751,15 @@ def status() -> None:
click.echo(f" Storage: ${cost.get('storage_monthly', 0):.2f}")
click.echo(f" Total: ${cost.get('total_monthly', 0):.2f}")

# Backup policy
bp = backup_policy.policy_status()
click.echo()
if bp.get("exists"):
bp_state = bp.get("state", "UNKNOWN")
click.echo(f"Backups: DLM policy {bp.get('policy_id')} [{bp_state}]")
else:
click.echo("Backups: no DLM policy — run 'edc backup-policy apply' to enable")


# ---------------------------------------------------------------------------
# destroy
Expand Down Expand Up @@ -804,6 +824,17 @@ def destroy(
click.echo(str(exc), err=True)
raise SystemExit(1) from exc

# Warn if DLM backup policy is absent or disabled
bp = backup_policy.policy_status()
if not bp.get("exists") or bp.get("state") != "ENABLED":
bp_detail = (
"no DLM policy found" if not bp.get("exists") else f"policy state: {bp.get('state')}"
)
click.echo(f"Warning: backups are not active ({bp_detail}).", err=True)
click.echo(" The state volume has no recent automated snapshots.", err=True)
click.echo(" Run 'edc backup-policy apply' to enable backups.", err=True)
click.echo()

if info.get("exists") and require_fresh_snapshot:
recent = snapshot.find_recent_prechange_snapshot(fresh_snapshot_max_age_minutes)
if not recent:
Expand Down Expand Up @@ -1045,15 +1076,17 @@ def backup_policy_status_cmd() -> None:


@backup_policy_group.command("apply")
@click.option("--daily-keep", default=7, type=int, show_default=True)
@click.option("--weekly-keep", default=4, type=int, show_default=True)
@click.option("--monthly-keep", default=2, type=int, show_default=True)
@click.option("--daily-keep", default=1, type=int, show_default=True)
@click.option("--weekly-keep", default=1, type=int, show_default=True)
@click.option("--monthly-keep", default=1, type=int, show_default=True)
@click.option("--quarterly-keep", default=1, type=int, show_default=True)
@click.option("--disabled", is_flag=True, help="Create/update policy in DISABLED state.")
@require_aws_creds
def backup_policy_apply_cmd(
daily_keep: int,
weekly_keep: int,
monthly_keep: int,
quarterly_keep: int,
disabled: bool,
) -> None:
"""Create or update the managed DLM backup policy."""
Expand All @@ -1068,6 +1101,7 @@ def backup_policy_apply_cmd(
daily_keep=daily_keep,
weekly_keep=weekly_keep,
monthly_keep=monthly_keep,
quarterly_keep=quarterly_keep,
enabled=not disabled,
)
click.echo(json.dumps(result, indent=2))
Expand Down
41 changes: 33 additions & 8 deletions tests/test_backup_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,16 @@ def test_returns_policy_details_when_present(self, mock_dlm_client):
]
}
mock_dlm.get_lifecycle_policy.return_value = {
"Policy": {"PolicyDetails": {"Schedules": [{"Name": "daily-7"}]}}
"Policy": {
"PolicyDetails": {
"Schedules": [
{"Name": "daily"},
{"Name": "weekly"},
{"Name": "monthly"},
{"Name": "quarterly"},
]
}
}
}
mock_dlm_client.return_value = mock_dlm

Expand All @@ -43,13 +52,28 @@ def test_creates_when_missing(self, mock_dlm_client):

result = ensure_policy(
execution_role_arn="arn:aws:iam::123:role/edcloud-dlm",
daily_keep=7,
weekly_keep=4,
monthly_keep=2,
daily_keep=1,
weekly_keep=1,
monthly_keep=1,
quarterly_keep=1,
)
assert result["action"] == "created"
assert result["policy_id"] == "policy-new"
assert result["monthly_keep"] == 2
assert result["quarterly_keep"] == 1

@patch("edcloud.backup_policy._dlm_client")
def test_creates_includes_quarterly_schedule(self, mock_dlm_client):
mock_dlm = MagicMock()
mock_dlm.get_lifecycle_policies.return_value = {"Policies": []}
mock_dlm.create_lifecycle_policy.return_value = {"PolicyId": "policy-new"}
mock_dlm_client.return_value = mock_dlm

ensure_policy(execution_role_arn="arn:aws:iam::123:role/edcloud-dlm")

call_kwargs = mock_dlm.create_lifecycle_policy.call_args[1]
schedule_names = [s["Name"] for s in call_kwargs["PolicyDetails"]["Schedules"]]
assert "quarterly" in schedule_names
assert len(schedule_names) == 4

@patch("edcloud.backup_policy._dlm_client")
def test_updates_when_existing(self, mock_dlm_client):
Expand All @@ -63,9 +87,10 @@ def test_updates_when_existing(self, mock_dlm_client):

result = ensure_policy(
execution_role_arn="arn:aws:iam::123:role/edcloud-dlm",
daily_keep=7,
weekly_keep=4,
monthly_keep=2,
daily_keep=1,
weekly_keep=1,
monthly_keep=1,
quarterly_keep=1,
)
assert result["action"] == "updated"
mock_dlm.update_lifecycle_policy.assert_called_once()
Expand Down
7 changes: 4 additions & 3 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,9 +419,10 @@ def test_backup_policy_apply_uses_defaults(
assert result.exit_code == 0
mock_ensure_policy.assert_called_once_with(
execution_role_arn="arn:aws:iam::123:role/edcloud-dlm-lifecycle-role",
daily_keep=7,
weekly_keep=4,
monthly_keep=2,
daily_keep=1,
weekly_keep=1,
monthly_keep=1,
quarterly_keep=1,
enabled=True,
)

Expand Down