From 8f98e6542b5b6342dc8f8260c5e8d13b4bcd1ece Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Fri, 20 Feb 2026 09:27:39 +0300 Subject: [PATCH 01/17] backup --- kubemarine/procedures/backup.py | 29 ++++++++ kubemarine/procedures/restore.py | 5 ++ kubemarine/resources/schemas/backup.json | 16 +++++ kubemarine/resources/schemas/restore.json | 4 ++ kubemarine/templates/etcd_backup.yaml.j2 | 82 +++++++++++++++++++++++ 5 files changed, 136 insertions(+) create mode 100644 kubemarine/templates/etcd_backup.yaml.j2 diff --git a/kubemarine/procedures/backup.py b/kubemarine/procedures/backup.py index 48c588605..667b58f60 100755 --- a/kubemarine/procedures/backup.py +++ b/kubemarine/procedures/backup.py @@ -22,10 +22,12 @@ import threading import time import uuid +import io from collections import OrderedDict from concurrent.futures import ThreadPoolExecutor from queue import Queue, Empty from typing import List, Tuple, Union, Dict, Optional, Iterator, Literal +from jinja2 import Template import yaml @@ -158,6 +160,33 @@ def export_nodes(cluster: KubernetesCluster) -> None: def export_etcd(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + path_to_yaml = '/tmp/etcd_backup.yaml' + if not cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('etcdctl_image', {}): + raise Exception('ETCDCTL image is not set') + if not cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('storage_class', {}): + raise Exception('PVC StorageClass is not set') + if not cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('busybox_image', {}): + raise Exception('BusyBox image is not set') + backup_yaml = utils.read_internal('templates/etcd_backup.yaml.j2') + first_control_plane = cluster.nodes['control-plane'].get_first_member() + config = Template(backup_yaml).render(hostname=first_control_plane.get_node_name(), + 
etcdctl_image=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('etcdctl_image', {}), + storage_class=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('storage_class', {}), + busybox_image=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('busybox_image', {})) + first_control_plane.put(io.StringIO(config), path_to_yaml, sudo=True, mkdir=True) + if cluster.procedure_inventory['backup_plan']['etcd']['cron_job'] == "enabled" : + cluster.log.verbose(f'Applying {path_to_yaml} file') + first_control_plane.sudo(f'kubectl apply -f {path_to_yaml}') + elif cluster.procedure_inventory['backup_plan']['etcd']['cron_job'] == "disabled" : + cluster.log.verbose(f'Deleting resource from {path_to_yaml} file') + first_control_plane.sudo(f'kubectl delete -f {path_to_yaml}') + else: + raise Exception('Unknown option. It must be `enabled` or `disabled`') + cluster.log.verbose(f'Removing {path_to_yaml} file') + first_control_plane.sudo(f'rm -f {path_to_yaml}') + return + backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context) etcd_node, is_custom_etcd_node = select_etcd_node(cluster) cluster.context['backup_descriptor']['etcd']['image'] = retrieve_etcd_image(etcd_node) diff --git a/kubemarine/procedures/restore.py b/kubemarine/procedures/restore.py index 211f76f50..297702a73 100755 --- a/kubemarine/procedures/restore.py +++ b/kubemarine/procedures/restore.py @@ -163,6 +163,11 @@ def import_etcd(cluster: KubernetesCluster) -> None: cluster.log.verbose('ETCD will be restored from the following image: ' + etcd_image) cluster.log.debug('Uploading ETCD snapshot...') + # TODO: Custom path to ETCD snapshot + #snap_to_copy = cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}) + # Check if snapshot exists + # Copy snapshot from first control-plane node to backup_location + # TODO: Other steps must be omitted snap_name = '/var/lib/etcd/etcd-snapshot%s.db' % int(round(time.time() * 
1000)) cluster.nodes['control-plane'].put(os.path.join(cluster.context['backup_tmpdir'], 'etcd.db'), snap_name, sudo=True, compare_hashes=True) diff --git a/kubemarine/resources/schemas/backup.json b/kubemarine/resources/schemas/backup.json index 1d4bcf82f..ec1b8d868 100644 --- a/kubemarine/resources/schemas/backup.json +++ b/kubemarine/resources/schemas/backup.json @@ -18,6 +18,22 @@ "source_node": { "$ref": "definitions/common/node_ref.json#/definitions/Name", "description": "The name of the node to create a snapshot from. The node must be a control-plane and have an ETCD data located on it." + }, + "cron_job": { + "type": "string", + "description": "Create the CronJob for regular ETCD snapshots" + }, + "storage_class": { + "type": "string", + "description": "StorageClass that will be used to create PersistentVolumeClaim" + }, + "etcdctl_image": { + "type": "string", + "description": "Docker image with bash and etcdctl on board" + }, + "busybox_image": { + "type": "string", + "description": "Docker image with find util on board" } }, "additionalProperties": false diff --git a/kubemarine/resources/schemas/restore.json b/kubemarine/resources/schemas/restore.json index 5ce497fd8..8c77715e2 100644 --- a/kubemarine/resources/schemas/restore.json +++ b/kubemarine/resources/schemas/restore.json @@ -14,6 +14,10 @@ "type": "object", "description": "Additional parameters for ETCD restore", "properties": { + "snapshot": { + "type": "string", + "description": "Full path to the ETCD snapshot, that will be used instead the default one." + }, "image": { "type": "string", "description": "Full name of the ETCD image, including the registry address. On its basis, the restoration is performed." 
diff --git a/kubemarine/templates/etcd_backup.yaml.j2 b/kubemarine/templates/etcd_backup.yaml.j2 new file mode 100644 index 000000000..020184544 --- /dev/null +++ b/kubemarine/templates/etcd_backup.yaml.j2 @@ -0,0 +1,82 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: etcd-backup + namespace: kube-system +spec: + accessModes: + - ReadWriteOnce + storageClassName: {{ storage_class }} + resources: + requests: + storage: 50Gi +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: etcd-backup + namespace: kube-system +spec: + concurrencyPolicy: Allow + failedJobsHistoryLimit: 1 + jobTemplate: + spec: + template: + spec: + containers: + - args: + - -c + - etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt + --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key + snapshot save /backup/etcd-snapshot-$(date +%Y%m%d_%H%M%S).db + command: + - /bin/sh + env: + - name: ETCDCTL_API + value: "3" + image: {{ etcdctl_image }} + imagePullPolicy: IfNotPresent + name: backup + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /etc/kubernetes/pki/etcd + name: etcd-certs + readOnly: true + - mountPath: /backup + name: backup + - args: + - -c + - find /backup -type f -mmin +20 -name '*.db' -exec rm -- '{}' \; + command: + - /bin/sh + image: {{ busybox_image }} + imagePullPolicy: IfNotPresent + name: backup-purge + volumeMounts: + - mountPath: /backup + name: backup + dnsPolicy: ClusterFirst + hostNetwork: true + nodeSelector: + kubernetes.io/hostname: {{ hostname }} + restartPolicy: OnFailure + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoSchedule + operator: Exists + volumes: + - hostPath: + path: /etc/kubernetes/pki/etcd + type: DirectoryOrCreate + name: etcd-certs + - name: backup + persistentVolumeClaim: + claimName: 
etcd-backup + schedule: "*/5 * * * *" + successfulJobsHistoryLimit: 3 + suspend: false From f10f8ba8803996ec3818bfb7718fd1b138bddfdd Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Mon, 23 Feb 2026 13:15:11 +0300 Subject: [PATCH 02/17] backup/restore --- kubemarine/procedures/backup.py | 6 +++--- kubemarine/procedures/restore.py | 20 ++++++++++++++++---- kubemarine/templates/etcd_backup.yaml.j2 | 12 +++++++++--- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/kubemarine/procedures/backup.py b/kubemarine/procedures/backup.py index 667b58f60..9f7b56f89 100755 --- a/kubemarine/procedures/backup.py +++ b/kubemarine/procedures/backup.py @@ -176,14 +176,14 @@ def export_etcd(cluster: KubernetesCluster) -> None: busybox_image=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('busybox_image', {})) first_control_plane.put(io.StringIO(config), path_to_yaml, sudo=True, mkdir=True) if cluster.procedure_inventory['backup_plan']['etcd']['cron_job'] == "enabled" : - cluster.log.verbose(f'Applying {path_to_yaml} file') + cluster.log.debug(f'Applying {path_to_yaml} file') first_control_plane.sudo(f'kubectl apply -f {path_to_yaml}') elif cluster.procedure_inventory['backup_plan']['etcd']['cron_job'] == "disabled" : - cluster.log.verbose(f'Deleting resource from {path_to_yaml} file') + cluster.log.debug(f'Deleting resource from {path_to_yaml} file') first_control_plane.sudo(f'kubectl delete -f {path_to_yaml}') else: raise Exception('Unknown option. 
It must be `enabled` or `disabled`') - cluster.log.verbose(f'Removing {path_to_yaml} file') + cluster.log.debug(f'Removing {path_to_yaml} file') first_control_plane.sudo(f'rm -f {path_to_yaml}') return diff --git a/kubemarine/procedures/restore.py b/kubemarine/procedures/restore.py index 297702a73..296a8271a 100755 --- a/kubemarine/procedures/restore.py +++ b/kubemarine/procedures/restore.py @@ -70,6 +70,12 @@ def unpack_data(resources: DynamicResources) -> None: raise Exception('Backup source not specified in procedure') backup_file_source = utils.get_external_resource_path(backup_file_source) + + if resources.procedure_inventory().get('restore_plan', {}).get('etcd', {}).get('snapshot', {}): + if not os.path.isdir(backup_file_source): + raise FileNotFoundError('Backup location "%s" not found' % backup_file_source) + return + if not os.path.isfile(backup_file_source): raise FileNotFoundError('Backup file "%s" not found' % backup_file_source) @@ -164,10 +170,16 @@ def import_etcd(cluster: KubernetesCluster) -> None: cluster.log.debug('Uploading ETCD snapshot...') # TODO: Custom path to ETCD snapshot - #snap_to_copy = cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}) - # Check if snapshot exists - # Copy snapshot from first control-plane node to backup_location - # TODO: Other steps must be omitted + if cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}): + cluster.log.debug('The particular snapshot will be used') + path_to_snap = cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}) + first_control_plane = cluster.nodes['control-plane'].get_first_member() + # Check if snapshot exists + #first_control_plane.sudo(f'ls {path_to_snap}') + # Copy snapshot from first control-plane node to backup_location + cluster.log.debug('Coping snapshot from first control-plane node to the backup folder') + first_control_plane.get(path_to_snap, 
os.path.join(cluster.context['backup_tmpdir'], 'etcd.db')) + snap_name = '/var/lib/etcd/etcd-snapshot%s.db' % int(round(time.time() * 1000)) cluster.nodes['control-plane'].put(os.path.join(cluster.context['backup_tmpdir'], 'etcd.db'), snap_name, sudo=True, compare_hashes=True) diff --git a/kubemarine/templates/etcd_backup.yaml.j2 b/kubemarine/templates/etcd_backup.yaml.j2 index 020184544..837bd649b 100644 --- a/kubemarine/templates/etcd_backup.yaml.j2 +++ b/kubemarine/templates/etcd_backup.yaml.j2 @@ -26,12 +26,18 @@ spec: spec: containers: - args: + - | + etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y%m%d_%H%M%S).db + chmod 0644 /backup/*.db + command: + - /bin/sh - -c - - etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt - --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key - snapshot save /backup/etcd-snapshot-$(date +%Y%m%d_%H%M%S).db + - args: + - | + find /backup -type f -mmin +20 -name '*.db' -exec rm -- '{}' \; command: - /bin/sh + - -c env: - name: ETCDCTL_API value: "3" From f3b1d2e5e3c064dc23e9c764b40b78ff0523d84d Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Mon, 9 Mar 2026 08:47:34 +0300 Subject: [PATCH 03/17] feat: excluded tasks --- kubemarine/procedures/backup.py | 18 ++++++++++++++++++ kubemarine/procedures/restore.py | 10 ++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/kubemarine/procedures/backup.py b/kubemarine/procedures/backup.py index 9f7b56f89..c92a8e2bc 100755 --- a/kubemarine/procedures/backup.py +++ b/kubemarine/procedures/backup.py @@ -85,6 +85,8 @@ def prepare_backup_tmpdir(logger: log.EnhancedLogger, context: dict) -> str: def verify_backup_location(cluster: KubernetesCluster) 
-> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return target = utils.get_external_resource_path(cluster.procedure_inventory.get('backup_location', 'backup.tar.gz')) if not os.path.isdir(target) and not os.path.isdir(os.path.abspath(os.path.join(target, os.pardir))): raise FileNotFoundError('Backup location directory not exists') @@ -98,6 +100,8 @@ def export_ansible_inventory(cluster: KubernetesCluster) -> None: def export_packages_list(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return cluster.context['backup_descriptor']['nodes']['packages'] = {} if cluster.get_os_family() in ['rhel8', 'rhel9']: cmd = r"rpm -qa" @@ -109,6 +113,8 @@ def export_packages_list(cluster: KubernetesCluster) -> None: def export_hostname(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return cluster.context['backup_descriptor']['nodes']['hostnames'] = {} results = cluster.nodes['all'].sudo('hostnamectl status | head -n 1 | sed -e \'s/[a-zA-Z ]*://g\'') cluster.log.verbose(results) @@ -117,6 +123,8 @@ def export_hostname(cluster: KubernetesCluster) -> None: def export_cluster_yaml(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context) shutil.copyfile(utils.get_dump_filepath(cluster.context, 'cluster.yaml'), os.path.join(backup_directory, 'cluster.yaml')) @@ -126,6 +134,8 @@ def export_cluster_yaml(cluster: KubernetesCluster) -> None: def export_nodes(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context) backup_nodes_data_dir = os.path.join(backup_directory, 
'nodes_data') os.mkdir(backup_nodes_data_dir) @@ -268,6 +278,8 @@ def retrieve_etcd_image(etcd_node: NodeGroup) -> str: def export_kubernetes_version(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return control_plane = cluster.nodes['control-plane'].get_any_member() version = control_plane.sudo('kubectl get nodes --no-headers | head -n 1 | awk \'{print $5; exit}\'').get_simple_out() cluster.context['backup_descriptor']['kubernetes']['version'] = version.strip() @@ -613,6 +625,8 @@ def _download(self, task: DownloaderPayload, temp_local_filepath: str) -> None: def export_kubernetes(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context) control_plane = cluster.nodes['control-plane'].get_any_member() backup_kubernetes = cluster.procedure_inventory.get('backup_plan', {}).get('kubernetes', {}) @@ -738,6 +752,8 @@ def _graceful_shutdown_downloaders() -> Optional[BaseException]: def make_descriptor(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context) cluster.context['backup_descriptor']['kubernetes']['thirdparties'] = cluster.inventory['services']['thirdparties'] @@ -748,6 +764,8 @@ def make_descriptor(cluster: KubernetesCluster) -> None: def pack_data(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return cluster_name = cluster.inventory['cluster_name'] backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context) diff --git a/kubemarine/procedures/restore.py b/kubemarine/procedures/restore.py index 296a8271a..0d3bb0cc9 100755 --- a/kubemarine/procedures/restore.py +++ 
b/kubemarine/procedures/restore.py @@ -124,6 +124,8 @@ def import_nodes_data(cluster: KubernetesCluster) -> None: def restore_dns_resolv_conf(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}): + return import_nodes_data(cluster) unpack_cmd = "sudo tar xzvf /tmp/kubemarine-backup.tar.gz -C / --overwrite /etc/resolv.conf" @@ -136,10 +138,14 @@ def restore_dns_resolv_conf(cluster: KubernetesCluster) -> None: def restore_thirdparties(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}): + return install.system_prepare_thirdparties(cluster) def import_nodes(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}): + return if not cluster.is_task_completed('restore.dns.resolv_conf'): import_nodes_data(cluster) @@ -169,12 +175,12 @@ def import_etcd(cluster: KubernetesCluster) -> None: cluster.log.verbose('ETCD will be restored from the following image: ' + etcd_image) cluster.log.debug('Uploading ETCD snapshot...') - # TODO: Custom path to ETCD snapshot + # Custom path to ETCD snapshot if cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}): cluster.log.debug('The particular snapshot will be used') path_to_snap = cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}) first_control_plane = cluster.nodes['control-plane'].get_first_member() - # Check if snapshot exists + # TODO: Check if snapshot exists #first_control_plane.sudo(f'ls {path_to_snap}') # Copy snapshot from first control-plane node to backup_location cluster.log.debug('Coping snapshot from first control-plane node to the backup folder') From 3c2da8046408953f45e13dd1a0949a7fa439d8aa Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:35:17 +0300 
Subject: [PATCH 04/17] feature: comment --- kubemarine/procedures/backup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kubemarine/procedures/backup.py b/kubemarine/procedures/backup.py index c92a8e2bc..9d2adfd40 100755 --- a/kubemarine/procedures/backup.py +++ b/kubemarine/procedures/backup.py @@ -180,6 +180,7 @@ def export_etcd(cluster: KubernetesCluster) -> None: raise Exception('BusyBox image is not set') backup_yaml = utils.read_internal('templates/etcd_backup.yaml.j2') first_control_plane = cluster.nodes['control-plane'].get_first_member() + # TODO: retention policy config = Template(backup_yaml).render(hostname=first_control_plane.get_node_name(), etcdctl_image=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('etcdctl_image', {}), storage_class=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('storage_class', {}), From 47744d8d83db187b435510204116f83113d852bc Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Tue, 10 Mar 2026 19:06:12 +0300 Subject: [PATCH 05/17] feature: rework --- kubemarine/procedures/backup.py | 25 ++++------- kubemarine/resources/schemas/backup.json | 53 ++++++++++++++++------- kubemarine/resources/schemas/restore.json | 2 +- kubemarine/templates/etcd_backup.yaml.j2 | 17 +++----- 4 files changed, 54 insertions(+), 43 deletions(-) diff --git a/kubemarine/procedures/backup.py b/kubemarine/procedures/backup.py index 9d2adfd40..47781e5d9 100755 --- a/kubemarine/procedures/backup.py +++ b/kubemarine/procedures/backup.py @@ -172,30 +172,23 @@ def export_nodes(cluster: KubernetesCluster) -> None: def export_etcd(cluster: KubernetesCluster) -> None: if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): path_to_yaml = '/tmp/etcd_backup.yaml' - if not cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('etcdctl_image', {}): - raise Exception('ETCDCTL image is not set') - if not 
cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('storage_class', {}): - raise Exception('PVC StorageClass is not set') - if not cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('busybox_image', {}): - raise Exception('BusyBox image is not set') + retention=int(cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('storage_depth', {}))*60 backup_yaml = utils.read_internal('templates/etcd_backup.yaml.j2') first_control_plane = cluster.nodes['control-plane'].get_first_member() - # TODO: retention policy config = Template(backup_yaml).render(hostname=first_control_plane.get_node_name(), - etcdctl_image=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('etcdctl_image', {}), - storage_class=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('storage_class', {}), - busybox_image=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('busybox_image', {})) + etcdctl_image=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('etcdctl_image', {}), + storage_class=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('storage_class', {}), + storage_name=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('storage_name', {}), + busybox_image=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('busybox_image', {}), + schedule=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('schedule', {}), + retention=retention) first_control_plane.put(io.StringIO(config), path_to_yaml, sudo=True, mkdir=True) - if cluster.procedure_inventory['backup_plan']['etcd']['cron_job'] == "enabled" : + if cluster.procedure_inventory['backup_plan']['etcd']['cron_job']['enabled'] : cluster.log.debug(f'Applying {path_to_yaml} file') first_control_plane.sudo(f'kubectl apply -f 
{path_to_yaml}') - elif cluster.procedure_inventory['backup_plan']['etcd']['cron_job'] == "disabled" : + else: cluster.log.debug(f'Deleting resource from {path_to_yaml} file') first_control_plane.sudo(f'kubectl delete -f {path_to_yaml}') - else: - raise Exception('Unknown option. It must be `enabled` or `disabled`') - cluster.log.debug(f'Removing {path_to_yaml} file') - first_control_plane.sudo(f'rm -f {path_to_yaml}') return backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context) diff --git a/kubemarine/resources/schemas/backup.json b/kubemarine/resources/schemas/backup.json index ec1b8d868..1090646b6 100644 --- a/kubemarine/resources/schemas/backup.json +++ b/kubemarine/resources/schemas/backup.json @@ -20,21 +20,44 @@ "description": "The name of the node to create a snapshot from. The node must be a control-plane and have an ETCD data located on it." }, "cron_job": { - "type": "string", - "description": "Create the CronJob for regular ETCD snapshots" - }, - "storage_class": { - "type": "string", - "description": "StorageClass that will be used to create PersistentVolumeClaim" - }, - "etcdctl_image": { - "type": "string", - "description": "Docker image with bash and etcdctl on board" - }, - "busybox_image": { - "type": "string", - "description": "Docker image with find util on board" - } + "type": "object", + "description": "Create the CronJob for regular ETCD snapshots", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enabling CronJob" + }, + "storage_class": { + "type": "string", + "default": "local-path", + "description": "StorageClass that will be used to create PersistentVolumeClaim" + }, + "storage_name": { + "type": "string", + "default": "etcd-backup", + "description": "Name of the PersistentVolumeClaim" + }, + "etcdctl_image": { + "type": "string", + "description": "Docker image with bash and etcdctl on board" + }, + "busybox_image": { + "type": "string", + "description": "Docker image with find util on board" + }, 
+ "schedule": { + "type": "string", + "default": "*/5 * * * *", + "description": "Crontab schedule" + }, + "storage_depth": { + "type": "integer", + "minimal": 1, + "default": 5, + "description": "Backups retention time(hours)" + } + } + } }, "additionalProperties": false }, diff --git a/kubemarine/resources/schemas/restore.json b/kubemarine/resources/schemas/restore.json index 8c77715e2..87573773c 100644 --- a/kubemarine/resources/schemas/restore.json +++ b/kubemarine/resources/schemas/restore.json @@ -16,7 +16,7 @@ "properties": { "snapshot": { "type": "string", - "description": "Full path to the ETCD snapshot, that will be used instead the default one." + "description": "Full path to the ETCD snapshot, that will be used instead of the default one." }, "image": { "type": "string", diff --git a/kubemarine/templates/etcd_backup.yaml.j2 b/kubemarine/templates/etcd_backup.yaml.j2 index 837bd649b..5c25c3bd8 100644 --- a/kubemarine/templates/etcd_backup.yaml.j2 +++ b/kubemarine/templates/etcd_backup.yaml.j2 @@ -2,7 +2,7 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: etcd-backup + name: {{ storage_name }} namespace: kube-system spec: accessModes: @@ -32,12 +32,6 @@ spec: command: - /bin/sh - -c - - args: - - | - find /backup -type f -mmin +20 -name '*.db' -exec rm -- '{}' \; - command: - - /bin/sh - - -c env: - name: ETCDCTL_API value: "3" @@ -54,10 +48,11 @@ spec: - mountPath: /backup name: backup - args: - - -c - - find /backup -type f -mmin +20 -name '*.db' -exec rm -- '{}' \; + - | + find /backup -type f -mmin +{{ retention }} -name '*.db' -exec rm -- '{}' \; command: - /bin/sh + - -c image: {{ busybox_image }} imagePullPolicy: IfNotPresent name: backup-purge @@ -82,7 +77,7 @@ spec: name: etcd-certs - name: backup persistentVolumeClaim: - claimName: etcd-backup - schedule: "*/5 * * * *" + claimName: {{ storage_name }} + schedule: "{{ schedule }}" successfulJobsHistoryLimit: 3 suspend: false From c9afdf92484cdd0217fdb2ad1de5e00057ae6b46 Mon Sep 17 
00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Wed, 11 Mar 2026 15:03:24 +0300 Subject: [PATCH 06/17] feature: enabling/disabling --- kubemarine/procedures/backup.py | 6 ++++-- kubemarine/templates/etcd_backup.yaml.j2 | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/kubemarine/procedures/backup.py b/kubemarine/procedures/backup.py index 47781e5d9..b7a8534db 100755 --- a/kubemarine/procedures/backup.py +++ b/kubemarine/procedures/backup.py @@ -184,10 +184,12 @@ def export_etcd(cluster: KubernetesCluster) -> None: retention=retention) first_control_plane.put(io.StringIO(config), path_to_yaml, sudo=True, mkdir=True) if cluster.procedure_inventory['backup_plan']['etcd']['cron_job']['enabled'] : - cluster.log.debug(f'Applying {path_to_yaml} file') + cluster.log.debug(f'Enabling periodic ETCD backup') + cluster.log.verbose(f'Deleting resources from {path_to_yaml} file') first_control_plane.sudo(f'kubectl apply -f {path_to_yaml}') else: - cluster.log.debug(f'Deleting resource from {path_to_yaml} file') + cluster.log.debug(f'Disabling periodic ETCD backup') + cluster.log.verbose(f'Deleting resources from {path_to_yaml} file') first_control_plane.sudo(f'kubectl delete -f {path_to_yaml}') return diff --git a/kubemarine/templates/etcd_backup.yaml.j2 b/kubemarine/templates/etcd_backup.yaml.j2 index 5c25c3bd8..6c714b3aa 100644 --- a/kubemarine/templates/etcd_backup.yaml.j2 +++ b/kubemarine/templates/etcd_backup.yaml.j2 @@ -27,7 +27,7 @@ spec: containers: - args: - | - etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y%m%d_%H%M%S).db + etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key 
snapshot save /backup/etcd-snapshot-$(date +%Y%m%d_%H%M%S)_${RANDOM}.db chmod 0644 /backup/*.db command: - /bin/sh From 246df5da5dd9fb9de5f47afd003c7bceb09fe8c4 Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Wed, 11 Mar 2026 15:58:44 +0300 Subject: [PATCH 07/17] feature: backup docs --- documentation/Maintenance.md | 43 ++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/documentation/Maintenance.md b/documentation/Maintenance.md index bfacddc53..327860ac2 100644 --- a/documentation/Maintenance.md +++ b/documentation/Maintenance.md @@ -7,7 +7,9 @@ This section describes the features and steps for performing maintenance procedu - [Software Upgrade Patches](#software-upgrade-patches) - [Upgrade Procedure](#upgrade-procedure) - [Backup Procedure](#backup-procedure) + - [Periodic ETCD Backups](#periodic-etcd-backups) - [Restore Procedure](#restore-procedure) + - [Restoration From Periodic ETCD Backups](#restoration-from-periodic-etcd-backups) - [Add Node Procedure](#add-node-procedure) - [Operating System Migration](#operating-system-migration) - [Remove Node Procedure](#remove-node-procedure) @@ -707,6 +709,47 @@ The `backup` procedure executes the following sequence of tasks: * make_descriptor * pack +### Periodic ETCD backups + +It's possible to set up periodic ETCD backups via a CronJob.
The procedure config for that case is the following +```yaml +backup_location: '/tmp/tmp_folder' +backup_plan: + etcd: + cron_job: + enabled: true + storage_class: "local-path" + storage_name: "etcd-backup" + etcdctl_image: ghcr.io/netcracker/etcdctl:0.0.1 + busybox_image: busybox:1.37.0 + schedule: "*/5 * * * *" + storage_depth: 5 +``` + +`enabled` is a switch to create or delete the CronJob +`storage_class` is the StorageClass that is used to create a PersistentVolume for backups +`storage_name` is PersistentVolumeClaim name +`etcdctl_image` is Docker image with etcdctl and additional utilities on board +`busybox_image` is Docker image with Linux shell +`schedule` is a crontab notation schedule +`storage_depth` is a storage time in hours + +To disable the existing CronJob procedure config is the following: + +```yaml +backup_location: '/tmp/tmp_folder' +backup_plan: + etcd: + cron_job: + enabled: false +``` + +The procedure runs only the following tasks: + +* verify_backup_location +* export + * etcd ## Restore Procedure From b5544e92cb769c48a17bc030e454b823190bda00 Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Wed, 11 Mar 2026 16:11:50 +0300 Subject: [PATCH 08/17] feature: docs --- documentation/Maintenance.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/documentation/Maintenance.md b/documentation/Maintenance.md index 327860ac2..790162aae 100644 --- a/documentation/Maintenance.md +++ b/documentation/Maintenance.md @@ -735,6 +735,14 @@ backup_plan: `schedule` is a crontab notation schedule `storage_depth` is a storage time in hours +After the enabling, the CronJob must be created in `kube-system` Namespace: + +```shell +$ kubectl -n kube-system get cronjob +NAME SCHEDULE TIMEZONE SUSPEND ACTIVE LAST SCHEDULE AGE +etcd-backup */5 * * * * False 0 35s +``` + To disable the existing CronJob procedure config is the following: ```yaml @@ -832,6 +840,22 @@ The `restore` procedure
executes the following sequence of tasks: * etcd * reboot +### Restore From Periodic Backup + +To restore the existing periodic backup the procedure config should be like the following: + +```yaml +backup_location: /tmp/backups + +restore_plan: + etcd: + image: registry.k8s.io/etcd:3.6.6-0 + snapshot: /opt/local-path-provisioner/pvc-e3b0d6c5-495d-4887-90d9-000d6b3d4d00_kube-system_etcd-backup/etcd-snapshot-20260220_103000.db +``` + +**Notices**: +* Images must be chosen according to the ETCD version that has been used originally to create a backup +* Path to the snapshot must be checked preliminary ## Add Node Procedure From 18b3497027310d3ca8f3408a162f32a217fe60e8 Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Thu, 12 Mar 2026 14:47:51 +0300 Subject: [PATCH 09/17] feat: docs, last snapshot --- documentation/Maintenance.md | 10 +++++++--- kubemarine/procedures/backup.py | 1 + kubemarine/procedures/restore.py | 18 +++++++++++++++--- kubemarine/resources/schemas/backup.json | 6 ++++-- kubemarine/templates/etcd_backup.yaml.j2 | 2 +- 5 files changed, 28 insertions(+), 9 deletions(-) diff --git a/documentation/Maintenance.md b/documentation/Maintenance.md index 790162aae..09de7abb9 100644 --- a/documentation/Maintenance.md +++ b/documentation/Maintenance.md @@ -7,9 +7,7 @@ This section describes the features and steps for performing maintenance procedu - [Software Upgrade Patches](#software-upgrade-patches) - [Upgrade Procedure](#upgrade-procedure) - [Backup Procedure](#backup-procedure) - - [Periodic ETCD Backups](#periodic-etcd-backups) - [Restore Procedure](#restore-procedure) - - [Restoration From Periodic ETCD Backups](#restoration-from-periodic-etcd-backups) - [Add Node Procedure](#add-node-procedure) - [Operating System Migration](#operating-system-migration) - [Remove Node Procedure](#remove-node-procedure) @@ -721,6 +719,7 @@ backup_plan: enabled: true storage_class: "local-path" storage_name: "etcd-backup" 
+ storage_size: "50Gi" etcdctl_image: ghcr.io/netcracker/etcdctl:0.0.1 busybox_image: busybox:1.37.0 schedule: "*/5 * * * *" @@ -730,10 +729,13 @@ backup_plan: `enabled` is a switcher to create or delete the CronJob `storage_class` is StorageClasss that is used to create a PersistentVolume for backups `storage_name` is PersistentVolumeClaim name +`storage_size` is PersistentVolume size `etcdctl_image` is Docker image with etcdctl and additional utilities on board `busybox_image` is Docker image with Linux shell `schedule` is a crontab notation schedule -`storage_depth` is a storage time in days +`storage_depth` is a storage time in hours + +**Warning**: Do not use StorageClass with `reclaimPolicy: Delete` if you wat to keep snapshots after disabling periodic backups. After the enabling, the CronJob must be created in `kube-system` Namespace: @@ -743,6 +745,8 @@ NAME SCHEDULE TIMEZONE SUSPEND ACTIVE LAST SCHEDULE AGE etcd-backup */5 * * * * False 0 35s ``` +That CronJob runs two scripts periodically. The first one creates an ETCD snapshot with a name like `etcd-snapshot-20260311_114008_15743.db` on the PersistentVolume.
The second one deletes the snapshots that are older than `storage_depth` hours. + To disable the existing CronJob procedure config is the following: ```yaml diff --git a/kubemarine/procedures/backup.py b/kubemarine/procedures/backup.py index b7a8534db..41d56f0bd 100755 --- a/kubemarine/procedures/backup.py +++ b/kubemarine/procedures/backup.py @@ -179,6 +179,7 @@ def export_etcd(cluster: KubernetesCluster) -> None: etcdctl_image=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('etcdctl_image', {}), storage_class=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('storage_class', {}), storage_name=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('storage_name', {}), + storage_size=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('storage_size', {}), busybox_image=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('busybox_image', {}), schedule=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('schedule', {}), retention=retention) diff --git a/kubemarine/procedures/restore.py b/kubemarine/procedures/restore.py index 0d3bb0cc9..a83904d67 100755 --- a/kubemarine/procedures/restore.py +++ b/kubemarine/procedures/restore.py @@ -180,11 +180,23 @@ def import_etcd(cluster: KubernetesCluster) -> None: cluster.log.debug('The particular snapshot will be used') path_to_snap = cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}) first_control_plane = cluster.nodes['control-plane'].get_first_member() - # TODO: Check if snapshot exists - #first_control_plane.sudo(f'ls {path_to_snap}') + result = first_control_plane.sudo(f'file -b {path_to_snap}').get_simple_out().split('\n')[0] + ############# + cluster.log.debug(f'RES: "{result}"') + if "directory" == result : + # Getting the latest snapshot +
last_snapshot = first_control_plane.sudo(f'ls -1tr {path_to_snap} | tail -n 1').get_simple_out().split('\n')[0] + snapshot = f'{path_to_snap}/{last_snapshot}' + elif "data" == result : + # Will work with particular snapshot + snapshot = path_to_snap + else: + raise Exception("ETCD snapshot is incorrect or doesn't exist") + ############## + cluster.log.debug(f'SNAPSHOT: {snapshot}') # Copy snapshot from first control-plane node to backup_location cluster.log.debug('Coping snapshot from first control-plane node to the backup folder') - first_control_plane.get(path_to_snap, os.path.join(cluster.context['backup_tmpdir'], 'etcd.db')) + first_control_plane.get(snapshot, os.path.join(cluster.context['backup_tmpdir'], 'etcd.db')) snap_name = '/var/lib/etcd/etcd-snapshot%s.db' % int(round(time.time() * 1000)) cluster.nodes['control-plane'].put(os.path.join(cluster.context['backup_tmpdir'], 'etcd.db'), snap_name, diff --git a/kubemarine/resources/schemas/backup.json b/kubemarine/resources/schemas/backup.json index 1090646b6..85eeb81ea 100644 --- a/kubemarine/resources/schemas/backup.json +++ b/kubemarine/resources/schemas/backup.json @@ -37,6 +37,10 @@ "default": "etcd-backup", "description": "Name of the PersistentVolumeClaim" }, + "storage_size": { + "type": "string", + "description": "Size of the PersistentVolume" + }, "etcdctl_image": { "type": "string", "description": "Docker image with bash and etcdctl on board" @@ -47,13 +51,11 @@ }, "schedule": { "type": "string", - "default": "*/5 * * * *", "description": "Crontab schedule" }, "storage_depth": { "type": "integer", "minimal": 1, - "default": 5, "description": "Backups retention time(hours)" } } diff --git a/kubemarine/templates/etcd_backup.yaml.j2 b/kubemarine/templates/etcd_backup.yaml.j2 index 6c714b3aa..ca1d53733 100644 --- a/kubemarine/templates/etcd_backup.yaml.j2 +++ b/kubemarine/templates/etcd_backup.yaml.j2 @@ -10,7 +10,7 @@ spec: storageClassName: {{ storage_class }} resources: requests: - storage: 50Gi + 
storage: {{ storage_size }} --- apiVersion: batch/v1 kind: CronJob From 3c016e99e201c63443b9bfc645e595ddc0cb2336 Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Thu, 12 Mar 2026 14:49:06 +0300 Subject: [PATCH 10/17] fix --- kubemarine/procedures/restore.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kubemarine/procedures/restore.py b/kubemarine/procedures/restore.py index a83904d67..14e384363 100755 --- a/kubemarine/procedures/restore.py +++ b/kubemarine/procedures/restore.py @@ -181,8 +181,6 @@ def import_etcd(cluster: KubernetesCluster) -> None: path_to_snap = cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}) first_control_plane = cluster.nodes['control-plane'].get_first_member() result = first_control_plane.sudo(f'file -b {path_to_snap}').get_simple_out().split('\n')[0] - ############# - cluster.log.debug(f'RES: "{result}"') if "directory" == result : # Getting the latest snapshot last_snapshot = first_control_plane.sudo(f'ls -1tr {path_to_snap} | tail -n 1').get_simple_out().split('\n')[0] @@ -192,8 +190,6 @@ def import_etcd(cluster: KubernetesCluster) -> None: snapshot = path_to_snap else: raise Exception("ETCD snapshot is incorrect or doesn't exist") - ############## - cluster.log.debug(f'SNAPSHOT: {snapshot}') # Copy snapshot from first control-plane node to backup_location cluster.log.debug('Coping snapshot from first control-plane node to the backup folder') first_control_plane.get(snapshot, os.path.join(cluster.context['backup_tmpdir'], 'etcd.db')) From 9bc6caf386b8081f6076bc12361ac18830cf4504 Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Thu, 12 Mar 2026 15:27:50 +0300 Subject: [PATCH 11/17] fix --- kubemarine/procedures/backup.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kubemarine/procedures/backup.py b/kubemarine/procedures/backup.py index 
41d56f0bd..7854241a9 100755 --- a/kubemarine/procedures/backup.py +++ b/kubemarine/procedures/backup.py @@ -172,19 +172,20 @@ def export_nodes(cluster: KubernetesCluster) -> None: def export_etcd(cluster: KubernetesCluster) -> None: if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): path_to_yaml = '/tmp/etcd_backup.yaml' - retention=int(cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('storage_depth', {}))*60 + retention = int(cluster.procedure_inventory.get('backup_plan', {}).get('etcd', + {}).get('cron_job', {}).get('storage_depth', {}))*60 backup_yaml = utils.read_internal('templates/etcd_backup.yaml.j2') first_control_plane = cluster.nodes['control-plane'].get_first_member() config = Template(backup_yaml).render(hostname=first_control_plane.get_node_name(), - etcdctl_image=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('etcdctl_image', {}), - storage_class=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('storage_class', {}), - storage_name=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('storage_name', {}), - storage_size=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('storage_size', {}), - busybox_image=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('busybox_image', {}), - schedule=cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('schedule', {}), - retention=retention) + etcdctl_image=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('etcdctl_image', ''), + storage_class=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('storage_class', ''), + storage_name=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('storage_name', ''), + 
storage_size=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('storage_size', ''), + busybox_image=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('busybox_image', ''), + schedule=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('schedule', ''), + retention=retention) first_control_plane.put(io.StringIO(config), path_to_yaml, sudo=True, mkdir=True) - if cluster.procedure_inventory['backup_plan']['etcd']['cron_job']['enabled'] : + if cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('enabled', False) : cluster.log.debug(f'Enabling periodic ETCD backup') cluster.log.verbose(f'Deleting resources from {path_to_yaml} file') first_control_plane.sudo(f'kubectl apply -f {path_to_yaml}') From 02000d5ce2901c8951d0ccf6c81faf0dc4ffe58c Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Thu, 12 Mar 2026 15:40:59 +0300 Subject: [PATCH 12/17] fix: ident --- kubemarine/procedures/restore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubemarine/procedures/restore.py b/kubemarine/procedures/restore.py index 14e384363..dbbf86386 100755 --- a/kubemarine/procedures/restore.py +++ b/kubemarine/procedures/restore.py @@ -189,7 +189,7 @@ def import_etcd(cluster: KubernetesCluster) -> None: # Will work with particular snapshot snapshot = path_to_snap else: - raise Exception("ETCD snapshot is incorrect or doesn't exist") + raise Exception("ETCD snapshot is incorrect or doesn't exist") # Copy snapshot from first control-plane node to backup_location cluster.log.debug('Coping snapshot from first control-plane node to the backup folder') first_control_plane.get(snapshot, os.path.join(cluster.context['backup_tmpdir'], 'etcd.db')) From 1c8552d93b8c6f0022fcd656b5878393b460d0a9 Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Thu, 12 Mar 2026 15:48:50 +0300 Subject: [PATCH 13/17] feat: docs --- 
documentation/Maintenance.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/documentation/Maintenance.md b/documentation/Maintenance.md index 09de7abb9..c36580297 100644 --- a/documentation/Maintenance.md +++ b/documentation/Maintenance.md @@ -726,14 +726,14 @@ backup_plan: storage_depth: 5 ``` -`enabled` is a switcher to create or delete the CronJob -`storage_class` is StorageClasss that is used to create a PersistentVolume for backups -`storage_name` is PersistentVolumeClaim name -`storage_size` is PersistentVolume size -`etcdctl_image` is Docker image with etcdctl and additional utilities on board -`busybox_image` is Docker image with Linux shell -`schedule` is a crontab notation schedule -`storage_depth` is a storage time in hours +* `enabled` is a switch that creates or deletes the CronJob +* `storage_class` is the StorageClass that is used to create a PersistentVolume for backups +* `storage_name` is the PersistentVolumeClaim name +* `storage_size` is the PersistentVolume size +* `etcdctl_image` is a Docker image with etcdctl and additional utilities on board +* `busybox_image` is a Docker image with a Linux shell +* `schedule` is the schedule in crontab notation +* `storage_depth` is the snapshot retention time in hours **Warning**: Do not use StorageClass with `reclaimPolicy: Delete` if you wat to keep snapshots after disabling periodic backups. @@ -757,7 +757,7 @@ backup_plan: enabled: false ``` -The procedure runs only the following tasks: +The procedure runs only the following tasks(the others rtasks are skipped be default): * verify_backup_location * export @@ -858,8 +858,8 @@ restore_plan: ``` **Notices**: -* Images must be chosen according to the ETCD version that has been used originally to create a backup -* Path to the snapshot must be checked preliminary +* Images must be chosen according to the ETCD version that was originally used to create the backup. +* The path to the snapshot may also point to just a folder.
The latest snapshot will be used in that case. ## Add Node Procedure From a15fd579dc33663b57eee5c8faaf5af4ed0ebca8 Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Fri, 13 Mar 2026 13:37:52 +0300 Subject: [PATCH 14/17] comments; fix --- kubemarine/procedures/backup.py | 1 + kubemarine/procedures/restore.py | 7 +------ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/kubemarine/procedures/backup.py b/kubemarine/procedures/backup.py index 7854241a9..9a6ed1a8c 100755 --- a/kubemarine/procedures/backup.py +++ b/kubemarine/procedures/backup.py @@ -171,6 +171,7 @@ def export_nodes(cluster: KubernetesCluster) -> None: def export_etcd(cluster: KubernetesCluster) -> None: if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + # Applying CronJob and exit path_to_yaml = '/tmp/etcd_backup.yaml' retention = int(cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}).get('storage_depth', {}))*60 diff --git a/kubemarine/procedures/restore.py b/kubemarine/procedures/restore.py index dbbf86386..869c3cc89 100755 --- a/kubemarine/procedures/restore.py +++ b/kubemarine/procedures/restore.py @@ -71,11 +71,6 @@ def unpack_data(resources: DynamicResources) -> None: backup_file_source = utils.get_external_resource_path(backup_file_source) - if resources.procedure_inventory().get('restore_plan', {}).get('etcd', {}).get('snapshot', {}): - if not os.path.isdir(backup_file_source): - raise FileNotFoundError('Backup location "%s" not found' % backup_file_source) - return - if not os.path.isfile(backup_file_source): raise FileNotFoundError('Backup file "%s" not found' % backup_file_source) @@ -186,7 +181,7 @@ def import_etcd(cluster: KubernetesCluster) -> None: last_snapshot = first_control_plane.sudo(f'ls -1tr {path_to_snap} | tail -n 1').get_simple_out().split('\n')[0] snapshot = f'{path_to_snap}/{last_snapshot}' elif "data" == result : - # Will work with 
particular snapshot + # Getting the particular snapshot snapshot = path_to_snap else: raise Exception("ETCD snapshot is incorrect or doesn't exist") From 7ab47d4e765966e3384771c936efef9c6b91e48f Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Fri, 13 Mar 2026 13:39:11 +0300 Subject: [PATCH 15/17] empty string --- kubemarine/procedures/restore.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kubemarine/procedures/restore.py b/kubemarine/procedures/restore.py index 869c3cc89..48d3f3064 100755 --- a/kubemarine/procedures/restore.py +++ b/kubemarine/procedures/restore.py @@ -70,7 +70,6 @@ def unpack_data(resources: DynamicResources) -> None: raise Exception('Backup source not specified in procedure') backup_file_source = utils.get_external_resource_path(backup_file_source) - if not os.path.isfile(backup_file_source): raise FileNotFoundError('Backup file "%s" not found' % backup_file_source) From 8981b8a5dd05c9a81d72e4e11454e33c29558060 Mon Sep 17 00:00:00 2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Fri, 13 Mar 2026 13:42:31 +0300 Subject: [PATCH 16/17] comment --- kubemarine/procedures/restore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubemarine/procedures/restore.py b/kubemarine/procedures/restore.py index 48d3f3064..a570bbaae 100755 --- a/kubemarine/procedures/restore.py +++ b/kubemarine/procedures/restore.py @@ -184,7 +184,7 @@ def import_etcd(cluster: KubernetesCluster) -> None: snapshot = path_to_snap else: raise Exception("ETCD snapshot is incorrect or doesn't exist") - # Copy snapshot from first control-plane node to backup_location + # Copying snapshot from first control-plane node to backup_location cluster.log.debug('Coping snapshot from first control-plane node to the backup folder') first_control_plane.get(snapshot, os.path.join(cluster.context['backup_tmpdir'], 'etcd.db')) From d869831d07dd68874c5d34bd5d51d1719e068fba Mon Sep 17 00:00:00 
2001 From: Aleksandr Arefev <39635005+alexarefev@users.noreply.github.com> Date: Sat, 14 Mar 2026 10:13:05 +0300 Subject: [PATCH 17/17] docs --- documentation/Maintenance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/Maintenance.md b/documentation/Maintenance.md index c36580297..c417d2470 100644 --- a/documentation/Maintenance.md +++ b/documentation/Maintenance.md @@ -757,7 +757,7 @@ backup_plan: enabled: false ``` -The procedure runs only the following tasks(the others rtasks are skipped be default): +The procedure runs only the following tasks (the other tasks are skipped by default): * verify_backup_location * export