diff --git a/documentation/Maintenance.md b/documentation/Maintenance.md index bfacddc53..c417d2470 100644 --- a/documentation/Maintenance.md +++ b/documentation/Maintenance.md @@ -707,6 +707,61 @@ The `backup` procedure executes the following sequence of tasks: * make_descriptor * pack +### Periodic ETCD backups + +It's possible to set the periodic ETCD backups via CronJob. The procedure config for that case is the following: + +```yaml +backup_location: '/tmp/tmp_folder' +backup_plan: + etcd: + cron_job: + enabled: true + storage_class: "local-path" + storage_name: "etcd-backup" + storage_size: "50Gi" + etcdctl_image: ghcr.io/netcracker/etcdctl:0.0.1 + busybox_image: busybox:1.37.0 + schedule: "*/5 * * * *" + storage_depth: 5 +``` + +* `enabled` is a switcher to create or delete the CronJob +* `storage_class` is StorageClass that is used to create a PersistentVolume for backups +* `storage_name` is PersistentVolumeClaim name +* `storage_size` is PersistentVolume size +* `etcdctl_image` is Docker image with etcdctl and additional utilities on board +* `busybox_image` is Docker image with Linux shell +* `schedule` is a crontab notation schedule +* `storage_depth` is a storage time in hours + +**Warning**: Do not use StorageClass with `reclaimPolicy: Delete` if you want to keep snapshots after disabling periodic backups. + +After the enabling, the CronJob must be created in `kube-system` Namespace: + +```shell +$ kubectl -n kube-system get cronjob +NAME SCHEDULE TIMEZONE SUSPEND ACTIVE LAST SCHEDULE AGE +etcd-backup */5 * * * * False 0 35s +``` + +That CronJob runs two scripts periodically. The first one creates an ETCD snapshot with a name like `etcd-snapshot-20260311_114008_15743.db` on PersistentVolume. 
The second one deletes the snapshots older than `storage_depth` + +To disable the existing CronJob, the procedure config is the following: + +```yaml +backup_location: '/tmp/tmp_folder' +backup_plan: + etcd: + cron_job: + enabled: false +``` + +The procedure runs only the following tasks (the other tasks are skipped by default): + +* verify_backup_location +* export + * etcd ## Restore Procedure @@ -789,6 +844,22 @@ The `restore` procedure executes the following sequence of tasks: * etcd * reboot +### Restore From Periodic Backup + +To restore the existing periodic backup, the procedure config should be like the following: + +```yaml +backup_location: /tmp/backups + +restore_plan: + etcd: + image: registry.k8s.io/etcd:3.6.6-0 + snapshot: /opt/local-path-provisioner/pvc-e3b0d6c5-495d-4887-90d9-000d6b3d4d00_kube-system_etcd-backup/etcd-snapshot-20260220_103000.db +``` + +**Notices**: +* Images must be chosen according to the ETCD version that has been used originally to create a backup. +* Path to the snapshot can also be a folder. The latest snapshot will be used in that case. 
## Add Node Procedure diff --git a/kubemarine/procedures/backup.py b/kubemarine/procedures/backup.py index 48c588605..9a6ed1a8c 100755 --- a/kubemarine/procedures/backup.py +++ b/kubemarine/procedures/backup.py @@ -22,10 +22,12 @@ import threading import time import uuid +import io from collections import OrderedDict from concurrent.futures import ThreadPoolExecutor from queue import Queue, Empty from typing import List, Tuple, Union, Dict, Optional, Iterator, Literal +from jinja2 import Template import yaml @@ -83,6 +85,8 @@ def prepare_backup_tmpdir(logger: log.EnhancedLogger, context: dict) -> str: def verify_backup_location(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return target = utils.get_external_resource_path(cluster.procedure_inventory.get('backup_location', 'backup.tar.gz')) if not os.path.isdir(target) and not os.path.isdir(os.path.abspath(os.path.join(target, os.pardir))): raise FileNotFoundError('Backup location directory not exists') @@ -96,6 +100,8 @@ def export_ansible_inventory(cluster: KubernetesCluster) -> None: def export_packages_list(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return cluster.context['backup_descriptor']['nodes']['packages'] = {} if cluster.get_os_family() in ['rhel8', 'rhel9']: cmd = r"rpm -qa" @@ -107,6 +113,8 @@ def export_packages_list(cluster: KubernetesCluster) -> None: def export_hostname(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return cluster.context['backup_descriptor']['nodes']['hostnames'] = {} results = cluster.nodes['all'].sudo('hostnamectl status | head -n 1 | sed -e \'s/[a-zA-Z ]*://g\'') cluster.log.verbose(results) @@ -115,6 +123,8 @@ def export_hostname(cluster: KubernetesCluster) -> None: def export_cluster_yaml(cluster: KubernetesCluster) -> 
None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context) shutil.copyfile(utils.get_dump_filepath(cluster.context, 'cluster.yaml'), os.path.join(backup_directory, 'cluster.yaml')) @@ -124,6 +134,8 @@ def export_cluster_yaml(cluster: KubernetesCluster) -> None: def export_nodes(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context) backup_nodes_data_dir = os.path.join(backup_directory, 'nodes_data') os.mkdir(backup_nodes_data_dir) @@ -158,6 +170,32 @@ def export_nodes(cluster: KubernetesCluster) -> None: def export_etcd(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + # Applying CronJob and exit + path_to_yaml = '/tmp/etcd_backup.yaml' + retention = int(cluster.procedure_inventory.get('backup_plan', {}).get('etcd', + {}).get('cron_job', {}).get('storage_depth', {}))*60 + backup_yaml = utils.read_internal('templates/etcd_backup.yaml.j2') + first_control_plane = cluster.nodes['control-plane'].get_first_member() + config = Template(backup_yaml).render(hostname=first_control_plane.get_node_name(), + etcdctl_image=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('etcdctl_image', ''), + storage_class=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('storage_class', ''), + storage_name=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('storage_name', ''), + storage_size=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('storage_size', ''), + busybox_image=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('busybox_image', ''), + schedule=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('schedule', ''), + 
retention=retention) + first_control_plane.put(io.StringIO(config), path_to_yaml, sudo=True, mkdir=True) + if cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('enabled', False) : + cluster.log.debug(f'Enabling periodic ETCD backup') + cluster.log.verbose(f'Applying resources from {path_to_yaml} file') + first_control_plane.sudo(f'kubectl apply -f {path_to_yaml}') + else: + cluster.log.debug(f'Disabling periodic ETCD backup') + cluster.log.verbose(f'Deleting resources from {path_to_yaml} file') + first_control_plane.sudo(f'kubectl delete -f {path_to_yaml}') + return + backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context) etcd_node, is_custom_etcd_node = select_etcd_node(cluster) cluster.context['backup_descriptor']['etcd']['image'] = retrieve_etcd_image(etcd_node) @@ -239,6 +277,8 @@ def retrieve_etcd_image(etcd_node: NodeGroup) -> str: def export_kubernetes_version(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return control_plane = cluster.nodes['control-plane'].get_any_member() version = control_plane.sudo('kubectl get nodes --no-headers | head -n 1 | awk \'{print $5; exit}\'').get_simple_out() cluster.context['backup_descriptor']['kubernetes']['version'] = version.strip() @@ -584,6 +624,8 @@ def _download(self, task: DownloaderPayload, temp_local_filepath: str) -> None: def export_kubernetes(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context) control_plane = cluster.nodes['control-plane'].get_any_member() backup_kubernetes = cluster.procedure_inventory.get('backup_plan', {}).get('kubernetes', {}) @@ -709,6 +751,8 @@ def _graceful_shutdown_downloaders() -> Optional[BaseException]: def make_descriptor(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', 
{}).get('etcd', {}).get('cron_job', {}): + return backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context) cluster.context['backup_descriptor']['kubernetes']['thirdparties'] = cluster.inventory['services']['thirdparties'] @@ -719,6 +763,8 @@ def make_descriptor(cluster: KubernetesCluster) -> None: def pack_data(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}): + return cluster_name = cluster.inventory['cluster_name'] backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context) diff --git a/kubemarine/procedures/restore.py b/kubemarine/procedures/restore.py index 211f76f50..a570bbaae 100755 --- a/kubemarine/procedures/restore.py +++ b/kubemarine/procedures/restore.py @@ -118,6 +118,8 @@ def import_nodes_data(cluster: KubernetesCluster) -> None: def restore_dns_resolv_conf(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}): + return import_nodes_data(cluster) unpack_cmd = "sudo tar xzvf /tmp/kubemarine-backup.tar.gz -C / --overwrite /etc/resolv.conf" @@ -130,10 +132,14 @@ def restore_dns_resolv_conf(cluster: KubernetesCluster) -> None: def restore_thirdparties(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}): + return install.system_prepare_thirdparties(cluster) def import_nodes(cluster: KubernetesCluster) -> None: + if cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}): + return if not cluster.is_task_completed('restore.dns.resolv_conf'): import_nodes_data(cluster) @@ -163,6 +169,25 @@ def import_etcd(cluster: KubernetesCluster) -> None: cluster.log.verbose('ETCD will be restored from the following image: ' + etcd_image) cluster.log.debug('Uploading ETCD snapshot...') + # Custom path to ETCD snapshot + if cluster.procedure_inventory.get('restore_plan', {}).get('etcd', 
{}).get('snapshot', {}): + cluster.log.debug('The particular snapshot will be used') + path_to_snap = cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}) + first_control_plane = cluster.nodes['control-plane'].get_first_member() + result = first_control_plane.sudo(f'file -b {path_to_snap}').get_simple_out().split('\n')[0] + if "directory" == result : + # Getting the latest snapshot + last_snapshot = first_control_plane.sudo(f'ls -1tr {path_to_snap} | tail -n 1').get_simple_out().split('\n')[0] + snapshot = f'{path_to_snap}/{last_snapshot}' + elif "data" == result : + # Getting the particular snapshot + snapshot = path_to_snap + else: + raise Exception("ETCD snapshot is incorrect or doesn't exist") + # Copying snapshot from first control-plane node to backup_location + cluster.log.debug('Copying snapshot from first control-plane node to the backup folder') + first_control_plane.get(snapshot, os.path.join(cluster.context['backup_tmpdir'], 'etcd.db')) + snap_name = '/var/lib/etcd/etcd-snapshot%s.db' % int(round(time.time() * 1000)) cluster.nodes['control-plane'].put(os.path.join(cluster.context['backup_tmpdir'], 'etcd.db'), snap_name, sudo=True, compare_hashes=True) diff --git a/kubemarine/resources/schemas/backup.json b/kubemarine/resources/schemas/backup.json index 1d4bcf82f..85eeb81ea 100644 --- a/kubemarine/resources/schemas/backup.json +++ b/kubemarine/resources/schemas/backup.json @@ -18,7 +18,48 @@ "source_node": { "$ref": "definitions/common/node_ref.json#/definitions/Name", "description": "The name of the node to create a snapshot from. The node must be a control-plane and have an ETCD data located on it." 
- } + }, + "cron_job": { + "type": "object", + "description": "Create the CronJob for regular ETCD snapshots", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enabling CronJob" + }, + "storage_class": { + "type": "string", + "default": "local-path", + "description": "StorageClass that will be used to create PersistentVolumeClaim" + }, + "storage_name": { + "type": "string", + "default": "etcd-backup", + "description": "Name of the PersistentVolumeClaim" + }, + "storage_size": { + "type": "string", + "description": "Size of the PersistentVolume" + }, + "etcdctl_image": { + "type": "string", + "description": "Docker image with bash and etcdctl on board" + }, + "busybox_image": { + "type": "string", + "description": "Docker image with find util on board" + }, + "schedule": { + "type": "string", + "description": "Crontab schedule" + }, + "storage_depth": { + "type": "integer", + "minimum": 1, + "description": "Backups retention time (hours)" + } + } + } }, "additionalProperties": false }, diff --git a/kubemarine/resources/schemas/restore.json b/kubemarine/resources/schemas/restore.json index 5ce497fd8..87573773c 100644 --- a/kubemarine/resources/schemas/restore.json +++ b/kubemarine/resources/schemas/restore.json @@ -14,6 +14,10 @@ "type": "object", "description": "Additional parameters for ETCD restore", "properties": { + "snapshot": { + "type": "string", + "description": "Full path to the ETCD snapshot that will be used instead of the default one." + }, "image": { "type": "string", "description": "Full name of the ETCD image, including the registry address. On its basis, the restoration is performed." 
diff --git a/kubemarine/templates/etcd_backup.yaml.j2 b/kubemarine/templates/etcd_backup.yaml.j2 new file mode 100644 index 000000000..ca1d53733 --- /dev/null +++ b/kubemarine/templates/etcd_backup.yaml.j2 @@ -0,0 +1,83 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ storage_name }} + namespace: kube-system +spec: + accessModes: + - ReadWriteOnce + storageClassName: {{ storage_class }} + resources: + requests: + storage: {{ storage_size }} +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: etcd-backup + namespace: kube-system +spec: + concurrencyPolicy: Allow + failedJobsHistoryLimit: 1 + jobTemplate: + spec: + template: + spec: + containers: + - args: + - | + etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt --key=/etc/kubernetes/pki/etcd/healthcheck-client.key snapshot save /backup/etcd-snapshot-$(date +%Y%m%d_%H%M%S)_${RANDOM}.db + chmod 0644 /backup/*.db + command: + - /bin/sh + - -c + env: + - name: ETCDCTL_API + value: "3" + image: {{ etcdctl_image }} + imagePullPolicy: IfNotPresent + name: backup + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /etc/kubernetes/pki/etcd + name: etcd-certs + readOnly: true + - mountPath: /backup + name: backup + - args: + - | + find /backup -type f -mmin +{{ retention }} -name '*.db' -exec rm -- '{}' \; + command: + - /bin/sh + - -c + image: {{ busybox_image }} + imagePullPolicy: IfNotPresent + name: backup-purge + volumeMounts: + - mountPath: /backup + name: backup + dnsPolicy: ClusterFirst + hostNetwork: true + nodeSelector: + kubernetes.io/hostname: {{ hostname }} + restartPolicy: OnFailure + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoSchedule + operator: Exists + volumes: + - hostPath: + path: /etc/kubernetes/pki/etcd + type: DirectoryOrCreate + 
name: etcd-certs + - name: backup + persistentVolumeClaim: + claimName: {{ storage_name }} + schedule: "{{ schedule }}" + successfulJobsHistoryLimit: 3 + suspend: false