71 changes: 71 additions & 0 deletions documentation/Maintenance.md
@@ -707,6 +707,61 @@ The `backup` procedure executes the following sequence of tasks:
* make_descriptor
* pack

### Periodic ETCD backups
> **Reviewer comment (Collaborator):** To enable periodic backups, we need to run a separate `backup` procedure. The problem here is that the original goal of the `backup` procedure (as I understand it) is to actually back up data, not to install a backup job that will perform backups in the future. It seems strange to use `backup` to perform an install. If we need clusters to be installed with etcd fsync disabled, and we recommend enabling the backup job when fsync is disabled, it would be safer to install the backup job right in the `install` procedure (where we disable fsync).
>
> The provisioner/StorageClass does not need to be present immediately; it can be installed later, and once installed it can provision the volume for the job.


It is possible to set up periodic ETCD backups via a CronJob. The procedure config for that case is the following:

```yaml
backup_location: '/tmp/tmp_folder'
backup_plan:
etcd:
cron_job:
enabled: true
storage_class: "local-path"
storage_name: "etcd-backup"
storage_size: "50Gi"
etcdctl_image: ghcr.io/netcracker/etcdctl:0.0.1
busybox_image: busybox:1.37.0
schedule: "*/5 * * * *"
storage_depth: 5
```

* `enabled` is a switch that creates or deletes the CronJob
* `storage_class` is the StorageClass used to create a PersistentVolume for backups
* `storage_name` is the PersistentVolumeClaim name
* `storage_size` is the PersistentVolume size
* `etcdctl_image` is a Docker image with `etcdctl` and additional utilities on board
* `busybox_image` is a Docker image with a Linux shell
* `schedule` is a schedule in crontab notation
* `storage_depth` is the snapshot retention time in hours
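For orientation, the resources rendered from the `templates/etcd_backup.yaml.j2` template might look roughly like the sketch below. This is a hypothetical reconstruction from the parameters above, not the actual template from the repository; details such as TLS certificate mounts for `etcdctl` are omitted.

```yaml
# Hypothetical sketch of the rendered resources; the real template may differ.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: etcd-backup              # storage_name
  namespace: kube-system
spec:
  storageClassName: local-path   # storage_class
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 50Gi              # storage_size
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: etcd-backup
  namespace: kube-system
spec:
  schedule: "*/5 * * * *"        # schedule
  jobTemplate:
    spec:
      template:
        spec:
          nodeName: <first-control-plane>  # rendered from the first control plane's hostname
          restartPolicy: OnFailure
          containers:
            - name: snapshot               # creates a timestamped snapshot on the volume
              image: ghcr.io/netcracker/etcdctl:0.0.1   # etcdctl_image
              command: ["sh", "-c",
                        "etcdctl snapshot save /backup/etcd-snapshot-$(date +%Y%m%d_%H%M%S_%N).db"]
              volumeMounts: [{name: backup, mountPath: /backup}]
            - name: cleanup                # deletes snapshots older than storage_depth hours
              image: busybox:1.37.0        # busybox_image
              command: ["sh", "-c",
                        "find /backup -name 'etcd-snapshot-*.db' -mmin +300 -delete"]  # retention = storage_depth * 60
              volumeMounts: [{name: backup, mountPath: /backup}]
          volumes:
            - name: backup
              persistentVolumeClaim:
                claimName: etcd-backup     # storage_name
```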

**Warning**: Do not use a StorageClass with `reclaimPolicy: Delete` if you want to keep snapshots after disabling periodic backups.

After enabling, the CronJob is created in the `kube-system` namespace:

```shell
$ kubectl -n kube-system get cronjob
NAME SCHEDULE TIMEZONE SUSPEND ACTIVE LAST SCHEDULE AGE
etcd-backup */5 * * * * <none> False 0 <none> 35s
```

That CronJob periodically runs two scripts. The first one creates an ETCD snapshot with a name like `etcd-snapshot-20260311_114008_15743.db` on the PersistentVolume. The second one deletes snapshots older than `storage_depth`.
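The cleanup step described above can be sketched as a `find` invocation: the procedure converts `storage_depth` from hours to minutes, and files older than that are deleted. The directory and variable names below are illustrative, not the exact script baked into the image.

```shell
# Illustrative cleanup: delete snapshots older than storage_depth hours.
# BACKUP_DIR and STORAGE_DEPTH_HOURS are hypothetical names for this sketch.
BACKUP_DIR=/backup
STORAGE_DEPTH_HOURS=5
RETENTION_MINUTES=$((STORAGE_DEPTH_HOURS * 60))
find "$BACKUP_DIR" -name 'etcd-snapshot-*.db' -mmin +"$RETENTION_MINUTES" -delete
```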

To disable the existing CronJob, the procedure config is the following:

```yaml
backup_location: '/tmp/tmp_folder'
backup_plan:
etcd:
cron_job:
enabled: false
```

The procedure runs only the following tasks (the other tasks are skipped by default):

* verify_backup_location
* export
* etcd

## Restore Procedure

Expand Down Expand Up @@ -789,6 +844,22 @@ The `restore` procedure executes the following sequence of tasks:
* etcd
* reboot

### Restore From Periodic Backup

To restore from an existing periodic backup, the procedure config should look like the following:

```yaml
backup_location: /tmp/backups

restore_plan:
etcd:
image: registry.k8s.io/etcd:3.6.6-0
snapshot: /opt/local-path-provisioner/pvc-e3b0d6c5-495d-4887-90d9-000d6b3d4d00_kube-system_etcd-backup/etcd-snapshot-20260220_103000.db
```

> **Reviewer comment (Collaborator):** What if some other provisioner is used, e.g. a network-attached one like NFS? In this case there will be no directory on the host by default, since the volume is not mounted. Do we expect users to mount the volume on their own and find the mount point?
>
> Maybe we could use an additional "backup download" job, run only during restore, which would mount the backup volume and copy the latest backup to a well-known node directory. Then Kubemarine could take the backup from this well-known location on that particular host.

**Notes**:
* Images must be chosen according to the ETCD version that was originally used to create the backup.
* The snapshot path may also point to a folder; in that case, the latest snapshot in the folder is used.
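When the snapshot path points to a folder, the newest file by modification time is selected (an `ls -1tr | tail -n 1` style lookup, as in the restore code below). A minimal sketch, with an illustrative directory name:

```shell
# Illustrative selection of the newest snapshot in a folder.
# SNAPSHOT_DIR is a hypothetical path for this sketch.
SNAPSHOT_DIR=/opt/local-path-provisioner/etcd-backup
latest=$(ls -1tr "$SNAPSHOT_DIR" | tail -n 1)
echo "$SNAPSHOT_DIR/$latest"
```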

## Add Node Procedure

46 changes: 46 additions & 0 deletions kubemarine/procedures/backup.py
@@ -22,10 +22,12 @@
import threading
import time
import uuid
import io
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor
from queue import Queue, Empty
from typing import List, Tuple, Union, Dict, Optional, Iterator, Literal
from jinja2 import Template

import yaml

@@ -83,6 +85,8 @@ def prepare_backup_tmpdir(logger: log.EnhancedLogger, context: dict) -> str:


def verify_backup_location(cluster: KubernetesCluster) -> None:
if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}):
return
target = utils.get_external_resource_path(cluster.procedure_inventory.get('backup_location', 'backup.tar.gz'))
if not os.path.isdir(target) and not os.path.isdir(os.path.abspath(os.path.join(target, os.pardir))):
raise FileNotFoundError('Backup location directory not exists')
@@ -96,6 +100,8 @@ def export_ansible_inventory(cluster: KubernetesCluster) -> None:


def export_packages_list(cluster: KubernetesCluster) -> None:
if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}):
return
cluster.context['backup_descriptor']['nodes']['packages'] = {}
if cluster.get_os_family() in ['rhel8', 'rhel9']:
cmd = r"rpm -qa"
@@ -107,6 +113,8 @@ def export_hostname(cluster: KubernetesCluster) -> None:


def export_hostname(cluster: KubernetesCluster) -> None:
if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}):
return
cluster.context['backup_descriptor']['nodes']['hostnames'] = {}
results = cluster.nodes['all'].sudo('hostnamectl status | head -n 1 | sed -e \'s/[a-zA-Z ]*://g\'')
cluster.log.verbose(results)
@@ -115,6 +123,8 @@ def export_cluster_yaml(cluster: KubernetesCluster) -> None:


def export_cluster_yaml(cluster: KubernetesCluster) -> None:
if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}):
return
backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context)
shutil.copyfile(utils.get_dump_filepath(cluster.context, 'cluster.yaml'),
os.path.join(backup_directory, 'cluster.yaml'))
@@ -124,6 +134,8 @@ def export_nodes(cluster: KubernetesCluster) -> None:


def export_nodes(cluster: KubernetesCluster) -> None:
if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}):
return
backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context)
backup_nodes_data_dir = os.path.join(backup_directory, 'nodes_data')
os.mkdir(backup_nodes_data_dir)
@@ -158,6 +170,32 @@ def export_nodes(cluster: KubernetesCluster) -> None:


def export_etcd(cluster: KubernetesCluster) -> None:
if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}):
# Applying CronJob and exit
path_to_yaml = '/tmp/etcd_backup.yaml'
retention = int(cluster.procedure_inventory.get('backup_plan', {}).get('etcd',
{}).get('cron_job', {}).get('storage_depth', 1)) * 60
backup_yaml = utils.read_internal('templates/etcd_backup.yaml.j2')
first_control_plane = cluster.nodes['control-plane'].get_first_member()
config = Template(backup_yaml).render(hostname=first_control_plane.get_node_name(),
etcdctl_image=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('etcdctl_image', ''),
storage_class=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('storage_class', ''),
storage_name=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('storage_name', ''),
storage_size=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('storage_size', ''),
busybox_image=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('busybox_image', ''),
schedule=cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('schedule', ''),
retention=retention)
first_control_plane.put(io.StringIO(config), path_to_yaml, sudo=True, mkdir=True)
if cluster.procedure_inventory['backup_plan']['etcd']['cron_job'].get('enabled', False):
cluster.log.debug('Enabling periodic ETCD backup')
cluster.log.verbose(f'Applying resources from {path_to_yaml} file')
first_control_plane.sudo(f'kubectl apply -f {path_to_yaml}')
else:
cluster.log.debug('Disabling periodic ETCD backup')
cluster.log.verbose(f'Deleting resources from {path_to_yaml} file')
first_control_plane.sudo(f'kubectl delete -f {path_to_yaml}')
return

backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context)
etcd_node, is_custom_etcd_node = select_etcd_node(cluster)
cluster.context['backup_descriptor']['etcd']['image'] = retrieve_etcd_image(etcd_node)
@@ -239,6 +277,8 @@ def retrieve_etcd_image(etcd_node: NodeGroup) -> str:


def export_kubernetes_version(cluster: KubernetesCluster) -> None:
if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}):
return
control_plane = cluster.nodes['control-plane'].get_any_member()
version = control_plane.sudo('kubectl get nodes --no-headers | head -n 1 | awk \'{print $5; exit}\'').get_simple_out()
cluster.context['backup_descriptor']['kubernetes']['version'] = version.strip()
@@ -584,6 +624,8 @@ def _download(self, task: DownloaderPayload, temp_local_filepath: str) -> None:


def export_kubernetes(cluster: KubernetesCluster) -> None:
if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}):
return
backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context)
control_plane = cluster.nodes['control-plane'].get_any_member()
backup_kubernetes = cluster.procedure_inventory.get('backup_plan', {}).get('kubernetes', {})
@@ -709,6 +751,8 @@ def _graceful_shutdown_downloaders() -> Optional[BaseException]:


def make_descriptor(cluster: KubernetesCluster) -> None:
if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}):
return
backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context)

cluster.context['backup_descriptor']['kubernetes']['thirdparties'] = cluster.inventory['services']['thirdparties']
@@ -719,6 +763,8 @@ def make_descriptor(cluster: KubernetesCluster) -> None:


def pack_data(cluster: KubernetesCluster) -> None:
if cluster.procedure_inventory.get('backup_plan', {}).get('etcd', {}).get('cron_job', {}):
return
cluster_name = cluster.inventory['cluster_name']
backup_directory = prepare_backup_tmpdir(cluster.log, cluster.context)

25 changes: 25 additions & 0 deletions kubemarine/procedures/restore.py
@@ -118,6 +118,8 @@ def import_nodes_data(cluster: KubernetesCluster) -> None:


def restore_dns_resolv_conf(cluster: KubernetesCluster) -> None:
if cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}):
return
import_nodes_data(cluster)

unpack_cmd = "sudo tar xzvf /tmp/kubemarine-backup.tar.gz -C / --overwrite /etc/resolv.conf"
@@ -130,10 +132,14 @@ def restore_dns_resolv_conf(cluster: KubernetesCluster) -> None:


def restore_thirdparties(cluster: KubernetesCluster) -> None:
if cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}):
return
install.system_prepare_thirdparties(cluster)


def import_nodes(cluster: KubernetesCluster) -> None:
if cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}):
return
if not cluster.is_task_completed('restore.dns.resolv_conf'):
import_nodes_data(cluster)

@@ -163,6 +169,25 @@ def import_etcd(cluster: KubernetesCluster) -> None:
cluster.log.verbose('ETCD will be restored from the following image: ' + etcd_image)

cluster.log.debug('Uploading ETCD snapshot...')
# Custom path to ETCD snapshot
if cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {}):
cluster.log.debug('The particular snapshot will be used')
path_to_snap = cluster.procedure_inventory.get('restore_plan', {}).get('etcd', {}).get('snapshot', {})
first_control_plane = cluster.nodes['control-plane'].get_first_member()
> **Reviewer comment (Collaborator):** We probably should not assume that the backup will be present on the first master; e.g. the local-path provisioner could create the volume on another node. Maybe using a "download backup" job (and checking on which node it ran) would be better.

result = first_control_plane.sudo(f'file -b {path_to_snap}').get_simple_out().split('\n')[0]
if result == "directory":
# Getting the latest snapshot
last_snapshot = first_control_plane.sudo(f'ls -1tr {path_to_snap} | tail -n 1').get_simple_out().split('\n')[0]
snapshot = f'{path_to_snap}/{last_snapshot}'
elif result == "data":
# Getting the particular snapshot
snapshot = path_to_snap
else:
raise Exception("ETCD snapshot is incorrect or doesn't exist")
# Copying snapshot from first control-plane node to backup_location
cluster.log.debug('Copying snapshot from the first control-plane node to the backup folder')
first_control_plane.get(snapshot, os.path.join(cluster.context['backup_tmpdir'], 'etcd.db'))

snap_name = '/var/lib/etcd/etcd-snapshot%s.db' % int(round(time.time() * 1000))
cluster.nodes['control-plane'].put(os.path.join(cluster.context['backup_tmpdir'], 'etcd.db'), snap_name,
sudo=True, compare_hashes=True)
43 changes: 42 additions & 1 deletion kubemarine/resources/schemas/backup.json
@@ -18,7 +18,48 @@
"source_node": {
"$ref": "definitions/common/node_ref.json#/definitions/Name",
"description": "The name of the node to create a snapshot from. The node must be a control-plane and have an ETCD data located on it."
}
},
"cron_job": {
"type": "object",
"description": "Create the CronJob for regular ETCD snapshots",
"properties": {
"enabled": {
"type": "boolean",
"description": "Whether to create (true) or delete (false) the CronJob"
},
"storage_class": {
"type": "string",
"default": "local-path",
"description": "StorageClass that will be used to create PersistentVolumeClaim"
},
"storage_name": {
"type": "string",
"default": "etcd-backup",
"description": "Name of the PersistentVolumeClaim"
},
"storage_size": {
"type": "string",
"description": "Size of the PersistentVolume"
},
"etcdctl_image": {
"type": "string",
"description": "Docker image with bash and etcdctl on board"
},
"busybox_image": {
"type": "string",
"description": "Docker image with find util on board"
},
"schedule": {
"type": "string",
"description": "Crontab schedule"
},
"storage_depth": {
"type": "integer",
"minimum": 1,
"description": "Backup retention time (hours)"
}
}
}
},
"additionalProperties": false
},
4 changes: 4 additions & 0 deletions kubemarine/resources/schemas/restore.json
@@ -14,6 +14,10 @@
"type": "object",
"description": "Additional parameters for ETCD restore",
"properties": {
"snapshot": {
"type": "string",
"description": "Full path to the ETCD snapshot that will be used instead of the default one."
},
"image": {
"type": "string",
"description": "Full name of the ETCD image, including the registry address. On its basis, the restoration is performed."