From 364add73e0ae6a9e43d633c3631e9f3545b5bf24 Mon Sep 17 00:00:00 2001 From: Tom-Byrne Date: Fri, 5 Jun 2020 10:34:49 +0100 Subject: [PATCH 01/15] added cephFS backup script that uses the recursive directory stats * it uses the rfiles and rbytes to make sure individual rsyncs are not too large * it uses the rctime of the source CephFS to only backup directories modified after a specified time * it currently prints out rsync commands to stdout --- cephfs/recursive-backup/recursive-backup.py | 99 +++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 cephfs/recursive-backup/recursive-backup.py diff --git a/cephfs/recursive-backup/recursive-backup.py b/cephfs/recursive-backup/recursive-backup.py new file mode 100644 index 0000000..4ece1d5 --- /dev/null +++ b/cephfs/recursive-backup/recursive-backup.py @@ -0,0 +1,99 @@ +#!/usr/bin/python3 +import os +import argparse +import subprocess + +max_files=100000 +max_bytes=1000000000000 + +attrcmd = ["getfattr", "--only-values", "-d", "-m"] + +rsync_cmd = "rsync -a -v -n" +one_lvl_filter = " -f '- /*/*'" + +parser = argparse.ArgumentParser() + +parser.add_argument("src", help="source CephFS directory to backup") +parser.add_argument("dst", help="destination directory to store the backup") +parser.add_argument("time", help="epoch time of last backup (set to 0 for full sync)", type=int) +parser.add_argument("-f", "--maxfiles", help="maximum number of files per rsync", type=int) +parser.add_argument("-b", "--maxbytes", help="maximum bytes per rsync", type=int) +parser.add_argument("-d", "--debug", help="print debug statements", action="store_true") + +args = parser.parse_args() + +if (args.maxfiles != None): + max_files = args.maxfiles + +if (args.maxbytes != None): + max_bytes = args.maxbytes + +def debug_print( message ): + if (args.debug): + print("# DEBUG> " + message) + +def rsync_full( directory ): + generate_rsync( directory, False) + +def rsync_one_level( directory ): + generate_rsync( directory, True) 
+ +def generate_rsync(directory, oneLevel): + source = os.path.abspath(os.path.join(args.src,directory)) + destination = os.path.abspath(os.path.join(args.dst,directory)) + src_dest_str = " '{}/' '{}'".format(source, destination) + if (oneLevel): + cmd = rsync_cmd + one_lvl_filter + src_dest_str + else: + cmd = rsync_cmd + src_dest_str + print(cmd) + +def get_rctime( directory ): + rctime_sp = subprocess.run(attrcmd + ["ceph.dir.rctime", directory], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + rctime = rctime_sp.stdout.split(b'.')[0] + return {"out": int(rctime), "stderr": rctime_sp.stderr, "rc": int(rctime_sp.returncode)} + +def get_rfiles( directory ): + rfiles_sp = subprocess.run(attrcmd + ["ceph.dir.rfiles", directory], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return {"out": int(rfiles_sp.stdout), "stderr": rfiles_sp.stderr, "rc": int(rfiles_sp.returncode)} + +def get_rbytes( directory ): + rbytes_sp = subprocess.run(attrcmd + ["ceph.dir.rbytes", directory], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return {"out": int(rbytes_sp.stdout), "stderr": rbytes_sp.stderr, "rc": int(rbytes_sp.returncode)} + + +def get_rsubdirs( directory ): + rsubdirs_sp = subprocess.run(attrcmd + ["ceph.dir.rsubdirs", directory], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return {"out": int(rsubdirs_sp.stdout), "stderr": rsubdirs_sp.stderr, "rc": int(rsubdirs_sp.returncode)} + +def recurse_rsync( directory ): + debug_print("starting recurse rsync for directory {}".format(directory)) + rctime = get_rctime( directory ) + if (rctime["rc"] == 0 and rctime["out"] > args.time): + debug_print( "directory {} has a newer rctime of {}, backing up".format(directory, rctime["out"]) ) + rsubdirs = get_rsubdirs( directory ) + rfiles = get_rfiles( directory ) + rbytes = get_rbytes( directory ) + if ((rfiles["rc"] == 0 and rfiles["out"] < max_files) and (rbytes["rc"] == 0 and rbytes["out"] < max_bytes)): + debug_print("directory {} has {} files and {} bytes, fewer than the 
max of {} files / {} bytes".format(directory, rfiles["out"], rbytes["out"], max_files, max_bytes)) + debug_print("rsyncing full directory: {}".format(directory)) + rsync_full(directory) + elif (rsubdirs["rc"] == 0 and rsubdirs["out"] <= 1): # no subdirs = 1 subdir... + debug_print("directory {} is bigger than allowed ({}/{} files, {}/{} bytes), but has no subdirs".format(directory, rfiles["out"], max_files, rbytes["out"], max_bytes)) + debug_print("rsyncing full directory: {}".format(directory)) + rsync_full(directory) + else: + debug_print("directory {} is bigger than allowed ({}/{} files, {}/{} bytes) and has subdirs. rsyncing top level and recursing into dirs".format(directory, rfiles["out"], max_files, rbytes["out"], max_bytes)) + rsync_one_level(directory) + for path,subdirs,files in os.walk(directory): + for subdir in subdirs: + if not os.path.islink( os.path.join(directory,subdir)): + recurse_rsync( os.path.join(directory,subdir)) + break + else: + debug_print( "directory {} has an older rctime of {}, skipping".format(directory, rctime["out"]) ) + + +debug_print( "changing to directory {}".format(args.src) ) +os.chdir( args.src ) +recurse_rsync( "." 
) From cba5934eb30622dab8d295795c6469636c616e8e Mon Sep 17 00:00:00 2001 From: Tom-Byrne Date: Thu, 10 Sep 2020 17:50:23 +0100 Subject: [PATCH 02/15] Improvements to the cephFS backup script * added the option for the script to run the rsyncs after generation * better error handling for getfattr subprocesses * added mount checks for source and destination * fixed inconsistent indentation * script help message improvements --- cephfs/recursive-backup/recursive-backup.py | 203 ++++++++++++++------ 1 file changed, 144 insertions(+), 59 deletions(-) diff --git a/cephfs/recursive-backup/recursive-backup.py b/cephfs/recursive-backup/recursive-backup.py index 4ece1d5..fe66654 100644 --- a/cephfs/recursive-backup/recursive-backup.py +++ b/cephfs/recursive-backup/recursive-backup.py @@ -1,99 +1,184 @@ #!/usr/bin/python3 import os +import sys +import time +import shlex import argparse import subprocess +start_time=time.time() + max_files=100000 max_bytes=1000000000000 +safety_factor=3600 -attrcmd = ["getfattr", "--only-values", "-d", "-m"] +attrcmd = ["getfattr", "--only-values", "-n"] -rsync_cmd = "rsync -a -v -n" -one_lvl_filter = " -f '- /*/*'" +rsync_cmd = ["rsync", "-n", "-a", "--perms", "--acls", "--links", "--stats", "--delete", "--no-hard-links", "--numeric-ids"] +one_lvl_filter = ["-f", "-_/*/*"] -parser = argparse.ArgumentParser() +parser = argparse.ArgumentParser(description="Generate rsync commands to back up changes to a CephFS filesystem since a given time. Uses the CephFS rctime to determine what has changed, and uses rfiles and rbytes to determine the size of individual rsyncs. 
By default it will do nothing apart from outputting rsync commands to standard output.") parser.add_argument("src", help="source CephFS directory to backup") parser.add_argument("dst", help="destination directory to store the backup") parser.add_argument("time", help="epoch time of last backup (set to 0 for full sync)", type=int) -parser.add_argument("-f", "--maxfiles", help="maximum number of files per rsync", type=int) -parser.add_argument("-b", "--maxbytes", help="maximum bytes per rsync", type=int) -parser.add_argument("-d", "--debug", help="print debug statements", action="store_true") +parser.add_argument("-f", "--maxfiles", help="maximum number of files per rsync. defaults to 100000", type=int) +parser.add_argument("-b", "--maxbytes", help="maximum bytes per rsync. defaults to 100GB", type=int) +parser.add_argument("-s", "--safety", help="number of seconds before last backup time to still consider the directory changed. defaults to 3600 (1h)", type=int, default=3600) +parser.add_argument("--checksrc", help="check if the source dir is mountpoint before starting", action="store_true") +parser.add_argument("--checkdst", help="check if the dest dir is mountpoint before starting", action="store_true") +parser.add_argument("--run", help="run the rsyncs after generation", action="store_true") +parser.add_argument('-v', '--verbose', help="one '-v' for informational messages, two for debug", action='count', default=0) args = parser.parse_args() +time = args.time - args.safety + +debug = False +info = False +if (args.verbose >= 2): + debug = True +elif (args.verbose == 1): + info = True + + if (args.maxfiles != None): - max_files = args.maxfiles + max_files = args.maxfiles if (args.maxbytes != None): - max_bytes = args.maxbytes + max_bytes = args.maxbytes + + +rsync_cmd_list = [] def debug_print( message ): - if (args.debug): - print("# DEBUG> " + message) + if (debug): + print("# DEBUG> " + message) + +def info_print( message ): + if (info or debug): + print("# INFO> 
" + message) + def rsync_full( directory ): - generate_rsync( directory, False) + generate_rsync( directory, False) def rsync_one_level( directory ): - generate_rsync( directory, True) + generate_rsync( directory, True) def generate_rsync(directory, oneLevel): - source = os.path.abspath(os.path.join(args.src,directory)) - destination = os.path.abspath(os.path.join(args.dst,directory)) - src_dest_str = " '{}/' '{}'".format(source, destination) - if (oneLevel): - cmd = rsync_cmd + one_lvl_filter + src_dest_str - else: - cmd = rsync_cmd + src_dest_str - print(cmd) + source = os.path.abspath(os.path.join(args.src,directory)) + destination = os.path.abspath(os.path.join(args.dst,directory)) + src_dest_list = [source + "/", destination] + if (oneLevel): + cmd = rsync_cmd + one_lvl_filter + src_dest_list + else: + cmd = rsync_cmd + src_dest_list + rsync_cmd_list.append(cmd) + if not args.run: + print(" ".join(shlex.quote(s) for s in cmd)) + +def run_rsync( cmd ): + rsync_sp = subprocess.run( cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE ) + return {"stdout": rsync_sp.stdout, "stderr": rsync_sp.stderr, "rc": rsync_sp.returncode} def get_rctime( directory ): - rctime_sp = subprocess.run(attrcmd + ["ceph.dir.rctime", directory], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - rctime = rctime_sp.stdout.split(b'.')[0] - return {"out": int(rctime), "stderr": rctime_sp.stderr, "rc": int(rctime_sp.returncode)} + rctime_sp = subprocess.run(attrcmd + ["ceph.dir.rctime", directory], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if (rctime_sp.returncode == 0): + rctime = rctime_sp.stdout.split(b'.')[0] + return {"out": int(rctime), "stderr": rctime_sp.stderr, "rc": rctime_sp.returncode} + else: + return {"out": 0, "stderr": rctime_sp.stderr, "rc": rctime_sp.returncode} def get_rfiles( directory ): - rfiles_sp = subprocess.run(attrcmd + ["ceph.dir.rfiles", directory], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return {"out": int(rfiles_sp.stdout), "stderr": 
rfiles_sp.stderr, "rc": int(rfiles_sp.returncode)} + rfiles_sp = subprocess.run(attrcmd + ["ceph.dir.rfiles", directory], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if (rfiles_sp.returncode == 0): + return {"out": int(rfiles_sp.stdout), "stderr": rfiles_sp.stderr, "rc": rfiles_sp.returncode} + else: + return {"out": 0, "stderr": rfiles_sp.stderr, "rc": rfiles_sp.returncode} def get_rbytes( directory ): - rbytes_sp = subprocess.run(attrcmd + ["ceph.dir.rbytes", directory], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return {"out": int(rbytes_sp.stdout), "stderr": rbytes_sp.stderr, "rc": int(rbytes_sp.returncode)} - + rbytes_sp = subprocess.run(attrcmd + ["ceph.dir.rbytes", directory], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if (rbytes_sp.returncode == 0): + return {"out": int(rbytes_sp.stdout), "stderr": rbytes_sp.stderr, "rc": rbytes_sp.returncode} + else: + return {"out": 0, "stderr": rbytes_sp.stderr, "rc": rbytes_sp.returncode} def get_rsubdirs( directory ): - rsubdirs_sp = subprocess.run(attrcmd + ["ceph.dir.rsubdirs", directory], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return {"out": int(rsubdirs_sp.stdout), "stderr": rsubdirs_sp.stderr, "rc": int(rsubdirs_sp.returncode)} + rsubdirs_sp = subprocess.run(attrcmd + ["ceph.dir.rsubdirs", directory], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if (rsubdirs_sp.returncode == 0): + return {"out": int(rsubdirs_sp.stdout), "stderr": rsubdirs_sp.stderr, "rc": rsubdirs_sp.returncode} + else: + return {"out": 0, "stderr": rsubdirs_sp.stderr, "rc": rsubdirs_sp.returncode} def recurse_rsync( directory ): - debug_print("starting recurse rsync for directory {}".format(directory)) - rctime = get_rctime( directory ) - if (rctime["rc"] == 0 and rctime["out"] > args.time): - debug_print( "directory {} has a newer rctime of {}, backing up".format(directory, rctime["out"]) ) - rsubdirs = get_rsubdirs( directory ) - rfiles = get_rfiles( directory ) - rbytes = get_rbytes( directory ) - if 
((rfiles["rc"] == 0 and rfiles["out"] < max_files) and (rbytes["rc"] == 0 and rbytes["out"] < max_bytes)): - debug_print("directory {} has {} files and {} bytes, fewer than the max of {} files / {} bytes".format(directory, rfiles["out"], rbytes["out"], max_files, max_bytes)) - debug_print("rsyncing full directory: {}".format(directory)) - rsync_full(directory) - elif (rsubdirs["rc"] == 0 and rsubdirs["out"] <= 1): # no subdirs = 1 subdir... - debug_print("directory {} is bigger than allowed ({}/{} files, {}/{} bytes), but has no subdirs".format(directory, rfiles["out"], max_files, rbytes["out"], max_bytes)) - debug_print("rsyncing full directory: {}".format(directory)) - rsync_full(directory) - else: - debug_print("directory {} is bigger than allowed ({}/{} files, {}/{} bytes) and has subdirs. rsyncing top level and recursing into dirs".format(directory, rfiles["out"], max_files, rbytes["out"], max_bytes)) - rsync_one_level(directory) - for path,subdirs,files in os.walk(directory): - for subdir in subdirs: - if not os.path.islink( os.path.join(directory,subdir)): - recurse_rsync( os.path.join(directory,subdir)) - break - else: - debug_print( "directory {} has an older rctime of {}, skipping".format(directory, rctime["out"]) ) - - -debug_print( "changing to directory {}".format(args.src) ) + debug_print("starting recurse rsync for directory {}".format(directory)) + rctime = get_rctime( directory ) + if (rctime["rc"] == 0 and rctime["out"] > time): + info_print( "directory {} has a newer rctime of {}, backing up".format(directory, rctime["out"]) ) + rsubdirs = get_rsubdirs( directory ) + rfiles = get_rfiles( directory ) + rbytes = get_rbytes( directory ) + if ((rfiles["rc"] == 0 and rfiles["out"] < max_files) and (rbytes["rc"] == 0 and rbytes["out"] < max_bytes)): + info_print("directory {} has {} files and {} bytes, fewer than the max of {} files / {} bytes".format(directory, rfiles["out"], rbytes["out"], max_files, max_bytes)) + info_print("rsyncing full directory: 
{}".format(directory)) + rsync_full(directory) + elif (rsubdirs["rc"] == 0 and rsubdirs["out"] <= 1): # no subdirs = 1 subdir... + info_print("directory {} is bigger than allowed ({}/{} files, {}/{} bytes), but has no subdirs".format(directory, rfiles["out"], max_files, rbytes["out"], max_bytes)) + info_print("rsyncing full directory: {}".format(directory)) + rsync_full(directory) + else: + info_print("directory {} is bigger than allowed ({}/{} files, {}/{} bytes) and has subdirs. rsyncing top level and recursing into dirs".format(directory, rfiles["out"], max_files, rbytes["out"], max_bytes)) + rsync_one_level(directory) + for path,subdirs,files in os.walk(directory): + for subdir in subdirs: + if not os.path.islink( os.path.join(directory,subdir)): + recurse_rsync( os.path.join(directory,subdir)) + break + else: + debug_print( "directory {} has an older rctime of {}, skipping".format(directory, rctime["out"]) ) + + +if (args.checksrc): + srcmnt = os.path.ismount(args.src) + if not srcmnt: + print("source directory {} is not a mountpoint and check specified".format(args.src)) + print("backup exited with errors") + sys.exit(1) + +if (args.checkdst): + dstmnt = os.path.ismount(args.dst) + if not dstmnt: + print("dest directory {} is not a mountpoint and check specified".format(args.dst)) + print("backup exited with errors") + sys.exit(1) + + +debug_print( "changing to source directory {}".format(args.src) ) os.chdir( args.src ) +test_rctime = get_rctime( "." ) +if (test_rctime["rc"] != 0): + print("source directory {} did not return a CephFS rctime, so probably is not a CephFS mount".format(args.src)) + print("backup exited with errors") + sys.exit(1) +else: + debug_print("src dir returned a cephFS rctime, starting ") + recurse_rsync( "." 
) + +if (args.run): + cmd_list_len = len(rsync_cmd_list) + print("{} rsync commands to run".format(cmd_list_len)) + for (i, rsync_cmd) in enumerate(rsync_cmd_list): + print("{}/{} running '{}'".format(i+1, cmd_list_len, " ".join(rsync_cmd))) + rsync = run_rsync(rsync_cmd) + if (rsync["rc"] == 0): + print("Success") + print(rsync["stdout"].decode('ascii')) + else: + print("failure (return code: {})".format(rsync["rc"])) + print(rsync["stdout"].decode('ascii')) + print(rsync["stderr"].decode('ascii')) + print("backup exited with errors") + sys.exit(1) + print("SUCCESS: backup started at {} finished successfully".format(start_time)) From 023742d3b2c7fff4d2d79e297e4190832ee264ec Mon Sep 17 00:00:00 2001 From: Tom Byrne Date: Fri, 8 Jan 2021 11:15:50 +0000 Subject: [PATCH 03/15] improve recursive backup scripts logging and reporting * add json summary line when running the rsyncs * add timestamps to the info and debug lines --- cephfs/recursive-backup/recursive-backup.py | 119 ++++++++++++++------ 1 file changed, 87 insertions(+), 32 deletions(-) diff --git a/cephfs/recursive-backup/recursive-backup.py b/cephfs/recursive-backup/recursive-backup.py index fe66654..0f8eb61 100644 --- a/cephfs/recursive-backup/recursive-backup.py +++ b/cephfs/recursive-backup/recursive-backup.py @@ -1,10 +1,58 @@ #!/usr/bin/python3 import os import sys +import json import time import shlex import argparse import subprocess +from datetime import datetime, timedelta + +warnings = False + +debug = False +info = False + +def timestamp_print( message ): + print ("# {} {}".format(time.strftime('[%Y-%m-%d %H:%M:%S]'), message)) + +def debug_print( message ): + if (debug): + timestamp_print("DEBUG> " + message) + +def info_print( message ): + if (info or debug): + timestamp_print("INFO> " + message) + +def warn_print( message ): + global warnings + warnings = True + timestamp_print("WARN> " + message) + +def exit_print ( success, message ): + if success: + timestamp_print("FINISHED SUCCESS> " + 
message) + else: + timestamp_print("FINISHED FAILED> " + message) + +def print_summary_json ( success, message ): + body = { + "type": "backup_summary", + "success": success, + "start_time": start_time, + "exit_time": time.time(), + "message": message + } + print(json.dumps(body)) + +def log_and_exit ( success, message ): + if (args.run): + print_summary_json( success, message) + exit_print( success, message ); + rc = 1 + if success and not warnings: + rc = 0 + sys.exit(rc) start_time=time.time() @@ -21,7 +69,9 @@ parser.add_argument("src", help="source CephFS directory to backup") parser.add_argument("dst", help="destination directory to store the backup") -parser.add_argument("time", help="epoch time of last backup (set to 0 for full sync)", type=int) +parser.add_argument("-t", "--time", help="epoch time of last backup", type=int) +parser.add_argument("-d", "--days", help="days since last backup. Use instead of --time", type=int) +parser.add_argument("--full", help="do a full backup", action="store_true") parser.add_argument("-f", "--maxfiles", help="maximum number of files per rsync. defaults to 100000", type=int) parser.add_argument("-b", "--maxbytes", help="maximum bytes per rsync. defaults to 100GB", type=int) parser.add_argument("-s", "--safety", help="number of seconds before last backup time to still consider the directory changed. 
defaults to 3600 (1h)", type=int, default=3600) @@ -32,15 +82,28 @@ args = parser.parse_args() -time = args.time - args.safety +if not (args.time or args.days or args.full): + log_and_exit( False, "backup interval not specified, use --time, --days or --full") + +if (args.time and args.days) or (args.time and args.full) or (args.days and args.full): + log_and_exit( False, "specify ONLY one of --time, --days or --full as a backup interval") + +if args.days: + backup_time_obj = datetime.now() - timedelta(days=args.days) + backup_time_wo_safety = backup_time_obj.timestamp() + backup_time = backup_time_wo_safety - args.safety +elif args.full: + backup_time = 0 +else: + backup_time = args.time - args.safety + -debug = False -info = False if (args.verbose >= 2): debug = True elif (args.verbose == 1): info = True +info_print("backup time is {}".format(backup_time)) if (args.maxfiles != None): max_files = args.maxfiles @@ -51,15 +114,6 @@ rsync_cmd_list = [] -def debug_print( message ): - if (debug): - print("# DEBUG> " + message) - -def info_print( message ): - if (info or debug): - print("# INFO> " + message) - - def rsync_full( directory ): generate_rsync( directory, False) @@ -88,6 +142,7 @@ def get_rctime( directory ): rctime = rctime_sp.stdout.split(b'.')[0] return {"out": int(rctime), "stderr": rctime_sp.stderr, "rc": rctime_sp.returncode} else: + warn_print("error while getting rctime of: {}".format(directory)) return {"out": 0, "stderr": rctime_sp.stderr, "rc": rctime_sp.returncode} def get_rfiles( directory ): @@ -95,6 +150,7 @@ def get_rfiles( directory ): if (rfiles_sp.returncode == 0): return {"out": int(rfiles_sp.stdout), "stderr": rfiles_sp.stderr, "rc": rfiles_sp.returncode} else: + warn_print("error while getting rfiles of: {}".format(directory)) return {"out": 0, "stderr": rfiles_sp.stderr, "rc": rfiles_sp.returncode} def get_rbytes( directory ): @@ -102,6 +158,7 @@ def get_rbytes( directory ): if (rbytes_sp.returncode == 0): return {"out": 
int(rbytes_sp.stdout), "stderr": rbytes_sp.stderr, "rc": rbytes_sp.returncode} else: + warn_print("error while getting rbytes of: {}".format(directory)) return {"out": 0, "stderr": rbytes_sp.stderr, "rc": rbytes_sp.returncode} def get_rsubdirs( directory ): @@ -109,12 +166,13 @@ def get_rsubdirs( directory ): if (rsubdirs_sp.returncode == 0): return {"out": int(rsubdirs_sp.stdout), "stderr": rsubdirs_sp.stderr, "rc": rsubdirs_sp.returncode} else: + warn_print("error while getting rsubdirs of: {}".format(directory)) return {"out": 0, "stderr": rsubdirs_sp.stderr, "rc": rsubdirs_sp.returncode} def recurse_rsync( directory ): debug_print("starting recurse rsync for directory {}".format(directory)) rctime = get_rctime( directory ) - if (rctime["rc"] == 0 and rctime["out"] > time): + if (rctime["rc"] == 0 and rctime["out"] > backup_time): info_print( "directory {} has a newer rctime of {}, backing up".format(directory, rctime["out"]) ) rsubdirs = get_rsubdirs( directory ) rfiles = get_rfiles( directory ) @@ -135,32 +193,27 @@ def recurse_rsync( directory ): if not os.path.islink( os.path.join(directory,subdir)): recurse_rsync( os.path.join(directory,subdir)) break - else: + elif (rctime["rc"] == 0): debug_print( "directory {} has an older rctime of {}, skipping".format(directory, rctime["out"]) ) + else: + warn_print( "nonzero exit code while getting rctime for {}, not backing up".format(directory) ) if (args.checksrc): srcmnt = os.path.ismount(args.src) if not srcmnt: - print("source directory {} is not a mountpoint and check specified".format(args.src)) - print("backup exited with errors") - sys.exit(1) + log_and_exit(success=False, message="Source directory {} is not a mountpoint (and check specified)".format(args.src)) if (args.checkdst): dstmnt = os.path.ismount(args.dst) if not dstmnt: - print("dest directory {} is not a mountpoint and check specified".format(args.dst)) - print("backup exited with errors") - sys.exit(1) - + log_and_exit(success=False, 
message="Destination directory {} is not a mountpoint (and check specified)".format(args.dst)) debug_print( "changing to source directory {}".format(args.src) ) os.chdir( args.src ) test_rctime = get_rctime( "." ) if (test_rctime["rc"] != 0): - print("source directory {} did not return a CephFS rctime, so probably is not a CephFS mount".format(args.src)) - print("backup exited with errors") - sys.exit(1) + log_and_exit(success=False, message="Source directory {} did not return a CephFS rctime, so probably is not a CephFS mount and cannot be backed up".format(args.src)) else: debug_print("src dir returned a cephFS rctime, starting ") @@ -168,17 +221,19 @@ def recurse_rsync( directory ): if (args.run): cmd_list_len = len(rsync_cmd_list) - print("{} rsync commands to run".format(cmd_list_len)) + timestamp_print("{} rsync commands to run".format(cmd_list_len)) for (i, rsync_cmd) in enumerate(rsync_cmd_list): - print("{}/{} running '{}'".format(i+1, cmd_list_len, " ".join(rsync_cmd))) + timestamp_print("{}/{} running '{}'".format(i+1, cmd_list_len, " ".join(rsync_cmd))) rsync = run_rsync(rsync_cmd) if (rsync["rc"] == 0): - print("Success") + timestamp_print("Success") print(rsync["stdout"].decode('ascii')) else: - print("failure (return code: {})".format(rsync["rc"])) + timestamp_print("failure (return code: {})".format(rsync["rc"])) print(rsync["stdout"].decode('ascii')) print(rsync["stderr"].decode('ascii')) - print("backup exited with errors") - sys.exit(1) - print("SUCCESS: backup started at {} finished successfully".format(start_time)) + warn_print("rsync failed, continuing") + if warnings is True: + log_and_exit(success=False, message="backup started at {} ({}) finished with warnings".format(time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)), start_time)) + else: + log_and_exit(success=True, message="backup started at {} ({}) finished successfully".format( time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)), start_time)) From 
d07df108c76cdaa19cfc96c4faac1e8d7d088c03 Mon Sep 17 00:00:00 2001 From: Tom Byrne Date: Fri, 8 Jan 2021 11:17:31 +0000 Subject: [PATCH 04/15] add nagios/icinga check script for the recursive-backup script --- .../icinga_cephfs_backup_check.py | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100755 cephfs/recursive-backup/icinga_cephfs_backup_check.py diff --git a/cephfs/recursive-backup/icinga_cephfs_backup_check.py b/cephfs/recursive-backup/icinga_cephfs_backup_check.py new file mode 100755 index 0000000..4ecbff8 --- /dev/null +++ b/cephfs/recursive-backup/icinga_cephfs_backup_check.py @@ -0,0 +1,115 @@ +#!/usr/bin/python3 +import os +import re +import sys +import json +import argparse +from datetime import datetime, timedelta + +def readable_timedelta(duration: timedelta): + data = {} + data['d'], remaining = divmod(duration.total_seconds(), 86_400) + data['h'], remaining = divmod(remaining, 3_600) + data['m'], data['s'] = divmod(remaining, 60) + + time_parts = [f'{round(value)}{name}' for name, value in data.items() if value > 0] + if time_parts: + return ' '.join(time_parts) + else: + return '<1s' + + +pattern = re.compile("^{.*}$") + +OK = 0 +WARN = 1 +CRIT = 2 +UNKNOWN = 3 + +check_time = datetime.now() + +parser = argparse.ArgumentParser(description="Check the status of a backup run by the CephFS recursive-backup script.") + +parser.add_argument("log", help="log file from backup process") +parser.add_argument("interval", help="time allowed since last successful backup (in seconds)", type=int) + +args = parser.parse_args() + +try: + interval_delta = timedelta(seconds=args.interval) + allowed_last_time = check_time - interval_delta +except: + print("Error parsing last allowed backup time") + sys.exit(CRIT) + +if not os.path.isfile(args.log): + print ("Backup log file does not exist") + sys.exit(CRIT) + +last_summary="NONE" +try: + for i, line in enumerate(open(args.log)): + for match in re.finditer(pattern, line): + last_summary = 
match.group() + #print("Found on line {}: {}".format(i+1, match.group())) +except: + print("Error reading backup log file") + sys.exit(CRIT) + +if last_summary is "NONE": + print("No backup summary found in log file") + sys.exit(CRIT) + +try: + summary = json.loads(last_summary) +except: + print("Error parsing JSON summary") + sys.exit(CRIT) + +if "type" not in summary: + print("Error reading JSON summary: message type not in object") + sys.exit(CRIT) + +if summary["type"] != "backup_summary": + print("Error reading JSON summary: message type is not backup_summary") + sys.exit(CRIT) + +if "message" not in summary: + print("Error reading JSON summary: summary message not in object") + sys.exit(CRIT) + +if "success" not in summary: + print("Error reading JSON summary: summary success status not in object") + sys.exit(CRIT) + +if summary["success"] != True: + print("Last backup was not successful: {}".format(summary["message"])) + sys.exit(CRIT) + +if "start_time" not in summary: + print("Error reading JSON summary: start time not in object") + sys.exit(CRIT) + +try: + start_time = datetime.fromtimestamp(summary["start_time"]) +except: + print("Error reading JSON summary: start time not parsable") + sys.exit(CRIT) + +if "exit_time" not in summary: + print("Error reading JSON summary: exit time not in object") + sys.exit(CRIT) + +try: + exit_time = datetime.fromtimestamp(summary["exit_time"]) +except: + print("Error reading JSON summary: exit time not parsable") + sys.exit(CRIT) + +#if start_time.timestamp() < allowed_last_time.timestamp(): +if start_time < allowed_last_time: + print( "Last successful backup started at {}, longer ago than the backup interval of {}".format(readable_timedelta(check_time - start_time), readable_timedelta(interval_delta)) ) + sys.exit(CRIT) + +print("A successful backup started {} ago and ran for {}. The backup interval is {}". 
format(readable_timedelta(check_time - start_time), readable_timedelta(exit_time - start_time), readable_timedelta(interval_delta))) +sys.exit(OK) From 87b708ae9a111d657b98a4a2e8470a394e959b18 Mon Sep 17 00:00:00 2001 From: Tom Byrne Date: Fri, 8 Jan 2021 11:23:04 +0000 Subject: [PATCH 05/15] add a basic usage guide --- cephfs/recursive-backup/README.md | 56 +++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 cephfs/recursive-backup/README.md diff --git a/cephfs/recursive-backup/README.md b/cephfs/recursive-backup/README.md new file mode 100644 index 0000000..6de3c5c --- /dev/null +++ b/cephfs/recursive-backup/README.md @@ -0,0 +1,56 @@ +# CephFS Recursive Backup + +## Description + +This script generates rsync commands to back up changes to a CephFS filesystem since a given time. Uses the CephFS rctime to determine what has changed, and uses rfiles and rbytes to determine the size of individual rsyncs. By default it will do nothing apart from outputting rsync commands to standard output. + +Using the --run option causes the script to also run the generated rsyncs. In this mode it outputs a summary line when the script exits, which can be redirected to a log, and the nagios/icinga check can be used to report on the state of and time since the last backup. + +## Example usage + +TODO + +## Recursive backup script usage + +``` +usage: recursive-backup.py [-h] [-t TIME] [-d DAYS] [--full] [-f MAXFILES] + [-b MAXBYTES] [-s SAFETY] [--checksrc] [--checkdst] + [--run] [-v] + src dst + +positional arguments: + src source CephFS directory to backup + dst destination directory to store the backup + +optional arguments: + -h, --help show this help message and exit + -t TIME, --time TIME epoch time of last backup + -d DAYS, --days DAYS days since last backup. Use instead of --time + --full do a full backup + -f MAXFILES, --maxfiles MAXFILES + maximum number of files per rsync. 
defaults to 100000 + -b MAXBYTES, --maxbytes MAXBYTES + maximum bytes per rsync. defaults to 100GB + -s SAFETY, --safety SAFETY + number of seconds before last backup time to still + consider the directory changed. defaults to 3600 (1h) + --checksrc check if the source dir is mountpoint before starting + --checkdst check if the dest dir is mountpoint before starting + --run run the rsyncs after generation + -v, --verbose one '-v' for informational messages, two for debug +``` + +## Nagios/Icinga check usage + +``` +usage: icinga_cephfs_backup_check.py [-h] log interval + +Check the status of a backup run by the CephFS recursive-backup script. + +positional arguments: + log log file from backup process + interval time allowed since last successful backup (in seconds) + +optional arguments: + -h, --help show this help message and exit +``` From f1a1fe7b24789e825294a3df45116ee25048eba2 Mon Sep 17 00:00:00 2001 From: Tom Byrne Date: Thu, 14 Jul 2022 11:28:34 +0100 Subject: [PATCH 06/15] identify and backup directories with 'broken' mtimes --- cephfs/recursive-backup/recursive-backup.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/cephfs/recursive-backup/recursive-backup.py b/cephfs/recursive-backup/recursive-backup.py index 0f8eb61..ac79edb 100644 --- a/cephfs/recursive-backup/recursive-backup.py +++ b/cephfs/recursive-backup/recursive-backup.py @@ -140,7 +140,16 @@ def get_rctime( directory ): rctime_sp = subprocess.run(attrcmd + ["ceph.dir.rctime", directory], stdout=subprocess.PIPE, stderr=subprocess.PIPE) if (rctime_sp.returncode == 0): rctime = rctime_sp.stdout.split(b'.')[0] - return {"out": int(rctime), "stderr": rctime_sp.stderr, "rc": rctime_sp.returncode} + # check for broken rctimes, identified by having a '0' nanosecond component + # due to the bug that prefixes the nanosecond component with '09', we should check for + # the string being '090' and '0' + # https://tracker.ceph.com/issues/39943 + broken_rctime = False 
+ rctime_nano_string = rctime_sp.stdout.split(b'.')[1].decode() + if (rctime_nano_string == '090' or rctime_nano_string == '0'): + broken_rctime = True + + return {"out": int(rctime), "stderr": rctime_sp.stderr, "rc": rctime_sp.returncode, "broken" : broken_rctime } else: warn_print("error while getting rctime of: {}".format(directory)) return {"out": 0, "stderr": rctime_sp.stderr, "rc": rctime_sp.returncode} @@ -172,8 +181,12 @@ def get_rsubdirs( directory ): def recurse_rsync( directory ): debug_print("starting recurse rsync for directory {}".format(directory)) rctime = get_rctime( directory ) - if (rctime["rc"] == 0 and rctime["out"] > backup_time): - info_print( "directory {} has a newer rctime of {}, backing up".format(directory, rctime["out"]) ) + if (rctime["rc"] == 0 and (rctime["out"] > backup_time or rctime["broken"] )): + if (rctime["broken"]): + info_print( "directory {} has an odd time of {} with no nanosecond component, backing up as rctime is not reliable".format(directory, rctime["out"]) ) + else: + info_print( "directory {} has a newer rctime of {}, backing up".format(directory, rctime["out"]) ) + rsubdirs = get_rsubdirs( directory ) rfiles = get_rfiles( directory ) rbytes = get_rbytes( directory ) From 9f9fb99b819169ce29e8929eb2f8783c8c1d95f1 Mon Sep 17 00:00:00 2001 From: Tom Byrne Date: Tue, 19 Jul 2022 10:54:36 +0100 Subject: [PATCH 07/15] icinga_cephfs_backup_check: change from positional to named arguments --- cephfs/recursive-backup/icinga_cephfs_backup_check.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cephfs/recursive-backup/icinga_cephfs_backup_check.py b/cephfs/recursive-backup/icinga_cephfs_backup_check.py index 4ecbff8..f324c7d 100755 --- a/cephfs/recursive-backup/icinga_cephfs_backup_check.py +++ b/cephfs/recursive-backup/icinga_cephfs_backup_check.py @@ -19,6 +19,7 @@ def readable_timedelta(duration: timedelta): return '<1s' +# regex to match the JSON summary line pattern = re.compile("^{.*}$") OK 
= 0 @@ -30,8 +31,8 @@ def readable_timedelta(duration: timedelta): parser = argparse.ArgumentParser(description="Check the status of a backup run by the CephFS recursive-backup script.") -parser.add_argument("log", help="log file from backup process") -parser.add_argument("interval", help="time allowed since last successful backup (in seconds)", type=int) +parser.add_argument("-f", "--logfile", help="log file from backup process", type=str, required=True) +parser.add_argument("-i", "--interval", help="time allowed since last successful backup (in seconds)", type=int, required=True) args = parser.parse_args() @@ -42,16 +43,15 @@ def readable_timedelta(duration: timedelta): print("Error parsing last allowed backup time") sys.exit(CRIT) -if not os.path.isfile(args.log): +if not os.path.isfile(args.logfile): print ("Backup log file does not exist") sys.exit(CRIT) last_summary="NONE" try: - for i, line in enumerate(open(args.log)): + for i, line in enumerate(open(args.logfile)): for match in re.finditer(pattern, line): last_summary = match.group() - #print("Found on line {}: {}".format(i+1, match.group())) except: print("Error reading backup log file") sys.exit(CRIT) @@ -106,7 +106,6 @@ def readable_timedelta(duration: timedelta): print("Error reading JSON summary: exit time not parsable") sys.exit(CRIT) -#if start_time.timestamp() < allowed_last_time.timestamp(): if start_time < allowed_last_time: print( "Last successful backup started at {}, longer ago than the backup interval of {}".format(readable_timedelta(check_time - start_time), readable_timedelta(interval_delta)) ) sys.exit(CRIT) From 6017136f2b837c8fcc6aca60f326e410fd2bff3a Mon Sep 17 00:00:00 2001 From: Tom Byrne Date: Tue, 19 Jul 2022 10:56:14 +0100 Subject: [PATCH 08/15] recursive-backup: various improvements * space checking on destination, to prevent filling up the destination filesystem. Our backup destination (a HSM) gets unhappy if it fills up to 100%, so this allows us to keep it happy. 
* support naming of backup jobs, this was needed to support the features added below. * pid file checking for existing running jobs. Uses the naming feature to allow tracking of multiple separate jobs. * option to redirect output to log files. The log files are named after the job names. * config file support, with ability to specify options for named jobs. --- cephfs/recursive-backup/README.md | 53 +++-- cephfs/recursive-backup/config.ini | 18 ++ cephfs/recursive-backup/recursive-backup.py | 235 +++++++++++++++----- 3 files changed, 242 insertions(+), 64 deletions(-) create mode 100644 cephfs/recursive-backup/config.ini mode change 100644 => 100755 cephfs/recursive-backup/recursive-backup.py diff --git a/cephfs/recursive-backup/README.md b/cephfs/recursive-backup/README.md index 6de3c5c..d22a927 100644 --- a/cephfs/recursive-backup/README.md +++ b/cephfs/recursive-backup/README.md @@ -4,7 +4,9 @@ This script generates rsync commands to back up changes to a CephFS filesystem since a given time. Uses the CephFS rctime to determine what has changed, and uses rfiles and rbytes to determine the size of individual rsyncs. By default it will do nothing apart from outputting rsync commands to standard output. -Using the --run option causes the script to also run the generated rsyncs. In this mode it outputs a summary line when the script exits, which can be redirected to a log, and the nagios/icinga check can be used to report on the state of and time since the last backup. +Using the --run option will allow the script to also run the generated rsyncs. In this mode it outputs a summary line when the script exits, which can be redirected to a log, and the nagios/icinga check can be used to report on the state of and time since the last backup. Any non zero exit code from any of the backup runs will cause the backup to be marked as failed. 
+ +The config file `/etc/recursive-backup/config.ini` can be used to alter defaults, as well as set specific settings for named jobs (with the -n option). An example config file is present in this repo with all avaliable config file options specified, but it is not compulsory. Command line options will always override config file settings. ## Example usage @@ -13,29 +15,54 @@ TODO ## Recursive backup script usage ``` -usage: recursive-backup.py [-h] [-t TIME] [-d DAYS] [--full] [-f MAXFILES] - [-b MAXBYTES] [-s SAFETY] [--checksrc] [--checkdst] - [--run] [-v] +usage: recursive-backup.py [-h] [-n NAME] [-t TIME] [-d DAYS] [--full] + [-f MAXFILES] [-b MAXBYTES] [-s SAFETY] + [--checksrc] [--nochecksrc] [--checkdst] + [--nocheckdst] [--checkpid] [--nocheckpid] + [--checkspace] [--nocheckspace] + [--freebytes FREEBYTES] [--run] [-v] src dst +Generate rsync commands to back up changes to a CephFS filesystem since a +given time. Uses the CephFS rctime to determine what has changed, and uses +rfiles and rbytes to determine the size of individual rsyncs. By default it +will do nothing apart from outputting rsync commands to standard output. + positional arguments: src source CephFS directory to backup dst destination directory to store the backup optional arguments: -h, --help show this help message and exit + -n NAME, --name NAME name of the backup job (for log and pid file). -t TIME, --time TIME epoch time of last backup -d DAYS, --days DAYS days since last backup. Use instead of --time --full do a full backup -f MAXFILES, --maxfiles MAXFILES maximum number of files per rsync. defaults to 100000 -b MAXBYTES, --maxbytes MAXBYTES - maximum bytes per rsync. defaults to 100GB + maximum bytes per rsync. defaults to 1TB -s SAFETY, --safety SAFETY number of seconds before last backup time to still consider the directory changed. 
defaults to 3600 (1h) - --checksrc check if the source dir is mountpoint before starting - --checkdst check if the dest dir is mountpoint before starting + --checksrc (default) check if the source dir is mountpoint before + starting + --nochecksrc do not check if the source dir is mountpoint before + starting + --checkdst (default) check if the dest dir is mountpoint before + starting + --nocheckdst do not check if the dest dir is mountpoint before + starting + --checkpid (default) check for a named (-n/--name) pidfile before + starting + --nocheckpid do not check for a named pidfile before starting + --checkspace when running the rsyncs, check for free space on + destination FS before starting each rsync + --nocheckspace (default) do not check for free space on dest before + starting each rsync + --freebytes FREEBYTES + specify amount of free space overhead needed on the + destination to start an rsync. default is 100GB --run run the rsyncs after generation -v, --verbose one '-v' for informational messages, two for debug ``` @@ -43,14 +70,14 @@ optional arguments: ## Nagios/Icinga check usage ``` -usage: icinga_cephfs_backup_check.py [-h] log interval +usage: icinga_cephfs_backup_check.py [-h] -f LOGFILE -i INTERVAL Check the status of a backup run by the CephFS recursive-backup script. 
-positional arguments: - log log file from backup process - interval time allowed since last successful backup (in seconds) - optional arguments: - -h, --help show this help message and exit + -h, --help show this help message and exit + -f LOGFILE, --logfile LOGFILE + log file from backup process + -i INTERVAL, --interval INTERVAL + time allowed since last successful backup (in seconds) ``` diff --git a/cephfs/recursive-backup/config.ini b/cephfs/recursive-backup/config.ini new file mode 100644 index 0000000..28f6a43 --- /dev/null +++ b/cephfs/recursive-backup/config.ini @@ -0,0 +1,18 @@ +[DEFAULT] +backup_command = rsync -n -a --perms --acls --links --stats --no-hard-links --numeric-ids +one_level_backup_command = rsync -n -a --perms --acls --links --stats --no-hard-links --numeric-ids -f -_/*/* +max_files=100000 +max_bytes=1000000000000 +safety_factor=3600 +verbosity=info +check_src=true +check_dst=true +pid_dir=/var/run/recursive-backup +log_dir=/var/log/recursive-backup +log_to_file=false +check_pid=false + +[test_job] +free_bytes=50000000000 +log_to_file=true +check_pid=true diff --git a/cephfs/recursive-backup/recursive-backup.py b/cephfs/recursive-backup/recursive-backup.py old mode 100644 new mode 100755 index ac79edb..5d00148 --- a/cephfs/recursive-backup/recursive-backup.py +++ b/cephfs/recursive-backup/recursive-backup.py @@ -6,22 +6,21 @@ import shlex import argparse import subprocess +import configparser +from contextlib import redirect_stdout from datetime import datetime, timedelta warnings = False -debug = False -info = False - def timestamp_print( message ): print ("# {} {}".format(time.strftime('[%Y-%m-%d %H:%M:%S]'), message)) def debug_print( message ): - if (debug): + if (verbosity == "debug"): timestamp_print("DEBUG> " + message) def info_print( message ): - if (info or debug): + if (verbosity == "info" or verbosity == "debug"): timestamp_print("INFO> " + message) def warn_print( message ): @@ -56,80 +55,174 @@ def log_and_exit ( success, 
message ): start_time=time.time() -max_files=100000 -max_bytes=1000000000000 -safety_factor=3600 - attrcmd = ["getfattr", "--only-values", "-n"] -rsync_cmd = ["rsync", "-n", "-a", "--perms", "--acls", "--links", "--stats", "--delete", "--no-hard-links", "--numeric-ids"] -one_lvl_filter = ["-f", "-_/*/*"] +verbosity="debug" +run_name="recursive-backup" +config = configparser.ConfigParser() + +# populate defaults +config['DEFAULT']['backup_command'] = 'rsync -n -a --perms --acls --links --stats --no-hard-links --numeric-ids' +config['DEFAULT']['one_level_backup_command'] = 'rsync -n -a --perms --acls --links --stats --no-hard-links --numeric-ids -f -_/*/*' +config['DEFAULT']['max_files']="100000" +config['DEFAULT']['max_bytes']="1000000000000" +config['DEFAULT']['safety_factor']="3600" +config['DEFAULT']['verbosity']="none" +config['DEFAULT']['check_src']="true" +config['DEFAULT']['check_dst']="true" +config['DEFAULT']['check_pid']="true" +config['DEFAULT']['pid_dir']="/var/run/ceph-fs-backup" +config['DEFAULT']['check_space']="false" +config['DEFAULT']['free_bytes']="10000000000" +config['DEFAULT']['log_to_file']="false" +config['DEFAULT']['log_dir']="/var/log/ceph-fs-backup" parser = argparse.ArgumentParser(description="Generate rsync commands to back up changes to a CephFS filesystem since a given time. Uses the CephFS rctime to determine what has changed, and uses rfiles and rbytes to determine the size of individual rsyncs. By default it will do nothing apart from outputting rsync commands to standard output.") parser.add_argument("src", help="source CephFS directory to backup") parser.add_argument("dst", help="destination directory to store the backup") +parser.add_argument("-n", "--name", help="name of the backup job (for log and pid file).", type=str) parser.add_argument("-t", "--time", help="epoch time of last backup", type=int) parser.add_argument("-d", "--days", help="days since last backup. 
Use instead of --time", type=int) parser.add_argument("--full", help="do a full backup", action="store_true") parser.add_argument("-f", "--maxfiles", help="maximum number of files per rsync. defaults to 100000", type=int) -parser.add_argument("-b", "--maxbytes", help="maximum bytes per rsync. defaults to 100GB", type=int) -parser.add_argument("-s", "--safety", help="number of seconds before last backup time to still consider the directory changed. defaults to 3600 (1h)", type=int, default=3600) -parser.add_argument("--checksrc", help="check if the source dir is mountpoint before starting", action="store_true") -parser.add_argument("--checkdst", help="check if the dest dir is mountpoint before starting", action="store_true") +parser.add_argument("-b", "--maxbytes", help="maximum bytes per rsync. defaults to 1TB", type=int) +parser.add_argument("-s", "--safety", help="number of seconds before last backup time to still consider the directory changed. defaults to 3600 (1h)", type=int) +parser.add_argument("--checksrc", help="(default) check if the source dir is mountpoint before starting", action="store_true") +parser.add_argument("--nochecksrc", help="do not check if the source dir is mountpoint before starting", action="store_true") +parser.add_argument("--checkdst", help="(default) check if the dest dir is mountpoint before starting", action="store_true") +parser.add_argument("--nocheckdst", help="do not check if the dest dir is mountpoint before starting", action="store_true") +parser.add_argument("--checkpid", help="(default) check for a named (-n/--name) pidfile before starting", action="store_true") +parser.add_argument("--nocheckpid", help="do not check for a named pidfile before starting", action="store_true") +parser.add_argument("--checkspace", help="when running the rsyncs, check for free space on destination FS before starting each rsync", action="store_true") +parser.add_argument("--nocheckspace", help="(default) do not check for free space on dest before 
starting each rsync", action="store_true") +parser.add_argument("--freebytes", help="specify amount of free space overhead needed on the destination to start an rsync. default is 100GB", type=int) parser.add_argument("--run", help="run the rsyncs after generation", action="store_true") parser.add_argument('-v', '--verbose', help="one '-v' for informational messages, two for debug", action='count', default=0) args = parser.parse_args() +config.read('/etc/recursive-backup/config.ini') + +if args.name: + run_name=args.name + try: + local_config = config[run_name] + except: + local_config = config['DEFAULT'] +else: + local_config = config['DEFAULT'] + +if args.run and not args.name: + log_and_exit( False, "name of backup job must be specified (--name/-n) when using the --run") + if not (args.time or args.days or args.full): log_and_exit( False, "backup interval not specified, use --time, --days or --full") if (args.time and args.days) or (args.time and args.full) or (args.days and args.full): log_and_exit( False, "specify ONLY one of --time, --days or --full as a backup interval") -if args.days: - backup_time_obj = datetime.now() - timedelta(days=args.days) - backup_time_wo_safety = backup_time_obj.timestamp() - backup_time = backup_time_wo_safety - args.safety -elif args.full: - backup_time = 0 -else: - backup_time = args.time - args.safety - +if args.safety: + local_config['safety_factor'] = args.safety if (args.verbose >= 2): - debug = True + local_config['verbosity'] = "debug" elif (args.verbose == 1): - info = True - -info_print("backup time is {}".format(backup_time)) + local_config['verbosity'] = "info" if (args.maxfiles != None): - max_files = args.maxfiles + local_config['max_files'] = args.maxfiles if (args.maxbytes != None): - max_bytes = args.maxbytes + local_config['max_bytes'] = args.maxbytes + +if args.checksrc and not args.nochecksrc: + local_config['check_src'] = "True" +elif args.nochecksrc and not args.checksrc: + local_config['check_src'] = "False" 
+elif args.nochecksrc and args.checksrc: + log_and_exit( False, "--checksrc and --nochecksrc cannot both be used") + +if args.checkdst and not args.nocheckdst: + local_config['check_dst'] = "True" +elif args.nocheckdst and not args.checkdst: + local_config['check_dst'] = "False" +elif args.nocheckdst and args.checkdst: + log_and_exit( False, "--checkdst and --nocheckdst cannot both be used") + +if args.checkpid and not args.nocheckpid: + local_config['check_pid'] = "True" +elif args.nocheckpid and not args.checkpid: + local_config['check_pid'] = "False" +elif args.nocheckpid and args.checkpid: + log_and_exit( False, "--checkpid and --nocheckpid cannot both be used") + +if args.checkspace and not args.nocheckspace: + local_config['check_space'] = "True" +elif args.nocheckspace and not args.checkspace: + local_config['check_space'] = "False" +elif args.nocheckspace and args.checkspace: + log_and_exit( False, "--checkspace and --nocheckspace cannot both be used") + + + +verbosity=local_config.get('verbosity') +backup_command = local_config.get('backup_command') +one_level_backup_command = local_config.get('one_level_backup_command') +max_files = local_config.getint('max_files') +max_bytes = local_config.getint('max_bytes') +safety_factor = local_config.getint('safety_factor') +check_src = local_config.getboolean('check_src') +check_dst = local_config.getboolean('check_dst') +check_pid = local_config.getboolean('check_pid') +check_space = local_config.getboolean('check_space') +free_bytes = local_config.getint('free_bytes') +log_dir = local_config.get('log_dir') +log_to_file = local_config.getboolean('log_to_file') +run = args.run +src = args.src +dst= args.dst + +if log_to_file: + log_file = os.path.join( log_dir, run_name + ".log" ) + info_print("stdout redirected to {}".format(log_file)) + sys.stdout = open(log_file, 'w') + +# do not check for pidfile if not running the rsyncs +if not run: + check_pid = False +if args.days: + backup_time_obj = datetime.now() - 
timedelta(days=args.days) + backup_time_wo_safety = backup_time_obj.timestamp() + backup_time = backup_time_wo_safety - safety_factor +elif args.full: + backup_time = 0 +else: + backup_time = args.time - safety_factor + +info_print("backup time is {}".format(backup_time)) rsync_cmd_list = [] +size_list = [] -def rsync_full( directory ): - generate_rsync( directory, False) +def rsync_full( directory, size ): + generate_rsync( directory, size, False) def rsync_one_level( directory ): - generate_rsync( directory, True) + generate_rsync( directory, 0, True) -def generate_rsync(directory, oneLevel): - source = os.path.abspath(os.path.join(args.src,directory)) - destination = os.path.abspath(os.path.join(args.dst,directory)) +def generate_rsync(directory, size, oneLevel): + source = os.path.abspath(os.path.join(src,directory)) + destination = os.path.abspath(os.path.join(dst,directory)) src_dest_list = [source + "/", destination] if (oneLevel): - cmd = rsync_cmd + one_lvl_filter + src_dest_list + cmd = one_level_backup_command.split() + src_dest_list else: - cmd = rsync_cmd + src_dest_list + cmd = backup_command.split() + src_dest_list rsync_cmd_list.append(cmd) - if not args.run: + size_list.append(size) + if not run: print(" ".join(shlex.quote(s) for s in cmd)) def run_rsync( cmd ): @@ -178,6 +271,10 @@ def get_rsubdirs( directory ): warn_print("error while getting rsubdirs of: {}".format(directory)) return {"out": 0, "stderr": rsubdirs_sp.stderr, "rc": rsubdirs_sp.returncode} +def get_fs_freespace( directory ): + statvfs = os.statvfs(directory) + return statvfs.f_frsize * statvfs.f_bavail + def recurse_rsync( directory ): debug_print("starting recurse rsync for directory {}".format(directory)) rctime = get_rctime( directory ) @@ -193,11 +290,11 @@ def recurse_rsync( directory ): if ((rfiles["rc"] == 0 and rfiles["out"] < max_files) and (rbytes["rc"] == 0 and rbytes["out"] < max_bytes)): info_print("directory {} has {} files and {} bytes, fewer than the max of {} files 
/ {} bytes".format(directory, rfiles["out"], rbytes["out"], max_files, max_bytes)) info_print("rsyncing full directory: {}".format(directory)) - rsync_full(directory) + rsync_full(directory, rbytes["out"]) elif (rsubdirs["rc"] == 0 and rsubdirs["out"] <= 1): # no subdirs = 1 subdir... info_print("directory {} is bigger than allowed ({}/{} files, {}/{} bytes), but has no subdirs".format(directory, rfiles["out"], max_files, rbytes["out"], max_bytes)) info_print("rsyncing full directory: {}".format(directory)) - rsync_full(directory) + rsync_full(directory, rbytes["out"]) else: info_print("directory {} is bigger than allowed ({}/{} files, {}/{} bytes) and has subdirs. rsyncing top level and recursing into dirs".format(directory, rfiles["out"], max_files, rbytes["out"], max_bytes)) rsync_one_level(directory) @@ -212,31 +309,67 @@ def recurse_rsync( directory ): warn_print( "nonzero exit code while getting rctime for {}, not backing up".format(directory) ) -if (args.checksrc): - srcmnt = os.path.ismount(args.src) + +if (check_pid): + pid_file_path = os.path.join( pid_dir, run_name + ".lock") + debug_print("pid check start - looking for pidfile at {}".format(pid_file_path)) + if os.access(pid_file_path, os.F_OK): + debug_print("pid file from previous run found") + pid_file = open(pid_file_path, "r") + pid_file.seek(0) + old_pid = pid_file.read().splitlines()[0] + debug_print("pid from previous run is {}".format(old_pid)) + if os.path.exists("/proc/%s" % old_pid): + debug_print("pid ({}) from previous run is still running".format(old_pid)) + log_and_exit(success=False, message="Previous {} job was still running (pid {}). 
Backup cannot start".format(run_name, old_pid)) + else: + debug_print( "previous pid file found for job {}, but program not running (pid {}), removing pid file".format(run_name, old_pid) ) + os.remove(pid_file_path) + + pid_file = open(pid_file_path, "w") + pid = os.getpid() + pid_file.write("%s" % pid) + debug_print("writing pid {} to {}".format(pid, pid_file_path)) + pid_file.close() + +if (check_src): + srcmnt = os.path.ismount(src) if not srcmnt: - log_and_exit(success=False, message="Source directory {} is not a mountpoint (and check specified)".format(args.src)) + log_and_exit(success=False, message="Source directory {} is not a mountpoint (and check specified)".format(src)) -if (args.checkdst): - dstmnt = os.path.ismount(args.dst) +if (check_dst): + dstmnt = os.path.ismount(dst) if not dstmnt: - log_and_exit(success=False, message="Destination directory {} is not a mountpoint (and check specified)".format(args.dst)) + log_and_exit(success=False, message="Destination directory {} is not a mountpoint (and check specified)".format(dst)) -debug_print( "changing to source directory {}".format(args.src) ) -os.chdir( args.src ) +debug_print( "changing to source directory {}".format(src) ) +os.chdir( src ) test_rctime = get_rctime( "." ) if (test_rctime["rc"] != 0): - log_and_exit(success=False, message="Source directory {} did not return a CephFS rctime, so probably is not a CephFS mount and cannot be backed up".format(args.src)) + log_and_exit(success=False, message="Source directory {} did not return a CephFS rctime, so probably is not a CephFS mount and cannot be backed up".format(src)) else: debug_print("src dir returned a cephFS rctime, starting ") +#time.sleep(10) recurse_rsync( "." 
) -if (args.run): +if (run): cmd_list_len = len(rsync_cmd_list) timestamp_print("{} rsync commands to run".format(cmd_list_len)) for (i, rsync_cmd) in enumerate(rsync_cmd_list): timestamp_print("{}/{} running '{}'".format(i+1, cmd_list_len, " ".join(rsync_cmd))) + if check_space: + debug_print("space check requested for destination") + cur_freespace = get_fs_freespace(dst) + next_rsync_size = size_list[i] + debug_print("next_rsync_size {}".format(next_rsync_size)) + free_space_after_rsync = cur_freespace - next_rsync_size + debug_print("space check: {} free space on destination, at most {} bytes in next rsync".format(cur_freespace, next_rsync_size)) + if free_space_after_rsync < free_bytes: + log_and_exit(success=False, message="backup aborted due to possibility of free space on destination dropping below free space threshold. {} out of {} required free bytes were avaliable ({} current free space, {} remote dir size)".format(free_space_after_rsync, free_bytes, cur_freespace, next_rsync_size)) + else: + info_print("space check ok - at least {} free bytes will be avaliable after this rsync ({} required, {} current free space, {} remote dir size)".format(free_space_after_rsync, free_bytes, cur_freespace, next_rsync_size)) + rsync = run_rsync(rsync_cmd) if (rsync["rc"] == 0): timestamp_print("Success") From 2e3617509fe065f94b0852446118318c980a8eb6 Mon Sep 17 00:00:00 2001 From: Tom Byrne Date: Wed, 20 Jul 2022 10:27:02 +0100 Subject: [PATCH 09/15] change config file location to better match naming of log and lock dirs --- cephfs/recursive-backup/recursive-backup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cephfs/recursive-backup/recursive-backup.py b/cephfs/recursive-backup/recursive-backup.py index 5d00148..2678fc9 100755 --- a/cephfs/recursive-backup/recursive-backup.py +++ b/cephfs/recursive-backup/recursive-backup.py @@ -102,7 +102,7 @@ def log_and_exit ( success, message ): args = parser.parse_args() 
-config.read('/etc/recursive-backup/config.ini') +config.read('/etc/ceph-fs-backup/config.ini') if args.name: run_name=args.name From 395e9bc4372c8a366eec95b7c69e3727d7868da0 Mon Sep 17 00:00:00 2001 From: Tom Byrne Date: Wed, 20 Jul 2022 10:28:09 +0100 Subject: [PATCH 10/15] bugfix: add missing pid_dir definition --- cephfs/recursive-backup/recursive-backup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cephfs/recursive-backup/recursive-backup.py b/cephfs/recursive-backup/recursive-backup.py index 2678fc9..0a04d06 100755 --- a/cephfs/recursive-backup/recursive-backup.py +++ b/cephfs/recursive-backup/recursive-backup.py @@ -178,6 +178,7 @@ def log_and_exit ( success, message ): check_space = local_config.getboolean('check_space') free_bytes = local_config.getint('free_bytes') log_dir = local_config.get('log_dir') +pid_dir = local_config.get('pid_dir') log_to_file = local_config.getboolean('log_to_file') run = args.run src = args.src From 50f4846cc316fc6054ed6fafb27718aa438fd1bc Mon Sep 17 00:00:00 2001 From: Tom Byrne Date: Wed, 20 Jul 2022 11:03:39 +0100 Subject: [PATCH 11/15] rename script and improve naming consistency --- cephfs/recursive-backup/README.md | 4 ++-- .../{recursive-backup.py => cephfs-recursive-backup.py} | 8 ++++---- cephfs/recursive-backup/config.ini | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) rename cephfs/recursive-backup/{recursive-backup.py => cephfs-recursive-backup.py} (98%) diff --git a/cephfs/recursive-backup/README.md b/cephfs/recursive-backup/README.md index d22a927..e27bf13 100644 --- a/cephfs/recursive-backup/README.md +++ b/cephfs/recursive-backup/README.md @@ -6,7 +6,7 @@ This script generates rsync commands to back up changes to a CephFS filesystem s Using the --run option will allow the script to also run the generated rsyncs. 
In this mode it outputs a summary line when the script exits, which can be redirected to a log, and the nagios/icinga check can be used to report on the state of and time since the last backup. Any non zero exit code from any of the backup runs will cause the backup to be marked as failed. -The config file `/etc/recursive-backup/config.ini` can be used to alter defaults, as well as set specific settings for named jobs (with the -n option). An example config file is present in this repo with all avaliable config file options specified, but it is not compulsory. Command line options will always override config file settings. +The config file `/etc/cephfs-recursive-backup/config.ini` can be used to alter defaults, as well as set specific settings for named jobs (with the -n option). An example config file is present in this repo with all avaliable config file options specified, but it is not compulsory. Command line options will always override config file settings. ## Example usage @@ -15,7 +15,7 @@ TODO ## Recursive backup script usage ``` -usage: recursive-backup.py [-h] [-n NAME] [-t TIME] [-d DAYS] [--full] +usage: cephfs-recursive-backup.py [-h] [-n NAME] [-t TIME] [-d DAYS] [--full] [-f MAXFILES] [-b MAXBYTES] [-s SAFETY] [--checksrc] [--nochecksrc] [--checkdst] [--nocheckdst] [--checkpid] [--nocheckpid] diff --git a/cephfs/recursive-backup/recursive-backup.py b/cephfs/recursive-backup/cephfs-recursive-backup.py similarity index 98% rename from cephfs/recursive-backup/recursive-backup.py rename to cephfs/recursive-backup/cephfs-recursive-backup.py index 0a04d06..4566ca4 100755 --- a/cephfs/recursive-backup/recursive-backup.py +++ b/cephfs/recursive-backup/cephfs-recursive-backup.py @@ -58,7 +58,7 @@ def log_and_exit ( success, message ): attrcmd = ["getfattr", "--only-values", "-n"] verbosity="debug" -run_name="recursive-backup" +run_name="cephfs-recursive-backup" config = configparser.ConfigParser() # populate defaults @@ -71,11 +71,11 @@ def log_and_exit ( 
success, message ): config['DEFAULT']['check_src']="true" config['DEFAULT']['check_dst']="true" config['DEFAULT']['check_pid']="true" -config['DEFAULT']['pid_dir']="/var/run/ceph-fs-backup" +config['DEFAULT']['pid_dir']="/var/run/cephfs-recursive-backup" config['DEFAULT']['check_space']="false" config['DEFAULT']['free_bytes']="10000000000" config['DEFAULT']['log_to_file']="false" -config['DEFAULT']['log_dir']="/var/log/ceph-fs-backup" +config['DEFAULT']['log_dir']="/var/log/cephfs-recursive-backup" parser = argparse.ArgumentParser(description="Generate rsync commands to back up changes to a CephFS filesystem since a given time. Uses the CephFS rctime to determine what has changed, and uses rfiles and rbytes to determine the size of individual rsyncs. By default it will do nothing apart from outputting rsync commands to standard output.") @@ -102,7 +102,7 @@ def log_and_exit ( success, message ): args = parser.parse_args() -config.read('/etc/ceph-fs-backup/config.ini') +config.read('/etc/cephfs-recursive-backup/config.ini') if args.name: run_name=args.name diff --git a/cephfs/recursive-backup/config.ini b/cephfs/recursive-backup/config.ini index 28f6a43..87146bd 100644 --- a/cephfs/recursive-backup/config.ini +++ b/cephfs/recursive-backup/config.ini @@ -7,8 +7,8 @@ safety_factor=3600 verbosity=info check_src=true check_dst=true -pid_dir=/var/run/recursive-backup -log_dir=/var/log/recursive-backup +pid_dir=/var/run/cephfs-recursive-backup +log_dir=/var/log/cephfs-recursive-backup log_to_file=false check_pid=false From c1cee12faf09e53bd9e4f3594558d5891d97b17a Mon Sep 17 00:00:00 2001 From: Tom Byrne Date: Wed, 6 Mar 2024 15:01:01 +0000 Subject: [PATCH 12/15] cephfs-recursive-backup: utf-8 decode rsync stderr/out --- cephfs/recursive-backup/cephfs-recursive-backup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cephfs/recursive-backup/cephfs-recursive-backup.py b/cephfs/recursive-backup/cephfs-recursive-backup.py index 4566ca4..0bb52b0 100755 
--- a/cephfs/recursive-backup/cephfs-recursive-backup.py +++ b/cephfs/recursive-backup/cephfs-recursive-backup.py @@ -374,11 +374,11 @@ def recurse_rsync( directory ): rsync = run_rsync(rsync_cmd) if (rsync["rc"] == 0): timestamp_print("Success") - print(rsync["stdout"].decode('ascii')) + print(rsync["stdout"].decode()) else: timestamp_print("failure (return code: {})".format(rsync["rc"])) - print(rsync["stdout"].decode('ascii')) - print(rsync["stderr"].decode('ascii')) + print(rsync["stdout"].decode()) + print(rsync["stderr"].decode()) warn_print("rsync failed, continuing") if warnings is True: log_and_exit(success=False, message="backup started at {} ({}) finished with warnings".format(time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(start_time)), start_time)) From defbb2656713bcd4e72e4b058cfa732a1acf39a0 Mon Sep 17 00:00:00 2001 From: Tom Byrne Date: Wed, 6 Mar 2024 15:02:09 +0000 Subject: [PATCH 13/15] cephfs-recursive-backup: don't overwrite the log file --- cephfs/recursive-backup/cephfs-recursive-backup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cephfs/recursive-backup/cephfs-recursive-backup.py b/cephfs/recursive-backup/cephfs-recursive-backup.py index 0bb52b0..095a373 100755 --- a/cephfs/recursive-backup/cephfs-recursive-backup.py +++ b/cephfs/recursive-backup/cephfs-recursive-backup.py @@ -187,7 +187,7 @@ def log_and_exit ( success, message ): if log_to_file: log_file = os.path.join( log_dir, run_name + ".log" ) info_print("stdout redirected to {}".format(log_file)) - sys.stdout = open(log_file, 'w') + sys.stdout = open(log_file, 'w+') # do not check for pidfile if not running the rsyncs if not run: From 5f1ff18b21b4cb30447c19c0663453792801a93e Mon Sep 17 00:00:00 2001 From: Tom Byrne Date: Tue, 4 Feb 2025 14:55:38 +0000 Subject: [PATCH 14/15] Fix warning for syntax error --- cephfs/recursive-backup/icinga_cephfs_backup_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/cephfs/recursive-backup/icinga_cephfs_backup_check.py b/cephfs/recursive-backup/icinga_cephfs_backup_check.py index f324c7d..1e5ea22 100755 --- a/cephfs/recursive-backup/icinga_cephfs_backup_check.py +++ b/cephfs/recursive-backup/icinga_cephfs_backup_check.py @@ -56,7 +56,7 @@ def readable_timedelta(duration: timedelta): print("Error reading backup log file") sys.exit(CRIT) -if last_summary is "NONE": +if last_summary == "NONE": print("No backup summary found in log file") sys.exit(CRIT) From e2a4c26a6794496d2387a394c9da0651d2d2b860 Mon Sep 17 00:00:00 2001 From: Tom Byrne Date: Wed, 12 Feb 2025 15:27:23 +0000 Subject: [PATCH 15/15] use correct append flag when opening log file --- cephfs/recursive-backup/cephfs-recursive-backup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cephfs/recursive-backup/cephfs-recursive-backup.py b/cephfs/recursive-backup/cephfs-recursive-backup.py index 095a373..1b3de63 100755 --- a/cephfs/recursive-backup/cephfs-recursive-backup.py +++ b/cephfs/recursive-backup/cephfs-recursive-backup.py @@ -187,7 +187,7 @@ def log_and_exit ( success, message ): if log_to_file: log_file = os.path.join( log_dir, run_name + ".log" ) info_print("stdout redirected to {}".format(log_file)) - sys.stdout = open(log_file, 'w+') + sys.stdout = open(log_file, 'a') # do not check for pidfile if not running the rsyncs if not run: