From 014b8852c646fb52e064baadc758f37dd2754747 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Mon, 22 Apr 2024 12:03:30 -0400 Subject: [PATCH 01/35] rebase packaging functions to shared module --- src/digarch_scripts/package/package_base.py | 159 ++++++++++++ tests/test_package_base.py | 261 ++++++++++++++++++++ 2 files changed, 420 insertions(+) create mode 100644 src/digarch_scripts/package/package_base.py create mode 100644 tests/test_package_base.py diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py new file mode 100644 index 0000000..0c63e60 --- /dev/null +++ b/src/digarch_scripts/package/package_base.py @@ -0,0 +1,159 @@ +import argparse +import logging +import os +import re +from datetime import date +from pathlib import Path + +import bagit + +LOGGER = logging.getLogger(__name__) +LOGGER.setLevel(logging.INFO) + + +def parse_args() -> argparse.Namespace: + def extant_path(p: str) -> Path: + path = Path(p) + if not path.exists(): + raise argparse.ArgumentTypeError(f"{path} does not exist") + return path + + def digital_carrier_label(id: str) -> Path: + pattern = r"ACQ_\d{4}_\d{6}" + if not re.match(r"ACQ_\d{4}_\d{6}", id): + raise argparse.ArgumentTypeError( + f"{id} does not match the expected {type} pattern, {pattern}" + ) + return id + + parser = argparse.ArgumentParser(description="test") + parser.add_argument("--payload", required=True, type=extant_path) + parser.add_argument("--log", required=True, type=extant_path) + parser.add_argument("--md5", required=True, type=extant_path) + parser.add_argument("--dest", required=True, type=extant_path) + parser.add_argument("--id", required=True, type=digital_carrier_label) + + return parser.parse_args() + + +def create_base_dir(dest: Path, id: str) -> Path: + acq_id = id.rsplit("_", 1)[0] + package_base = dest / acq_id / id + if package_base.exists(): + raise FileExistsError( + f"{package_base} already exists. Make sure you are using the correct ID" + ) + + try: + package_base.mkdir(parents=True) + except PermissionError: + raise PermissionError(f"{dest} is not writable") + return package_base + + +def move_metadata_file(md_path: Path, pkg_dir: Path) -> None: + md_dir = pkg_dir / "metadata" + if not md_dir.exists(): + md_dir.mkdir() + + new_md_path = md_dir / md_path.name + if new_md_path.exists(): + raise FileExistsError(f"{new_md_path} already exists. Not moving.") + + md_path.rename(new_md_path) + return None + + +def move_metadata_files(md_paths: list[Path], pkg_dir: Path) -> None: + for md_path in md_paths: + try: + move_metadata_file(md_path, pkg_dir) + except FileExistsError as e: + raise Warning( + f"{e} One or more metadata files may have already been moved to new location" + ) + return None + + +def create_bag_in_objects(payload_path: Path, md5_path: Path, pkg_dir: Path) -> None: + bag_dir = pkg_dir / "objects" + bag_dir.mkdir() + move_payload(payload_path, bag_dir) + convert_rclone_md5_to_bagit_manifest(md5_path, bag_dir) + # generate baginfo.txt and bagit.txt (copying code snippet from bagit) + create_bag_tag_files(bag_dir) + return None + + +def move_payload(payload_path: Path, bag_dir: Path) -> None: + # instantiate a var for objects dir + payload_dir = bag_dir / "data" + # if the object folder does not exist create it + if not payload_dir.exists(): + payload_dir.mkdir(parents=True) + else: + raise FileExistsError(f"{payload_dir} already exists. Not moving files.") + + for a_file in payload_path.iterdir(): + new_ob_path = payload_dir / a_file.name + # if a payload file is already in the object directory do not move, raise error + if new_ob_path.exists(): + raise FileExistsError(f"{new_ob_path} already exists. Not moving.") + + a_file.rename(new_ob_path) + return None + + +def convert_rclone_md5_to_bagit_manifest(md5_path: Path, bag_dir: Path) -> None: + # check for manifest + new_md5_path = bag_dir / "manifest-md5.txt" + if new_md5_path.exists(): + raise FileExistsError("manifest-md5.txt already exists, review package") + + with open(md5_path, "r") as f: + manifest_data = f.readlines() + + updated_manifest = [line.replace(" ", " data/") for line in manifest_data] + # re-writes the manifest lines + with open(md5_path, "w") as f: + f.writelines(updated_manifest) + # move md5 file to manifest-md5.txt in bag + md5_path.rename(new_md5_path) + + return None + + +def create_bag_tag_files(bag_dir: Path): + txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n""" + with open(bag_dir / "bagit.txt", "w") as bagit_file: + bagit_file.write(txt) + + bag_info = {} + bag_info["Bagging-Date"] = date.strftime(date.today(), "%Y-%m-%d") + bag_info["Bag-Software-Agent"] = "package_cloud.py" + total_bytes, total_files = get_oxum(bag_dir / "data") + bag_info["Payload-Oxum"] = f"{total_bytes}.{total_files}" + bagit._make_tag_file(bag_dir / "bag-info.txt", bag_info) + + +def get_oxum(payload_dir: Path) -> tuple[int, int]: + total_bytes = 0 + total_files = 0 + + for payload_file in payload_dir.rglob("*"): + if payload_file.is_file(): + total_files += 1 + total_bytes += os.stat(payload_file).st_size + + return total_bytes, total_files + + +def validate_bag_in_payload(pkg_dir: Path) -> None: + bag_dir = pkg_dir / "objects" + bag = bagit.Bag(str(bag_dir)) + try: + bag.validate(completeness_only=True) + LOGGER.info(f"{bag.path} is valid.") + except bagit.BagValidationError: + LOGGER.warning(f"{bag.path} is not valid. Check the bag manifest and oxum.") + return None diff --git a/tests/test_package_base.py b/tests/test_package_base.py new file mode 100644 index 0000000..a075586 --- /dev/null +++ b/tests/test_package_base.py @@ -0,0 +1,261 @@ +import os +import shutil +from pathlib import Path + +import bagit +import pytest + +import digarch_scripts.package.package_base as pb + + +@pytest.fixture +def transfer_files(tmp_path: Path, request): + fixture_data = Path(request.module.__file__).parent / "fixtures" / "cloud" + shutil.copytree(fixture_data, tmp_path, dirs_exist_ok=True) + return tmp_path + + +@pytest.fixture +def payload(transfer_files): + return transfer_files / "rclone_files" + + +@pytest.fixture +def md5_manifest(transfer_files): + return transfer_files / "rclone.md5" + + +@pytest.fixture +def log(transfer_files): + return transfer_files / "rclone.log" + + +@pytest.fixture +def id(): + return "ACQ_1234_123456" + + +def args(transfer_files): + args = [ + transfer_files / "rclone.md5", + transfer_files / "rclone.log", + transfer_files, + ] + return args + + +def test_create_package_basedir_exc_on_readonly(tmp_path: Path, id: str): + """Test that package folder maker reports permission error""" + + # make folder read-only + os.chmod(tmp_path, 0o500) + + with pytest.raises(PermissionError) as exc: + pb.create_base_dir(tmp_path, id) + + # change back to allow clean-up (might not be necessary) + os.chmod(tmp_path, 0o777) + assert f"{str(tmp_path)} is not writable" in str(exc.value) + + +def test_create_package_basedir(tmp_path: Path, id: str): + """Test that package folder maker makes ACQ and Carrier folders""" + + base_dir = pb.create_base_dir(tmp_path, id) + + assert base_dir.name == id + assert base_dir.parent.name == id[:-7] + + +def test_create_package_basedir_with_existing_acq_dir(tmp_path: Path, id: str): + """Test that package folder maker respect existing ACQ folder""" + + (tmp_path / id[:-7]).mkdir() + base_dir = pb.create_base_dir(tmp_path, id) + + assert base_dir.name == id + assert base_dir.parent.name == id[:-7] + + +def test_error_on_existing_package_dir(tmp_path: Path, id: str): + """Test that package folder maker errors if carrier folder exists""" + + base_dir = tmp_path / id[:-7] / id + base_dir.mkdir(parents=True) + + with pytest.raises(FileExistsError) as exc: + pb.create_base_dir(tmp_path, id) + + assert f"{base_dir} already exists. Make sure you are using the correct ID" in str( + exc.value + ) + + +@pytest.fixture +def package_base_dir(tmp_path: Path, id: str): + return pb.create_base_dir(tmp_path, id) + + +def test_move_metadata(package_base_dir: Path, log: Path): + """Test that metadata folder and log file are moved successfully""" + + pb.move_metadata_file(log, package_base_dir) + + assert not log.exists() + assert (package_base_dir / "metadata" / "rclone.log").exists() + + +def test_do_not_overwrite_metadata(package_base_dir: Path, log: Path): + """Test that log file is not moved if a same name file exists in dest""" + + rclone_log = package_base_dir / "metadata" / log.name + rclone_log.parent.mkdir() + rclone_log.touch() + + with pytest.raises(FileExistsError) as exc: + pb.move_metadata_file(log, package_base_dir) + + assert log.exists() + assert f"{rclone_log} already exists. Not moving." in str(exc.value) + + +def test_move_multiple_metadata(package_base_dir: Path, log: Path, md5_manifest: Path): + """Test that multiple files are moved successfully""" + + md_files = [log, md5_manifest] + pb.move_metadata_files(md_files, package_base_dir) + + for md_file in md_files: + assert not md_file.exists() + assert (package_base_dir / "metadata" / md_file.name).exists() + + +def test_partial_halt_multiple_metadata( + package_base_dir: Path, log: Path, md5_manifest: Path +): + """Test that warning is issued for multiple move if a single metadata move fails""" + + rclone_log = package_base_dir / "metadata" / log.name + rclone_log.parent.mkdir() + rclone_log.touch() + + md_files = [log, md5_manifest] + + with pytest.raises(Warning) as exc: + pb.move_metadata_files(md_files, package_base_dir) + + assert log.exists() + assert ( + f"already exists. Not moving. One or more metadata files may have already been moved to new location" + in str(exc.value) + ) + + +def test_move_payload(package_base_dir: Path, payload: Path): + """Test that entirety of payload is moved and hierarchy is preserved""" + + source_contents = [file.relative_to(payload) for file in payload.rglob("*")] + + data_path = package_base_dir / "objects" / "data" + pb.move_payload(payload, package_base_dir / "objects") + + # check that source is empty + assert not any(payload.iterdir()) + + assert data_path.exists() + + # compare contents of data and former source + data_contents = [file.relative_to(data_path) for file in data_path.rglob("*")] + assert source_contents == data_contents + + +def test_do_not_overwrite_payload(package_base_dir: Path, payload: Path): + """Test that no payload file is moved if /data exists""" + + source_contents = [file for file in payload.rglob("*")] + + bag_payload = package_base_dir / "objects" / "data" + bag_payload.mkdir(parents=True) + + with pytest.raises(FileExistsError) as exc: + pb.move_payload(payload, package_base_dir / "objects") + + # check source has not changed + assert source_contents == [file for file in payload.rglob("*")] + assert f"{bag_payload} already exists. Not moving files." in str(exc.value) + + +@pytest.fixture +def bag_payload(package_base_dir: Path, payload: Path): + pb.move_payload(payload, package_base_dir) + bag_payload = package_base_dir / "data" + + return bag_payload + + +def test_convert_rclone_md5(bag_payload: Path, md5_manifest: Path): + pb.convert_rclone_md5_to_bagit_manifest(md5_manifest, bag_payload.parent) + bag_md5 = bag_payload.parent / "manifest-md5.txt" + + # Get path to correct payload in data + # read md5 and extract filepaths + with open(bag_md5) as m: + md5_paths = [line.strip().split(" ")[-1] for line in m.readlines()] + + payload_files = [ + str(path.relative_to(bag_payload.parent)) for path in bag_payload.rglob("*") + ] + for a_file in md5_paths: + assert a_file in payload_files + + +def test_create_bag(package_base_dir: Path, payload: Path, md5_manifest: Path): + """Test that all tag files are created and rclone md5sums are correctly converted""" + + bag_path = package_base_dir / "objects" + + # might need further testing of the oxum and manifest converter functions + pb.create_bag_in_objects(payload, md5_manifest, package_base_dir) + + assert bagit.Bag(str(bag_path)).validate(completeness_only=True) + + +def test_generate_valid_oxum(transfer_files: Path): + """Test that script generates oxum correctly""" + # test with entire fixture to text folder recursion + + total_bytes, total_files = pb.get_oxum(transfer_files) + + assert total_bytes == 59286 + assert total_files == 12 + + +def test_validate_valid_bag(transfer_files: Path, caplog): + """Test the log message""" + + # create tiny bag for testing + object_dir = transfer_files / "objects" + object_dir.mkdir() + (transfer_files / "rclone.md5").rename(object_dir / "rlcone.md5") + test_bag = bagit.make_bag(object_dir) + + pb.validate_bag_in_payload(transfer_files) + + assert f"{test_bag.path} is valid." in caplog.text + + +def test_validate_invalid_bag(transfer_files, caplog): + """Test the log message if the bag isn't valid for some reason""" + + object_dir = transfer_files / "objects" + object_dir.mkdir() + (transfer_files / "rclone.md5").rename(object_dir / "rlcone.md5") + + test_bag = bagit.make_bag(object_dir) + print(list(Path(test_bag.path).iterdir())) + (Path(test_bag.path) / "bag-info.txt").unlink() + pb.validate_bag_in_payload(transfer_files) + + assert ( + f"{test_bag.path} is not valid. Check the bag manifest and oxum." in caplog.text + ) From fb7ae6677a0fd0a8eb56a9e876fe51a80d037e03 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Mon, 22 Apr 2024 12:03:52 -0400 Subject: [PATCH 02/35] remove now-shared functions --- src/digarch_scripts/package/package_cloud.py | 123 +---------- tests/test_package_cloud.py | 206 +------------------ 2 files changed, 15 insertions(+), 314 deletions(-) diff --git a/src/digarch_scripts/package/package_cloud.py b/src/digarch_scripts/package/package_cloud.py index 23e033a..2ece415 100644 --- a/src/digarch_scripts/package/package_cloud.py +++ b/src/digarch_scripts/package/package_cloud.py @@ -1,12 +1,14 @@ import argparse -from datetime import date import logging import os -from pathlib import Path import re +from datetime import date +from pathlib import Path import bagit +import digarch_scripts.package.package_base as pb + LOGGER = logging.getLogger(__name__) LOGGER.setLevel(logging.INFO) @@ -35,122 +37,15 @@ def digital_carrier_label(id: str) -> Path: return parser.parse_args() -def create_base_dir(dest: Path, id: str) -> Path: - acq_id = id.rsplit("_", 1)[0] - package_base = dest / acq_id / id - if package_base.exists(): - raise FileExistsError( - f"{package_base} already exists. Make sure you are using the correct ID" - ) - - try: - package_base.mkdir(parents=True) - except PermissionError: - raise PermissionError(f"{dest} is not writable") - return package_base - -def move_metadata_file(md_path: Path, pkg_dir: Path) -> None: - md_dir = pkg_dir / "metadata" - if not md_dir.exists(): - md_dir.mkdir() - - new_md_path = md_dir / md_path.name - if new_md_path.exists(): - raise FileExistsError(f"{new_md_path} already exists. Not moving.") - - md_path.rename(new_md_path) - return None - -def create_bag_in_objects(payload_path: Path, md5_path: Path, pkg_dir: Path) -> None: - bag_dir = pkg_dir / "objects" - bag_dir.mkdir() - move_payload(payload_path, bag_dir) - convert_to_bagit_manifest(md5_path, bag_dir) - # generate baginfo.txt and bagit.txt (copying code snippet from bagit) - create_bag_tag_files(bag_dir) - return None - -def move_payload(payload_path: Path, bag_dir: Path) -> None: - #instantiate a var for objects dir - payload_dir = bag_dir / "data" - #if the object folder does not exist create it - if not payload_dir.exists(): - payload_dir.mkdir(parents=True) - else: - raise FileExistsError(f"{payload_dir} already exists. Not moving files.") - - for a_file in payload_path.iterdir(): - new_ob_path = payload_dir / a_file.name - #if a payload file is already in the object directory do not move, raise error - if new_ob_path.exists(): - raise FileExistsError(f"{new_ob_path} already exists. Not moving.") - - a_file.rename(new_ob_path) - return None - -def convert_to_bagit_manifest(md5_path: Path, bag_dir: Path) -> None: - #check for manifest - new_md5_path = bag_dir / "manifest-md5.txt" - if new_md5_path.exists(): - raise FileExistsError("manifest-md5.txt already exists, review package") - - with open(md5_path, "r") as f: - manifest_data = f.readlines() - - updated_manifest = [ - line.replace(" ", " data/") for line in manifest_data - ] - #re-writes the manifest lines - with open(md5_path, "w") as f: - f.writelines(updated_manifest) - #move md5 file to manifest-md5.txt in bag - md5_path.rename(new_md5_path) - - return None - -def create_bag_tag_files(bag_dir: Path): - txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n""" - with open(bag_dir / "bagit.txt", "w") as bagit_file: - bagit_file.write(txt) - - bag_info = {} - bag_info["Bagging-Date"] = date.strftime(date.today(), "%Y-%m-%d") - bag_info["Bag-Software-Agent"] = "package_cloud.py" - total_bytes, total_files = get_oxum(bag_dir / "data") - bag_info["Payload-Oxum"] = f"{total_bytes}.{total_files}" - bagit._make_tag_file(bag_dir / "bag-info.txt", bag_info) - - -def get_oxum(payload_dir: Path) -> (int, int): - total_bytes = 0 - total_files = 0 - - for payload_file in payload_dir.rglob('*'): - if payload_file.is_file(): - total_files += 1 - total_bytes += os.stat(payload_file).st_size - - return total_bytes, total_files - - -def validate_bag_in_payload(pkg_dir: Path) -> None: - bag_dir = pkg_dir / "objects" - bag = bagit.Bag(str(bag_dir)) - try: - bag.validate(completeness_only=True) - LOGGER.info(f"{bag.path} is valid.") - except bagit.BagValidationError: - LOGGER.warn(f"{bag.path} is not valid. Check the bag manifest and oxum.") - return None - def main(): args = parse_args() - base_dir = create_base_dir(args.dest, args.id) - move_metadata_file(args.log, base_dir) - create_bag_in_objects(args.payload, args.md5, base_dir) - validate_bag_in_payload(base_dir) + base_dir = pb.create_base_dir(args.dest, args.id) + pb.move_metadata_file(args.log, base_dir) + pb.create_bag_in_objects(args.payload, args.md5, base_dir) + pb.validate_bag_in_payload(base_dir) + if __name__ == "__main__": main() diff --git a/tests/test_package_cloud.py b/tests/test_package_cloud.py index 55e5561..d7d5702 100644 --- a/tests/test_package_cloud.py +++ b/tests/test_package_cloud.py @@ -1,12 +1,12 @@ -import digarch_scripts.package.package_cloud as pc - import argparse import os -from pathlib import Path -import pytest import shutil +from pathlib import Path import bagit +import pytest + +import digarch_scripts.package.package_cloud as pc @pytest.fixture @@ -86,200 +86,6 @@ def test_id_arg_must_match_pattern( assert f"bad_id does not match" in stderr -def test_create_package_basedir_exc_on_readonly(tmp_path: Path, args: list): - """Test that package folder maker reports permission error""" - - id = args[-1] - # make folder read-only - os.chmod(tmp_path, 0o500) - - with pytest.raises(PermissionError) as exc: - pc.create_base_dir(tmp_path, id) - - # change back to allow clean-up (might not be necessary) - os.chmod(tmp_path, 0o777) - assert f"{str(tmp_path)} is not writable" in str(exc.value) - - -def test_create_package_basedir(tmp_path: Path, args: list): - """Test that package folder maker makes ACQ and Carrier folders""" - - id = args[-1] - base_dir = pc.create_base_dir(tmp_path, args[-1]) - - assert base_dir.name == id - assert base_dir.parent.name == id[:-7] - - -def test_create_package_basedir_with_existing_acq_dir(tmp_path: Path, args: list): - """Test that package folder maker respect existing ACQ folder""" - - id = args[-1] - (tmp_path / id[:-7]).mkdir() - base_dir = pc.create_base_dir(tmp_path, args[-1]) - - assert base_dir.name == id - assert base_dir.parent.name == id[:-7] - - -def test_error_on_existing_package_dir(tmp_path: Path, args: list): - """Test that package folder maker errors if carrier folder exists""" - - id = args[-1] - base_dir = tmp_path / id[:-7] / id - base_dir.mkdir(parents=True) - - with pytest.raises(FileExistsError) as exc: - pc.create_base_dir(tmp_path, id) - - assert f"{base_dir} already exists. Make sure you are using the correct ID" in str( - exc.value - ) - - -@pytest.fixture -def package_base_dir(tmp_path: Path, args: list): - return pc.create_base_dir(tmp_path, args[-1]) - - -def test_move_metadata(transfer_files: Path, package_base_dir: Path): - """Test that metadata folder and log file are moved successfully""" - - source_log = transfer_files / "rclone.log" - pc.move_metadata_file(source_log, package_base_dir) - - assert not source_log.exists() - assert (package_base_dir / "metadata" / "rclone.log").exists() - - -def test_do_not_overwrite_metadata(transfer_files: Path, package_base_dir: Path): - """Test that log file is not moved if a same name file exists in dest""" - - source_log = transfer_files / "rclone.log" - rclone_log = package_base_dir / "metadata" / "rclone.log" - rclone_log.parent.mkdir() - rclone_log.touch() - - with pytest.raises(FileExistsError) as exc: - pc.move_metadata_file(source_log, package_base_dir) - - assert source_log.exists() - assert f"{rclone_log} already exists. Not moving." in str(exc.value) - -def test_move_payload(transfer_files: Path, package_base_dir: Path): - """Test that entirety of payload is moved and hierarchy is preserved""" - - source_payload = transfer_files / "rclone_files" - source_contents = [ - file.relative_to(source_payload) for file in source_payload.rglob("*") - ] - - data_path = package_base_dir / "objects" / "data" - pc.move_payload(source_payload, package_base_dir / "objects") - - # check that source is empty - assert not any(source_payload.iterdir()) - - assert data_path.exists() - - # compare contents of data and former source - data_contents = [file.relative_to(data_path) for file in data_path.rglob("*")] - assert source_contents == data_contents - - -def test_do_not_overwrite_payload(transfer_files: Path, package_base_dir: Path): - """Test that no payload file is moved if /data exists""" - - source_payload = transfer_files / "rclone_files" - source_contents = [file for file in source_payload.rglob("*")] - - bag_payload = package_base_dir / "objects" / "data" - bag_payload.mkdir(parents=True) - - with pytest.raises(FileExistsError) as exc: - pc.move_payload(source_payload, package_base_dir / "objects") - - # check source has not changed - assert source_contents == [file for file in source_payload.rglob("*")] - assert f"{bag_payload} already exists. Not moving files." in str(exc.value) - -@pytest.fixture -def bag_payload(transfer_files: Path, package_base_dir: Path): - pc.move_payload(transfer_files / "rclone_files", package_base_dir) - bag_payload = package_base_dir / "data" - - return bag_payload - -def test_convert_md5(bag_payload: Path, transfer_files: Path): - rclone_md5 = transfer_files / "rclone.md5" - pc.convert_to_bagit_manifest(rclone_md5, bag_payload.parent) - bag_md5 = bag_payload.parent / "manifest-md5.txt" - - # Get path to correct payload in data - # read md5 and extract filepaths - with open(bag_md5) as m: - md5_paths = [line.strip().split(' ')[-1] for line in m.readlines()] - - payload_files = [ - str(path.relative_to(bag_payload.parent)) for path in bag_payload.rglob('*') - ] - for a_file in md5_paths: - assert a_file in payload_files - - -def test_create_bag(transfer_files: Path, package_base_dir: Path): - """Test that all tag files are created and rclone md5sums are correctly converted""" - - md5_path = transfer_files / "rclone.md5" - bag_path = package_base_dir / "objects" - - # might need further testing of the oxum and manifest converter functions - pc.create_bag_in_objects( - transfer_files / "rclone_files", md5_path, package_base_dir - ) - - assert bagit.Bag(str(bag_path)).validate(completeness_only=True) - - -def test_generate_valid_oxum(transfer_files: Path): - """Test that script generates oxum correctly""" - - total_bytes, total_files = pc.get_oxum(transfer_files) - - assert total_bytes == 59286 - assert total_files == 12 - - -def test_validate_valid_bag(transfer_files: Path, caplog): - """Test the log message""" - - object_dir = transfer_files / "objects" - object_dir.mkdir() - (transfer_files / "rclone.md5").rename(object_dir / "rlcone.md5") - - test_bag = bagit.make_bag(object_dir) - - pc.validate_bag_in_payload(transfer_files) - - assert f"{test_bag.path} is valid." in caplog.text - - -def test_validate_invalid_bag(transfer_files, caplog): - """Test the log message if the bag isn't valid for some reason""" - - object_dir = transfer_files / "objects" - object_dir.mkdir() - (transfer_files / "rclone.md5").rename(object_dir / "rlcone.md5") - - test_bag = bagit.make_bag(object_dir) - print(list(Path(test_bag.path).iterdir())) - (Path(test_bag.path) / 'bag-info.txt').unlink() - pc.validate_bag_in_payload(transfer_files) - - - assert f"{test_bag.path} is not valid. Check the bag manifest and oxum." in caplog.text - - def test_full_run( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list ): @@ -290,6 +96,6 @@ def test_full_run( pkg_dir = Path(args[-3]) / args[-1][:-7] / args[-1] assert pkg_dir.exists() - assert bagit.Bag(str(pkg_dir / 'objects')).validate() + assert bagit.Bag(str(pkg_dir / "objects")).validate() - assert 'rclone.log' in [x.name for x in (pkg_dir / 'metadata').iterdir()] + assert "rclone.log" in [x.name for x in (pkg_dir / "metadata").iterdir()] From 0de76bd724cc4238d1cb3eb4d9d555a9c313dda1 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Mon, 22 Apr 2024 12:26:21 -0400 Subject: [PATCH 03/35] genericize file moving, TODO tests --- src/digarch_scripts/package/package_base.py | 55 +++++++++++++++++---- tests/test_package_base.py | 4 +- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 0c63e60..ad7b85c 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -51,23 +51,28 @@ def create_base_dir(dest: Path, id: str) -> Path: return package_base -def move_metadata_file(md_path: Path, pkg_dir: Path) -> None: - md_dir = pkg_dir / "metadata" - if not md_dir.exists(): - md_dir.mkdir() +def move_file(file_path: Path, pkg_dir: Path, dest: str) -> None: + dest_dir = pkg_dir / dest + if not dest_dir.exists(): + dest_dir.mkdir() - new_md_path = md_dir / md_path.name - if new_md_path.exists(): - raise FileExistsError(f"{new_md_path} already exists. Not moving.") + new_file_path = dest_dir / file_path.name + if new_file_path.exists(): + raise FileExistsError(f"{new_file_path} already exists in {dest} folder. Not moving.") - md_path.rename(new_md_path) + file_path.rename(new_file_path) + return None + + +def move_metadata_file(md_path: Path, pkg_dir: Path) -> None: + move_file(md_path, pkg_dir, 'metadata') return None def move_metadata_files(md_paths: list[Path], pkg_dir: Path) -> None: for md_path in md_paths: try: - move_metadata_file(md_path, pkg_dir) + move_file(md_path, pkg_dir, 'metadata') except FileExistsError as e: raise Warning( f"{e} One or more metadata files may have already been moved to new location" @@ -75,6 +80,38 @@ def move_metadata_files(md_paths: list[Path], pkg_dir: Path) -> None: return None +def move_diskimage_file(image_path: Path, pkg_dir: Path) -> None: + move_file(image_path, pkg_dir, 'image') + return None + + +def move_diskimage_files(image_paths: list[Path], pkg_dir: Path) -> None: + for image_path in image_paths: + try: + move_file(image_path, pkg_dir, 'images') + except FileExistsError as e: + raise Warning( + f"{e} One or more disk images may have already been moved to new location" + ) + return None + + +def move_stream_file(md_path: Path, pkg_dir: Path) -> None: + move_file(md_path, pkg_dir, 'streams') + return None + + +def move_diskimage_files(image_paths: list[Path], pkg_dir: Path) -> None: + for image_path in image_paths: + try: + move_file(image_path, pkg_dir, 'streams') + except FileExistsError as e: + raise Warning( + f"{e} One or more disk image streams may have already been moved to new location" + ) + return None + + def create_bag_in_objects(payload_path: Path, md5_path: Path, pkg_dir: Path) -> None: bag_dir = pkg_dir / "objects" bag_dir.mkdir() diff --git a/tests/test_package_base.py b/tests/test_package_base.py index a075586..bcb49bf 100644 --- a/tests/test_package_base.py +++ b/tests/test_package_base.py @@ -116,7 +116,7 @@ def test_do_not_overwrite_metadata(package_base_dir: Path, log: Path): pb.move_metadata_file(log, package_base_dir) assert log.exists() - assert f"{rclone_log} already exists. Not moving." in str(exc.value) + assert f"{rclone_log} already exists in metadata folder. Not moving." in str(exc.value) def test_move_multiple_metadata(package_base_dir: Path, log: Path, md5_manifest: Path): @@ -146,7 +146,7 @@ def test_partial_halt_multiple_metadata( assert log.exists() assert ( - f"already exists. Not moving. One or more metadata files may have already been moved to new location" + f"already exists in metadata folder. Not moving. One or more metadata files may have already been moved to new location" in str(exc.value) ) From e294b3e4c5e095dd70912ec629dae75a0d1df67c Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Mon, 22 Apr 2024 12:26:58 -0400 Subject: [PATCH 04/35] minimal version of script --- src/digarch_scripts/package/package_images.py | 45 +++++++ tests/test_package_images.py | 115 ++++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 src/digarch_scripts/package/package_images.py create mode 100644 tests/test_package_images.py diff --git a/src/digarch_scripts/package/package_images.py b/src/digarch_scripts/package/package_images.py new file mode 100644 index 0000000..35d0f49 --- /dev/null +++ b/src/digarch_scripts/package/package_images.py @@ -0,0 +1,45 @@ +import argparse +import re +from pathlib import Path + +import digarch_scripts.package.package_base as pb + + +def parse_args() -> argparse.Namespace: + def extant_path(p: str) -> Path: + path = Path(p) + if not path.exists(): + raise argparse.ArgumentTypeError(f"{path} does not exist") + return path + + def digital_carrier_label(id: str) -> Path: + pattern = r"ACQ_\d{4}_\d{6}" + old_pattern = r"M\d{4-6}_\d{4}" + if not re.match(pattern, id): + if not re.match(old_pattern, id): + raise argparse.ArgumentTypeError( + f"{id} does not match the expected {type} pattern, {pattern}" + ) + return id + + parser = argparse.ArgumentParser(description="test") + parser.add_argument("--image", required=True, type=extant_path) + parser.add_argument("--dest", required=True, type=extant_path) + parser.add_argument("--id", required=True, type=digital_carrier_label) + parser.add_argument("--log", required=False, nargs="+", type=extant_path) + parser.add_argument("--streams", required=False, type=extant_path) + parser.add_argument("--extracted", required=False, type=extant_path) + + return parser.parse_args() + + +def main(): + args = parse_args() + + base_dir = pb.create_base_dir(args.dest, args.id) + pb.move_metadata_files(args.log, base_dir) + pb.move_diskimage_file(args.image, base_dir) + + +if __name__ == "__main__": + main() diff --git a/tests/test_package_images.py b/tests/test_package_images.py new file mode 100644 index 0000000..5f9f4f9 --- /dev/null +++ b/tests/test_package_images.py @@ -0,0 +1,115 @@ +import digarch_scripts.package.package_images as pi + +from pathlib import Path +import pytest +import shutil + +import bagit + + +@pytest.fixture +def transfer_files(tmp_path: Path, request): + fixture_data = Path(request.module.__file__).parent / "fixtures" / "image" + shutil.copytree(fixture_data, tmp_path, dirs_exist_ok=True) + return tmp_path + +# Test command-line arguments +@pytest.fixture +def args(transfer_files): + args = [ + "script_name", + "--image", + str(transfer_files / "image.img"), + "--dest", + str(transfer_files), + "--id", + "ACQ_1234_123456", + "--streams", + str(transfer_files / "streams"), + "--log", + str(transfer_files / "process.log"), + ] + return args + + +def test_requires_args( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list +): + """Test that script requires image, dest, and id (first 3 args)""" + + for i in range(0, 3): + # remove a pair of list items (arg and value) for each test + part_args = args[0 : 2 * i + 1] + args[2 * i + 3 :] + + monkeypatch.setattr("sys.argv", part_args) + + with pytest.raises(SystemExit): + args = pi.parse_args() + + stderr = capsys.readouterr().err + + assert f"required: {args[2*i+1]}" in stderr + + +def test_optional_args( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list +): + """Test that script requires all five args""" + + for i in [3, 4]: + # remove a pair of list items (arg and value) for each test + part_args = args[0 : 2 * i + 1] + args[2 * i + 3 :] + missing_arg = args[2*i] + + monkeypatch.setattr("sys.argv", part_args) + + parsed_args = pi.parse_args() + + assert missing_arg not in parsed_args + + +def test_arg_paths_must_exist( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list +): + """Test that script errors if a path argument doesn't exist""" + + for i in [1, 2, 4, 5]: + bad_args = args + bad_path = "nonexistant" + bad_args[2 * i] = bad_path + + monkeypatch.setattr("sys.argv", bad_args) + with pytest.raises(SystemExit): + args = pi.parse_args() + + stderr = capsys.readouterr().err + + assert f"{bad_path} does not exist" in stderr + + +def test_id_arg_must_match_pattern( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list +): + """Test that script errors if id argument doesn't match ACQ_####_######""" + args[6] = "bad_id" + monkeypatch.setattr("sys.argv", args) + with pytest.raises(SystemExit): + args = pi.parse_args() + + stderr = capsys.readouterr().err + + assert f"bad_id does not match" in stderr + + +def test_full_run( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list +): + """Test end to end successful run""" + + monkeypatch.setattr("sys.argv", args) + pi.main() + + pkg_dir = Path(args[4]) / args[6][:-7] / args[6] + assert pkg_dir.exists() + + assert "process.log" in [x.name for x in (pkg_dir / "metadata").iterdir()] From f4e148fefa997e283a5929baef4404dfe336b449 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Mon, 22 Apr 2024 13:24:47 -0400 Subject: [PATCH 05/35] refactor move functions to be cleaner --- src/digarch_scripts/package/package_base.py | 48 ++++++++------------- tests/test_package_base.py | 36 +++++++++------- 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index ad7b85c..37a9ab7 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -63,53 +63,39 @@ def move_file(file_path: Path, pkg_dir: Path, dest: str) -> None: file_path.rename(new_file_path) return None - -def move_metadata_file(md_path: Path, pkg_dir: Path) -> None: - move_file(md_path, pkg_dir, 'metadata') - return None - - -def move_metadata_files(md_paths: list[Path], pkg_dir: Path) -> None: - for md_path in md_paths: +def move_files(file_paths: list[Path], pkg_dir: Path, dest: str) -> None: + for file_path in file_paths: try: - move_file(md_path, pkg_dir, 'metadata') + move_file(file_path, pkg_dir, dest) except FileExistsError as e: raise Warning( - f"{e} One or more metadata files may have already been moved to new location" + f"{e} One or more files may have already been moved to the {dest} folder" ) return None +def move_metadata_file(md_path: Path, pkg_dir: Path) -> None: + return move_file(md_path, pkg_dir, 'metadata') + + +def move_metadata_files(md_paths: list[Path], pkg_dir: Path) -> None: + return move_files(md_paths, pkg_dir, 'metadata') + + def move_diskimage_file(image_path: Path, pkg_dir: Path) -> None: - move_file(image_path, pkg_dir, 'image') - return None + return move_file(image_path, pkg_dir, 'images') def move_diskimage_files(image_paths: list[Path], pkg_dir: Path) -> None: - for image_path in image_paths: - try: - move_file(image_path, pkg_dir, 'images') - except FileExistsError as e: - raise Warning( - f"{e} One or more disk images may have already been moved to new location" - ) - return None + return move_files(image_paths, pkg_dir, 'images') def move_stream_file(md_path: Path, pkg_dir: Path) -> None: - move_file(md_path, pkg_dir, 'streams') - return None + return move_file(md_path, pkg_dir, 'streams') -def move_diskimage_files(image_paths: list[Path], pkg_dir: Path) -> None: - for image_path in image_paths: - try: - move_file(image_path, pkg_dir, 'streams') - except FileExistsError as e: - raise Warning( - f"{e} One or more disk image streams may have already been moved to new location" - ) - return None +def move_stream_files(md_path: Path, pkg_dir: Path) -> None: + return move_files(md_path, pkg_dir, 'streams') def create_bag_in_objects(payload_path: Path, md5_path: Path, pkg_dir: Path) -> None: diff --git a/tests/test_package_base.py b/tests/test_package_base.py index bcb49bf..f4236e5 100644 --- a/tests/test_package_base.py +++ b/tests/test_package_base.py @@ -96,57 +96,63 @@ def package_base_dir(tmp_path: Path, id: str): return pb.create_base_dir(tmp_path, id) -def test_move_metadata(package_base_dir: Path, log: Path): +MOVE_FILE = [(pb.move_metadata_file, 'metadata'), (pb.move_diskimage_file, 'images'), (pb.move_stream_file, 'streams')] +@pytest.mark.parametrize("test_function,dest", MOVE_FILE) +def test_move_file(package_base_dir: Path, log: Path, test_function, dest: str): """Test that metadata folder and log file are moved successfully""" - pb.move_metadata_file(log, package_base_dir) + test_function(log, package_base_dir) assert not log.exists() - assert (package_base_dir / "metadata" / "rclone.log").exists() + assert (package_base_dir / dest / "rclone.log").exists() -def test_do_not_overwrite_metadata(package_base_dir: Path, log: Path): +@pytest.mark.parametrize("test_function,dest", MOVE_FILE) +def test_do_not_overwrite_file(package_base_dir: Path, log: Path, test_function, dest: str): """Test that log file is not moved if a same name file exists in dest""" - rclone_log = package_base_dir / "metadata" / log.name + rclone_log = package_base_dir / dest / log.name rclone_log.parent.mkdir() rclone_log.touch() with pytest.raises(FileExistsError) as exc: - pb.move_metadata_file(log, package_base_dir) + test_function(log, package_base_dir) assert log.exists() - assert f"{rclone_log} already exists in metadata folder. Not moving." in str(exc.value) + assert f"{rclone_log} already exists in {dest} folder. Not moving." in str(exc.value) -def test_move_multiple_metadata(package_base_dir: Path, log: Path, md5_manifest: Path): +MOVE_FILES = [(pb.move_metadata_files, 'metadata'), (pb.move_diskimage_files, 'images'), (pb.move_stream_files, 'streams')] +@pytest.mark.parametrize("test_function,dest", MOVE_FILES) +def test_move_multiple_file(package_base_dir: Path, log: Path, md5_manifest: Path, test_function, dest: str): """Test that multiple files are moved successfully""" md_files = [log, md5_manifest] - pb.move_metadata_files(md_files, package_base_dir) + test_function(md_files, package_base_dir) for md_file in md_files: assert not md_file.exists() - assert (package_base_dir / "metadata" / md_file.name).exists() + assert (package_base_dir / dest / md_file.name).exists() -def test_partial_halt_multiple_metadata( - package_base_dir: Path, log: Path, md5_manifest: Path +@pytest.mark.parametrize("test_function,dest", MOVE_FILES) +def test_partial_halt_multiple_files( + package_base_dir: Path, log: Path, md5_manifest: Path, test_function, dest: str ): """Test that warning is issued for multiple move if a single metadata move fails""" - rclone_log = package_base_dir / "metadata" / log.name + rclone_log = package_base_dir / dest / log.name rclone_log.parent.mkdir() rclone_log.touch() md_files = [log, md5_manifest] with pytest.raises(Warning) as exc: - pb.move_metadata_files(md_files, package_base_dir) + test_function(md_files, package_base_dir) assert log.exists() assert ( - f"already exists in metadata folder. Not moving. One or more metadata files may have already been moved to new location" + f"already exists in {dest} folder. Not moving. One or more files may have already been moved to the {dest} folder" in str(exc.value) ) From 51c30e4cc0156a2490ff479403ac73497e654402 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Mon, 22 Apr 2024 14:34:03 -0400 Subject: [PATCH 06/35] add acq_dir creation --- src/digarch_scripts/package/package_base.py | 15 ++++++++++- src/digarch_scripts/package/package_cloud.py | 2 +- tests/test_package_base.py | 26 ++++++++++++++------ 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 37a9ab7..ff0f0e6 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -36,7 +36,20 @@ def digital_carrier_label(id: str) -> Path: return parser.parse_args() -def create_base_dir(dest: Path, id: str) -> Path: +def create_acq_dir(dest: Path, acq_id: str) -> Path: + acq_dir = dest / acq_id + if acq_dir.exists(): + LOGGER.info(f"Acquisition directory already exits: {acq_dir}") + return acq_dir + + try: + acq_dir.mkdir(parents=True) + except PermissionError: + raise PermissionError(f"{dest} is not writable") + return acq_dir + + +def create_package_dir(dest: Path, id: str) -> Path: acq_id = id.rsplit("_", 1)[0] package_base = dest / acq_id / id if package_base.exists(): diff --git a/src/digarch_scripts/package/package_cloud.py b/src/digarch_scripts/package/package_cloud.py index 2ece415..606ecdd 100644 --- a/src/digarch_scripts/package/package_cloud.py +++ b/src/digarch_scripts/package/package_cloud.py @@ -41,7 +41,7 @@ def digital_carrier_label(id: str) -> Path: def main(): args = parse_args() - base_dir = pb.create_base_dir(args.dest, args.id) + base_dir = pb.create_package_dir(args.dest, args.id) pb.move_metadata_file(args.log, base_dir) pb.create_bag_in_objects(args.payload, args.md5, base_dir) pb.validate_bag_in_payload(base_dir) diff --git a/tests/test_package_base.py b/tests/test_package_base.py index f4236e5..338b5fb 100644 --- a/tests/test_package_base.py +++ b/tests/test_package_base.py @@ -44,24 +44,36 @@ def args(transfer_files): return args -def test_create_package_basedir_exc_on_readonly(tmp_path: Path, id: str): +CREATE_DIR = [(pb.create_acq_dir, 'ACQ_1234'), (pb.create_package_dir, 'ACQ_1234_123456')] +@pytest.mark.parametrize("tested_function,id", CREATE_DIR) +def test_create_dir_exc_on_readonly(tmp_path: Path, id: str, tested_function): """Test that package folder maker reports permission error""" # make folder read-only os.chmod(tmp_path, 0o500) with pytest.raises(PermissionError) as exc: - pb.create_base_dir(tmp_path, id) + tested_function(tmp_path, id) # change back to allow clean-up (might not be necessary) os.chmod(tmp_path, 0o777) assert f"{str(tmp_path)} is not writable" in str(exc.value) -def test_create_package_basedir(tmp_path: Path, id: str): +def test_create_acq_dir(tmp_path: Path): """Test that package folder maker makes ACQ and Carrier folders""" - base_dir = pb.create_base_dir(tmp_path, id) + id = 'ACQ_1234' + base_dir = pb.create_acq_dir(tmp_path, id) + + assert base_dir.name == id + assert base_dir.parent.name == tmp_path.name + + +def test_create_pkg_dir(tmp_path: Path, id: str): + """Test that package folder maker makes ACQ and Carrier folders""" + + base_dir = pb.create_package_dir(tmp_path, id) assert base_dir.name == id assert base_dir.parent.name == id[:-7] @@ -71,7 +83,7 @@ def test_create_package_basedir_with_existing_acq_dir(tmp_path: Path, id: str): """Test that package folder maker respect existing ACQ folder""" (tmp_path / id[:-7]).mkdir() - base_dir = pb.create_base_dir(tmp_path, id) + base_dir = pb.create_package_dir(tmp_path, id) assert base_dir.name == id assert base_dir.parent.name == id[:-7] @@ -84,7 +96,7 @@ def test_error_on_existing_package_dir(tmp_path: Path, id: str): base_dir.mkdir(parents=True) with pytest.raises(FileExistsError) as exc: - pb.create_base_dir(tmp_path, id) + pb.create_package_dir(tmp_path, id) assert f"{base_dir} already exists. Make sure you are using the correct ID" in str( exc.value @@ -93,7 +105,7 @@ def test_error_on_existing_package_dir(tmp_path: Path, id: str): @pytest.fixture def package_base_dir(tmp_path: Path, id: str): - return pb.create_base_dir(tmp_path, id) + return pb.create_package_dir(tmp_path, id) MOVE_FILE = [(pb.move_metadata_file, 'metadata'), (pb.move_diskimage_file, 'images'), (pb.move_stream_file, 'streams')] From df99b734f335be473164987db0df63958d283535 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Mon, 22 Apr 2024 14:39:23 -0400 Subject: [PATCH 07/35] minimal working version to run on acquisition --- src/digarch_scripts/package/package_images.py | 56 +++++++++++++++---- tests/test_package_images.py | 20 +++---- 2 files changed, 54 insertions(+), 22 deletions(-) diff --git a/src/digarch_scripts/package/package_images.py b/src/digarch_scripts/package/package_images.py index 35d0f49..64d1d34 100644 --- a/src/digarch_scripts/package/package_images.py +++ b/src/digarch_scripts/package/package_images.py @@ -12,9 +12,9 @@ def extant_path(p: str) -> Path: raise argparse.ArgumentTypeError(f"{path} does not exist") return path - def digital_carrier_label(id: str) -> Path: - pattern = r"ACQ_\d{4}_\d{6}" - old_pattern = r"M\d{4-6}_\d{4}" + def acq_id(id: str) -> Path: + pattern = r"ACQ_\d{4}" + old_pattern = r"M\d{4-6}" if not re.match(pattern, id): if not re.match(old_pattern, id): raise argparse.ArgumentTypeError( @@ -23,22 +23,54 @@ def digital_carrier_label(id: str) -> Path: return id parser = argparse.ArgumentParser(description="test") - parser.add_argument("--image", required=True, type=extant_path) - parser.add_argument("--dest", required=True, type=extant_path) - parser.add_argument("--id", required=True, type=digital_carrier_label) - parser.add_argument("--log", required=False, nargs="+", type=extant_path) - parser.add_argument("--streams", required=False, type=extant_path) - parser.add_argument("--extracted", required=False, type=extant_path) + parser.add_argument("--images_folder", required=True, type=extant_path, help='Path to working images folder') + parser.add_argument("--dest", required=True, type=extant_path, help='Path to packaged images folder') + parser.add_argument("--acqid", required=True, type=acq_id, help='ACQ_####') + parser.add_argument("--logs_folder", required=False, type=extant_path, help='Path to working logs folder') + parser.add_argument("--streams_folder", required=False, type=extant_path, help='Path to working streams folder') return parser.parse_args() +def find_category_files(file_groups: dict, source_dir: Path, acq_id: str, category: str) -> dict: + for file in source_dir.iterdir(): + carrier_id_match = re.search(rf'{acq_id}_\d\d\d\d\d\d+', file.name) + if not carrier_id_match: + continue + carrier_id = carrier_id_match.group(0) + + if not carrier_id in file_groups: + file_groups[carrier_id] = {category: []} + elif not category in file_groups[carrier_id]: + file_groups[carrier_id][category] = [] + + file_groups[carrier_id][category].append(file) + + return file_groups + + +def find_carrier_files(carrier_files: dict, log_dir: Path, images_dir: Path, stream_dir: Path, acq_id: str) -> dict: + carrier_files = find_category_files(carrier_files, log_dir, acq_id, 'logs') + carrier_files = find_category_files(carrier_files, images_dir, acq_id, 'images') + carrier_files = find_category_files(carrier_files, stream_dir, acq_id, 'streams') + + return carrier_files + + +def package_carriers(carrier_files: dict, acq_dir: Path) -> None: + for carrier, files in carrier_files.items(): + base_dir = pb.create_package_dir(acq_dir, carrier) + pb.move_metadata_files(files['logs'], base_dir) + pb.move_diskimage_files(files['images'], base_dir) + pb.move_stream_files(files['streams'], base_dir) + + def main(): args = parse_args() - base_dir = pb.create_base_dir(args.dest, args.id) - pb.move_metadata_files(args.log, base_dir) - pb.move_diskimage_file(args.image, base_dir) + carrier_files = find_carrier_files({}, args.logs_folder, args.images_folder, args.streams_folder, args.acqid) + package_carriers(carrier_files, args.dest) + if __name__ == "__main__": diff --git a/tests/test_package_images.py b/tests/test_package_images.py index 5f9f4f9..847bfcc 100644 --- a/tests/test_package_images.py +++ b/tests/test_package_images.py @@ -18,16 +18,16 @@ def transfer_files(tmp_path: Path, request): def args(transfer_files): args = [ "script_name", - "--image", - str(transfer_files / "image.img"), + "--images_folder", + str(transfer_files / "images"), "--dest", str(transfer_files), - "--id", - "ACQ_1234_123456", - "--streams", + "--acqid", + "ACQ_1234", + "--streams_folder", str(transfer_files / "streams"), - "--log", - str(transfer_files / "process.log"), + "--logs_folder", + str(transfer_files / "logs"), ] return args @@ -109,7 +109,7 @@ def test_full_run( monkeypatch.setattr("sys.argv", args) pi.main() - pkg_dir = Path(args[4]) / args[6][:-7] / args[6] - assert pkg_dir.exists() + acq_dir = Path(args[4]) / args[6] + assert acq_dir.exists() - assert "process.log" in [x.name for x in (pkg_dir / "metadata").iterdir()] + assert "ACQ_1234_123456" in [x.name for x in acq_dir.iterdir()] From 65d5344000a626ccd13ec4448fc198d87828c2bd Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Tue, 28 May 2024 13:13:37 -0400 Subject: [PATCH 08/35] formatting --- src/digarch_scripts/package/package_base.py | 17 +- .../report/report_ftk_extents.py | 203 ++++++++---------- .../report/report_hdd_extents.py | 127 ++++++----- tests/test_lint_ft.py | 25 ++- tests/test_package_base.py | 37 +++- tests/test_report_ftk_extents.py | 145 +++++++++---- tests/test_report_hdd_extents.py | 113 ++++++---- 7 files changed, 386 insertions(+), 281 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index ff0f0e6..e213b4f 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -71,11 +71,14 @@ def move_file(file_path: Path, pkg_dir: Path, dest: str) -> None: new_file_path = dest_dir / file_path.name if new_file_path.exists(): - raise FileExistsError(f"{new_file_path} already exists in {dest} folder. Not moving.") + raise FileExistsError( + f"{new_file_path} already exists in {dest} folder. Not moving." + ) file_path.rename(new_file_path) return None + def move_files(file_paths: list[Path], pkg_dir: Path, dest: str) -> None: for file_path in file_paths: try: @@ -88,27 +91,27 @@ def move_files(file_paths: list[Path], pkg_dir: Path, dest: str) -> None: def move_metadata_file(md_path: Path, pkg_dir: Path) -> None: - return move_file(md_path, pkg_dir, 'metadata') + return move_file(md_path, pkg_dir, "metadata") def move_metadata_files(md_paths: list[Path], pkg_dir: Path) -> None: - return move_files(md_paths, pkg_dir, 'metadata') + return move_files(md_paths, pkg_dir, "metadata") def move_diskimage_file(image_path: Path, pkg_dir: Path) -> None: - return move_file(image_path, pkg_dir, 'images') + return move_file(image_path, pkg_dir, "images") def move_diskimage_files(image_paths: list[Path], pkg_dir: Path) -> None: - return move_files(image_paths, pkg_dir, 'images') + return move_files(image_paths, pkg_dir, "images") def move_stream_file(md_path: Path, pkg_dir: Path) -> None: - return move_file(md_path, pkg_dir, 'streams') + return move_file(md_path, pkg_dir, "streams") def move_stream_files(md_path: Path, pkg_dir: Path) -> None: - return move_files(md_path, pkg_dir, 'streams') + return move_files(md_path, pkg_dir, "streams") def create_bag_in_objects(payload_path: Path, md5_path: Path, pkg_dir: Path) -> None: diff --git a/src/digarch_scripts/report/report_ftk_extents.py b/src/digarch_scripts/report/report_ftk_extents.py index f001f36..e6ab6e3 100644 --- a/src/digarch_scripts/report/report_ftk_extents.py +++ b/src/digarch_scripts/report/report_ftk_extents.py @@ -1,84 +1,76 @@ -from lxml import etree -import json -import re import argparse +import json +import logging import os import pathlib -import logging +import re + +from lxml import etree LOGGER = logging.getLogger(__name__) # Namespace for the FTK output XML -FO_NAMESPACE = {'fo': 'http://www.w3.org/1999/XSL/Format'} +FO_NAMESPACE = {"fo": "http://www.w3.org/1999/XSL/Format"} def _make_parser(): - def validate_file_input(f) -> pathlib.Path: - ''' + """ Ensure the input file exists - ''' + """ path = pathlib.Path(f) if not path.exists(): - raise argparse.ArgumentTypeError( - f'Directory or file does not exist: {f}' - ) + raise argparse.ArgumentTypeError(f"Directory or file does not exist: {f}") - if not path.suffix.lower() in ['.xml', '.fo']: + if not path.suffix.lower() in [".xml", ".fo"]: raise argparse.ArgumentTypeError( - 'Not a valid file type. Expect .xml or .fo' + "Not a valid file type. Expect .xml or .fo" ) return path def validate_output_dir(f) -> pathlib.Path: - path = pathlib.Path(f) if not path.exists(): - raise argparse.ArgumentTypeError( - f'Output directory does not exist: {f}' - ) + raise argparse.ArgumentTypeError(f"Output directory does not exist: {f}") return path - parser = argparse.ArgumentParser( - description='Create a JSON report from XML' - ) + parser = argparse.ArgumentParser(description="Create a JSON report from XML") parser.add_argument( - '-f', '--file', + "-f", + "--file", help="path to FTK XML report", type=validate_file_input, - required=True + required=True, ) parser.add_argument( - '-o', '--output', + "-o", + "--output", help="destination directory", type=validate_output_dir, - required=True + required=True, ) return parser.parse_args() -def create_er_list( - tree: etree.ElementTree -) -> list[list[list[str], str, str]]: - - ''' +def create_er_list(tree: etree.ElementTree) -> list[list[list[str], str, str]]: + """ This transforms the table of contents into a list of lists where each list item has the hierarchy of titles and a reference-id. This list is the intermediate data structure used to build the nested dict. The function returns the entire list. - ''' + """ tree = tree.xpath( '/fo:root/fo:page-sequence[@master-reference="TOC"]/fo:flow', - namespaces=FO_NAMESPACE + namespaces=FO_NAMESPACE, )[0] ers = [] @@ -89,27 +81,23 @@ def create_er_list( continue indent = int(child.get("start-indent").split(sep="pt")[0]) - level = (indent//12) - 2 + level = (indent // 12) - 2 if level >= 0: # build a list of parents based on level if level <= len(hierarchy) - 1: hierarchy = hierarchy[:level] elif level > len(hierarchy) + 1: - raise ValueError( - f'Unexpected jump in hierarchy at {child.text}' - ) + raise ValueError(f"Unexpected jump in hierarchy at {child.text}") hierarchy.append(child.text) # only record if entry is an ER possible_ref = child.xpath( - 'fo:basic-link/fo:page-number-citation', namespaces=FO_NAMESPACE + "fo:basic-link/fo:page-number-citation", namespaces=FO_NAMESPACE ) - if possible_ref and hierarchy[-1].startswith('ER'): - refid = possible_ref[0].get('ref-id') - ers.append( - [hierarchy.copy(), refid, hierarchy[-1]] - ) + if possible_ref and hierarchy[-1].startswith("ER"): + refid = possible_ref[0].get("ref-id") + ers.append([hierarchy.copy(), refid, hierarchy[-1]]) audit_ers(ers) @@ -119,11 +107,11 @@ def create_er_list( def audit_ers(ers: list[list[list[str], str, str]]) -> None: er_numbers_used = {} for er in ers: - number = re.match(r'ER (\d+):', er[2]) + number = re.match(r"ER (\d+):", er[2]) if not number: LOGGER.warning( - f'ER is missing a number: {er[2]}. Review the ERs with the processing archivist' + f"ER is missing a number: {er[2]}. Review the ERs with the processing archivist" ) er_number = 0 else: @@ -140,7 +128,7 @@ def audit_ers(ers: list[list[list[str], str, str]]) -> None: for i in range(er_min, er_max): if i not in er_numbers_used.keys(): LOGGER.warning( - f'Collection uses ER {er_min} to ER {er_max}. ER {i} is skipped. Review the ERs with the processing archivist' + f"Collection uses ER {er_min} to ER {er_max}. ER {i} is skipped. Review the ERs with the processing archivist" ) # test for duplicate ers @@ -153,51 +141,46 @@ def audit_ers(ers: list[list[list[str], str, str]]) -> None: return None -def transform_bookmark_tables( - tree: etree.ElementTree -) -> list[dict]: - - ''' +def transform_bookmark_tables(tree: etree.ElementTree) -> list[dict]: + """ transforms each row in the 'bookmarksPage' table into a string. this string contains all the extent information that will be summarized later. the return is a list of lists where the first item is the id with the prefix bk and the second item is a string serialized from the XML. - ''' + """ extent_tree = tree.xpath( '/fo:root/fo:page-sequence[@master-reference="bookmarksPage"]/fo:flow/fo:table[@id]', - namespaces=FO_NAMESPACE + namespaces=FO_NAMESPACE, ) bookmark_contents = [] for row in extent_tree: # row is an /fo:row in /fo:table[@id] file_table = row.xpath( - './fo:table-body/fo:table-row/fo:table-cell/fo:block', - namespaces=FO_NAMESPACE + "./fo:table-body/fo:table-row/fo:table-cell/fo:block", + namespaces=FO_NAMESPACE, ) file_dict = { file_table[i].text: file_table[i + 1].text for i in range(0, len(file_table), 2) } - file_dict['file_id'] = row.get('id') - file_dict['bookmark_id'] = row.get('id').split('_')[0] + file_dict["file_id"] = row.get("id") + file_dict["bookmark_id"] = row.get("id").split("_")[0] bookmark_contents.append(file_dict) return bookmark_contents def add_extents_to_ers( - er_list: list[list[list[str], str, str]], - bookmark_tables: list[dict] + er_list: list[list[list[str], str, str]], bookmark_tables: list[dict] ) -> list[list[str, int, int]]: - - ''' + """ summarizes the extent for each ER by correlating the table of contents with the bookmark tables. Returns list of lists with hierarchal ER string, file size, and file count. - ''' + """ ers_with_extents = [] @@ -208,11 +191,13 @@ def add_extents_to_ers( if count == 0: LOGGER.warning( - f'{er_name} does not contain any files. It will be omitted from the report.') + f"{er_name} does not contain any files. It will be omitted from the report." + ) continue if size == 0: LOGGER.warning( - f'{er_name} contains no files with bytes. This ER is omitted from report. Review this ER with the processing archivist.') + f"{er_name} contains no files with bytes. This ER is omitted from report. Review this ER with the processing archivist." + ) continue ers_with_extents.append([er[0], size, count]) @@ -221,34 +206,31 @@ def add_extents_to_ers( def get_er_report( - er_files: list[dict], - bookmark_id: str, - er_name: str + er_files: list[dict], bookmark_id: str, er_name: str ) -> tuple[int, int]: - - ''' + """ extract the total file size and file count for a given bookmark ID Returns a tuple with the file size and file count. - ''' + """ size = 0 count = 0 - prefix = bookmark_id.replace('k', 'f') + prefix = bookmark_id.replace("k", "f") for entry in er_files: - if entry['bookmark_id'] == prefix: - - byte_string = entry['Logical Size'] - bytes = re.findall(r'(\d+)\sB', byte_string) + if entry["bookmark_id"] == prefix: + byte_string = entry["Logical Size"] + bytes = re.findall(r"(\d+)\sB", byte_string) if bytes: count += 1 file_size = int(bytes[0]) if file_size == 0: - file_name = entry['Name'] - #extract file name, might have to parse file table better + file_name = entry["Name"] + # extract file name, might have to parse file table better LOGGER.warning( - f'{er_name} contains the following 0-byte file: {file_name}. Review this file with the processing archivist.') + f"{er_name} contains the following 0-byte file: {file_name}. Review this file with the processing archivist." + ) size += file_size else: @@ -257,93 +239,86 @@ def get_er_report( return size, count -def create_report( - input: list[list[str], int, int], - report: dict -) -> dict: - - ''' +def create_report(input: list[list[str], int, int], report: dict) -> dict: + """ recursive function to insert a given bookmark into a nested dictionary based on the hierarchy of component titles. Returns a nested dictionary - ''' + """ if len(input[0]) == 1: - number, name = input[0][0].split(':', maxsplit=1) - report['children'].append({ - 'title': input[0][0], - 'er_number': number, - 'er_name': name.strip(), - 'file_size': input[1], - 'file_count': input[2] - }) + number, name = input[0][0].split(":", maxsplit=1) + report["children"].append( + { + "title": input[0][0], + "er_number": number, + "er_name": name.strip(), + "file_size": input[1], + "file_count": input[2], + } + ) else: parent, child = input[0][0], input[0][1:] input[0] = child - for item in report['children']: - if item['title'] == parent: + for item in report["children"]: + if item["title"] == parent: item = create_report(input, item) return report - report['children'].append( - create_report(input, {'title': parent, 'children': []}) + report["children"].append( + create_report(input, {"title": parent, "children": []}) ) return report -def extract_collection_title( - tree: etree.ElementTree - ) -> str: +def extract_collection_title(tree: etree.ElementTree) -> str: case_info = tree.xpath( - '/fo:root/fo:page-sequence[@master-reference="caseInfoPage"]/fo:flow/fo:table'\ - '/fo:table-body/fo:table-row/fo:table-cell/fo:block/text()', - namespaces=FO_NAMESPACE + '/fo:root/fo:page-sequence[@master-reference="caseInfoPage"]/fo:flow/fo:table' + "/fo:table-body/fo:table-row/fo:table-cell/fo:block/text()", + namespaces=FO_NAMESPACE, ) for i, txt in enumerate(case_info): if txt == "Case Name": - collname = case_info[i+1] + collname = case_info[i + 1] return collname -def make_json( - destination: pathlib.Path, - report: dict, - collname -) -> None: - ''' +def make_json(destination: pathlib.Path, report: dict, collname) -> None: + """ creates a json file with the name of the collection as the file name destination is the file path from args parse and report is the collection style dict - ''' + """ name = collname name = name.replace(" ", "_") - with open(os.path.join(destination, f'{name}.json'), 'w') as file: + with open(os.path.join(destination, f"{name}.json"), "w") as file: json.dump(report, file) def main() -> None: args = _make_parser() - print('Parsing XML ...') + print("Parsing XML ...") tree = etree.parse(args.file) - print('Creating report ...') + print("Creating report ...") ers = create_er_list(tree) bookmark_tables = transform_bookmark_tables(tree) ers_with_extents = add_extents_to_ers(ers, bookmark_tables) colltitle = extract_collection_title(tree) - dct = {'title': colltitle, 'children': []} + dct = {"title": colltitle, "children": []} for er in ers_with_extents: dct = create_report(er, dct) print("Writing report ...") make_json(args.output, dct, colltitle) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/src/digarch_scripts/report/report_hdd_extents.py b/src/digarch_scripts/report/report_hdd_extents.py index 4223ebd..5f1d66f 100644 --- a/src/digarch_scripts/report/report_hdd_extents.py +++ b/src/digarch_scripts/report/report_hdd_extents.py @@ -1,63 +1,56 @@ import argparse -import os import json -import pathlib import logging +import os +import pathlib import re + LOGGER = logging.getLogger(__name__) + def parse_args(): parser = argparse.ArgumentParser() - def validate_dir( - d: str - ) -> pathlib.Path: + def validate_dir(d: str) -> pathlib.Path: path = pathlib.Path(d) if not path.exists(): - raise argparse.ArgumentTypeError( - f'Specified directory does not exist: {d}' - ) + raise argparse.ArgumentTypeError(f"Specified directory does not exist: {d}") if not path.is_dir(): - raise argparse.ArgumentTypeError( - f'Specified path is not a directory: {d}' - ) + raise argparse.ArgumentTypeError(f"Specified path is not a directory: {d}") return path def validate_output_dir(f) -> pathlib.Path: - path = pathlib.Path(f) if not path.exists(): - raise argparse.ArgumentTypeError( - f'Output directory does not exist: {f}' - ) + raise argparse.ArgumentTypeError(f"Output directory does not exist: {f}") return path parser.add_argument( - "-d", "--dir", + "-d", + "--dir", type=validate_dir, help="Path to the parent directory, e.g. M###_FAComponents", - required = True + required=True, ) parser.add_argument( - '-o', '--output', + "-o", + "--output", help="report destination directory", type=validate_output_dir, - required=True + required=True, ) return parser.parse_args() -def get_ers( - facomponent_dir: pathlib.Path -) -> list[str, int, int, str]: +def get_ers(facomponent_dir: pathlib.Path) -> list[str, int, int, str]: ers = [] - for possible_er in facomponent_dir.glob('**/ER *'): - objects_dir = possible_er.joinpath('objects') + for possible_er in facomponent_dir.glob("**/ER *"): + objects_dir = possible_er.joinpath("objects") if possible_er.is_dir(): if objects_dir.is_dir(): er = possible_er.relative_to(facomponent_dir) @@ -69,41 +62,47 @@ def get_ers( fp = os.path.join(path, f) if os.path.getsize(fp) == 0: LOGGER.warning( - f'{possible_er.name} contains the following 0-byte file: {f}. Review this file with the processing archivist.') + f"{possible_er.name} contains the following 0-byte file: {f}. Review this file with the processing archivist." + ) size += os.path.getsize(fp) else: LOGGER.warning( - f'{possible_er.name} does not contain an object folder. It will be omitted from the report.') + f"{possible_er.name} does not contain an object folder. It will be omitted from the report." + ) continue if count == 0: LOGGER.warning( - f'{possible_er.name} does not contain any files. It will be omitted from the report.') + f"{possible_er.name} does not contain any files. It will be omitted from the report." + ) continue if size == 0: LOGGER.warning( - f'{possible_er.name} contains no files with bytes. This ER is omitted from report. Review this ER with the processing archivist.') + f"{possible_er.name} contains no files with bytes. This ER is omitted from report. Review this ER with the processing archivist." + ) continue ers.append([str(er), size, count, possible_er.name]) return ers + def extract_collection_title(hdd_dir: pathlib.Path) -> str: for item in hdd_dir.iterdir(): - if re.match(r'M\d+\_FAcomponents', item.name): + if re.match(r"M\d+\_FAcomponents", item.name): return item.name else: LOGGER.warning( - 'Cannot find CollectionID_FAcomponents directory. Please use CollectionID_FAcomponents naming convention for the directory containing all ERs.' + "Cannot find CollectionID_FAcomponents directory. Please use CollectionID_FAcomponents naming convention for the directory containing all ERs." ) + def audit_ers(ers: list[list[str, str, str]]) -> None: er_numbers_used = {} for er in ers: - number = re.match(r'ER (\d+)', er[3]) + number = re.match(r"ER (\d+)", er[3]) if not number: LOGGER.warning( - f'ER is missing a number: {er[3]}. Review the ERs with the processing archivist' + f"ER is missing a number: {er[3]}. Review the ERs with the processing archivist" ) er_number = 0 else: @@ -120,7 +119,7 @@ def audit_ers(ers: list[list[str, str, str]]) -> None: for i in range(er_min, er_max): if i not in er_numbers_used.keys(): LOGGER.warning( - f'Collection uses ER {er_min} to ER {er_max}. ER {i} is skipped. Review the ERs with the processing archivist' + f"Collection uses ER {er_min} to ER {er_max}. ER {i} is skipped. Review the ERs with the processing archivist" ) # test for duplicate ers @@ -133,66 +132,60 @@ def audit_ers(ers: list[list[str, str, str]]) -> None: return None -def create_report( - input: list[list[str, int, int]], - report: dict -) -> dict: +def create_report(input: list[list[str, int, int]], report: dict) -> dict: for er in input: report = process_item(er, report) return report -def process_item( - input: list[str, int, int], - report: dict -) -> dict: - if not '/' in input[0]: - parts = re.match(r'(ER \d+)\s(.*)', input[0]) - report['children'].append({ - 'title': input[0], - 'er_number': parts.group(1), - 'er_name': parts.group(2), - 'file_size': input[1], - 'file_count': input[2] - }) +def process_item(input: list[str, int, int], report: dict) -> dict: + if not "/" in input[0]: + parts = re.match(r"(ER \d+)\s(.*)", input[0]) + report["children"].append( + { + "title": input[0], + "er_number": parts.group(1), + "er_name": parts.group(2), + "file_size": input[1], + "file_count": input[2], + } + ) else: - parent, child = input[0].split('/', maxsplit=1) + parent, child = input[0].split("/", maxsplit=1) input[0] = child - for item in report['children']: - if item['title'] == parent: + for item in report["children"]: + if item["title"] == parent: item = process_item(input, item) return report - report['children'].append( - process_item(input, {'title': parent, 'children': []}) + report["children"].append( + process_item(input, {"title": parent, "children": []}) ) return report -def write_report( - report: dict, - dest: pathlib.Path -) -> None: - with open(dest, 'w') as f: + +def write_report(report: dict, dest: pathlib.Path) -> None: + with open(dest, "w") as f: json.dump(report, f) + def main(): args = parse_args() - LOGGER.info('retrieving ER folder paths') + LOGGER.info("retrieving ER folder paths") ers = get_ers(args.dir) - LOGGER.info('creating report') + LOGGER.info("creating report") colltitle = extract_collection_title(args.dir) - stub_report = {'title': colltitle, 'children': []} + stub_report = {"title": colltitle, "children": []} full_report = create_report(ers, stub_report) - - LOGGER.info('writing report') - report_file = args.output.joinpath(f'{colltitle}.json') + LOGGER.info("writing report") + report_file = args.output.joinpath(f"{colltitle}.json") write_report(full_report, report_file) -if __name__=="__main__": +if __name__ == "__main__": main() diff --git a/tests/test_lint_ft.py b/tests/test_lint_ft.py index 8d9eb9b..0b536e2 100644 --- a/tests/test_lint_ft.py +++ b/tests/test_lint_ft.py @@ -4,6 +4,7 @@ import digarch_scripts.lint.lint_ft as lint_ft + # Unit tests # Argument tests def test_package_argument(tmp_path: Path, monkeypatch: pytest.MonkeyPatch): @@ -30,6 +31,7 @@ def test_directory_argument(tmp_path: Path, monkeypatch: pytest.MonkeyPatch): assert child_dir in args.packages + # linting tests @pytest.fixture def good_package(tmp_path: Path): @@ -37,8 +39,7 @@ def good_package(tmp_path: Path): f_object_data = pkg / "objects" / "data" f_object_data.mkdir(parents=True) - bag_files = ["bag-info.txt", "bagit.txt", - "manifest-md5.txt", "tagmanifest-md5.txt"] + bag_files = ["bag-info.txt", "bagit.txt", "manifest-md5.txt", "tagmanifest-md5.txt"] for f in bag_files: filepath = pkg / "objects" / f filepath.touch() @@ -64,6 +65,7 @@ def good_package(tmp_path: Path): return pkg + def test_top_folder_valid_name(good_package): """Top level folder name has to conform to ACQ_####_######""" result = lint_ft.package_has_valid_name(good_package) @@ -81,12 +83,14 @@ def test_top_folder_invalid_name(good_package): assert not result + def test_package_has_two_subfolders(good_package): """Second level folders must be two""" result = lint_ft.package_has_two_subfolders(good_package) assert result + def test_package_does_not_have_two_subfolders(good_package): """Test that package fails function when second level folders are not the correct number, i.e. 2""" @@ -98,6 +102,7 @@ def test_package_does_not_have_two_subfolders(good_package): assert not result + def test_sec_level_folder_valid_names(good_package): """Second level folders must only have objects and metadata folder""" result = lint_ft.package_has_valid_subfolder_names(good_package) @@ -116,12 +121,14 @@ def test_sec_level_folder_invalid_names(good_package): assert not result + def test_package_has_no_hidden_file(good_package): """The package should not have any hidden file""" result = lint_ft.package_has_no_hidden_file(good_package) assert result + def test_package_has_hidden_file(good_package): """Test that package fails function when there is any hidden file""" bad_package = good_package @@ -134,12 +141,14 @@ def test_package_has_hidden_file(good_package): assert not result + def test_package_has_no_zero_bytes_file(good_package): """The package should not have any zero bytes file""" result = lint_ft.package_has_no_zero_bytes_file(good_package) assert result + def test_package_has_zero_bytes_file(good_package): """Test that package fails function when there is any zero bytes file""" bad_package = good_package @@ -150,6 +159,7 @@ def test_package_has_zero_bytes_file(good_package): assert not result + def test_metadata_folder_is_flat(good_package): """The metadata folder should not have folder structure""" result = lint_ft.metadata_folder_is_flat(good_package) @@ -168,12 +178,14 @@ def test_metadata_folder_has_random_folder(good_package): assert not result + def test_metadata_folder_has_files(good_package): """The metadata folder should have one or more file""" result = lint_ft.metadata_folder_has_files(good_package) assert result + def test_metadata_folder_empty(good_package): """Test that package fails function when the metadata does not have any files""" @@ -185,12 +197,14 @@ def test_metadata_folder_empty(good_package): assert not result + def test_metadata_has_correct_naming_convention(good_package): """The metadata file name should be in the accepted list""" result = lint_ft.metadata_has_correct_naming_convention(good_package) assert result + def test_metadata_has_incorrect_naming_convention(good_package): """Test that package fails function when metadata file(s) has incorrect naming conventions""" @@ -202,6 +216,7 @@ def test_metadata_has_incorrect_naming_convention(good_package): assert not result + def test_objects_folder_correct_structure(good_package): """objects folder should have a data folder, which includes four files: bag-info.txt, bagit.txt, manifest-md5.txt and tagmanifest-md5.txt""" @@ -209,6 +224,7 @@ def test_objects_folder_correct_structure(good_package): assert result + def test_objects_folder_incorrect_structure(good_package): """Test that package fails function if it does not have the data folder, or missing any of the four files: bag-info.txt, bagit.txt, manifest-md5.txt @@ -221,12 +237,14 @@ def test_objects_folder_incorrect_structure(good_package): assert not result + def test_objects_folder_has_no_empty_folder(good_package): """The objects folder should not have any empty folders""" result = lint_ft.objects_folder_has_no_empty_folder(good_package) assert result + def test_objects_folder_has_empty_folder(good_package): """Test that package fails function if its objects folder has empty folder(s)""" bad_package = good_package @@ -238,12 +256,14 @@ def test_objects_folder_has_empty_folder(good_package): assert not result + def test_valid_package(good_package): """Test that package returns 'valid' when all tests are passed""" result = lint_ft.lint_package(good_package) assert result == "valid" + def test_invalid_package(good_package): """Test that package returns 'invalid' when failing some tests""" bad_package = good_package @@ -255,6 +275,7 @@ def test_invalid_package(good_package): assert result == "invalid" + def test_unclear_package(good_package): """Test that package returns 'needs review' when failing some tests""" bad_package = good_package diff --git a/tests/test_package_base.py b/tests/test_package_base.py index 338b5fb..3a9752c 100644 --- a/tests/test_package_base.py +++ b/tests/test_package_base.py @@ -44,7 +44,12 @@ def args(transfer_files): return args -CREATE_DIR = [(pb.create_acq_dir, 'ACQ_1234'), (pb.create_package_dir, 'ACQ_1234_123456')] +CREATE_DIR = [ + (pb.create_acq_dir, "ACQ_1234"), + (pb.create_package_dir, "ACQ_1234_123456"), +] + + @pytest.mark.parametrize("tested_function,id", CREATE_DIR) def test_create_dir_exc_on_readonly(tmp_path: Path, id: str, tested_function): """Test that package folder maker reports permission error""" @@ -63,7 +68,7 @@ def test_create_dir_exc_on_readonly(tmp_path: Path, id: str, tested_function): def test_create_acq_dir(tmp_path: Path): """Test that package folder maker makes ACQ and Carrier folders""" - id = 'ACQ_1234' + id = "ACQ_1234" base_dir = pb.create_acq_dir(tmp_path, id) assert base_dir.name == id @@ -108,7 +113,13 @@ def package_base_dir(tmp_path: Path, id: str): return pb.create_package_dir(tmp_path, id) -MOVE_FILE = [(pb.move_metadata_file, 'metadata'), (pb.move_diskimage_file, 'images'), (pb.move_stream_file, 'streams')] +MOVE_FILE = [ + (pb.move_metadata_file, "metadata"), + (pb.move_diskimage_file, "images"), + (pb.move_stream_file, "streams"), +] + + @pytest.mark.parametrize("test_function,dest", MOVE_FILE) def test_move_file(package_base_dir: Path, log: Path, test_function, dest: str): """Test that metadata folder and log file are moved successfully""" @@ -120,7 +131,9 @@ def test_move_file(package_base_dir: Path, log: Path, test_function, dest: str): @pytest.mark.parametrize("test_function,dest", MOVE_FILE) -def test_do_not_overwrite_file(package_base_dir: Path, log: Path, test_function, dest: str): +def test_do_not_overwrite_file( + package_base_dir: Path, log: Path, test_function, dest: str +): """Test that log file is not moved if a same name file exists in dest""" rclone_log = package_base_dir / dest / log.name @@ -131,12 +144,22 @@ def test_do_not_overwrite_file(package_base_dir: Path, log: Path, test_function, test_function(log, package_base_dir) assert log.exists() - assert f"{rclone_log} already exists in {dest} folder. Not moving." in str(exc.value) + assert f"{rclone_log} already exists in {dest} folder. Not moving." in str( + exc.value + ) + + +MOVE_FILES = [ + (pb.move_metadata_files, "metadata"), + (pb.move_diskimage_files, "images"), + (pb.move_stream_files, "streams"), +] -MOVE_FILES = [(pb.move_metadata_files, 'metadata'), (pb.move_diskimage_files, 'images'), (pb.move_stream_files, 'streams')] @pytest.mark.parametrize("test_function,dest", MOVE_FILES) -def test_move_multiple_file(package_base_dir: Path, log: Path, md5_manifest: Path, test_function, dest: str): +def test_move_multiple_file( + package_base_dir: Path, log: Path, md5_manifest: Path, test_function, dest: str +): """Test that multiple files are moved successfully""" md_files = [log, md5_manifest] diff --git a/tests/test_report_ftk_extents.py b/tests/test_report_ftk_extents.py index 3831089..55dd4a5 100644 --- a/tests/test_report_ftk_extents.py +++ b/tests/test_report_ftk_extents.py @@ -1,6 +1,9 @@ -import src.digarch_scripts.report.report_ftk_extents as rfe -import pytest import json + +import pytest + +import src.digarch_scripts.report.report_ftk_extents as rfe + try: from lxml import etree except ImportError: @@ -9,17 +12,19 @@ @pytest.fixture def parsed_report(): - return etree.parse('tests/fixtures/report/Report.xml') + return etree.parse("tests/fixtures/report/Report.xml") + def test_identify_all_ers(parsed_report): """Function should list every bookmark starting with ER""" ers = rfe.create_er_list(parsed_report) - just_ers = [er[0][-1].split(':')[0] for er in ers] + just_ers = [er[0][-1].split(":")[0] for er in ers] for i in range(1, 12): - assert f'ER {i}' in just_ers - assert 'ER 23' in just_ers + assert f"ER {i}" in just_ers + assert "ER 23" in just_ers + def test_hierarchy_nests_down_correctly(parsed_report): """Function should include organization hierarchy. @@ -27,45 +32,91 @@ def test_hierarchy_nests_down_correctly(parsed_report): ers = rfe.create_er_list(parsed_report) just_titles = [er[0] for er in ers] - assert ['Extents Test papers', 'Series 1', 'Subseries(1)', 'ER 1: Text, 2023'] in just_titles - assert ['Extents Test papers', 'Series 1', 'Subseries(1)', 'Subsubseries(2)', 'ER 2: File 15, 2023'] in just_titles + assert [ + "Extents Test papers", + "Series 1", + "Subseries(1)", + "ER 1: Text, 2023", + ] in just_titles + assert [ + "Extents Test papers", + "Series 1", + "Subseries(1)", + "Subsubseries(2)", + "ER 2: File 15, 2023", + ] in just_titles + def test_hierarchy_nests_empty_subseries(parsed_report): """Function should include organization hierarchy including empty levels""" ers = rfe.create_er_list(parsed_report) just_titles = [er[0] for er in ers] - assert ['Extents Test papers', 'Series 1', 'Subseries(1)', 'Subsubseries(2)', 'Subsubsubseries(3)', 'Subsubsubsubseries(4)', 'ER 10: Folder 2, 2023'] in just_titles + assert [ + "Extents Test papers", + "Series 1", + "Subseries(1)", + "Subsubseries(2)", + "Subsubsubseries(3)", + "Subsubsubsubseries(4)", + "ER 10: Folder 2, 2023", + ] in just_titles + def test_hierarchy_nests_up_correctly(parsed_report): """Function should be able to step down in hierarchy""" ers = rfe.create_er_list(parsed_report) just_titles = [er[0] for er in ers] - assert ['Extents Test papers', 'Series 1', 'Subseries(1)', 'Subsubseries(2) the second', 'ER 23: File 17, 2023'] in just_titles - assert ['Extents Test papers', 'Series 1', 'Subseries(1) the second', 'ER 4: File 18, 2023'] in just_titles + assert [ + "Extents Test papers", + "Series 1", + "Subseries(1)", + "Subsubseries(2) the second", + "ER 23: File 17, 2023", + ] in just_titles + assert [ + "Extents Test papers", + "Series 1", + "Subseries(1) the second", + "ER 4: File 18, 2023", + ] in just_titles + def test_hierarchy_nests_reverse_order_bookmarks(parsed_report): """Function should parse bottom-up hierarchy""" ers = rfe.create_er_list(parsed_report) just_titles = [er[0] for er in ers] - assert ['Extents Test papers', 'Series 2', 'ER 9: File 20,2023'] in just_titles - assert ['Extents Test papers', 'Series 2', 'Subseries(1) of Series 2', 'ER 8: File 2, 2023'] in just_titles - assert ['Extents Test papers', 'Series 2', 'Subseries(1) of Series 2', 'Subsubseries(2) of Series 2', 'ER 7: File 19, 2023'] in just_titles + assert ["Extents Test papers", "Series 2", "ER 9: File 20,2023"] in just_titles + assert [ + "Extents Test papers", + "Series 2", + "Subseries(1) of Series 2", + "ER 8: File 2, 2023", + ] in just_titles + assert [ + "Extents Test papers", + "Series 2", + "Subseries(1) of Series 2", + "Subsubseries(2) of Series 2", + "ER 7: File 19, 2023", + ] in just_titles + def test_er_outside_of_series(parsed_report): """Function should include capture ERs even if they're not in a series""" ers = rfe.create_er_list(parsed_report) just_titles = [er[0] for er in ers] - assert ['Extents Test papers', 'ER 10: File 21,2023'] in just_titles + assert ["Extents Test papers", "ER 10: File 21,2023"] in just_titles + def test_correct_report_many_files(parsed_report): """Test if file count and byte count is completed correctly""" bookmark_tables = rfe.transform_bookmark_tables(parsed_report) - er_with_many_files = [['ER 1', 'bk6001']] + er_with_many_files = [["ER 1", "bk6001"]] extents = rfe.add_extents_to_ers(er_with_many_files, bookmark_tables) # bytes @@ -73,12 +124,13 @@ def test_correct_report_many_files(parsed_report): # files assert extents[0][2] == 7 + def test_correct_report_on_er_with_folder_bookmarked(parsed_report): """Test if file count and byte count is completed correctly when bookmark includes a folder that is bookmarked""" bookmark_tables = rfe.transform_bookmark_tables(parsed_report) - er_with_folder = [['ER 10', 'bk12001']] + er_with_folder = [["ER 10", "bk12001"]] extents = rfe.add_extents_to_ers(er_with_folder, bookmark_tables) # bytes @@ -86,12 +138,13 @@ def test_correct_report_on_er_with_folder_bookmarked(parsed_report): # files assert extents[0][2] == 5 + def test_correct_report_on_er_with_folder_not_bookmarked(parsed_report): """Test if file count and byte count is completed correctly when bookmark includes a folder that isn't bookmarked""" bookmark_tables = rfe.transform_bookmark_tables(parsed_report) - er_with_folder = [['ER 3', 'bk11001']] + er_with_folder = [["ER 3", "bk11001"]] extents = rfe.add_extents_to_ers(er_with_folder, bookmark_tables) # bytes @@ -99,11 +152,12 @@ def test_correct_report_on_er_with_folder_not_bookmarked(parsed_report): # files assert extents[0][2] == 5 + def test_correct_report_1_file(parsed_report): """Test if file count and byte count is completed correctly for one file""" bookmark_tables = rfe.transform_bookmark_tables(parsed_report) - er_with_one_file = [['ER 2', 'bk9001']] + er_with_one_file = [["ER 2", "bk9001"]] extents = rfe.add_extents_to_ers(er_with_one_file, bookmark_tables) # bytes @@ -111,38 +165,45 @@ def test_correct_report_1_file(parsed_report): # files assert extents[0][2] == 1 + def test_warn_on_no_files_in_er(parsed_report, caplog): """Test if warning is logged for empty bookmarks and ER is omitted from report""" bookmark_tables = rfe.transform_bookmark_tables(parsed_report) - er_with_no_files = [[['hier', 'archy', 'list'], 'bk27001', 'ER 5: No Files, 2023']] + er_with_no_files = [[["hier", "archy", "list"], "bk27001", "ER 5: No Files, 2023"]] extents = rfe.add_extents_to_ers(er_with_no_files, bookmark_tables) assert extents == [] - log_msg = f'{er_with_no_files[0][-1]} does not contain any files. It will be omitted from the report.' + log_msg = f"{er_with_no_files[0][-1]} does not contain any files. It will be omitted from the report." assert log_msg in caplog.text + def test_warn_on_a_no_byte_file_in_er(parsed_report, caplog): """Test if warning is logged for empty files in an ER""" bookmark_tables = rfe.transform_bookmark_tables(parsed_report) - er_with_no_bytes = [[['hier', 'archy', 'list'], 'bk28001', 'ER 6: Zero Length, 2023']] + er_with_no_bytes = [ + [["hier", "archy", "list"], "bk28001", "ER 6: Zero Length, 2023"] + ] rfe.add_extents_to_ers(er_with_no_bytes, bookmark_tables) - log_msg = f'{er_with_no_bytes[0][-1]} contains the following 0-byte file: file00.txt. Review this file with the processing archivist.' + log_msg = f"{er_with_no_bytes[0][-1]} contains the following 0-byte file: file00.txt. Review this file with the processing archivist." assert log_msg in caplog.text + def test_warn_on_no_bytes_in_er(parsed_report, caplog): """Test if warning is logged for bookmarks with 0 bytes total and ER is omitted from report""" bookmark_tables = rfe.transform_bookmark_tables(parsed_report) - er_with_no_bytes = [[['hier', 'archy', 'list'], 'bk28001', 'ER 6: Zero Length, 2023']] + er_with_no_bytes = [ + [["hier", "archy", "list"], "bk28001", "ER 6: Zero Length, 2023"] + ] extents = rfe.add_extents_to_ers(er_with_no_bytes, bookmark_tables) assert extents == [] - log_msg = f'{er_with_no_bytes[0][-1]} contains no files with bytes. This ER is omitted from report. Review this ER with the processing archivist.' + log_msg = f"{er_with_no_bytes[0][-1]} contains no files with bytes. This ER is omitted from report. Review this ER with the processing archivist." assert log_msg in caplog.text @@ -150,7 +211,8 @@ def test_extract_collection_name_from_report(parsed_report): """Test if collection name is taken from XML""" coll_name = rfe.extract_collection_title(parsed_report) - assert coll_name == 'M12345 Extents Test' + assert coll_name == "M12345 Extents Test" + @pytest.fixture def ers_with_extents_list(parsed_report): @@ -160,40 +222,45 @@ def ers_with_extents_list(parsed_report): return ers_with_extents + def test_json_objects_contains_expected_fields(ers_with_extents_list): """Test if final report aligns with expectations for ASpace import""" - full_dict = {'title': 'slug', 'children': []} + full_dict = {"title": "slug", "children": []} for er in ers_with_extents_list: rfe.create_report(er, full_dict) def recursive_validator(er_dict): for key, value in er_dict.items(): - if key == 'title': + if key == "title": assert type(value) is str - elif key == 'children': + elif key == "children": assert type(value) is list for child in value: recursive_validator(child) - elif key == 'er_number': + elif key == "er_number": assert type(value) is str - elif key == 'er_name': + elif key == "er_name": assert type(value) is str - elif key == 'file_size': + elif key == "file_size": assert type(value) is int - elif key == 'file_count': + elif key == "file_count": assert type(value) is int else: assert False recursive_validator(full_dict) + def test_skipped_ER_number_behavior(parsed_report, caplog): """Test if script flags when ER numbering is not sequential""" ers = rfe.create_er_list(parsed_report) for i in range(13, 23): - assert f'Collection uses ER 1 to ER 23. ER {i} is skipped. Review the ERs with the processing archivist' in caplog.text + assert ( + f"Collection uses ER 1 to ER 23. ER {i} is skipped. Review the ERs with the processing archivist" + in caplog.text + ) def test_ER_missing_number_behavior(parsed_report, caplog): @@ -203,7 +270,7 @@ def test_ER_missing_number_behavior(parsed_report, caplog): rfe.audit_ers(ers) - log_msg = f'ER is missing a number: ER ?: File 21,2023. Review the ERs with the processing archivist' + log_msg = f"ER is missing a number: ER ?: File 21,2023. Review the ERs with the processing archivist" assert log_msg in caplog.text @@ -213,18 +280,20 @@ def test_repeated_ER_number_behavior(parsed_report, caplog): rfe.audit_ers(ers) - log_msg = f'ER 10 is used multiple times: ER 10: File 21,2023, ER 10: Folder 2, 2023. Review the ERs with the processing archivist' + log_msg = f"ER 10 is used multiple times: ER 10: File 21,2023, ER 10: Folder 2, 2023. Review the ERs with the processing archivist" assert log_msg in caplog.text + @pytest.fixture def expected_json(): - with open('tests/fixtures/report/report.json') as f: + with open("tests/fixtures/report/report.json") as f: report = json.load(f) return report + def test_create_correct_json(ers_with_extents_list, expected_json): """Test that final report matches total expectations""" - dct = {'title': 'coll', 'children': []} + dct = {"title": "coll", "children": []} for er in ers_with_extents_list: dct = rfe.create_report(er, dct) diff --git a/tests/test_report_hdd_extents.py b/tests/test_report_hdd_extents.py index ff7cef8..d68b915 100644 --- a/tests/test_report_hdd_extents.py +++ b/tests/test_report_hdd_extents.py @@ -1,27 +1,32 @@ -import src.digarch_scripts.report.report_hdd_extents as rhe -import pytest -import shutil -import re -import pathlib import json +import pathlib +import re +import shutil + +import pytest + +import src.digarch_scripts.report.report_hdd_extents as rhe + @pytest.fixture() def arranged_collection(tmp_path: pathlib.Path): - path = tmp_path.joinpath('hdd') - shutil.copytree('tests/fixtures/report', path) + path = tmp_path.joinpath("hdd") + shutil.copytree("tests/fixtures/report", path) return path + def test_identify_all_ers(arranged_collection): """Function should list every folder starting with ER""" ers = rhe.get_ers(arranged_collection) print(ers) - just_ers = [re.search(r'ER\s\d+', er[0]).group() for er in ers] + just_ers = [re.search(r"ER\s\d+", er[0]).group() for er in ers] for i in range(1, 4): - assert f'ER {i}' in just_ers + assert f"ER {i}" in just_ers for i in range(7, 12): - assert f'ER {i}' in just_ers - assert 'ER 23' in just_ers + assert f"ER {i}" in just_ers + assert "ER 23" in just_ers + def test_hierarchy_nests_down_correctly(arranged_collection): """Function should include organization hierarchy. @@ -30,28 +35,37 @@ def test_hierarchy_nests_down_correctly(arranged_collection): just_titles = [er[0] for er in ers] print(just_titles) - assert 'M12345_FAcomponents/Series 1/Subseries(1)/ER 1 Text, 2023' in just_titles - assert 'M12345_FAcomponents/Series 1/Subseries(1)/Subsubseries(2)/ER 2 File 15, 2023' in just_titles + assert "M12345_FAcomponents/Series 1/Subseries(1)/ER 1 Text, 2023" in just_titles + assert ( + "M12345_FAcomponents/Series 1/Subseries(1)/Subsubseries(2)/ER 2 File 15, 2023" + in just_titles + ) + def test_hierarchy_nests_empty_subseries(arranged_collection): """Function should include organization hierarchy including empty levels""" ers = rhe.get_ers(arranged_collection) just_titles = [er[0] for er in ers] - assert 'M12345_FAcomponents/Series 1/Subseries(1)/Subsubseries(2)/Subsubsubseries(3)/Subsubsubsubseries(4)/ER 10 Folder 2, 2023' in just_titles + assert ( + "M12345_FAcomponents/Series 1/Subseries(1)/Subsubseries(2)/Subsubsubseries(3)/Subsubsubsubseries(4)/ER 10 Folder 2, 2023" + in just_titles + ) + def test_er_outside_of_series(arranged_collection): """Function should include capture ERs even if they're not in a series""" ers = rhe.get_ers(arranged_collection) just_titles = [er[0] for er in ers] - assert 'M12345_FAcomponents/ER 10 File 21,2023' in just_titles + assert "M12345_FAcomponents/ER 10 File 21,2023" in just_titles + def test_correct_report_many_files(arranged_collection): """Test if file count and byte count is completed correctly""" ers = rhe.get_ers(arranged_collection) - er_with_many_files = 'ER 1 Text, 2023' + er_with_many_files = "ER 1 Text, 2023" for er in ers: if er[3] == er_with_many_files: bytes, files = er[1:3] @@ -62,12 +76,13 @@ def test_correct_report_many_files(arranged_collection): # files assert files == 7 + def test_correct_report_on_er_with_folder_included(arranged_collection): """Test if file count and byte count is completed correctly when bookmark includes a folder that is bookmarked""" ers = rhe.get_ers(arranged_collection) - er_with_folder = 'ER 10 Folder 2, 2023' + er_with_folder = "ER 10 Folder 2, 2023" for er in ers: if er[3] == er_with_folder: bytes, files = er[1:3] @@ -82,7 +97,7 @@ def test_correct_report_1_file(arranged_collection): """Test if file count and byte count is completed correctly for one file""" ers = rhe.get_ers(arranged_collection) - er_with_one_file = 'ER 2 File 15, 2023' + er_with_one_file = "ER 2 File 15, 2023" for er in ers: if er[3] == er_with_one_file: bytes, files = er[1:3] @@ -97,9 +112,9 @@ def test_warn_on_no_files_in_er(arranged_collection, caplog): """Test if warning is logged for empty bookmarks and ER is omitted from report""" ers = rhe.get_ers(arranged_collection) - er_with_no_files = 'ER 5 No Files, 2023' + er_with_no_files = "ER 5 No Files, 2023" - log_msg = f'{er_with_no_files} does not contain any files. It will be omitted from the report.' + log_msg = f"{er_with_no_files} does not contain any files. It will be omitted from the report." assert log_msg in caplog.text @@ -107,11 +122,11 @@ def test_warn_on_a_no_byte_file_in_er(arranged_collection, caplog): """Test if warning is logged for empty files in an ER""" ers = rhe.get_ers(arranged_collection) - er_with_no_bytes = 'ER 6 Zero Length, 2023' + er_with_no_bytes = "ER 6 Zero Length, 2023" # rfe.add_extents_to_ers(er_with_no_bytes, bookmark_tables) # log warning, script should continue running # 'ER xxx: Title contain zero byte files.' - log_msg = f'{er_with_no_bytes} contains the following 0-byte file: file00.txt. Review this file with the processing archivist.' + log_msg = f"{er_with_no_bytes} contains the following 0-byte file: file00.txt. Review this file with the processing archivist." assert log_msg in caplog.text @@ -119,11 +134,11 @@ def test_warn_on_no_bytes_in_er(arranged_collection, caplog): """Test if warning is logged for bookmarks with 0 bytes total and ER is omitted from report""" ers = rhe.get_ers(arranged_collection) - er_with_no_bytes = 'ER 6 Zero Length, 2023' + er_with_no_bytes = "ER 6 Zero Length, 2023" # rfe.add_extents_to_ers(er_with_no_bytes, bookmark_tables) # log warning, script should continue running # 'ER xxx: Title does not contain any bytes. It will be omitted from the report' - log_msg = f'{er_with_no_bytes} contains no files with bytes. This ER is omitted from report. Review this ER with the processing archivist.' + log_msg = f"{er_with_no_bytes} contains no files with bytes. This ER is omitted from report. Review this ER with the processing archivist." assert log_msg in caplog.text @@ -131,9 +146,9 @@ def test_warn_on_no_objects_in_er(arranged_collection, caplog): """Test if warning is logged for empty bookmarks and ER is omitted from report""" ers = rhe.get_ers(arranged_collection) - er_with_no_files = 'ER 13 No objects, 2023' + er_with_no_files = "ER 13 No objects, 2023" - log_msg = f'{er_with_no_files} does not contain an object folder. It will be omitted from the report.' + log_msg = f"{er_with_no_files} does not contain an object folder. It will be omitted from the report." assert log_msg in caplog.text @@ -141,16 +156,18 @@ def test_extract_collection_name(arranged_collection): """Test if collection name is taken from XML""" coll_name = rhe.extract_collection_title(arranged_collection) - assert coll_name == 'M12345_FAcomponents' + assert coll_name == "M12345_FAcomponents" + def test_warn_on_bad_collection_name(arranged_collection, caplog): """Test if collection name is taken from XML""" - coll_name_folder = arranged_collection / 'M12345_FAcomponents' - coll_name_folder.rename(arranged_collection / 'Test_Coll') + coll_name_folder = arranged_collection / "M12345_FAcomponents" + coll_name_folder.rename(arranged_collection / "Test_Coll") coll_name = rhe.extract_collection_title(arranged_collection) - log_msg = 'Cannot find CollectionID_FAcomponents directory. Please use CollectionID_FAcomponents naming convention for the directory containing all ERs.' + log_msg = "Cannot find CollectionID_FAcomponents directory. Please use CollectionID_FAcomponents naming convention for the directory containing all ERs." assert log_msg in caplog.text + def test_ER_missing_number_behavior(arranged_collection, caplog): """Test if script flags when ER number is reused""" ers = rhe.get_ers(arranged_collection) @@ -158,51 +175,55 @@ def test_ER_missing_number_behavior(arranged_collection, caplog): rhe.audit_ers(ers) - log_msg = f'ER is missing a number: ER ? File 21,2023. Review the ERs with the processing archivist' + log_msg = f"ER is missing a number: ER ? File 21,2023. Review the ERs with the processing archivist" assert log_msg in caplog.text + def test_skipped_ER_number_behavior(arranged_collection, caplog): ers = rhe.get_ers(arranged_collection) rhe.audit_ers(ers) # log warning, but continue operation - for number in range(13,22): - log_msg = f'Collection uses ER 1 to ER 23. ER {number} is skipped. Review the ERs with the processing archivist' + for number in range(13, 22): + log_msg = f"Collection uses ER 1 to ER 23. ER {number} is skipped. Review the ERs with the processing archivist" assert log_msg in caplog.text + def test_repeated_ER_number_behavior(arranged_collection, caplog): ers = rhe.get_ers(arranged_collection) rhe.audit_ers(ers) - log_msg = 'ER 10 is used multiple times' + log_msg = "ER 10 is used multiple times" assert log_msg in caplog.text + @pytest.fixture def extracted_ers(arranged_collection): return rhe.get_ers(arranged_collection) + def test_json_objects_contains_expected_fields(extracted_ers): """Test if final report aligns with expectations for ASpace import""" - full_dict = rhe.create_report(extracted_ers, {'title': 'test', 'children': []}) + full_dict = rhe.create_report(extracted_ers, {"title": "test", "children": []}) def recursive_validator(er_dict): for key, value in er_dict.items(): - if key == 'title': + if key == "title": assert type(value) is str - elif key == 'children': + elif key == "children": assert type(value) is list for child in value: recursive_validator(child) - elif key == 'er_number': + elif key == "er_number": assert type(value) is str - elif key == 'er_name': + elif key == "er_name": assert type(value) is str - elif key == 'file_size': + elif key == "file_size": assert type(value) is int - elif key == 'file_count': + elif key == "file_count": assert type(value) is int else: assert False @@ -212,19 +233,19 @@ def recursive_validator(er_dict): @pytest.fixture def expected_json(): - with open('tests/fixtures/report/report.json') as f: + with open("tests/fixtures/report/report.json") as f: raw = f.read() - #adjust fixture for hdd conventions + # adjust fixture for hdd conventions colons_removed = re.sub(r"(ER \d+):", r"\1", raw) report = json.loads(colons_removed) - report['children'][0]['title'] = 'M12345_FAcomponents' - + report["children"][0]["title"] = "M12345_FAcomponents" return report + def test_create_correct_json(extracted_ers, expected_json): """Test that final report matches total expectations""" - dct = rhe.create_report(extracted_ers, {'title': 'coll', 'children': []}) + dct = rhe.create_report(extracted_ers, {"title": "coll", "children": []}) assert dct == expected_json From 18177b56dc428d5e69919dcb8f54b4b9784eb139 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Tue, 28 May 2024 13:52:29 -0400 Subject: [PATCH 09/35] tests and code to validate filesets --- src/digarch_scripts/package/package_images.py | 108 ++++++++++--- tests/test_package_images.py | 150 +++++++++++++++++- 2 files changed, 230 insertions(+), 28 deletions(-) diff --git a/src/digarch_scripts/package/package_images.py b/src/digarch_scripts/package/package_images.py index 64d1d34..8961e2b 100644 --- a/src/digarch_scripts/package/package_images.py +++ b/src/digarch_scripts/package/package_images.py @@ -1,9 +1,17 @@ import argparse +import logging import re from pathlib import Path import digarch_scripts.package.package_base as pb +LOGGER = logging.getLogger(__name__) +LOGGER.setLevel(logging.INFO) + +IMG_EXTS = [".001", ".img", ".dsk"] +LOG_EXTS = [".log"] +STREAM_EXTS = [""] + def parse_args() -> argparse.Namespace: def extant_path(p: str) -> Path: @@ -23,54 +31,108 @@ def acq_id(id: str) -> Path: return id parser = argparse.ArgumentParser(description="test") - parser.add_argument("--images_folder", required=True, type=extant_path, help='Path to working images folder') - parser.add_argument("--dest", required=True, type=extant_path, help='Path to packaged images folder') - parser.add_argument("--acqid", required=True, type=acq_id, help='ACQ_####') - parser.add_argument("--logs_folder", required=False, type=extant_path, help='Path to working logs folder') - parser.add_argument("--streams_folder", required=False, type=extant_path, help='Path to working streams folder') + parser.add_argument( + "--images_folder", + required=True, + type=extant_path, + help="Path to working images folder", + ) + parser.add_argument( + "--dest", required=True, type=extant_path, help="Path to packaged images folder" + ) + parser.add_argument("--acqid", required=True, type=acq_id, help="ACQ_####") + parser.add_argument( + "--logs_folder", + required=False, + type=extant_path, + help="Path to working logs folder", + ) + parser.add_argument( + "--streams_folder", + required=False, + type=extant_path, + help="Path to working streams folder", + ) return parser.parse_args() -def find_category_files(file_groups: dict, source_dir: Path, acq_id: str, category: str) -> dict: +def find_category_of_carrier_files( + carrier_files: dict, acq_id: str, source_dir: Path, exts: list, category: str +) -> dict: for file in source_dir.iterdir(): - carrier_id_match = re.search(rf'{acq_id}_\d\d\d\d\d\d+', file.name) + if not file.suffix in exts: + continue + carrier_id_match = re.search(rf"{acq_id}_\d\d\d\d\d\d+", file.name) if not carrier_id_match: continue carrier_id = carrier_id_match.group(0) - if not carrier_id in file_groups: - file_groups[carrier_id] = {category: []} - elif not category in file_groups[carrier_id]: - file_groups[carrier_id][category] = [] + if not carrier_id in carrier_files: + carrier_files[carrier_id] = {category: []} + elif not category in carrier_files[carrier_id]: + carrier_files[carrier_id][category] = [] - file_groups[carrier_id][category].append(file) + carrier_files[carrier_id][category].append(file) + + return carrier_files - return file_groups +def find_carrier_files( + acq_id: str, images_dir: Path, log_dir: Path, stream_dir: Path +) -> dict: + carrier_files = find_category_of_carrier_files( + {}, acq_id, images_dir, IMG_EXTS, "images" + ) + carrier_files = find_category_of_carrier_files( + carrier_files, acq_id, log_dir, LOG_EXTS, "logs" + ) + carrier_files = find_category_of_carrier_files( + carrier_files, acq_id, stream_dir, STREAM_EXTS, "streams" + ) -def find_carrier_files(carrier_files: dict, log_dir: Path, images_dir: Path, stream_dir: Path, acq_id: str) -> dict: - carrier_files = find_category_files(carrier_files, log_dir, acq_id, 'logs') - carrier_files = find_category_files(carrier_files, images_dir, acq_id, 'images') - carrier_files = find_category_files(carrier_files, stream_dir, acq_id, 'streams') + if not carrier_files: + raise Warning(f"No files found with the acquisition ID {acq_id} in filename") return carrier_files +def validate_carrier_files(carrier_files): + for carrier_name in carrier_files: + carrier = carrier_files[carrier_name] + missing = [] + for key in ['images', 'logs', 'streams']: + if not key in carrier.keys(): + missing.append(key) + + if missing: + LOGGER.warning(f'The following categories of files were not found for {carrier_name}: {", ".join(missing)} ') + + if 'images' in carrier: + for image_file in carrier['images']: + if image_file.stat().st_size == 0: + LOGGER.warning(f'The following image file is 0-bytes: {image_file}') + + return + def package_carriers(carrier_files: dict, acq_dir: Path) -> None: for carrier, files in carrier_files.items(): base_dir = pb.create_package_dir(acq_dir, carrier) - pb.move_metadata_files(files['logs'], base_dir) - pb.move_diskimage_files(files['images'], base_dir) - pb.move_stream_files(files['streams'], base_dir) + pb.move_metadata_files(files["logs"], base_dir) + pb.move_diskimage_files(files["images"], base_dir) + pb.move_stream_files(files["streams"], base_dir) def main(): args = parse_args() - carrier_files = find_carrier_files({}, args.logs_folder, args.images_folder, args.streams_folder, args.acqid) - package_carriers(carrier_files, args.dest) - + carrier_files = find_carrier_files( + args.acqid, args.images_folder, args.logs_folder, args.streams_folder + ) + if validate_carrier_files(carrier_files): + package_carriers(carrier_files, args.dest) + else: + LOGGER.error("1 or more errors with files for a carrier. Please address warnings and re-run") if __name__ == "__main__": diff --git a/tests/test_package_images.py b/tests/test_package_images.py index 847bfcc..2f518ed 100644 --- a/tests/test_package_images.py +++ b/tests/test_package_images.py @@ -1,10 +1,10 @@ -import digarch_scripts.package.package_images as pi - -from pathlib import Path -import pytest import shutil +from pathlib import Path import bagit +import pytest + +import digarch_scripts.package.package_images as pi @pytest.fixture @@ -13,6 +13,7 @@ def transfer_files(tmp_path: Path, request): shutil.copytree(fixture_data, tmp_path, dirs_exist_ok=True) return tmp_path + # Test command-line arguments @pytest.fixture def args(transfer_files): @@ -59,7 +60,7 @@ def test_optional_args( for i in [3, 4]: # remove a pair of list items (arg and value) for each test part_args = args[0 : 2 * i + 1] + args[2 * i + 3 :] - missing_arg = args[2*i] + missing_arg = args[2 * i] monkeypatch.setattr("sys.argv", part_args) @@ -101,6 +102,144 @@ def test_id_arg_must_match_pattern( assert f"bad_id does not match" in stderr +def test_carrier_files_found(transfer_files): + acq_id = "ACQ_1234" + + carrier_files = pi.find_carrier_files( + acq_id, + transfer_files / "images", + transfer_files / "logs", + transfer_files / "streams", + ) + + carrier1 = f"{acq_id}_123456" + assert carrier1 in carrier_files + for key in ["images", "logs", "streams"]: + assert key in carrier_files[carrier1] + for key in carrier_files[carrier1]: + for item in carrier_files[carrier1][key]: + assert isinstance(item, Path) + + +def test_acqid_not_found(transfer_files): + acq_id = "ACQ_1111" + + with pytest.raises(Warning) as exc: + pi.find_carrier_files( + acq_id, + transfer_files / "images", + transfer_files / "logs", + transfer_files / "streams", + ) + + assert f"No files found with the acquisition ID {acq_id} in filename" in str( + exc.value + ) + + +def test_file_found(transfer_files): + acq_id = "ACQ_1234" + + carrier_files = {} + carrier_files = pi.find_category_of_carrier_files( + carrier_files, acq_id, transfer_files / "images", [".img"], "images" + ) + + assert ( + transfer_files / "images" / "ACQ_1234_123456.img" in carrier_files[f"{acq_id}_123456"]["images"] + ) + + +def test_ignore_unknown_extension_for_category(transfer_files): + acq_id = "ACQ_1234" + + carrier_files = {} + carrier_files = pi.find_category_of_carrier_files( + carrier_files, acq_id, transfer_files / "images", [".001"], "images" + ) + + assert f"{acq_id}_123456" not in carrier_files + + +def test_multiple_files_found(transfer_files): + acq_id = "ACQ_1234" + + carrier_files = {} + carrier_files = pi.find_category_of_carrier_files( + carrier_files, acq_id, transfer_files / "logs", [".log"], "logs" + ) + + assert len(carrier_files[f"{acq_id}_123456"]["logs"]) == 2 + + +@pytest.fixture +def carrier_files(transfer_files): + acq_id = "ACQ_1234" + + carrier_files = pi.find_carrier_files( + acq_id, + transfer_files / "images", + transfer_files / "logs", + transfer_files / "streams", + ) + return carrier_files + +def test_good_validate_carrier(carrier_files, caplog): + pi.validate_carrier_files(carrier_files) + + assert not caplog.text + + +@pytest.mark.parametrize("key", ['images', 'logs', 'streams']) +def test_warn_carrier_with_one_missing_category(carrier_files, key, caplog): + carrier_files['ACQ_1234_123456'].pop(key) + + pi.validate_carrier_files(carrier_files) + + assert f'The following categories of files were not found for ACQ_1234_123456: {key}' in caplog.text + + +def test_warn_carrier_with_logs_no_images_or_streams(caplog): + carrier_files = { + 'ACQ_1234_123456': { + 'logs': [Path('ACQ_1234_123456.log')] + } + } + pi.validate_carrier_files(carrier_files) + + assert f'The following categories of files were not found for ACQ_1234_123456: images, streams' in caplog.text + + +def test_warn_carrier_with_streams_no_images_or_logs(caplog): + carrier_files = { + 'ACQ_1234_123456': { + 'streams': [Path('ACQ_1234_123456_streams')] + } + } + pi.validate_carrier_files(carrier_files) + + assert f'The following categories of files were not found for ACQ_1234_123456: images, logs' in caplog.text + + + +def test_warn_and_skip_0_length_image(carrier_files, caplog): + carrier_files["ACQ_1234_123457"]["images"][0].unlink() + carrier_files["ACQ_1234_123457"]["images"][0].touch() + pi.validate_carrier_files(carrier_files) + + assert f'The following image file is 0-bytes: {str(carrier_files["ACQ_1234_123457"]["images"][0])}' in caplog.text + + +def test_warn_streams_missing_a_side(): + #TODO + assert True + + + + + + +''' def test_full_run( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list ): @@ -113,3 +252,4 @@ def test_full_run( assert acq_dir.exists() assert "ACQ_1234_123456" in [x.name for x in acq_dir.iterdir()] +''' From 6913a27542371ef0c513cf96f40d24ae983d0258 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Tue, 28 May 2024 14:49:05 -0400 Subject: [PATCH 10/35] add more bagging functions, untested --- src/digarch_scripts/package/package_base.py | 52 +++++++++++++++++---- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index e213b4f..6ad9cf2 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -98,20 +98,48 @@ def move_metadata_files(md_paths: list[Path], pkg_dir: Path) -> None: return move_files(md_paths, pkg_dir, "metadata") -def move_diskimage_file(image_path: Path, pkg_dir: Path) -> None: - return move_file(image_path, pkg_dir, "images") +def move_diskimage_files(image_paths: list[Path], pkg_dir: Path) -> None: + return move_files(image_paths, pkg_dir, "data") -def move_diskimage_files(image_paths: list[Path], pkg_dir: Path) -> None: - return move_files(image_paths, pkg_dir, "images") +def move_stream_files(stream_paths: Path, pkg_dir: Path) -> None: + return move_files(stream_paths, pkg_dir, "data") -def move_stream_file(md_path: Path, pkg_dir: Path) -> None: - return move_file(md_path, pkg_dir, "streams") +def move_and_bag_diskimage_files(image_paths: list[Path], pkg_dir: Path) -> None: + bag_dir = pkg_dir / "images" + if not bag_dir.exists(): + bag_dir.mkdir() + create_bagit_manifest(image_paths, bag_dir) + move_diskimage_files(image_paths, bag_dir) + create_bag_tag_files(bag_dir) + + return None -def move_stream_files(md_path: Path, pkg_dir: Path) -> None: - return move_files(md_path, pkg_dir, "streams") +def move_and_bag_stream_files(stream_path: list[Path], pkg_dir: Path) -> None: + bag_dir = pkg_dir / "streams" + if not bag_dir.exists(): + bag_dir.mkdir() + stream_paths = list(stream_path[0].iterdir()) + create_bagit_manifest(stream_paths, bag_dir) + move_stream_files(stream_paths, bag_dir) + create_bag_tag_files(bag_dir) + + return None + + +def create_bagit_manifest(paths: list[Path], bag_dir: Path) -> None: + manifest_lines = [] + for path in paths: + md5_hash = bagit.generate_manifest_lines(str(path), ["md5"])[0][1] + manifest_lines.append([md5_hash, Path("data") / path.name]) + + with open(bag_dir / "manifest-md5.txt", "w") as f: + for line in manifest_lines: + f.write(f"{line[0]} {line[1]}") + + return None def create_bag_in_objects(payload_path: Path, md5_path: Path, pkg_dir: Path) -> None: @@ -121,6 +149,7 @@ def create_bag_in_objects(payload_path: Path, md5_path: Path, pkg_dir: Path) -> convert_rclone_md5_to_bagit_manifest(md5_path, bag_dir) # generate baginfo.txt and bagit.txt (copying code snippet from bagit) create_bag_tag_files(bag_dir) + return None @@ -140,6 +169,7 @@ def move_payload(payload_path: Path, bag_dir: Path) -> None: raise FileExistsError(f"{new_ob_path} already exists. Not moving.") a_file.rename(new_ob_path) + return None @@ -162,18 +192,20 @@ def convert_rclone_md5_to_bagit_manifest(md5_path: Path, bag_dir: Path) -> None: return None -def create_bag_tag_files(bag_dir: Path): +def create_bag_tag_files(bag_dir: Path) -> None: txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n""" with open(bag_dir / "bagit.txt", "w") as bagit_file: bagit_file.write(txt) bag_info = {} bag_info["Bagging-Date"] = date.strftime(date.today(), "%Y-%m-%d") - bag_info["Bag-Software-Agent"] = "package_cloud.py" + bag_info["Bag-Software-Agent"] = "digarch_scripts" total_bytes, total_files = get_oxum(bag_dir / "data") bag_info["Payload-Oxum"] = f"{total_bytes}.{total_files}" bagit._make_tag_file(bag_dir / "bag-info.txt", bag_info) + return None + def get_oxum(payload_dir: Path) -> tuple[int, int]: total_bytes = 0 From 8c117d3cb368c823ce8a2b8e6e8aa09219958c0f Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Tue, 28 May 2024 14:49:22 -0400 Subject: [PATCH 11/35] initial functioning bag and move, untested --- src/digarch_scripts/package/package_images.py | 36 ++++++++--- tests/test_package_images.py | 61 ++++++++++++------- 2 files changed, 66 insertions(+), 31 deletions(-) diff --git a/src/digarch_scripts/package/package_images.py b/src/digarch_scripts/package/package_images.py index 8961e2b..89582d5 100644 --- a/src/digarch_scripts/package/package_images.py +++ b/src/digarch_scripts/package/package_images.py @@ -101,26 +101,40 @@ def validate_carrier_files(carrier_files): for carrier_name in carrier_files: carrier = carrier_files[carrier_name] missing = [] - for key in ['images', 'logs', 'streams']: + for key in ["images", "logs", "streams"]: if not key in carrier.keys(): missing.append(key) if missing: - LOGGER.warning(f'The following categories of files were not found for {carrier_name}: {", ".join(missing)} ') + LOGGER.warning( + f'The following categories of files were not found for {carrier_name}: {", ".join(missing)} ' + ) - if 'images' in carrier: - for image_file in carrier['images']: + if "images" in carrier: + for image_file in carrier["images"]: if image_file.stat().st_size == 0: - LOGGER.warning(f'The following image file is 0-bytes: {image_file}') + LOGGER.warning(f"The following image file is 0-bytes: {image_file}") + + if "streams" in carrier: + if not len(carrier["streams"]) == 1: + LOGGER.warning( + f'Multiple folder of stream folders found for {carrier_name}. Only 1 allowed: {carrier["streams"]}' + ) return + def package_carriers(carrier_files: dict, acq_dir: Path) -> None: for carrier, files in carrier_files.items(): - base_dir = pb.create_package_dir(acq_dir, carrier) - pb.move_metadata_files(files["logs"], base_dir) - pb.move_diskimage_files(files["images"], base_dir) - pb.move_stream_files(files["streams"], base_dir) + try: + base_dir = pb.create_package_dir(acq_dir, carrier) + pb.move_metadata_files(files["logs"], base_dir) + pb.move_and_bag_diskimage_files(files["images"], base_dir) + pb.move_and_bag_stream_files(files["streams"], base_dir) + except: + LOGGER.error( + f"Packaging incomplete for {carrier}. Address warnings manually." + ) def main(): @@ -132,7 +146,9 @@ def main(): if validate_carrier_files(carrier_files): package_carriers(carrier_files, args.dest) else: - LOGGER.error("1 or more errors with files for a carrier. Please address warnings and re-run") + LOGGER.error( + "1 or more errors with files for a carrier. Please address warnings and re-run" + ) if __name__ == "__main__": diff --git a/tests/test_package_images.py b/tests/test_package_images.py index 2f518ed..85e55e5 100644 --- a/tests/test_package_images.py +++ b/tests/test_package_images.py @@ -146,7 +146,8 @@ def test_file_found(transfer_files): ) assert ( - transfer_files / "images" / "ACQ_1234_123456.img" in carrier_files[f"{acq_id}_123456"]["images"] + transfer_files / "images" / "ACQ_1234_123456.img" + in carrier_files[f"{acq_id}_123456"]["images"] ) @@ -184,42 +185,43 @@ def carrier_files(transfer_files): ) return carrier_files + def test_good_validate_carrier(carrier_files, caplog): pi.validate_carrier_files(carrier_files) assert not caplog.text -@pytest.mark.parametrize("key", ['images', 'logs', 'streams']) +@pytest.mark.parametrize("key", ["images", "logs", "streams"]) def test_warn_carrier_with_one_missing_category(carrier_files, key, caplog): - carrier_files['ACQ_1234_123456'].pop(key) + carrier_files["ACQ_1234_123456"].pop(key) pi.validate_carrier_files(carrier_files) - assert f'The following categories of files were not found for ACQ_1234_123456: {key}' in caplog.text + assert ( + f"The following categories of files were not found for ACQ_1234_123456: {key}" + in caplog.text + ) def test_warn_carrier_with_logs_no_images_or_streams(caplog): - carrier_files = { - 'ACQ_1234_123456': { - 'logs': [Path('ACQ_1234_123456.log')] - } - } + carrier_files = {"ACQ_1234_123456": {"logs": [Path("ACQ_1234_123456.log")]}} pi.validate_carrier_files(carrier_files) - assert f'The following categories of files were not found for ACQ_1234_123456: images, streams' in caplog.text + assert ( + f"The following categories of files were not found for ACQ_1234_123456: images, streams" + in caplog.text + ) def test_warn_carrier_with_streams_no_images_or_logs(caplog): - carrier_files = { - 'ACQ_1234_123456': { - 'streams': [Path('ACQ_1234_123456_streams')] - } - } + carrier_files = {"ACQ_1234_123456": {"streams": [Path("ACQ_1234_123456_streams")]}} pi.validate_carrier_files(carrier_files) - assert f'The following categories of files were not found for ACQ_1234_123456: images, logs' in caplog.text - + assert ( + f"The following categories of files were not found for ACQ_1234_123456: images, logs" + in caplog.text + ) def test_warn_and_skip_0_length_image(carrier_files, caplog): @@ -227,19 +229,37 @@ def test_warn_and_skip_0_length_image(carrier_files, caplog): carrier_files["ACQ_1234_123457"]["images"][0].touch() pi.validate_carrier_files(carrier_files) - assert f'The following image file is 0-bytes: {str(carrier_files["ACQ_1234_123457"]["images"][0])}' in caplog.text + assert ( + f'The following image file is 0-bytes: {str(carrier_files["ACQ_1234_123457"]["images"][0])}' + in caplog.text + ) def test_warn_streams_missing_a_side(): - #TODO + # TODO assert True +def test_warn_only_one_stream_folder_allowed(carrier_files, caplog): + carrier_files["ACQ_1234_123457"]["streams"].append("ACQ_1234_123457_2") + pi.validate_carrier_files(carrier_files) + + assert ( + f"Multiple folder of stream folders found for ACQ_1234_123457. Only 1 allowed" + in caplog.text + ) + + +def test_good_packaging(carrier_files, tmp_path: Path): + pi.package_carriers(carrier_files, tmp_path) + + for carrier in carrier_files: + assert carrier in [x.name for x in (tmp_path / "ACQ_1234").iterdir()] +# TODO add packaging fails -''' def test_full_run( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list ): @@ -252,4 +272,3 @@ def test_full_run( assert acq_dir.exists() assert "ACQ_1234_123456" in [x.name for x in acq_dir.iterdir()] -''' From 3afa237f7b061075a95908819d30b7c3507369fb Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Tue, 28 May 2024 14:49:45 -0400 Subject: [PATCH 12/35] formatting --- src/digarch_scripts/lint/lint_ft.py | 80 +++++++++++++++++------------ 1 file changed, 47 insertions(+), 33 deletions(-) diff --git a/src/digarch_scripts/lint/lint_ft.py b/src/digarch_scripts/lint/lint_ft.py index 258adbc..334d6c1 100644 --- a/src/digarch_scripts/lint/lint_ft.py +++ b/src/digarch_scripts/lint/lint_ft.py @@ -7,6 +7,7 @@ LOGGER = logging.getLogger(__name__) + def _configure_logging(log_folder: Path): log_fn = datetime.now().strftime("lint_%Y_%m_%d_%H_%M.log") log_fpath = log_folder / log_fn @@ -21,15 +22,14 @@ def _configure_logging(log_folder: Path): encoding="utf-8", ) + def parse_args() -> argparse.Namespace: """Validate and return command-line args""" def extant_dir(p): path = Path(p) if not path.is_dir(): - raise argparse.ArgumentTypeError( - f'{path} does not exist' - ) + raise argparse.ArgumentTypeError(f"{path} does not exist") return path def list_of_paths(p): @@ -43,28 +43,21 @@ def list_of_paths(p): parser = argparse.ArgumentParser() parser.add_argument( - '--package', - type=extant_dir, - nargs='+', - dest='packages', - action='extend' + "--package", type=extant_dir, nargs="+", dest="packages", action="extend" ) parser.add_argument( - '--directory', - type=list_of_paths, - dest='packages', - action='extend' + "--directory", type=list_of_paths, dest="packages", action="extend" ) parser.add_argument( - '--log_folder', - help='''Optional. Designate where to save the log file, - or it will be saved in current directory''', - default='.' + "--log_folder", + help="""Optional. Designate where to save the log file, + or it will be saved in current directory""", + default=".", ) - return parser.parse_args() + def package_has_valid_name(package: Path) -> bool: """Top level folder name has to conform to ACQ_####_######""" folder_name = package.name @@ -76,15 +69,17 @@ def package_has_valid_name(package: Path) -> bool: LOGGER.error(f"{folder_name} does not conform to ACQ_####_######") return False + def package_has_two_subfolders(package: Path) -> bool: """There must be two subfolders in the package""" - pkg_folders = [ x for x in package.iterdir() if x.is_dir() ] + pkg_folders = [x for x in package.iterdir() if x.is_dir()] if len(pkg_folders) == 2: return True else: LOGGER.error(f"{package} does not have exactly two subfolders") return False + def package_has_valid_subfolder_names(package: Path) -> bool: """Second level folders must be objects and metadata folder""" expected = set(["objects", "metadata"]) @@ -98,6 +93,7 @@ def package_has_valid_subfolder_names(package: Path) -> bool: ) return False + def package_has_no_hidden_file(package: Path) -> bool: """The package should not have any hidden file""" hidden_ls = [ @@ -111,10 +107,11 @@ def package_has_no_hidden_file(package: Path) -> bool: else: return True + def package_has_no_zero_bytes_file(package: Path) -> bool: """The package should not have any zero bytes file""" - all_file = [ f for f in package.rglob("*") if f.is_file() ] - zero_bytes_ls = [ f for f in all_file if f.stat().st_size == 0 ] + all_file = [f for f in package.rglob("*") if f.is_file()] + zero_bytes_ls = [f for f in all_file if f.stat().st_size == 0] if zero_bytes_ls: LOGGER.warning(f"{package.name} has zero bytes file {zero_bytes_ls}") @@ -122,6 +119,7 @@ def package_has_no_zero_bytes_file(package: Path) -> bool: else: return True + def metadata_folder_is_flat(package: Path) -> bool: """The metadata folder should not have folder structure""" metadata_path = package / "metadata" @@ -132,40 +130,49 @@ def metadata_folder_is_flat(package: Path) -> bool: else: return True + def metadata_folder_has_files(package: Path) -> bool: """The metadata folder should have one or more file""" metadata_path = package / "metadata" - md_files_ls = [ x for x in metadata_path.rglob("*") if x.is_file() ] + md_files_ls = [x for x in metadata_path.rglob("*") if x.is_file()] if md_files_ls: return True else: LOGGER.warning(f"{package.name} metadata folder does not have any files") return False + def metadata_has_correct_naming_convention(package: Path) -> bool: """The metadata file name should be in the accepted list""" metadata_path = package / "metadata" accepted_fn = ["rclone.log"] - md_files_ls = [ x for x in metadata_path.rglob("*") if x.is_file() ] + md_files_ls = [x for x in metadata_path.rglob("*") if x.is_file()] nonconforming = [] for file in md_files_ls: if not file.name in accepted_fn: nonconforming.append(file) if nonconforming: - LOGGER.error(f"""{package.name} has nonconforming metadata file(s): - {nonconforming}""") + LOGGER.error( + f"""{package.name} has nonconforming metadata file(s): + {nonconforming}""" + ) return False else: return True + def objects_folder_correct_structure(package: Path) -> bool: """objects folder should have a data folder, which includes four files: bag-info.txt, bagit.txt, manifest-md5.txt and tagmanifest-md5.txt""" expected_paths = [] - expected_files = ["bag-info.txt", "bagit.txt", - "manifest-md5.txt", "tagmanifest-md5.txt"] + expected_files = [ + "bag-info.txt", + "bagit.txt", + "manifest-md5.txt", + "tagmanifest-md5.txt", + ] missing = [] data_folder = package / "objects" / "data" @@ -180,16 +187,19 @@ def objects_folder_correct_structure(package: Path) -> bool: missing.append(fp.name) if missing: - LOGGER.error(f"""{package.name} has incorrect structure. - missing {missing}""") + LOGGER.error( + f"""{package.name} has incorrect structure. + missing {missing}""" + ) return False else: return True + def objects_folder_has_no_empty_folder(package: Path) -> bool: """The objects folder should not have any empty folders""" objects_path = package / "objects" - folder_in_obj = [ x for x in objects_path.rglob("*") if x.is_dir() ] + folder_in_obj = [x for x in objects_path.rglob("*") if x.is_dir()] empty = [] for folder in folder_in_obj: @@ -202,6 +212,7 @@ def objects_folder_has_no_empty_folder(package: Path) -> bool: else: return True + def lint_package(package: Path) -> Literal["valide", "invalide", "needs review"]: """Run all linting tests against a package""" result = "valid" @@ -209,7 +220,7 @@ def lint_package(package: Path) -> Literal["valide", "invalide", "needs review"] less_strict_tests = [ package_has_no_hidden_file, package_has_no_zero_bytes_file, - metadata_folder_has_files + metadata_folder_has_files, ] for test in less_strict_tests: @@ -223,7 +234,7 @@ def lint_package(package: Path) -> Literal["valide", "invalide", "needs review"] metadata_folder_is_flat, metadata_has_correct_naming_convention, objects_folder_correct_structure, - objects_folder_has_no_empty_folder + objects_folder_has_no_empty_folder, ] for test in strict_tests: @@ -232,6 +243,7 @@ def lint_package(package: Path) -> Literal["valide", "invalide", "needs review"] return result + def main(): args = parse_args() _configure_logging(args.log_folder) @@ -266,7 +278,9 @@ def main(): print( f""" The following {len(needs_review)} packages need review. - They may be passed without change after review: {needs_review}""") + They may be passed without change after review: {needs_review}""" + ) + if __name__ == "__main__": - main() \ No newline at end of file + main() From de7076a6592ad894e1c40da19080bd8e531805ca Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Tue, 28 May 2024 14:50:08 -0400 Subject: [PATCH 13/35] add test files for package images --- tests/fixtures/image/images/ACQ_1234_123456.img | 1 + tests/fixtures/image/images/ACQ_1234_123457.img | 1 + tests/fixtures/image/logs/ACQ_1234_123456_process1.log | 1 + tests/fixtures/image/logs/ACQ_1234_123456_process2.log | 1 + tests/fixtures/image/logs/ACQ_1234_123457_process21.log | 1 + tests/fixtures/image/streams/ACQ_1234_123456/ACQ_1234_123456.001 | 0 tests/fixtures/image/streams/ACQ_1234_123457/ACQ_1234_123457.001 | 0 7 files changed, 5 insertions(+) create mode 100644 tests/fixtures/image/images/ACQ_1234_123456.img create mode 100644 tests/fixtures/image/images/ACQ_1234_123457.img create mode 100644 tests/fixtures/image/logs/ACQ_1234_123456_process1.log create mode 100644 tests/fixtures/image/logs/ACQ_1234_123456_process2.log create mode 100644 tests/fixtures/image/logs/ACQ_1234_123457_process21.log create mode 100644 tests/fixtures/image/streams/ACQ_1234_123456/ACQ_1234_123456.001 create mode 100644 tests/fixtures/image/streams/ACQ_1234_123457/ACQ_1234_123457.001 diff --git a/tests/fixtures/image/images/ACQ_1234_123456.img b/tests/fixtures/image/images/ACQ_1234_123456.img new file mode 100644 index 0000000..2e65efe --- /dev/null +++ b/tests/fixtures/image/images/ACQ_1234_123456.img @@ -0,0 +1 @@ +a \ No newline at end of file diff --git a/tests/fixtures/image/images/ACQ_1234_123457.img b/tests/fixtures/image/images/ACQ_1234_123457.img new file mode 100644 index 0000000..63d8dbd --- /dev/null +++ b/tests/fixtures/image/images/ACQ_1234_123457.img @@ -0,0 +1 @@ +b \ No newline at end of file diff --git a/tests/fixtures/image/logs/ACQ_1234_123456_process1.log b/tests/fixtures/image/logs/ACQ_1234_123456_process1.log new file mode 100644 index 0000000..2e65efe --- /dev/null +++ b/tests/fixtures/image/logs/ACQ_1234_123456_process1.log @@ -0,0 +1 @@ +a \ No newline at end of file diff --git a/tests/fixtures/image/logs/ACQ_1234_123456_process2.log b/tests/fixtures/image/logs/ACQ_1234_123456_process2.log new file mode 100644 index 0000000..63d8dbd --- /dev/null +++ b/tests/fixtures/image/logs/ACQ_1234_123456_process2.log @@ -0,0 +1 @@ +b \ No newline at end of file diff --git a/tests/fixtures/image/logs/ACQ_1234_123457_process21.log b/tests/fixtures/image/logs/ACQ_1234_123457_process21.log new file mode 100644 index 0000000..63d8dbd --- /dev/null +++ b/tests/fixtures/image/logs/ACQ_1234_123457_process21.log @@ -0,0 +1 @@ +b \ No newline at end of file diff --git a/tests/fixtures/image/streams/ACQ_1234_123456/ACQ_1234_123456.001 b/tests/fixtures/image/streams/ACQ_1234_123456/ACQ_1234_123456.001 new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/image/streams/ACQ_1234_123457/ACQ_1234_123457.001 b/tests/fixtures/image/streams/ACQ_1234_123457/ACQ_1234_123457.001 new file mode 100644 index 0000000..e69de29 From 8bbb8f0b6961ba66b29cb5bf3fe40797b8379fcc Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Tue, 28 May 2024 18:48:48 -0400 Subject: [PATCH 14/35] clean up moving methods --- src/digarch_scripts/package/package_base.py | 16 ++++++---------- tests/test_package_base.py | 6 ++---- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 6ad9cf2..b487a9f 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -67,14 +67,14 @@ def create_package_dir(dest: Path, id: str) -> Path: def move_file(file_path: Path, pkg_dir: Path, dest: str) -> None: dest_dir = pkg_dir / dest if not dest_dir.exists(): - dest_dir.mkdir() + dest_dir.mkdir(parents=True) new_file_path = dest_dir / file_path.name if new_file_path.exists(): raise FileExistsError( f"{new_file_path} already exists in {dest} folder. Not moving." ) - + print(new_file_path) file_path.rename(new_file_path) return None @@ -98,12 +98,8 @@ def move_metadata_files(md_paths: list[Path], pkg_dir: Path) -> None: return move_files(md_paths, pkg_dir, "metadata") -def move_diskimage_files(image_paths: list[Path], pkg_dir: Path) -> None: - return move_files(image_paths, pkg_dir, "data") - - -def move_stream_files(stream_paths: Path, pkg_dir: Path) -> None: - return move_files(stream_paths, pkg_dir, "data") +def move_data_files(data_paths: list[Path], pkg_dir: Path) -> None: + return move_files(data_paths, pkg_dir, "data") def move_and_bag_diskimage_files(image_paths: list[Path], pkg_dir: Path) -> None: @@ -111,7 +107,7 @@ def move_and_bag_diskimage_files(image_paths: list[Path], pkg_dir: Path) -> None if not bag_dir.exists(): bag_dir.mkdir() create_bagit_manifest(image_paths, bag_dir) - move_diskimage_files(image_paths, bag_dir) + move_data_files(image_paths, bag_dir) create_bag_tag_files(bag_dir) return None @@ -123,7 +119,7 @@ def move_and_bag_stream_files(stream_path: list[Path], pkg_dir: Path) -> None: bag_dir.mkdir() stream_paths = list(stream_path[0].iterdir()) create_bagit_manifest(stream_paths, bag_dir) - move_stream_files(stream_paths, bag_dir) + move_data_files(stream_paths, bag_dir) create_bag_tag_files(bag_dir) return None diff --git a/tests/test_package_base.py b/tests/test_package_base.py index 3a9752c..8d24703 100644 --- a/tests/test_package_base.py +++ b/tests/test_package_base.py @@ -115,8 +115,6 @@ def package_base_dir(tmp_path: Path, id: str): MOVE_FILE = [ (pb.move_metadata_file, "metadata"), - (pb.move_diskimage_file, "images"), - (pb.move_stream_file, "streams"), ] @@ -151,8 +149,7 @@ def test_do_not_overwrite_file( MOVE_FILES = [ (pb.move_metadata_files, "metadata"), - (pb.move_diskimage_files, "images"), - (pb.move_stream_files, "streams"), + (pb.move_data_files, "data"), ] @@ -161,6 +158,7 @@ def test_move_multiple_file( package_base_dir: Path, log: Path, md5_manifest: Path, test_function, dest: str ): """Test that multiple files are moved successfully""" + parts = dest.split('/') md_files = [log, md5_manifest] test_function(md_files, package_base_dir) From f523be2849aab78bf9a630fddffd87cf3dc40455 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Tue, 28 May 2024 20:11:17 -0400 Subject: [PATCH 15/35] make bag creation more generic --- src/digarch_scripts/package/package_base.py | 79 ++++++++----------- src/digarch_scripts/package/package_images.py | 4 +- tests/test_package_base.py | 39 +-------- 3 files changed, 38 insertions(+), 84 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index b487a9f..81ea7bf 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -74,7 +74,7 @@ def move_file(file_path: Path, pkg_dir: Path, dest: str) -> None: raise FileExistsError( f"{new_file_path} already exists in {dest} folder. Not moving." ) - print(new_file_path) + file_path.rename(new_file_path) return None @@ -102,30 +102,48 @@ def move_data_files(data_paths: list[Path], pkg_dir: Path) -> None: return move_files(data_paths, pkg_dir, "data") -def move_and_bag_diskimage_files(image_paths: list[Path], pkg_dir: Path) -> None: - bag_dir = pkg_dir / "images" - if not bag_dir.exists(): - bag_dir.mkdir() - create_bagit_manifest(image_paths, bag_dir) - move_data_files(image_paths, bag_dir) +def create_bag_in_dir( + paths: list[Path], + pkg_dir: Path, + type: str, + manifest_source: Path = None, + source: str = None, +) -> None: + bag_dir = pkg_dir / type + bag_dir.mkdir() + + if len(paths) == 1 and paths[0].is_dir(): + paths = list(paths[0].iterdir()) + + if source == "rclone": + convert_rclone_md5_to_bagit_manifest(manifest_source, bag_dir) + else: + create_bagit_manifest(paths, bag_dir) + + move_data_files(paths, bag_dir) create_bag_tag_files(bag_dir) + +def create_bag_in_images(image_paths: list[Path], pkg_dir: Path) -> None: + create_bag_in_dir(image_paths, pkg_dir, "images") + return None -def move_and_bag_stream_files(stream_path: list[Path], pkg_dir: Path) -> None: - bag_dir = pkg_dir / "streams" - if not bag_dir.exists(): - bag_dir.mkdir() - stream_paths = list(stream_path[0].iterdir()) - create_bagit_manifest(stream_paths, bag_dir) - move_data_files(stream_paths, bag_dir) - create_bag_tag_files(bag_dir) +def create_bag_in_streams(stream_path: list[Path], pkg_dir: Path) -> None: + create_bag_in_dir([stream_path], pkg_dir, "streams") + + return None + + +def create_bag_in_objects(objects_path: Path, md5_path: Path, pkg_dir: Path) -> None: + create_bag_in_dir([objects_path], pkg_dir, "objects", md5_path, "rclone") return None def create_bagit_manifest(paths: list[Path], bag_dir: Path) -> None: + # paths must be files manifest_lines = [] for path in paths: md5_hash = bagit.generate_manifest_lines(str(path), ["md5"])[0][1] @@ -138,37 +156,6 @@ def create_bagit_manifest(paths: list[Path], bag_dir: Path) -> None: return None -def create_bag_in_objects(payload_path: Path, md5_path: Path, pkg_dir: Path) -> None: - bag_dir = pkg_dir / "objects" - bag_dir.mkdir() - move_payload(payload_path, bag_dir) - convert_rclone_md5_to_bagit_manifest(md5_path, bag_dir) - # generate baginfo.txt and bagit.txt (copying code snippet from bagit) - create_bag_tag_files(bag_dir) - - return None - - -def move_payload(payload_path: Path, bag_dir: Path) -> None: - # instantiate a var for objects dir - payload_dir = bag_dir / "data" - # if the object folder does not exist create it - if not payload_dir.exists(): - payload_dir.mkdir(parents=True) - else: - raise FileExistsError(f"{payload_dir} already exists. Not moving files.") - - for a_file in payload_path.iterdir(): - new_ob_path = payload_dir / a_file.name - # if a payload file is already in the object directory do not move, raise error - if new_ob_path.exists(): - raise FileExistsError(f"{new_ob_path} already exists. Not moving.") - - a_file.rename(new_ob_path) - - return None - - def convert_rclone_md5_to_bagit_manifest(md5_path: Path, bag_dir: Path) -> None: # check for manifest new_md5_path = bag_dir / "manifest-md5.txt" diff --git a/src/digarch_scripts/package/package_images.py b/src/digarch_scripts/package/package_images.py index 89582d5..f49ca0b 100644 --- a/src/digarch_scripts/package/package_images.py +++ b/src/digarch_scripts/package/package_images.py @@ -129,8 +129,8 @@ def package_carriers(carrier_files: dict, acq_dir: Path) -> None: try: base_dir = pb.create_package_dir(acq_dir, carrier) pb.move_metadata_files(files["logs"], base_dir) - pb.move_and_bag_diskimage_files(files["images"], base_dir) - pb.move_and_bag_stream_files(files["streams"], base_dir) + pb.create_bag_in_images(files["images"], base_dir) + pb.create_bag_in_streams(files["streams"], base_dir) except: LOGGER.error( f"Packaging incomplete for {carrier}. Address warnings manually." diff --git a/tests/test_package_base.py b/tests/test_package_base.py index 8d24703..2c1476c 100644 --- a/tests/test_package_base.py +++ b/tests/test_package_base.py @@ -158,7 +158,7 @@ def test_move_multiple_file( package_base_dir: Path, log: Path, md5_manifest: Path, test_function, dest: str ): """Test that multiple files are moved successfully""" - parts = dest.split('/') + parts = dest.split("/") md_files = [log, md5_manifest] test_function(md_files, package_base_dir) @@ -190,43 +190,9 @@ def test_partial_halt_multiple_files( ) -def test_move_payload(package_base_dir: Path, payload: Path): - """Test that entirety of payload is moved and hierarchy is preserved""" - - source_contents = [file.relative_to(payload) for file in payload.rglob("*")] - - data_path = package_base_dir / "objects" / "data" - pb.move_payload(payload, package_base_dir / "objects") - - # check that source is empty - assert not any(payload.iterdir()) - - assert data_path.exists() - - # compare contents of data and former source - data_contents = [file.relative_to(data_path) for file in data_path.rglob("*")] - assert source_contents == data_contents - - -def test_do_not_overwrite_payload(package_base_dir: Path, payload: Path): - """Test that no payload file is moved if /data exists""" - - source_contents = [file for file in payload.rglob("*")] - - bag_payload = package_base_dir / "objects" / "data" - bag_payload.mkdir(parents=True) - - with pytest.raises(FileExistsError) as exc: - pb.move_payload(payload, package_base_dir / "objects") - - # check source has not changed - assert source_contents == [file for file in payload.rglob("*")] - assert f"{bag_payload} already exists. Not moving files." in str(exc.value) - - @pytest.fixture def bag_payload(package_base_dir: Path, payload: Path): - pb.move_payload(payload, package_base_dir) + pb.move_data_files(list(payload.iterdir()), package_base_dir) bag_payload = package_base_dir / "data" return bag_payload @@ -244,6 +210,7 @@ def test_convert_rclone_md5(bag_payload: Path, md5_manifest: Path): payload_files = [ str(path.relative_to(bag_payload.parent)) for path in bag_payload.rglob("*") ] + for a_file in md5_paths: assert a_file in payload_files From 1e8ae61bebc3a22f8af44defcf405f9c41e2ce0d Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Wed, 29 May 2024 10:44:29 -0400 Subject: [PATCH 16/35] expand bag validation --- src/digarch_scripts/package/package_base.py | 22 ++++++++++++-- tests/test_package_base.py | 33 +++++++++++++-------- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 81ea7bf..0afa253 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -202,8 +202,8 @@ def get_oxum(payload_dir: Path) -> tuple[int, int]: return total_bytes, total_files -def validate_bag_in_payload(pkg_dir: Path) -> None: - bag_dir = pkg_dir / "objects" +def validate_bag(pkg_dir: Path, subfolder: str) -> None: + bag_dir = pkg_dir / subfolder bag = bagit.Bag(str(bag_dir)) try: bag.validate(completeness_only=True) @@ -211,3 +211,21 @@ def validate_bag_in_payload(pkg_dir: Path) -> None: except bagit.BagValidationError: LOGGER.warning(f"{bag.path} is not valid. Check the bag manifest and oxum.") return None + + +def validate_objects_bag(pkg_dir: Path) -> None: + validate_bag(pkg_dir, "objects") + + return None + + +def validate_images_bag(pkg_dir: Path) -> None: + validate_bag(pkg_dir, "images") + + return None + + +def validate_streams_bag(pkg_dir: Path) -> None: + validate_bag(pkg_dir, "streams") + + return None diff --git a/tests/test_package_base.py b/tests/test_package_base.py index 2c1476c..2a514c4 100644 --- a/tests/test_package_base.py +++ b/tests/test_package_base.py @@ -236,31 +236,40 @@ def test_generate_valid_oxum(transfer_files: Path): assert total_files == 12 -def test_validate_valid_bag(transfer_files: Path, caplog): +VALIDATE_BAGS = [ + (pb.validate_objects_bag, "objects"), + (pb.validate_images_bag, "images"), + (pb.validate_streams_bag, "streams"), +] + + +@pytest.mark.parametrize("test_function,type", VALIDATE_BAGS) +def test_validate_valid_bag(transfer_files: Path, test_function, type: str, caplog): """Test the log message""" # create tiny bag for testing - object_dir = transfer_files / "objects" - object_dir.mkdir() - (transfer_files / "rclone.md5").rename(object_dir / "rlcone.md5") - test_bag = bagit.make_bag(object_dir) + sub_dir = transfer_files / type + sub_dir.mkdir() + (transfer_files / "rclone.md5").rename(sub_dir / "rlcone.md5") + test_bag = bagit.make_bag(sub_dir) - pb.validate_bag_in_payload(transfer_files) + test_function(transfer_files) assert f"{test_bag.path} is valid." in caplog.text -def test_validate_invalid_bag(transfer_files, caplog): +@pytest.mark.parametrize("test_function,type", VALIDATE_BAGS) +def test_validate_invalid_bag(transfer_files, test_function, type: str, caplog): """Test the log message if the bag isn't valid for some reason""" - object_dir = transfer_files / "objects" - object_dir.mkdir() - (transfer_files / "rclone.md5").rename(object_dir / "rlcone.md5") + sub_dir = transfer_files / type + sub_dir.mkdir() + (transfer_files / "rclone.md5").rename(sub_dir / "rlcone.md5") - test_bag = bagit.make_bag(object_dir) + test_bag = bagit.make_bag(sub_dir) print(list(Path(test_bag.path).iterdir())) (Path(test_bag.path) / "bag-info.txt").unlink() - pb.validate_bag_in_payload(transfer_files) + test_function(transfer_files) assert ( f"{test_bag.path} is not valid. Check the bag manifest and oxum." in caplog.text From 6192a15d041d047e69c06364d5a6aa44e78e3497 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Wed, 29 May 2024 11:27:29 -0400 Subject: [PATCH 17/35] adjust argument requirement --- src/digarch_scripts/package/package_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 0afa253..798467d 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -130,7 +130,7 @@ def create_bag_in_images(image_paths: list[Path], pkg_dir: Path) -> None: return None -def create_bag_in_streams(stream_path: list[Path], pkg_dir: Path) -> None: +def create_bag_in_streams(stream_path: Path, pkg_dir: Path) -> None: create_bag_in_dir([stream_path], pkg_dir, "streams") return None From 37d30c7a33a744f05805f14703a9c8ad3155a655 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Wed, 29 May 2024 11:31:16 -0400 Subject: [PATCH 18/35] fix validation to report result --- src/digarch_scripts/package/package_images.py | 17 ++++++++++++--- tests/test_package_images.py | 21 +++++++++++-------- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/digarch_scripts/package/package_images.py b/src/digarch_scripts/package/package_images.py index f49ca0b..e1bb294 100644 --- a/src/digarch_scripts/package/package_images.py +++ b/src/digarch_scripts/package/package_images.py @@ -97,9 +97,11 @@ def find_carrier_files( return carrier_files -def validate_carrier_files(carrier_files): +def validate_carrier_files(carrier_files: dict) -> bool: + result = True for carrier_name in carrier_files: carrier = carrier_files[carrier_name] + missing = [] for key in ["images", "logs", "streams"]: if not key in carrier.keys(): @@ -109,19 +111,22 @@ def validate_carrier_files(carrier_files): LOGGER.warning( f'The following categories of files were not found for {carrier_name}: {", ".join(missing)} ' ) + result = False if "images" in carrier: for image_file in carrier["images"]: if image_file.stat().st_size == 0: LOGGER.warning(f"The following image file is 0-bytes: {image_file}") + result = False if "streams" in carrier: if not len(carrier["streams"]) == 1: LOGGER.warning( f'Multiple folder of stream folders found for {carrier_name}. Only 1 allowed: {carrier["streams"]}' ) + result = False - return + return result def package_carriers(carrier_files: dict, acq_dir: Path) -> None: @@ -130,11 +135,16 @@ def package_carriers(carrier_files: dict, acq_dir: Path) -> None: base_dir = pb.create_package_dir(acq_dir, carrier) pb.move_metadata_files(files["logs"], base_dir) pb.create_bag_in_images(files["images"], base_dir) - pb.create_bag_in_streams(files["streams"], base_dir) + pb.create_bag_in_streams(files["streams"][0], base_dir) except: LOGGER.error( f"Packaging incomplete for {carrier}. Address warnings manually." ) + finally: + pb.validate_images_bag(base_dir) + pb.validate_streams_bag(base_dir) + + return None def main(): @@ -143,6 +153,7 @@ def main(): carrier_files = find_carrier_files( args.acqid, args.images_folder, args.logs_folder, args.streams_folder ) + if validate_carrier_files(carrier_files): package_carriers(carrier_files, args.dest) else: diff --git a/tests/test_package_images.py b/tests/test_package_images.py index 85e55e5..4a48450 100644 --- a/tests/test_package_images.py +++ b/tests/test_package_images.py @@ -187,52 +187,57 @@ def carrier_files(transfer_files): def test_good_validate_carrier(carrier_files, caplog): - pi.validate_carrier_files(carrier_files) + result = pi.validate_carrier_files(carrier_files) assert not caplog.text + assert result @pytest.mark.parametrize("key", ["images", "logs", "streams"]) def test_warn_carrier_with_one_missing_category(carrier_files, key, caplog): carrier_files["ACQ_1234_123456"].pop(key) - pi.validate_carrier_files(carrier_files) + result = pi.validate_carrier_files(carrier_files) assert ( f"The following categories of files were not found for ACQ_1234_123456: {key}" in caplog.text ) + assert not result def test_warn_carrier_with_logs_no_images_or_streams(caplog): carrier_files = {"ACQ_1234_123456": {"logs": [Path("ACQ_1234_123456.log")]}} - pi.validate_carrier_files(carrier_files) + result = pi.validate_carrier_files(carrier_files) assert ( f"The following categories of files were not found for ACQ_1234_123456: images, streams" in caplog.text ) + assert not result def test_warn_carrier_with_streams_no_images_or_logs(caplog): carrier_files = {"ACQ_1234_123456": {"streams": [Path("ACQ_1234_123456_streams")]}} - pi.validate_carrier_files(carrier_files) + result = pi.validate_carrier_files(carrier_files) assert ( f"The following categories of files were not found for ACQ_1234_123456: images, logs" in caplog.text ) + assert not result def test_warn_and_skip_0_length_image(carrier_files, caplog): carrier_files["ACQ_1234_123457"]["images"][0].unlink() carrier_files["ACQ_1234_123457"]["images"][0].touch() - pi.validate_carrier_files(carrier_files) + result = pi.validate_carrier_files(carrier_files) assert ( f'The following image file is 0-bytes: {str(carrier_files["ACQ_1234_123457"]["images"][0])}' in caplog.text ) + assert not result def test_warn_streams_missing_a_side(): @@ -242,12 +247,13 @@ def test_warn_streams_missing_a_side(): def test_warn_only_one_stream_folder_allowed(carrier_files, caplog): carrier_files["ACQ_1234_123457"]["streams"].append("ACQ_1234_123457_2") - pi.validate_carrier_files(carrier_files) + result = pi.validate_carrier_files(carrier_files) assert ( f"Multiple folder of stream folders found for ACQ_1234_123457. Only 1 allowed" in caplog.text ) + assert not result def test_good_packaging(carrier_files, tmp_path: Path): @@ -257,9 +263,6 @@ def test_good_packaging(carrier_files, tmp_path: Path): assert carrier in [x.name for x in (tmp_path / "ACQ_1234").iterdir()] -# TODO add packaging fails - - def test_full_run( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list ): From 3378972d5e7ed803d3b55629466229ce728a6639 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Wed, 29 May 2024 11:55:40 -0400 Subject: [PATCH 19/35] fix bug from refactor --- src/digarch_scripts/package/package_cloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/digarch_scripts/package/package_cloud.py b/src/digarch_scripts/package/package_cloud.py index 606ecdd..793414b 100644 --- a/src/digarch_scripts/package/package_cloud.py +++ b/src/digarch_scripts/package/package_cloud.py @@ -44,7 +44,7 @@ def main(): base_dir = pb.create_package_dir(args.dest, args.id) pb.move_metadata_file(args.log, base_dir) pb.create_bag_in_objects(args.payload, args.md5, base_dir) - pb.validate_bag_in_payload(base_dir) + pb.validate_objects_bag(base_dir) if __name__ == "__main__": From b4a9e8d796653bcfc02e98e506a29e7c3285f328 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Wed, 29 May 2024 14:09:28 -0400 Subject: [PATCH 20/35] move more functions into base --- src/digarch_scripts/package/package_base.py | 108 +++++++++++++++--- src/digarch_scripts/package/package_cloud.py | 28 ++--- src/digarch_scripts/package/package_images.py | 84 +++----------- tests/test_package_base.py | 43 +++++++ tests/test_package_cloud.py | 2 +- tests/test_package_images.py | 56 ++------- 6 files changed, 169 insertions(+), 152 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 798467d..93c058e 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -11,29 +11,107 @@ LOGGER.setLevel(logging.INFO) -def parse_args() -> argparse.Namespace: - def extant_path(p: str) -> Path: +class TransferParser(argparse.ArgumentParser): + def extant_path(self, p: str) -> Path: path = Path(p) if not path.exists(): raise argparse.ArgumentTypeError(f"{path} does not exist") return path - def digital_carrier_label(id: str) -> Path: - pattern = r"ACQ_\d{4}_\d{6}" - if not re.match(r"ACQ_\d{4}_\d{6}", id): - raise argparse.ArgumentTypeError( - f"{id} does not match the expected {type} pattern, {pattern}" - ) + def acq_id(self, id: str) -> Path: + pattern = r"ACQ_\d{4}" + old_pattern = r"M\d{4-6}" + if not re.match(pattern, id): + if not re.match(old_pattern, id): + raise argparse.ArgumentTypeError( + f"{id} does not match the expected {type} pattern, {pattern}" + ) return id - parser = argparse.ArgumentParser(description="test") - parser.add_argument("--payload", required=True, type=extant_path) - parser.add_argument("--log", required=True, type=extant_path) - parser.add_argument("--md5", required=True, type=extant_path) - parser.add_argument("--dest", required=True, type=extant_path) - parser.add_argument("--id", required=True, type=digital_carrier_label) + def add_acqid(self) -> None: + self.add_argument( + "--acqid", "--id", required=True, type=self.acq_id, help="ACQ_####" + ) + + def add_payload(self) -> None: + self.add_argument( + "--payload", + required=True, + type=self.extant_path, + help="Path to files transferred from single carrier", + ) + + def add_objects_folder(self) -> None: + self.add_argument( + "--objects-folder", + required=True, + type=self.extant_path, + help="Path to working folder with file transfers from all transfers", + ) + + def add_md5(self) -> None: + self.add_argument( + "--md5", + required=True, + type=self.extant_path, + help="Path to a log with md5 checksums, e.g. rclone or rsync log", + ) + + def add_images_folder(self) -> None: + self.add_argument( + "--images_folder", + required=True, + type=self.extant_path, + help="Path to working images folder", + ) + + def add_log(self) -> None: + self.add_argument( + "--log", + required=True, + type=self.extant_path, + help="Path to a log file from the transfer process", + ) + + def add_logs_folder(self) -> None: + self.add_argument( + "--logs_folder", + required=False, + type=self.extant_path, + help="Path to working folder with logs from all transfers", + ) + + def add_streams_folder(self) -> None: + self.add_argument( + "--streams_folder", + required=False, + type=self.extant_path, + help="Path to working folder with streams from all transfers", + ) + + def add_dest(self) -> None: + self.add_argument("--dest", required=True, type=self.extant_path) + + +def find_category_of_carrier_files( + carrier_files: dict, acq_id: str, source_dir: Path, exts: list, category: str +) -> dict: + for path in source_dir.iterdir(): + if not path.suffix in exts: + continue + carrier_id_match = re.search(rf"{acq_id}_\d\d\d\d\d\d+", path.name) + if not carrier_id_match: + continue + carrier_id = carrier_id_match.group(0) + + if not carrier_id in carrier_files: + carrier_files[carrier_id] = {category: []} + elif not category in carrier_files[carrier_id]: + carrier_files[carrier_id][category] = [] + + carrier_files[carrier_id][category].append(path) - return parser.parse_args() + return carrier_files def create_acq_dir(dest: Path, acq_id: str) -> Path: diff --git a/src/digarch_scripts/package/package_cloud.py b/src/digarch_scripts/package/package_cloud.py index 793414b..ebd8195 100644 --- a/src/digarch_scripts/package/package_cloud.py +++ b/src/digarch_scripts/package/package_cloud.py @@ -14,26 +14,12 @@ def parse_args() -> argparse.Namespace: - def extant_path(p: str) -> Path: - path = Path(p) - if not path.exists(): - raise argparse.ArgumentTypeError(f"{path} does not exist") - return path - - def digital_carrier_label(id: str) -> Path: - pattern = r"ACQ_\d{4}_\d{6}" - if not re.match(r"ACQ_\d{4}_\d{6}", id): - raise argparse.ArgumentTypeError( - f"{id} does not match the expected {type} pattern, {pattern}" - ) - return id - - parser = argparse.ArgumentParser(description="test") - parser.add_argument("--payload", required=True, type=extant_path) - parser.add_argument("--log", required=True, type=extant_path) - parser.add_argument("--md5", required=True, type=extant_path) - parser.add_argument("--dest", required=True, type=extant_path) - parser.add_argument("--id", required=True, type=digital_carrier_label) + parser = pb.TransferParser(description="Create package for single cloud-based file-transfer.") + parser.add_acqid() + parser.add_payload() + parser.add_log() + parser.add_md5() + parser.add_dest() return parser.parse_args() @@ -41,7 +27,7 @@ def digital_carrier_label(id: str) -> Path: def main(): args = parse_args() - base_dir = pb.create_package_dir(args.dest, args.id) + base_dir = pb.create_package_dir(args.dest, args.acqid) pb.move_metadata_file(args.log, base_dir) pb.create_bag_in_objects(args.payload, args.md5, base_dir) pb.validate_objects_bag(base_dir) diff --git a/src/digarch_scripts/package/package_images.py b/src/digarch_scripts/package/package_images.py index e1bb294..1a06598 100644 --- a/src/digarch_scripts/package/package_images.py +++ b/src/digarch_scripts/package/package_images.py @@ -14,80 +14,26 @@ def parse_args() -> argparse.Namespace: - def extant_path(p: str) -> Path: - path = Path(p) - if not path.exists(): - raise argparse.ArgumentTypeError(f"{path} does not exist") - return path - - def acq_id(id: str) -> Path: - pattern = r"ACQ_\d{4}" - old_pattern = r"M\d{4-6}" - if not re.match(pattern, id): - if not re.match(old_pattern, id): - raise argparse.ArgumentTypeError( - f"{id} does not match the expected {type} pattern, {pattern}" - ) - return id - - parser = argparse.ArgumentParser(description="test") - parser.add_argument( - "--images_folder", - required=True, - type=extant_path, - help="Path to working images folder", - ) - parser.add_argument( - "--dest", required=True, type=extant_path, help="Path to packaged images folder" - ) - parser.add_argument("--acqid", required=True, type=acq_id, help="ACQ_####") - parser.add_argument( - "--logs_folder", - required=False, - type=extant_path, - help="Path to working logs folder", - ) - parser.add_argument( - "--streams_folder", - required=False, - type=extant_path, - help="Path to working streams folder", - ) + parser = pb.TransferParser(description="Create packages for all disk imaging files for a single acquisition.") + parser.add_acqid() + parser.add_images_folder() + parser.add_logs_folder() + parser.add_streams_folder() + parser.add_dest() return parser.parse_args() -def find_category_of_carrier_files( - carrier_files: dict, acq_id: str, source_dir: Path, exts: list, category: str -) -> dict: - for file in source_dir.iterdir(): - if not file.suffix in exts: - continue - carrier_id_match = re.search(rf"{acq_id}_\d\d\d\d\d\d+", file.name) - if not carrier_id_match: - continue - carrier_id = carrier_id_match.group(0) - - if not carrier_id in carrier_files: - carrier_files[carrier_id] = {category: []} - elif not category in carrier_files[carrier_id]: - carrier_files[carrier_id][category] = [] - - carrier_files[carrier_id][category].append(file) - - return carrier_files - - -def find_carrier_files( +def find_carriers_image_files( acq_id: str, images_dir: Path, log_dir: Path, stream_dir: Path ) -> dict: - carrier_files = find_category_of_carrier_files( + carrier_files = pb.find_category_of_carrier_files( {}, acq_id, images_dir, IMG_EXTS, "images" ) - carrier_files = find_category_of_carrier_files( + carrier_files = pb.find_category_of_carrier_files( carrier_files, acq_id, log_dir, LOG_EXTS, "logs" ) - carrier_files = find_category_of_carrier_files( + carrier_files = pb.find_category_of_carrier_files( carrier_files, acq_id, stream_dir, STREAM_EXTS, "streams" ) @@ -97,7 +43,7 @@ def find_carrier_files( return carrier_files -def validate_carrier_files(carrier_files: dict) -> bool: +def validate_carriers_image_files(carrier_files: dict) -> bool: result = True for carrier_name in carrier_files: carrier = carrier_files[carrier_name] @@ -129,7 +75,7 @@ def validate_carrier_files(carrier_files: dict) -> bool: return result -def package_carriers(carrier_files: dict, acq_dir: Path) -> None: +def package_carriers_image_files(carrier_files: dict, acq_dir: Path) -> None: for carrier, files in carrier_files.items(): try: base_dir = pb.create_package_dir(acq_dir, carrier) @@ -150,12 +96,12 @@ def package_carriers(carrier_files: dict, acq_dir: Path) -> None: def main(): args = parse_args() - carrier_files = find_carrier_files( + carrier_files = find_carriers_image_files( args.acqid, args.images_folder, args.logs_folder, args.streams_folder ) - if validate_carrier_files(carrier_files): - package_carriers(carrier_files, args.dest) + if validate_carriers_image_files(carrier_files): + package_carriers_image_files(carrier_files, args.dest) else: LOGGER.error( "1 or more errors with files for a carrier. Please address warnings and re-run" diff --git a/tests/test_package_base.py b/tests/test_package_base.py index 2a514c4..3deb4fb 100644 --- a/tests/test_package_base.py +++ b/tests/test_package_base.py @@ -15,6 +15,13 @@ def transfer_files(tmp_path: Path, request): return tmp_path +@pytest.fixture +def image_files(tmp_path: Path, request): + fixture_data = Path(request.module.__file__).parent / "fixtures" / "image" + shutil.copytree(fixture_data, tmp_path, dirs_exist_ok=True) + return tmp_path + + @pytest.fixture def payload(transfer_files): return transfer_files / "rclone_files" @@ -50,6 +57,42 @@ def args(transfer_files): ] +def test_file_found(image_files): + acq_id = "ACQ_1234" + + carrier_files = {} + carrier_files = pb.find_category_of_carrier_files( + carrier_files, acq_id, image_files / "images", [".img"], "images" + ) + + assert ( + image_files / "images" / "ACQ_1234_123456.img" + in carrier_files[f"{acq_id}_123456"]["images"] + ) + + +def test_ignore_unknown_extension_for_category(image_files): + acq_id = "ACQ_1234" + + carrier_files = {} + carrier_files = pb.find_category_of_carrier_files( + carrier_files, acq_id, image_files / "images", [".001"], "images" + ) + + assert f"{acq_id}_123456" not in carrier_files + + +def test_multiple_files_found(image_files): + acq_id = "ACQ_1234" + + carrier_files = {} + carrier_files = pb.find_category_of_carrier_files( + carrier_files, acq_id, image_files / "logs", [".log"], "logs" + ) + + assert len(carrier_files[f"{acq_id}_123456"]["logs"]) == 2 + + @pytest.mark.parametrize("tested_function,id", CREATE_DIR) def test_create_dir_exc_on_readonly(tmp_path: Path, id: str, tested_function): """Test that package folder maker reports permission error""" diff --git a/tests/test_package_cloud.py b/tests/test_package_cloud.py index d7d5702..22eccd8 100644 --- a/tests/test_package_cloud.py +++ b/tests/test_package_cloud.py @@ -28,7 +28,7 @@ def args(transfer_files): str(transfer_files / "rclone.log"), "--dest", str(transfer_files), - "--id", + "--acqid", "ACQ_1234_123456", ] return args diff --git a/tests/test_package_images.py b/tests/test_package_images.py index 4a48450..179eb51 100644 --- a/tests/test_package_images.py +++ b/tests/test_package_images.py @@ -105,7 +105,7 @@ def test_id_arg_must_match_pattern( def test_carrier_files_found(transfer_files): acq_id = "ACQ_1234" - carrier_files = pi.find_carrier_files( + carrier_files = pi.find_carriers_image_files( acq_id, transfer_files / "images", transfer_files / "logs", @@ -125,7 +125,7 @@ def test_acqid_not_found(transfer_files): acq_id = "ACQ_1111" with pytest.raises(Warning) as exc: - pi.find_carrier_files( + pi.find_carriers_image_files( acq_id, transfer_files / "images", transfer_files / "logs", @@ -137,47 +137,11 @@ def test_acqid_not_found(transfer_files): ) -def test_file_found(transfer_files): - acq_id = "ACQ_1234" - - carrier_files = {} - carrier_files = pi.find_category_of_carrier_files( - carrier_files, acq_id, transfer_files / "images", [".img"], "images" - ) - - assert ( - transfer_files / "images" / "ACQ_1234_123456.img" - in carrier_files[f"{acq_id}_123456"]["images"] - ) - - -def test_ignore_unknown_extension_for_category(transfer_files): - acq_id = "ACQ_1234" - - carrier_files = {} - carrier_files = pi.find_category_of_carrier_files( - carrier_files, acq_id, transfer_files / "images", [".001"], "images" - ) - - assert f"{acq_id}_123456" not in carrier_files - - -def test_multiple_files_found(transfer_files): - acq_id = "ACQ_1234" - - carrier_files = {} - carrier_files = pi.find_category_of_carrier_files( - carrier_files, acq_id, transfer_files / "logs", [".log"], "logs" - ) - - assert len(carrier_files[f"{acq_id}_123456"]["logs"]) == 2 - - @pytest.fixture def carrier_files(transfer_files): acq_id = "ACQ_1234" - carrier_files = pi.find_carrier_files( + carrier_files = pi.find_carriers_image_files( acq_id, transfer_files / "images", transfer_files / "logs", @@ -187,7 +151,7 @@ def carrier_files(transfer_files): def test_good_validate_carrier(carrier_files, caplog): - result = pi.validate_carrier_files(carrier_files) + result = pi.validate_carriers_image_files(carrier_files) assert not caplog.text assert result @@ -197,7 +161,7 @@ def test_good_validate_carrier(carrier_files, caplog): def test_warn_carrier_with_one_missing_category(carrier_files, key, caplog): carrier_files["ACQ_1234_123456"].pop(key) - result = pi.validate_carrier_files(carrier_files) + result = pi.validate_carriers_image_files(carrier_files) assert ( f"The following categories of files were not found for ACQ_1234_123456: {key}" @@ -208,7 +172,7 @@ def test_warn_carrier_with_one_missing_category(carrier_files, key, caplog): def test_warn_carrier_with_logs_no_images_or_streams(caplog): carrier_files = {"ACQ_1234_123456": {"logs": [Path("ACQ_1234_123456.log")]}} - result = pi.validate_carrier_files(carrier_files) + result = pi.validate_carriers_image_files(carrier_files) assert ( f"The following categories of files were not found for ACQ_1234_123456: images, streams" @@ -219,7 +183,7 @@ def test_warn_carrier_with_logs_no_images_or_streams(caplog): def test_warn_carrier_with_streams_no_images_or_logs(caplog): carrier_files = {"ACQ_1234_123456": {"streams": [Path("ACQ_1234_123456_streams")]}} - result = pi.validate_carrier_files(carrier_files) + result = pi.validate_carriers_image_files(carrier_files) assert ( f"The following categories of files were not found for ACQ_1234_123456: images, logs" @@ -231,7 +195,7 @@ def test_warn_carrier_with_streams_no_images_or_logs(caplog): def test_warn_and_skip_0_length_image(carrier_files, caplog): carrier_files["ACQ_1234_123457"]["images"][0].unlink() carrier_files["ACQ_1234_123457"]["images"][0].touch() - result = pi.validate_carrier_files(carrier_files) + result = pi.validate_carriers_image_files(carrier_files) assert ( f'The following image file is 0-bytes: {str(carrier_files["ACQ_1234_123457"]["images"][0])}' @@ -247,7 +211,7 @@ def test_warn_streams_missing_a_side(): def test_warn_only_one_stream_folder_allowed(carrier_files, caplog): carrier_files["ACQ_1234_123457"]["streams"].append("ACQ_1234_123457_2") - result = pi.validate_carrier_files(carrier_files) + result = pi.validate_carriers_image_files(carrier_files) assert ( f"Multiple folder of stream folders found for ACQ_1234_123457. Only 1 allowed" @@ -257,7 +221,7 @@ def test_warn_only_one_stream_folder_allowed(carrier_files, caplog): def test_good_packaging(carrier_files, tmp_path: Path): - pi.package_carriers(carrier_files, tmp_path) + pi.package_carriers_image_files(carrier_files, tmp_path) for carrier in carrier_files: assert carrier in [x.name for x in (tmp_path / "ACQ_1234").iterdir()] From 4dc3ec589da15fa7670772945ed7616582c1cfe4 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Wed, 29 May 2024 14:48:22 -0400 Subject: [PATCH 21/35] clean up --- src/digarch_scripts/package/package_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 93c058e..7983c66 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -214,8 +214,8 @@ def create_bag_in_streams(stream_path: Path, pkg_dir: Path) -> None: return None -def create_bag_in_objects(objects_path: Path, md5_path: Path, pkg_dir: Path) -> None: - create_bag_in_dir([objects_path], pkg_dir, "objects", md5_path, "rclone") +def create_bag_in_objects(objects_path: Path, pkg_dir: Path, manifest_source: Path = None, manifest_type: str = None) -> None: + create_bag_in_dir([objects_path], pkg_dir, "objects", manifest_source, manifest_type) return None From 76ead5edefd2fc10de06358f29956a6aa5f0ee1e Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Wed, 29 May 2024 15:02:23 -0400 Subject: [PATCH 22/35] fix acqid carrierid confusion --- src/digarch_scripts/package/package_base.py | 26 +++++++++++++++++-- src/digarch_scripts/package/package_cloud.py | 10 ++++--- src/digarch_scripts/package/package_images.py | 4 ++- tests/test_package_cloud.py | 2 +- 4 files changed, 34 insertions(+), 8 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 7983c66..ef882f8 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -28,11 +28,26 @@ def acq_id(self, id: str) -> Path: ) return id + def carrier_id(self, id: str) -> Path: + pattern = r"ACQ_\d{4}_\d{6,7}" + old_pattern = r"M\d{4-6}_\d{6,7}" + if not re.match(pattern, id): + if not re.match(old_pattern, id): + raise argparse.ArgumentTypeError( + f"{id} does not match the expected {type} pattern, {pattern}" + ) + return id + def add_acqid(self) -> None: self.add_argument( "--acqid", "--id", required=True, type=self.acq_id, help="ACQ_####" ) + def add_carrierid(self) -> None: + self.add_argument( + "--carrierid", required=True, type=self.carrier_id, help="ACQ_####_#######" + ) + def add_payload(self) -> None: self.add_argument( "--payload", @@ -214,8 +229,15 @@ def create_bag_in_streams(stream_path: Path, pkg_dir: Path) -> None: return None -def create_bag_in_objects(objects_path: Path, pkg_dir: Path, manifest_source: Path = None, manifest_type: str = None) -> None: - create_bag_in_dir([objects_path], pkg_dir, "objects", manifest_source, manifest_type) +def create_bag_in_objects( + objects_path: Path, + pkg_dir: Path, + manifest_source: Path = None, + manifest_type: str = None, +) -> None: + create_bag_in_dir( + [objects_path], pkg_dir, "objects", manifest_source, manifest_type + ) return None diff --git a/src/digarch_scripts/package/package_cloud.py b/src/digarch_scripts/package/package_cloud.py index ebd8195..e930370 100644 --- a/src/digarch_scripts/package/package_cloud.py +++ b/src/digarch_scripts/package/package_cloud.py @@ -14,8 +14,10 @@ def parse_args() -> argparse.Namespace: - parser = pb.TransferParser(description="Create package for single cloud-based file-transfer.") - parser.add_acqid() + parser = pb.TransferParser( + description="Create package for single cloud-based file-transfer." + ) + parser.add_carrierid() parser.add_payload() parser.add_log() parser.add_md5() @@ -27,9 +29,9 @@ def parse_args() -> argparse.Namespace: def main(): args = parse_args() - base_dir = pb.create_package_dir(args.dest, args.acqid) + base_dir = pb.create_package_dir(args.dest, args.carrierid) pb.move_metadata_file(args.log, base_dir) - pb.create_bag_in_objects(args.payload, args.md5, base_dir) + pb.create_bag_in_objects(args.payload, base_dir, args.md5, "rclone") pb.validate_objects_bag(base_dir) diff --git a/src/digarch_scripts/package/package_images.py b/src/digarch_scripts/package/package_images.py index 1a06598..8e579c1 100644 --- a/src/digarch_scripts/package/package_images.py +++ b/src/digarch_scripts/package/package_images.py @@ -14,7 +14,9 @@ def parse_args() -> argparse.Namespace: - parser = pb.TransferParser(description="Create packages for all disk imaging files for a single acquisition.") + parser = pb.TransferParser( + description="Create packages for all disk imaging files for a single acquisition." + ) parser.add_acqid() parser.add_images_folder() parser.add_logs_folder() diff --git a/tests/test_package_cloud.py b/tests/test_package_cloud.py index 22eccd8..0a12184 100644 --- a/tests/test_package_cloud.py +++ b/tests/test_package_cloud.py @@ -28,7 +28,7 @@ def args(transfer_files): str(transfer_files / "rclone.log"), "--dest", str(transfer_files), - "--acqid", + "--carrierid", "ACQ_1234_123456", ] return args From ea29daa7151c7a25d3688906f6d041dbbad805fd Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Wed, 29 May 2024 15:30:00 -0400 Subject: [PATCH 23/35] first running package ft --- rsync.log | 3 + rsync_files/file.01 | Bin 0 -> 3072 bytes src/digarch_scripts/package/package_base.py | 26 +++++ .../package/package_filetransfer.py | 34 ++++++ tests/fixtures/rsync/rsync.log | 4 + tests/fixtures/rsync/rsync_files/file.01 | Bin 0 -> 3072 bytes tests/fixtures/rsync/rsync_files/file.02 | Bin 0 -> 7168 bytes tests/test_package_filetransfer.py | 99 ++++++++++++++++++ 8 files changed, 166 insertions(+) create mode 100644 rsync.log create mode 100644 rsync_files/file.01 create mode 100644 src/digarch_scripts/package/package_filetransfer.py create mode 100644 tests/fixtures/rsync/rsync.log create mode 100644 tests/fixtures/rsync/rsync_files/file.01 create mode 100644 tests/fixtures/rsync/rsync_files/file.02 create mode 100644 tests/test_package_filetransfer.py diff --git a/rsync.log b/rsync.log new file mode 100644 index 0000000..1a4094f --- /dev/null +++ b/rsync.log @@ -0,0 +1,3 @@ +2024/05/29 15:07:45 [46235] building file list +2024/05/29 15:07:45 [46235] , tests/fixtures/rsync/rsync_files/file.01, 3072, f075a8d6d4df7509d39a3140bbae9fcd +2024/05/29 15:07:45 [46235] sent 3206 bytes received 41 bytes total size 3072 diff --git a/rsync_files/file.01 b/rsync_files/file.01 new file mode 100644 index 0000000000000000000000000000000000000000..b70f4554297d86f9aade2f45b7f769d88c70609e GIT binary patch literal 3072 zcmV+b4FB_;_q;LM+U)%{VT_Y}?hw~j|E2|j8?Rg$m(-68Z+pczw5*k<=ULAz6i4USeym- zEOHcX|2>$~X5?)Zxif6WW3djC@mJ6}E`WCdds2Z-KO_F}-Ih&AJ%U$|4dW&GLZ>`Z zW0-nARhFj5-EbF95c%{IJ2>uV^7>aE9fZO%DB~bUk{P-!537TID`x()UsmnBq5Bb1 zr{#&Qh*!YeqenNWj?gaV>W}&7Ha)A0wXe5$dB9^q$Ncu3+7;w0LwdO&zFCneaB>^p zXqDKGf0|cPfx$i+IlJjmTaG}bSdkqKCv*UxZCLIc$)gjVpWvyP#WN+&c}v->ZKY%N zb8I%Ajm=1y+wBX5ffTYVsJ|d_sxDQ-t#}mRK@r)J5rA}Xl>+7BzOHA)RX{kngnrI3 ztr^5G7lA^~BLVqtUS(a%*UjHssN?lVs!Az#WM%%5{h(f7Q^`4qA_js9S1J6~;OlDB zzMkWQgdRB{eLOlyjYdzkfl<8-D(}{zkLWzwy8-IN-AEzkQZ7sxFxDew7a*Ceu!l)O z<92MkxCN4D2_V}26@HSFgq^e*@~&ow;7Bb-vx#Pf!HTQK4(tLD2T^hi2f z#eJEG=q=&;o8=-IkzTtg;nCH2CvomCgpwr=Kz@@nXC1XutAEsRNrHKNlc-2t#i7>u z&F8#u{})zn1HoQq0G#PYB9u!o4m06YA|M}iczt#Pma`wSH=uOc^GnU9mCmDz!WX4} zCV{xk^awGOk5^#E-h2;%4$s+N^;puiw3oAik5pQQZW8>3&^1!#90h!?SM}}RIvzK5 z#cEq&qnF)luBW16XyOY{pq-g1PqoHoL;TT2#Ni<6(zq3!^0-{_(@SbvZ@?wl2`dB2cDD?N9^ayr4LDC;T8?!y?lQjEs(+;+rD0ULZnmSKro1`%;s`h z8fCA8V`i_)?lT`;Bc+fv$B6a>~oop zDY|tgNCH5`b&BsSgBsoRAuz_P?$Pjem?Y4f76Suqt_UNUx^|93TmnzCE|I6|Z7-)4 zw=pCNW@Tp1-si#^-bxaF6lVyi2YG8P`9B0fMp(jSkg8L98C2I9ilMf4I{G(w*H7zc zlKcKfV2DbL1K@%6r#UHMEtfx8YqPT$?(V7e26a9|S=3{ru%Q`+_zPx%Aaw{E9tDk>6(r+Cf;pnL820B<1|t z#POj$9~rtDC|<^BY`x;{97ebL5g?lbX;GWbW5kYqhgq2x&t0=86g-Jaau4Wh(uextijK)8c&5rm<4}CyZn6cqAdO0bTCagS#rjsy3MxmDsQqw z#kTl;gxm!obD;~TlYA+LO3^&*Y4XviK-#YzWu!P87X%vaPMFR)hAPmcR4Rb8QI(cG^>JO6u`p-CNppYQ@tY>ga)}^k{&#vn`pQ z5b@DNCZiMI6c_q$Z+1 zI4L5dS}M1AcbBWv4A}WyA|uSgwd|Tp`CF;4;Bn%(ddp-H9Pb0If&W}dZs8Ydu_Yo( zO+7+MIr4c|AiCAa-l{$ciJuTWw;cWo44*z+aH$pX73;M&~b z3JU7(4O38Ca9OECv1Zo5Q}Ufz5%!QZ8BYT1EGP$mS;L7W)Etw*nK_~-C~9kA5~O6n z-y-n49B&of46CQMNH=>p>oI-P3!od zIxCX0c}FrJe#9R!OOGca*+eA>r^uH~1h3?Z&4vUW%2Lgpwjg%BaF7E@!qiOHq=8T}Fs#rScCbnIr)m(Vv ztkJP}TzfUeJ%;b`w`YtcZ!6qSK_!$`@Pza~C=z zGY^i$8>-E5q~pSglV;2jk*kAZnHIiWow603pNL{(!N?2{9f`?;F@(M_B=VAR?TR>> zytFA|hLWaF`o>4pcp`F%gEZI}orU~@$NVs9KMrl1WVmHeDk%L=kAiL@4GbNR?I5s5 z%62brZBph+nfc0FQY_6QIeOcSgmYNmBp)N;5&8xQ^%didqZb4|dfH%tW9z2|w=2E< zW2Wtova(cUv+Wm^mOs>uPfUG6vIFrst5bE6oHMB`>yxO#v%9y(80#JAlL|T0j6H&A zOjNsude&R{W`b6@Ojd-WyZ5!W!pjnQZGQop(^!1_DvlTV%3&fnuni3uApHt)X4y{4 zfnoP9`jyqS{veVz2do@fCt$BzmKUkFPK{bZ!!my zsPm_2t4Iy_sk2EklPROukx=AXb{0mJ`uWiClK$+lOG7O*JDfL16JMGT#r$tq$v##j z{!}Tp&kyJ2tt@r2a6~xSb2W{Xb&yD2d#mQEkYbjk1UB50Yc3CFQSe=or5w(2kAby2 z!u6W;P8Lwl9s3sHE5eC0C5E4YbvIJ03glv(OL=HkGs!K3CR$~bc~k~97Gc5iUAgO! zEvUl2iQR8P(ul9@#PC)90I(E26D<49-;ir-Y(`1xr-LqY<^7XUM;5BzIh(DD?e>9g zBY{iH;Q0{dhXHxq&z8TFXuMgOy)?A{qB~x7&rJ7RFL6~nJ2MV;Er%msrB8wA`EQzD OhBXE-vyTa(^?l@ZMFAoJ literal 0 HcmV?d00001 diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index ef882f8..258db43 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -210,6 +210,8 @@ def create_bag_in_dir( if source == "rclone": convert_rclone_md5_to_bagit_manifest(manifest_source, bag_dir) + elif source == "rsync": + convert_rsync_log_to_bagit_manifest(manifest_source, bag_dir) else: create_bagit_manifest(paths, bag_dir) @@ -275,6 +277,30 @@ def convert_rclone_md5_to_bagit_manifest(md5_path: Path, bag_dir: Path) -> None: return None +def convert_rsync_log_to_bagit_manifest(md5_path: Path, bag_dir: Path) -> None: + # check for manifest + new_md5_path = bag_dir / "manifest-md5.txt" + if new_md5_path.exists(): + raise FileExistsError("manifest-md5.txt already exists, review package") + + with open(md5_path, "r") as f: + log_data = f.readlines() + + manifest_data = [] + prefix = os.path.commonprefix([os.path.dirname(line.split(',')[1]) for line in log_data if len(line.split(',')) > 1]) + print(prefix) + for line in log_data: + parts = line.split(',') + if len(parts) == 4: + poss_rel_path = parts[1].replace(prefix, 'data') + manifest_data.append(f"{parts[3].strip()} {poss_rel_path}\n") + # re-writes the manifest lines + with open(new_md5_path, "w") as f: + f.writelines(manifest_data) + + return None + + def create_bag_tag_files(bag_dir: Path) -> None: txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n""" with open(bag_dir / "bagit.txt", "w") as bagit_file: diff --git a/src/digarch_scripts/package/package_filetransfer.py b/src/digarch_scripts/package/package_filetransfer.py new file mode 100644 index 0000000..b7e7bf0 --- /dev/null +++ b/src/digarch_scripts/package/package_filetransfer.py @@ -0,0 +1,34 @@ +import argparse +import logging +import re +from pathlib import Path + +import digarch_scripts.package.package_base as pb + +LOGGER = logging.getLogger(__name__) +LOGGER.setLevel(logging.INFO) + + +def parse_args() -> argparse.Namespace: + parser = pb.TransferParser( + description="Create packages for all file transfer files for a single acquisition." + ) + parser.add_carrierid() + parser.add_payload() + parser.add_log() + parser.add_dest() + + return parser.parse_args() + + +def main(): + args = parse_args() + + base_dir = pb.create_package_dir(args.dest, args.carrierid) + pb.create_bag_in_objects(args.payload, base_dir, args.log, "rsync") + pb.validate_objects_bag(base_dir) + pb.move_metadata_file(args.log, base_dir) + + +if __name__ == "__main__": + main() diff --git a/tests/fixtures/rsync/rsync.log b/tests/fixtures/rsync/rsync.log new file mode 100644 index 0000000..cd0f2c5 --- /dev/null +++ b/tests/fixtures/rsync/rsync.log @@ -0,0 +1,4 @@ +2024/05/29 15:07:45 [46235] building file list +2024/05/29 15:07:45 [46235] , tests/fixtures/rsync/rsync_files/file.01, 3072, d5116a5a40aab468780a3c03b417a8ac +2024/05/29 15:07:45 [46235] , tests/fixtures/rsync/rsync_files/file.02, 3072, 379bc3d1e529f9645bab5482bbd4ac98 +2024/05/29 15:07:45 [46235] sent 3206 bytes received 41 bytes total size 3072 diff --git a/tests/fixtures/rsync/rsync_files/file.01 b/tests/fixtures/rsync/rsync_files/file.01 new file mode 100644 index 0000000000000000000000000000000000000000..b70f4554297d86f9aade2f45b7f769d88c70609e GIT binary patch literal 3072 zcmV+b4FB_;_q;LM+U)%{VT_Y}?hw~j|E2|j8?Rg$m(-68Z+pczw5*k<=ULAz6i4USeym- zEOHcX|2>$~X5?)Zxif6WW3djC@mJ6}E`WCdds2Z-KO_F}-Ih&AJ%U$|4dW&GLZ>`Z zW0-nARhFj5-EbF95c%{IJ2>uV^7>aE9fZO%DB~bUk{P-!537TID`x()UsmnBq5Bb1 zr{#&Qh*!YeqenNWj?gaV>W}&7Ha)A0wXe5$dB9^q$Ncu3+7;w0LwdO&zFCneaB>^p zXqDKGf0|cPfx$i+IlJjmTaG}bSdkqKCv*UxZCLIc$)gjVpWvyP#WN+&c}v->ZKY%N zb8I%Ajm=1y+wBX5ffTYVsJ|d_sxDQ-t#}mRK@r)J5rA}Xl>+7BzOHA)RX{kngnrI3 ztr^5G7lA^~BLVqtUS(a%*UjHssN?lVs!Az#WM%%5{h(f7Q^`4qA_js9S1J6~;OlDB zzMkWQgdRB{eLOlyjYdzkfl<8-D(}{zkLWzwy8-IN-AEzkQZ7sxFxDew7a*Ceu!l)O z<92MkxCN4D2_V}26@HSFgq^e*@~&ow;7Bb-vx#Pf!HTQK4(tLD2T^hi2f z#eJEG=q=&;o8=-IkzTtg;nCH2CvomCgpwr=Kz@@nXC1XutAEsRNrHKNlc-2t#i7>u z&F8#u{})zn1HoQq0G#PYB9u!o4m06YA|M}iczt#Pma`wSH=uOc^GnU9mCmDz!WX4} zCV{xk^awGOk5^#E-h2;%4$s+N^;puiw3oAik5pQQZW8>3&^1!#90h!?SM}}RIvzK5 z#cEq&qnF)luBW16XyOY{pq-g1PqoHoL;TT2#Ni<6(zq3!^0-{_(@SbvZ@?wl2`dB2cDD?N9^ayr4LDC;T8?!y?lQjEs(+;+rD0ULZnmSKro1`%;s`h z8fCA8V`i_)?lT`;Bc+fv$B6a>~oop zDY|tgNCH5`b&BsSgBsoRAuz_P?$Pjem?Y4f76Suqt_UNUx^|93TmnzCE|I6|Z7-)4 zw=pCNW@Tp1-si#^-bxaF6lVyi2YG8P`9B0fMp(jSkg8L98C2I9ilMf4I{G(w*H7zc zlKcKfV2DbL1K@%6r#UHMEtfx8YqPT$?(V7e26a9|S=3{ru%Q`+_zPx%Aaw{E9tDk>6(r+Cf;pnL820B<1|t z#POj$9~rtDC|<^BY`x;{97ebL5g?lbX;GWbW5kYqhgq2x&t0=86g-Jaau4Wh(uextijK)8c&5rm<4}CyZn6cqAdO0bTCagS#rjsy3MxmDsQqw z#kTl;gxm!obD;~TlYA+LO3^&*Y4XviK-#YzWu!P87X%vaPMFR)hAPmcR4Rb8QI(cG^>JO6u`p-CNppYQ@tY>ga)}^k{&#vn`pQ z5b@DNCZiMI6c_q$Z+1 zI4L5dS}M1AcbBWv4A}WyA|uSgwd|Tp`CF;4;Bn%(ddp-H9Pb0If&W}dZs8Ydu_Yo( zO+7+MIr4c|AiCAa-l{$ciJuTWw;cWo44*z+aH$pX73;M&~b z3JU7(4O38Ca9OECv1Zo5Q}Ufz5%!QZ8BYT1EGP$mS;L7W)Etw*nK_~-C~9kA5~O6n z-y-n49B&of46CQMNH=>p>oI-P3!od zIxCX0c}FrJe#9R!OOGca*+eA>r^uH~1h3?Z&4vUW%2Lgpwjg%BaF7E@!qiOHq=8T}Fs#rScCbnIr)m(Vv ztkJP}TzfUeJ%;b`w`YtcZ!6qSK_!$`@Pza~C=z zGY^i$8>-E5q~pSglV;2jk*kAZnHIiWow603pNL{(!N?2{9f`?;F@(M_B=VAR?TR>> zytFA|hLWaF`o>4pcp`F%gEZI}orU~@$NVs9KMrl1WVmHeDk%L=kAiL@4GbNR?I5s5 z%62brZBph+nfc0FQY_6QIeOcSgmYNmBp)N;5&8xQ^%didqZb4|dfH%tW9z2|w=2E< zW2Wtova(cUv+Wm^mOs>uPfUG6vIFrst5bE6oHMB`>yxO#v%9y(80#JAlL|T0j6H&A zOjNsude&R{W`b6@Ojd-WyZ5!W!pjnQZGQop(^!1_DvlTV%3&fnuni3uApHt)X4y{4 zfnoP9`jyqS{veVz2do@fCt$BzmKUkFPK{bZ!!my zsPm_2t4Iy_sk2EklPROukx=AXb{0mJ`uWiClK$+lOG7O*JDfL16JMGT#r$tq$v##j z{!}Tp&kyJ2tt@r2a6~xSb2W{Xb&yD2d#mQEkYbjk1UB50Yc3CFQSe=or5w(2kAby2 z!u6W;P8Lwl9s3sHE5eC0C5E4YbvIJ03glv(OL=HkGs!K3CR$~bc~k~97Gc5iUAgO! zEvUl2iQR8P(ul9@#PC)90I(E26D<49-;ir-Y(`1xr-LqY<^7XUM;5BzIh(DD?e>9g zBY{iH;Q0{dhXHxq&z8TFXuMgOy)?A{qB~x7&rJ7RFL6~nJ2MV;Er%msrB8wA`EQzD OhBXE-vyTa(^?l@ZMFAoJ literal 0 HcmV?d00001 diff --git a/tests/fixtures/rsync/rsync_files/file.02 b/tests/fixtures/rsync/rsync_files/file.02 new file mode 100644 index 0000000000000000000000000000000000000000..3ccaf24521e155886b366874ef75211c66d5c719 GIT binary patch literal 7168 zcmV+b9RK64+xb>{sl=231Q0EsN;H@$+UdhcJdm-V);D@*YR(-)jUjy9vrYgW{~~PU zESw`ffXWTdtRr3zu^9=!6(K_f4%)KE9TTl);ngPZdl?Qm%3IyL0Ib3Lt|RUn70SQR z*9lM*>~arUU8mz8c+Qg*{U|-ajU8m15I;c4*J_1QaQ8~ATE19d7`dzsvRhm=frs(? zwnzJBZ?L5dzWspRQF~%Zw$5FczjUZu;^y;{;sxKqP^W%p)G0*}4ks0go z#NE0*`Z2$f5TFJ$W;}xXC!y-{P3s!>=4!wz0*cV(M_vx(!2kRbP)H{BS4l?U*B*XC z1G=LtWnifbzo;HZNkckcalHO1b#NO*(iD!%Q>}L%jdF)u#yx{$Tb3G1DG8|vCeamr8 z`u?n-ko?frXqFGlK9#xw%6&f&b!M?8$4}4$}o?3J=mA3!UqyzzUeG zF-)L51NK>TfTwd6?gR;GoAe7(V3HY760FnUA>?x+)DV*T>#TMZcChpWzr_k)wnPTv87>IgT&DFQ^ms;GRD(b)YVfY)zJAKD7kMhwTT%bB{?bx4ecm13MQ|mQHoen za|k{zOhoIA$|Vr&Ta$ai)6J5o?^3y=e*7nL22;b_{}Y5r}9;w^2pY&3j9d7AXMQh&Mt{~Z2tPXmzv^N z9FNsAtM(e056D|7ts-?huj-J|k3Q(GpQm89J}i-FTsGfj z-_Ai4aF7u4bQAr;fr`4$gDhbtq=^H5%WY{~(5U$Do>yAYS9_rKQUI=e)ogJQ$0TW4Tn9ZBzVB;K9 z2Fb3x06VI5mnGf;rTts9sPdADp4iHu<(YqqqWS&}9SU(!@x`2ZkRs-}q0bk2{TiHD z4C?b>fkei$XxK`i?^k(1c^-%x34epIg}#b{NIeERSxJ*aVOIcz+3V49^_3VJB$b1I z_ssF44NMMgI6YDhyeLGej%g!C`AAO@Ny}CIKjfy%LC?{hvK1Kh&tl_-BGAJ3;~+NrTp0s%&ZrA$93&8S?ikF`%lFgHp} z(m|+yA>qJS^t0lWl8nA8@VgfHw)Ts~GYO&QiWq)W4eG7-tl`3`9@VqT_B)i5N!LL` zfTLT?WMr+NAVEo;Zu>x_QlVU^Cyl-L%u zKn{Qr^OcR)AH0f!;DPu;g`=4p)db@RxokR0&HXaPu3nU zt=Fe+&P+m7$U4MbMdO%>MX*A$g6possq6ElxNMOPf4H)0dejJ0>eG?AR@-F?&Xt@* z9gu%Ml3nlZ2Gq5YyD0-=4EmzcnmFdK4KP<$m#peM)81n<7|?z)5Vo8={}QE#O}*|X zMv;Kt>P@O~JR2>ZfLZG+d6i7M@M6>m<|DD4vl7g|f5@+bBdx0Xlat|mjf`q<@L}Kl z5?T&NUz3`v50KMI#CwzQo2n=0v9;~=_YnaYw>N(t9P`a^18Ch7|8=@jIHQrHmU2Un z?FAzYq6FQrw|*0ErXL4YAItLZFL~N(^5l#^+Z5V9FLoHLj6~35@wOv;L#5n#66k^~$umU+vF(4}p97T4MIr`21Rq6{4>o3$3NNB!&DnI+ z%Vj?8J_aW%4;Ea3Cf-5VwIJSOP{rd`;@}4hBp+bBb!mVOTx9sZ$Bsk=&0d8wYQk-^ z3?fdxY?R44dUq|h=a1WX6-0SwVZSDQ=29y&k6J2xjFfa1*uW};^mMVZ6(q@R2)50nABb;B0Kp32R4V*vuKg4;?&Ry-L#NN6xLBW<<p4g&|;maP1^Byq)B_FOB+z3WiKInm(MV{rmJ}n%gTv`(OBkPO5xSx861 zs^1#=YV^9VxeLtT6gvJm=*%P)eO9N?)u~(1Fk8cPS69 z=laqS71NJv@l=BlkOAlSDAWjENstkk7DId*(C}K?cZ2~eF3beW4H)oZ&}*ta@zv#% z?6=KI4?_`u@yw9cl{zbh@lx1;1G8t38I{qm*V_IKH$9jJg?G-=#{pc_K%*RylQX7X z_qvXSS1`|QoTR)OVjl4L>59xPzOrf`Br#pbV}7G8`hHbrt^LwF7l#f%=cM&L=#&w_ z?EvW-n~o4$a1M&I#`*`Y$sQd|@Je3Mb>Aj2F#Rn&(jt(MVGD9Kv#3@ur_}Ko{J7ot z%vXog3z%$k%K%;*OOXI`R)iQ8-*PO~fm%}V_Two=H}oAGRc8t~Be`(`fmnN1T6fI6 zOraHSaR{ATjv*|an2YjHolCI~={9}Z*r+3UqjtA*XemCZj>DmAIGuRJ*?S3T?{~YO zcN{8zKc@c|VwxA#;V-1bVgo*y*=7qdvrx+gr!s(4fmhdoHry7 zkP?Kory}koi;}Cu{v+zdaaZj*+^gnqx3Tq{aQCo<9P;#dgi*633OVZbk4VoM5vfxZ zQ^G}w_1J3jGRH`yRugr6du_#UW%T65V`{-^SbQP%jWP0S5gUyP@-s;Rn>e>R8Kahb z#s}h@xRV{aN7;M9B<66Pi1-K84L|1=EVtxPk6d8AJMY{a%~mjG2eBot-6C{PmNd=> zw;KjbQ4g3<=gjE*JpTxrX?PjMr{`wq(y=uyf>`6*D2a-w`ntEI%1KqgA3#ac@txAh z7o2ds`vDhkg*r2-C4V$~KbuIXAsj<23I9KrV=^JfGAH?t_P|*Itw8}9WIOeIQ-wv# zB0@?N!`F^fJztflNlVCTq2Kf8GC<3=TCxs$5X}uW&90$>G^Q9I z;2Fg9s|VFWq(OO+4z2SU#}iNHOD=s5-;T7}xX|s;aHSQ>p}(91Q2mzcD=zdEKt&uz zASBLrEnRV=A%ch?yy52+6jefPbz?5Vci?tay4G{TX~(XE91nxaQ~FN5+ysV&C&Ggn zLR=m;JGMMmjPh1}dG3ap=%PW6Cx(;+!zyX=J}K17TXjGSrqAzFAl!hcmv^8(XAx1R zLrJ18fg#3@f_A1v1MD1fmk^fvr+-CPNoXJ{gX}d zWqRi28sE!N*G&&wrh}7*4q&-W!k?9&MNHXtW`tl_(ShNPVQ6ChL06@~J6&a%yTFfa4tk3W*>G6onk!O&K~FDL)bH8j#X<){XWFTdB;06;<3 zK3(gFci<*if^)8Ah(U-DjR-Pd0}r65QLqv4S}G-_D)wC>Dj7^U&{sg4@LQdd52emR z+l-Gqn`0ZlOwk(JrS1wnZEyJ{?BPqOESVuH9ErS^6m(tovNLhyXAoQfUl{V{&1q34 zA8Pv#h&q`n01Rrr9fEV&?i4+X^vCTEHrYsqiXRKI5JvZN#^mo2Vq#9|V4oF~rkVRL zoNL_U@e|Ky{oqK>L)6EBDv@txwk*812)XDMXt|K`!d=+B+x(mCb;0NH<%4GX4ktA& z-rNprV2q?B7Jlf&D_t`84ReND@xLeqxKR@1qb#OWH@2t(_xKPa^Ro|}!%ig(F%|=I zK@+CHWyCrd z>kEI^*h8LDeqG~3M3&-#f&Rns&xjN!f}hG-koRz zNl%pXC^*rk#4^Kaup9PO6>DZwi9}U=xjxxDXDVff4VDYX_Tm7=bA2KGA|Z`}eQl{T zH+I>3YR_7B&4AdSpM<4iW+Y3P8Aku9_ z-a=e4IUjQxXP%GlH>Lxh)H=%0m;e&gXS$Mk`B!bAyV!mawe8z9PFPoVpHWB5#Luzh zQn=2U3Nbb`daC3J{fFdLVw${ z0&w9zHo9EK;Wy&>ai#qqnjjcDB|Eo;tuG+=1|{q3+EdyIOfv#e%Cok7^Mf7lupp0i zZhVB>ewqDxVzjl*91_9j)oT*^{8^eqXgkLG;!vFeU(2#V3{WGc-v7!yK=ABg(adw0ccusX#PQ>AJbG$f( zvp5G^HXD8r!*9gOokr5O>WZk82|(&@I#vY$Oty1NRV;rV-f|Y<>MJ?n<`6~L$YaSP z!t-!$Wl(ozr(vY}^F27wdCQV_WgN!`ECbO9ZOGEqI;Onj&=wWe+9rhV-{*r5u547m zR-2nbj4#BAe)*)$VzCAa!6@6KnKkvlDN07mL*=C5EDZ2x_iO@CiQqOGV+4ytD`^-M zVJ!pH{!c3|_t68F5X;`4R~sB-ZSt;11p$9qIVIz}I@p5d~RIpW}D=5JMo{!*#nb5~S3xuVqXz!?;O-kv!-Pi8&n8ynK z+b*l$A<*Q8->;}r5fB&t$c|_&G}`7YDr`A5D?9*~$DDJYf>E$L-%^BTDR`FNxm5`H zgqwkt{dO@VYMiyb$aq|!gHauyn?3~c^lRSSh{c1R(@c83ESJM7A>1lW_qUqb2(;a6 zQ_eUxVFdLkuw^KqWu=9fjbZ(LN<;QJ!}!2F`;zSH3FJ5FTrh>uJdR@wqi&*HVI;Q6 zfoGgcjt!SKdh7w97YA(jdfN77;G4#Ui>G~2xRqbeV?p~sx?{^;k9bltduqidIJwUoBJ!7&%yFIfn*3_gS za&3s-yr&9Rxh(oZuxytO_K_T0)vmNdh{c~qct#sh9ewycDR_bv@#Q|M2A5^_ zRq*g8AU{C>8D)D7q!tMcCmk7ASxnKD{wZ{1nOvL+e7*IiID~g|v!Bej`*iP#kydux z6EUKr#Mi>`Y%oPO!=0nUVkN697x>-yKA8jnRmNkzpEaXX)Od%A9)q*7W?0&Kkr_eJ zHWCnIdM5&M*p!YctsF#_##rFdSnhbm*ocbm2O*AD!sNwu~dlKZ2)fTnKo; z7w9>LHZR;hq=xVc!s+GL(6~tk3DwV#%P1WSCLo{&bF*6sW6`DZW`hGN|1B{&1uKeF2 zteO2P$z@sqpO|Dp%p`HD>2gYlE624um-<1^{T&ku*%KFJ{Sa9 zU?Zcun%kiWRPtBTQ$-zm0kva79bN2{Fp<##|V7@bc5CvqS|0?}vP0GLa%z|*}h=7rdl zLD^C^_||sWXp6`f^+<^Za~j=u^Z|#GOi2!-A?1=dWGHQay&y~oZ#PP<3I3eV&@X>| zY+c-qB4U$$lQzUrfV1PVcN+{}z#V!FpqvhaA%b1>f=Xt)+t^3WjKpR%XVi*}3C`|z z400Q~rA&&v@-2S;0cbgfS>PO0#5^y1k=@NfC7GcqSFf{Xr~k+`GNkCmp>WR@9Lz?C z_fF%jD%Lf<6%>)t7*hDMy^0ayJ}ydWdNCU-sZpdnTDDs<3Kl9-X~@Q{G@AiusTd(U z6n@VO7a`;fD1HKSSuH{e*5nnXz&ML>Op5obSzQO)(}5M9hs0v=je!>yK5d^g2lh2gAEHD!P^L^}CEK`oR0WoPc&`*hTXQZjw#&q9`s7ve$J zZ`fJ;~hJS7!HO}Dk=B6<%W;pa3?BpWoUE?VxwqePb5kJ9av_CE_PMN)_QknZKu z!Eb_L=T7P%RIwztR;yftgjKRC>C0kwwL2ysn=MpvuR{WsHwBQYkYDjDQAM(he?`4) zKfPrvV`R`8i`}R*kVDU}@G0ikZhIP$*uoXy4D3`3b+z>J!`9cr1xsz#QFDu_4=$&% z;abypA-%GBaPi4NeQn^-emf86q){q(v}2nxgus#8 z{j$|$_A{au97Wr;{YsHaB>eI_*?pSLANa*Av^cd|x}FMOfxvai^A>~(_7mscFRElf1CfuiKgZo;8cw>Y| zicQ8DsZls{-3QkW1;S4~dx3gPKw-rg)P@_RYzMY-H&;Vc?t`UI|c=>(ymODDaQ(2!o^iplIy#bf2~Z)u*uiK zAH^=}GJQO+dc*`>)!aD}35o@zJ(gB~Y)icb!n*#gDN%jHmNdf8X86W;4U~a1Hbr|+ CwA5h$ literal 0 HcmV?d00001 diff --git a/tests/test_package_filetransfer.py b/tests/test_package_filetransfer.py new file mode 100644 index 0000000..1f79bb2 --- /dev/null +++ b/tests/test_package_filetransfer.py @@ -0,0 +1,99 @@ +import argparse +import os +import shutil +from pathlib import Path + +import bagit +import pytest + +import digarch_scripts.package.package_filetransfer as pf + + +@pytest.fixture +def transfer_files(tmp_path: Path, request): + fixture_data = Path(request.module.__file__).parent / "fixtures" / "rsync" + shutil.copytree(fixture_data, tmp_path, dirs_exist_ok=True) + return tmp_path + + +@pytest.fixture +def args(transfer_files): + args = [ + "script_name", + "--payload", + str(transfer_files / "rsync_files"), + "--log", + str(transfer_files / "rsync.log"), + "--dest", + str(transfer_files), + "--carrierid", + "ACQ_1234_123456", + ] + return args + + +def test_requires_args( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list +): + """Test that script requires all five args""" + + for i in range(0, 4): + # remove a pair of list items (arg and value) for each test + part_args = args[0 : 2 * i + 1] + args[2 * i + 3 :] + + monkeypatch.setattr("sys.argv", part_args) + + with pytest.raises(SystemExit): + args = pf.parse_args() + + stderr = capsys.readouterr().err + + assert f"required: {args[2*i+1]}" in stderr + + +def test_arg_paths_must_exist( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list +): + """Test that script errors if path argument doesn't exist""" + + for i in range(1, 4): + bad_args = args + bad_path = "nonexistant" + bad_args[2 * i] = bad_path + + monkeypatch.setattr("sys.argv", bad_args) + with pytest.raises(SystemExit): + args = pf.parse_args() + + stderr = capsys.readouterr().err + + assert f"{bad_path} does not exist" in stderr + + +def test_id_arg_must_match_pattern( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list +): + """Test that script errors if id argument doesn't match ACQ_####_######""" + args[-1] = "bad_id" + monkeypatch.setattr("sys.argv", args) + with pytest.raises(SystemExit): + args = pf.parse_args() + + stderr = capsys.readouterr().err + + assert f"bad_id does not match" in stderr + + +def test_full_run( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list +): + """Test end to end successful run""" + + monkeypatch.setattr("sys.argv", args) + pf.main() + + pkg_dir = Path(args[-3]) / args[-1][:-7] / args[-1] + assert pkg_dir.exists() + assert bagit.Bag(str(pkg_dir / "objects")).validate() + + assert "rsync.log" in [x.name for x in (pkg_dir / "metadata").iterdir()] From 2e50139b19da0704dba284878986373a7ad83fc9 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Thu, 30 May 2024 10:42:21 -0400 Subject: [PATCH 24/35] add new tools to entrypoints --- pyproject.toml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c6203ad..b5593e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,9 +12,11 @@ lxml = "^4.9.3" bagit = "^1.8.1" [tool.poetry.scripts] -report_ftk_extents = 'digarch_scripts.report_ftk_extents:main' -report_hdd_extents = 'digarch_scripts.report_hdd_extents:main' -package_cloud = 'ipres_package_cloud.package_cloud:main' +report_ftk_extents = 'digarch_scripts.report.report_ftk_extents:main' +report_hdd_extents = 'digarch_scripts.report.report_hdd_extents:main' +package_cloud = 'digarch_scripts.package.package_cloud:main' +package_images = 'digarch_scripts.package.package_images:main' +package_filetransfer = 'digarch_scripts.package.package_filetransfer:main' [tool.poetry.group.dev.dependencies] nox = "^2023.4.22" From 850b7c7def37ee1c03b6553997e264227228ed0b Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Fri, 31 May 2024 10:05:54 -0400 Subject: [PATCH 25/35] improve rsync log handling --- src/digarch_scripts/package/package_base.py | 51 +++++-- tests/test_package_base.py | 151 ++++++++++++++------ 2 files changed, 153 insertions(+), 49 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 258db43..6d7eaee 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -48,6 +48,14 @@ def add_carrierid(self) -> None: "--carrierid", required=True, type=self.carrier_id, help="ACQ_####_#######" ) + def add_source(self) -> None: + self.add_argument( + "--source", + required=True, + type=self.extant_path, + help="Path to mount carrier", + ) + def add_payload(self) -> None: self.add_argument( "--payload", @@ -107,6 +115,9 @@ def add_streams_folder(self) -> None: def add_dest(self) -> None: self.add_argument("--dest", required=True, type=self.extant_path) + def add_quiet(self, **kwargs) -> None: + self.add_argument("-q", "--quiet", action='store_true', **kwargs) + def find_category_of_carrier_files( carrier_files: dict, acq_id: str, source_dir: Path, exts: list, category: str @@ -277,24 +288,45 @@ def convert_rclone_md5_to_bagit_manifest(md5_path: Path, bag_dir: Path) -> None: return None -def convert_rsync_log_to_bagit_manifest(md5_path: Path, bag_dir: Path) -> None: +def convert_rsync_log_to_bagit_manifest(rsync_log: Path, bag_dir: Path, prefix: Path = None) -> None: # check for manifest new_md5_path = bag_dir / "manifest-md5.txt" if new_md5_path.exists(): raise FileExistsError("manifest-md5.txt already exists, review package") - with open(md5_path, "r") as f: + with open(rsync_log, "r") as f: log_data = f.readlines() + if not prefix: + prefix = os.path.commonprefix( + [ + os.path.dirname(line.split(",", 4)[3]) + for line in log_data + if len(line.split(",")) > 1 + ] + ) + else: + prefix = str(prefix) + manifest_data = [] - prefix = os.path.commonprefix([os.path.dirname(line.split(',')[1]) for line in log_data if len(line.split(',')) > 1]) - print(prefix) + for line in log_data: - parts = line.split(',') - if len(parts) == 4: - poss_rel_path = parts[1].replace(prefix, 'data') - manifest_data.append(f"{parts[3].strip()} {poss_rel_path}\n") - # re-writes the manifest lines + parts = line.strip().split(",", 3) + if not len(parts) == 4: + continue + + poss_rel_path = parts[3].strip().replace(prefix[1:], "data") + + poss_md5_hash = parts[2].strip().lower() + if not poss_md5_hash: + continue + elif not re.match(r"[0-9a-f]{32}", poss_md5_hash): + LOGGER.warning(f"{str(rsync_log)} shold be formatted with md5 hash in the 3rd comma-separated fields. Skipping this line: {line}") + continue + + manifest_data.append(f"{poss_md5_hash} {poss_rel_path}\n") + + # write the manifest lines with open(new_md5_path, "w") as f: f.writelines(manifest_data) @@ -307,6 +339,7 @@ def create_bag_tag_files(bag_dir: Path) -> None: bagit_file.write(txt) bag_info = {} + bag_info["ACQ-Object-ID"] = bag_dir.parent.name bag_info["Bagging-Date"] = date.strftime(date.today(), "%Y-%m-%d") bag_info["Bag-Software-Agent"] = "digarch_scripts" total_bytes, total_files = get_oxum(bag_dir / "data") diff --git a/tests/test_package_base.py b/tests/test_package_base.py index 3deb4fb..9f09d17 100644 --- a/tests/test_package_base.py +++ b/tests/test_package_base.py @@ -15,6 +15,21 @@ def transfer_files(tmp_path: Path, request): return tmp_path +@pytest.fixture +def rclone_payload(transfer_files): + return transfer_files / "rclone_files" + + +@pytest.fixture +def rclone_md5_manifest(transfer_files): + return transfer_files / "rclone.md5" + + +@pytest.fixture +def rclone_log(transfer_files): + return transfer_files / "rclone.log" + + @pytest.fixture def image_files(tmp_path: Path, request): fixture_data = Path(request.module.__file__).parent / "fixtures" / "image" @@ -23,22 +38,24 @@ def image_files(tmp_path: Path, request): @pytest.fixture -def payload(transfer_files): - return transfer_files / "rclone_files" +def rsync_files(tmp_path: Path, request): + fixture_data = Path(request.module.__file__).parent / "fixtures" / "rsync" + shutil.copytree(fixture_data, tmp_path, dirs_exist_ok=True) + return tmp_path @pytest.fixture -def md5_manifest(transfer_files): - return transfer_files / "rclone.md5" +def rsync_payload(rsync_files): + return rsync_files / "rsync_files" @pytest.fixture -def log(transfer_files): - return transfer_files / "rclone.log" +def rsync_log(rsync_files): + return rsync_files / "rsync.log" @pytest.fixture -def id(): +def acqid(): return "ACQ_1234_123456" @@ -118,33 +135,33 @@ def test_create_acq_dir(tmp_path: Path): assert base_dir.parent.name == tmp_path.name -def test_create_pkg_dir(tmp_path: Path, id: str): +def test_create_pkg_dir(tmp_path: Path, acqid: str): """Test that package folder maker makes ACQ and Carrier folders""" - base_dir = pb.create_package_dir(tmp_path, id) + base_dir = pb.create_package_dir(tmp_path, acqid) - assert base_dir.name == id - assert base_dir.parent.name == id[:-7] + assert base_dir.name == acqid + assert base_dir.parent.name == acqid[:-7] -def test_create_package_basedir_with_existing_acq_dir(tmp_path: Path, id: str): +def test_create_package_basedir_with_existing_acq_dir(tmp_path: Path, acqid: str): """Test that package folder maker respect existing ACQ folder""" - (tmp_path / id[:-7]).mkdir() - base_dir = pb.create_package_dir(tmp_path, id) + (tmp_path / acqid[:-7]).mkdir() + base_dir = pb.create_package_dir(tmp_path, acqid) - assert base_dir.name == id - assert base_dir.parent.name == id[:-7] + assert base_dir.name == acqid + assert base_dir.parent.name == acqid[:-7] -def test_error_on_existing_package_dir(tmp_path: Path, id: str): +def test_error_on_existing_package_dir(tmp_path: Path, acqid: str): """Test that package folder maker errors if carrier folder exists""" - base_dir = tmp_path / id[:-7] / id + base_dir = tmp_path / acqid[:-7] / acqid base_dir.mkdir(parents=True) with pytest.raises(FileExistsError) as exc: - pb.create_package_dir(tmp_path, id) + pb.create_package_dir(tmp_path, acqid) assert f"{base_dir} already exists. Make sure you are using the correct ID" in str( exc.value @@ -152,8 +169,8 @@ def test_error_on_existing_package_dir(tmp_path: Path, id: str): @pytest.fixture -def package_base_dir(tmp_path: Path, id: str): - return pb.create_package_dir(tmp_path, id) +def package_base_dir(tmp_path: Path, acqid: str): + return pb.create_package_dir(tmp_path, acqid) MOVE_FILE = [ @@ -162,29 +179,29 @@ def package_base_dir(tmp_path: Path, id: str): @pytest.mark.parametrize("test_function,dest", MOVE_FILE) -def test_move_file(package_base_dir: Path, log: Path, test_function, dest: str): +def test_move_file(package_base_dir: Path, rclone_log: Path, test_function, dest: str): """Test that metadata folder and log file are moved successfully""" - test_function(log, package_base_dir) + test_function(rclone_log, package_base_dir) - assert not log.exists() + assert not rclone_log.exists() assert (package_base_dir / dest / "rclone.log").exists() @pytest.mark.parametrize("test_function,dest", MOVE_FILE) def test_do_not_overwrite_file( - package_base_dir: Path, log: Path, test_function, dest: str + package_base_dir: Path, rclone_log: Path, test_function, dest: str ): """Test that log file is not moved if a same name file exists in dest""" - rclone_log = package_base_dir / dest / log.name + rclone_log = package_base_dir / dest / rclone_log.name rclone_log.parent.mkdir() rclone_log.touch() with pytest.raises(FileExistsError) as exc: - test_function(log, package_base_dir) + test_function(rclone_log, package_base_dir) - assert log.exists() + assert rclone_log.exists() assert f"{rclone_log} already exists in {dest} folder. Not moving." in str( exc.value ) @@ -198,12 +215,12 @@ def test_do_not_overwrite_file( @pytest.mark.parametrize("test_function,dest", MOVE_FILES) def test_move_multiple_file( - package_base_dir: Path, log: Path, md5_manifest: Path, test_function, dest: str + package_base_dir: Path, rclone_log: Path, rclone_md5_manifest: Path, test_function, dest: str ): """Test that multiple files are moved successfully""" parts = dest.split("/") - md_files = [log, md5_manifest] + md_files = [rclone_log, rclone_md5_manifest] test_function(md_files, package_base_dir) for md_file in md_files: @@ -213,20 +230,20 @@ def test_move_multiple_file( @pytest.mark.parametrize("test_function,dest", MOVE_FILES) def test_partial_halt_multiple_files( - package_base_dir: Path, log: Path, md5_manifest: Path, test_function, dest: str + package_base_dir: Path, rclone_log: Path, rclone_md5_manifest: Path, test_function, dest: str ): """Test that warning is issued for multiple move if a single metadata move fails""" - rclone_log = package_base_dir / dest / log.name + rclone_log = package_base_dir / dest / rclone_log.name rclone_log.parent.mkdir() rclone_log.touch() - md_files = [log, md5_manifest] + md_files = [rclone_log, rclone_md5_manifest] with pytest.raises(Warning) as exc: test_function(md_files, package_base_dir) - assert log.exists() + assert rclone_log.exists() assert ( f"already exists in {dest} folder. Not moving. One or more files may have already been moved to the {dest} folder" in str(exc.value) @@ -234,15 +251,15 @@ def test_partial_halt_multiple_files( @pytest.fixture -def bag_payload(package_base_dir: Path, payload: Path): - pb.move_data_files(list(payload.iterdir()), package_base_dir) +def bag_payload(package_base_dir: Path, rclone_payload: Path): + pb.move_data_files(list(rclone_payload.iterdir()), package_base_dir) bag_payload = package_base_dir / "data" return bag_payload -def test_convert_rclone_md5(bag_payload: Path, md5_manifest: Path): - pb.convert_rclone_md5_to_bagit_manifest(md5_manifest, bag_payload.parent) +def test_convert_rclone_md5(bag_payload: Path, rclone_md5_manifest: Path): + pb.convert_rclone_md5_to_bagit_manifest(rclone_md5_manifest, bag_payload.parent) bag_md5 = bag_payload.parent / "manifest-md5.txt" # Get path to correct payload in data @@ -258,13 +275,67 @@ def test_convert_rclone_md5(bag_payload: Path, md5_manifest: Path): assert a_file in payload_files -def test_create_bag(package_base_dir: Path, payload: Path, md5_manifest: Path): +@pytest.fixture +def rsync_bag_payload(package_base_dir: Path, rsync_payload: Path): + pb.move_data_files(list(rsync_payload.iterdir()), package_base_dir) + bag_payload = package_base_dir / "data" + + return bag_payload + + +def test_convert_rsync_log(rsync_bag_payload: Path, rsync_log: Path, rsync_files): + pb.convert_rsync_log_to_bagit_manifest(rsync_log, rsync_bag_payload.parent) + bag_md5 = rsync_bag_payload.parent / "manifest-md5.txt" + + # Get path to correct payload in data + # read md5 and extract filepaths + with open(bag_md5) as m: + md5_paths = [line.strip().split(" ")[-1] for line in m.readlines()] + + payload_files = [ + str(path.relative_to(rsync_bag_payload.parent)) for path in rsync_bag_payload.rglob("*") + ] + + for a_file in md5_paths: + assert a_file in payload_files + + +def test_convert_rsync_log_replaces_prefix_with_data(rsync_bag_payload: Path, rsync_log: Path): + prefix = "/Users/fortitude/dev/digarch-scripts-poetry/tests/fixtures/rsync/rsync_files" + pb.convert_rsync_log_to_bagit_manifest(rsync_log, rsync_bag_payload.parent, prefix) + bag_md5 = rsync_bag_payload.parent / "manifest-md5.txt" + + #extract paths from manifest + with open(bag_md5) as m: + md5_paths = [line.strip().split(" ")[-1] for line in m.readlines()] + + #extract paths from log + rsync_paths = [] + with open(rsync_log) as m: + lines = m.readlines() + for line in lines: + parts = line.strip().split(", ") + if len(parts) > 3 and parts[2].strip(): + rsync_paths.append(line.strip().split(", ")[-1].replace(prefix[1:], 'data')) + + #assert difference + assert set(md5_paths) == set(rsync_paths) + + +def test_convert_rsync_log_requires_specific_format(rsync_bag_payload: Path, rsync_log: Path, caplog): + rsync_log.write_text('time, size, not a hash, good/path') + pb.convert_rsync_log_to_bagit_manifest(rsync_log, rsync_bag_payload.parent) + + assert f"{str(rsync_log)} should be formatted with md5 hash in the 3rd comma-separated fields" in caplog.text + + +def test_create_bag(package_base_dir: Path, rclone_payload: Path, rclone_md5_manifest: Path): """Test that all tag files are created and rclone md5sums are correctly converted""" bag_path = package_base_dir / "objects" # might need further testing of the oxum and manifest converter functions - pb.create_bag_in_objects(payload, md5_manifest, package_base_dir) + pb.create_bag_in_objects(rclone_payload, package_base_dir, rclone_md5_manifest, 'rclone') assert bagit.Bag(str(bag_path)).validate(completeness_only=True) From 22c697b672b3881c7105a563c4a9f142ed1c3c63 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Fri, 31 May 2024 11:12:23 -0400 Subject: [PATCH 26/35] add new validation --- src/digarch_scripts/package/package_images.py | 24 +++++- tests/test_package_images.py | 74 +++++++++++++++---- 2 files changed, 79 insertions(+), 19 deletions(-) diff --git a/src/digarch_scripts/package/package_images.py b/src/digarch_scripts/package/package_images.py index 8e579c1..f2827ca 100644 --- a/src/digarch_scripts/package/package_images.py +++ b/src/digarch_scripts/package/package_images.py @@ -51,17 +51,29 @@ def validate_carriers_image_files(carrier_files: dict) -> bool: carrier = carrier_files[carrier_name] missing = [] - for key in ["images", "logs", "streams"]: + for key in ["images", "logs"]: if not key in carrier.keys(): missing.append(key) if missing: LOGGER.warning( - f'The following categories of files were not found for {carrier_name}: {", ".join(missing)} ' + f'The following required categories of files were not found for {carrier_name}: {", ".join(missing)} ' ) result = False if "images" in carrier: + if len(carrier["images"]) > 1: + two_sided = True + for image in carrier["images"]: + if not re.match(r"s\d\.001", image.name[-6:]): + print(image.name[-6:]) + two_sided = False + if not two_sided: + LOGGER.warning( + f'Multiple image files found for {carrier_name}. Only 1 allowed. If carrier has 2 disk formats, file names must end with s0.001 or s1.001: {carrier["images"]}' + ) + result = False + for image_file in carrier["images"]: if image_file.stat().st_size == 0: LOGGER.warning(f"The following image file is 0-bytes: {image_file}") @@ -70,10 +82,16 @@ def validate_carriers_image_files(carrier_files: dict) -> bool: if "streams" in carrier: if not len(carrier["streams"]) == 1: LOGGER.warning( - f'Multiple folder of stream folders found for {carrier_name}. Only 1 allowed: {carrier["streams"]}' + f'Multiple folders of streams found for {carrier_name}. Only 1 allowed: {carrier["streams"]}' + ) + result = False + if not list(carrier["streams"][0].iterdir()): + LOGGER.warning( + f'Streams folder for {carrier_name} appears to be empty: {carrier["streams"][0]}' ) result = False + return result diff --git a/tests/test_package_images.py b/tests/test_package_images.py index 179eb51..7df6692 100644 --- a/tests/test_package_images.py +++ b/tests/test_package_images.py @@ -157,64 +157,106 @@ def test_good_validate_carrier(carrier_files, caplog): assert result -@pytest.mark.parametrize("key", ["images", "logs", "streams"]) +@pytest.mark.parametrize("key", ["images", "logs"]) def test_warn_carrier_with_one_missing_category(carrier_files, key, caplog): carrier_files["ACQ_1234_123456"].pop(key) result = pi.validate_carriers_image_files(carrier_files) assert ( - f"The following categories of files were not found for ACQ_1234_123456: {key}" + f"The following required categories of files were not found for ACQ_1234_123456: {key}" in caplog.text ) assert not result -def test_warn_carrier_with_logs_no_images_or_streams(caplog): - carrier_files = {"ACQ_1234_123456": {"logs": [Path("ACQ_1234_123456.log")]}} +def test_warn_more_than_one_image(carrier_files, caplog): + carrier = "ACQ_1234_123457" + second_image = carrier_files[carrier]["images"][0].with_suffix('.img2') + second_image.write_text('0') + carrier_files[carrier]["images"].append(second_image) + result = pi.validate_carriers_image_files(carrier_files) assert ( - f"The following categories of files were not found for ACQ_1234_123456: images, streams" + f'Multiple image files found for {carrier}. Only 1 allowed' in caplog.text ) assert not result -def test_warn_carrier_with_streams_no_images_or_logs(caplog): - carrier_files = {"ACQ_1234_123456": {"streams": [Path("ACQ_1234_123456_streams")]}} +def test_accept_two_sided_images(carrier_files): + carrier = "ACQ_1234_123457" + + image_name = carrier_files[carrier]["images"][0].name + first_image = carrier_files[carrier]["images"][0].parent / image_name.replace(".img", "s0.001") + second_image = carrier_files[carrier]["images"][0].parent / image_name.replace(".img", "s1.001") + second_image.write_text('0') + + carrier_files[carrier]["images"][0].rename(first_image) + carrier_files[carrier]["images"] = [first_image, second_image] + + result = pi.validate_carriers_image_files(carrier_files) + + assert result + + +def test_warn_on_malformed_two_sided_image_filename(carrier_files, caplog): + carrier = "ACQ_1234_123457" + + image_name = carrier_files[carrier]["images"][0].name + first_image = carrier_files[carrier]["images"][0].parent / image_name.replace(".img", "side0.001") + second_image = carrier_files[carrier]["images"][0].parent / image_name.replace(".img", "side1.001") + second_image.write_text('0') + + carrier_files[carrier]["images"][0].rename(first_image) + carrier_files[carrier]["images"] = [first_image, second_image] + result = pi.validate_carriers_image_files(carrier_files) assert ( - f"The following categories of files were not found for ACQ_1234_123456: images, logs" + 'If carrier has 2 disk formats, file names must end with s0.001 or s1.001' in caplog.text ) + assert not result def test_warn_and_skip_0_length_image(carrier_files, caplog): - carrier_files["ACQ_1234_123457"]["images"][0].unlink() - carrier_files["ACQ_1234_123457"]["images"][0].touch() + carrier = "ACQ_1234_123457" + carrier_files[carrier]["images"][0].unlink() + carrier_files[carrier]["images"][0].touch() + result = pi.validate_carriers_image_files(carrier_files) assert ( - f'The following image file is 0-bytes: {str(carrier_files["ACQ_1234_123457"]["images"][0])}' + f'The following image file is 0-bytes: {str(carrier_files[carrier]["images"][0])}' in caplog.text ) assert not result -def test_warn_streams_missing_a_side(): - # TODO - assert True +def test_warn_streams_folder_empty(carrier_files, caplog): + carrier = "ACQ_1234_123457" + for file in carrier_files[carrier]["streams"][0].iterdir(): + file.unlink() + + result = pi.validate_carriers_image_files(carrier_files) + + assert ( + f'Streams folder for {carrier} appears to be empty' + in caplog.text + ) + assert not result def test_warn_only_one_stream_folder_allowed(carrier_files, caplog): - carrier_files["ACQ_1234_123457"]["streams"].append("ACQ_1234_123457_2") + carrier = "ACQ_1234_123457" + carrier_files[carrier]["streams"].append("ACQ_1234_123457_2") result = pi.validate_carriers_image_files(carrier_files) assert ( - f"Multiple folder of stream folders found for ACQ_1234_123457. Only 1 allowed" + f"Multiple folders of streams found for {carrier}. Only 1 allowed" in caplog.text ) assert not result From 78b7582b19d9777396a91405db2c90722bf4bb31 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Fri, 31 May 2024 11:20:52 -0400 Subject: [PATCH 27/35] fix typo --- src/digarch_scripts/package/package_base.py | 2 +- tests/test_package_base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 6d7eaee..59a43ed 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -321,7 +321,7 @@ def convert_rsync_log_to_bagit_manifest(rsync_log: Path, bag_dir: Path, prefix: if not poss_md5_hash: continue elif not re.match(r"[0-9a-f]{32}", poss_md5_hash): - LOGGER.warning(f"{str(rsync_log)} shold be formatted with md5 hash in the 3rd comma-separated fields. Skipping this line: {line}") + LOGGER.warning(f"{str(rsync_log.name)} should be formatted with md5 hash in the 3rd comma-separated fields. Skipping this line: {line}") continue manifest_data.append(f"{poss_md5_hash} {poss_rel_path}\n") diff --git a/tests/test_package_base.py b/tests/test_package_base.py index 9f09d17..f5fe79e 100644 --- a/tests/test_package_base.py +++ b/tests/test_package_base.py @@ -326,7 +326,7 @@ def test_convert_rsync_log_requires_specific_format(rsync_bag_payload: Path, rsy rsync_log.write_text('time, size, not a hash, good/path') pb.convert_rsync_log_to_bagit_manifest(rsync_log, rsync_bag_payload.parent) - assert f"{str(rsync_log)} should be formatted with md5 hash in the 3rd comma-separated fields" in caplog.text + assert f"{str(rsync_log.name)} should be formatted with md5 hash in the 3rd comma-separated fields" in caplog.text def test_create_bag(package_base_dir: Path, rclone_payload: Path, rclone_md5_manifest: Path): From 878ea29bc28a5d9f307dbfa23f7cc775c9511da3 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Fri, 31 May 2024 11:36:35 -0400 Subject: [PATCH 28/35] fix a bug from rsync hash choice --- rsync_files/file.01 | Bin 3072 -> 0 bytes tests/fixtures/rsync/rsync.log | 9 +++++---- .../rsync/rsync_files/{ => folder}/file.02 | Bin 3 files changed, 5 insertions(+), 4 deletions(-) delete mode 100644 rsync_files/file.01 rename tests/fixtures/rsync/rsync_files/{ => folder}/file.02 (100%) diff --git a/rsync_files/file.01 b/rsync_files/file.01 deleted file mode 100644 index b70f4554297d86f9aade2f45b7f769d88c70609e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3072 zcmV+b4FB_;_q;LM+U)%{VT_Y}?hw~j|E2|j8?Rg$m(-68Z+pczw5*k<=ULAz6i4USeym- zEOHcX|2>$~X5?)Zxif6WW3djC@mJ6}E`WCdds2Z-KO_F}-Ih&AJ%U$|4dW&GLZ>`Z zW0-nARhFj5-EbF95c%{IJ2>uV^7>aE9fZO%DB~bUk{P-!537TID`x()UsmnBq5Bb1 zr{#&Qh*!YeqenNWj?gaV>W}&7Ha)A0wXe5$dB9^q$Ncu3+7;w0LwdO&zFCneaB>^p zXqDKGf0|cPfx$i+IlJjmTaG}bSdkqKCv*UxZCLIc$)gjVpWvyP#WN+&c}v->ZKY%N zb8I%Ajm=1y+wBX5ffTYVsJ|d_sxDQ-t#}mRK@r)J5rA}Xl>+7BzOHA)RX{kngnrI3 ztr^5G7lA^~BLVqtUS(a%*UjHssN?lVs!Az#WM%%5{h(f7Q^`4qA_js9S1J6~;OlDB zzMkWQgdRB{eLOlyjYdzkfl<8-D(}{zkLWzwy8-IN-AEzkQZ7sxFxDew7a*Ceu!l)O z<92MkxCN4D2_V}26@HSFgq^e*@~&ow;7Bb-vx#Pf!HTQK4(tLD2T^hi2f z#eJEG=q=&;o8=-IkzTtg;nCH2CvomCgpwr=Kz@@nXC1XutAEsRNrHKNlc-2t#i7>u z&F8#u{})zn1HoQq0G#PYB9u!o4m06YA|M}iczt#Pma`wSH=uOc^GnU9mCmDz!WX4} zCV{xk^awGOk5^#E-h2;%4$s+N^;puiw3oAik5pQQZW8>3&^1!#90h!?SM}}RIvzK5 z#cEq&qnF)luBW16XyOY{pq-g1PqoHoL;TT2#Ni<6(zq3!^0-{_(@SbvZ@?wl2`dB2cDD?N9^ayr4LDC;T8?!y?lQjEs(+;+rD0ULZnmSKro1`%;s`h z8fCA8V`i_)?lT`;Bc+fv$B6a>~oop zDY|tgNCH5`b&BsSgBsoRAuz_P?$Pjem?Y4f76Suqt_UNUx^|93TmnzCE|I6|Z7-)4 zw=pCNW@Tp1-si#^-bxaF6lVyi2YG8P`9B0fMp(jSkg8L98C2I9ilMf4I{G(w*H7zc zlKcKfV2DbL1K@%6r#UHMEtfx8YqPT$?(V7e26a9|S=3{ru%Q`+_zPx%Aaw{E9tDk>6(r+Cf;pnL820B<1|t z#POj$9~rtDC|<^BY`x;{97ebL5g?lbX;GWbW5kYqhgq2x&t0=86g-Jaau4Wh(uextijK)8c&5rm<4}CyZn6cqAdO0bTCagS#rjsy3MxmDsQqw z#kTl;gxm!obD;~TlYA+LO3^&*Y4XviK-#YzWu!P87X%vaPMFR)hAPmcR4Rb8QI(cG^>JO6u`p-CNppYQ@tY>ga)}^k{&#vn`pQ z5b@DNCZiMI6c_q$Z+1 zI4L5dS}M1AcbBWv4A}WyA|uSgwd|Tp`CF;4;Bn%(ddp-H9Pb0If&W}dZs8Ydu_Yo( zO+7+MIr4c|AiCAa-l{$ciJuTWw;cWo44*z+aH$pX73;M&~b z3JU7(4O38Ca9OECv1Zo5Q}Ufz5%!QZ8BYT1EGP$mS;L7W)Etw*nK_~-C~9kA5~O6n z-y-n49B&of46CQMNH=>p>oI-P3!od zIxCX0c}FrJe#9R!OOGca*+eA>r^uH~1h3?Z&4vUW%2Lgpwjg%BaF7E@!qiOHq=8T}Fs#rScCbnIr)m(Vv ztkJP}TzfUeJ%;b`w`YtcZ!6qSK_!$`@Pza~C=z zGY^i$8>-E5q~pSglV;2jk*kAZnHIiWow603pNL{(!N?2{9f`?;F@(M_B=VAR?TR>> zytFA|hLWaF`o>4pcp`F%gEZI}orU~@$NVs9KMrl1WVmHeDk%L=kAiL@4GbNR?I5s5 z%62brZBph+nfc0FQY_6QIeOcSgmYNmBp)N;5&8xQ^%didqZb4|dfH%tW9z2|w=2E< zW2Wtova(cUv+Wm^mOs>uPfUG6vIFrst5bE6oHMB`>yxO#v%9y(80#JAlL|T0j6H&A zOjNsude&R{W`b6@Ojd-WyZ5!W!pjnQZGQop(^!1_DvlTV%3&fnuni3uApHt)X4y{4 zfnoP9`jyqS{veVz2do@fCt$BzmKUkFPK{bZ!!my zsPm_2t4Iy_sk2EklPROukx=AXb{0mJ`uWiClK$+lOG7O*JDfL16JMGT#r$tq$v##j z{!}Tp&kyJ2tt@r2a6~xSb2W{Xb&yD2d#mQEkYbjk1UB50Yc3CFQSe=or5w(2kAby2 z!u6W;P8Lwl9s3sHE5eC0C5E4YbvIJ03glv(OL=HkGs!K3CR$~bc~k~97Gc5iUAgO! zEvUl2iQR8P(ul9@#PC)90I(E26D<49-;ir-Y(`1xr-LqY<^7XUM;5BzIh(DD?e>9g zBY{iH;Q0{dhXHxq&z8TFXuMgOy)?A{qB~x7&rJ7RFL6~nJ2MV;Er%msrB8wA`EQzD OhBXE-vyTa(^?l@ZMFAoJ diff --git a/tests/fixtures/rsync/rsync.log b/tests/fixtures/rsync/rsync.log index cd0f2c5..38e62fa 100644 --- a/tests/fixtures/rsync/rsync.log +++ b/tests/fixtures/rsync/rsync.log @@ -1,4 +1,5 @@ -2024/05/29 15:07:45 [46235] building file list -2024/05/29 15:07:45 [46235] , tests/fixtures/rsync/rsync_files/file.01, 3072, d5116a5a40aab468780a3c03b417a8ac -2024/05/29 15:07:45 [46235] , tests/fixtures/rsync/rsync_files/file.02, 3072, 379bc3d1e529f9645bab5482bbd4ac98 -2024/05/29 15:07:45 [46235] sent 3206 bytes received 41 bytes total size 3072 +2024/05/30 16:20:36 [59347] building file list +2024/05/30 16:20:36 [59347] , 3072, d5116a5a40aab468780a3c03b417a8ac, Users/fortitude/dev/digarch-scripts-poetry/tests/fixtures/rsync/rsync_files/file.01 +2024/05/30 16:20:36 [59347] , 96, , Users/fortitude/dev/digarch-scripts-poetry/tests/fixtures/rsync/rsync_files/folder +2024/05/30 16:20:36 [59347] , 7168, 379bc3d1e529f9645bab5482bbd4ac98, Users/fortitude/dev/digarch-scripts-poetry/tests/fixtures/rsync/rsync_files/folder/file.02 +2024/05/30 16:20:37 [59347] sent 10516 bytes received 75 bytes total size 10240 diff --git a/tests/fixtures/rsync/rsync_files/file.02 b/tests/fixtures/rsync/rsync_files/folder/file.02 similarity index 100% rename from tests/fixtures/rsync/rsync_files/file.02 rename to tests/fixtures/rsync/rsync_files/folder/file.02 From 4b7ea96dbaf541a440053510bb32566ef6947683 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Fri, 31 May 2024 11:57:34 -0400 Subject: [PATCH 29/35] cleanup empty folders post packaging --- src/digarch_scripts/package/package_base.py | 4 ++++ tests/test_package_base.py | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 59a43ed..7ebb13d 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -238,6 +238,8 @@ def create_bag_in_images(image_paths: list[Path], pkg_dir: Path) -> None: def create_bag_in_streams(stream_path: Path, pkg_dir: Path) -> None: create_bag_in_dir([stream_path], pkg_dir, "streams") + if not list(stream_path.iterdir()): + stream_path.rmdir() return None @@ -251,6 +253,8 @@ def create_bag_in_objects( create_bag_in_dir( [objects_path], pkg_dir, "objects", manifest_source, manifest_type ) + if not list(objects_path.iterdir()): + objects_path.rmdir() return None diff --git a/tests/test_package_base.py b/tests/test_package_base.py index f5fe79e..ad5b120 100644 --- a/tests/test_package_base.py +++ b/tests/test_package_base.py @@ -329,7 +329,7 @@ def test_convert_rsync_log_requires_specific_format(rsync_bag_payload: Path, rsy assert f"{str(rsync_log.name)} should be formatted with md5 hash in the 3rd comma-separated fields" in caplog.text -def test_create_bag(package_base_dir: Path, rclone_payload: Path, rclone_md5_manifest: Path): +def test_create_objects_bag(package_base_dir: Path, rclone_payload: Path, rclone_md5_manifest: Path): """Test that all tag files are created and rclone md5sums are correctly converted""" bag_path = package_base_dir / "objects" @@ -338,6 +338,20 @@ def test_create_bag(package_base_dir: Path, rclone_payload: Path, rclone_md5_man pb.create_bag_in_objects(rclone_payload, package_base_dir, rclone_md5_manifest, 'rclone') assert bagit.Bag(str(bag_path)).validate(completeness_only=True) + assert not rclone_payload.exists() + + +def test_create_streams_bag(package_base_dir: Path, image_files: Path): + """Test that all tag files are created and rclone md5sums are correctly converted""" + + streams_path = image_files / "streams" / "ACQ_1234_123456" + bag_path = package_base_dir / "streams" + + # might need further testing of the oxum and manifest converter functions + pb.create_bag_in_streams(streams_path, package_base_dir) + + assert bagit.Bag(str(bag_path)).validate(completeness_only=True) + assert not streams_path.exists() def test_generate_valid_oxum(transfer_files: Path): From 075f5ba1c9401a2110cf615c3934fdfe9baef150 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Fri, 31 May 2024 13:27:42 -0400 Subject: [PATCH 30/35] add rsync to package script --- pyproject.toml | 1 + .../transfer/transfer_rsync.py | 107 ++++++++++++++ tests/fixtures/rsync/rsync_files.dmg | Bin 0 -> 25967 bytes tests/test_transfer_rsync.py | 136 ++++++++++++++++++ 4 files changed, 244 insertions(+) create mode 100644 src/digarch_scripts/transfer/transfer_rsync.py create mode 100644 tests/fixtures/rsync/rsync_files.dmg create mode 100644 tests/test_transfer_rsync.py diff --git a/pyproject.toml b/pyproject.toml index b5593e4..aaa0397 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ report_hdd_extents = 'digarch_scripts.report.report_hdd_extents:main' package_cloud = 'digarch_scripts.package.package_cloud:main' package_images = 'digarch_scripts.package.package_images:main' package_filetransfer = 'digarch_scripts.package.package_filetransfer:main' +transfer_rsync = 'digarch_scripts.transfer.transfer_rsync:main' [tool.poetry.group.dev.dependencies] nox = "^2023.4.22" diff --git a/src/digarch_scripts/transfer/transfer_rsync.py b/src/digarch_scripts/transfer/transfer_rsync.py new file mode 100644 index 0000000..a9b2c31 --- /dev/null +++ b/src/digarch_scripts/transfer/transfer_rsync.py @@ -0,0 +1,107 @@ +import argparse +import logging +import re +import subprocess +from pathlib import Path + +import digarch_scripts.package.package_base as pb + +LOGGER = logging.getLogger(__name__) +LOGGER.setLevel(logging.INFO) + + +def parse_args() -> argparse.Namespace: + parser = pb.TransferParser( + description="Create packages for all file transfer files for a single acquisition." + ) + parser.add_carrierid() + parser.add_source() + parser.add_dest() + parser.add_quiet(help="Suppresses progress bar from rsync") + + return parser.parse_args() + + +def run_rsync(source: Path, dest: Path, quiet: bool = None) -> None: + log_folder = (dest / "metadata") + log_folder.mkdir() + log_file = log_folder / f"{dest.name}_rsync.log" + objects_folder = (dest / "objects") + objects_folder.mkdir() + + cmd = [ + "rsync", + "-arP", + f"{source}/", + objects_folder / "data", + "--checksum-choice=md5", + f"--log-file={log_file}", + "--log-file-format=, %l, %C, %f", + ] + + if quiet: + cmd.append("-q") + + process = subprocess.run(cmd) + + if process.returncode != 0: + LOGGER.warning("Transfer did not complete successfully. Delete transferred files and re-run") + + return + + +def create_bag_files_in_objects(base_dir: Path, rsync_log: Path, source: Path): + objects_dir = base_dir / "objects" + pb.create_bag_tag_files(objects_dir) + pb.convert_rsync_log_to_bagit_manifest(rsync_log, objects_dir, source) + + +def run_disktype(source: Path, dest: Path) -> None: + #determine device to unmount and run disktype on + if not source.is_mount(): + LOGGER.info(f"Disktype log cannot be generated for a folder. Skipping") + return + + output = subprocess.check_output(["df", source]).decode("utf8") + device = re.search(r'(/dev/[a-z0-9]+)', output).group(0) + parent_device = re.search(r'(/dev/[a-z]+\d)', device).group(0) + + LOGGER.info(f"Dismounting device {device} in order to run disktype, may require password for sudo") + process = subprocess.run(['diskutil', 'unmount', device]) + + if process.returncode != 0: + LOGGER.warning(f"Unable to dismount {source}. Disktype report not generated. Create manually") + return + + output = subprocess.check_output(["sudo", "disktype", parent_device]).decode("utf8") + + LOGGER.info(f"Output from disktype: {output}") + metadata_folder = dest / "metadata" + if not metadata_folder.exists(): + metadata_folder.mkdir() + with open(dest / "metadata" / f"{dest.name}_disktype.log", "w") as f: + f.write(output) + + #remount + subprocess.run(['diskutil', 'mount', device]) + LOGGER.info("Device remounted") + + return + + +def main(): + args = parse_args() + + base_dir = pb.create_package_dir(args.dest, args.carrierid) + + run_rsync(args.source, base_dir, args.quiet) + rsync_log = base_dir / "metadata" / f"{base_dir.name}_rsync.log" + create_bag_files_in_objects(base_dir, rsync_log, args.source) + + run_disktype(args.source, base_dir) + + pb.validate_objects_bag(base_dir) + + +if __name__ == "__main__": + main() diff --git a/tests/fixtures/rsync/rsync_files.dmg b/tests/fixtures/rsync/rsync_files.dmg new file mode 100644 index 0000000000000000000000000000000000000000..58e8d82f7f49485b13b73fdbe4b65cefbf5d49b3 GIT binary patch literal 25967 zcmeIabzEG_vNjwc2@*nxkOX&^;O_3u-~<^6GB|@n0)Zd_f;$8VZb1hpxVyVEIKiFy zh$naF>~r>e&fWKYznk^Pn(peV?pjq(Rm)nxvAew9oriq+^22M>L+NH$v68V~JXIhr zsTaKM5z%mS9-mOvO8w>=;Eg*&OE*89b)0BzXV4UE5vKc=HNQdUl5T2CfPiw6cC4$( zadOqTr^;esiG_M+-tO}Hh&>Bn zvx@mcZo*AqtGb@lBALf4@7tj&o^*O5-pvvAq=dTO_;d!8e1b%%Wb%#OxcrF{EfEe& z5LbhE?f5hGlbBV9&Z4fJDvkJ}4V&CeXIJjkBChkp96{VSyqB)Psu7>Uql#Qb5P(Cl zklnIz54h~&j&qgV?tBSgp1tlpvV3shr$B$4YL%ku>zIqv5>n$@FNm!|moF9nPt}GuNbRIUO1SQ>!LtZ7yer z)r031t=!a6mY~3?2QV$Qn+3R+F2nO&x^ny3B{jzQU7f4y8ZT7rf^Y7;)5KoP?Erf__TII(B9IC0$L>FN zF^z-zMum{_P6d2FtQpe@?eoAtbpC%s@dK><66$lFZOBa)1c4Caxp`yWVl-o%lL@AC zAWXU(=`^(i)0_W~hWVxL=j8A6M3f2bP}5*+ZrFb?$iLXjf3@TP60zxO8Xf`M*IC*5 z#V1(Nu(iHBUZO+C^Tv4=RPfK?_*Uc3_U2p%m3F$KShhVc5d5fZ_Tc$( z7g@Cz{A%?PBaU0}FMrf_l z--ajGg0bz#?+?b~AHulGCkZrt_n^06TWX<0ja~?AaI$XKd1K#5-^Mq;<9}OyXnPx1 zIU5Kp2UobRH``1@sfIx5DQxiaTq^*+&zD(9)cSsTf$3@Hvjz5QV2*w$9q%#{1=>s636f zfTw4)J)4-z+{aBfPyKC>U=c0c`6qp|whgBp#zz1&*Z!tsfvqB3;>YKCjidXUij6pm zABNE1ob61)N@3xOgY5OK#?jRo=UT>3D#1gUs{!NZc`<{~-m}b7SS-4hmhsuGEST+i zc7{5)bTyjm;*r^Ap94`JtXwSY6cF0}p-1|3a`RB5W%gi@>tiZ%goTRfpXhq;sUyHa ziVkf<&!Nk(=VxUDWz9HYu$~21Tp}l;AH8~Rae3Vj1CCiH=oHspcsrs#RFByLFOM#e z5YKyhXJ`rrJIsi<{5_-njEApz?l<-S%C!62Ex+CJUo!abi2cK)_S+=CP4aEU{y=;nb0DO_Xng zfh;@XcWZs^Cl`G!mb5L>J%oI>FZrHF$~Xy}&BQk+X-)oB`rW}QDs zCde0e*h2cs%`{C1MOI*CWRzYB3M`ul&=DuuIs-w?n>Y$fll48<;;c%=IJUPe@|
gKV^mZ=9((1jG&eM^qX17^<1uFpgbtwZz}gy;$!m8Q9fr+S6+Gv@*My z@)+%~=S;gcZ*Hq2;dS?VevHeuC3^S(O8AXsu2D*s0cQ$*iK<+mrxhF7*N4*L(wH*s zm;D3eWS?vu~RcQ&AcF~(`#zL-2iEL$;u`+HIs0=BgfN%zUMc&?`>+xAU@gM zQ>81i<@(_F+6w$mi+gp!ZYYPof8rSa05&3;*KNmT1cbfN*k4;X zzIHj5I;v)}3c(&7Z4Xiw9>uL#eyYR7#=pHAfrYnFz;+PoKHD$netkR!b3@YwyWSg! zne5oxRe1z{#|kwP*Vm)6Du(&LIq9&PzXQ;DMHOGYlZ=pP>TGpd-m<#wIMnKXvw|+4HfSYTvfJWR^Munr|!$4Z*veH=4ZH)4u6j8EUR&i^}%bxH$Bctrg zQC5Sal_{EwY|<_bB`z(Rh;ybHmG|@>iC8dmZ1uAP#n=nZ&C>lGiHb=u;M?FUyqxY; z%U4jo*?hg5EOgXyVg$%{7tGuj(aW!r4lQ+@_Eobie9wP?gEf#Y2nRH z6JZj+FER@%uMyyzRgmZmKJ6w@9f&zJk9?}WQ4*|7t(y!-lN_6^(N->9q&8&;+A?@Y z(nYYtMU?+mAS2OK&KM&)ceBo@yAItxvzFh~J^;`)gYfb&G=3#)Mpc#&$E^z5im2?Z zF3;wWk@X5_xxIhjx8ai^%nqX49M?fs<8Wn28_IXGe6sN*+f_F{CNQZpMoLvt7OOgH zmc5pI!lZZ;*)TG|2gNJgGvdZtr6hr=Z@)u4Y`!5RudCz9@sf}D8M5O91(kKC=yN2_ zuw23e6V9Ya!Y9}T3Ih16wU}D0LR%a6E6)wM`+}&$(Hb8`Q8x{_B|fN3C7v9kMqEIG z7_T{_E}t%E2YB_qxfC8Zuh%SR9VWP*+*InO&)PsdPh2NR)WUR`Q3Lw`pl;b@CWaQCfydr42>&1-4b!$r@|MeG~2(b?-h*9l%iM(9K|IM~`- z@EZ?S%Wn7i>pmbY-*i%YHJa=)5p7+t)5)`_v;E4bD>?Ha{#952B}C8tWfPs`kVrL|230r&BWzA2gc zJFfAvonfn&eF0oj`RM@aK8s0Q)^x|Dh7{`2&DkEk6eV9e7a?C#&3s#@M-P}F6zRu$ z1)LP5-K@ssofuV9YqB$agY0T3FVjgt@WWbrb+&B$c6;lr7Fdx@**ERJeW=6zZUyHFq+o%nz^;Cjr9kklI8X3V! z*%ZtOt;8;K_+ZG}6I+D6i%!duwDvAEisjf}wQ8plH4GPNuG81Chl^ywVPsNI2`jfsIyq z-(}8m%dNzy>1EY(Ua{6KQSvhaeQIH|%*mqFoSt||PBCU2iVlYeJ3c!SmG?acs1>m$ z2#hWHk51XKq*XvIY_dCXj%7X_kz4DFZsUl4oLr^;Z>N@w`A?cOmkQg0#7aT*Pw5?| zO%zA^`u3Ag!;A@%I_q_!Vq~1s4=#&J!PstjyVIVIo9*ueLYXxCh^Zd=n^f_%PQD$A zQLS0tY9J0D!yu89c7j|-Ml@;607v1a1F z!%}6wJDIl;IPAOVd8L&`jqF7@@^yxBm22Bd_nJkoHS4Mq3Ot>cic;=PdCb12oVH+2 z9&lUbIBVNzmRlVHFfoIV)>fFQ}Ue*eCOgeP9 zqp5tajgnqk3fW&M*#H{d?6;owu6cv*0G-^VdKKuRS*AHt+b=wQR|gK@vS{j|n3^swZ~A=481eN0G~!0X z0n?s1vm~Kuv8FL4W{v{p(3P2pY5H-6=n>s2%Slcc zHuUygZ8h{Fkcz-l_POo_34J-ic&;|wXzy{qP`&^qkY!b4NEmty=}D^-qip)lwl=8R zIhl|?|Jw0}1(_VggXGS3T$rLFmqFwc9Pu|H>t1s3;BKb0vgtlhvEBm{YppR-h61C6 zip2c|G_%x8iK|$|RTOSL?c$P3<8-!ZY04}BvaZoe*8Y2XMYv^7MVY502B)U^e8owp zVMO}g?GOqS4W@|$2|Awl+j9X)YOk=g~Qd}FiLxU0KOxqX8$b0(LBr}Za&JZ}<17yCp4-#MnY zxkNIX_J-${H5p;DYdwy_F^-K8EOW%22_f~1-6#}vVp3}xiFmVJSu~Z~g{8yR+ger2 zK&pU&*Ih4;9Sa?Y^ z+T$xQc{STx`VMAB05SKd%_g96Qg)QMKWzsGuG?}Bhe@76X`Nmk!J#k*+JXB*iDZ)!>?uY z^)(TPQKBHGi*>XoyHj^&DPSsJoLiyzw)(tcfg^UU)PF+iN-^lEL41d6AV`K zR}LkMLjhF64B&PQ;|!F2lY=SY?Z@)XgWLNlB=%z{PcvGvxYf?&pXd#R5{S#^_3cVx z?Ml4r1BQdVSJ0BHIv)ANPFQMpy)y;(((|eYPwKSDhg=OI5tZ}azLV$~wYjzKBnGZe z4+{^_eobYj_BP3#xG#U7*ZaDy$Bi>Y6&=9_t6J%UX3bj;3}B8tXv?Xx@n^P^!XGXXOW)K)&tyGX)Oa(1Ow2Llo zuQ{yNA4R>+bUe0jtL?4__DyVCrTqp04#medPF6kmMh2citJwh78&h1)z>|w?doD1^ zCsLcqzv=DP)N;^|czZLqrEAB}Y*S+aaZTj|=Anf4K(&s28fDn{T+wj~&Iu94Eo-K< zB2g|Q>SSAP0*Hp_YcFW@mB`B2*UCG{2BVFHon`8)jVFA?Q5Y_grn!|xg3D2fq%Xw=JRqO&z^g$>MMejh0q=9ka43Mb?T%G*T8w>|i?YlQ?=pLe0F@<3cOW z0xL{nfNEn3Y}0fxRDr0+f=4}O@47&~Il-49Dlp=r zYx_Bw7%#_CZwNsfX;vl)lsJlUkE@;uD4aUJJ_=)<%Zw)kvlt*?qEwetv>*5qdX!`| zCZypA!{sX45f(XvC8jxMTKZ0vPX_B+p!9+48IsB_h;|IGy+aCoxW^d??Y*Xk;)R)1 zrDW6!^2mX$s+9t{Vy~Q!z0%im?z$;L&o-Vetp==9X(N-$PaF@h`llWbzXWv67SI&3 zVuV7~#4sl&(_L)mN3(q1nxwXx_u<{?*{FC>afB#M`IIwMhgoe3$FV&)fydF`kx5*% z)>y&)q}@21BtAoJNl{9wTFnG4uPv5QA{w+top72U+hX{bbQm9f#^xBhU1vPRjb6}T zEqcG*Ofu1s;V$VccuI{9dki~?MJXoCSS5iJ6Z)j;#QC+sRJ+<(n*jAqirkxybuZ{A zA!(0G>U%rmlC-jseVm5x;F}Z?P`K5@lKVt=iYpq@qZ?D!8=-yGt0Xk2EFx2Ds=zRN zOx89jBNMVvQl3@^O(hc3mvwO}*U({EMsYWc$~oUU1(0lCze{m{5V4*SI&Nn2*1?+R z$jPeskR+HR6^V=BP9vCNCepKLa*{}*<8Gav)isP91?L1Vp$#YE&@+8Z)XEfHxg9^$ zHaw;?KN_D*1jn5&jX6i*m`WosjTsZytiGB2v4|y)USqE`Ik>ezSbs&nLbp^HXWR(5 zuQ?H=lW%_-4eUOL0&i8SF3UGgwoZ=KL1f`xj3YQ=KFUgXC%ciR@0Jlti?`b$a5Az9 zP$}NlL`G1yV+`g^mRu)H3UbJN9O7TrfDd+}aYnIt_=s(plH_Lpi|h9) zN<;&v@06QsEEEiPD@h%blqFlfq6^Hui?Q_bWNII0@U?~cK7$kI`?rcS6y>s}_^q&{ zfq}4afP+ed$lGD90+~!?l%YimcpBw*j|!V4p&Ft1txh_RG)TeZ#FLTy}#sOhx3G`v##x&%A;+YqRVou1mZrF?j}I(oqEDwH{MTF&#uo7j1x4J zM$t4*t|Z!T8xlD+q2Dr$#(7x|@x8w{A7|JzY0ymyFi2nuxt9Po&@%iWuvIX2cPf7~ z-aiSkL~CWKPc(;n67}GH+EiHnf>Eo>{_v#+JKywN4EDCU3^p_oy8vi*qI<6);F=8 zYlUPjKpEs9OCk|^nONf)uSJzL>G-@A0=7^>jnVL-BO||Wu!KI(KV`V2= zoPe^hV_5421p>Ru}@=OYW#o;Y1phl;B;(-WxtiV~P03~IHCdf-q#O~Vn=L=0zri0Mt!0n5I!7kAMV2z< zYf%lU90jp6^_G?y@5=+@gS#ihhN1+qOyu&qt?6UMT1->-PG>8Gx*i!j-_cg1j^gBJcBH39 zsT#*V0+qv|;t12WT`DaLoK^0&DT*dg^uafIw3wn6W=AadaAMIu9kY5LM?c=dd_Xf1 ze=mt;Ktd>R%+RA^B4hWwj z+r(k-IqyblyBA#M9`CH=V8_;i4c%Of+eXs1y>OwN?Ncn8elsK^tgU-*$j%E-qx|y- z7W-fE!hCksi-)(i>Q`u)Oq)>IP3_rpz>P^ksMAQW=8CBik zGOI+DG+yMxQrUNk%sOlUgJTV|b=NmGZAH9teVVHm{r!qY=gBp~3<%@KSnH1G&c_-6 zjStwFh+@HxqAbPhPcUmbNJBifDl^RiuhS=YPhUSfj$;Kg@jHq=io0Rp?Jm)Y5$RBs zF<8}Lw@7K9p~4VuZq%DRQkQ;Xv)5^IeP}4UX)bnIc6kx^O~iwlI7S=?;CIms&!xRXKw@+w>o?8GIM z6dJlw%$4Mlty+Je*<==kjIJWIyHN#=U5N!jJy$`P@o@}W3&$=hI)j0xW58AwGRZ^? zx=_okNbjny7Os?W=_jss(;Y{t6-m_TGroGU3S#jQFDPCH)JLdsjj}VoA2cB}6$Xdr z9#S2Y)_Iq!Hj_g@{e#J(Y}?BN!zf2u^UP8Xd4o`wd<;itfGXjzUYe_cfZpVGU27Jx zWF7h~V8;;sMH-8gSI+==Ma0(4zct;tw#Jrg%})^EE2$VU8{W%^A)0et^gMa#C@M)n zkX#^(WZ&gr07c%hrBd4WTt&L@dF_^UDOq_tn9Cya`HSFFwaMX3HZdy`YUh%K6jPY} z+>=r-st4QmeL{9^N=T>nw*zR8)l53mnM397k!@uavVt?bDnlgIpB_lJ0mP5m=%GZb z_|mz<;|F(!*?SBX-)T2_DkV-}nqBOy0JX2e7jj2w#JgWuGYcq)a50d?v0DHNjpnI( zHaNor)RXQpu(C*-6bQFLWrbw4Sz_-TU}+cB1`l}GL7S_@qA}m~BRQD%;^R!lMB1gM z<(+UDfQ*;lI#!bPT+Ej5=l#>gG@4G`{Z`fksV#JeP!RLTHSe&Tj2?Kalo~JXy^odF zC-5PGRGcj*uV9Pf<`jqb8|o2rxscRq6He1b8g&?8sW27A1igJpBUm2Imd=R6#5U;w zTMvG_-Gf^>y9#^~FH)%L~Crggb z2V+&mrF!&C6AnRqmU@g9d?MEVvMPR(vB8iA`TAKCVt_e|!QM29(DgcJsST#YqBtCm zn>gU|fjAS=TiQtohP_%`yHB6(@;!6~!LM@dtd}1sHDd^OpdZ;wYl>jE$#^eTn|tg= zPPeECKJO%^w_T8A&6Yoox%%9Gh@zQB7!#R3Z2C|j z#ziIBz1eO)eA@ppO6vBo%;58*9x1xmE%?aA6aNKH!&m9ZocUx|<(Oi*JNU^+oiv3x z`;b#79fDF^JhP+MbFx*eUwc!^M`FviBw)5SWg@T@T!a>Z+EjJ_dQSpv8-i$w2 zm1*55Drz4CrCg>hX`R=-N9ooex1xWr$WxD%#ey)DNfnc<(anxWbsZ0xRtMxp*NCY1 zL8L-t=LPdZXz8gSAx53kYSp*4m0=hyV|JozqSDN~D#>a+I9H8lYLVN3hvNog%wvGy!lSnvw{A7vk1{XSkGg}=DS7ftXpWDtDMe0o?-~SJCvQDC_>xO4g;ujgR1;N_mGLnH|CCYnF>jJ! zQ30OJs=kve=Qzw&CERBW^mlY1d>u4Y+osQsLx2isG{G?C z*w-HOS3<|G6-1J-Gmj5V(8Z9&=)OzE53th?41=mYl58#DcZw#85K(`rg)wEU@qxA( z-fM}RB+dtP8Z~rZ$H~;k&I3yups-RTv?N{ooFHMcBfvN`de!n>d><&C6Wgql zy*SE&;l_SSvWLFf*cP^g&Oi)j{wP-Vz)YT5yjIlidb``hAiNoVLiVb8o1r7l_5^c> zV)fX4h2S!Y6#?xfvFQ&X)1}iZZMt#%!Sd9R={i|e3PsfNn-$D@cdC5WkM~X8z~#H; z&Wm^QF-_Ih%^D>(TJS_yLyV2Y@lBQsT_0{%>aB%n4y;N8&Wl`!@b2mcFic#{T~Xb< zwK`!Yni4Wx*+$|Tp~I+})WnVV)Fsn&Uxcl?BfWz^$Tp*%{ARNHqwrESHlyZx_sCRa zaRy;lIAjkgp^PK*V+e})0`unlfX^&O(5QGOlg@w)M&G<~vMR*vWk#t0eoISR5?T=Z zlD;}6>r@=;YLavAfd!s>!KH)Mn6AN|v6y_UAY=GhKlua5iT~Xj@l?122T7ZXTbpHtj$p70+9W2`f7jJra$}e)g$IDDjGxD0d8*CF!g$`tnaEA z#i%vS_U70m7Wb-US&`7%O3E8hS0a-K_JfD%XGJC>XpYo&isbRjrq>mhpJw3IsRYQN z=kjk5zg)RKF`faeZPjaT2b=ArIxngZg=CzU>2T0(B6@5eK4MKmhFeF3xO>)K9-Atq zL-t2%x#i!RvBxx8F`a(4XxX^$l?uvH9NcEU7Rj_3z7t>nelQg z;$is-_=Qvz>>i)I)99PVrA9wzSsfeHb8(f9L0e`yu8EQONXoM_?`L7h;sb$MyfYnr z`$(zY0Rm3Ll>?r;y)FsClsB2!F*T6-oQjMvTD&+Yt2rm~Wi+<8qHn0gt=^REq1Ooa z+&MXDNw>N;fl28Xes(w$Hds#JT*ZCUF45}dtw$QE`ujVpY&x^I7i%4#;{!8zVtbQn z4m@=lwI3wJO52Lk-VeVeAS8DEcH~2`9kRW^l~<|qLe}A%9bkt68_-h^_l>sMjtCU( z77(&nG+%)kyTn5B(o^Rgmoa+Q=zgA~00q~w5B2rvf zC|VGSy0M6bIMQHc!9k^FTI@5xZ21k(dGQ2hx11aYkq>MsA>2m0bkZn2KVy3d7vFYMir!83&EX9RNXISlGLxc})MPfA z6O%k47ozhvmWB^;U5m5YpzR)}X!Mx3V6%oX}sI?O-)&_k%=^>Y^JD{QPX%D zf#r6+)CW$-iy-cCJ8Ur7ko3WM(e`BCyTRt3tZ}MNT~v>R%P!fY5PO4`zD=P~VeX0M zy6sUnh<{F>G5&FbS3lNFg3Iteq{jjfNNOcBqdF1=tPiIajL*}`{m7}76hA) z_7>eo%kpt!fDtPCjtzK`2V|t&3P9MdF^q z=~`B!aM4)q`8^g}{1Fb1+Jl}0%t#q&NPZz8lmeQ9Kl_T~`VtWx-Kv98s;S#c=5E; zP=!7ITwbnHD^8lK64jt!#KM&F<;nP)aZdmKJm`H{4nMJ)QvbzyO~!WlUM6gdm)Tw1 zbDWuKJ39TT3ML=$BYO}ag&ou}6GBP>K3QI!>h}qp&kOqk1Zx~vqWFPJR={pRDnivZ zZBu^QY34xRD;uR8HvrDiiZ$|s%ni#ULcE5U20M*Xvk_MSny7-9!B)V5x<^Z<8;yx1 zf}tphg639S2W&0lK6SqgwJk^`xn=l7)j+w=P zsqGLR9oo--x*Vc%>`bSKWbY!Ln7;&Vn`8>>uY+>|NGZ;OParSex+gPY1qcY&Ipn5Q z^o&&wuWvQZ$Pw(=`g!@8s!cg`qH4Sv8hqP1kM14Ln%74*jhf0-x`UmK?-Ng!4br*n zl1g=*+DeV&kZ3KZ51A69Y_$}$vxN?h?^q`5$5XFCv#L=AE1p%}P)%G}1CQ7W3CR{p zIGBJZ_8+UU=Q}9X!$<<%*F2JNhHtLSH*_{tf|O>&tB9Vc%jt)oWXh`28E2h`iBsGi zquBFBdg|9x6Z8O=B zaFub7&60{EK;F+=AsMSWnP`LjP$1ZWe8XU$_j!WGc%EJ*T{10}Uss~p21?3@{N@tZ z%~?A`4)IH7b;$HwrUxloNSHDF_upd6Pqje=!LAC?`ERgWnJTmyEMh57Ck%-X#P)P) zT|A9+4X(V>&jI5U4u?xsO!UvI$R@^Z#yPaTtEsK)Eq*R{vJe)cC~)WEAe-a_&}pOD z9lU$eUq-Vi4V_HlwkDY%>umCBVTx6XH+G1$S=8g}Y#8p^M+#7&2u3%%H811b z`--;IS_JyuCYA)$b)OwRbRtm5&-u=_(<>vk(1p5jE(T2Zcre!NL@|X{zjZ#7m8*_x z`)V%2j##un|1LFklazmj%L=B+-f}lfYK_LESkjf9*D`f-HC;5>oNWgi(7Y@^n7JT$+DM?0;DK7^Y0h4_7v~+j&YD zJ06X%6>P+fBHxHDvzeftNI~tWWPdt3)3BG;KL%X%F!m$nUcDc0dKUeFnM)p;b~-ck zKE)I;Mc=3ZJ)h3&an$Ifqc4jm%jv{9esBOsq{Yy`Bzs>X^>i(}Qn_%}G1KJ%C_nTF zc|Vhx&()W&PjWZ$5M$`Ff8ZI5N9+-Py(aHS!S{Z$zn2H#Bl#{8=z>m(9X4e!+FlO zGEz5B^Ds`Lbh$1%q{-cAT+2xpF*TmLrd1&$mZGdi$v$V?=<3O&g-(!2r$=oDO zPX~u#)~fu9_33t!#fPat_h$L~?AI-&J+mS_?M7Z;d-dCQI;a5}*Qr!c_B==H5TWi) zg`wFiDr|K@hPt+}pd1IY-L@|CiW*%Pvxxb{<(}LY)|aP;j{qCTAJ%gz=ENB1K@||S zx?+d@inWqa_ZEPB1~*rhwNW=FQ!>mFFECE2dpAQakaomJ6X|4iwJE&milMx{KckS%oFxKJ0VG$*6O{|RrRU<{fWjr zU?jnLPf8Dgq-T9Ma6oC!hXo=$Lz^nIIipJD4bV6RfZJPeW-MƟ=G<1+jy=X2Oq z<#l^_yURP>`cDj>Jkfj1&`tm3lAG{e5em+Lnf!})cnn@0?{P2TS6$u$;!*q&Gk3%@bu=FXzzwx-)ezF=5fZo z$s~;^=6*PTIf4c+Zt)8Gm3Kl@NbgX!mGRPG}fkQcf~kP5E+Sa2mV5n_)WG) zBA4xNe~8E8qu@O{cysy(MJ~R=MCL(%(KcVMbtzr;aWns=OU(0kgFh?6&jNRT27}%5 zdh}fLuuH0pn-6gL(&8)l^^dmsf9P*AIJoUVCb(drrIn$XDJMQH^<#WopsB8zo}sA$ zC%&SB5ETnPHy0uYo`5vJf{KhFuDOY!l{K!6BARNLlB5)W3?YS(0R%SMqxtR^>9fg`Q&ifa37QXqDuj7YwaI9 z)VrvE(ilMpVB(<7`|%R|bpJ0mz8~wKROIC~5dM+)XXP&fq91t-3{=g{EoJ|VotO8E zz@Iby*!uSh@Ecr=&G+dUB!AT6Yx$G4zNsW@`;q(?sh zCk76h?@s#!E>VFWJ^BSjRJ6bF{(lfo7g~G{mH&3I3(F|riU76sE>KP?V`&J|wzS9n zfyRbQ`yT-McR2nD(7!W%^~fK7zqs%-(2b300mim};Q5L0&wTz9^T*2HEASVAZfW&R zUj_ZY2k5^_oc{plKcMDcLek%haT#q(YeVacxe@nclKvU%3fej*z>Azt_aA`z_gwV7 z^c`1Ua{5nA-+cO0;U}mwiCXFF{S%?T7r8&_`j09dA)$0hx=n$r@p1bzfI{l5=v(eLmR{=vXs6Xq9S z|C4n5QSgh)KIiZsd-_M^f9B&q$o;(npON^{I==??Cu@CIN&JV!iOjEZxXP#J`7ZE> z$NNP1E5N4v*Vih4j+?&^zj}sN#<&-Y5d+{?jYNdxFM!PO?*m!J0iXNelb=fc8_#yzb`Ofrh%S12mZIQ^w z`lXAlr{Yxcb^4*5qx~i-^cv1^z|9O(*8>D(f{kg_b-C}?~B5( zvG-*bVf?p)PlWEP33+)%z8Amx@XMC>^Um+b=0BFdxa5D$Eb_@(-*6~v@O>x1`wJlR z^6Gvv**Aed8t@b4uK=0hUkCD^FkHs+tUxL=q3O#eQv>A$C(uW9Z_ z2Yi`Sero!)OMb%j|GH(rto>))ezeWswd`m7*Kz%ShNr(5?_bYye!%$`5*se_zYlbp z??C@Hw|?~27ur8M_J>jaUV(qMzx`ybZz{WvfmKd9|T!{VN3Q(pZa)ZydMUwjMylN$dYPZ6F~ literal 0 HcmV?d00001 diff --git a/tests/test_transfer_rsync.py b/tests/test_transfer_rsync.py new file mode 100644 index 0000000..078e092 --- /dev/null +++ b/tests/test_transfer_rsync.py @@ -0,0 +1,136 @@ +import shutil +from pathlib import Path +import subprocess + +import bagit +import pytest + +import digarch_scripts.transfer.transfer_rsync as tr + + +@pytest.fixture +def transfer_files(tmp_path: Path, request): + fixture_data = Path(request.module.__file__).parent / "fixtures" / "rsync" + shutil.copytree(fixture_data, tmp_path, dirs_exist_ok=True) + return tmp_path + + +@pytest.fixture +def args(transfer_files): + args = [ + "script_name", + "--source", + str(transfer_files / "rsync_files"), + "--dest", + str(transfer_files), + "--carrierid", + "ACQ_1234_123456", + ] + return args + + +def test_requires_args( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list +): + """Test that script requires all five args""" + + for i in range(0, 3): + # remove a pair of list items (arg and value) for each test + part_args = args[0 : 2 * i + 1] + args[2 * i + 3 :] + + monkeypatch.setattr("sys.argv", part_args) + + with pytest.raises(SystemExit): + args = tr.parse_args() + + stderr = capsys.readouterr().err + + assert f"required: {args[2*i+1]}" in stderr + + +def test_arg_paths_must_exist( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list +): + """Test that script errors if path argument doesn't exist""" + + for i in range(1, 3): + bad_args = args + bad_path = "nonexistant" + bad_args[2 * i] = bad_path + + monkeypatch.setattr("sys.argv", bad_args) + with pytest.raises(SystemExit): + args = tr.parse_args() + + stderr = capsys.readouterr().err + + assert f"{bad_path} does not exist" in stderr + + +def test_id_arg_must_match_pattern( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list +): + """Test that script errors if id argument doesn't match ACQ_####_######""" + args[-1] = "bad_id" + monkeypatch.setattr("sys.argv", args) + with pytest.raises(SystemExit): + args = tr.parse_args() + + stderr = capsys.readouterr().err + + assert f"bad_id does not match" in stderr + + +def test_rsync_completes_successfully(transfer_files): + id = "ACQ_1234_123456" + source = transfer_files / "rsync_files" + dest = transfer_files / id + dest.mkdir() + tr.run_rsync(source, dest) + + assert (dest / "metadata" / f"{id}_rsync.log").exists() + assert (dest / "objects" / "data").is_dir() + assert True + + +def test_rsync_fails_gracefully(transfer_files, monkeypatch, caplog): + tr.run_rsync("/nonexistant", transfer_files) + + assert "Transfer did not complete successfully. Delete transferred files and re-run" in caplog.text + + +@pytest.fixture +def mounted_image(transfer_files): + image = transfer_files / "rsync_files.dmg" + mount_point = transfer_files / "new" + mount_point.mkdir() + process = subprocess.run(["hdiutil", "attach", image, "-mountpoint", mount_point]) + + return mount_point + + +def test_disktype_completes_successfully(mounted_image, transfer_files): + # source make and mount tiny disk image + dest = transfer_files + tr.run_disktype(mounted_image, dest) + assert (dest / "metadata" / f"{dest.name}_disktype.log").exists() + + +def test_disktype_skips_folders(transfer_files, caplog): + source = transfer_files / "rsync_files" + tr.run_disktype(source, transfer_files) + + assert "Disktype log cannot be generated for a folder. Skipping" in caplog.text + + +def test_full_run( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list +): + """Test end to end successful run""" + + monkeypatch.setattr("sys.argv", args) + tr.main() + + pkg_dir = Path(args[-3]) / args[-1][:-7] / args[-1] + assert pkg_dir.exists() + assert bagit.Bag(str(pkg_dir / "objects")).validate() From 7adbe71e3375bb3e6ec0af8984670fe65f206652 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Thu, 20 Jun 2024 13:21:00 -0400 Subject: [PATCH 31/35] move to single source folder for packaging --- src/digarch_scripts/package/package_images.py | 34 +++++++++--- .../image/{images => }/ACQ_1234_123456.img | 0 .../ACQ_1234_123456/ACQ_1234_123456.001 | 0 .../{logs => }/ACQ_1234_123456_process1.log | 0 .../{logs => }/ACQ_1234_123456_process2.log | 0 .../image/{images => }/ACQ_1234_123457.img | 0 .../ACQ_1234_123457/ACQ_1234_123457.001 | 0 .../{logs => }/ACQ_1234_123457_process21.log | 0 tests/test_package_images.py | 54 +++++++------------ 9 files changed, 46 insertions(+), 42 deletions(-) rename tests/fixtures/image/{images => }/ACQ_1234_123456.img (100%) rename tests/fixtures/image/{streams => }/ACQ_1234_123456/ACQ_1234_123456.001 (100%) rename tests/fixtures/image/{logs => }/ACQ_1234_123456_process1.log (100%) rename tests/fixtures/image/{logs => }/ACQ_1234_123456_process2.log (100%) rename tests/fixtures/image/{images => }/ACQ_1234_123457.img (100%) rename tests/fixtures/image/{streams => }/ACQ_1234_123457/ACQ_1234_123457.001 (100%) rename tests/fixtures/image/{logs => }/ACQ_1234_123457_process21.log (100%) diff --git a/src/digarch_scripts/package/package_images.py b/src/digarch_scripts/package/package_images.py index f2827ca..392b4da 100644 --- a/src/digarch_scripts/package/package_images.py +++ b/src/digarch_scripts/package/package_images.py @@ -14,23 +14,35 @@ def parse_args() -> argparse.Namespace: + """ + Parse command line arguments. + :return: The parsed arguments. + """ parser = pb.TransferParser( description="Create packages for all disk imaging files for a single acquisition." ) parser.add_acqid() - parser.add_images_folder() - parser.add_logs_folder() - parser.add_streams_folder() + parser.add_source() parser.add_dest() return parser.parse_args() def find_carriers_image_files( - acq_id: str, images_dir: Path, log_dir: Path, stream_dir: Path + acq_id: str, source_dir: Path, log_dir: Path=None, stream_dir: Path=None ) -> dict: + """ + Find all carrier files for a given acquisition ID in the source directory. + """ + + # Optional args kept in case process changes back to multiple source folders + if not log_dir: + log_dir = source_dir + if not stream_dir: + stream_dir = source_dir + carrier_files = pb.find_category_of_carrier_files( - {}, acq_id, images_dir, IMG_EXTS, "images" + {}, acq_id, source_dir, IMG_EXTS, "images" ) carrier_files = pb.find_category_of_carrier_files( carrier_files, acq_id, log_dir, LOG_EXTS, "logs" @@ -46,6 +58,9 @@ def find_carriers_image_files( def validate_carriers_image_files(carrier_files: dict) -> bool: + """ + Validate that all required files are present for each carrier. + """ result = True for carrier_name in carrier_files: carrier = carrier_files[carrier_name] @@ -66,7 +81,6 @@ def validate_carriers_image_files(carrier_files: dict) -> bool: two_sided = True for image in carrier["images"]: if not re.match(r"s\d\.001", image.name[-6:]): - print(image.name[-6:]) two_sided = False if not two_sided: LOGGER.warning( @@ -96,6 +110,9 @@ def validate_carriers_image_files(carrier_files: dict) -> bool: def package_carriers_image_files(carrier_files: dict, acq_dir: Path) -> None: + """ + Create packages for all carriers in the carrier_files dictionary. + """ for carrier, files in carrier_files.items(): try: base_dir = pb.create_package_dir(acq_dir, carrier) @@ -114,10 +131,13 @@ def package_carriers_image_files(carrier_files: dict, acq_dir: Path) -> None: def main(): + """ + Main function for packaging images. + """ args = parse_args() carrier_files = find_carriers_image_files( - args.acqid, args.images_folder, args.logs_folder, args.streams_folder + args.acqid, args.source ) if validate_carriers_image_files(carrier_files): diff --git a/tests/fixtures/image/images/ACQ_1234_123456.img b/tests/fixtures/image/ACQ_1234_123456.img similarity index 100% rename from tests/fixtures/image/images/ACQ_1234_123456.img rename to tests/fixtures/image/ACQ_1234_123456.img diff --git a/tests/fixtures/image/streams/ACQ_1234_123456/ACQ_1234_123456.001 b/tests/fixtures/image/ACQ_1234_123456/ACQ_1234_123456.001 similarity index 100% rename from tests/fixtures/image/streams/ACQ_1234_123456/ACQ_1234_123456.001 rename to tests/fixtures/image/ACQ_1234_123456/ACQ_1234_123456.001 diff --git a/tests/fixtures/image/logs/ACQ_1234_123456_process1.log b/tests/fixtures/image/ACQ_1234_123456_process1.log similarity index 100% rename from tests/fixtures/image/logs/ACQ_1234_123456_process1.log rename to tests/fixtures/image/ACQ_1234_123456_process1.log diff --git a/tests/fixtures/image/logs/ACQ_1234_123456_process2.log b/tests/fixtures/image/ACQ_1234_123456_process2.log similarity index 100% rename from tests/fixtures/image/logs/ACQ_1234_123456_process2.log rename to tests/fixtures/image/ACQ_1234_123456_process2.log diff --git a/tests/fixtures/image/images/ACQ_1234_123457.img b/tests/fixtures/image/ACQ_1234_123457.img similarity index 100% rename from tests/fixtures/image/images/ACQ_1234_123457.img rename to tests/fixtures/image/ACQ_1234_123457.img diff --git a/tests/fixtures/image/streams/ACQ_1234_123457/ACQ_1234_123457.001 b/tests/fixtures/image/ACQ_1234_123457/ACQ_1234_123457.001 similarity index 100% rename from tests/fixtures/image/streams/ACQ_1234_123457/ACQ_1234_123457.001 rename to tests/fixtures/image/ACQ_1234_123457/ACQ_1234_123457.001 diff --git a/tests/fixtures/image/logs/ACQ_1234_123457_process21.log b/tests/fixtures/image/ACQ_1234_123457_process21.log similarity index 100% rename from tests/fixtures/image/logs/ACQ_1234_123457_process21.log rename to tests/fixtures/image/ACQ_1234_123457_process21.log diff --git a/tests/test_package_images.py b/tests/test_package_images.py index 7df6692..0c06d5a 100644 --- a/tests/test_package_images.py +++ b/tests/test_package_images.py @@ -19,16 +19,12 @@ def transfer_files(tmp_path: Path, request): def args(transfer_files): args = [ "script_name", - "--images_folder", - str(transfer_files / "images"), + "--source", + str(transfer_files), "--dest", str(transfer_files), "--acqid", "ACQ_1234", - "--streams_folder", - str(transfer_files / "streams"), - "--logs_folder", - str(transfer_files / "logs"), ] return args @@ -38,7 +34,7 @@ def test_requires_args( ): """Test that script requires image, dest, and id (first 3 args)""" - for i in range(0, 3): + for i in range(0, 2): # remove a pair of list items (arg and value) for each test part_args = args[0 : 2 * i + 1] + args[2 * i + 3 :] @@ -52,29 +48,12 @@ def test_requires_args( assert f"required: {args[2*i+1]}" in stderr -def test_optional_args( - monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list -): - """Test that script requires all five args""" - - for i in [3, 4]: - # remove a pair of list items (arg and value) for each test - part_args = args[0 : 2 * i + 1] + args[2 * i + 3 :] - missing_arg = args[2 * i] - - monkeypatch.setattr("sys.argv", part_args) - - parsed_args = pi.parse_args() - - assert missing_arg not in parsed_args - - def test_arg_paths_must_exist( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture, args: list ): """Test that script errors if a path argument doesn't exist""" - for i in [1, 2, 4, 5]: + for i in [1, 2]: bad_args = args bad_path = "nonexistant" bad_args[2 * i] = bad_path @@ -107,9 +86,9 @@ def test_carrier_files_found(transfer_files): carrier_files = pi.find_carriers_image_files( acq_id, - transfer_files / "images", - transfer_files / "logs", - transfer_files / "streams", + transfer_files, + transfer_files, + transfer_files, ) carrier1 = f"{acq_id}_123456" @@ -127,9 +106,7 @@ def test_acqid_not_found(transfer_files): with pytest.raises(Warning) as exc: pi.find_carriers_image_files( acq_id, - transfer_files / "images", - transfer_files / "logs", - transfer_files / "streams", + transfer_files, ) assert f"No files found with the acquisition ID {acq_id} in filename" in str( @@ -143,9 +120,7 @@ def carrier_files(transfer_files): carrier_files = pi.find_carriers_image_files( acq_id, - transfer_files / "images", - transfer_files / "logs", - transfer_files / "streams", + transfer_files, ) return carrier_files @@ -280,4 +255,13 @@ def test_full_run( acq_dir = Path(args[4]) / args[6] assert acq_dir.exists() - assert "ACQ_1234_123456" in [x.name for x in acq_dir.iterdir()] + carrier = "ACQ_1234_123456" + + assert carrier in [x.name for x in acq_dir.iterdir()] + + for x in ['streams', 'images']: + component = (acq_dir / carrier / x) + assert component.exists() + assert bagit.Bag(str(component)).validate() + + assert (acq_dir / carrier / "metadata").exists() From 402384116efd817e9716279ef930717df431098e Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Thu, 20 Jun 2024 14:01:14 -0400 Subject: [PATCH 32/35] formatting --- src/digarch_scripts/package/package_base.py | 20 +++++-- src/digarch_scripts/package/package_images.py | 7 +-- .../transfer/transfer_rsync.py | 28 ++++++---- tests/test_package_base.py | 52 ++++++++++++++----- tests/test_package_images.py | 40 +++++++------- tests/test_transfer_rsync.py | 7 ++- 6 files changed, 99 insertions(+), 55 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 7ebb13d..908688e 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -115,8 +115,16 @@ def add_streams_folder(self) -> None: def add_dest(self) -> None: self.add_argument("--dest", required=True, type=self.extant_path) + def add_transfer(self) -> None: + self.add_argument( + "--transfers", + required=True, + type=self.extant_path, + help="Path to the directory containing all transfers", + ) + def add_quiet(self, **kwargs) -> None: - self.add_argument("-q", "--quiet", action='store_true', **kwargs) + self.add_argument("-q", "--quiet", action="store_true", **kwargs) def find_category_of_carrier_files( @@ -254,7 +262,7 @@ def create_bag_in_objects( [objects_path], pkg_dir, "objects", manifest_source, manifest_type ) if not list(objects_path.iterdir()): - objects_path.rmdir() + objects_path.rmdir() return None @@ -292,7 +300,9 @@ def convert_rclone_md5_to_bagit_manifest(md5_path: Path, bag_dir: Path) -> None: return None -def convert_rsync_log_to_bagit_manifest(rsync_log: Path, bag_dir: Path, prefix: Path = None) -> None: +def convert_rsync_log_to_bagit_manifest( + rsync_log: Path, bag_dir: Path, prefix: Path = None +) -> None: # check for manifest new_md5_path = bag_dir / "manifest-md5.txt" if new_md5_path.exists(): @@ -325,7 +335,9 @@ def convert_rsync_log_to_bagit_manifest(rsync_log: Path, bag_dir: Path, prefix: if not poss_md5_hash: continue elif not re.match(r"[0-9a-f]{32}", poss_md5_hash): - LOGGER.warning(f"{str(rsync_log.name)} should be formatted with md5 hash in the 3rd comma-separated fields. Skipping this line: {line}") + LOGGER.warning( + f"{str(rsync_log.name)} should be formatted with md5 hash in the 3rd comma-separated fields. Skipping this line: {line}" + ) continue manifest_data.append(f"{poss_md5_hash} {poss_rel_path}\n") diff --git a/src/digarch_scripts/package/package_images.py b/src/digarch_scripts/package/package_images.py index 392b4da..92cd214 100644 --- a/src/digarch_scripts/package/package_images.py +++ b/src/digarch_scripts/package/package_images.py @@ -29,7 +29,7 @@ def parse_args() -> argparse.Namespace: def find_carriers_image_files( - acq_id: str, source_dir: Path, log_dir: Path=None, stream_dir: Path=None + acq_id: str, source_dir: Path, log_dir: Path = None, stream_dir: Path = None ) -> dict: """ Find all carrier files for a given acquisition ID in the source directory. @@ -105,7 +105,6 @@ def validate_carriers_image_files(carrier_files: dict) -> bool: ) result = False - return result @@ -136,9 +135,7 @@ def main(): """ args = parse_args() - carrier_files = find_carriers_image_files( - args.acqid, args.source - ) + carrier_files = find_carriers_image_files(args.acqid, args.source) if validate_carriers_image_files(carrier_files): package_carriers_image_files(carrier_files, args.dest) diff --git a/src/digarch_scripts/transfer/transfer_rsync.py b/src/digarch_scripts/transfer/transfer_rsync.py index a9b2c31..9cccc95 100644 --- a/src/digarch_scripts/transfer/transfer_rsync.py +++ b/src/digarch_scripts/transfer/transfer_rsync.py @@ -23,10 +23,10 @@ def parse_args() -> argparse.Namespace: def run_rsync(source: Path, dest: Path, quiet: bool = None) -> None: - log_folder = (dest / "metadata") + log_folder = dest / "metadata" log_folder.mkdir() log_file = log_folder / f"{dest.name}_rsync.log" - objects_folder = (dest / "objects") + objects_folder = dest / "objects" objects_folder.mkdir() cmd = [ @@ -45,7 +45,9 @@ def run_rsync(source: Path, dest: Path, quiet: bool = None) -> None: process = subprocess.run(cmd) if process.returncode != 0: - LOGGER.warning("Transfer did not complete successfully. Delete transferred files and re-run") + LOGGER.warning( + "Transfer did not complete successfully. Delete transferred files and re-run" + ) return @@ -57,20 +59,24 @@ def create_bag_files_in_objects(base_dir: Path, rsync_log: Path, source: Path): def run_disktype(source: Path, dest: Path) -> None: - #determine device to unmount and run disktype on + # determine device to unmount and run disktype on if not source.is_mount(): LOGGER.info(f"Disktype log cannot be generated for a folder. Skipping") return output = subprocess.check_output(["df", source]).decode("utf8") - device = re.search(r'(/dev/[a-z0-9]+)', output).group(0) - parent_device = re.search(r'(/dev/[a-z]+\d)', device).group(0) + device = re.search(r"(/dev/[a-z0-9]+)", output).group(0) + parent_device = re.search(r"(/dev/[a-z]+\d)", device).group(0) - LOGGER.info(f"Dismounting device {device} in order to run disktype, may require password for sudo") - process = subprocess.run(['diskutil', 'unmount', device]) + LOGGER.info( + f"Dismounting device {device} in order to run disktype, may require password for sudo" + ) + process = subprocess.run(["diskutil", "unmount", device]) if process.returncode != 0: - LOGGER.warning(f"Unable to dismount {source}. Disktype report not generated. Create manually") + LOGGER.warning( + f"Unable to dismount {source}. Disktype report not generated. Create manually" + ) return output = subprocess.check_output(["sudo", "disktype", parent_device]).decode("utf8") @@ -82,8 +88,8 @@ def run_disktype(source: Path, dest: Path) -> None: with open(dest / "metadata" / f"{dest.name}_disktype.log", "w") as f: f.write(output) - #remount - subprocess.run(['diskutil', 'mount', device]) + # remount + subprocess.run(["diskutil", "mount", device]) LOGGER.info("Device remounted") return diff --git a/tests/test_package_base.py b/tests/test_package_base.py index ad5b120..e39ec29 100644 --- a/tests/test_package_base.py +++ b/tests/test_package_base.py @@ -215,7 +215,11 @@ def test_do_not_overwrite_file( @pytest.mark.parametrize("test_function,dest", MOVE_FILES) def test_move_multiple_file( - package_base_dir: Path, rclone_log: Path, rclone_md5_manifest: Path, test_function, dest: str + package_base_dir: Path, + rclone_log: Path, + rclone_md5_manifest: Path, + test_function, + dest: str, ): """Test that multiple files are moved successfully""" parts = dest.split("/") @@ -230,7 +234,11 @@ def test_move_multiple_file( @pytest.mark.parametrize("test_function,dest", MOVE_FILES) def test_partial_halt_multiple_files( - package_base_dir: Path, rclone_log: Path, rclone_md5_manifest: Path, test_function, dest: str + package_base_dir: Path, + rclone_log: Path, + rclone_md5_manifest: Path, + test_function, + dest: str, ): """Test that warning is issued for multiple move if a single metadata move fails""" @@ -293,49 +301,65 @@ def test_convert_rsync_log(rsync_bag_payload: Path, rsync_log: Path, rsync_files md5_paths = [line.strip().split(" ")[-1] for line in m.readlines()] payload_files = [ - str(path.relative_to(rsync_bag_payload.parent)) for path in rsync_bag_payload.rglob("*") + str(path.relative_to(rsync_bag_payload.parent)) + for path in rsync_bag_payload.rglob("*") ] for a_file in md5_paths: assert a_file in payload_files -def test_convert_rsync_log_replaces_prefix_with_data(rsync_bag_payload: Path, rsync_log: Path): - prefix = "/Users/fortitude/dev/digarch-scripts-poetry/tests/fixtures/rsync/rsync_files" +def test_convert_rsync_log_replaces_prefix_with_data( + rsync_bag_payload: Path, rsync_log: Path +): + prefix = ( + "/Users/fortitude/dev/digarch-scripts-poetry/tests/fixtures/rsync/rsync_files" + ) pb.convert_rsync_log_to_bagit_manifest(rsync_log, rsync_bag_payload.parent, prefix) bag_md5 = rsync_bag_payload.parent / "manifest-md5.txt" - #extract paths from manifest + # extract paths from manifest with open(bag_md5) as m: md5_paths = [line.strip().split(" ")[-1] for line in m.readlines()] - #extract paths from log + # extract paths from log rsync_paths = [] with open(rsync_log) as m: lines = m.readlines() for line in lines: parts = line.strip().split(", ") if len(parts) > 3 and parts[2].strip(): - rsync_paths.append(line.strip().split(", ")[-1].replace(prefix[1:], 'data')) + rsync_paths.append( + line.strip().split(", ")[-1].replace(prefix[1:], "data") + ) - #assert difference + # assert difference assert set(md5_paths) == set(rsync_paths) -def test_convert_rsync_log_requires_specific_format(rsync_bag_payload: Path, rsync_log: Path, caplog): - rsync_log.write_text('time, size, not a hash, good/path') +def test_convert_rsync_log_requires_specific_format( + rsync_bag_payload: Path, rsync_log: Path, caplog +): + rsync_log.write_text("time, size, not a hash, good/path") pb.convert_rsync_log_to_bagit_manifest(rsync_log, rsync_bag_payload.parent) - assert f"{str(rsync_log.name)} should be formatted with md5 hash in the 3rd comma-separated fields" in caplog.text + assert ( + f"{str(rsync_log.name)} should be formatted with md5 hash in the 3rd comma-separated fields" + in caplog.text + ) -def test_create_objects_bag(package_base_dir: Path, rclone_payload: Path, rclone_md5_manifest: Path): +def test_create_objects_bag( + package_base_dir: Path, rclone_payload: Path, rclone_md5_manifest: Path +): """Test that all tag files are created and rclone md5sums are correctly converted""" bag_path = package_base_dir / "objects" # might need further testing of the oxum and manifest converter functions - pb.create_bag_in_objects(rclone_payload, package_base_dir, rclone_md5_manifest, 'rclone') + pb.create_bag_in_objects( + rclone_payload, package_base_dir, rclone_md5_manifest, "rclone" + ) assert bagit.Bag(str(bag_path)).validate(completeness_only=True) assert not rclone_payload.exists() diff --git a/tests/test_package_images.py b/tests/test_package_images.py index 0c06d5a..3249400 100644 --- a/tests/test_package_images.py +++ b/tests/test_package_images.py @@ -147,16 +147,13 @@ def test_warn_carrier_with_one_missing_category(carrier_files, key, caplog): def test_warn_more_than_one_image(carrier_files, caplog): carrier = "ACQ_1234_123457" - second_image = carrier_files[carrier]["images"][0].with_suffix('.img2') - second_image.write_text('0') + second_image = carrier_files[carrier]["images"][0].with_suffix(".img2") + second_image.write_text("0") carrier_files[carrier]["images"].append(second_image) result = pi.validate_carriers_image_files(carrier_files) - assert ( - f'Multiple image files found for {carrier}. Only 1 allowed' - in caplog.text - ) + assert f"Multiple image files found for {carrier}. Only 1 allowed" in caplog.text assert not result @@ -164,9 +161,13 @@ def test_accept_two_sided_images(carrier_files): carrier = "ACQ_1234_123457" image_name = carrier_files[carrier]["images"][0].name - first_image = carrier_files[carrier]["images"][0].parent / image_name.replace(".img", "s0.001") - second_image = carrier_files[carrier]["images"][0].parent / image_name.replace(".img", "s1.001") - second_image.write_text('0') + first_image = carrier_files[carrier]["images"][0].parent / image_name.replace( + ".img", "s0.001" + ) + second_image = carrier_files[carrier]["images"][0].parent / image_name.replace( + ".img", "s1.001" + ) + second_image.write_text("0") carrier_files[carrier]["images"][0].rename(first_image) carrier_files[carrier]["images"] = [first_image, second_image] @@ -180,9 +181,13 @@ def test_warn_on_malformed_two_sided_image_filename(carrier_files, caplog): carrier = "ACQ_1234_123457" image_name = carrier_files[carrier]["images"][0].name - first_image = carrier_files[carrier]["images"][0].parent / image_name.replace(".img", "side0.001") - second_image = carrier_files[carrier]["images"][0].parent / image_name.replace(".img", "side1.001") - second_image.write_text('0') + first_image = carrier_files[carrier]["images"][0].parent / image_name.replace( + ".img", "side0.001" + ) + second_image = carrier_files[carrier]["images"][0].parent / image_name.replace( + ".img", "side1.001" + ) + second_image.write_text("0") carrier_files[carrier]["images"][0].rename(first_image) carrier_files[carrier]["images"] = [first_image, second_image] @@ -190,7 +195,7 @@ def test_warn_on_malformed_two_sided_image_filename(carrier_files, caplog): result = pi.validate_carriers_image_files(carrier_files) assert ( - 'If carrier has 2 disk formats, file names must end with s0.001 or s1.001' + "If carrier has 2 disk formats, file names must end with s0.001 or s1.001" in caplog.text ) @@ -218,10 +223,7 @@ def test_warn_streams_folder_empty(carrier_files, caplog): result = pi.validate_carriers_image_files(carrier_files) - assert ( - f'Streams folder for {carrier} appears to be empty' - in caplog.text - ) + assert f"Streams folder for {carrier} appears to be empty" in caplog.text assert not result @@ -259,8 +261,8 @@ def test_full_run( assert carrier in [x.name for x in acq_dir.iterdir()] - for x in ['streams', 'images']: - component = (acq_dir / carrier / x) + for x in ["streams", "images"]: + component = acq_dir / carrier / x assert component.exists() assert bagit.Bag(str(component)).validate() diff --git a/tests/test_transfer_rsync.py b/tests/test_transfer_rsync.py index 078e092..d94d997 100644 --- a/tests/test_transfer_rsync.py +++ b/tests/test_transfer_rsync.py @@ -1,6 +1,6 @@ import shutil -from pathlib import Path import subprocess +from pathlib import Path import bagit import pytest @@ -96,7 +96,10 @@ def test_rsync_completes_successfully(transfer_files): def test_rsync_fails_gracefully(transfer_files, monkeypatch, caplog): tr.run_rsync("/nonexistant", transfer_files) - assert "Transfer did not complete successfully. Delete transferred files and re-run" in caplog.text + assert ( + "Transfer did not complete successfully. Delete transferred files and re-run" + in caplog.text + ) @pytest.fixture From 83b37298f2128ae2721bfa847815f1ffe49be211 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Thu, 20 Jun 2024 15:37:38 -0400 Subject: [PATCH 33/35] fix bug with manifest new lines --- src/digarch_scripts/package/package_base.py | 2 +- tests/fixtures/image/ACQ_1234_123456/ACQ_1234_123456.002 | 0 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 tests/fixtures/image/ACQ_1234_123456/ACQ_1234_123456.002 diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 908688e..694a3e5 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -276,7 +276,7 @@ def create_bagit_manifest(paths: list[Path], bag_dir: Path) -> None: with open(bag_dir / "manifest-md5.txt", "w") as f: for line in manifest_lines: - f.write(f"{line[0]} {line[1]}") + f.write(f"{line[0]} {line[1]}\n") return None diff --git a/tests/fixtures/image/ACQ_1234_123456/ACQ_1234_123456.002 b/tests/fixtures/image/ACQ_1234_123456/ACQ_1234_123456.002 new file mode 100644 index 0000000..e69de29 From 34b81376e24de72b5d252137541cfc980d980e38 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Thu, 20 Jun 2024 16:02:44 -0400 Subject: [PATCH 34/35] catch when streams contains folders --- src/digarch_scripts/package/package_base.py | 2 ++ src/digarch_scripts/package/package_images.py | 10 ++++-- tests/test_package_base.py | 34 ++++++++++++++----- tests/test_package_images.py | 11 ++++++ 4 files changed, 47 insertions(+), 10 deletions(-) diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 694a3e5..8914345 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -271,6 +271,8 @@ def create_bagit_manifest(paths: list[Path], bag_dir: Path) -> None: # paths must be files manifest_lines = [] for path in paths: + if path.is_dir(): + raise IsADirectoryError(f"{path} is a directory, skipping") md5_hash = bagit.generate_manifest_lines(str(path), ["md5"])[0][1] manifest_lines.append([md5_hash, Path("data") / path.name]) diff --git a/src/digarch_scripts/package/package_images.py b/src/digarch_scripts/package/package_images.py index 92cd214..7bab879 100644 --- a/src/digarch_scripts/package/package_images.py +++ b/src/digarch_scripts/package/package_images.py @@ -104,6 +104,12 @@ def validate_carriers_image_files(carrier_files: dict) -> bool: f'Streams folder for {carrier_name} appears to be empty: {carrier["streams"][0]}' ) result = False + for child in carrier["streams"][0].iterdir(): + if child.is_dir(): + LOGGER.warning( + f"Folders found with streams folder for {carrier_name}. None allowed: {child}" + ) + result = False return result @@ -118,9 +124,9 @@ def package_carriers_image_files(carrier_files: dict, acq_dir: Path) -> None: pb.move_metadata_files(files["logs"], base_dir) pb.create_bag_in_images(files["images"], base_dir) pb.create_bag_in_streams(files["streams"][0], base_dir) - except: + except Exception as e: LOGGER.error( - f"Packaging incomplete for {carrier}. Address warnings manually." + f"Packaging incomplete for {carrier}. Address warnings manually.\n{e}" ) finally: pb.validate_images_bag(base_dir) diff --git a/tests/test_package_base.py b/tests/test_package_base.py index e39ec29..b3fcbbe 100644 --- a/tests/test_package_base.py +++ b/tests/test_package_base.py @@ -79,11 +79,11 @@ def test_file_found(image_files): carrier_files = {} carrier_files = pb.find_category_of_carrier_files( - carrier_files, acq_id, image_files / "images", [".img"], "images" + carrier_files, acq_id, image_files, [".img"], "images" ) assert ( - image_files / "images" / "ACQ_1234_123456.img" + image_files / "ACQ_1234_123456.img" in carrier_files[f"{acq_id}_123456"]["images"] ) @@ -93,7 +93,7 @@ def test_ignore_unknown_extension_for_category(image_files): carrier_files = {} carrier_files = pb.find_category_of_carrier_files( - carrier_files, acq_id, image_files / "images", [".001"], "images" + carrier_files, acq_id, image_files, [".001"], "images" ) assert f"{acq_id}_123456" not in carrier_files @@ -104,7 +104,7 @@ def test_multiple_files_found(image_files): carrier_files = {} carrier_files = pb.find_category_of_carrier_files( - carrier_files, acq_id, image_files / "logs", [".log"], "logs" + carrier_files, acq_id, image_files, [".log"], "logs" ) assert len(carrier_files[f"{acq_id}_123456"]["logs"]) == 2 @@ -366,21 +366,39 @@ def test_create_objects_bag( def test_create_streams_bag(package_base_dir: Path, image_files: Path): - """Test that all tag files are created and rclone md5sums are correctly converted""" + """Test that all tag files are created and new md5s are correctly created""" - streams_path = image_files / "streams" / "ACQ_1234_123456" + streams_path = image_files / "ACQ_1234_123456" bag_path = package_base_dir / "streams" - # might need further testing of the oxum and manifest converter functions pb.create_bag_in_streams(streams_path, package_base_dir) assert bagit.Bag(str(bag_path)).validate(completeness_only=True) assert not streams_path.exists() +def test_error_on_folder_when_creating_streams_bag( + package_base_dir: Path, image_files: Path +): + """Test that Exception is raised when streams folder contains a child directory""" + + streams_path = image_files / "ACQ_1234_123456" + subdir = streams_path / "subdir" + subdir.mkdir() + streams_contents = set(streams_path.iterdir()) + bag_path = package_base_dir / "streams" + + with pytest.raises(IsADirectoryError) as exc: + pb.create_bag_in_streams(streams_path, package_base_dir) + + assert f"{str(subdir)} is a directory, skipping" in str(exc.value) + assert set(streams_path.iterdir()) == streams_contents + assert not list(bag_path.iterdir()) + + def test_generate_valid_oxum(transfer_files: Path): """Test that script generates oxum correctly""" - # test with entire fixture to text folder recursion + # test with entire fixture to test folder recursion total_bytes, total_files = pb.get_oxum(transfer_files) diff --git a/tests/test_package_images.py b/tests/test_package_images.py index 3249400..1f93d0f 100644 --- a/tests/test_package_images.py +++ b/tests/test_package_images.py @@ -239,6 +239,17 @@ def test_warn_only_one_stream_folder_allowed(carrier_files, caplog): assert not result +def test_warn_stream_folder_contains_folders(carrier_files, caplog): + carrier = "ACQ_1234_123457" + (carrier_files[carrier]["streams"][0] / "subfolder").mkdir() + result = pi.validate_carriers_image_files(carrier_files) + + assert ( + f"Folders found with streams folder for {carrier}. None allowed" in caplog.text + ) + assert not result + + def test_good_packaging(carrier_files, tmp_path: Path): pi.package_carriers_image_files(carrier_files, tmp_path) From 82937431b5013fe83125bea467459893c5555f20 Mon Sep 17 00:00:00 2001 From: Nick Krabbenhoeft Date: Mon, 12 Aug 2024 15:25:35 -0400 Subject: [PATCH 35/35] initial commit on report transfers --- src/digarch_scripts/package/package_base.py | 2 +- .../report/report_transfers.py | 158 ++++++++++++++++++ tests/test_report_transfers.py | 53 ++++++ 3 files changed, 212 insertions(+), 1 deletion(-) create mode 100644 src/digarch_scripts/report/report_transfers.py create mode 100644 tests/test_report_transfers.py diff --git a/src/digarch_scripts/package/package_base.py b/src/digarch_scripts/package/package_base.py index 8914345..ffb564d 100644 --- a/src/digarch_scripts/package/package_base.py +++ b/src/digarch_scripts/package/package_base.py @@ -115,7 +115,7 @@ def add_streams_folder(self) -> None: def add_dest(self) -> None: self.add_argument("--dest", required=True, type=self.extant_path) - def add_transfer(self) -> None: + def add_transfers(self) -> None: self.add_argument( "--transfers", required=True, diff --git a/src/digarch_scripts/report/report_transfers.py b/src/digarch_scripts/report/report_transfers.py new file mode 100644 index 0000000..5fd817f --- /dev/null +++ b/src/digarch_scripts/report/report_transfers.py @@ -0,0 +1,158 @@ +import csv +import logging +from datetime import date +from pathlib import Path + +from digarch_scripts.package import package_base + +LOGGER = logging.getLogger(__name__) +LOGGER.setLevel(logging.INFO) + + +def parse_args(): + """ + Parse command line arguments. + + :return: The parsed arguments. + """ + parser = package_base.TransferParser( + description="Report on the transfers in a directory." + ) + parser.add_acqid() + parser.add_transfers() + parser.add_dest() + + return parser.parse_args() + + +def collect_stats(transfer_path: Path) -> list[date, int, int, int, int]: + """ + Collects statistics about the transfers in the given directory. + + :param path: Path to the directory containing the transfer files. + :return: A tuple containing the date of transfer, number of image files, + cumulative size of image files, number of object files, and + cumulative size of object files. + """ + + # initialize the image and object statistics + image_date = None + image_stats = [] + object_date = None + object_stats = [] + + # Iterate over the files in the directory. + for path in transfer_path.iterdir(): + # Skip directories. + if path.name == "images": + image_date, image_stats = collect_bag_stats(path) + elif path.name == "objects": + object_date, object_stats = collect_bag_stats(path) + else: + continue + + # Return the statistics + stats_stub = transfer_path.name.rsplit("_", 1) + + if not object_stats and not image_stats: + LOGGER.info(f"No images or objects found for {transfer_path}.") + return None + else: + if image_date: + stats_stub.append(image_date) + else: + stats_stub.append(object_date) + stats_stub.extend(image_stats if image_stats else [0, 0]) + stats_stub.extend(object_stats if object_stats else [0, 0]) + + return stats_stub + + +def collect_bag_stats(bag_path: Path) -> tuple[date, list[int, int]]: + """ + Collects statistics from a bag in the given directory. + + :param path: Path to the directory containing the object transfer files. + :return: A tuple containing the date of the transfers and a list of the + number of files and cumulative size of files. + """ + + # Initialize the statistics + bagdate = None + size = 0 + files = 0 + + # Check that image_path is a bag + possible_bag_info = bag_path / "bag-info.txt" + if not possible_bag_info.exists(): + LOGGER.warning(f"Directory should be formatted as a bag: {bag_path}") + return None + + else: + with open(possible_bag_info, "r") as bag_info: + for line in bag_info: + if line.startswith("Bagging-Date:"): + bagdate = date.fromisoformat(line.split(":")[1].strip()) + elif line.startswith("Payload-Oxum:"): + size, files = line.split(":")[1].strip().split(".") + + if not bagdate: + LOGGER.warning(f"Bagging date not found in {possible_bag_info}") + return None + + if not size or not files: + LOGGER.warning(f"Bagging size or files not found in {possible_bag_info}") + return None + + return bagdate, [int(files), int(size)] + + +def write_stats(stats: list, dest: Path, acqid: str) -> None: + """ + Write the statistics to a report file. + + :param stats: A list of lists containing the date of transfer, number of image files, + cumulative size of image files, number of object files, and cumulative size of object files. + :param dest: The destination directory for the report. + :param acqid: The acquisition ID. + """ + with open(dest / f"{acqid}_transfer_report.txt", "w") as report: + writer = csv.writer(report) + writer.writerow( + ["acquisition_id", "object_id", "date", "image_files", "image_size", "object_files", "object_size"] + ) + writer.writerows(stats) + + return None + + +def main(): + """ + Main function for reporting on transfers. + + Collects statistics on the transfers in the given directory and writes them to a report file. + """ + args = parse_args() + + acq_folder = args.transfers / args.acqid + + if not acq_folder.exists(): + LOGGER.error(f"Transfer folder not found: {acq_folder}") + return + else: + all_stats = [] + for transfer in acq_folder.iterdir(): + stats = collect_stats(transfer) + if stats: + LOGGER.info(stats) + all_stats.append(stats) + else: + LOGGER.warning(f"No stats found for {transfer}") + + write_stats(all_stats, args.dest, args.acqid) + + return None + + +if __name__ == "__main__": + main() diff --git a/tests/test_report_transfers.py b/tests/test_report_transfers.py new file mode 100644 index 0000000..848ccee --- /dev/null +++ b/tests/test_report_transfers.py @@ -0,0 +1,53 @@ +from pathlib import Path +import pytest +import shutil + +import digarch_scripts.package.package_images as pi +import digarch_scripts.report.report_transfers as rt + + +@pytest.fixture +def transfer_dir(tmp_path: Path, request): + fixture_data = Path(request.module.__file__).parent / "fixtures" / "image" + shutil.copytree(fixture_data, tmp_path, dirs_exist_ok=True) + acq_id = "ACQ_1234" + carrier_files = pi.find_carriers_image_files( + acq_id, + tmp_path, + ) + pi.package_carriers_image_files(carrier_files, tmp_path) + return tmp_path / acq_id + + + +def test_parse_args(transfers_dir): + assert rt.parse_args() == rt.parse_args() + + +def test_collect_stats(transfers_dir): + assert rt.collect_stats(Path("test")) == rt.collect_stats(Path("test")) + + +def test_collect_bag_stats(transfers_dir): + assert rt.collect_bag_stats(Path("test")) == rt.collect_bag_stats(Path("test")) + + +def test_warn_on_invalid_bag(image_bag, caplog): + (image_bag / "bagit.txt").unlink() + rt.collect_bag_stats(image_bag) + + assert "Directory should be formatted as a bag" in caplog.text + + +def test_warn_on_missing_date_in_bag(image_bag, caplog): + (image_bag / "bag-info.txt").write_text("Bag-Size: 1234") + rt.collect_bag_stats(image_bag) + + assert "Directory should be formatted as a bag" in caplog.text + + +def test_warn_on_missing_size_in_bag(image_bag, caplog): + (image_bag / "bagit.txt").unlink() + rt.collect_bag_stats(image_bag) + + assert "Directory should be formatted as a bag" in caplog.text