diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 15bb7fb02..a76cf23e8 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,6 +2,3 @@ * @aws-deadline/deadline-cloud-developers # https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners -# overriding directories for job attachments -src/deadline/job_attachments/ @aws-deadline/job-attachments-developers -test/*/deadline_job_attachments/ @aws-deadline/job-attachments-developers diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index bd07438de..33f6b0552 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -12,8 +12,6 @@ See [DEVELOPMENT.md](https://github.com/aws-deadline/deadline-cloud/blob/mainlin - Have you run the unit tests? - Have you run the integration tests? -- Have you made changes to the `download` or `asset_sync` modules? If so, then it is highly recommended - that you ensure that the docker-based unit tests pass. ### Was this change documented? diff --git a/.github/workflows/code_quality.yml b/.github/workflows/code_quality.yml index 6b36cf398..8cbb53ab3 100644 --- a/.github/workflows/code_quality.yml +++ b/.github/workflows/code_quality.yml @@ -40,7 +40,7 @@ jobs: python-version: ${{ matrix.python-version }} ref: ${{inputs.tag}} # The incremental output download and mcp feature doesn't run on Python 3.8, so test coverage is lower - cov-fail-under: "74" + cov-fail-under: "63" Attributions: name: Attributions diff --git a/AGENTS.md b/AGENTS.md index c1198b9fb..b49bed45f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -21,10 +21,6 @@ src/deadline/ │ ├── ui/ # Qt/PySide GUI components │ ├── job_bundle/ # Job bundle handling and history │ └── dataclasses/ # Data structures -├── job_attachments/ # File transfer to/from S3 -│ ├── api/ # Public job attachments API -│ ├── caches/ # Hash and S3 check caches -│ └── asset_manifests/ # Manifest handling ├── mcp/ # MCP server (public) ├── _mcp/ # MCP server (internal) └── common/ # Shared utilities diff --git a/docs/code_reference/code_organization.md b/docs/code_reference/code_organization.md index af526df55..3bfb63522 100644 --- a/docs/code_reference/code_organization.md +++ b/docs/code_reference/code_organization.md @@ -1,8 +1,7 @@ # Code organization -This repository is split up into two main modules: -1. `src/client` -2. `src/job_attachments` +The main module in this repository is `src/client`. Job attachments functionality +lives in the separate [`deadline-cloud-job-attachments`](https://github.com/aws-deadline/deadline-cloud-job-attachments) package. The `src/client` organization is laid out below. 
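With this change the `deadline-cloud` package no longer ships the job attachments code itself; it pulls it in through the `deadline-job-attachments == 0.0.1` dependency added to `pyproject.toml` further down in this diff. The updated design-doc paths (`deadline-cloud-job-attachments/src/deadline/job_attachments/...`) suggest the import path is unchanged, so a quick sanity check like the sketch below should still resolve against the new distribution. The distribution name and namespace layout are assumptions read off this diff, not confirmed package metadata.

```python
# Hypothetical sanity check: the job attachments modules are expected to resolve
# from the separately packaged distribution (names taken from this diff, not from
# released package metadata).
from importlib.metadata import version

from deadline.job_attachments.models import JobAttachmentS3Settings
from deadline.job_attachments.upload import S3AssetManager

print("deadline-job-attachments", version("deadline-job-attachments"))
print(S3AssetManager.__module__, JobAttachmentS3Settings.__module__)
```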
diff --git a/docs/code_reference/job_attachments_api.md b/docs/code_reference/job_attachments_api.md deleted file mode 100644 index cf883c9e1..000000000 --- a/docs/code_reference/job_attachments_api.md +++ /dev/null @@ -1 +0,0 @@ -#### ::: deadline.job_attachments.api diff --git a/docs/code_reference/job_attachments_asset_manifests.md b/docs/code_reference/job_attachments_asset_manifests.md deleted file mode 100644 index 9569bb6eb..000000000 --- a/docs/code_reference/job_attachments_asset_manifests.md +++ /dev/null @@ -1 +0,0 @@ -#### ::: deadline.job_attachments.asset_manifests \ No newline at end of file diff --git a/docs/code_reference/job_attachments_asset_sync.md b/docs/code_reference/job_attachments_asset_sync.md deleted file mode 100644 index 9e6c9c59e..000000000 --- a/docs/code_reference/job_attachments_asset_sync.md +++ /dev/null @@ -1 +0,0 @@ -#### ::: deadline.job_attachments.asset_sync \ No newline at end of file diff --git a/docs/code_reference/job_attachments_caches.md b/docs/code_reference/job_attachments_caches.md deleted file mode 100644 index 66e65ccff..000000000 --- a/docs/code_reference/job_attachments_caches.md +++ /dev/null @@ -1 +0,0 @@ -#### ::: deadline.job_attachments.caches \ No newline at end of file diff --git a/docs/code_reference/job_attachments_download.md b/docs/code_reference/job_attachments_download.md deleted file mode 100644 index 1855a6c7b..000000000 --- a/docs/code_reference/job_attachments_download.md +++ /dev/null @@ -1 +0,0 @@ -#### ::: deadline.job_attachments.download \ No newline at end of file diff --git a/docs/code_reference/job_attachments_exceptions.md b/docs/code_reference/job_attachments_exceptions.md deleted file mode 100644 index e60df8d8c..000000000 --- a/docs/code_reference/job_attachments_exceptions.md +++ /dev/null @@ -1 +0,0 @@ -#### ::: deadline.job_attachments.exceptions \ No newline at end of file diff --git a/docs/code_reference/job_attachments_models.md b/docs/code_reference/job_attachments_models.md deleted file mode 100644 index b58682297..000000000 --- a/docs/code_reference/job_attachments_models.md +++ /dev/null @@ -1 +0,0 @@ -#### ::: deadline.job_attachments.models \ No newline at end of file diff --git a/docs/code_reference/job_attachments_os_file_permission.md b/docs/code_reference/job_attachments_os_file_permission.md deleted file mode 100644 index db4faf3ac..000000000 --- a/docs/code_reference/job_attachments_os_file_permission.md +++ /dev/null @@ -1 +0,0 @@ -#### ::: deadline.job_attachments.os_file_permission \ No newline at end of file diff --git a/docs/code_reference/job_attachments_progress_tracker.md b/docs/code_reference/job_attachments_progress_tracker.md deleted file mode 100644 index f67a688de..000000000 --- a/docs/code_reference/job_attachments_progress_tracker.md +++ /dev/null @@ -1 +0,0 @@ -#### ::: deadline.job_attachments.progress_tracker \ No newline at end of file diff --git a/docs/code_reference/job_attachments_upload.md b/docs/code_reference/job_attachments_upload.md deleted file mode 100644 index 1a8e5ee0d..000000000 --- a/docs/code_reference/job_attachments_upload.md +++ /dev/null @@ -1 +0,0 @@ -#### ::: deadline.job_attachments.upload \ No newline at end of file diff --git a/docs/code_reference/job_attachments_vfs.md b/docs/code_reference/job_attachments_vfs.md deleted file mode 100644 index 47fc7ff55..000000000 --- a/docs/code_reference/job_attachments_vfs.md +++ /dev/null @@ -1 +0,0 @@ -#### ::: deadline.job_attachments.vfs \ No newline at end of file diff --git 
a/docs/design/storage-profile-cli-support.md b/docs/design/storage-profile-cli-support.md index 91ddc900d..8baac6271 100644 --- a/docs/design/storage-profile-cli-support.md +++ b/docs/design/storage-profile-cli-support.md @@ -179,7 +179,7 @@ _incremental_output_download() ### 4.4 Path Mapping Implementation -**File:** `src/deadline/job_attachments/_path_mapping.py` +**File:** `deadline-cloud-job-attachments/src/deadline/job_attachments/_path_mapping.py` #### 4.4.1 Rule Generation @@ -225,7 +225,7 @@ class _PathMappingRuleApplier: ### 4.5 Download File Operations -**File:** `src/deadline/job_attachments/download.py` +**File:** `deadline-cloud-job-attachments/src/deadline/job_attachments/download.py` #### 4.5.1 OutputDownloader Class @@ -268,7 +268,7 @@ def download_files( ### 5.1 StorageProfile -**File:** `src/deadline/job_attachments/models.py` +**File:** `deadline-cloud-job-attachments/src/deadline/job_attachments/models.py` ```python @dataclass @@ -281,7 +281,7 @@ class StorageProfile: ### 5.2 FileSystemLocation -**File:** `src/deadline/job_attachments/models.py` +**File:** `deadline-cloud-job-attachments/src/deadline/job_attachments/models.py` ```python @dataclass @@ -520,7 +520,7 @@ Note: Both profiles are fetched via the typed API (`api.get_storage_profile_for_ Since the function is private (underscore-prefixed) with only two call sites (`_incremental_download.py` and the new `_job_download_helpers.py`), we refactor it directly instead of adding an adapter. -**File:** `src/deadline/job_attachments/_path_mapping.py` +**File:** `deadline-cloud-job-attachments/src/deadline/job_attachments/_path_mapping.py` ```python def _generate_path_mapping_rules( @@ -827,12 +827,12 @@ Each helper function is independently testable in `test/unit/deadline_client/cli | Incremental download | `src/deadline/client/cli/_incremental_download.py` | | Job submission API | `src/deadline/client/api/_submit_job_bundle.py` | | Storage profile API | `src/deadline/client/api/_get_storage_profile_for_queue.py` | -| S3AssetManager | `src/deadline/job_attachments/upload.py` | -| OutputDownloader | `src/deadline/job_attachments/download.py` | -| Path mapping | `src/deadline/job_attachments/_path_mapping.py` | -| Hash algorithms | `src/deadline/job_attachments/asset_manifests/hash_algorithms.py` | -| Hash cache | `src/deadline/job_attachments/caches/hash_cache.py` | -| Models | `src/deadline/job_attachments/models.py` | +| S3AssetManager | `deadline-cloud-job-attachments/src/deadline/job_attachments/upload.py` | +| OutputDownloader | `deadline-cloud-job-attachments/src/deadline/job_attachments/download.py` | +| Path mapping | `deadline-cloud-job-attachments/src/deadline/job_attachments/_path_mapping.py` | +| Hash algorithms | `deadline-cloud-job-attachments/src/deadline/job_attachments/asset_manifests/hash_algorithms.py` | +| Hash cache | `deadline-cloud-job-attachments/src/deadline/job_attachments/caches/hash_cache.py` | +| Models | `deadline-cloud-job-attachments/src/deadline/job_attachments/models.py` | | Config file | `src/deadline/client/config/config_file.py` | --- @@ -905,7 +905,7 @@ Key operations: ### 12.3 Asset Manager: `S3AssetManager` -**File:** `src/deadline/job_attachments/upload.py` +**File:** `deadline-cloud-job-attachments/src/deadline/job_attachments/upload.py` ``` S3AssetManager.__init__() diff --git a/docs/job_attachments_guide.md b/docs/job_attachments_guide.md deleted file mode 100644 index 8f8af8951..000000000 --- a/docs/job_attachments_guide.md +++ /dev/null @@ -1,173 +0,0 @@ -# Job attachments - 
-[Job attachments][job-attachments] enable you to transfer files back and forth between your workstation and [AWS Deadline Cloud][deadline-cloud], using an Amazon S3 bucket in your AWS account associated with your [AWS Deadline Cloud queues][queue]. - -Job attachments uses your configured S3 bucket as a [content-addressable storage](https://en.wikipedia.org/wiki/Content-addressable_storage), which creates a snapshot of the files used in your job submission in [asset manifests](#asset-manifests), only uploading files that aren't already in S3. This saves you time and bandwidth when iterating on jobs. When an [AWS Deadline Cloud worker agent][worker-agent] starts working on a job with job attachments, it recreates the file system snapshot in the worker agent session directory, and uploads any outputs back to your S3 bucket. - -You can then easily download your outputs with the [deadline job download-output] command, or using the [protocol handler](#protocol-handler) to download from a click of a button in the [AWS Deadline Cloud monitor][monitor]. - -Job attachments also works as an auxiliary storage when used with [AWS Deadline Cloud storage profiles][shared-storage], allowing you to flexibly upload files to your Amazon S3 bucket that aren't on your configured shared storage. - -See the [`examples`](https://github.com/aws-deadline/deadline-cloud/tree/mainline/examples) directory for some simple examples on how to use job attachments. See the [developer guide][developer-guide] for a demonstration of how the CLI works with job attachments. - -[job-attachments]: https://docs.aws.amazon.com/deadline-cloud/latest/userguide/storage-job-attachments.html -[deadline-cloud]: https://docs.aws.amazon.com/deadline-cloud/latest/userguide/what-is-deadline-cloud.html -[queue]: https://docs.aws.amazon.com/deadline-cloud/latest/userguide/queues.html -[monitor]: https://docs.aws.amazon.com/deadline-cloud/latest/userguide/working-with-deadline-monitor.html -[shared-storage]: https://docs.aws.amazon.com/deadline-cloud/latest/userguide/storage-shared.html -[worker-agent]: https://github.com/aws-deadline/deadline-cloud-worker-agent/blob/release/docs/ -[developer-guide]: https://docs.aws.amazon.com/deadline-cloud/latest/developerguide/what-job-attachments-uploads-to-amazon-s3.html -[deadline job download-output]: cli_reference/deadline_job.md#download-output - -## Job Attachments Bucket Structure - -The basic structure that job attachments uses in your S3 bucket is as follows: - -``` -RootPrefix/ - Data/ - Manifests/ -``` - -- `RootPrefix` is the top-level prefix that all job attachments files are written to. This is configurable when you associate your S3 bucket with a queue. -- `Data` is where the files are stored, based on a hash of their contents. This is a fixed prefix used by the job attachments library and is non-configurable. -- `Manifests` is where manifests are stored which are associated with job submissions. This is a fixed prefix used by the job attachments library and is non-configurable. - -[ja-security]: https://docs.aws.amazon.com/deadline-cloud/latest/userguide/security-best-practices.html#job-attachment-queues - -### S3 Key Format - -#### Data Files - -Data files in the job attachments system are stored using a content-addressable approach. The S3 key format for data files is: - -``` -/Data/. 
-``` - -For example: -``` -my-deadline-prefix/Data/a1b2c3d4e5f6g7h8i9j0.xxh128 -``` - -Where: -- `` is the prefix configured in the queue's job attachment settings -- `` is the hash of the file contents -- `` is the algorithm used to generate the hash (currently "xxh128") - -This content-addressable approach ensures that identical files are only stored once, regardless of their original filenames or paths. - -#### Manifest Files - -Manifests are stored for both job inputs and task outputs. The S3 key for the input manifest for a job can be found by [calling GetJob](https://docs.aws.amazon.com/deadline-cloud/latest/APIReference/API_GetJob.html#API_GetJob_ResponseSyntax) and looking in the response under `attachments.manifests.rootPath`. See the [developer guide on job attachments](https://docs.aws.amazon.com/deadline-cloud/latest/developerguide/run-jobs-job-attachments.html#job-attachments-in-depth) for more info. - -Output manifests for tasks are stored under: -``` -/Manifests//////_/_output -``` - -Where: -- `` is the prefix configured in the queue's job attachment settings -- ``, ``, ``, ``, ``, and `` are the respective identifiers (e.g., farm-1234567890abcdefg) -- `` is a hash of the concatenation of `fileSystemLocationName` (if set) and `rootPath` fields in the job's `manifests` list. -- `` is the time that the task started. It is formatted as an ISO8601 timestamp with microsecond precision and in the UTC timezone (e.g. `2025-04-01T17:27:28.044179Z`) - -Each manifest file also has an asset root which defines the local root path where files should be placed when downloaded. The asset root is stored in the user-defined metadata of the manifest S3 object. If the asset root can be encoded in ASCII, it is stored directly under the `asset-root` userdata property. If not, it is stored as a JSON-encoded string under `asset-root-json`. - -## Asset Manifests - -When making a job submission, the job attachments library makes a snapshot of all of the files included in the submission. The contents of each file are hashed, and the files are uploaded to the S3 bucket associated with the queue you are submitting to. This way, if the files haven't changed since a previous submission, the hash will be the same and the files will not be re-uploaded. - -These snapshots are encapsulated in one or more asset_manifests. Asset manifests include the local file path and associated hash of every file included in the submission, plus some metadata such as the file size and last modified time. Asset manifests are uploaded to your job attachments S3 bucket alongside your files. - -When starting work, the worker downloads the manifest associated with your job, and recreates the file structure of your submission locally, either downloading all files at once, or as needed if using the [virtual][vfs] job attachments filesystem type. When a task completes, the worker creates a new manifest for any outputs that were specified in the job submission, and uploads the manifest and the outputs back to your S3 bucket. - -Manifest files are written to a `manifests` directory within each job bundle that is added to the job history if submitted through the GUI (default: `~/.deadline/job_history`). A corresponding `manifest_s3_mapping` file is created alongside manifests, which specifies each local manifest file with the S3 manifest path in the submitted job's job attachments metadata. 
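The example key above (`my-deadline-prefix/Data/a1b2c3d4e5f6g7h8i9j0.xxh128`) follows the pattern `<rootPrefix>/Data/<hash>.<hashAlgorithm>` that the deleted guide describes. As a rough illustration of that content-addressing scheme, the sketch below derives such a key with the third-party `xxhash` package; the chunked read and helper name are illustrative, not how the job attachments library itself is implemented.

```python
# Illustrative only: derive the content-addressed S3 key described in the guide,
# <rootPrefix>/Data/<hash>.<hashAlgorithm>, for a single local file.
import xxhash


def data_file_key(root_prefix: str, file_path: str) -> str:
    hasher = xxhash.xxh128()
    with open(file_path, "rb") as f:
        # Hash in chunks so large scene files don't have to fit in memory.
        for chunk in iter(lambda: f.read(8 * 1024 * 1024), b""):
            hasher.update(chunk)
    return f"{root_prefix}/Data/{hasher.hexdigest()}.xxh128"


# data_file_key("my-deadline-prefix", "scene.blend")
# -> "my-deadline-prefix/Data/<32-char-hex-digest>.xxh128"
```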
- -[vfs]: https://docs.aws.amazon.com/deadline-cloud/latest/userguide/storage-virtual.html - -### Manifest Format - -Asset manifests are JSON documents that follow a specific schema. The current manifest version is `2023-03-03` and has the following structure: - -```json -{ - "manifestVersion": "2023-03-03", - "hashAlg": "xxh128", - "totalSize": 12345, - "paths": [ - { - "path": "relative/path/to/file1.txt", - "hash": "abcdef1234567890", - "size": 1024, - "mtime": 1678012345000000 - }, - { - "path": "relative/path/to/file2.png", - "hash": "0987654321fedcba", - "size": 11321, - "mtime": 1678012346000000 - } - ] -} -``` - -The components of the manifest are: - -- `manifestVersion`: The version of the manifest schema (currently "2023-03-03") -- `hashAlg`: The algorithm used to hash the files (currently only "xxh128" is supported) -- `totalSize`: The sum of all file sizes in bytes -- `paths`: An array of file entries, each containing: - - `path`: The relative path to the file from the root directory - - `hash`: The hash of the file contents using the specified algorithm - - `size`: The file size in bytes - - `mtime`: The file's last modified time as epoch time in microseconds - -The manifest is canonicalized (with paths sorted) before being converted to a JSON string, which ensures consistent hashing and comparison of manifests. - -### Manifest Aggregation for Job Downloads - -When downloading job outputs, the system aggregates manifests across task outputs. Manifests are aggregated by keeping only the latest version of each file. Keeping the latest file allows task outputs to override job inputs and later tasks to overwrite output of earlier tasks. - -## Local Cache Files - -In order to further improve submission time, there are currently two local caches, which are simple SQLite databases that cache file information locally. These include: - -1. Hash Cache: a cache recording a file name and corresponding hash of its contents at a specific time. If a file does not exist in the hash cache, or its last modified time is later than the time in the cache, the file will be hashed and the cache updated. - -2. S3 Check Cache: a 'last seen on S3' cache that records the last time that a specific S3 object was seen. For the case of this library, this will just be a hash and a timestamp of the last time that hash was seen in S3. If a hash does not exist in the cache, or the last check time is expired (currently after 30 days), an S3 head object API call will be made to check if the hash exists in your S3 bucket, and if so, will write to the cache. - -[hash-cache]: https://github.com/aws-deadline/deadline-cloud/blob/mainline/src/deadline/job_attachments/caches/hash_cache.py -[s3-check-cache]: https://github.com/aws-deadline/deadline-cloud/blob/mainline/src/deadline/job_attachments/caches/s3_check_cache.py - -## Protocol Handler - -On Windows and Linux operating systems, you can choose to install the [Deadline CLI](cli_reference/index.md) protocol handler in order to run AWS Deadline Cloud commands sent from a web browser. Of note is the ability to download job attachments outputs from your jobs through the [AWS Deadline Cloud monitor][downloading-output]. 
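Before the deleted guide moves on to installing the protocol handler below, the rule from its "Manifest Aggregation for Job Downloads" section is easy to picture in code: keep one entry per path and let newer entries win, so task outputs override job inputs and later tasks override earlier ones. The sketch uses each entry's `mtime` as the recency signal, which is an assumption for illustration; the guide only states that the latest version of each file is kept.

```python
# Rough sketch of the "keep the latest version of each file" aggregation rule.
# Entries follow the 2023-03-03 manifest schema shown above (path/hash/size/mtime);
# deciding "latest" by mtime is an assumption, not the library's actual logic.
from typing import Dict, Iterable, List


def aggregate_manifest_paths(manifests: Iterable[dict]) -> List[dict]:
    latest: Dict[str, dict] = {}
    for manifest in manifests:
        for entry in manifest["paths"]:
            kept = latest.get(entry["path"])
            if kept is None or entry["mtime"] > kept["mtime"]:
                latest[entry["path"]] = entry
    # Canonical manifests keep paths sorted, so return the merged entries sorted too.
    return sorted(latest.values(), key=lambda entry: entry["path"])
```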
- -You can install the protocol handler by running the command: `deadline handle-web-url --install` - -[downloading-output]: https://docs.aws.amazon.com/deadline-cloud/latest/userguide/download-finished-output.html - -## Security - -When creating a queue, provide the name of an S3 bucket in the same account and region as the queue you are creating, and provide a 'root prefix' name for files to be uploaded to. You also must provide an IAM role that has access to the S3 bucket. See the [security best practices][ja-security] documentation for more information on securely configuring job attachments. - -## Job Attachments Subcommands - -Job Attachments provides `attachment` and `manifest` subcommand groups as part of deadline command-line interface. - -Attachment subcommands work with data files based on asset manifest files. - -```sh -$ deadline attachment upload -$ deadline attachment download -``` - -Manifest subcommands work with asset manifest files that capture local asset lifecycle. - -```sh -$ deadline manifest snapshot -$ deadline manifest diff -$ deadline manifest download -$ deadline manifest upload -``` \ No newline at end of file diff --git a/examples/README.md b/examples/README.md deleted file mode 100644 index e0bcca060..000000000 --- a/examples/README.md +++ /dev/null @@ -1 +0,0 @@ -This directory holds sample scripts for illustrating how to use the Job Attachments Library. \ No newline at end of file diff --git a/examples/download_output.py b/examples/download_output.py deleted file mode 100644 index e9ee79a63..000000000 --- a/examples/download_output.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -#! /usr/bin/env python3 -import argparse -import sys -import time - -from deadline.job_attachments._aws.deadline import get_queue -from deadline.job_attachments.download import OutputDownloader - -""" -A small script to download job output. Can provide just the Job ID to download all outputs -for a Job, optionally include the Step ID to get all outputs for the Job's Step, or optionally -include the Job, Step, and Task ID to get the outputs for a specific Task. - -Example usage: - -python download_output.py -f $FARM_ID -q $QUEUE_ID -j $JOB_ID -""" - -if __name__ == "__main__": - start_time = time.perf_counter() - - parser = argparse.ArgumentParser() - parser.add_argument( - "-f", "--farm-id", type=str, help="Deadline Farm you want to download from.", required=True - ) - parser.add_argument( - "-q", "--queue-id", type=str, help="Deadline Queue you want to download.", required=True - ) - parser.add_argument( - "-j", "--job-id", type=str, help="Deadline Job you want outputs of.", required=True - ) - parser.add_argument( - "-s", "--step-id", type=str, help="Optional. Deadline Step you want outputs of." - ) - parser.add_argument( - "-t", - "--task-id", - type=str, - help="Optional. Deadline Task you want outputs of. If specifying, must include Step ID.", - ) - args = parser.parse_args() - - farm_id = args.farm_id - queue_id = args.queue_id - job_id = args.job_id - step_id = args.step_id - task_id = args.task_id - - if task_id and not step_id: - print("Must specify Step ID when including Task ID! 
Stopping.") - sys.exit() - - print("\nGetting queue settings...") - settings = get_queue(farm_id, queue_id).jobAttachmentSettings - - print("\nStarting download...") - start = time.perf_counter() - output_downloader = OutputDownloader( - s3_settings=settings, - farm_id=farm_id, - queue_id=queue_id, - job_id=job_id, - step_id=step_id, - task_id=task_id, - ) - output_downloader.download_job_output() - total = time.perf_counter() - start - print(f"Finished downloading after {total} seconds") diff --git a/examples/submit_job.py b/examples/submit_job.py deleted file mode 100644 index b629bb1e0..000000000 --- a/examples/submit_job.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -#! /usr/bin/env python3 - -import argparse -import pprint -import time -from pathlib import Path - -from deadline.job_attachments.upload import S3AssetManager -from deadline.job_attachments.models import JobAttachmentS3Settings -from deadline.client.config.config_file import get_cache_directory -from deadline.client.api import get_queue_user_boto3_session, get_boto3_session - -""" -This is a sample script that illustrates how to submit a custom job using the -Job Attachments library. Please make sure to specify `endpoint_url` to the target -endpoint you want to test, when creating a (boto3) service client for deadline. - -Example usage: - -python submit_job.py -f $FARM_ID -q $QUEUE_ID -i /tmp/asset_root/inputs -o /tmp/asset_root/outputs -""" - - -def process_job_attachments(farm_id, queue_id, inputs, outputDir, deadline_client, session): - """ - Uploads all of the input files to the Job Attachments S3 bucket associated with - the Deadline Queue, returning Attachment Settings to be associated with a Deadline Job. - """ - - print("Getting queue information...") - start = time.perf_counter() - queue = deadline_client.get_queue(farmId=farm_id, queueId=queue_id) - total = time.perf_counter() - start - print(f"Finished getting queue information after {total} seconds.\n") - - print(f"Processing {len(inputs)} job attachments...") - start = time.perf_counter() - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=JobAttachmentS3Settings(**queue["jobAttachmentSettings"]), - session=session, - ) - upload_group = asset_manager.prepare_paths_for_upload( - inputs, - [outputDir], - [], - ) - cache_directory = get_cache_directory() - (_, manifests) = asset_manager.hash_assets_and_create_manifest( - upload_group.asset_groups, - upload_group.total_input_files, - upload_group.total_input_bytes, - cache_directory, - ) - (_, attachments) = asset_manager.upload_assets(manifests, s3_check_cache_dir=cache_directory) - attachments_dict = attachments.to_dict() - total = time.perf_counter() - start - print(f"Finished processing job attachments after {total} seconds.\n") - print(f"Created these attachment settings: {attachments_dict}\n") - - return attachments_dict - - -JOB_TEMPLATE = """specificationVersion: 'jobtemplate-2023-09' -name: SubmitJobExample -description: > - A Job that counts the number of files and total size, - and also creates a default output file. 
-parameterDefinitions: - - name: DataDir - type: PATH - objectType: DIRECTORY - dataFlow: INOUT - - name: RelOutput - type: PATH -steps: - - name: layerDefaultFrames - script: - actions: - onRun: - command: '{{Task.File.Run}}' - embeddedFiles: - - name: Run - filename: count-files.sh - type: TEXT - runnable: true - data: | - #!/bin/env bash - - set -euo pipefail - echo 'Confirming that inputs were downloaded to the correct location' - echo 'Total number of inputs' && find {{Param.DataDir}} -type f | wc -l - echo 'Total file size' && du -hs {{Param.DataDir}} - echo 'Creating the expected output directory and output file' - mkdir -p {{Param.DataDir}}/{{Param.RelOutput}} - echo 'This is test output' > {{Param.DataDir}}/{{Param.RelOutput}}/output.txt -""" - - -def submit_custom_job( - farm_id, queue_id, job_template, attachment_settings, parameters, deadline_client -): - """ - Submits a Job defined in the Job Template to the given Queue, adding the givent Attachment Settings - to the Job definition. - """ - - # Submit the Job - print("Submitting the job...") - start = time.perf_counter() - response = deadline_client.create_job( - farmId=farm_id, - queueId=queue_id, - template=job_template, - templateType="YAML", - attachments=attachment_settings if attachment_settings else None, - parameters=parameters, - priority=50, - ) - total = time.perf_counter() - start - print(f"Submitted Job Template after {total} seconds:") - pprint.pprint(job_template.encode()) - print(f"Job ID: {response['jobId']}") - - -if __name__ == "__main__": - start_time = time.perf_counter() - - parser = argparse.ArgumentParser() - - parser.add_argument( - "-f", "--farm-id", type=str, help="Deadline Farm you want to submit to.", required=True - ) - parser.add_argument( - "-q", "--queue-id", type=str, help="Deadline Queue you want to submit to.", required=True - ) - parser.add_argument( - "-i", - "--input-files", - type=str, - help="List of input files (or directories) you want to upload to be used with the Job.", - action="append", - required=True, - ) - parser.add_argument( - "-o", - "--output-dir", - type=str, - help="A single output directory used by the Job.", - required=True, - ) - parser.add_argument( - "-ao", - "--assets-only", - help="Specify this flag to only upload input files. No job will be submitted. 
Helpful when pre-populating the Job Attachments S3 bucket.", - action="store_true", - required=False, - ) - - args = parser.parse_args() - - inputs = [] - for input in args.input_files: - file_path = Path(input) - if file_path.is_dir(): - inputs.extend( - [ - str(file) - for file in file_path.glob("**/*") - if not file.is_dir() and file.exists() - ] - ) - else: - inputs.append(str(file_path)) - - session = get_boto3_session() - deadline_client = session.client("deadline") - queue_session = get_queue_user_boto3_session( - deadline_client, - None, - args.farm_id, - args.queue_id, - ) - - attachments = process_job_attachments( - args.farm_id, - args.queue_id, - inputs, - args.output_dir, - deadline_client, - queue_session, - ) - - if not args.assets_only: - root_dir = attachments["manifests"][0]["rootPath"] - rel_output = str(Path(args.output_dir).relative_to(root_dir)) - submit_custom_job( - args.farm_id, - args.queue_id, - JOB_TEMPLATE, - attachments, - {"DataDir": {"path": root_dir}, "RelOutput": {"path": rel_output}}, - deadline_client, - ) - - print(f"\nTotal submit runtime: {time.perf_counter() - start_time}") diff --git a/examples/summarize_dir.py b/examples/summarize_dir.py deleted file mode 100644 index 7211057cf..000000000 --- a/examples/summarize_dir.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python - -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -import argparse -import os - -from deadline.job_attachments.api import summarize_path_list - -""" -This is a sample script that uses the path summarization features to summarize -all the files in a specified directory. - -Example usage: - - python summarize_dir.py --max-entries 5 ./mydir -""" - - -def main(): - parser = argparse.ArgumentParser() - - parser.add_argument( - "--max-entries", type=int, default=10, help="How many entries to limit the summary to." - ) - parser.add_argument( - "--file-sizes", default=False, action="store_true", help="Include file sizes." 
- ) - parser.add_argument( - "--skip-dot-paths", - default=False, - action="store_true", - help="Skip directories and files that start with '.'.", - ) - parser.add_argument( - "--exclude-totals", - default=False, - action="store_true", - help="Exclude totals from the summary.", - ) - parser.add_argument( - "--follow-symlinks", - default=False, - action="store_true", - help="Follows symlinks for directory traversal and file sizes.", - ) - parser.add_argument("summary_dir", help="The directory to summarize.") - - args = parser.parse_args() - - if not os.path.exists(args.summary_dir) or not os.path.isdir(args.summary_dir): - print(f"Directory not found: {args.summary_dir}") - - total_size_by_path: dict[str, int] = None - if args.file_sizes: - total_size_by_path = {} - - path_list = [] - - dirs_to_visit = [args.summary_dir] - while dirs_to_visit: - dir = dirs_to_visit.pop() - for entry in os.scandir(dir): - if entry.is_dir(follow_symlinks=args.follow_symlinks): - if not (args.skip_dot_paths and entry.name.startswith(".")): - dirs_to_visit.append(entry.path) - elif entry.is_file(follow_symlinks=args.follow_symlinks): - if not (args.skip_dot_paths and entry.name.startswith(".")): - path_list.append(entry.path) - if total_size_by_path is not None: - total_size_by_path[entry.path] = entry.stat( - follow_symlinks=args.follow_symlinks - ).st_size - - if path_list: - print( - summarize_path_list( - path_list, - total_size_by_path=total_size_by_path, - max_entries=args.max_entries, - include_totals=not args.exclude_totals, - ) - ) - else: - print(f"No files found in {args.summary_dir}") - - -if __name__ == "__main__": - main() diff --git a/hatch.toml b/hatch.toml index 76cca53fc..80373eba2 100644 --- a/hatch.toml +++ b/hatch.toml @@ -4,14 +4,12 @@ pre-install-commands = ["pip install -r requirements-testing.txt"] [envs.default.scripts] sync = "pip install -r requirements-testing.txt" test = "pytest --cov-config pyproject.toml {args:test/unit}" -test_docker = "./scripts/run_sudo_tests.sh --build" test_installer = "pytest --no-cov {args:test/installer} -vvv" test_build_installer = "pytest --no-cov {args:test/build_installer}" typing = "mypy {args:src test} --always-false=PYQT5 --always-false=PYSIDE2 --always-false=PYQT6 --always-true=PYSIDE6 --exclude=PySide6" style = ["ruff check {args:.}", "ruff format --check --diff {args:.}"] fmt = ["ruff format {args:.}", "style"] lint = ["style", "typing"] -check-imports = "lint-imports" [[envs.all.matrix]] python = ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] diff --git a/mkdocs.yml b/mkdocs.yml index 13c49be06..5807b3750 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -51,19 +51,6 @@ nav: - "exceptions": code_reference/client_exceptions.md - "job_bundle": code_reference/client_job_bundle.md - "ui": code_reference/client_ui.md - - "deadline.job_attachments": - - "api": code_reference/job_attachments_api.md - - "asset_manifests": code_reference/job_attachments_asset_manifests.md - - "asset_sync": code_reference/job_attachments_asset_sync.md - - "caches": code_reference/job_attachments_caches.md - - "download": code_reference/job_attachments_download.md - - "exceptions": code_reference/job_attachments_exceptions.md - - "models": code_reference/job_attachments_models.md - - "os_file_permission": code_reference/job_attachments_os_file_permission.md - - "progress_tracker": code_reference/job_attachments_progress_tracker.md - - "upload": code_reference/job_attachments_upload.md - - "vfs": code_reference/job_attachments_vfs.md - - Job Attachments Guide: 
job_attachments_guide.md - Submission Hooks: submission-hooks.md - MCP Guide: mcp_guide.md diff --git a/pyproject.toml b/pyproject.toml index fcaba5ebb..2082dd2f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,12 +40,10 @@ dependencies = [ # Click 8.2 dropped python 3.8/3.9 support "click >= 8.1.7", "pyyaml >= 6.0", - # Job Attachments + "deadline-job-attachments == 0.0.1", "typing_extensions >= 4.8", - "xxhash >= 3.4,< 3.7", - "pywin32 >= 307; sys_platform == 'win32'", + "psutil >= 7.0.0", "QtPy == 2.4.*", - "psutil >= 7.0.0" ] [project.urls] @@ -84,7 +82,7 @@ version-file = "_version.py" path = "hatch_custom_hook.py" [tool.hatch.build.hooks.custom.copy_version_py] -destinations = ["src/deadline/client", "src/deadline/job_attachments"] +destinations = ["src/deadline/client"] [tool.hatch.build.targets.sdist] include = ["src/*", "hatch_custom_hook.py", "THIRD_PARTY_LICENSES"] @@ -109,7 +107,7 @@ explicit_package_bases = true mypy_path = "src" [[tool.mypy.overrides]] -module = ["qtpy.*", "boto3.*", "botocore.*", "moto.*", "xxhash"] +module = ["qtpy.*", "boto3.*", "botocore.*", "moto.*"] [[tool.mypy.overrides]] module = "deadline.client.ui.*" @@ -126,13 +124,6 @@ ignore = ["E501"] [tool.ruff.lint.isort] known-first-party = ["deadline"] -[tool.ruff.lint.per-file-ignores] -# We need to use a platform assertion to short-circuit mypy type checking on non-Windows platforms -# https://mypy.readthedocs.io/en/stable/common_issues.html#python-version-and-system-platform-checks -# This causes imports to come after regular Python statements causing flake8 rule E402 to be flagged -"src/deadline/job_attachments/_windows/*.py" = ["E402"] - - # Configuration for pytest; enable coverage for deadline_worker_agent, emit # XML, HTML, and terminal reports. [tool.pytest.ini_options] @@ -151,7 +142,6 @@ looponfailroots = ["src", "test/unit"] markers = [ "no_setup: mark that test shouldn't use default setups", "integ: tests that run against AWS resources", - "docker: marks tests to be run only in a Docker environment", "cross_account: tests that run against other aws accounts", "asyncio: mark test as async", ] @@ -172,12 +162,9 @@ source = ["src/"] [tool.coverage.report] show_missing = true -fail_under = 80 +fail_under = 69 # https://github.com/wemake-services/coverage-conditional-plugin -[tool.coverage.coverage_conditional_plugin.omit] -"sys_platform != 'win32'" = ["src/deadline/job_attachments/_windows/*.py"] - [tool.coverage.coverage_conditional_plugin.rules] # This cannot be empty otherwise coverage-conditional-plugin crashes with: # AttributeError: 'NoneType' object has no attribute 'items' @@ -225,12 +212,3 @@ lstrip_blocks = true [tool.semantic_release.branches.release] match = "(mainline|release|patch_.*)" - -[tool.importlinter] -root_package = "deadline" - -[[tool.importlinter.contracts]] -name = "Job Attachments must not import from Client" -type = "forbidden" -source_modules = ["deadline.job_attachments"] -forbidden_modules = ["deadline.client"] diff --git a/requirements-testing.txt b/requirements-testing.txt index e7d378fca..3693e1bfb 100644 --- a/requirements-testing.txt +++ b/requirements-testing.txt @@ -12,7 +12,6 @@ twine == 6.* mypy == 1.13.*; python_version <= '3.8' mypy == 1.*; python_version > '3.8' ruff == 0.15.* -import-linter == 2.* moto[server] == 5.* jsondiff == 2.* pyinstrument == 5.* diff --git a/scripted_tests/README.md b/scripted_tests/README.md deleted file mode 100644 index bc251192e..000000000 --- a/scripted_tests/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Scripted Tests 
-This directory contains Python scripts designed to test functionalities of Job Attachment module that are hard to cover with unit tests. - -## How to Use -Each script contains its own execution instructions at the top of the file. Please follow those instructions to run the individual tests. diff --git a/scripted_tests/download_cancel_test.py b/scripted_tests/download_cancel_test.py deleted file mode 100644 index 0e5829213..000000000 --- a/scripted_tests/download_cancel_test.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -#! /usr/bin/env python3 -import argparse -import pathlib -from tempfile import TemporaryDirectory -import time -from threading import Thread - -from deadline.job_attachments.asset_sync import AssetSync -from deadline.job_attachments._aws.deadline import get_job, get_queue -from deadline.job_attachments.download import OutputDownloader -from deadline.job_attachments.exceptions import AssetSyncCancelledError - -""" -A testing script to simulate cancellation of (1) syncing inputs, or (2) downloading outputs. - -How to test: - -1. Run the script with the following command for each test: - (1) To test canceling syncing inputs, run the following command: - python3 download_cancel_test.py sync_inputs -f -q -j - (2) To test canceling downloading outputs, run the following command: - python3 download_cancel_test.py download_outputs -f -q -j -2. In the middle of downloading files, you can send a cancel signal by pressing 'k' key - and then pressing 'Enter' key in succession. Confirm that cancelling is working as expected. -""" - -MESSAGE_HOW_TO_CANCEL = ( - "To stop the download process, please hit 'k' key and then 'Enter' key in succession.\n" -) -continue_reporting = True -main_terminated = False - - -def run(): - print(MESSAGE_HOW_TO_CANCEL) - parser = argparse.ArgumentParser(description=MESSAGE_HOW_TO_CANCEL) - parser.add_argument( - "test_to_run", - choices=["sync_inputs", "download_outputs"], - help="Test to run. ('sync_inputs' or 'download_outputs')", - ) - parser.add_argument( - "-f", "--farm-id", type=str, help="Deadline Farm to download assets from.", required=True - ) - parser.add_argument( - "-q", "--queue-id", type=str, help="Deadline Queue to download assets from.", required=True - ) - parser.add_argument( - "-j", "--job-id", type=str, help="Deadline Job to download assets from.", required=True - ) - args = parser.parse_args() - - test_to_run = args.test_to_run - farm_id = args.farm_id - queue_id = args.queue_id - job_id = args.job_id - - if test_to_run == "sync_inputs": - test_sync_inputs(farm_id=farm_id, queue_id=queue_id, job_id=job_id) - elif test_to_run == "download_outputs": - test_download_outputs(farm_id=farm_id, queue_id=queue_id, job_id=job_id) - - -def test_sync_inputs( - farm_id: str, - queue_id: str, - job_id: str, -): - """ - Tests cancellation during execution of the `sync_inputs` function. 
- """ - start_time = time.perf_counter() - - with TemporaryDirectory() as temp_root_dir: - print(f"Created a temporary directory for the test: {temp_root_dir}") - - queue = get_queue(farm_id=farm_id, queue_id=queue_id) - job = get_job(farm_id=farm_id, queue_id=queue_id, job_id=job_id) - - print("Starting test to sync inputs...") - asset_sync = AssetSync(farm_id=farm_id) - - try: - download_start = time.perf_counter() - (summary_statistics, local_roots) = asset_sync.sync_inputs( - s3_settings=queue.jobAttachmentSettings, - attachments=job.attachments, - queue_id=queue_id, - job_id=job_id, - session_dir=pathlib.Path(temp_root_dir), - on_downloading_files=mock_on_downloading_files, - ) - print(f"Download Summary Statistics:\n{summary_statistics}") - print( - f"Finished downloading after {time.perf_counter() - download_start} seconds, returned:\n{local_roots}" - ) - - except AssetSyncCancelledError as asce: - print(f"AssetSyncCancelledError: {asce}") - print(f"payload: {asce.summary_statistics}") - - print(f"\nTotal test runtime: {time.perf_counter() - start_time}") - - print(f"Cleaned up the temporary directory: {temp_root_dir}") - global main_terminated - main_terminated = True - - -def test_download_outputs( - farm_id: str, - queue_id: str, - job_id: str, -): - """ - Tests cancellation during execution of the `download_job_output` function. - """ - start_time = time.perf_counter() - - queue = get_queue(farm_id=farm_id, queue_id=queue_id) - - print("Starting test to download outputs...") - - try: - download_start = time.perf_counter() - output_downloader = OutputDownloader( - s3_settings=queue.jobAttachmentSettings, - farm_id=farm_id, - queue_id=queue_id, - job_id=job_id, - ) - summary_statistics = output_downloader.download_job_output( - on_downloading_files=mock_on_downloading_files - ) - print(f"Download Summary Statistics:\n{summary_statistics}") - print(f"Finished downloading after {time.perf_counter() - download_start} seconds") - - except AssetSyncCancelledError as asce: - print(f"AssetSyncCancelledError: {asce}") - print(f"payload: {asce.summary_statistics}") - - print(f"\nTotal test runtime: {time.perf_counter() - start_time}") - - global main_terminated - main_terminated = True - - -def mock_on_downloading_files(metadata): - print(metadata) - return mock_on_cancellation_check() - - -def mock_on_cancellation_check(): - return continue_reporting - - -def wait_for_cancellation_input(): - while not main_terminated: - ch = input() - if ch == "k": - set_cancelled() - break - - -def set_cancelled(): - global continue_reporting - continue_reporting = False - print("Canceled the process.") - - -if __name__ == "__main__": - t = Thread(target=wait_for_cancellation_input) - t.start() - run() diff --git a/scripted_tests/job_bundles/minimal_job_bundle/template.yaml b/scripted_tests/job_bundles/minimal_job_bundle/template.yaml deleted file mode 100644 index 0b5a29323..000000000 --- a/scripted_tests/job_bundles/minimal_job_bundle/template.yaml +++ /dev/null @@ -1,26 +0,0 @@ -specificationVersion: 'jobtemplate-2023-09' -name: Minimal Sleep Job -description: A minimal job that sleeps for 10 seconds - -steps: -- name: SleepTask - hostRequirements: - attributes: - - name: attr.worker.os.family - anyOf: - - linux - script: - actions: - onRun: - command: bash - args: ['{{Task.File.SleepScript}}'] - embeddedFiles: - - name: SleepScript - type: TEXT - data: | - #!/bin/bash - set -euo pipefail - - echo "Starting minimal job..." 
- sleep 10 - echo "Job completed successfully" diff --git a/scripted_tests/job_bundles/ui_controls_showcase/file1.txt b/scripted_tests/job_bundles/ui_controls_showcase/file1.txt deleted file mode 100644 index 8a15d495a..000000000 --- a/scripted_tests/job_bundles/ui_controls_showcase/file1.txt +++ /dev/null @@ -1,3 +0,0 @@ -This is file1.txt for the PathDropdown parameter. -It's one of the options in the dropdown list. -PathDropdown option 1 content. diff --git a/scripted_tests/job_bundles/ui_controls_showcase/file2.txt b/scripted_tests/job_bundles/ui_controls_showcase/file2.txt deleted file mode 100644 index 9d6cf2d8e..000000000 --- a/scripted_tests/job_bundles/ui_controls_showcase/file2.txt +++ /dev/null @@ -1,3 +0,0 @@ -This is file2.txt for the PathDropdown parameter. -It's another option in the dropdown list. -PathDropdown option 2 content. diff --git a/scripted_tests/job_bundles/ui_controls_showcase/file3.txt b/scripted_tests/job_bundles/ui_controls_showcase/file3.txt deleted file mode 100644 index 988146b56..000000000 --- a/scripted_tests/job_bundles/ui_controls_showcase/file3.txt +++ /dev/null @@ -1,3 +0,0 @@ -This is file3.txt for the PathDropdown parameter. -It's the third option in the dropdown list. -PathDropdown option 3 content. diff --git a/scripted_tests/job_bundles/ui_controls_showcase/hidden_file.txt b/scripted_tests/job_bundles/ui_controls_showcase/hidden_file.txt deleted file mode 100644 index 8e3a38cd3..000000000 --- a/scripted_tests/job_bundles/ui_controls_showcase/hidden_file.txt +++ /dev/null @@ -1,3 +0,0 @@ -This is a hidden file for the HiddenPath parameter. -It won't be visible in the UI but is used by the job. -Hidden parameter demonstration file. diff --git a/scripted_tests/job_bundles/ui_controls_showcase/input.txt b/scripted_tests/job_bundles/ui_controls_showcase/input.txt deleted file mode 100644 index f595ffa10..000000000 --- a/scripted_tests/job_bundles/ui_controls_showcase/input.txt +++ /dev/null @@ -1,3 +0,0 @@ -This is a sample input file for the UI Controls Showcase. -It demonstrates the InputFile PATH parameter. -Created for testing purposes. 
diff --git a/scripted_tests/job_bundles/ui_controls_showcase/template.yaml b/scripted_tests/job_bundles/ui_controls_showcase/template.yaml deleted file mode 100644 index 45aa2c7c4..000000000 --- a/scripted_tests/job_bundles/ui_controls_showcase/template.yaml +++ /dev/null @@ -1,231 +0,0 @@ -specificationVersion: 'jobtemplate-2023-09' -name: UI Controls Showcase -description: Demonstrates all available OpenJD UI control types - -parameterDefinitions: -# STRING type controls -- name: LineEditString - type: STRING - default: "Hello World" - userInterface: - control: LINE_EDIT - label: Line Edit Control - groupLabel: String Controls - description: Single line text input - -- name: MultilineEditString - type: STRING - default: | - Line 1 - Line 2 - Line 3 - userInterface: - control: MULTILINE_EDIT - label: Multiline Edit Control - groupLabel: String Controls - description: Multi-line text input - -- name: StringDropdown - type: STRING - default: "Option2" - allowedValues: ["Option1", "Option2", "Option3"] - userInterface: - control: DROPDOWN_LIST - label: String Dropdown - groupLabel: String Controls - description: Dropdown list with string values - -- name: CheckBoxString - type: STRING - default: "true" - allowedValues: ["true", "false"] - userInterface: - control: CHECK_BOX - label: Checkbox Control - groupLabel: String Controls - description: Boolean checkbox - -- name: HiddenString - type: STRING - default: "secret_value" - userInterface: - control: HIDDEN - description: Hidden string parameter - -# INT type controls -- name: SpinBoxInt - type: INT - default: 42 - minValue: 0 - maxValue: 100 - userInterface: - control: SPIN_BOX - label: Integer Spin Box - groupLabel: Integer Controls - singleStepDelta: 5 - description: Integer spinner with step size 5 - -- name: IntDropdown - type: INT - default: 10 - allowedValues: [5, 10, 15, 20, 25] - userInterface: - control: DROPDOWN_LIST - label: Integer Dropdown - groupLabel: Integer Controls - description: Dropdown with integer values - -- name: HiddenInt - type: INT - default: 999 - userInterface: - control: HIDDEN - description: Hidden integer parameter - -# FLOAT type controls -- name: SpinBoxFloat - type: FLOAT - default: 3.14159 - minValue: 0.0 - maxValue: 10.0 - userInterface: - control: SPIN_BOX - label: Float Spin Box - groupLabel: Float Controls - decimals: 3 - singleStepDelta: 0.1 - description: Float spinner with 3 decimal places - -- name: FloatDropdown - type: FLOAT - default: 2.5 - allowedValues: [1.0, 1.5, 2.0, 2.5, 3.0] - userInterface: - control: DROPDOWN_LIST - label: Float Dropdown - groupLabel: Float Controls - description: Dropdown with float values - -- name: HiddenFloat - type: FLOAT - default: 123.456 - userInterface: - control: HIDDEN - description: Hidden float parameter - -# PATH type controls -- name: InputFile - type: PATH - objectType: FILE - dataFlow: IN - default: "input.txt" - userInterface: - control: CHOOSE_INPUT_FILE - label: Input File Chooser - groupLabel: Path Controls - fileFilters: - - label: Text Files - patterns: ["*.txt", "*.log"] - - label: All Files - patterns: ["*"] - description: Choose an input file - -- name: OutputFile - type: PATH - objectType: FILE - dataFlow: OUT - default: "output.txt" - userInterface: - control: CHOOSE_OUTPUT_FILE - label: Output File Chooser - groupLabel: Path Controls - fileFilters: - - label: Text Files - patterns: ["*.txt"] - - label: All Files - patterns: ["*"] - description: Choose an output file - -- name: DirectoryPath - type: PATH - objectType: DIRECTORY - dataFlow: 
INOUT - default: "work_dir" - userInterface: - control: CHOOSE_DIRECTORY - label: Directory Chooser - groupLabel: Path Controls - description: Choose a working directory - -- name: PathDropdown - type: PATH - objectType: FILE - dataFlow: IN - default: "file1.txt" - allowedValues: ["file1.txt", "file2.txt", "file3.txt"] - userInterface: - control: DROPDOWN_LIST - label: Path Dropdown - groupLabel: Path Controls - description: Dropdown with path values - -- name: HiddenPath - type: PATH - objectType: FILE - dataFlow: IN - default: "hidden_file.txt" - userInterface: - control: HIDDEN - description: Hidden path parameter - -steps: -- name: ShowAllControls - hostRequirements: - attributes: - - name: attr.worker.os.family - anyOf: - - linux - script: - actions: - onRun: - command: bash - args: ['{{Task.File.ShowControlsScript}}'] - embeddedFiles: - - name: ShowControlsScript - type: TEXT - data: | - #!/bin/bash - set -euo pipefail - - echo "=== UI Controls Showcase ===" - echo "" - - echo "STRING CONTROLS:" - echo " LineEditString: '{{Param.LineEditString}}'" - echo " MultilineEditString: '{{Param.MultilineEditString}}'" - echo " StringDropdown: '{{Param.StringDropdown}}'" - echo " CheckBoxString: '{{Param.CheckBoxString}}'" - echo " HiddenString: '{{Param.HiddenString}}'" - echo "" - - echo "INTEGER CONTROLS:" - echo " SpinBoxInt: {{Param.SpinBoxInt}}" - echo " IntDropdown: {{Param.IntDropdown}}" - echo " HiddenInt: {{Param.HiddenInt}}" - echo "" - - echo "FLOAT CONTROLS:" - echo " SpinBoxFloat: {{Param.SpinBoxFloat}}" - echo " FloatDropdown: {{Param.FloatDropdown}}" - echo " HiddenFloat: {{Param.HiddenFloat}}" - echo "" - - echo "PATH CONTROLS:" - echo " InputFile: '{{Param.InputFile}}'" - echo " OutputFile: '{{Param.OutputFile}}'" - echo " DirectoryPath: '{{Param.DirectoryPath}}'" - echo " PathDropdown: '{{Param.PathDropdown}}'" - echo " HiddenPath: '{{Param.HiddenPath}}'" - echo "" - - echo "=== All controls demonstrated successfully ===" diff --git a/scripted_tests/job_bundles/ui_controls_showcase/work_dir/.gitkeep b/scripted_tests/job_bundles/ui_controls_showcase/work_dir/.gitkeep deleted file mode 100644 index 98bf94e21..000000000 --- a/scripted_tests/job_bundles/ui_controls_showcase/work_dir/.gitkeep +++ /dev/null @@ -1,2 +0,0 @@ -# This file ensures the work_dir directory is preserved in git -# The work_dir is used as a working directory for job outputs diff --git a/scripted_tests/job_bundles/with_job_attachments/sample_attachment.txt b/scripted_tests/job_bundles/with_job_attachments/sample_attachment.txt deleted file mode 100644 index d39963f9a..000000000 --- a/scripted_tests/job_bundles/with_job_attachments/sample_attachment.txt +++ /dev/null @@ -1,3 +0,0 @@ -This is a sample attachment file for the AWS Deadline Cloud job bundle. -It demonstrates how job attachments work. 
-File created at: $(date) diff --git a/scripted_tests/job_bundles/with_job_attachments/sample_binary.dat b/scripted_tests/job_bundles/with_job_attachments/sample_binary.dat deleted file mode 100644 index 8990a02e2..000000000 Binary files a/scripted_tests/job_bundles/with_job_attachments/sample_binary.dat and /dev/null differ diff --git a/scripted_tests/job_bundles/with_job_attachments/template.yaml b/scripted_tests/job_bundles/with_job_attachments/template.yaml deleted file mode 100644 index 911fcc8f8..000000000 --- a/scripted_tests/job_bundles/with_job_attachments/template.yaml +++ /dev/null @@ -1,62 +0,0 @@ -specificationVersion: jobtemplate-2023-09 -name: Simple Job with Attachments -description: A minimal job bundle for profiling job attachments with embedded script - -parameterDefinitions: -- name: AttachmentFile - type: PATH - default: sample_attachment.txt - dataFlow: IN - objectType: FILE - description: Sample text attachment file to demonstrate job attachments - -- name: BinaryAttachment - type: PATH - default: sample_binary.dat - dataFlow: IN - objectType: FILE - description: Sample binary attachment file to demonstrate binary file handling - -steps: -- name: ProcessAttachments - script: - actions: - onRun: - command: bash - args: ['{{Task.File.MainScript}}'] - embeddedFiles: - - name: MainScript - type: TEXT - data: | - #!/bin/bash - echo "=== Job Attachments Demo ===" - echo "Script started at: $(date)" - echo "Working directory: $(pwd)" - echo "" - - echo "=== TEXT ATTACHMENT ===" - echo "Text attachment file: {{Param.AttachmentFile}}" - if [ -f "{{Param.AttachmentFile}}" ]; then - echo "Found text attachment file, contents:" - cat "{{Param.AttachmentFile}}" - else - echo "Text attachment file not found" - fi - echo "" - - echo "=== BINARY ATTACHMENT ===" - echo "Binary attachment file: {{Param.BinaryAttachment}}" - if [ -f "{{Param.BinaryAttachment}}" ]; then - echo "Found binary attachment file" - echo "File size: $(wc -c < "{{Param.BinaryAttachment}}") bytes" - echo "File type: $(file "{{Param.BinaryAttachment}}")" - echo "Hex dump (first 32 bytes):" - hexdump -C "{{Param.BinaryAttachment}}" | head -2 - else - echo "Binary attachment file not found" - fi - echo "" - - echo "Sleeping for 10 seconds..." - sleep 10 - echo "Job completed at: $(date)" diff --git a/scripted_tests/profiling/README.md b/scripted_tests/profiling/README.md deleted file mode 100644 index 80230ce35..000000000 --- a/scripted_tests/profiling/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# Profiling - -The application level profiling tests submit job bundles with invocations wrapped by pyinstrument to generate -profiling data. - -## Running - -You will need a farm and a queue that you can submit jobs to. - -### bash/zsh -```sh -# Replace values with your own -export AWS_DEFAULT_REGION=us-west-2 -export AWS_DEFAULT_PROFILE=myprofile -export FARM_ID=farm-myfarmid -export QUEUE_ID=queue-myqueueid - -uv venv .venv-profiling -source .venv-profiling/bin/activate -uv pip install -e . -uv pip install -r requirements-testing.txt - -python scripted_tests/profiling/profiling.py --output-dir path/where/you/want/output -``` - -### fish -```fish -# Replace values with your own -set -x AWS_DEFAULT_REGION us-west-2 -set -x AWS_DEFAULT_PROFILE myprofile -set -x FARM_ID farm-myfarmid -set -x QUEUE_ID queue-myqueueid - -uv venv .venv-profiling -source .venv-profiling/bin/activate.fish -uv pip install -e . 
-uv pip install -r requirements-testing.txt - -python scripted_tests/profiling/profiling.py --output-dir path/where/you/want/output -``` - -After running the script your output will be in the directory you specified. By default, the output will be an -html file for each test. - diff --git a/scripted_tests/profiling/profiling.py b/scripted_tests/profiling/profiling.py deleted file mode 100644 index 7ecd752e4..000000000 --- a/scripted_tests/profiling/profiling.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -import subprocess - -from enum import Enum, unique -from pathlib import Path -from typing import Optional, Union - -import click - - -@unique -class OutputFormat(Enum): - HTML = "html" - JSON = "json" - - -def profile( - name: str, - farm_id: str, - queue_id: str, - job_bundle: Path, - output_dir: Path, - output_format: OutputFormat, - parameters: Optional[dict[str, Union[int, float, str, Path]]] = None, -) -> None: - expanded_params = [ - ["--parameter", f"{key}={str(value)}"] - for key, value in (parameters.values() if parameters is not None else {}) - ] - subprocess.run( - [ - "pyinstrument", - "--renderer", - str(output_format.value).lower(), - "--outfile", - str(output_dir / f"{name}.{str(output_format.value).lower()}"), - "--from-path", - "deadline", - "bundle", - "submit", - str(job_bundle), - "--farm-id", - farm_id, - "--queue-id", - queue_id, - *expanded_params, - "--yes", - ], - input="y\n", - text=True, - check=True, - ) - - -@click.command() -@click.option("--farm-id", envvar="FARM_ID", type=str, required=True, help="The farm to submit to") -@click.option( - "--queue-id", envvar="QUEUE_ID", type=str, required=True, help="The queue to submit to" -) -@click.option("--output-dir", type=Path, required=True, help="The prefix to output the results to") -@click.option( - "--output-format", - type=click.Choice(OutputFormat), - default=OutputFormat.HTML, - help="The format the output should be in", -) -def cli(farm_id: str, queue_id: str, output_dir: Path, output_format: OutputFormat) -> None: - job_bundle_dir = Path(__file__).parent.parent / "job_bundles" - if not output_dir.is_dir(): - output_dir.mkdir(parents=True) - profile( - "minimal_job_bundle", - farm_id, - queue_id, - job_bundle_dir / "minimal_job_bundle", - output_dir, - output_format, - ) - profile( - "with_job_attachments", - farm_id, - queue_id, - job_bundle_dir / "with_job_attachments", - output_dir, - output_format, - ) - - -if __name__ == "__main__": - cli() diff --git a/scripted_tests/set_file_permission_for_windows.py b/scripted_tests/set_file_permission_for_windows.py deleted file mode 100644 index 4b8efd7cc..000000000 --- a/scripted_tests/set_file_permission_for_windows.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -import argparse -import os -from pathlib import Path -from tempfile import TemporaryDirectory -import time -from typing import Tuple -import ntsecuritycon as con -import win32security - -from deadline.job_attachments.os_file_permission import ( - WindowsFileSystemPermissionSettings, - WindowsPermissionEnum, - _set_fs_permission_for_windows, -) - -""" -This script is to test a `_set_fs_permission_for_windows()` function in Job Attachment module, -which is for setting file permissions and ownership on Windows. - -Prerequisites -------------- -Before running the test, prepare a target user and a disjoint user. 
- -How to Run ----------- -To execute this script, run the following command from the root location: -python ./scripted_tests/set_file_permission_for_windows.py \ - -n \ - -f \ - -d \ - -u \ - -du \ - -ex - -Note: The `-f` and `-d` flags are optional. - -Then, the command will do the following: -1. Installs `pywin32`, which is a required package for the testing. -2. Creates a temporary directory and creates the specified number of files in it. -2. The script will add the given target user to the owner list for the specified files. -3. It will then verify (1) whether the target user has Read/Write access to these files, - and (2) that the disjoint user does not have access. - -Example Output --------------- -Created a temporary directory for the test: C:... -Creating temporary files... -Temporary files created. -Running test: Setting file permissions and group ownership... -File permissions and group ownership set. -Total running time for 10 files: 0.01644610000000002 -Checking file permissions... -Verified that file permissions are correctly set. -Cleaned up the temporary directory: C:... -End of test execution. -""" - - -def run_test(): - parser = argparse.ArgumentParser() - parser.add_argument("-n", "--num_files", type=int, required=True) - parser.add_argument("-f", "--file_permission", required=False, type=str, default="FULL_CONTROL") - parser.add_argument("-d", "--dir_permission", required=False, type=str, default="FULL_CONTROL") - parser.add_argument("-u", "--target_user", required=True, type=str) - parser.add_argument("-du", "--disjoint_user", required=True, type=str) - parser.add_argument( - "-ex", - "--use_extended_paths", - action="store_true", - help="Use extended-length file paths (\\\\?\\)", - ) - - args = parser.parse_args() - - num_files = args.num_files - file_permission = WindowsPermissionEnum(args.file_permission.upper()) - dir_permission = WindowsPermissionEnum(args.dir_permission.upper()) - - with TemporaryDirectory() as temp_root_dir: - print(f"Created a temporary directory for the test: {temp_root_dir}") - - print("Creating temporary files...") - files = [] - for i in range(0, num_files): - sub_dir = Path(temp_root_dir) / "sub_directory" - sub_dir.mkdir(parents=True, exist_ok=True) - if i < num_files / 2: - file_path = Path(temp_root_dir) / f"test{i}.txt" - else: - file_path = sub_dir / f"test{i}.txt" - if not os.path.exists(file_path): - with file_path.open("w", encoding="utf-8") as f: - f.write(f"test: {i}") - - file_path_str = ( - _to_extended_path(file_path) if args.use_extended_paths else str(file_path) - ) - files.append(str(file_path_str)) - - print("Temporary files created.") - print("Running test: Setting file permissions...") - start_time = time.perf_counter() - - fs_permission_settings = WindowsFileSystemPermissionSettings( - os_user=args.target_user, - dir_mode=dir_permission, - file_mode=file_permission, - ) - _set_fs_permission_for_windows( - file_paths=files, - local_root=temp_root_dir, - fs_permission_settings=fs_permission_settings, - ) - print("File permissions set.") - print(f"Total running time for {num_files} files: {time.perf_counter() - start_time}") - - print("Checking file permissions...") - for path in files: - assert check_file_permission(path, args.target_user) == (True, True) - assert check_file_permission(path, args.disjoint_user) == (False, False) - print("Verified that file permissions are correctly set.") - - print(f"Cleaned up the temporary directory: {temp_root_dir}") - - -def check_file_permission(file_path, username) -> Tuple[bool, 
bool]: - # Get the file's security information - sd = win32security.GetFileSecurity(file_path, win32security.DACL_SECURITY_INFORMATION) - - # Get the discretionary access control list (DACL) - dacl = sd.GetSecurityDescriptorDacl() - - # Lookup the user's SID (Security Identifier) - sid, _, _ = win32security.LookupAccountName("", username) - - # Trustee - trustee = { - "TrusteeForm": win32security.TRUSTEE_IS_SID, - "TrusteeType": win32security.TRUSTEE_IS_USER, - "Identifier": sid, - } - - # Get effective rights - result = dacl.GetEffectiveRightsFromAcl(trustee) - - # Return a tuple of (has read access, has write access) - return (bool(result & con.FILE_GENERIC_READ), bool(result & con.FILE_GENERIC_WRITE)) - - -def _to_extended_path(path: Path) -> str: - # Convert to absolute and apply extended-length prefix - return f"\\\\?\\{path.resolve()}" - - -if __name__ == "__main__": - run_test() - print("End of test execution.") diff --git a/scripted_tests/sync_inputs_with_step_deps.py b/scripted_tests/sync_inputs_with_step_deps.py deleted file mode 100644 index c25352661..000000000 --- a/scripted_tests/sync_inputs_with_step_deps.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -#! /usr/bin/env python3 -import argparse -import pathlib -from pprint import pprint -from tempfile import TemporaryDirectory -import time - -from deadline.job_attachments.asset_sync import AssetSync -from deadline.job_attachments._aws.deadline import get_job, get_queue - -""" -A script to manually test that input syncing is functioning well in scenarios where -there are step-step dependencies within a job. The AWS account to be tested should -have an S3 bucket set up for Job Attachments, and inside the bucket, prepare a job -that has assets and outputs on two or more different steps. - -How to test: - -1. Run the script with the following command: - python3 sync_inputs_with_step_deps.py -f -q -j -s -2. See the logs on the console to confirm whether the expected files have been - downloaded to the temporary (session) directory. (This directory will be deleted - when the test is finished.) -""" - - -def run(): - parser = argparse.ArgumentParser() - parser.add_argument( - "-f", "--farm-id", type=str, help="Deadline Farm to download assets from.", required=True - ) - parser.add_argument( - "-q", "--queue-id", type=str, help="Deadline Queue to download assets from.", required=True - ) - parser.add_argument( - "-j", "--job-id", type=str, help="Deadline Job to download assets from.", required=True - ) - parser.add_argument( - "-s", - "--step-ids", - nargs="+", - type=str, - help="IDs of steps to sync inputs from", - required=False, - ) - args = parser.parse_args() - - farm_id = args.farm_id - queue_id = args.queue_id - job_id = args.job_id - step_ids = args.step_ids - - test_sync_inputs(farm_id, queue_id, job_id, step_ids) - - -def test_sync_inputs( - farm_id: str, - queue_id: str, - job_id: str, - step_ids: list[str], -): - """ - Downloads all inputs for a given job, and the outputs of the provided steps within the job. 
- """ - with TemporaryDirectory() as temp_root_dir: - print(f"Created a temporary directory for the test: {temp_root_dir}\n") - - queue = get_queue(farm_id=farm_id, queue_id=queue_id) - job = get_job(farm_id=farm_id, queue_id=queue_id, job_id=job_id) - - print("Starting test to sync inputs...\n") - asset_sync = AssetSync(farm_id=farm_id) - - download_start = time.perf_counter() - - (summary_statistics, local_roots) = asset_sync.sync_inputs( - s3_settings=queue.jobAttachmentSettings, - attachments=job.attachments, - queue_id=queue_id, - job_id=job_id, - session_dir=pathlib.Path(temp_root_dir), - step_dependencies=step_ids, - ) - - print(f"Download Summary Statistics:\n{summary_statistics}") - print( - f"Finished downloading after {time.perf_counter() - download_start} seconds, returned:" - ) - pprint(local_roots) - - print("\nListing files in the temporary directory:") - for pathmapping in local_roots: - all_files = _get_files_list_recursively(pathlib.Path(pathmapping["destination_path"])) - for file in all_files: - print(file) - - print(f"\nCleaned up the temporary directory: {temp_root_dir}") - - -def _get_files_list_recursively(directory: pathlib.Path): - files_list = [] - - for file in directory.iterdir(): - if file.is_file(): - files_list.append(file) - - for subdirectory in directory.iterdir(): - if subdirectory.is_dir(): - subdirectory_files = _get_files_list_recursively(subdirectory) - files_list.extend(subdirectory_files) - - return files_list - - -if __name__ == "__main__": - run() diff --git a/scripted_tests/upload_cancel_test.py b/scripted_tests/upload_cancel_test.py deleted file mode 100644 index 53a621d0f..000000000 --- a/scripted_tests/upload_cancel_test.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -#! /usr/bin/env python3 -import argparse -import os -import pathlib -import time -from threading import Thread - -from deadline.job_attachments._aws.deadline import get_queue -from deadline.job_attachments.exceptions import AssetSyncCancelledError -from deadline.job_attachments.upload import S3AssetManager - -""" -A testing script to simulate cancelling a hash/upload of assets. -It creates a large amount of local text files and uploads them to the S3 bucket -configured for the given Farm's Queue. - -How to test: - -1. Run this script with the following command: - python3 upload_cancel_test.py -f -q -2. In the middle of hashing or uploading those files, you can send a cancel - signal by pressing 'k' and Enter keys in succession. Confirm that cancelling - is working as expected by checking the console output. - -Note: This script generates test files in the /tmp/test_submit directory for testing -purpose. But it does not delete these files after the test is completed. 
-""" - -MESSAGE_HOW_TO_CANCEL = ( - "To stop the hash/upload process, please hit 'k' key and then 'Enter' key in succession.\n" -) - -NUM_TINY_FILES = 0 -NUM_SMALL_FILES = 0 -NUM_MEDIUM_FILES = 0 -NUM_LARGE_FILES = 1 - -continue_reporting = True -main_terminated = False - - -def run(): - print(MESSAGE_HOW_TO_CANCEL) - start_time = time.perf_counter() - - parser = argparse.ArgumentParser() - parser.add_argument( - "-f", "--farm-id", type=str, help="Deadline Farm you want to submit to.", required=True - ) - parser.add_argument( - "-q", "--queue-id", type=str, help="Deadline Queue you want to submit to.", required=True - ) - args = parser.parse_args() - - farm_id = args.farm_id - queue_id = args.queue_id - - print("Setting up the test...") - - files = [] - root_path = pathlib.Path("/tmp/test_submit") - root_path.mkdir(parents=True, exist_ok=True) - - if NUM_TINY_FILES > 0: - for i in range(0, NUM_TINY_FILES): - file_path = root_path / f"tiny_test{i}.txt" - if not os.path.exists(file_path): - with file_path.open("wb") as f: - f.write(os.urandom(2 * (1024**2))) # 2 MB files - files.append(str(file_path)) - - # Make small files - if NUM_SMALL_FILES > 0: - for i in range(0, NUM_SMALL_FILES): - file_path = root_path / f"small_test{i}.txt" - if not os.path.exists(file_path): - with file_path.open("wb") as f: - f.write(os.urandom(10 * (1024**2))) # 10 MB files - files.append(str(file_path)) - - # Make medium-sized files - if NUM_MEDIUM_FILES > 0: - for i in range(0, NUM_MEDIUM_FILES): - file_path = root_path / f"medium_test{i}.txt" - if not os.path.exists(file_path): - with file_path.open("wb") as f: - f.write(os.urandom(100 * (1024**2))) # 100 MB files - files.append(str(file_path)) - - # Make large files - if NUM_LARGE_FILES > 0: - for i in range(0, NUM_LARGE_FILES): - file_path = root_path / f"large_test{i}.txt" - if not os.path.exists(file_path): - with file_path.open("ab") as f: - _create_large_file_with_chunks(file_path, 20 * (1024**3), 10**9) - files.append(str(file_path)) - - queue = get_queue(farm_id=farm_id, queue_id=queue_id) - asset_manager = S3AssetManager( - farm_id=farm_id, queue_id=queue_id, job_attachment_settings=queue.jobAttachmentSettings - ) - - print("\nStarting test...") - start = time.perf_counter() - - try: - print("\nStart hashing...") - upload_group = asset_manager.prepare_paths_for_upload( - ".", files, [root_path / "outputs"], [] - ) - (summary_statistics_hashing, manifests) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - print(f"Hashing Summary Statistics:\n{summary_statistics_hashing}") - - print("\nStart uploading...") - (summary_statistics_upload, attachment_settings) = asset_manager.upload_assets( - manifests, on_uploading_assets=mock_on_uploading_assets - ) - print(f"Upload Summary Statistics:\n{summary_statistics_upload}") - - total = time.perf_counter() - start - print( - f"Finished uploading after {total} seconds, created these attachment settings:\n{attachment_settings.to_dict()}" - ) - except AssetSyncCancelledError as asce: - print(f"AssetSyncCancelledError: {asce}") - print(f"payload:\n{asce.summary_statistics}") - - print(f"\nTotal test runtime: {time.perf_counter() - start_time}") - - global main_terminated - main_terminated = True - - -def _create_large_file_with_chunks(file_path: str, total_size: int, chunk_size: int) -> None: - """ - Creates a large 
file of a given total size by writing in chunks with random data. - It prevents MemoryError by dividing the size into manageable chunks and writing - each chunk sequentially. - """ - with open(file_path, "wb") as f: - num_chunks = total_size // chunk_size - for _ in range(num_chunks): - f.write(os.urandom(chunk_size)) - remaining = total_size % chunk_size - if remaining > 0: - f.write(os.urandom(remaining)) - - -def mock_on_preparing_to_submit(metadata): - print(metadata) - return mock_on_cancellation_check() - - -def mock_on_uploading_assets(metadata): - print(metadata) - return mock_on_cancellation_check() - - -def mock_on_cancellation_check(): - return continue_reporting - - -def wait_for_cancellation_input(): - while not main_terminated: - ch = input() - if ch == "k": - set_cancelled() - break - - -def set_cancelled(): - global continue_reporting - continue_reporting = False - print("Canceled the process.") - - -if __name__ == "__main__": - t = Thread(target=wait_for_cancellation_input) - t.start() - run() diff --git a/scripted_tests/upload_scale_test.py b/scripted_tests/upload_scale_test.py deleted file mode 100644 index 1e10a9015..000000000 --- a/scripted_tests/upload_scale_test.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -#! /usr/bin/env python3 -import argparse -import os -import pathlib -import sys -import time - -from deadline.job_attachments._aws.deadline import get_queue -from deadline.job_attachments.download import download_files_from_manifests, get_manifest_from_s3 -from deadline.job_attachments.models import S3_MANIFEST_FOLDER_NAME -from deadline.job_attachments.upload import S3AssetManager - -NUM_SMALL_FILES = 2000 -NUM_MEDIUM_FILES = 2000 -NUM_LARGE_FILES = 0 - -""" -A simple scale testing script for measuring input file upload and hashing speed. -Creates a large amount of local text files and uploads them to the S3 bucket configured -for the given Farm's Queue. - -Optionally, downloads the same files that were uploaded, to a different directory. 
- -Example usage: - -- You can run this command (assuming you have a Farm configured with a Queue): - python3 upload_scale_test.py -f $FARM_ID -q $QUEUE_ID - -- You can profile this by running with cProfile: - python -m cProfile -o profile.prof upload_scale_test.py -f $FARM_ID -q $QUEUE_ID - -- You can then visualize the data by running it through a tool like 'snakeviz' (just pip install): - snakeviz profile.prof -""" - -if __name__ == "__main__": - start_time = time.perf_counter() - - parser = argparse.ArgumentParser() - parser.add_argument( - "-f", "--farm-id", type=str, help="Deadline Farm you want to submit to.", required=True - ) - parser.add_argument( - "-q", "--queue-id", type=str, help="Deadline Queue you want to submit to.", required=True - ) - parser.add_argument( - "-sd", - "--skip-download", - help="Specify this flag to skip the download step", - required=False, - action="store_true", - ) - parser.add_argument( - "-so", - "--setup-only", - help="Specify this flag to only generate local files for setup", - required=False, - action="store_true", - ) - args = parser.parse_args() - - farm_id = args.farm_id - queue_id = args.queue_id - - print("Setting up the test...") - - files = [] - root_path = pathlib.Path("/tmp/test_submit") - make_test_files = not root_path.exists() - root_path.mkdir(parents=True, exist_ok=True) - - # Make a ton of small files - if NUM_SMALL_FILES > 0: - for i in range(0, NUM_SMALL_FILES): - file_path = root_path / f"small_test{i}.txt" - if not os.path.exists(file_path): - with file_path.open("w", encoding="utf-8") as f: - f.write(f"test value: {i}") - files.append(str(file_path)) - - # Make 100GB worth of 5MB files - if NUM_MEDIUM_FILES > 0: - for i in range(0, NUM_MEDIUM_FILES): - file_path = root_path / f"medium_test{i}.txt" - if not os.path.exists(file_path): - with file_path.open("wb") as f: - f.write(os.urandom(5242880)) # 5 MB files - files.append(str(file_path)) - - # Make a 100GB file to test large file sizes (100 GB each) - if NUM_LARGE_FILES > 0: - for i in range(0, NUM_LARGE_FILES): - file_path = root_path / f"large_test{i}.txt" - if not os.path.exists(file_path): - for i in range(100): # Let's make it 100 GB for now - with file_path.open("ab") as f: - f.write(os.urandom(1073741824)) # Write 1 GB at a time - files.append(str(file_path)) - - if args.setup_only: - print("\nFinished setup, exiting.") - sys.exit() - - queue = get_queue(farm_id=farm_id, queue_id=queue_id) - asset_manager = S3AssetManager( - farm_id=farm_id, queue_id=queue_id, job_attachment_settings=queue.jobAttachmentSettings - ) - - print("\nStarting upload test...") - start = time.perf_counter() - - upload_group = asset_manager.prepare_paths_for_upload(".", files, [root_path / "outputs"], []) - (summary_statistics_hashing, manifests) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - ) - print(f"Summary Statistics for file hashing:\n{summary_statistics_hashing}") - - (summary_statistics_upload, attachment_settings) = asset_manager.upload_assets(manifests) - print(f"Summary Statistics for file uploads:\n{summary_statistics_upload}") - - total = time.perf_counter() - start - print( - f"Finished uploading after {total} seconds, created these attachment settings:\n{attachment_settings.to_dict()}" - ) - - if not args.skip_download: - print("\nStarting download test...") - start = time.perf_counter() - manifest_key = 
f"{queue.jobAttachmentSettings.rootPrefix}/{S3_MANIFEST_FOLDER_NAME}/{attachment_settings.manifests[0].inputManifestPath}" - asset_manifest = get_manifest_from_s3( - manifest_key, queue.jobAttachmentSettings.s3BucketName - ) - - download_files_from_manifests( - s3_bucket=queue.jobAttachmentSettings.s3BucketName, - manifests_by_root={"/tmp/test_download": asset_manifest}, - cas_prefix=queue.jobAttachmentSettings.full_cas_prefix(), - ) - total = time.perf_counter() - start - print(f"Finished downloading after {total} seconds") - - print(f"\nTotal test runtime: {time.perf_counter() - start_time}") diff --git a/scripts/attributions/cli.py b/scripts/attributions/cli.py index a29bb2b7b..637e0c6cc 100644 --- a/scripts/attributions/cli.py +++ b/scripts/attributions/cli.py @@ -621,7 +621,8 @@ def _get_license_info(python_interpreter: PythonInstall, dev: bool) -> list[_Pac pip_license_info, ) for pip_license_info in pip_licenses_parsed - if pip_license_info["Name"] != "deadline" + if pip_license_info["Name"] not in _BUNDLED_OWN_PACKAGES + and pip_license_info["Name"].replace("-", "_") not in _BUNDLED_OWN_PACKAGES ] diff --git a/scripts/run_sudo_tests.sh b/scripts/run_sudo_tests.sh deleted file mode 100755 index 986cb6ec5..000000000 --- a/scripts/run_sudo_tests.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -set -eu - -# Run this from the root of the repository -if ! test -d scripts -then - echo "Must run from the root of the repository" - exit 1 -fi - -DO_BUILD="False" -BUILD_ONLY="False" -while [[ "${1:-}" != "" ]]; do - case $1 in - -h|--help) - echo "Usage: run_sudo_tests.sh [--build]" - exit 1 - ;; - --build) - DO_BUILD="True" - ;; - --build-only) - BUILD_ONLY="True" - ;; - *) - echo "Unrecognized parameter: $1" - exit 1 - ;; - esac - shift -done - -# Copying the dist/ dir can cause permission issues, so just nuke it. -hatch clean 2> /dev/null || true - -ARGS="" - -if test "${PIP_INDEX_URL:-}" != ""; then - # If PIP_INDEX_URL is set, then export that in to the container - # so that `pip install` run in the container will fetch packages - # from the correct repository. - ARGS="${ARGS} -e PIP_INDEX_URL=${PIP_INDEX_URL}" -fi - -ARGS="${ARGS} -h localuser.environment.internal" -CONTAINER_IMAGE_TAG="job_attachment_localuser_test" -CONTAINER_IMAGE_DIR="localuser_sudo_environment" - -if test "${DO_BUILD}" == "True"; then - docker build testing_containers/"${CONTAINER_IMAGE_DIR}" -t "${CONTAINER_IMAGE_TAG}" -fi - -if test "${BUILD_ONLY}" == "True"; then - exit 0 -fi - -docker run --name test_sudo --rm -v $(pwd):/code:ro ${ARGS} "${CONTAINER_IMAGE_TAG}":latest diff --git a/src/deadline/job_attachments/__init__.py b/src/deadline/job_attachments/__init__.py deleted file mode 100644 index 6c0019222..000000000 --- a/src/deadline/job_attachments/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -from ._version import __version__ as version # noqa - -__all__ = [ - "version", -] diff --git a/src/deadline/job_attachments/_aws/__init__.py b/src/deadline/job_attachments/_aws/__init__.py deleted file mode 100644 index 8d929cc86..000000000 --- a/src/deadline/job_attachments/_aws/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
diff --git a/src/deadline/job_attachments/_aws/aws_clients.py b/src/deadline/job_attachments/_aws/aws_clients.py deleted file mode 100644 index 7d004e1ec..000000000 --- a/src/deadline/job_attachments/_aws/aws_clients.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""Functions for handling and retrieving AWS clients.""" - -from __future__ import annotations - -from functools import lru_cache -from typing import Optional - -import boto3 -import botocore -from boto3.s3.transfer import create_transfer_manager -from botocore.client import BaseClient, Config - -from .. import version -from .aws_config import ( - S3_CONNECT_TIMEOUT_IN_SECS, - S3_READ_TIMEOUT_IN_SECS, - S3_RETRIES_MODE, - VENDOR_CODE, -) - -MAX_SIZE_CACHE = 128 - - -# Should create a new botocore session since botocore session may be modified by boto3 session/client using it -# https://github.com/boto/boto3/blob/61de529b5f9a7bdcc8c76debb472a7f934d048e6/boto3/session.py#L79 -def get_botocore_session() -> botocore.session.Session: - session = botocore.session.get_session() - # Use regional endpoints by default for STS and S3 (us-east-1) to avoid - # cross-region calls to the global endpoint. This is the default in newer verisons, - # but older botocore versions default to "legacy" which routes through us-east-1. - session.set_config_variable("sts_regional_endpoints", "regional") - session.set_config_variable("s3", {"us_east_1_regional_endpoint": "regional"}) - return session - - -@lru_cache(maxsize=MAX_SIZE_CACHE) -def get_boto3_session( - botocore_session: botocore.session.Session = get_botocore_session(), -) -> boto3.session.Session: - return boto3.session.Session(botocore_session=botocore_session) - - -@lru_cache(maxsize=MAX_SIZE_CACHE) -def get_deadline_client( - session: Optional[boto3.session.Session] = None, endpoint_url: Optional[str] = None -) -> BaseClient: - """ - Get a boto3 Deadline client to make API calls to Deadline - """ - if session is None: - session = get_boto3_session() - - return session.client(VENDOR_CODE, endpoint_url=endpoint_url) - - -@lru_cache(maxsize=MAX_SIZE_CACHE) -def get_s3_client( - session: Optional[boto3.Session] = None, s3_max_pool_connections: int = 50 -) -> BaseClient: - """ - Get a boto3 S3 client to make API calls to S3 - """ - if session is None: - session = get_boto3_session() - - client = session.client( - "s3", - config=Config( - signature_version="s3v4", - connect_timeout=S3_CONNECT_TIMEOUT_IN_SECS, - read_timeout=S3_READ_TIMEOUT_IN_SECS, - retries={"mode": S3_RETRIES_MODE}, - user_agent_extra=f"S3A/Deadline/NA/JobAttachments/{version}", - max_pool_connections=s3_max_pool_connections, - ), - ) - - def add_expected_bucket_owner(params, model, **kwargs): - """ - Add the expected bucket owner to the params if the API operation to run can use it. - """ - if "ExpectedBucketOwner" in model.input_shape.members: - params["ExpectedBucketOwner"] = get_account_id(session=session) - - client.meta.events.register("provide-client-params.s3.*", add_expected_bucket_owner) - - return client - - -@lru_cache(maxsize=MAX_SIZE_CACHE) -def get_s3_transfer_manager(s3_client: BaseClient): - transfer_config = boto3.s3.transfer.TransferConfig() - return create_transfer_manager(client=s3_client, config=transfer_config) - - -@lru_cache(maxsize=MAX_SIZE_CACHE) -def get_sts_client(session: Optional[boto3.session.Session] = None) -> BaseClient: - """ - Get a boto3 sts client to make API calls to STS. 
- """ - if session is None: - session = get_boto3_session() - - return session.client("sts") - - -@lru_cache(maxsize=MAX_SIZE_CACHE) -def get_caller_identity( - session: Optional[boto3.session.Session] = None, -) -> dict[str, str]: - """ - Get the caller identity for the current session. - """ - return get_sts_client(session).get_caller_identity() - - -def get_account_id(session: Optional[boto3.session.Session] = None) -> str: - """ - Get the account id for the current session. - """ - return get_caller_identity(session)["Account"] diff --git a/src/deadline/job_attachments/_aws/aws_config.py b/src/deadline/job_attachments/_aws/aws_config.py deleted file mode 100644 index 29820456b..000000000 --- a/src/deadline/job_attachments/_aws/aws_config.py +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""AWS configuration.""" - -VENDOR_CODE: str = "deadline" - -# S3 related -S3_CONNECT_TIMEOUT_IN_SECS: int = 30 -S3_READ_TIMEOUT_IN_SECS: int = 30 -S3_RETRIES_MODE: str = "standard" diff --git a/src/deadline/job_attachments/_aws/deadline.py b/src/deadline/job_attachments/_aws/deadline.py deleted file mode 100644 index 42e3fb414..000000000 --- a/src/deadline/job_attachments/_aws/deadline.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""Functions for interfacing with Deadline API calls.""" - -from typing import Optional - -import boto3 -from botocore.exceptions import ClientError - -from ..exceptions import JobAttachmentsError -from ..models import ( - Attachments, - JobAttachmentsFileSystem, - Job, - JobAttachmentS3Settings, - ManifestProperties, - PathFormat, - Queue, -) -from .aws_clients import get_deadline_client - - -def get_queue( - farm_id: str, - queue_id: str, - session: Optional[boto3.Session] = None, - deadline_endpoint_url: Optional[str] = None, -) -> Queue: - """ - Retrieves a specific queue from AWS Deadline Cloud. - """ - try: - response = get_deadline_client( - session=session, endpoint_url=deadline_endpoint_url - ).get_queue(farmId=farm_id, queueId=queue_id) - except ClientError as exc: - raise JobAttachmentsError(f'Failed to get queue "{queue_id}" from Deadline: {exc}') from exc - - # The API returns empty fields instead of an empty dict if there are no job attachment settings. So we need to - # double check if the s3BucketName is set. - if response.get("jobAttachmentSettings") and response["jobAttachmentSettings"].get( - "s3BucketName" - ): - job_attachment_settings = JobAttachmentS3Settings( - s3BucketName=response["jobAttachmentSettings"].get("s3BucketName", ""), - rootPrefix=response["jobAttachmentSettings"].get("rootPrefix", ""), - ) - else: - job_attachment_settings = None - - display_name_key = "displayName" - status_key = "status" - if "name" in response: - display_name_key = "name" - if "state" in response: - status_key = "state" - - return Queue( - displayName=response[display_name_key], - queueId=response["queueId"], - farmId=response["farmId"], - status=response[status_key], - defaultBudgetAction=response["defaultBudgetAction"], - jobAttachmentSettings=job_attachment_settings, - ) - - -def get_job( - farm_id: str, - queue_id: str, - job_id: str, - session: Optional[boto3.Session] = None, - deadline_endpoint_url: Optional[str] = None, -) -> Job: - """ - Retrieves a specific job from AWS Deadline Cloud. 
- """ - try: - response = get_deadline_client(session=session, endpoint_url=deadline_endpoint_url).get_job( - farmId=farm_id, queueId=queue_id, jobId=job_id - ) - except ClientError as exc: - raise JobAttachmentsError(f'Failed to get job "{job_id}" from Deadline') from exc - return Job( - jobId=response["jobId"], - attachments=( - Attachments( - manifests=[ - ManifestProperties( - fileSystemLocationName=manifest_properties.get( - "fileSystemLocationName", None - ), - rootPath=manifest_properties["rootPath"], - rootPathFormat=PathFormat(manifest_properties["rootPathFormat"]), - outputRelativeDirectories=manifest_properties.get( - "outputRelativeDirectories", None - ), - inputManifestPath=manifest_properties.get("inputManifestPath", None), - ) - for manifest_properties in response["attachments"]["manifests"] - ], - fileSystem=JobAttachmentsFileSystem( - response["attachments"].get("fileSystem", JobAttachmentsFileSystem.COPIED.value) - ), - ) - if "attachments" in response and response["attachments"] - else None - ), - ) diff --git a/src/deadline/job_attachments/_diff.py b/src/deadline/job_attachments/_diff.py deleted file mode 100644 index 98869525a..000000000 --- a/src/deadline/job_attachments/_diff.py +++ /dev/null @@ -1,301 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -import concurrent.futures - -import logging -import os -from pathlib import Path, PurePosixPath -from typing import Any, Callable, Dict, List, Optional, Tuple -from math import trunc -from deadline.job_attachments.exceptions import NonValidInputError -from deadline.job_attachments.asset_manifests.base_manifest import ( - BaseAssetManifest, - BaseManifestPath, -) -from deadline.job_attachments.caches.hash_cache import HashCache -from deadline.job_attachments.models import AssetRootManifest, FileStatus, ManifestDiff -from deadline.job_attachments.upload import S3AssetManager - - -def diff_manifest( - asset_manager: S3AssetManager, - asset_root_manifest: AssetRootManifest, - manifest: str, - update: bool, -) -> List[(Tuple[FileStatus, BaseManifestPath])]: - """ - Gets the file paths in specified manifest if the contents of file have changed since its last snapshot. - Returns a list of FileStatus and BaseManifestPath - """ - manifest_dir_name: str = os.path.basename(manifest) - root_path: str = asset_root_manifest.root_path - input_paths: List[Path] = [] - - asset_manifest = asset_root_manifest.asset_manifest - if asset_manifest is None: - raise NonValidInputError("Manifest object not found, please check input manifest. ") - - for base_manifest_path in asset_manifest.paths: - if base_manifest_path.path.startswith(manifest_dir_name): - # skip the manifest folder, or else every upload will need an update after a previous change - continue - input_paths.append(Path(root_path, base_manifest_path.path)) - - return find_file_with_status( - asset_manager=asset_manager, - input_paths=input_paths, - root_path=root_path, - update=update, - statuses=[FileStatus.NEW, FileStatus.MODIFIED], - ) - - -def find_file_with_status( - asset_manager: S3AssetManager, - input_paths: List[Path], - root_path: str, - update: bool, - statuses: List[FileStatus], - cache_dir: Optional[str] = None, -) -> List[(Tuple[FileStatus, BaseManifestPath])]: - """ - Checks a manifest file, compares it to specified root directory or manifest of files with the local hash cache, and finds files that match the specified statuses. - Returns a list of tuples containing the file information, and its corresponding file status. 
- """ - with HashCache(cache_dir) as hash_cache: - with concurrent.futures.ThreadPoolExecutor() as executor: - futures = { - executor.submit( - asset_manager._process_input_path, - path=path, - root_path=root_path, - hash_cache=hash_cache, - update=update, - ): path - for path in input_paths - } - status_paths: List[tuple] = [] - for future in concurrent.futures.as_completed(futures): - file_status, _, manifestPath = future.result() - if file_status in statuses: - status_paths.append((file_status, manifestPath)) - - return status_paths - - -def compare_manifest( - reference_manifest: BaseAssetManifest, compare_manifest: BaseAssetManifest -) -> List[(Tuple[FileStatus, BaseManifestPath])]: - """ - Compares two manifests, reference_manifest acting as the base, and compare_manifest acting as manifest with changes. - Returns a list of FileStatus and BaseManifestPath - """ - reference_dict: Dict[str, BaseManifestPath] = { - manifest_path.path: manifest_path for manifest_path in reference_manifest.paths - } - compare_dict: Dict[str, BaseManifestPath] = { - manifest_path.path: manifest_path for manifest_path in compare_manifest.paths - } - - differences: List[(Tuple[FileStatus, BaseManifestPath])] = [] - - # Find new files - for file_path, manifest_path in compare_dict.items(): - if file_path not in reference_dict: - differences.append((FileStatus.NEW, manifest_path)) - elif reference_dict[file_path].hash != manifest_path.hash: - differences.append((FileStatus.MODIFIED, manifest_path)) - else: - differences.append((FileStatus.UNCHANGED, manifest_path)) - - # Find deleted files - for file_path, manifest_path in reference_dict.items(): - if file_path not in compare_dict: - differences.append((FileStatus.DELETED, manifest_path)) - - return differences - - -def _fast_file_list_to_manifest_diff( - root: str, - current_files: List[str], - diff_manifest: BaseAssetManifest, - print_function_callback: Callable[[Any], None] = lambda msg: None, - return_root_relative_path: bool = True, -) -> List[Tuple[str, FileStatus]]: - """ - Perform a fast difference of the current list of files to a previous manifest to diff against using time stamps and file sizes. - :param root: Root folder of files to diff against. - :param current_files: List of files to compare with. - :param diff_manifest: Manifest containing files to diff against. - :param return_root_relative_path: File Path to return, either relative to root or full. - :param logger: logger. - :return List[Tuple[str, FileStatus]]: List of Tuple containing the file path and FileStatus pair. - """ - - # Select either relative or absolut path for results. - def select_path(full_path: str, relative_path: str, return_root_relative_path: bool): - return relative_path if return_root_relative_path else full_path - - changed_paths: List[Tuple[str, FileStatus]] = [] - input_files_map: Dict[str, BaseManifestPath] = {} - for input_file in diff_manifest.paths: - # Normalize paths so we can compare different OSes - normalized_path = Path(os.path.normpath(input_file.path)).as_posix() - input_files_map[normalized_path] = input_file - - # Iterate for each file that we found in glob. - root_relative_paths: List[str] = [] - for local_file in current_files: - # Get the file's time stamp and size. We want to compare both. - # From enabling CRT, sometimes timestamp update can fail. - local_file_path = Path(local_file) - file_stat = local_file_path.stat() - - # Compare the glob against the relative path we store in the manifest. - # Save it to a list so we can look for deleted files. 
- root_relative_path = str(PurePosixPath(*local_file_path.relative_to(root).parts).as_posix()) - root_relative_paths.append(root_relative_path) - - return_path = select_path( - full_path=local_file, - relative_path=root_relative_path, - return_root_relative_path=return_root_relative_path, - ) - if root_relative_path not in input_files_map: - # This is a new file - print_function_callback( - f"Found difference at: {root_relative_path}, Status: FileStatus.NEW" - ) - changed_paths.append((return_path, FileStatus.NEW)) - else: - # This is a modified file, compare with manifest relative timestamp. - input_file = input_files_map[root_relative_path] - # Check file size first as it is easier to test. Usually modified files will also have size diff. - if file_stat.st_size != input_file.size: - changed_paths.append((return_path, FileStatus.MODIFIED)) - print_function_callback( - f"Found size difference at: {root_relative_path}, Status: FileStatus.MODIFIED" - ) - # Check file mtime, allow 1 microsecond diff to prevent false positive - # utime set from microsecond to nanosecond conversion could create 1 microsecond diff upon division - elif abs(trunc(file_stat.st_mtime_ns / 1000) - input_file.mtime) > 1: - changed_paths.append((return_path, FileStatus.MODIFIED)) - print_function_callback( - f"Found time difference at: {root_relative_path}, Status: FileStatus.MODIFIED" - ) - - # Find deleted files. Manifest store files in relative form. - for manifest_file_path in diff_manifest.paths: - if manifest_file_path.path not in root_relative_paths: - full_path = os.path.join(root, manifest_file_path.path) - return_path = select_path( - full_path=full_path, - relative_path=manifest_file_path.path, - return_root_relative_path=return_root_relative_path, - ) - changed_paths.append((return_path, FileStatus.DELETED)) - return changed_paths - - -def pretty_print_cli(root: str, all_files: List[str], manifest_diff: ManifestDiff): - """ - Prints to command line a formatted file tree structure with corresponding file statuses - """ - - # ASCII characters for the tree structure - PIPE = "│" - HORIZONTAL = "──" - ELBOW = "└" - TEE = "├" - SPACE = " " - - # ANSI escape sequences for colors - COLORS = { - "MODIFIED": "\033[93m", # yellow - "NEW": "\033[92m", # green - "DELETED": "\033[91m", # red - "UNCHANGED": "\033[90m", # grey - "RESET": "\033[0m", # base color - "DIRECTORY": "\033[80m", # grey - } - - # Tooltips: - TOOLTIPS = { - FileStatus.NEW: " +", # added files - FileStatus.DELETED: " -", # deleted files - FileStatus.MODIFIED: " M", # modified files - FileStatus.UNCHANGED: "", # unchanged files - } - - class ColorFormatter(logging.Formatter): - def format(self, record): - message = super().format(record) - return f"{message}" - - # Configure logger - formatter = ColorFormatter("") - handler = logging.StreamHandler() - handler.setFormatter(formatter) - logger = logging.getLogger(__name__) - logger.addHandler(handler) - logger.setLevel(logging.INFO) - logger.propagate = False - - def print_tree(directory_tree, prefix=""): - sorted_entries = sorted(directory_tree.items()) - - for i, (entry, subtree) in enumerate(sorted_entries, start=1): - is_last_entry = i == len(sorted_entries) - symbol = ELBOW + HORIZONTAL if is_last_entry else TEE + HORIZONTAL - is_dir = isinstance(subtree, dict) - color = COLORS["DIRECTORY"] if is_dir else COLORS[subtree.name] - tooltip = TOOLTIPS[FileStatus.UNCHANGED] if is_dir else TOOLTIPS[subtree] - - message = f"{prefix}{symbol}{color}{entry}{tooltip}{COLORS['RESET']}{os.path.sep if is_dir 
else ''}" - logger.info(message) - - if is_dir: - new_prefix = prefix + (SPACE if is_last_entry else PIPE + SPACE) - print_tree(subtree, new_prefix) - - if not directory_tree: - symbol = ELBOW + HORIZONTAL - message = f"{prefix}{symbol}{COLORS['UNCHANGED']}. {COLORS['RESET']}" - logger.info(message) - - def get_file_status(file: str, manifest_diff: ManifestDiff): - print(file) - if file in manifest_diff.new: - return FileStatus.NEW - elif file in manifest_diff.modified: - return FileStatus.MODIFIED - elif file in manifest_diff.deleted: - return FileStatus.DELETED - else: - # Default, not in any diff list. - return FileStatus.UNCHANGED - - def build_directory_tree(all_files: List[str]) -> Dict[str, dict]: - directory_tree: dict = {} - - def add_to_tree(path, status): - parts = str(path).split(os.path.sep) - current_level = directory_tree - for i, part in enumerate(parts): - if i == len(parts) - 1: - current_level[part] = status - else: - current_level = current_level.setdefault(part, {}) - - for file in all_files: - print(f"{file} {root}") - relative_path = str(Path(file).relative_to(root)) - add_to_tree( - relative_path, - get_file_status(relative_path, manifest_diff), - ) - return directory_tree - - directory_tree = build_directory_tree(all_files) - print_tree(directory_tree) - logger.info("") diff --git a/src/deadline/job_attachments/_glob.py b/src/deadline/job_attachments/_glob.py deleted file mode 100644 index 2190bc180..000000000 --- a/src/deadline/job_attachments/_glob.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -import os -import glob -import json -from pathlib import Path -from typing import List, Optional -from deadline.job_attachments.exceptions import NonValidInputError -from deadline.job_attachments.models import GlobConfig - - -def _process_glob_inputs(glob_arg_input: str) -> GlobConfig: - """ - Helper function to process glob inputs. - glob_input: String, can represent a json, filepath, or general include glob syntax. - """ - - # Default Glob config. - glob_config = GlobConfig() - if glob_arg_input is None or len(glob_arg_input) == 0: - # Not configured, or not passed in. - return glob_config - - try: - input_as_path = Path(glob_arg_input) - if input_as_path.is_file(): - # Read the file so it can be parsed as JSON. - with open(glob_arg_input) as f: - glob_arg_input = f.read() - except Exception: - # If this cannot be processed as a file, try it as JSON. - pass - - try: - # Parse the input as JSON, default to Glob Config defaults. - input_as_json = json.loads(glob_arg_input) - glob_config.include_glob = input_as_json.get(GlobConfig.INCLUDE, glob_config.include_glob) - glob_config.exclude_glob = input_as_json.get(GlobConfig.EXCLUDE, glob_config.exclude_glob) - except Exception: - # This is not a JSON blob, bad input. - raise NonValidInputError(f"Glob input {glob_arg_input} cannot be deserialized as JSON") - - return glob_config - - -def _match_files_with_pattern(base_path: str, patterns: List[str]) -> set: - """ - Helper function to match files based on glob patterns. 
- - Args: - base_path: Root path to glob from - patterns: List of glob patterns to match - - Returns: - Set of normalized file paths that match the patterns - """ - matched_files = set() - for pattern in patterns: - # Make pattern relative to base path - full_pattern = os.path.join(base_path, pattern) - - # Use recursive glob for directory matching - for matched_path in glob.glob(full_pattern, recursive=True): - # Only add files, not directories - if os.path.isfile(matched_path): - # Convert to proper path format - normalized_path = os.path.normpath(matched_path) - matched_files.add(normalized_path) - - return matched_files - - -def _glob_paths( - path: str, include: List[str] = ["**/*"], exclude: Optional[List[str]] = None -) -> List[str]: - """ - Glob routine that supports Unix style pathname pattern expansion for includes and excludes. - This function will recursively list all files of path, including all files globbed by include and removing all files marked by exclude. - path: Root path to glob. - include: Optional, pattern syntax for files to include. - exclude: Optional, pattern syntax for files to exclude. - return: List of files found based on supplied glob patterns. - """ - # Convert path to absolute path - base_path = os.path.abspath(path) - - # Process include patterns - matched_files = _match_files_with_pattern(base_path, include) - - # Process exclude patterns - if exclude: - files_to_exclude = _match_files_with_pattern(base_path, exclude) - # Remove excluded files from result - matched_files -= files_to_exclude - - return list(matched_files) diff --git a/src/deadline/job_attachments/_incremental_downloads/_manifest_s3_downloads.py b/src/deadline/job_attachments/_incremental_downloads/_manifest_s3_downloads.py deleted file mode 100644 index ee72dc228..000000000 --- a/src/deadline/job_attachments/_incremental_downloads/_manifest_s3_downloads.py +++ /dev/null @@ -1,667 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
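As an aside before the next deleted module: the `_glob_paths` helper removed above expands include patterns recursively, keeps only files, and subtracts anything matched by the exclude patterns. The standalone sketch below mirrors that behavior under those assumptions; it is not the package's own helper.

```python
# Standalone sketch (not the deleted helper) of include/exclude globbing as described
# above: expand include patterns recursively, keep files only, then subtract excludes.
import glob
import os
from typing import Iterable, List


def glob_with_excludes(
    root: str, include: Iterable[str] = ("**/*",), exclude: Iterable[str] = ()
) -> List[str]:
    def expand(patterns: Iterable[str]) -> set:
        return {
            os.path.normpath(match)
            for pattern in patterns
            for match in glob.glob(os.path.join(os.path.abspath(root), pattern), recursive=True)
            if os.path.isfile(match)
        }

    return sorted(expand(include) - expand(exclude))


# Example: glob_with_excludes("scene_files", exclude=["**/*.tmp"])
```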
-from __future__ import annotations - -__all__ = [ - "_add_output_manifests_from_s3", - "_download_all_manifests_with_absolute_paths", - "_merge_absolute_path_manifest_list", - "_download_manifest_paths", -] - -from typing import Any, Callable, DefaultDict, Optional -from datetime import datetime, timezone -import re -import os -import concurrent.futures -from threading import Lock -from pathlib import Path -import posixpath -import ntpath - -import boto3 -from boto3.s3.transfer import ProgressCallbackInvoker -from botocore.client import BaseClient -from botocore.exceptions import BotoCoreError, ClientError - -from ..download import ( - _get_output_manifest_prefix, - _get_tasks_manifests_keys_from_s3, - _get_asset_root_and_manifest_from_s3_with_last_modified, - _get_num_download_workers, - _get_new_copy_file_path, - S3_DOWNLOAD_MAX_CONCURRENCY, -) -from ..asset_manifests import ( - hash_data as ja_hash_data, - BaseAssetManifest, - BaseManifestPath, - HashAlgorithm, -) -from ..asset_manifests.v2023_03_03.asset_manifest import DEFAULT_HASH_ALG -from ..models import ( - FileConflictResolution, - JobAttachmentS3Settings, - S3_MANIFEST_FOLDER_NAME, - PathFormat, -) -from ..exceptions import ( - COMMON_ERROR_GUIDANCE_FOR_S3, - AssetSyncError, - AssetSyncCancelledError, - JobAttachmentS3BotoCoreError, - JobAttachmentsS3ClientError, - JobAttachmentsError, -) -from .._aws.aws_clients import ( - get_account_id, - get_s3_client, - get_s3_transfer_manager, -) -from ..progress_tracker import ( - ProgressTracker, - ProgressStatus, - ProgressReportMetadata, -) -from .._utils import _get_long_path_compatible_path -from ...job_attachments._path_mapping import _PathMappingRuleApplier - - -""" -This file contains a forked copy of some functionality from deadline.job_attachments.download, -with its interface refactored to support the incremental download command. - -It consists fully of internal-only functionality. We would like to iteratively refine these -interfaces over time, with the goal that we deprecate the current interfaces -in deadline.job_attachments.download and replace them with more general and flexible interfaces. -""" - -SESSION_ACTION_ID_FROM_KEY_RE = re.compile(r"(sessionaction-[^/-]+-[^/-]+)/") - - -def _add_output_manifests_from_s3( - farm_id: str, - queue: dict[str, Any], - job: dict[str, Any], - boto3_session: boto3.Session, - session_action_list: list[dict[str, Any]], -): - """ - This function takes a list of session actions (as returned by boto3 deadline.list_session_actions), - and for any that lack manifest fields, updates them with values retrieved from S3. While the response - from Deadline Cloud will always return both outputManifestPath and outputManifestHash, this function - only populates the outputManifestPath value. The order of the manifests in the list precisely correspond - to the manifests returned by boto3 deadline.get_job, clients of these APIs can zip() the two - manifests lists together to get the full set of fields needed for processing. - - * https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/deadline/client/list_session_actions.html - * https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/deadline/client/get_job.html - - Args: - farm_id: The farm id. - queue: The queue as returned by boto3 deadline.get_queue(). - job: The job as returned by boto3 deadline.search_jobs(). - boto3_session: The boto3.Session for accessing AWS. 
- session_action_list: A list of session actions to modify by adding the "manifests" field where necessary. - Its shape is as returned by boto3 deadline.list_session_actions() or deadline.get_session_action(). - """ - # If the job has no job attachments, there's nothing to add - if "attachments" not in job: - return - - s3_settings = JobAttachmentS3Settings(**queue["jobAttachmentSettings"]) - - # Filter the session action list to exclude any that already contain manifests, - # then return early if the result is empty. - session_action_list = [ - session_action - for session_action in session_action_list - if "manifests" not in session_action - ] - if not session_action_list: - return - - # Process the job's attachments list to generate the hashes - job_manifests_length = len(job["attachments"]["manifests"]) - job_indexed_root_path_hash = [ - ( - index, - ja_hash_data( - f"{manifest.get('fileSystemLocationName', '')}{manifest['rootPath']}".encode(), - DEFAULT_HASH_ALG, - ), - ) - for index, manifest in enumerate(job["attachments"]["manifests"]) - ] - # Initialize to empty "manifests" entries - for session_action in session_action_list: - session_action["manifests"] = [{}] * job_manifests_length - - # Get all the output manifest keys for all the steps/tasks of the job. - # TODO: This is very inefficient if the job has lots of tasks, because - # the incremental download will generally only use a few of them at a time. - manifest_prefix: str = _get_output_manifest_prefix( - s3_settings, farm_id, queue["queueId"], job["jobId"] - ) - try: - manifests_keys: list[str] = _get_tasks_manifests_keys_from_s3( - manifest_prefix, - s3_settings.s3BucketName, - session=boto3_session, - select_latest_per_task=False, - ) - except JobAttachmentsError as e: - # If there are no manifests, treat as no data. - if str(e).startswith("Unable to find asset manifest in"): - return - else: - raise - - # Organize the session actions by session action id, so that we can quickly get - # to the correct session action from the manifest object key. - session_actions_by_session_action_id: dict[str, dict[str, Any]] = { - session_action["sessionActionId"]: session_action for session_action in session_action_list - } - - manifest_prefix = f"{queue['jobAttachmentSettings']['rootPrefix']}/{S3_MANIFEST_FOLDER_NAME}/" - for key in manifests_keys: - # Extract the session action id from the manifest key - m = SESSION_ACTION_ID_FROM_KEY_RE.search(key) - if m: - manifest_session_action_id = m[1] - else: - raise RuntimeError( - f"Job attachments manifest key for job {job['name']} ({job['jobId']}) lacks a session action id" - ) - # Loop through all the manifests to see whether the hash of the rootPath is in the key, - # in order to determine which position in the manifests list this key should get set in. 
- manifests_index = None - for index, root_path_hash in job_indexed_root_path_hash: - if root_path_hash in key: - manifests_index = index - break - if manifests_index is None: - root_path_hashes = [hash for _, hash in job_indexed_root_path_hash] - raise RuntimeError( - f"Job attachments manifest key for job {job['name']} ({job['jobId']}) does not contain any of the rootPath hashes {', '.join(root_path_hashes)}: {key}" - ) - # If this session action is in the list, add this key to it - session_action_for_key = session_actions_by_session_action_id.get( - manifest_session_action_id - ) - if session_action_for_key is not None: - # This is equivalent to "output_manifest_path = key.removeprefix(manifest_prefix)" to - # retain support for Python 3.8 which does not support str.removeprefix. - output_manifest_path = key - if output_manifest_path.startswith(manifest_prefix): - output_manifest_path = output_manifest_path[len(manifest_prefix) :] - session_action_for_key["manifests"][manifests_index] = { - "outputManifestPath": output_manifest_path, - } - - -def _download_manifest_and_make_paths_absolute( - index: int, - queue: dict[str, Any], - job_id: str, - root_path: str, - manifest_s3_key: str, - path_mapping_rule_applier: Optional[_PathMappingRuleApplier], - boto3_session_for_s3: boto3.Session, - output_manifests: list, - output_unmapped_paths: list[tuple[str, str]], -): - """ - Downloads the specified manifest, makes all its paths absolute using root_path, - and then places it in output_manifests[index]. - """ - # Download the manifest - _, last_modified, manifest = _get_asset_root_and_manifest_from_s3_with_last_modified( - manifest_s3_key, queue["jobAttachmentSettings"]["s3BucketName"], boto3_session_for_s3 - ) - if path_mapping_rule_applier: - new_manifest_paths = [] - if path_mapping_rule_applier.source_path_format == PathFormat.WINDOWS.value: - source_os_path: Any = ntpath - else: - source_os_path = posixpath - else: - source_os_path = os.path - - # Convert all the manifest paths to have absolute normalized local paths - for manifest_path in manifest.paths: - manifest_path.path = source_os_path.normpath( - source_os_path.join(root_path, manifest_path.path) - ) - if path_mapping_rule_applier: - try: - manifest_path.path = str( - path_mapping_rule_applier.strict_transform(manifest_path.path) - ) - new_manifest_paths.append(manifest_path) - except ValueError: - output_unmapped_paths.append((job_id, manifest_path.path)) - - if path_mapping_rule_applier: - # Update the manifest to only include the mapped paths - manifest.paths = new_manifest_paths - - output_manifests[index] = (last_modified, manifest) - - -def _get_manifests_to_download( - job_attachments_root_prefix: str, - download_candidate_jobs: dict[str, dict[str, Any]], - job_sessions: dict[str, list], - path_mapping_rule_appliers: dict[str, Optional[_PathMappingRuleApplier]], -) -> list[tuple[Optional[_PathMappingRuleApplier], str, str, str]]: - """ - Collect a list of (pathMappingRuleApplier, jobId, rootPath, manifest_s3_key) tuples for all the job attachments that need to be downloaded. - - Args: - job_attachments_root_prefix: The queue.jobAttachmentSettings.rootPrefix field from the Deadline - Cloud queue. - download_candidate_jobs: A mapping from job id to jobs as returned by deadline.search_jobs. - job_sessions: Contains each job's sessions and session actions, structured as job_sessions[job_id][session_index]["sessionActions"][session_action_index]. - See the function _get_job_sessions for more details. 
- path_mapping_rule_appliers: A mapping from storage profile ID to the path mapping rule applier to use for it. - If no path mapping should be used, is the empty {}. - - Returns: - A list of (pathMappingRuleApplier, jobId, rootPath, manifest_s3_key) tuples for the manifest objects that need to be downloaded. - """ - manifests_to_download: list[tuple[Optional[_PathMappingRuleApplier], str, str, str]] = [] - for job_id, session_list in job_sessions.items(): - job = download_candidate_jobs[job_id] - for session in session_list: - for session_action in session.get("sessionActions", []): - # The manifests lists from the job and session action correspond, so we can zip them - # together to attach the root path with the S3 manifest key - for job_manifest, session_action_manifest in zip( - job["attachments"]["manifests"], session_action["manifests"] - ): - if "outputManifestPath" in session_action_manifest: - manifests_to_download.append( - ( - path_mapping_rule_appliers[job["storageProfileId"]] - if path_mapping_rule_appliers - else None, - job_id, - job_manifest["rootPath"], - "/".join( - [ - job_attachments_root_prefix, - S3_MANIFEST_FOLDER_NAME, - session_action_manifest["outputManifestPath"], - ] - ), - ) - ) - return manifests_to_download - - -def _download_all_manifests_with_absolute_paths( - queue: dict[str, Any], - download_candidate_jobs: dict[str, dict[str, Any]], - job_sessions: dict[str, list], - path_mapping_rule_appliers: dict[str, Optional[_PathMappingRuleApplier]], - output_unmapped_paths: dict[str, list[str]], - boto3_session_for_s3: boto3.Session, - print_function_callback: Callable[[Any], None] = lambda msg: None, -) -> list[tuple[datetime, BaseAssetManifest]]: - """ - Downloads all the manifest files that are in the session_actions of job_sessions, and uses the rootPath - values taken from the job to make all the paths in the manifest absolute. - - Args: - queue: The Deadline Cloud queue as returned by deadline.get_queue. - download_candidate_jobs: A mapping from job id to jobs as returned by deadline.search_jobs. - job_sessions: Contains each job's sessions and session actions, structured as job_sessions[job_id][session_index]["sessionActions"][session_action_index]. - See the function _get_job_sessions for more details. - path_mapping_rule_appliers: A mapping from storage profile ID to the path mapping rule applier to use for it. - If no path mapping should be used, is the empty {}. - boto3_session_for_s3: The boto3.Session to use for accessing S3. - output_unmapped_paths: A mapping from the job id to a list of all the paths that were not mapped - and therefore will not be downloaded. - print_function_callback: Callback for printing output to the terminal or log. - - Returns: - A list of BaseAssetManifest objects containing local absolute file paths sorted by the last_modified timestamp. - """ - # Get the list of (rootPath, manifest_s3_key) tuples to download from S3. 
- manifests_to_download: list[tuple[Optional[_PathMappingRuleApplier], str, str, str]] = ( - _get_manifests_to_download( - queue["jobAttachmentSettings"]["rootPrefix"], - download_candidate_jobs, - job_sessions, - path_mapping_rule_appliers, - ) - ) - - print_function_callback("") - print_function_callback(f"Downloading {len(manifests_to_download)} asset manifests from S3...") - start_time = datetime.now(tz=timezone.utc) - - # Download all the manifest files from S3, and make the paths in the manifests absolute local paths - # by joining with the root path and normalizing - downloaded_manifests: list = [None] * len(manifests_to_download) - # All the unmapped paths get recorded here as (jobId, unmappedPath) - unmapped_paths: list[tuple[str, str]] = [] - - max_workers = S3_DOWNLOAD_MAX_CONCURRENCY - print_function_callback(f"Using {max_workers} threads") - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = [] - for index, (path_mapping_rule_applier, job_id, root_path, manifest_s3_key) in enumerate( - manifests_to_download - ): - futures.append( - executor.submit( - _download_manifest_and_make_paths_absolute, - index, - queue, - job_id, - root_path, - manifest_s3_key, - path_mapping_rule_applier, - boto3_session_for_s3, - downloaded_manifests, - unmapped_paths, - ) - ) - # surfaces any exceptions in the thread - for future in concurrent.futures.as_completed(futures): - future.result() - - # Transform the unmapped paths into the output, grouping by job id - for job_id, unmapped_path in unmapped_paths: - output_unmapped_paths.setdefault(job_id, []).append(unmapped_path) - - duration = datetime.now(tz=timezone.utc) - start_time - print_function_callback(f"...downloaded manifests in {duration}") - - return downloaded_manifests - - -def _merge_absolute_path_manifest_list( - downloaded_manifests: list[tuple[datetime, BaseAssetManifest]], -) -> list[BaseManifestPath]: - """ - Given a list of manifests that contain absolute paths, uses the provided last modified timestamps - to sort them, and merges them all into a single manifest. Returns the list of manifest paths - for download. - - Args: - downloaded_manifests: A list of (last_modified_timestamp, manifest) tuples, where each last - modified timestamp is the LastModified datetime from the S3 object holding the manifest. - - Returns: - A list of manifest paths to download, the result of merging the manifests. - """ - # Because the paths in manifests are all absolute and normalized now, we can merge them - # in order by inserting them into a dict in order, using the normcased path as the key. - # Later files of the same name will overwrite earlier ones. - - # Sort the manifests by last modified, so that we can overlay them - # with later manifests overwriting files from earlier ones. - downloaded_manifests.sort(key=lambda item: item[0]) - - merged_manifest_paths_dict = {} - for _, manifest in downloaded_manifests: - for manifest_path in manifest.paths: - merged_manifest_paths_dict[os.path.normcase(manifest_path.path)] = manifest_path - return list(merged_manifest_paths_dict.values()) - - -def _download_file_with_transfer_manager( - local_file_path: Path, - s3_bucket: str, - s3_key: str, - boto3_session: boto3.Session, - s3_client: BaseClient, - progress_tracker: ProgressTracker, -): - """ - Downloads a single file from S3 using S3 transfer manager. This is appropriate for a larger - file that benefits from parallel multi-part download. 
- """ - transfer_manager = get_s3_transfer_manager(s3_client=s3_client) - - future: concurrent.futures.Future - - def handler(bytes_downloaded): - nonlocal progress_tracker - nonlocal future - - should_continue = progress_tracker.track_progress_callback(bytes_downloaded) - if not should_continue: - future.cancel() - - subscribers = [ProgressCallbackInvoker(handler)] - - future = transfer_manager.download( - bucket=s3_bucket, - key=s3_key, - fileobj=str(local_file_path), - extra_args={"ExpectedBucketOwner": get_account_id(session=boto3_session)}, - subscribers=subscribers, - ) - - future.result() - - -def _download_file_with_get_object( - local_file_path: Path, - s3_bucket: str, - s3_key: str, - boto3_session: boto3.Session, - s3_client: BaseClient, - progress_tracker: ProgressTracker, -): - """ - Downloads a single file from S3 using get_object. This is appropriate for a smaller - file that benefits from reduced overhead. - """ - res = s3_client.get_object( - Bucket=s3_bucket, - Key=s3_key, - ExpectedBucketOwner=get_account_id(session=boto3_session), - ) - body = res["Body"] - # Copy the data this amount at a time - buffer_size = 128 * 1024 - with open(local_file_path, "wb") as fh: - while True: - data = body.read(buffer_size) - if not data: - break - should_continue = progress_tracker.track_progress_callback(len(data)) - if not should_continue: - fh.close() - os.remove(local_file_path) - raise AssetSyncCancelledError("File download cancelled.") - fh.write(data) - - -def _download_file( - file: BaseManifestPath, - hash_algorithm: HashAlgorithm, - collision_lock: Lock, - collision_file_dict: DefaultDict[str, int], - s3_bucket: str, - cas_prefix: str, - s3_client: BaseClient, - boto3_session_for_s3: boto3.Session, - progress_tracker: ProgressTracker, - file_conflict_resolution: FileConflictResolution, -) -> None: - """ - Downloads a file from the S3 bucket to the local directory. - - Args: - file: A BaseManifestPath whose path is a local absolute path. - hash_algorithm: The hash algorithm used for the queue. - collision_lock: A lock to ensure only one thread resolves a path name collision at a time. - collision_file_dict: Dictionary for tracking path name collisions. - s3_bucket: The job attachments S3 bucket. - cas_prefix: The prefix for content-addressed data files in the S3 bucket. - s3_client: A boto3 client for accessing S3. - boto3_session_for_s3: The boto3.Session to use for accessing S3. - progress_tracker: Object to update with download progress status. - file_conflict_resolution: The strategy to use for file conflict resolution. 
- """ - local_file_path = _get_long_path_compatible_path(file.path) - - s3_key = f"{cas_prefix}/{file.hash}.{hash_algorithm.value}" - - # If the file name already exists, resolve the conflict based on the file_conflict_resolution - if local_file_path.is_file(): - if file_conflict_resolution == FileConflictResolution.SKIP: - return - elif file_conflict_resolution == FileConflictResolution.OVERWRITE: - pass - elif file_conflict_resolution == FileConflictResolution.CREATE_COPY: - copy_local_file_path = _get_new_copy_file_path( - local_file_path, collision_lock, collision_file_dict - ) - - # Re-run _get_long_path_compatible_path for updated file name after file conflict resolution - # _get_long_path_compatible_path is idempotent, so it doesn't re-process an existing long path - local_file_path = _get_long_path_compatible_path(copy_local_file_path) - else: - raise ValueError( - f"Unknown choice for file conflict resolution: {file_conflict_resolution}" - ) - - local_file_path.parent.mkdir(parents=True, exist_ok=True) - - if file.size > 1024 * 1024: - download_file = _download_file_with_transfer_manager - else: - download_file = _download_file_with_get_object - - try: - download_file( - local_file_path=local_file_path, - s3_bucket=s3_bucket, - s3_key=s3_key, - boto3_session=boto3_session_for_s3, - s3_client=s3_client, - progress_tracker=progress_tracker, - ) - except concurrent.futures.CancelledError as ce: - if progress_tracker and progress_tracker.continue_reporting is False: - raise AssetSyncCancelledError("File download cancelled.") - else: - raise AssetSyncError("File download failed.", ce) from ce - except ClientError as exc: - status_code = int(exc.response["ResponseMetadata"]["HTTPStatusCode"]) - status_code_guidance = { - **COMMON_ERROR_GUIDANCE_FOR_S3, - 403: ( - ( - "Forbidden or Access denied. Please check your AWS credentials, and ensure that " - "your AWS IAM Role or User has the 's3:GetObject' permission for this bucket. " - ) - if "kms:" not in str(exc) - else ( - "Forbidden or Access denied. Please check your AWS credentials and Job Attachments S3 bucket " - "encryption settings. If a customer-managed KMS key is set, confirm that your AWS IAM Role or " - "User has the 'kms:Decrypt' and 'kms:DescribeKey' permissions for the key used to encrypt the bucket." - ) - ), - 404: ( - "Not found. Please check your bucket name and object key, and ensure that they exist in the AWS account." - ), - } - raise JobAttachmentsS3ClientError( - action="downloading file", - status_code=status_code, - bucket_name=s3_bucket, - key_or_prefix=s3_key, - message=f"{status_code_guidance.get(status_code, '')} {str(exc)} (Failed to download the file to {str(local_file_path)})", - ) from exc - except BotoCoreError as bce: - raise JobAttachmentS3BotoCoreError( - action="downloading file", - error_details=str(bce), - ) from bce - except Exception as e: - raise AssetSyncError(e) from e - - # The modified time in the manifest is in microseconds, but utime requires the time be expressed in seconds. - modified_time_override = file.mtime / 1000000 # type: ignore[attr-defined] - os.utime(local_file_path, (modified_time_override, modified_time_override)) # type: ignore[arg-type] - - # Verify that what we downloaded has the correct file size from the manifest. - file_size_on_disk = os.path.getsize(local_file_path) - if file_size_on_disk != file.size: - # TODO: Improve this error message - raise JobAttachmentsError( - f"File from S3 for {file.path} had incorrect size {file_size_on_disk}. 
Required size: {file.size}" - ) - - -def _download_manifest_paths( - manifest_paths_to_download: list[BaseManifestPath], - hash_algorithm: HashAlgorithm, - queue: dict[str, Any], - boto3_session_for_s3: boto3.Session, - file_conflict_resolution: FileConflictResolution, - on_downloading_files: Optional[Callable[[ProgressReportMetadata], bool]], - print_function_callback: Callable[[Any], None] = lambda msg: None, -) -> None: - """ - Downloads all files from the S3 bucket in the Job Attachment settings to the specified directory. - - Args: - manifest_paths_to_download: A list of manifest path objects to download, whose path is an absolute file system path. - hash_algorithm: The hash algorithm in use by the queue. - queue: The queue as returned by boto3 deadline.get_queue(). - boto3_session_for_s3: The boto3.Session to use for accessing S3. - file_conflict_resolution: The strategy to use for file conflict resolution. - on_downloading_files: A callback to handle progress messages and cancelation. - print_function_callback: Callback for printing output to the terminal or log. - """ - s3_settings = JobAttachmentS3Settings(**queue["jobAttachmentSettings"]) - s3_client = get_s3_client(session=boto3_session_for_s3) - max_workers = _get_num_download_workers() - - collision_lock: Lock = Lock() - collision_file_dict: DefaultDict[str, int] = DefaultDict(int) - full_cas_prefix: str = s3_settings.full_cas_prefix() - - progress_tracker = ProgressTracker( - status=ProgressStatus.DOWNLOAD_IN_PROGRESS, - total_files=len(manifest_paths_to_download), - total_bytes=sum(manifest_path.size for manifest_path in manifest_paths_to_download), - on_progress_callback=on_downloading_files, - ) - - print_function_callback(f"Using {max_workers} threads") - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = [ - executor.submit( - _download_file, - manifest_path, - hash_algorithm, - collision_lock, - collision_file_dict, - s3_settings.s3BucketName, - full_cas_prefix, - s3_client, - boto3_session_for_s3, - progress_tracker, - file_conflict_resolution, - ) - for manifest_path in manifest_paths_to_download - ] - # surfaces any exceptions in the thread - for future in concurrent.futures.as_completed(futures): - future.result() - if progress_tracker: - progress_tracker.increase_processed(1, 0) - progress_tracker.report_progress() - - # to report progress 100% at the end - if progress_tracker: - progress_tracker.report_progress() diff --git a/src/deadline/job_attachments/_incremental_downloads/incremental_download_state.py b/src/deadline/job_attachments/_incremental_downloads/incremental_download_state.py deleted file mode 100644 index 6d88bef90..000000000 --- a/src/deadline/job_attachments/_incremental_downloads/incremental_download_state.py +++ /dev/null @@ -1,277 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -from __future__ import annotations - -import json -import os -from datetime import datetime -from typing import Any, Optional -import tempfile - -# This as an upper bound to allow for eventual consistency into the materialized view that -# the deadline:SearchJobs API is based on. It's taken from numbers seen in heavy load testing, -# increased by a generous amount. 
-EVENTUAL_CONSISTENCY_MAX_SECONDS = 120 - - -def _datetimes_to_str(obj: Any) -> Any: - """Recursively applies the isoformat() function to all datetimes in the object""" - if isinstance(obj, datetime): - return obj.isoformat() - elif isinstance(obj, list): - return [_datetimes_to_str(item) for item in obj] - elif isinstance(obj, dict): - return {key: _datetimes_to_str(value) for key, value in obj.items()} - else: - return obj - - -class IncrementalDownloadJob: - """ - Model representing a job in the download progress state. - """ - - _required_dict_fields = ["job"] - - job: dict[str, Any] - session_ended_timestamp: Optional[datetime] - session_completed_indexes: dict[str, int] - - def __init__( - self, - job: dict[str, Any], - session_ended_timestamp: Optional[datetime], - session_completed_indexes: Optional[dict[str, int]], - ): - """ - Initialize a Job instance. - Args: - job (dict[str, Any]): The job as returned by boto3 from deadline:SearchJobs. - session_ended_timestamp (Optional[datetime]): The largest endedAt timestamp for a session - whose output has been downloaded. This can be None only when the job lacks job attachments. - session_completed_index (dict[str, int]): A mapping from session id to the index - of the latest completed session action download. - """ - self.job = _datetimes_to_str(job) - self.session_ended_timestamp = session_ended_timestamp - self.session_completed_indexes = session_completed_indexes or {} - - @property - def job_id(self) -> str: - return self.job["jobId"] - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> "IncrementalDownloadJob": - """ - Create a Job instance from a dictionary. - Args: - data (dict): Dictionary containing job data - Returns: - Job: A new instance populated with the data - """ - if not isinstance(data, dict): - raise ValueError("Input must be a dict.") - missing_fields = [field for field in cls._required_dict_fields if field not in data] - if missing_fields: - raise ValueError(f"Input is missing required fields: {missing_fields}") - - job = data["job"] - session_completed_indexes = data.get("sessionCompletedIndexes", {}) - session_ended_timestamp = ( - datetime.fromisoformat(data["sessionEndedTimestamp"]) - if data.get("sessionEndedTimestamp") is not None - else None - ) - return cls( - job=job, - session_ended_timestamp=session_ended_timestamp, - session_completed_indexes=session_completed_indexes, - ) - - def to_dict(self) -> dict[str, Any]: - """ - Convert the Job to a dictionary. - Returns: - dict: Dictionary representation of the job - """ - result: dict[str, Any] = { - "job": self.job, - } - if self.session_ended_timestamp is not None: - result["sessionEndedTimestamp"] = self.session_ended_timestamp.isoformat() - if self.session_completed_indexes != {}: - result["sessionCompletedIndexes"] = self.session_completed_indexes - return result - - -class IncrementalDownloadState: - """ - Model for tracking all the job attachments downloads to perform for a queue over time. - A new download becomes available whenever a TASK_RUN session action completes. - - This class includes some informational fields that are not strictly necessary, to help make the data - on disk easier to understand on inspection. 
- - * https://docs.aws.amazon.com/deadline-cloud/latest/APIReference/API_GetSessionAction.html#API_GetSessionAction_ResponseSyntax - * https://docs.aws.amazon.com/deadline-cloud/latest/APIReference/API_SessionActionDefinition.html - - The Deadline Cloud APIs do not provide direct access to a stream of completed session actions, so we reconstruct such - a stream by tracking state at three levels. Where possible, we use the resource state at one level to prune queries at lower levels: - - 1. Job - The jobs list contains every job that is active and that we have downloaded output from in a previous incremental download command. - When a job becomes inactive, it tracks a minimal stub including the sessionEndedTimestamp value, to use for detecting - requeued jobs later. - 2. Session - Each session of a job represents a single worker running a sequence of tasks from the job. The sessionCompletedIndexes - member of the IncrementalDownloadJob contains an entry for every session that is either still running, or whose - endedAt field is >= the downloadsCompletedTimestamp. When a job gets requeued, the sessionEndedTimestamp stored in the minimal - stub lets us skip sessions from before the job was requeued. - 3. SessionAction - Session actions have sequential IDs, so for each session we store the highest index of session action - for which we have completed the download. A session action ID looks like "sessionaction-abc123-12" for session action - index 12. - """ - - _required_dict_fields = [ - "downloadsStartedTimestamp", - "downloadsCompletedTimestamp", - "eventualConsistencyMaxSeconds", - "localStorageProfileId", - "jobs", - ] - - local_storage_profile_id: Optional[str] - """The storage profile of the host running the incremental download operation, or None if --ignore-storage-profiles is used.""" - - downloads_started_timestamp: datetime - """The timestamp of when the download state was bootstrapped.""" - downloads_completed_timestamp: datetime - """The timestamp up to which we are confident downloads are complete.""" - eventual_consistency_max_seconds: int = EVENTUAL_CONSISTENCY_MAX_SECONDS - """The duration for deadline:SearchJobs query overlap, to account for eventual consistency.""" - - jobs: list[IncrementalDownloadJob] - """The list of jobs that entered 'active' status between downloads_started_timestamp and downloads_completed_timestamp, and are not completed.""" - - def __init__( - self, - local_storage_profile_id: Optional[str], - downloads_started_timestamp: datetime, - downloads_completed_timestamp: Optional[datetime] = None, - jobs: Optional[list] = None, - eventual_consistency_max_seconds: Optional[int] = None, - ): - """ - Initialize a IncrementalDownloadState instance. To bootstrap the state, construct with only the downloads_started_timestamp. - - Args: - local_storage_profile_id: The storage profile id for the host running the download command. - If set to None, all jobs will be downloaded to the paths specified in the job, even if the machine - that submitted the job has a different configuration. - downloads_started_timestamp (datetime): The timestamp of when the download state was bootstrapped. - downloads_completed_timestamp (datetime): The timestamp up to which we are confident downloads are complete. - jobs (list[IncrementalDownloadJob]): The list of jobs that entered 'active' status between downloads_started_timestamp - and downloads_completed_timestamp, and are not completed. 
- eventual_consistency_max_seconds (Optional[int]): The duration, in seconds, for deadline:SearchJobs query overlap, - to account for eventual consistency. - """ - self.local_storage_profile_id = local_storage_profile_id - self.downloads_started_timestamp = downloads_started_timestamp - if downloads_completed_timestamp is not None: - self.downloads_completed_timestamp = downloads_completed_timestamp - else: - self.downloads_completed_timestamp = downloads_started_timestamp - if eventual_consistency_max_seconds: - self.eventual_consistency_max_seconds = eventual_consistency_max_seconds - self.jobs = jobs or [] - - @classmethod - def from_dict(cls, data): - """ - Create a IncrementalDownloadState instance from a dictionary. - Args: - data (dict): Dictionary containing state file data - Returns: - IncrementalDownloadState: A new instance populated with the data - """ - if not isinstance(data, dict): - raise ValueError("Input must be a dict.") - missing_fields = [field for field in cls._required_dict_fields if field not in data] - if missing_fields: - raise ValueError(f"Input is missing required fields: {missing_fields}") - - return cls( - local_storage_profile_id=data["localStorageProfileId"], - downloads_started_timestamp=datetime.fromisoformat(data["downloadsStartedTimestamp"]), - downloads_completed_timestamp=datetime.fromisoformat( - data["downloadsCompletedTimestamp"] - ), - eventual_consistency_max_seconds=int(data["eventualConsistencyMaxSeconds"]), - jobs=[IncrementalDownloadJob.from_dict(job) for job in data["jobs"]], - ) - - def to_dict(self): - """ - Convert the IncrementalDownloadState to a dictionary. - Returns: - dict: Dictionary representation of the state file model - """ - result = { - "localStorageProfileId": self.local_storage_profile_id, - "downloadsStartedTimestamp": self.downloads_started_timestamp.isoformat(), - "eventualConsistencyMaxSeconds": self.eventual_consistency_max_seconds, - "jobs": [job.to_dict() for job in self.jobs], - } - if self.downloads_completed_timestamp is not None: - result["downloadsCompletedTimestamp"] = self.downloads_completed_timestamp.isoformat() - - return result - - @classmethod - def from_file( - cls, - file_path: str, - ) -> "IncrementalDownloadState": - """ - Loads progress from state file saved at saved_progress_checkpoint_full_path - :param saved_progress_checkpoint_full_path: full path of the saved progress checkpoint file - :param print_function_callback: Callback to print messages produced in this function. - Used in the CLI to print to stdout using click.echo. By default, ignores messages. - :return: Returns the loaded state file, - or throws an exception if we're unable to read it as we already validated its existence - """ - state_data: dict = {} - with open(file_path, "r") as file: - state_data = json.load(file) - - download_state = IncrementalDownloadState.from_dict(state_data) - return download_state - - def save_file( - self, - file_path: str, - ) -> None: - """ - Save the current download progress to a state file atomically. - - :param file_path: Where to save the file. - :param print_function_callback: Callback to print messages produced in this function. - Used in the CLI to print to stdout using click.echo. By default, ignores messages - :return: None if save was successful, throws an exception if we're unable to save progress file to download location. 
- """ - # Create directory if it doesn't exist - file_dir = os.path.dirname(file_path) - os.makedirs(file_dir, exist_ok=True) - - state_data = self.to_dict() - - # Write the data to a unique temporary filename - with tempfile.NamedTemporaryFile( - mode="w", - dir=file_dir, - prefix=os.path.basename(file_path), - encoding="utf-8", - delete=False, - ) as tmpfile: - json.dump(state_data, tmpfile.file, indent=2) - - # Atomically replace the target file with the temporary file - os.replace(tmpfile.name, file_path) diff --git a/src/deadline/job_attachments/_path_mapping.py b/src/deadline/job_attachments/_path_mapping.py deleted file mode 100644 index 2a7a0b8f3..000000000 --- a/src/deadline/job_attachments/_path_mapping.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -from __future__ import annotations - -from pathlib import Path, PurePosixPath, PureWindowsPath -from typing import Any, Callable, Optional, Union - -from .models import ( - PathFormat, - PathMappingRule, - StorageProfile, - StorageProfileOperatingSystemFamily, -) - -__all__ = ["_generate_path_mapping_rules", "_PathMappingRuleApplier"] - -""" -This file contains functionality related to path mapping rules. This functionality is internal-only for now, -to be marked as public after developing some experience with it. -""" - - -def _generate_path_mapping_rules( - source_storage_profile: Union[StorageProfile, dict[str, Any]], - destination_storage_profile: Union[StorageProfile, dict[str, Any]], -) -> list[PathMappingRule]: - """ - Given a pair of storage profiles, generate all the path mapping rules to transform paths - from the source to the destination. - - A mapping rule is generated for every file system location name that's shared between - the storage profile regardless of the type (SHARED vs LOCAL), to account for the broadest - possible storage profile configurations. - - Accepts either StorageProfile dataclass or raw boto3 dict responses. - - Args: - source_storage_profile: A StorageProfile dataclass or a storage profile dict as returned - by boto3 deadline.get_storage_profile or deadline.get_storage_profile_for_queue. - destination_storage_profile: A StorageProfile dataclass or a storage profile dict as returned - by boto3 deadline.get_storage_profile or deadline.get_storage_profile_for_queue. - Returns: - A list of path mapping rules to transform paths. 
- """ - # Normalize inputs to a common shape - src = _normalize_storage_profile(source_storage_profile) - dst = _normalize_storage_profile(destination_storage_profile) - - # If the source and destination are identical, no transformation is needed - if src["storageProfileId"] == dst["storageProfileId"]: - return [] - - # Put the locations into dictionaries to match up the names - source_locations = {location["name"]: location for location in src["fileSystemLocations"]} - destination_locations = {location["name"]: location for location in dst["fileSystemLocations"]} - - if src["osFamily"].lower() == StorageProfileOperatingSystemFamily.WINDOWS.value: - source_path_format = PathFormat.WINDOWS.value - else: - source_path_format = PathFormat.POSIX.value - - path_mapping_rules: list[PathMappingRule] = [] - for source_name, source_location in source_locations.items(): - if source_name in destination_locations: - path_mapping_rules.append( - PathMappingRule( - source_path_format, - source_location["path"], - destination_locations[source_name]["path"], - ) - ) - - return path_mapping_rules - - -def _normalize_storage_profile( - profile: Union[StorageProfile, dict[str, Any]], -) -> dict[str, Any]: - """Convert a StorageProfile dataclass to the dict format, or pass through if already a dict.""" - if isinstance(profile, StorageProfile): - return { - "storageProfileId": profile.storageProfileId, - "osFamily": profile.osFamily.value, - "fileSystemLocations": [ - {"name": loc.name, "path": loc.path, "type": loc.type.value} - for loc in profile.fileSystemLocations - ], - } - return profile - - -class _PathMappingRuleApplier: - """ - This class provides an accelerated implementation for transforming paths according to a list of - path mapping rules. For details about how rules are applied, see the documentation - https://github.com/OpenJobDescription/openjd-specifications/wiki/How-Jobs-Are-Run#applying-path-mapping-rules-within-a-job-template - - When mapping a path, the most specific rule is the one that applies. For example, if there are two rules - * '/mnt/Projects -> X:\\Projects' - * '/mnt/Projects/Special -> Y:\\' - then '/mnt/Projects/Special/data.txt' maps to 'Y:\\data.txt', not to 'X:\\Projects\\Special\\data.txt'. - - The implementation uses a trie data structure for acceleration, as follows: - - 1. The trie is a dictionary, where every key is a string and every value is another trie. - The one exception is the key ".", which holds the destination path of a rule instead. - 2. Each source path is divided into parts by the PurePosixPath or PureWindowsPath class. - 3. Each subsequent level of the trie corresponds to the matching subsequent part of a path. - 4. A rule with parts (part1, part2, ..., partN) -> destination_path is represented in the trie - by the equation trie[part1][part2]...[partN]["."] == destination_path. - - For Windows source paths, the parts are transformed to lower case within the trie to make the transformation - case insensitive while still case preserving. - """ - - source_path_format: Optional[str] = None - path_mapping_rules: list[PathMappingRule] - - _path_mapping_trie: dict[str, Any] - - # These two members implement the windows- or posix-specific parts of the trie. - # _split_source_path is used to divide a source path into parts, and _part_normalization is used - # to normalize a part for trie insertion or lookup. 
- _split_source_path: Callable[[str], tuple[str, ...]] - _normalize_part: Callable[[str], str] - - def __init__(self, path_mapping_rules: list[PathMappingRule]): - self.path_mapping_rules = path_mapping_rules - self._path_mapping_trie = {} - - trie_entry: dict - - if path_mapping_rules: - self.source_path_format = path_mapping_rules[0].source_path_format - if not all( - rule.source_path_format == self.source_path_format for rule in path_mapping_rules - ): - formats = list({rule.source_path_format for rule in path_mapping_rules}) - raise ValueError( - f"The path mapping rules included multiple source path formats {', '.join(formats)}, only one is permitted." - ) - - if self.source_path_format == PathFormat.POSIX.value: - self._split_source_path = lambda v: PurePosixPath(v).parts - self._normalize_part = lambda v: v - elif self.source_path_format == PathFormat.WINDOWS.value: - self._split_source_path = lambda v: PureWindowsPath(v).parts - self._normalize_part = lambda v: v.lower() - else: - raise ValueError(f"Unexpected source path format {self.source_path_format}") - - for rule in path_mapping_rules: - trie_entry = self._path_mapping_trie - parts = self._split_source_path(rule.source_path) - # Traverse all the parts using trie_entry - for part in parts: - trie_entry = trie_entry.setdefault(self._normalize_part(part), {}) - # Set the destination path of the trie entry - trie_entry["."] = Path(rule.destination_path) - else: - self.source_path_format = None - - def _transform(self, path: str) -> Union[None, Path]: - parts = self._split_source_path(path) - - matched_destination_path = None - matched_remaining_parts: tuple[str, ...] = () - - # Traverse the trie using trie_entry - trie_entry: dict = self._path_mapping_trie - for i, part in enumerate(parts): - next_trie_entry = trie_entry.get(self._normalize_part(part)) - # Stop if there are no rules with this path prefix - if next_trie_entry is None: - break - # Record the match if there is one at this path prefix, - # overwriting any previous match to apply the longest rule. - destination_path = next_trie_entry.get(".") - if destination_path: - matched_destination_path = destination_path - matched_remaining_parts = tuple(parts[i + 1 :]) - trie_entry = next_trie_entry - - if matched_destination_path is None: - return None - else: - return matched_destination_path.joinpath(*matched_remaining_parts) - - def strict_transform(self, source_path: str) -> Path: - """Transform the provided path according to the path mapping rules. Raise ValueError if no rule applied.""" - if self.source_path_format is not None: - result = self._transform(source_path) - if result: - return result - - raise ValueError("No path mapping rule could be applied") - - def transform(self, source_path: str) -> Union[str, Path]: - """Transform the provided path according to the path mapping rules. Return an untransformed path if no rule applied.""" - if self.source_path_format is not None: - result = self._transform(source_path) - if result: - return result - - return source_path diff --git a/src/deadline/job_attachments/_path_summarization.py b/src/deadline/job_attachments/_path_summarization.py deleted file mode 100644 index be4d77049..000000000 --- a/src/deadline/job_attachments/_path_summarization.py +++ /dev/null @@ -1,526 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
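As an illustration of the trie-based rule resolution described in the removed _PathMappingRuleApplier above, the following sketch shows the most specific rule winning. It is hypothetical usage only: it assumes the pre-removal deadline.job_attachments package is still importable, and that PathMappingRule accepts source_path_format, source_path, and destination_path positionally, as the removed _generate_path_mapping_rules code suggests.

from deadline.job_attachments._path_mapping import _PathMappingRuleApplier
from deadline.job_attachments.models import PathFormat, PathMappingRule

# Two POSIX-format rules; the second is more specific than the first.
rules = [
    PathMappingRule(PathFormat.POSIX.value, "/mnt/Projects", "X:\\Projects"),
    PathMappingRule(PathFormat.POSIX.value, "/mnt/Projects/Special", "Y:\\"),
]
applier = _PathMappingRuleApplier(rules)

# The longest matching rule applies, so this resolves under Y:\ rather than X:\Projects\Special.
print(applier.strict_transform("/mnt/Projects/Special/data.txt"))
# transform() falls back to the original path when no rule matches.
print(applier.transform("/opt/unrelated/file.txt"))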
-from __future__ import annotations - -import os -import re - -from typing import Any, Union, Iterable, Optional -from collections.abc import Collection -from pathlib import PurePath, PurePosixPath, PureWindowsPath -import ntpath -import posixpath - -from .models import PathFormat - - -def human_readable_file_size(size_in_bytes: int) -> str: - """ - Convert a size in bytes to something human readable. For example 1000 bytes will be converted - to 1 KB. Sizes close enough to a postfix threshold will be rounded up to the next threshold. - For example 999999 bytes would be output as 1.0 MB and NOT 999.99 KB (or as a consequence of - Python's round function 1000.0 KB). - - This function is for display purposes only. - """ - converted_size: Union[int, float] = size_in_bytes - rounded: Union[int, float] - postfixes = ["B", "KB", "MB", "GB", "TB", "PB"] - - for postfix in postfixes: - rounded = round(converted_size, ndigits=2) - - if rounded < 1000: - return f"{rounded} {postfix}" - - converted_size /= 1000 - - # If we go higher than the provided postfix, - # then return as a large amount of the highest postfix we've specified. - return f"{rounded} {postfixes[-1]}" - - -_NUMBERED_PATH_REGEX = re.compile(r"^(.*\D|)(\d+)(\.[^/\\]+)?$") - - -class _NumberedPath: - """ - Representation of a file system path that may be numbered like frame_001.png. - - Some example properties for paths: - frame_001.png: grouping='frame_#.png', padding_min=3, padding_max=3, number=1 - sequence_v907: grouping='sequence_v#', padding_min=1, padding_max=3, number=907 - """ - - path: str - """The path that may or may not be numbered""" - parts: Optional[list[str]] = None - """The path split into parts""" - grouping: str - """The path with the number as '#' for grouping purposes""" - padding_min: int - """The minimum padding or -1. e.g. '%04d' for padding of 4, that produces the number in the path""" - padding_max: int - """The maximum padding or -1. e.g. '%04d' for padding of 4, that produces the number in the path""" - number: Optional[int] = None - """The number in the path, or None if the path is not numbered""" - - def __init__(self, path: str): - m = _NUMBERED_PATH_REGEX.match(path) - if m: - self.path = path - self.parts = [m.group(1), m.group(2), m.group(3) or ""] - self.grouping = f"{self.parts[0]}#.{self.parts[2]}" - number = self.parts[1] - if number[0] == "0": - self.padding_min = len(number) - else: - self.padding_min = 1 - self.padding_max = len(number) - self.number = int(number) - else: - self.path = self.grouping = path - self.padding_min = self.padding_max = -1 - - def __repr__(self): - return f"_NumberedPath({self.path!r})" - - -def _divide_numbered_path_group(group: list[_NumberedPath]) -> dict[str, set[int]]: - """Given a list of numbered paths that all have the same grouping string, check - for padding consistency and split into multiple groups if necessary. Convert - into a dictionary from a printf pattern to the set of numbers for it. Groups - of size 2 are divided into individual paths. 
- - For example, the paths frame_001.png and frame_0002.png cannot be together, - because they require different padding values.""" - - result: dict[str, set[int]] = {} - - while len(group) > 0: - # Treat groups of size 1 or 2 as individual paths - if len(group) <= 2: - for numbered_path in group: - result[numbered_path.path] = set() - break - - # The largest minimum padding is likely the right padding for the group - padding = max(numbered_path.padding_min for numbered_path in group) - pattern = f"%0{padding}d" if padding > 1 else "%d" - consistent_group = [ - numbered_path for numbered_path in group if numbered_path.padding_max >= padding - ] - pattern_path = f"{consistent_group[0].parts[0]}{pattern}{consistent_group[0].parts[2]}" # type: ignore - result[pattern_path] = {numbered_path.number for numbered_path in consistent_group} # type: ignore - # Process the remaining paths separately - group = [path for path in group if path.padding_max < padding] - - return result - - -class PathSummary: - """ - Represents a summary of a path, including the path itself, the number of files, - and the total size. The summary represents a sequence of files when the index_set - value is non-empty. If the summary is a nested accumulation of paths, child - paths are in the dictionary 'children'. - - If a path represents a directory, it ends with a directory separator. - """ - - path: str - """Either the path, or a printf pattern if index_set is non-empty""" - index_set: set[int] - """The set of indexes if the path is a printf pattern or an empty set otherwise""" - file_count: int - """The number of files""" - total_size: Optional[int] - """The total size of all files, if sizes are provided""" - children: Optional[dict[str, "PathSummary"]] - """The children of this path, if the summary is nested""" - - _os_path: Any - """Either ntpath or posixpath depending on the path_format from construction.""" - - def __init__( - self, - path: str, - *, - path_format: Optional[PathFormat] = None, - index_set: Optional[set[int]] = None, - file_count: Optional[int] = None, - total_size: Optional[int] = None, - children: Optional[dict[str, "PathSummary"]] = None, - ): - if path_format is None: - self._os_path = os.path - elif path_format == PathFormat.WINDOWS: - self._os_path = ntpath - else: - self._os_path = posixpath - - self.path = path - self.index_set = index_set or set() - if index_set: - self.file_count = len(index_set) - elif file_count is None: - self.file_count = 0 if self.is_dir() else 1 - else: - self.file_count = file_count - self.total_size = total_size - self.children = children - - def is_dir(self) -> bool: - """Returns True if the path is a directory (indicated by a trailing '/')""" - # On Windows, both '/' and '\\' are directory separators, so check both sep and altsep - return self.path.endswith(self._os_path.sep) or ( - self._os_path.altsep and self.path.endswith(self._os_path.altsep) - ) # type: ignore - - def summary(self, *, include_totals=True, relative_to: Optional[Union[PurePath, str]] = None): - """Returns the path summary, including file count and size totals by default.""" - relpath = self.path - if relative_to is not None: - relpath = self._os_path.relpath(self.path, relative_to) - # Ensure a trailing separator for directories - if self.is_dir(): - relpath = self._os_path.join(relpath, "") - - if include_totals: - return f"{relpath} ({self.summary_totals()})" - elif self.index_set: - return f"{relpath} (sequence indexes {_int_set_to_range_expr(self.index_set)})" - else: - return relpath - - 
def summary_totals(self) -> str: - """Returns the totals of the summary, like 'sequence indexes 1-3, 3 files, 30 MB' - if the path represents a sequence or '1 file' if the path represents a file - and there is no size information available.""" - if self.index_set: - seq_summary = f", sequence {_int_set_to_range_expr(self.index_set)}" - else: - seq_summary = "" - if self.total_size is not None: - size_summary = f", {human_readable_file_size(self.total_size)}" - else: - size_summary = "" - plural = "s" if self.file_count != 1 else "" - return f"{self.file_count} file{plural}{size_summary}{seq_summary}" - - def __str__(self): - return self.summary() - - def __repr__(self): - parts = ["PathSummary(", repr(self.path)] - if self.is_dir(): - if self.file_count != 0: - parts.append(f", file_count={self.file_count!r}") - else: - if self.index_set: - parts.append(f", index_set={{{', '.join(str(v) for v in sorted(self.index_set))}}}") - if self.total_size is not None: - parts.append(f", total_size={self.total_size!r}") - if self.children is not None: - parts.append(f", children={self.children!r}") - parts.append(")") - return "".join(parts) - - def __eq__(self, value): - if isinstance(value, PathSummary): - return ( - self.path == value.path - and self.index_set == value.index_set - and self.file_count == value.file_count - and self.total_size == value.total_size - and self.children == value.children - ) - else: - return False - - -def _int_set_to_range_expr(int_set: set[int]) -> str: - """ - Converts a set of integers into a range expression. - For example, {1,2,3,4,5,7,8,9,10} -> "1-5,7-10" - """ - int_list = sorted(set(int_set)) - range_expr_components = [] - last_interval_start = last_interval_end = int_list[0] - - def add_interval(start: int, end: int): - if start == last_interval_end: - range_expr_components.append(str(start)) - else: - range_expr_components.append(f"{start}-{end}") - - for value in int_list[1:]: - if value == last_interval_end + 1: - last_interval_end = value - else: - add_interval(last_interval_start, last_interval_end) - last_interval_start = last_interval_end = value - add_interval(last_interval_start, last_interval_end) - return ",".join(range_expr_components) - - -def summarize_paths_by_sequence( - path_list: Collection[Union[PurePath, str]], - *, - path_format: Optional[PathFormat] = None, - total_size_by_path: Optional[dict[str, int]] = None, -) -> list[PathSummary]: - """ - Identifies numbered sequences of files/directories within a list of paths. - Returns a sorted list of PathSummary objects. If total_size_by_path is provided, it - must provide a total size for every path in path_list. 
- - >> group_sequence_paths(["frame_1.png", "frame_3.png", "frame_20.png", "readme.txt"]) - {PathSummary("frame_%d.png", index_set={1, 3, 20}), PathSummary("readme.txt")} - - >> group_sequence_paths(["frame_01.png", "frame_1.png", "frame_30.png", "frame_09.png"]) - {PathSummary("frame_%02d.png", index_set={1, 9, 20}), PathSummary("frame_1.png")} - """ - if len(path_list) == 0: - return [] - - if path_format is None: - path_format = PathFormat.get_host_path_format() - - # Convert all the paths into strings, and deduplicate by converting into a set - path_list_as_str: set[str] = { - str(path) if isinstance(path, PurePath) else path for path in path_list - } - # On Windows, convert all "/" separators to "\\" - if path_format == PathFormat.WINDOWS: - path_list_as_str = {path.replace("/", "\\") for path in path_list_as_str} - if total_size_by_path: - total_size_by_path = { - path.replace("/", "\\"): size for path, size in total_size_by_path.items() - } - - # Group according to the _NumberedPath.grouping property - raw_grouped_paths: dict[str, list[_NumberedPath]] = {} - for path in path_list_as_str: - numbered_path = _NumberedPath(path) - raw_grouped_paths.setdefault(numbered_path.grouping, []).append(numbered_path) - - # Divide any groups with inconsistent padding into smaller consistent groups, - # and merge into a dictionary {printf_pattern: set of indexes}. - grouped_paths: dict[str, set[int]] = {} - for raw_group in raw_grouped_paths.values(): - grouped_paths.update(_divide_numbered_path_group(raw_group)) - - # Sort the result by the printf pattern and convert to PathSummary objects - result = [ - PathSummary(path, path_format=path_format, index_set=index_set) - for path, index_set in sorted(grouped_paths.items(), key=lambda x: x[0]) - ] - - # If sizes are provided, populate them in the path summary objects - if total_size_by_path: - for path_summary in result: - if path_summary.index_set: - path_summary.total_size = sum( - total_size_by_path[path_summary.path % i] for i in path_summary.index_set - ) - else: - path_summary.total_size = total_size_by_path[path_summary.path] - - return result - - -def _collapse_each_path_summary( - path_summary_list: Iterable[PathSummary], -) -> list[PathSummary]: - """ - Collapses each path summary in the list while it has a single child. - """ - result = [] - for path_summary in path_summary_list: # type: ignore - while path_summary.children is not None and len(path_summary.children) == 1: - path_summary = next(iter(path_summary.children.values())) - result.append(path_summary) - - return result - - -def summarize_paths_by_nested_directory( - path_list: Collection[Union[PurePath, str]], - *, - path_format: Optional[PathFormat] = None, - total_size_by_path: Optional[dict[str, int]] = None, -) -> list[PathSummary]: - """ - Summarizes the provided paths by sequence, and then nests them into - common parent paths. The returned summaries do not contain a common parent, - for example if they are different relative paths, or absolute paths for - different drives on Windows - - By default, paths are for the current operating system. If path_format is provided, you can - override that to PathFormat.WINDOWS or PathFormat.POSIX as necessary. 
- """ - if len(path_list) == 0: - return [] - - if path_format is None: - path_format = PathFormat.get_host_path_format() - - if path_format == PathFormat.WINDOWS: - path_type: Any = PureWindowsPath - os_path: Any = ntpath - else: - path_type = PurePosixPath - os_path = posixpath - - # First summarize the paths by sequence - summary_list = summarize_paths_by_sequence( - path_list, total_size_by_path=total_size_by_path, path_format=path_format - ) - - # Put all the summaries into a temporary common root. - nested_summary = PathSummary("ROOT/", path_format=path_format) - for path_summary in summary_list: - # Split the path into its components - path_components = path_type(path_summary.path).parts - # Start with the root component, and build up the nested structure - current_level: PathSummary = nested_summary - for i in range(len(path_components) - 1): - component = path_components[i] - # Add the child if it's not already there - if current_level.children is None: - current_level.children = {} - if component not in current_level.children: - current_level.children[component] = PathSummary( - os_path.join(*path_components[: i + 1], ""), - path_format=path_format, - total_size=0 if total_size_by_path else None, - ) - # Descend into the new level - current_level = current_level.children[component] - # Accumulate the file counts and sizes - current_level.file_count += path_summary.file_count - if total_size_by_path: - current_level.total_size += path_summary.total_size # type: ignore - # Add the path summary to the end - if current_level.children is None: - current_level.children = {} - current_level.children[path_components[-1]] = path_summary - - # For each distinct root, collapse it while it contains a single child - return _collapse_each_path_summary(nested_summary.children.values()) # type: ignore - - -def summarize_path_list( - path_list: Collection[Union[PurePath, str]], - *, - path_format: Optional[PathFormat] = None, - total_size_by_path: Optional[dict[str, int]] = None, - max_entries=10, - include_totals=True, -) -> str: - """ - Creates a string summary of the files in the list provided, - grouping numbered filenames by their sequence pattern, and nesting - summaries of the directory into the specified maximum number of entries. - - By default, paths are for the current operating system. If path_format is provided, you can - override that to PathFormat.WINDOWS or PathFormat.POSIX as necessary. - - If total_size_by_path is provided, it must provide a total size for every path in path_list. - - >>> print(summarize_path_list(["frame_1.png", "frame_3.png", "frame_20.png", "readme.txt"])) - frame_%d.png (3 files, sequence 1,3,20) - readme.txt (1 file) - """ - if len(path_list) == 0: - return "" - - if path_format is None: - path_format = PathFormat.get_host_path_format() - - lines = [] - summary_list = summarize_paths_by_nested_directory( - path_list, total_size_by_path=total_size_by_path, path_format=path_format - ) - - # If the summary list has one entry, and its path is a very shallow root like '/' or 'C:/', - # then take all its children at the outer level. This makes the root paths longer so - # the individually summarized paths will be shorter and easier to look through. 
- if ( - len(summary_list) == 1 - and summary_list[0].children is not None - and len(summary_list[0].children) <= max_entries / 2 - ): - summary_list = _collapse_each_path_summary(summary_list[0].children.values()) - - if total_size_by_path: - # Sort the list so the largest size is first - summary_list.sort(key=lambda v: (-v.total_size, v.path)) # type: ignore - else: - # Sort the list so the largest file count is first - summary_list.sort(key=lambda v: (-v.file_count, v.path)) # type: ignore - - # Determine how many entries to show at the outer level and one level in, - # with a total less than or equal to max_entries - entry_counts = [ - (0 if summary_path.children is None else min(len(summary_path.children), max_entries)) - for summary_path in summary_list[:max_entries] - ] - while len(entry_counts) + sum(entry_counts) > max_entries: - max_entry_count = max(entry_counts) - if max_entry_count > len(entry_counts): - # If the largest entry count under a root path is more than the number of root paths, - # then decrease that entry count. - for i, entry_count in enumerate(reversed(entry_counts)): - if entry_count == max_entry_count: - entry_counts[len(entry_counts) - i - 1] -= 1 - break - else: - # Otherwise drop a root path from the summary - entry_counts.pop() - - # If we're going to show "... and 1 more ..." after the items, might as well show - # the last item instead - if len(entry_counts) == len(summary_list) - 1 and not summary_list[-1].is_dir(): - entry_counts.append(0) - - for entry_count, summary_path in zip(entry_counts, summary_list): - if summary_path.children is None: - lines.append(f"{summary_path.summary(include_totals=include_totals)}\n") - else: - lines.append(f"{summary_path.summary(include_totals=include_totals)}:\n") - children = list(summary_path.children.values()) - if total_size_by_path: - # Sort the list so the largest size is first - children.sort(key=lambda v: (-v.total_size, v.path)) # type: ignore - else: - # Sort the list so the largest file count is first - children.sort(key=lambda v: (-v.file_count, v.path)) # type: ignore - - # If we're going to show "... and 1 more ..." after the items, might as well show - # the last item instead - if entry_count == len(children) - 1: - entry_count += 1 - - for child in children[:entry_count]: - lines.append( - f" {child.summary(include_totals=include_totals, relative_to=summary_path.path)}\n" - ) - if len(summary_path.children) > entry_count: - lines.append(f" ... and {len(summary_path.children) - entry_count} more\n") - - if len(summary_list) > len(entry_counts): - file_count = sum(v.file_count for v in summary_list[len(entry_counts) :]) - if total_size_by_path and include_totals: - total_size = sum(v.total_size for v in summary_list[len(entry_counts) :]) # type: ignore - lines.append( - f"... and {len(summary_list) - len(entry_counts)} more ({file_count} files, {human_readable_file_size(total_size)})\n" - ) - elif include_totals: - lines.append( - f"... and {len(summary_list) - len(entry_counts)} more ({file_count} files)\n" - ) - else: - lines.append(f"... and {len(summary_list) - len(entry_counts)} more\n") - - return "".join(lines) diff --git a/src/deadline/job_attachments/_utils.py b/src/deadline/job_attachments/_utils.py deleted file mode 100644 index 837eb1bc8..000000000 --- a/src/deadline/job_attachments/_utils.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
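A minimal sketch of how the summarization helpers removed above could be exercised, assuming the pre-removal deadline.job_attachments.api module is still importable. The file names and per-path sizes below are illustrative only.

from deadline.job_attachments.api import summarize_path_list, summarize_paths_by_sequence

paths = [
    "renders/frame_001.png",
    "renders/frame_002.png",
    "renders/frame_003.png",
    "readme.txt",
]
# Hypothetical per-path sizes in bytes; when provided, it must cover every path in the list.
sizes = {path: 1_000_000 for path in paths}

# Collapses the numbered frames into a single printf-style pattern entry.
for summary in summarize_paths_by_sequence(paths, total_size_by_path=sizes):
    print(summary)

# Nested, human-readable rollup limited to a handful of entries.
print(summarize_path_list(paths, total_size_by_path=sizes, max_entries=5))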
- -import datetime -from functools import wraps -from hashlib import shake_256 -from pathlib import Path -import random -import time -from typing import Any, Callable, Optional, Tuple, Type, Union -import uuid -import sys - -__all__ = [ - "_join_s3_paths", - "_generate_random_guid", - "_float_to_iso_datetime_string", - "_get_unique_dest_dir_name", - "_get_bucket_and_object_key", - "_is_relative_to", -] - - -TEMP_DOWNLOAD_ADDED_CHARS_LENGTH = 9 -""" -Add 9 to path length to account for .Hex value when file is in the middle of downloading in windows. -e.g. test.txt when downloaded becomes test.txt.H4SD9Ddj -""" - -WINDOWS_MAX_PATH_LENGTH = 260 -""" -Windows Max path length limit of 260. -https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation -""" - -WINDOWS_UNC_PATH_STRING_PREFIX = "\\\\?\\" -""" -When this is prepended to any path on Windows, -it becomes a UNC path and is allowed to go over the 260 max path length limit. -""" - - -def _join_s3_paths(root: str, *args: str): - return "/".join([root, *args]) - - -def _generate_random_guid(): - return str(uuid.uuid4()).replace("-", "") - - -def _float_to_iso_datetime_string(time: float): - seconds = int(time) - microseconds = int((time - seconds) * 1000000) - - dt = datetime.datetime.utcfromtimestamp(seconds) + datetime.timedelta(microseconds=microseconds) - iso_string = dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ") - - return iso_string - - -def _get_unique_dest_dir_name(source_root: str) -> str: - # Note: this is a quick naive way to attempt to prevent colliding - # relative paths across manifests without adding too much - # length to the filepaths. length = 2n where n is the number - # passed to hexdigest. - return f"assetroot-{shake_256(source_root.encode()).hexdigest(10)}" - - -def _get_bucket_and_object_key(s3_path: str) -> Tuple[str, str]: - """Returns the bucket name and object key from the S3 URI""" - bucket, key = s3_path.replace("s3://", "").split("/", maxsplit=1) - return bucket, key - - -def _normalize_windows_path(path: Union[Path, str]) -> Path: - """ - Strips \\\\?\\ prefix from Windows paths. - """ - p_str = str(path) - if p_str.startswith("\\\\?\\"): - return Path(p_str[4:]) - return Path(path) - - -def _is_relative_to(path1: Union[Path, str], path2: Union[Path, str]) -> bool: - """ - Determines if path1 is relative to path2. This function is to support - Python versions (3.7 and 3.8) that do not have the built-in `Path.is_relative_to()` method. - """ - try: - p1 = _normalize_windows_path(Path(path1).resolve()) - p2 = _normalize_windows_path(Path(path2).resolve()) - p1.relative_to(p2) - return True - except ValueError: - return False - - -def _is_windows_long_path_registry_enabled() -> bool: - if sys.platform != "win32": - return True - - import ctypes - - ntdll = ctypes.WinDLL("ntdll") - ntdll.RtlAreLongPathsEnabled.restype = ctypes.c_ubyte - ntdll.RtlAreLongPathsEnabled.argtypes = () - - return bool(ntdll.RtlAreLongPathsEnabled()) - - -def _get_long_path_compatible_path(original_path: Union[str, Path]) -> Path: - """ - Given a Path or string representing a path, - make it long path compatible if needed on Windows and return the Path object - https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation - - :param original_path: Original unmodified path/string representing an absolute path. - show - :param show_long_path_warning: Whether to show a warning to the user that the resulting path is in a long path. - :return: A Path object representing the long path compatible path. 
- """ - - original_path_string = str(original_path) - if sys.platform != "win32": - return Path(original_path_string) - - if ( - len(original_path_string) + TEMP_DOWNLOAD_ADDED_CHARS_LENGTH >= WINDOWS_MAX_PATH_LENGTH - and not original_path_string.startswith(WINDOWS_UNC_PATH_STRING_PREFIX) - and not _is_windows_long_path_registry_enabled() - ): - # Prepend \\?\ to the file name to treat it as an UNC path - return Path(WINDOWS_UNC_PATH_STRING_PREFIX + original_path_string) - return Path(original_path_string) - - -def _retry( - ExceptionToCheck: Union[Type[Exception], Tuple[Type[Exception], ...]] = AssertionError, - tries: int = 2, - delay: Union[int, float, Tuple[Union[int, float], Union[int, float]]] = 1.0, - backoff: float = 1.0, - logger: Optional[Callable] = print, -) -> Callable: - """Retry calling the decorated function using an exponential backoff. - - http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/ - original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry - - :param ExceptionToCheck: the exception to check. may be a tuple of - exceptions to check - :type ExceptionToCheck: Exception or tuple - :param tries: number of times to try (not retry) before giving up - :type tries: int - :param delay: initial delay between retries in seconds - :type delay: float or tuple - :param backoff: backoff multiplier e.g. value of 2 will double the delay - each retry - :type backoff: float - :param logger: logging function to use. If None, won't log - :type logger: logging.Logger instance - """ - - def deco_retry(f: Callable) -> Callable: - @wraps(f) - def f_retry(*args: Any, **kwargs: Any) -> Callable: - mtries: int = tries - if isinstance(delay, (float, int)): - mdelay = delay - elif isinstance(delay, tuple): - mdelay = random.uniform(delay[0], delay[1]) - else: - raise ValueError(f"Provided delay {delay} isn't supported") - - while mtries > 1: - try: - return f(*args, **kwargs) - except ExceptionToCheck as e: - if logger: - logger(f"{str(e)}, Retrying in {mdelay} seconds...") - time.sleep(mdelay) - mtries -= 1 - mdelay *= backoff - return f(*args, **kwargs) - - return f_retry # true decorator - - return deco_retry diff --git a/src/deadline/job_attachments/_windows/__init__.py b/src/deadline/job_attachments/_windows/__init__.py deleted file mode 100644 index 8d929cc86..000000000 --- a/src/deadline/job_attachments/_windows/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/src/deadline/job_attachments/_windows/file.py b/src/deadline/job_attachments/_windows/file.py deleted file mode 100644 index 29a4777f2..000000000 --- a/src/deadline/job_attachments/_windows/file.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
- -import ctypes -import ctypes.wintypes -import sys - -# This assertion short-circuits mypy from type checking this module on platforms other than Windows -# https://mypy.readthedocs.io/en/stable/common_issues.html#python-version-and-system-platform-checks -assert sys.platform == "win32" - -kernel32 = ctypes.WinDLL("Kernel32") - -# https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-getfinalpathnamebyhandlew -kernel32.GetFinalPathNameByHandleW.restype = ctypes.wintypes.DWORD -kernel32.GetFinalPathNameByHandleW.argtypes = [ - ctypes.wintypes.HANDLE, # [in] HANDLE hFile, - ctypes.wintypes.LPWSTR, # [out] LPWSTR lpszFilePath, - ctypes.wintypes.DWORD, # [in] DWORD cchFilePath, - ctypes.wintypes.DWORD, # [in] DWORD dwFlags -] -GetFinalPathNameByHandleW = kernel32.GetFinalPathNameByHandleW - -VOLUME_NAME_DOS = 0 -VOLUME_NAME_GUID = 1 -VOLUME_NAME_NONE = 4 -VOLUME_NAME_NT = 2 diff --git a/src/deadline/job_attachments/api/__init__.py b/src/deadline/job_attachments/api/__init__.py deleted file mode 100644 index a85dae76b..000000000 --- a/src/deadline/job_attachments/api/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -__all__ = [ - "summarize_paths_by_nested_directory", - "summarize_paths_by_sequence", - "human_readable_file_size", - "summarize_path_list", - "PathSummary", -] - -from .._path_summarization import ( - human_readable_file_size, - summarize_paths_by_nested_directory, - summarize_paths_by_sequence, - summarize_path_list, - PathSummary, -) diff --git a/src/deadline/job_attachments/api/_hashing.py b/src/deadline/job_attachments/api/_hashing.py deleted file mode 100644 index 54972aaac..000000000 --- a/src/deadline/job_attachments/api/_hashing.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -import textwrap - -from typing import Any, Optional, List, Callable, Tuple - -from deadline.job_attachments.models import ( - AssetRootGroup, - AssetRootManifest, -) -from deadline.job_attachments.progress_tracker import ( - ProgressReportMetadata, - ProgressStatus, -) -from deadline.job_attachments.upload import S3AssetManager, SummaryStatistics - - -def _hash_attachments( - *, - asset_manager: S3AssetManager, - asset_groups: List[AssetRootGroup], - total_input_files: int, - total_input_bytes: int, - print_function_callback: Callable[[str], None] = lambda msg: None, - hashing_progress_callback: Optional[Callable[[Any], bool]] = None, - hash_cache_dir: Optional[str] = None, - telemetry_callback: Optional[Callable[[SummaryStatistics], None]] = None, -) -> Tuple[SummaryStatistics, List[AssetRootManifest]]: - """ - Starts the job attachments hashing and returns a list of the asset manifests of the hashed files. 
- Provides callbacks for: - * Printing output - * Hashing progress reporting - * Sending hashing telemetry - """ - - def _default_update_hash_progress(hashing_metadata: ProgressReportMetadata) -> bool: - return True - - if not hashing_progress_callback: - hashing_progress_callback = _default_update_hash_progress - - hashing_summary, manifests = asset_manager.hash_assets_and_create_manifest( - asset_groups=asset_groups, - total_input_files=total_input_files, - total_input_bytes=total_input_bytes, - hash_cache_dir=hash_cache_dir, - on_preparing_to_submit=hashing_progress_callback, - ) - if telemetry_callback: - telemetry_callback(hashing_summary) - if hashing_summary.total_files > 0: - print_function_callback("Hashing Summary:") - print_function_callback(textwrap.indent(str(hashing_summary), " ")) - else: - # Ensure to call the callback once if no files were processed - hashing_progress_callback( - ProgressReportMetadata( - status=ProgressStatus.PREPARING_IN_PROGRESS, - progress=100, - transferRate=0, - progressMessage="No files to hash", - processedFiles=0, - ) - ) - - return hashing_summary, manifests diff --git a/src/deadline/job_attachments/api/_utils.py b/src/deadline/job_attachments/api/_utils.py deleted file mode 100644 index e622b551f..000000000 --- a/src/deadline/job_attachments/api/_utils.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -import os - -from contextlib import ExitStack -from typing import List, Dict - -from ..exceptions import NonValidInputError -from ..asset_manifests.base_manifest import BaseAssetManifest -from ..asset_manifests.decode import decode_manifest - - -def _read_manifests(manifest_paths: List[str]) -> Dict[str, BaseAssetManifest]: - """ - Read in manfiests from the given file path list, and produce file name to manifest mapping. - - Args: - manifest_paths (List[str]): List of file paths to manifest file. - - Raises: - NonValidInputError: Raise when any of the file is not valid. - - Returns: - Dict[str, BaseAssetManifest]: File name to encoded manifest mapping - """ - - if nonvalid_files := [manifest for manifest in manifest_paths if not os.path.isfile(manifest)]: - raise NonValidInputError(f"Specified manifests {nonvalid_files} are not valid.") - - with ExitStack() as stack: - file_name_manifest_dict: Dict[str, BaseAssetManifest] = { - os.path.basename(file_path): decode_manifest( - stack.enter_context(open(file_path)).read() - ) - for file_path in manifest_paths - } - - return file_name_manifest_dict diff --git a/src/deadline/job_attachments/api/attachment.py b/src/deadline/job_attachments/api/attachment.py deleted file mode 100644 index 0029ffc9c..000000000 --- a/src/deadline/job_attachments/api/attachment.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
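`_hash_attachments` above accepts an optional `hashing_progress_callback` that receives a `ProgressReportMetadata` and returns a bool (True to keep hashing). A minimal sketch of such a callback is shown below, again assuming the pre-removal package; the logging choice is illustrative and the sketch deliberately avoids assuming specific attribute names.

```python
# Illustrative sketch of a callback compatible with the
# hashing_progress_callback parameter of _hash_attachments above.
from deadline.job_attachments.progress_tracker import ProgressReportMetadata


def report_hashing_progress(metadata: ProgressReportMetadata) -> bool:
    # The deleted code above constructs ProgressReportMetadata with fields such
    # as progress and progressMessage; printing the whole object keeps this
    # sketch free of attribute assumptions.
    print(f"hashing progress: {metadata}")
    # Returning True lets hashing continue; False would request cancellation.
    return True


# Would be wired in as:
#   _hash_attachments(..., hashing_progress_callback=report_hashing_progress)
```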
- -import os -import boto3 -import json - -from typing import Any, Optional, List, Dict, Callable -from pathlib import Path - -from deadline.job_attachments.api._utils import _read_manifests -from deadline.job_attachments.asset_manifests.base_manifest import BaseAssetManifest -from deadline.job_attachments.download import download_files_from_manifests -from deadline.job_attachments.models import ( - FileConflictResolution, - JobAttachmentS3Settings, - UploadManifestInfo, - PathMappingRule, -) -from deadline.job_attachments.progress_tracker import DownloadSummaryStatistics -from deadline.job_attachments.upload import S3AssetUploader - -from deadline.job_attachments.exceptions import NonValidInputError - - -def _attachment_download( - manifests: List[str], - s3_root_uri: str, - boto3_session: boto3.Session, - path_mapping_rules: Optional[str] = None, - print_function_callback: Callable[[Any], None] = lambda msg: None, - conflict_resolution: FileConflictResolution = FileConflictResolution.CREATE_COPY, -) -> DownloadSummaryStatistics: - """ - BETA API - This API is still evolving. - - API to download job attachments based on given list of manifests. - If path mapping rules file is given, map to corresponding destinations. - - Args: - manifests (List[str]): File Path to the manifest file for upload. - s3_root_uri (str): S3 root uri including bucket name and root prefix. - boto3_session (boto3.Session): Boto3 session for interacting with customer s3. - path_mapping_rules (Optional[str], optional): Optional file path to a JSON file contains list of path mapping. Defaults to None. - print_function_callback (Callable[[str], None], optional): Callback function to provide visibility. Defaults to lambda msg: None. - - Raises: - NonValidInputError: raise when any of the input is not valid. - """ - - file_name_manifest_dict: Dict[str, BaseAssetManifest] = _read_manifests( - manifest_paths=manifests - ) - - path_mapping_rule_list: List[PathMappingRule] = _process_path_mapping( - path_mapping_rules=path_mapping_rules - ) - - merged_manifests_by_root: Dict[str, BaseAssetManifest] = dict() - for file_name, manifest in file_name_manifest_dict.items(): - # File name is supposed to be prefixed by a hash of source path in path mapping, use that to determine destination - # If it doesn't appear in path mapping or mapping doesn't exist, download to current directory instead - destination = next( - ( - rule.destination_path - for rule in (path_mapping_rule_list or []) - if rule.get_hashed_source_path(manifest.get_default_hash_alg()) in file_name - ), - # Write to current directory partitioned by manifest name when no path mapping defined - f"{os.getcwd()}/{file_name}", - ) - # Assuming the manifest is already aggregated and correspond to a single destination - if merged_manifests_by_root.get(destination): - raise NonValidInputError( - f"{destination} is already in use, one destination path maps to one manifest file only." - ) - - merged_manifests_by_root[destination] = manifest - - # Given manifests and S3 bucket + root, downloads all files from a CAS in each manifest. 
- s3_settings: JobAttachmentS3Settings = JobAttachmentS3Settings.from_s3_root_uri(s3_root_uri) - return download_files_from_manifests( - s3_bucket=s3_settings.s3BucketName, - manifests_by_root=merged_manifests_by_root, - cas_prefix=s3_settings.full_cas_prefix(), - session=boto3_session, - conflict_resolution=conflict_resolution, - ) - - -def _attachment_upload( - manifests: List[str], - s3_root_uri: str, - boto3_session: boto3.Session, - root_dirs: List[str] = [], - path_mapping_rules: Optional[str] = None, - manifest_path_mapping: Optional[Dict[str, str]] = None, - upload_manifest_path: Optional[str] = None, - print_function_callback: Callable[[Any], None] = lambda msg: None, - s3_check_cache_dir: Optional[str] = None, - s3_max_pool_connections: int = 50, - small_file_threshold_multiplier: int = 20, -) -> List[UploadManifestInfo]: - """ - BETA API - This API is still evolving. - - API to upload job attachments based on given list of manifests and corresponding file directories. - If path mapping rules file is given, map to corresponding destinations. - - Args: - manifests (List[str]): File Path to the manifest file for upload. - s3_root_uri (str): S3 root uri including bucket name and root prefix. - boto3_session (boto3.Session): Boto3 session for interacting with customer s3. - root_dirs (List[str]): List of root directories holding attachments. Defaults to empty. - path_mapping_rules (Optional[str], optional): Optional file path to a JSON file contains list of path mapping. Defaults to None. - upload_manifest_path (Optional[str], optional): Optional path prefix for uploading given manifests. Defaults to None. - print_function_callback (Callable[[str], None], optional): Callback function to provide visibility. Defaults to lambda msg: None. - - Returns: - List[UploadManifestInfo]: A list of UploadManifestInfo objects corresponding to the input manifests - containing manifest path, hash information, and source path - - Raises: - NonValidInputError: raise when any of the input is not valid. - """ - - file_name_manifest_dict: Dict[str, BaseAssetManifest] = _read_manifests( - manifest_paths=manifests - ) - - if bool(path_mapping_rules) == bool(root_dirs): - raise NonValidInputError("One of path mapping rule and root dir must exist, and not both.") - - path_mapping_rule_list: List[PathMappingRule] = _process_path_mapping( - path_mapping_rules=path_mapping_rules, root_dirs=root_dirs - ) - - # Initialize an empty list to store manifest information - manifest_info_list = [] - - s3_settings: JobAttachmentS3Settings = JobAttachmentS3Settings.from_s3_root_uri(s3_root_uri) - asset_uploader: S3AssetUploader = S3AssetUploader( - session=boto3_session, - s3_max_pool_connections=s3_max_pool_connections, - small_file_threshold_multiplier=small_file_threshold_multiplier, - ) - - # Iterate over original manifests in the order they were provided - for manifest_path in manifests: - file_name = os.path.basename(manifest_path) - manifest: BaseAssetManifest = file_name_manifest_dict[file_name] - - # File name is supposed to be prefixed by a hash of source path in path mapping or provided root dirs - rule: Optional[PathMappingRule] = next( - # search in path mapping to determine source and destination - ( - rule - for rule in path_mapping_rule_list - if rule.get_hashed_source_path(manifest.get_default_hash_alg()) in file_name - ), - None, - ) - if not rule: - raise NonValidInputError( - f"No valid root defined for given manifest {file_name}, please check input root dirs and path mapping rule." 
- ) - - metadata = {"Metadata": {"asset-root": json.dumps(rule.source_path, ensure_ascii=True)}} - # S3 metadata must be ASCII, so use either 'asset-root' or 'asset-root-json' depending - # on whether the value is ASCII. - try: - # Add the 'asset-root' metadata if the path is ASCII - rule.source_path.encode(encoding="ascii") - metadata["Metadata"]["asset-root"] = rule.source_path - except UnicodeEncodeError: - # Add the 'asset-root-json' metadata encoded to ASCII as a JSON string - metadata["Metadata"]["asset-root-json"] = json.dumps( - rule.source_path, ensure_ascii=True - ) - if rule.source_path_format: - metadata["Metadata"]["file-system-location-name"] = rule.source_path_format - - # Uploads all files to a CAS in the manifest, optionally upload manifest file - key, data = asset_uploader.upload_assets( - job_attachment_settings=s3_settings, - manifest=manifest, - partial_manifest_prefix=upload_manifest_path, - manifest_file_name=file_name, - manifest_metadata=metadata, - source_root=Path(rule.source_path), - asset_root=Path(rule.destination_path), - s3_check_cache_dir=s3_check_cache_dir, - ) - print_function_callback( - f"Uploaded assets from {rule.destination_path}, to {s3_settings.to_s3_root_uri()}/Manifests/{key}, hashed data {data}" - ) - - manifest_info_list.append( - UploadManifestInfo( - output_manifest_path=key, - output_manifest_hash=data, - source_path=rule.source_path, - ) - ) - - return manifest_info_list - - -def _process_path_mapping( - path_mapping_rules: Optional[str] = None, root_dirs: List[str] = [] -) -> List[PathMappingRule]: - """ - Process list of path mapping rules from the input path mapping file or root directories. - - Args: - path_mapping_rules (Optional[str], optional): File path to path mapping rules. Defaults to None. - root_dirs (List[str], optional): List of root directories path. Defaults to []. - - Raises: - NonValidInputError: Raise if any of the path mapping rule file or root dirs are not valid. - - Returns: - List[PathMappingRule]: List of processed PathMappingRule - """ - - path_mapping_rule_list: List[PathMappingRule] = list() - - if path_mapping_rules: - if not os.path.isfile(path_mapping_rules): - raise NonValidInputError( - f"Specified path mapping file {path_mapping_rules} is not valid." - ) - with open(path_mapping_rules, encoding="utf8") as f: - data = json.load(f) - if "path_mapping_rules" in data: - data = data["path_mapping_rules"] - - assert isinstance(data, list), "Path mapping rules have to be a list of dict." - path_mapping_rule_list.extend([PathMappingRule(**mapping) for mapping in data]) - - if nonvalid_dirs := [root for root in root_dirs if not os.path.isdir(root)]: - raise NonValidInputError(f"Specified root dir {nonvalid_dirs} are not valid.") - - path_mapping_rule_list.extend( - PathMappingRule(source_path_format="", source_path=root, destination_path=root) - for root in root_dirs - ) - - return path_mapping_rule_list diff --git a/src/deadline/job_attachments/api/manifest.py b/src/deadline/job_attachments/api/manifest.py deleted file mode 100644 index c8042ff6a..000000000 --- a/src/deadline/job_attachments/api/manifest.py +++ /dev/null @@ -1,606 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
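`_process_path_mapping` above reads a JSON file whose top-level `path_mapping_rules` key (or a bare list) holds dicts that are splatted into `PathMappingRule`. A hedged sketch of producing such a file follows; the field names mirror how the rule objects are used in the deleted code, while the concrete paths and the `source_path_format` value are invented for illustration.

```python
# Illustrative only: writes a path mapping rules file of the shape that
# _process_path_mapping above accepts. All values here are hypothetical.
import json

path_mapping = {
    "path_mapping_rules": [
        {
            "source_path_format": "posix",
            "source_path": "/mnt/projects/show_a",
            "destination_path": "/local/session/assetroot-1234",
        }
    ]
}

with open("path_mapping_rules.json", "w", encoding="utf8") as f:
    json.dump(path_mapping, f, indent=2)
```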
- -import datetime -from io import BytesIO -import os -from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple - -import boto3 -import botocore.client - -from deadline.job_attachments._diff import ( - _fast_file_list_to_manifest_diff, - compare_manifest, -) -from deadline.job_attachments._glob import _process_glob_inputs, _glob_paths -from deadline.job_attachments.api._utils import _read_manifests -from deadline.job_attachments.asset_manifests._create_manifest import ( - _create_manifest_for_single_root, -) -from deadline.job_attachments.asset_manifests.base_manifest import ( - BaseAssetManifest, - BaseManifestPath, -) -from deadline.job_attachments.asset_manifests.decode import decode_manifest -from deadline.job_attachments.asset_manifests.hash_algorithms import hash_data -from deadline.job_attachments.caches.hash_cache import HashCache -from deadline.job_attachments.download import ( - get_manifest_from_s3, - get_output_manifests_by_asset_root, - merge_asset_manifests, -) -from deadline.job_attachments.models import ( - S3_MANIFEST_FOLDER_NAME, - FileStatus, - GlobConfig, - JobAttachmentS3Settings, - ManifestDiff, - ManifestDownload, - ManifestDownloadResponse, - ManifestSnapshot, - ManifestMerge, - default_glob_all, - AssetType, -) -from deadline.job_attachments.progress_tracker import ProgressReportMetadata -from deadline.job_attachments._utils import _get_long_path_compatible_path -from deadline.job_attachments.upload import S3AssetManager, S3AssetUploader, SummaryStatistics - -""" -APIs here should be business logic only. It should perform one thing, and one thing well. -It should use basic primitives like S3 upload, download, boto3 APIs. -These APIs should be boto3 session agnostic and a specific Boto3 Credential to use. -""" - - -def _glob_files( - root: str, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, - include_exclude_config: Optional[str] = None, -) -> List[str]: - """ - :param include: Include glob to look for files to add to the manifest. - :param exclude: Exclude glob to exclude files from the manifest. - :param include_exclude_config: Config JSON or file containeing input and exclude config. - :returns: All files matching the include and exclude expressions. - """ - - # Get all files in the root. - glob_config: GlobConfig - if include or exclude: - include = include if include is not None else default_glob_all() - exclude = exclude if exclude is not None else [] - glob_config = GlobConfig(include_glob=include, exclude_glob=exclude) - elif include_exclude_config: - glob_config = _process_glob_inputs(include_exclude_config) - else: - # Default, include all. - glob_config = GlobConfig() - - input_files = _glob_paths( - root, include=glob_config.include_glob, exclude=glob_config.exclude_glob - ) - return input_files - - -def _manifest_snapshot( - *, - root: str, - destination: str, - name: str, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, - include_exclude_config: Optional[str] = None, - diff: Optional[str] = None, - force_rehash: bool = False, - print_function_callback: Callable[[Any], None] = lambda msg: None, - hashing_progress_callback: Optional[Callable[[ProgressReportMetadata], bool]] = None, - telemetry_callback: Optional[Callable[[SummaryStatistics], None]] = None, - hash_cache_dir: Optional[str] = None, -) -> Optional[ManifestSnapshot]: - # Get all files in the root. 
- glob_config: GlobConfig - if include or exclude: - include = include if include is not None else default_glob_all() - exclude = exclude if exclude is not None else [] - glob_config = GlobConfig(include_glob=include, exclude_glob=exclude) - elif include_exclude_config: - glob_config = _process_glob_inputs(include_exclude_config) - else: - # Default, include all. - glob_config = GlobConfig() - - current_files = _glob_paths( - root, include=glob_config.include_glob, exclude=glob_config.exclude_glob - ) - - # Compute the output manifest immediately and hash. - if not diff: - output_manifest = _create_manifest_for_single_root( - files=current_files, - root=root, - print_function_callback=print_function_callback, - hashing_progress_callback=hashing_progress_callback, - telemetry_callback=telemetry_callback, - hash_cache_dir=hash_cache_dir, - ) - if not output_manifest: - return None - - # If this is a diff manifest, load the supplied manifest file. - else: - # Parse local manifest - with open(diff) as source_diff: - source_manifest_str = source_diff.read() - source_manifest = decode_manifest(source_manifest_str) - - # Get the differences - changed_paths: List[str] = [] - - # Fast comparison using time stamps and sizes. - if not force_rehash: - diff_list: List[Tuple[str, FileStatus]] = _fast_file_list_to_manifest_diff( - root=root, - current_files=current_files, - diff_manifest=source_manifest, - print_function_callback=print_function_callback, - return_root_relative_path=False, - ) - for diff_file in diff_list: - # Add all new and modified - if diff_file[1] != FileStatus.DELETED: - changed_paths.append(diff_file[0]) - else: - # In "slow / thorough" mode, we check by hash, which is definitive. - output_manifest = _create_manifest_for_single_root( - files=current_files, - root=root, - print_function_callback=print_function_callback, - hashing_progress_callback=hashing_progress_callback, - telemetry_callback=telemetry_callback, - hash_cache_dir=hash_cache_dir, - ) - if not output_manifest: - return None - differences: List[Tuple[FileStatus, BaseManifestPath]] = compare_manifest( - source_manifest, output_manifest - ) - for diff_item in differences: - if diff_item[0] == FileStatus.MODIFIED or diff_item[0] == FileStatus.NEW: - full_diff_path = f"{root}/{diff_item[1].path}" - changed_paths.append(full_diff_path) - print_function_callback( - f"Found difference at: {full_diff_path}, Status: {diff_item[0]}" - ) - - # If there were no files diffed, return None, there was nothing to snapshot. - if len(changed_paths) == 0: - return None - - # Since the files are already hashed, we can easily re-use has_attachments to remake a diff manifest. - output_manifest = _create_manifest_for_single_root( - files=changed_paths, - root=root, - print_function_callback=print_function_callback, - hashing_progress_callback=hashing_progress_callback, - telemetry_callback=telemetry_callback, - hash_cache_dir=hash_cache_dir, - ) - if not output_manifest: - return None - - # Write created manifest into local file, at the specified location at destination - if output_manifest is not None: - local_manifest_file = _write_manifest( - root=root, - manifest=output_manifest, - destination=destination, - name=name, - ) - # Output results. - print_function_callback(f"Manifest generated at {local_manifest_file}") - return ManifestSnapshot(root=root, manifest=local_manifest_file) - else: - # No manifest generated. 
- print_function_callback("No manifest generated") - return None - - -def _write_manifest( - root: str, - manifest: BaseAssetManifest, - destination: str, - name: Optional[str] = None, -) -> str: - """ - Write a manifest to a destination. - """ - # Write created manifest into local file, at the specified location at destination - root_hash: str = hash_data(root.encode("utf-8"), manifest.get_default_hash_alg()) - timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - manifest_name = name if name else root.replace("/", "_").replace("\\", "_").replace(":", "_") - manifest_name = manifest_name[1:] if manifest_name[0] == "_" else manifest_name - manifest_name = f"{manifest_name}-{root_hash}-{timestamp}.manifest" - - local_manifest_path = str( - _get_long_path_compatible_path( - os.path.join(destination, manifest_name), - ) - ) - os.makedirs(os.path.dirname(local_manifest_path), exist_ok=True) - with open(local_manifest_path, "w") as file: - file.write(manifest.encode()) - - return local_manifest_path - - -def _manifest_diff( - manifest: str, - root: str, - include: Optional[List[str]] = None, - exclude: Optional[List[str]] = None, - include_exclude_config: Optional[str] = None, - force_rehash=False, - print_function_callback: Callable[[Any], None] = lambda msg: None, - cache_dir: Optional[str] = None, -) -> ManifestDiff: - """ - BETA API - This API is still evolving but will be made public in the near future. - API to diff a manifest root with a previously snapshotted manifest. - :param manifest: Manifest file path to compare against. - :param root: Root directory to generate the manifest fileset. - :param include: Include glob to look for files to add to the manifest. - :param exclude: Exclude glob to exclude files from the manifest. - :param include_exclude_config: Config JSON or file containeing input and exclude config. - :param print_function_callback: Callback function to handle print messages. - :returns: ManifestDiff object containing all new changed, deleted files. - """ - - # Find all files matching our regex - input_files = _glob_files( - root=root, - include=include, - exclude=exclude, - include_exclude_config=include_exclude_config, - ) - input_paths = [Path(p) for p in input_files] - - # Placeholder Asset Manager - asset_manager = S3AssetManager() - - # parse the given manifest to compare against. - local_manifest_object: BaseAssetManifest - with open(manifest) as input_file: - manifest_data_str = input_file.read() - local_manifest_object = decode_manifest(manifest_data_str) - - output: ManifestDiff = ManifestDiff() - - # Helper function to update output datastructure. - def process_output(status: FileStatus, path: str, output_diff: ManifestDiff): - if status == FileStatus.MODIFIED: - output_diff.modified.append(path) - elif status == FileStatus.NEW: - output_diff.new.append(path) - elif status == FileStatus.DELETED: - output_diff.deleted.append(path) - - if force_rehash: - # hash and create manifest of local directory - cache_config = cache_dir - with HashCache(cache_config) as hash_cache: - directory_manifest_object = asset_manager._create_manifest_file( - input_paths=input_paths, root_path=root, hash_cache=hash_cache - ) - - # Hash based compare manifests. - differences: List[Tuple[FileStatus, BaseManifestPath]] = compare_manifest( - reference_manifest=local_manifest_object, - compare_manifest=directory_manifest_object, - ) - # Map to output datastructure. 
- for item in differences: - process_output(item[0], item[1].path, output) - - else: - # File based comparisons. - fast_diff: List[Tuple[str, FileStatus]] = _fast_file_list_to_manifest_diff( - root=root, - current_files=input_files, - diff_manifest=local_manifest_object, - print_function_callback=print_function_callback, - ) - for fast_diff_item in fast_diff: - process_output(fast_diff_item[1], fast_diff_item[0], output) - - return output - - -def _manifest_upload( - manifest_file: str, - s3_bucket_name: str, - s3_cas_prefix: str, - boto_session: boto3.Session, - s3_key_prefix: Optional[str] = None, - print_function_callback: Callable[[Any], None] = lambda msg: None, -): - """ - BETA API - This API is still evolving but will be made public in the near future. - API to upload a job attachment manifest to the Content Addressable Storage. Manifests will be - uploaded to s3://{s3_bucket_name}/{cas_prefix}/Manifests/{s3_key_prefix}/{manifest_file_name} as per the Deadline CAS folder structure. - manifest_file: File Path to the manifest file for upload. - s3_bucket_name: S3 bucket name. - boto_session: S3 Content Addressable Storage prefix. - s3_key_prefix: [Optional] S3 prefix path to the Content Addressable Storge. - boto_session: Boto3 session. - print_function_callback: Callback function to handle print messages. - """ - # S3 metadata - - # Upload settings: - s3_metadata: Dict[str, Any] = {"Metadata": {}} - s3_metadata["Metadata"]["file-system-location-name"] = manifest_file - - # Always upload the manifest file to case root /Manifest with the original file name. - manifest_path: str = "/".join( - [ - s3_cas_prefix, - S3_MANIFEST_FOLDER_NAME, - s3_key_prefix, - Path(manifest_file).name, - ] - if s3_key_prefix - else [s3_cas_prefix, S3_MANIFEST_FOLDER_NAME, Path(manifest_file).name] - ) - - # S3 uploader. - upload = S3AssetUploader( - session=boto_session, - s3_max_pool_connections=50, - small_file_threshold_multiplier=20, - ) - - manifest_file = str(_get_long_path_compatible_path(manifest_file)) - - with open(manifest_file) as manifest: - upload.upload_bytes_to_s3( - bytes=BytesIO(manifest.read().encode("utf-8")), - bucket=s3_bucket_name, - key=manifest_path, - progress_handler=print_function_callback, - extra_args=s3_metadata, - ) - - -def _manifest_download( - *, - deadline_client: botocore.client.BaseClient, - download_dir: str, - farm_id: str, - job_id: str, - queue_id: str, - queue_role_session: boto3.Session, - queue_s3_settings: JobAttachmentS3Settings, - asset_type: AssetType = AssetType.ALL, - print_function_callback: Callable[[Any], None] = lambda msg: None, - step_id: Optional[str] = None, -) -> ManifestDownloadResponse: - """ - BETA API - This API is still evolving but will be made public in the near future. - API to download the Job Attachment manifest for a Job, and optionally dependencies for Step. - deadline_client: Deadline client for API calls. - download_dir: Download directory. - farm_id: The Deadline Farm to download from. - job_id: Job Id to download. - queue_id: The Deadline Queue to download from. - queue_role_session: Boto3 session for the queue role. - queue_s3_settings: S3 settings for the queue's job attachments. - asset_type: Which asset manifests should be downloaded for given job (& optionally step), options are Input, Output, All. Default behaviour is All. - print_function_callback: Callback function to handle print messages. - step_id: Optional, download manifest for a step. - return ManifestDownloadResponse Downloaded Manifest data. 
Contains source S3 key and local download path. - """ - - # Get S3 prefix - s3_prefix: Path = Path(queue_s3_settings.rootPrefix, S3_MANIFEST_FOLDER_NAME) - - # Capture a list of success download files for JSON output. - successful_downloads: List[ManifestDownload] = [] - - # Utility function to build up manifests by root. - manifests_by_root: Dict[str, List[BaseAssetManifest]] = dict() - - # Set the values of download input & output as per selected asset types in the api request - download_input: bool = ( - True if asset_type is None or asset_type in (AssetType.INPUT, AssetType.ALL) else False - ) - download_output: bool = ( - True if asset_type is None or asset_type in (AssetType.OUTPUT, AssetType.ALL) else False - ) - - def add_manifest_by_root( - manifests_by_root: Dict[str, list], root: str, manifest: BaseAssetManifest - ): - if root not in manifests_by_root: - manifests_by_root[root] = [] - manifests_by_root[root].append(manifest) - - # Get the job from deadline api - job: dict = deadline_client.get_job(farmId=farm_id, queueId=queue_id, jobId=job_id) - - # If input manifests need to be downloaded - if download_input: - print_function_callback(f"Downloading input manifests for job: {job_id}") - - # Get input_manifest_paths from Deadline GetJob API - attachments: dict = job.get("attachments", {}) - input_manifest_paths: List[Tuple[str, str]] = [ - (manifest.get("inputManifestPath", ""), manifest["rootPath"]) - for manifest in attachments.get("manifests", []) - ] - - # Download each input_manifest_path - for input_manifest_path, root_path in input_manifest_paths: - asset_manifest: BaseAssetManifest = get_manifest_from_s3( - manifest_key=(s3_prefix / input_manifest_path).as_posix(), - s3_bucket=queue_s3_settings.s3BucketName, - session=queue_role_session, - ) - if asset_manifest is not None: - print_function_callback(f"Found input manifest for root: {root_path}") - add_manifest_by_root( - manifests_by_root=manifests_by_root, - root=root_path, - manifest=asset_manifest, - ) - - # Now handle step-step dependencies - if step_id is not None: - print_function_callback(f"Finding step-step dependency manifests for step: {step_id}") - - # Get Step-Step dependencies with pagination - next_token = "" - while next_token is not None: - step_dep_response = deadline_client.list_step_dependencies( - farmId=farm_id, - queueId=queue_id, - jobId=job_id, - stepId=step_id, - nextToken=next_token, - ) - - for dependent_step in step_dep_response["dependencies"]: - print_function_callback( - f"Found Step-Step dependency. {dependent_step['stepId']}" - ) - - # Get manifests for the step-step dependency - step_manifests_by_root: Dict[str, List[BaseAssetManifest]] = ( - get_output_manifests_by_asset_root( - s3_settings=queue_s3_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id=job_id, - step_id=dependent_step["stepId"], - session=queue_role_session, - ) - ) - # Merge all manifests by root. 
- for root in step_manifests_by_root.keys(): - for manifest in step_manifests_by_root[root]: - print_function_callback( - f"Found step-step output manifest for root: {root}" - ) - add_manifest_by_root( - manifests_by_root=manifests_by_root, - root=root, - manifest=manifest, - ) - - next_token = step_dep_response.get("nextToken") - - # If output manifests need to be downloaded - if download_output: - output_manifests_by_root: Dict[str, List[BaseAssetManifest]] - if step_id is not None: - print_function_callback( - f"Downloading output manifests step: {step_id} of job: {job_id}" - ) - # Only get the output manifests for selected step - output_manifests_by_root = get_output_manifests_by_asset_root( - s3_settings=queue_s3_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id=job_id, - step_id=step_id, - session=queue_role_session, - ) - - else: - print_function_callback(f"Downloading output manifests for job: {job_id}") - # Get output manifests for all steps of the job - output_manifests_by_root = get_output_manifests_by_asset_root( - s3_settings=queue_s3_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id=job_id, - session=queue_role_session, - ) - - # Merge all output manifests by root. - for root in output_manifests_by_root.keys(): - for manifest in output_manifests_by_root[root]: - print_function_callback(f"Found output manifest for root: {root}") - add_manifest_by_root( - manifests_by_root=manifests_by_root, root=root, manifest=manifest - ) - - # Finally, merge all manifest paths to create unified manifests. - # TODO: Filter outputs by path - - merged_manifests: Dict[str, BaseAssetManifest] = {} - for root in manifests_by_root.keys(): - merged_manifest = merge_asset_manifests(manifests_by_root[root]) - if merged_manifest: - merged_manifests[root] = merged_manifest - - # Save the manifest files to disk. - for root in merged_manifests.keys(): - # Save the merged manifest as {root}_{hash}_timestamp. - root_hash: str = hash_data( - root.encode("utf-8"), merged_manifests[root].get_default_hash_alg() - ) - timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - manifest_name = root.replace("/", "_") - manifest_name = manifest_name[1:] if manifest_name[0] == "_" else manifest_name - manifest_name = f"{manifest_name}-{root_hash}-{timestamp}.manifest" - - local_manifest_file_path = os.path.join(download_dir, manifest_name) - with open(local_manifest_file_path, "w") as file: - file.write(merged_manifests[root].encode()) - successful_downloads.append( - ManifestDownload(manifest_root=root, local_manifest_path=str(local_manifest_file_path)) - ) - print_function_callback( - f"Downloaded merged manifest for root: {root} to: {local_manifest_file_path}" - ) - - # JSON output at the end. - output = ManifestDownloadResponse(downloaded=successful_downloads) - return output - - -def _manifest_merge( - root: str, - manifest_files: List[str], - destination: str, - name: Optional[str], - print_function_callback: Callable[[Any], None] = lambda msg: None, -) -> Optional[ManifestMerge]: - """ - BETA API - API to merge multiple manifests into one. - root: Root path for the manifest. - manifest_files: List of manifest files to merge. - destination: Destination directory for the merged manifest. - name: Name of the merged manifest. - print_function_callback: Callback function to handle print messages. - return ManifestMerge object containing the merged manifest. 
- """ - - manifests: List[BaseAssetManifest] = list( - _read_manifests(manifest_paths=manifest_files).values() - ) - - merged_manifest = merge_asset_manifests(manifests) - - if not merged_manifest: - return None - - local_manifest_file = _write_manifest( - root=root, manifest=merged_manifest, destination=destination, name=name - ) - print_function_callback(f"Manifest generated at {local_manifest_file}") - - return ManifestMerge(manifest_root=root, local_manifest_path=local_manifest_file) diff --git a/src/deadline/job_attachments/asset_manifests/__init__.py b/src/deadline/job_attachments/asset_manifests/__init__.py deleted file mode 100644 index fc9c48742..000000000 --- a/src/deadline/job_attachments/asset_manifests/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -from .base_manifest import BaseAssetManifest, BaseManifestPath -from .hash_algorithms import HashAlgorithm, hash_data, hash_file -from .manifest_model import BaseManifestModel, ManifestModelRegistry -from .versions import ManifestVersion - -__all__ = [ - "ManifestVersion", - "ManifestModelRegistry", - "BaseAssetManifest", - "BaseManifestModel", - "BaseManifestPath", - "HashAlgorithm", - "hash_data", - "hash_file", -] - -ManifestModelRegistry.register() diff --git a/src/deadline/job_attachments/asset_manifests/_canonical_json.py b/src/deadline/job_attachments/asset_manifests/_canonical_json.py deleted file mode 100644 index 2bb1e57ba..000000000 --- a/src/deadline/job_attachments/asset_manifests/_canonical_json.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""Module that defines the second iteration of the asset manifest""" - -from __future__ import annotations - -import dataclasses -import json - -from .base_manifest import BaseAssetManifest, BaseManifestPath - - -def canonical_path_comparator(path: BaseManifestPath): - """ - Comparator for sorting paths. - """ - # Sort by UTF-16 values as per the spec - # https://www.rfc-editor.org/rfc/rfc8785.html#name-sorting-of-object-propertie - # Use the "surrogatepass" error handler because filenames encountered in the wild - # include surrogates. - return path.path.encode("utf-16_be", errors="surrogatepass") - - -def manifest_to_canonical_json_string(manifest: BaseAssetManifest) -> str: - """ - Return a canonicalized JSON string based on the following: - * The JSON file *MUST* adhere to the JSON canonicalization guidelines - outlined here (https://www.rfc-editor.org/rfc/rfc8785.html). - * For now this is a simplification of this spec. Whitespace between JSON tokens are - not emitted, and the keys are lexographically sorted. However the current implementation doesn't - serialize Literals, String, Numbers, etc. to the letter of the spec explicitly. - It implicitly follows the spec as the object keys all fall within the ASCII range of characters - and this version of the Asset Manifest only serializes strings and integers. - * The paths array *MUST* be in lexicographical order by path. - """ - return json.dumps( - dataclasses.asdict(manifest), sort_keys=True, separators=(",", ":"), ensure_ascii=True - ) diff --git a/src/deadline/job_attachments/asset_manifests/_create_manifest.py b/src/deadline/job_attachments/asset_manifests/_create_manifest.py deleted file mode 100644 index 1e8a7c0ed..000000000 --- a/src/deadline/job_attachments/asset_manifests/_create_manifest.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
- -from typing import Any, Callable, List, Optional - -from deadline.job_attachments.asset_manifests.base_manifest import BaseAssetManifest -from deadline.job_attachments.progress_tracker import ProgressReportMetadata -from deadline.job_attachments.upload import S3AssetManager, SummaryStatistics - -from ...job_attachments.api._hashing import _hash_attachments - - -def _create_manifest_for_single_root( - *, - files: List[str], - root: str, - print_function_callback: Callable[[Any], None] = lambda msg: None, - hashing_progress_callback: Optional[Callable[[ProgressReportMetadata], bool]] = None, - telemetry_callback: Optional[Callable[[SummaryStatistics], None]] = None, - hash_cache_dir: Optional[str] = None, -) -> Optional[BaseAssetManifest]: - """ - Shared logic to create a manifest file from a single root. - :param files: Input files to create a manifest with. - :param root: Asset root of the files. - :param print_function_callback: Callback for printing status messages. - :param hashing_progress_callback: Optional callback for hashing progress updates. - :param telemetry_callback: Optional callback for hashing telemetry reporting. - :param hash_cache_dir: Optional directory for the hash cache. - :return: The generated manifest, or None if no manifest was generated. - """ - asset_manager = S3AssetManager() - - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=files, output_paths=[root], referenced_paths=[] - ) - # We only provided 1 root path, so output should only have 1 group. - assert len(upload_group.asset_groups) == 1 - - manifests = None - if upload_group.asset_groups: - _, manifests = _hash_attachments( - asset_manager=asset_manager, - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - print_function_callback=print_function_callback, - hashing_progress_callback=hashing_progress_callback, - hash_cache_dir=hash_cache_dir, - telemetry_callback=telemetry_callback, - ) - - if not manifests or len(manifests) == 0: - print_function_callback("No manifest generated") - return None - else: - # This is a hard failure, we are snapshotting 1 directory. - assert len(manifests) == 1 - - # Return the generated manifest. - return manifests[0].asset_manifest diff --git a/src/deadline/job_attachments/asset_manifests/base_manifest.py b/src/deadline/job_attachments/asset_manifests/base_manifest.py deleted file mode 100644 index 7cad314df..000000000 --- a/src/deadline/job_attachments/asset_manifests/base_manifest.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""Contains the base asset manifest and entities that are part of the Asset Manifest""" - -from __future__ import annotations - -from abc import ABC, abstractmethod -from dataclasses import dataclass, fields -from typing import Any, ClassVar - -from .hash_algorithms import HashAlgorithm -from .versions import ManifestVersion - - -@dataclass -class BaseManifestPath(ABC): - """ - Data class for paths in the Asset Manifest - """ - - path: str - hash: str - size: int - mtime: int - manifest_version: ClassVar[ManifestVersion] - - def __init__(self, *, path: str, hash: str, size: int, mtime: int) -> None: - self.path = path - self.hash = hash - self.size = size - self.mtime = mtime - - def __eq__(self, other: object) -> bool: - """ - By default dataclasses still check ClassVars for equality. - We only want to compare fields. 
- :param other: - :return: True if all fields are equal, False otherwise. - """ - if not isinstance(other, BaseManifestPath): - return NotImplemented - return fields(self) == fields(other) - - -@dataclass -class BaseAssetManifest(ABC): - """Base class for the Asset Manifest.""" - - hashAlg: HashAlgorithm - paths: list[BaseManifestPath] - manifestVersion: ManifestVersion - - def __init__( - self, - *, - paths: list[BaseManifestPath], - hash_alg: HashAlgorithm, - ): - self.paths = paths - self.hashAlg = hash_alg - - @classmethod - @abstractmethod - def get_default_hash_alg(cls) -> HashAlgorithm: # pragma: no cover - """Returns the default hashing algorithm for the Asset Manifest""" - raise NotImplementedError( - "Asset Manifest base class does not implement get_default_hash_alg" - ) - - @classmethod - @abstractmethod - def decode(cls, *, manifest_data: dict[str, Any]) -> BaseAssetManifest: # pragma: no cover - """Turn a dictionary for a manifest into an AssetManifest object""" - raise NotImplementedError("Asset Manifest base class does not implement decode") - - @abstractmethod - def encode(self) -> str: # pragma: no cover - """ - Recursively encode the Asset Manifest into a string according to - whatever format the Asset Manifest was written for. - """ - raise NotImplementedError("Asset Manifest base class does not implement encode") diff --git a/src/deadline/job_attachments/asset_manifests/decode.py b/src/deadline/job_attachments/asset_manifests/decode.py deleted file mode 100644 index 2daca04ce..000000000 --- a/src/deadline/job_attachments/asset_manifests/decode.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""Contains methods for decoding and validating Asset Manifests.""" - -from __future__ import annotations - -import json -import re -from typing import Any, Optional, Tuple - -from ..exceptions import ManifestDecodeValidationError -from .base_manifest import BaseAssetManifest -from .manifest_model import ManifestModelRegistry -from .versions import ManifestVersion -from .v2023_03_03.validate import validate_manifest_2023_03_03 - -alphanum_regex = re.compile("[a-zA-Z0-9]+") - - -def validate_manifest( - manifest: dict[str, Any], version: ManifestVersion -) -> Tuple[bool, Optional[str]]: - """ - Checks if the given manifest is valid for the given manifest version. Returns True if the manifest - is valid for the given version. Returns False and a string explaining the error if the manifest is not valid. - """ - if version == ManifestVersion.v2023_03_03: - return validate_manifest_2023_03_03(manifest) - else: - return False, f"Version {version} is not supported" - - -def decode_manifest(manifest: str) -> BaseAssetManifest: - """ - Takes in a manifest string and returns an Asset Manifest object. - A ManifestDecodeValidationError will be raised if the manifest version is unknown or - the manifest is not valid. - """ - document: dict[str, Any] = json.loads(manifest) - - try: - version = ManifestVersion(document["manifestVersion"]) - except ValueError: - # Value of the manifest version is not one we know. 
- supported_versions = ", ".join( - [v.value for v in ManifestVersion if v != ManifestVersion.UNDEFINED] - ) - raise ManifestDecodeValidationError( - f"Unknown manifest version: {document['manifestVersion']} " - f"(Currently supported Manifest versions: {supported_versions})" - ) - except KeyError: - raise ManifestDecodeValidationError( - 'Manifest is missing the required "manifestVersion" field' - ) - - manifest_valid, error_string = validate_manifest(document, version) - - if not manifest_valid: - raise ManifestDecodeValidationError(error_string) - - manifest_model = ManifestModelRegistry.get_manifest_model(version=version) - decoded_manifest = manifest_model.AssetManifest.decode(manifest_data=document) - - # Validate hashes are alphanumeric - for path in decoded_manifest.paths: - if alphanum_regex.fullmatch(path.hash) is None: - raise ManifestDecodeValidationError( - f"The hash {path.hash} for path {path.path} is not alphanumeric" - ) - - return decoded_manifest diff --git a/src/deadline/job_attachments/asset_manifests/hash_algorithms.py b/src/deadline/job_attachments/asset_manifests/hash_algorithms.py deleted file mode 100644 index 037500bd0..000000000 --- a/src/deadline/job_attachments/asset_manifests/hash_algorithms.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""Module that defines the hashing algorithms supported by this library.""" - -import io - -from enum import Enum - -from ..exceptions import UnsupportedHashingAlgorithmError - - -class HashAlgorithm(str, Enum): - """ - Enumerant of all hashing algorithms supported by this library. - - Algorithms: - XXH128 - The xxhash 128-bit hashing algorithm. - - """ - - XXH128 = "xxh128" - - -def hash_file(file_path: str, hash_alg: HashAlgorithm) -> str: - """Hashes the given file using the given hashing algorithm.""" - if hash_alg == HashAlgorithm.XXH128: - from xxhash import xxh3_128 - - hasher = xxh3_128() - else: - raise UnsupportedHashingAlgorithmError( - f"Unsupported hashing algorithm provided: {hash_alg}" - ) - - with open(file_path, "rb") as file: - while True: - chunk = file.read(io.DEFAULT_BUFFER_SIZE) - if not chunk: - break - hasher.update(chunk) - return hasher.hexdigest() - - -def hash_data(data: bytes, hash_alg: HashAlgorithm) -> str: - """Hashes the given data bytes using the given hashing algorithm.""" - if hash_alg == HashAlgorithm.XXH128: - from xxhash import xxh3_128 - - hasher = xxh3_128() - else: - raise UnsupportedHashingAlgorithmError( - f"Unsupported hashing algorithm provided: {hash_alg}" - ) - - hasher.update(data) - return hasher.hexdigest() diff --git a/src/deadline/job_attachments/asset_manifests/manifest_model.py b/src/deadline/job_attachments/asset_manifests/manifest_model.py deleted file mode 100644 index 80b204c53..000000000 --- a/src/deadline/job_attachments/asset_manifests/manifest_model.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
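The deleted `hash_file` helper streams a file through xxhash's 128-bit XXH3 in `io.DEFAULT_BUFFER_SIZE` chunks. An equivalent standalone sketch is below; it needs only the third-party `xxhash` package, and the file name is hypothetical.

```python
# Standalone equivalent of the chunked xxh128 hashing shown above.
# Requires the xxhash package; "scene.ma" is a made-up file name.
import io

from xxhash import xxh3_128


def xxh128_file(path: str) -> str:
    hasher = xxh3_128()
    with open(path, "rb") as f:
        # Read in buffer-sized chunks so large files are not loaded at once.
        while chunk := f.read(io.DEFAULT_BUFFER_SIZE):
            hasher.update(chunk)
    return hasher.hexdigest()


print(xxh128_file("scene.ma"))
```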
- -"""Module for the base Manifest Model.""" - -from __future__ import annotations - -from typing import Type - -from .base_manifest import ( - BaseAssetManifest, - BaseManifestPath, -) # noqa # pylint: disable=unused-import -from .versions import ManifestVersion - - -class BaseManifestModel: - """The base Manifest Model""" - - manifest_version: ManifestVersion = ManifestVersion.UNDEFINED # pylint: disable=invalid-name - AssetManifest: Type[BaseAssetManifest] - Path: Type[BaseManifestPath] - - -class ManifestModelRegistry: - _asset_manifest_mapping: dict[ManifestVersion, Type[BaseManifestModel]] = dict() - - @classmethod - def register(cls) -> None: - """ - Register the availble manifest models. - """ - # Import here to avoid circular dependancies. - from .v2023_03_03 import ManifestModel as _ManifestModel2023_03_03 - - new_manifests = { - ManifestVersion.v2023_03_03: _ManifestModel2023_03_03, - } - cls._asset_manifest_mapping = {**cls._asset_manifest_mapping, **new_manifests} - - @classmethod - def get_manifest_model(cls, *, version: ManifestVersion) -> Type[BaseManifestModel]: - """ - Get the manifest model for the specified version. - """ - manifest_model = cls._asset_manifest_mapping.get(version, None) - if not manifest_model: - raise RuntimeError(f"No model for asset manifest version: {version}") - return manifest_model diff --git a/src/deadline/job_attachments/asset_manifests/v2023_03_03/__init__.py b/src/deadline/job_attachments/asset_manifests/v2023_03_03/__init__.py deleted file mode 100644 index bdd25d34f..000000000 --- a/src/deadline/job_attachments/asset_manifests/v2023_03_03/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -from .asset_manifest import AssetManifest, ManifestModel, ManifestPath - -__all__ = ["ManifestModel", "ManifestPath", "AssetManifest"] diff --git a/src/deadline/job_attachments/asset_manifests/v2023_03_03/asset_manifest.py b/src/deadline/job_attachments/asset_manifests/v2023_03_03/asset_manifest.py deleted file mode 100644 index f825328f3..000000000 --- a/src/deadline/job_attachments/asset_manifests/v2023_03_03/asset_manifest.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""Module that defines the v2023-03-03 version of the asset manifest""" - -from __future__ import annotations - -from dataclasses import dataclass -from typing import Any, Type - -from .._canonical_json import canonical_path_comparator, manifest_to_canonical_json_string -from ..base_manifest import BaseAssetManifest, BaseManifestPath -from ..hash_algorithms import HashAlgorithm -from ..manifest_model import BaseManifestModel -from ..versions import ManifestVersion -from ...exceptions import ManifestDecodeValidationError - - -SUPPORTED_HASH_ALGS: set[HashAlgorithm] = {HashAlgorithm.XXH128} -DEFAULT_HASH_ALG: HashAlgorithm = HashAlgorithm.XXH128 - - -@dataclass -class ManifestPath(BaseManifestPath): - """ - Extension for version v2023-03-03 of the asset manifest. 
- """ - - manifest_version = ManifestVersion.v2023_03_03 - - def __init__(self, *, path: str, hash: str, size: int, mtime: int) -> None: - super().__init__(path=path, hash=hash, size=size, mtime=mtime) - - -@dataclass -class AssetManifest(BaseAssetManifest): - """Version v2023-03-03 of the asset manifest""" - - totalSize: int # pyline: disable=invalid-name - - def __init__( - self, *, hash_alg: HashAlgorithm, paths: list[BaseManifestPath], total_size: int - ) -> None: - if hash_alg not in SUPPORTED_HASH_ALGS: - raise ManifestDecodeValidationError( - f"Unsupported hashing algorithm: {hash_alg}. Must be one of: {[e.value for e in SUPPORTED_HASH_ALGS]}" - ) - - super().__init__(hash_alg=hash_alg, paths=paths) - self.totalSize = total_size - self.manifestVersion = ManifestVersion.v2023_03_03 - - @classmethod - def decode(cls, *, manifest_data: dict[str, Any]) -> AssetManifest: - """ - Return an instance of this class given a manifest dictionary. - Assumes the manifest has been validated prior to calling. - """ - try: - hash_alg: HashAlgorithm = HashAlgorithm(manifest_data["hashAlg"]) - except ValueError: - raise ManifestDecodeValidationError( - f"Unsupported hashing algorithm: {hash_alg}. Must be one of: {[e.value for e in SUPPORTED_HASH_ALGS]}" - ) - - return cls( - hash_alg=hash_alg, - paths=[ - ManifestPath( - path=path["path"], hash=path["hash"], size=path["size"], mtime=path["mtime"] - ) - for path in manifest_data["paths"] - ], - total_size=manifest_data["totalSize"], - ) - - @classmethod - def get_default_hash_alg(cls) -> HashAlgorithm: # pragma: no cover - """Returns the default hashing algorithm for the Asset Manifest, represented as a string""" - return DEFAULT_HASH_ALG - - def encode(self) -> str: - """ - Return a canonicalized JSON string of the manifest - """ - self.paths.sort(key=canonical_path_comparator) - return manifest_to_canonical_json_string(manifest=self) - - -class ManifestModel(BaseManifestModel): - """ - The asset manifest model for v2023-03-03 - """ - - manifest_version: ManifestVersion = ManifestVersion.v2023_03_03 - AssetManifest: Type[AssetManifest] = AssetManifest - Path: Type[ManifestPath] = ManifestPath diff --git a/src/deadline/job_attachments/asset_manifests/v2023_03_03/validate.py b/src/deadline/job_attachments/asset_manifests/v2023_03_03/validate.py deleted file mode 100644 index fee0aa3c7..000000000 --- a/src/deadline/job_attachments/asset_manifests/v2023_03_03/validate.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
- -"""Contains functions validating Asset Manifests version 2023_03_03.""" - -from __future__ import annotations - -from typing import Any, Optional, Tuple - -_REQUIRED_FIELDS_2023_03_03: list[str] = [ - "hashAlg", - "paths", - "manifestVersion", - "totalSize", -] - -_HASH_ALGS_2023_03_03: set[str] = {"xxh128"} - -_PATH_REQUIRED_FIELDS_2023_03_03: list[str] = [ - "path", - "hash", - "size", - "mtime", -] - - -def _get_missing_fields(obj: dict[str, Any], required: list[str]) -> list[str]: - missing = [] - for field in required: - if field not in obj: - missing.append(field) - return missing - - -def _validate_path_2023_03_03(path_object: dict[str, Any]) -> Tuple[bool, Optional[str]]: - missing = _get_missing_fields(path_object, _PATH_REQUIRED_FIELDS_2023_03_03) - if len(missing) > 0: - return False, f"path is missing required field(s) {missing}" - - path = path_object["path"] - if not isinstance(path, str): - return False, "path must be a string" - - hash = path_object["hash"] - if not isinstance(hash, str): - return False, "hash must be a string" - - size = path_object["size"] - if not isinstance(size, int): - return False, "size must be an integer" - - mtime = path_object["mtime"] - if not isinstance(mtime, int): - return False, "mtime must be an integer" - - return True, None - - -def validate_manifest_2023_03_03(manifest: dict[str, Any]) -> Tuple[bool, Optional[str]]: - missing = _get_missing_fields(manifest, _REQUIRED_FIELDS_2023_03_03) - if len(missing) > 0: - return False, f"manifest is missing required field(s) {missing}" - - manifest_version = manifest["manifestVersion"] - if not isinstance(manifest_version, str) or manifest_version != "2023-03-03": - return False, 'manifestVersion must be "2023-03-03"' - - hash_alg = manifest["hashAlg"] - if not isinstance(hash_alg, str) or hash_alg not in _HASH_ALGS_2023_03_03: - return False, f"hashAlg must be one of {_HASH_ALGS_2023_03_03}" - - total_size = manifest["totalSize"] - if not isinstance(total_size, int): - return False, "totalSize must be an integer" - - paths = manifest["paths"] - if not isinstance(paths, list): - return False, "paths must be a list" - elif len(paths) < 1: - return False, "paths must have a least one item" - else: - for path_object in paths: - ok, message = _validate_path_2023_03_03(path_object) - if not ok: - return False, message - - return True, None diff --git a/src/deadline/job_attachments/asset_manifests/versions.py b/src/deadline/job_attachments/asset_manifests/versions.py deleted file mode 100644 index cab84b67d..000000000 --- a/src/deadline/job_attachments/asset_manifests/versions.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""Module that defines the asset manifest versions.""" - -from enum import Enum - - -class ManifestVersion(str, Enum): - """ - Enumerant of all Asset Manifest versions supported by this library. - - Special values: - UNDEFINED -- Purely for internal testing. - - Versions: - v2023_03_03 - First version. - """ - - UNDEFINED = "UNDEFINED" - v2023_03_03 = "2023-03-03" diff --git a/src/deadline/job_attachments/asset_sync.py b/src/deadline/job_attachments/asset_sync.py deleted file mode 100644 index fdeb269fc..000000000 --- a/src/deadline/job_attachments/asset_sync.py +++ /dev/null @@ -1,1091 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
- -"""Module for File Attachment synching""" - -from __future__ import annotations -from dataclasses import asdict -import os -import shutil -import sys -import time -import json -import warnings as _warnings -from io import BytesIO -from logging import Logger, LoggerAdapter, getLogger -from math import trunc -from pathlib import Path, PurePosixPath -from typing import Any, Callable, DefaultDict, Dict, List, Optional, Tuple, Type, Union - -import boto3 - -from .progress_tracker import ( - ProgressReportMetadata, - ProgressStatus, - ProgressTracker, - SummaryStatistics, -) - -from .asset_manifests import ( - BaseAssetManifest, - BaseManifestModel, - HashAlgorithm, - hash_data, - hash_file, - ManifestModelRegistry, - ManifestVersion, -) -from .asset_manifests import BaseManifestPath as RelativeFilePath -from ._aws.aws_clients import get_boto3_session -from ._aws.deadline import get_job, get_queue -from .download import ( - merge_asset_manifests, - download_files_from_manifests, - get_manifest_from_s3, - get_output_manifests_by_asset_root, - mount_vfs_from_manifests, -) - -from .exceptions import ( - AssetSyncError, - VFSExecutableMissingError, - JobAttachmentsS3ClientError, - VFSOSUserNotSetError, -) -from .vfs import VFSProcessManager -from .models import ( - Attachments, - FileConflictResolution as _FileConflictResolution, - JobAttachmentsFileSystem, - JobAttachmentS3Settings, - ManifestProperties, - OutputFile, - PathFormat, - PathMappingRule, -) -from .upload import S3AssetUploader -from .os_file_permission import ( - FileSystemPermissionSettings, - PosixFileSystemPermissionSettings, -) -from ._path_summarization import human_readable_file_size -from ._utils import ( - _float_to_iso_datetime_string, - _get_unique_dest_dir_name, - _join_s3_paths, -) - -logger = getLogger("deadline.job_attachments") - - -class AssetSync: - """Class for managing AWS Deadline Cloud job-level attachments.""" - - _ENDING_PROGRESS = 100.0 - - def __init__( - self, - farm_id: str, - boto3_session: Optional[boto3.Session] = None, - manifest_version: ManifestVersion = ManifestVersion.v2023_03_03, - deadline_endpoint_url: Optional[str] = None, - session_id: Optional[str] = None, - s3_max_pool_connections: int = 50, - small_file_threshold_multiplier: int = 20, - ) -> None: - self.farm_id = farm_id - - self.logger: Union[Logger, LoggerAdapter] = logger - if session_id: - self.logger = LoggerAdapter(logger, {"session_id": session_id}) - - self.session: boto3.Session - if boto3_session is None: - self.session = get_boto3_session() - else: - self.session = boto3_session - - self.deadline_endpoint_url = deadline_endpoint_url - self.s3_uploader: S3AssetUploader = S3AssetUploader( - session=boto3_session, - s3_max_pool_connections=s3_max_pool_connections, - small_file_threshold_multiplier=small_file_threshold_multiplier, - ) - self.manifest_model: Type[BaseManifestModel] = ManifestModelRegistry.get_manifest_model( - version=manifest_version - ) - - # A dictionary mapping absolute file paths to their last modification times in microseconds. - # This is used to determine if an asset has been modified since it was last synced. 
- self.synced_assets_mtime: dict[str, int] = dict() - - self.hash_alg: HashAlgorithm = self.manifest_model.AssetManifest.get_default_hash_alg() - - self._local_root_to_src_map: dict[str, str] = dict() - - @staticmethod - def generate_dynamic_path_mapping( - session_dir: Path, - attachments: Attachments, - ) -> dict[str, PathMappingRule]: - """ - Compute path mapping rules that are relative to the given session directory. - - Args: - session_dir: path to the current session directory - attachments: an object that holds all input assets for the job. - - Returns: a dictionary of local roots for each asset root, used for path mapping. - """ - mapped_path: dict[str, PathMappingRule] = dict() - - for manifest_properties in attachments.manifests: - if not manifest_properties.fileSystemLocationName: - dir_name: str = _get_unique_dest_dir_name(manifest_properties.rootPath) - local_root = str(session_dir.joinpath(dir_name)) - mapped_path[manifest_properties.rootPath] = PathMappingRule( - source_path_format=manifest_properties.rootPathFormat.value, - source_path=manifest_properties.rootPath, - destination_path=local_root, - ) - - return mapped_path - - @staticmethod - def get_local_destination( - manifest_properties: ManifestProperties, - dynamic_mapping_rules: dict[str, PathMappingRule] = {}, - storage_profiles_path_mapping_rules: dict[str, str] = {}, - ) -> str: - """ - Args: - manifest_properties: manifest properties to search local destination for. - dynamic_mapping_rules: manifest root path to worker host destination mapping relative to local session. - storage_profiles_path_mapping_rules: a dict of source path -> destination path mappings. - - Returns: local destination corresponding to the given manifest properties. - Raises: AssetSyncError If no path mapping rule is found for the given root path. - """ - root_path = manifest_properties.rootPath - - if manifest_properties.fileSystemLocationName: - local_destination = storage_profiles_path_mapping_rules.get(root_path) - else: - path_mapping: Optional[PathMappingRule] = dynamic_mapping_rules.get(root_path) - local_destination = path_mapping.destination_path if path_mapping else None - - if local_destination: - return local_destination - else: - raise AssetSyncError( - "Error occurred while attempting to sync input files: " - f"No path mapping rule found for the source path {manifest_properties.rootPath}" - ) - - def _aggregate_asset_root_manifests( - self, - session_dir: Path, - s3_settings: JobAttachmentS3Settings, - queue_id: str, - job_id: str, - attachments: Attachments, - step_dependencies: Optional[list[str]] = None, - dynamic_mapping_rules: dict[str, PathMappingRule] = {}, - storage_profiles_path_mapping_rules: dict[str, str] = {}, - ) -> dict[str, BaseAssetManifest]: - """ - Args: - session_dir: the directory that the session is going to use. - s3_settings: S3-specific Job Attachment settings. - queue_id: the ID of the queue for step-step dependency. - job_id: the ID of the job for step-step dependency. - attachments: an object that holds all input assets for the job. - step_dependencies: the list of Step IDs whose output should be downloaded over the input job attachments. - dynamic_mapping_rules: manifest root path to worker host destination mapping relative to local session. - storage_profiles_path_mapping_rules: manifest root path to worker host destination mapping given storage profile. - Returns: a dictionary of manifest file stored in the session directory. 
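# ---------------------------------------------------------------------------
# Editorial illustration (not part of the removed file). Sketch of how the
# static path-mapping helpers above turn a submitted root path into a local
# destination under the session directory. The Attachments/ManifestProperties
# construction below is an assumption about the removed models module, and all
# values are made up.
from pathlib import Path

from deadline.job_attachments.asset_sync import AssetSync
from deadline.job_attachments.models import Attachments, ManifestProperties, PathFormat

attachments = Attachments(
    manifests=[
        ManifestProperties(
            rootPath="/projects/showA",
            rootPathFormat=PathFormat.POSIX,
            outputRelativeDirectories=["renders"],
        )
    ]
)
rules = AssetSync.generate_dynamic_path_mapping(
    session_dir=Path("/sessions/session-1234"), attachments=attachments
)
for source_root, rule in rules.items():
    # Each submitted root maps to a unique sub-directory of the session dir.
    print(source_root, "->", rule.destination_path)
# ---------------------------------------------------------------------------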
- """ - grouped_manifests_by_root: DefaultDict[str, list[BaseAssetManifest]] = DefaultDict(list) - - for manifest_properties in attachments.manifests: - local_root: str = AssetSync.get_local_destination( - manifest_properties=manifest_properties, - dynamic_mapping_rules=dynamic_mapping_rules, - storage_profiles_path_mapping_rules=storage_profiles_path_mapping_rules, - ) - - if manifest_properties.inputManifestPath: - manifest_s3_key = s3_settings.add_root_and_manifest_folder_prefix( - manifest_properties.inputManifestPath - ) - # s3 call to get manifests - manifest = get_manifest_from_s3( - manifest_key=manifest_s3_key, - s3_bucket=s3_settings.s3BucketName, - session=self.session, - ) - self._local_root_to_src_map[local_root] = manifest_properties.rootPath - grouped_manifests_by_root[local_root].append(manifest) - - # Handle step-step dependencies. - if step_dependencies: - for step_id in step_dependencies: - manifests_by_root = get_output_manifests_by_asset_root( - s3_settings, - self.farm_id, - queue_id, - job_id, - step_id=step_id, - session=self.session, - ) - for root, manifests in manifests_by_root.items(): - # this implicitly put the step dependency files to the same asset root (if no storage profile), - # since the job is submitted from the same root - dir_name = _get_unique_dest_dir_name(root) - local_root = str(session_dir.joinpath(dir_name)) - - self._local_root_to_src_map[local_root] = root - grouped_manifests_by_root[local_root].extend(manifests) - - # Merge the manifests in each root into a single manifest - merged_manifests_by_root: dict[str, BaseAssetManifest] = dict() - for root, manifests in grouped_manifests_by_root.items(): - merged_manifest = merge_asset_manifests(manifests) - - if merged_manifest: - merged_manifests_by_root[root] = merged_manifest - - return merged_manifests_by_root - - def _launch_vfs( - self, - s3_settings: JobAttachmentS3Settings, - session_dir: Path, - fs_permission_settings: Optional[FileSystemPermissionSettings] = None, - merged_manifests_by_root: dict[str, BaseAssetManifest] = dict(), - os_env_vars: dict[str, str] | None = None, - on_mount_complete: Optional[Callable[[bool], None]] = None, - ) -> bool: - """ - Args: - s3_settings: S3-specific Job Attachment settings. - session_dir: the directory that the session is going to use. - fs_permission_settings: An instance defining group ownership and permission modes - to be set on the downloaded (synchronized) input files and directories. - merged_manifests_by_root: Merged manifests produced by - _aggregate_asset_root_manifests() - Returns: bool indicating if VFS was able to be launched. True if VFS launched successfully. - """ - - try: - VFSProcessManager.find_vfs() - mount_vfs_from_manifests( - s3_bucket=s3_settings.s3BucketName, - manifests_by_root=merged_manifests_by_root, - boto3_session=self.session, - session_dir=session_dir, - fs_permission_settings=fs_permission_settings, # type: ignore[arg-type] - os_env_vars=os_env_vars, # type: ignore[arg-type] - cas_prefix=s3_settings.full_cas_prefix(), - on_mount_complete=on_mount_complete, - ) - return True - except VFSExecutableMissingError: - logger.error( - f"Virtual File System not found, falling back to {JobAttachmentsFileSystem.COPIED} for JobAttachmentsFileSystem." 
- ) - return False - - def copied_download( - self, - s3_settings: JobAttachmentS3Settings, - session_dir: Path, - fs_permission_settings: Optional[FileSystemPermissionSettings] = None, - merged_manifests_by_root: dict[str, BaseAssetManifest] = dict(), - on_downloading_files: Optional[Callable[[ProgressReportMetadata], bool]] = None, - conflict_resolution: _FileConflictResolution = _FileConflictResolution.CREATE_COPY, - ) -> SummaryStatistics: - """ - Args: - s3_settings: S3-specific Job Attachment settings. - session_dir: the directory that the session is going to use. - fs_permission_settings: An instance defining group ownership and permission modes - to be set on the downloaded (synchronized) input files and directories. - merged_manifests_by_root: Merged manifests produced by _aggregate_asset_root_manifests() - on_downloading_files: Callback when download files from S3. - - Returns: - The download summary statistics. - - Raises: - JobAttachmentsS3ClientError: If any issue is encountered while downloading. - """ - try: - total_input_size: int = 0 - for merged_manifest in merged_manifests_by_root.values(): - total_input_size += merged_manifest.totalSize # type: ignore[attr-defined] - self._ensure_disk_capacity(Path(session_dir), total_input_size) - - return download_files_from_manifests( - s3_bucket=s3_settings.s3BucketName, - manifests_by_root=merged_manifests_by_root, - cas_prefix=s3_settings.full_cas_prefix(), - fs_permission_settings=fs_permission_settings, - session=self.session, - on_downloading_files=on_downloading_files, - logger=self.logger, - conflict_resolution=conflict_resolution, - ).convert_to_summary_statistics() - except JobAttachmentsS3ClientError as exc: - if exc.status_code == 404: - raise JobAttachmentsS3ClientError( - action=exc.action, - status_code=exc.status_code, - bucket_name=exc.bucket_name, - key_or_prefix=exc.key_or_prefix, - message=( - "This can happen if the S3 check cache on the submitting machine is out of date. " - "Please delete the cache file from the submitting machine, usually located in the " - "home directory (~/.deadline/cache/s3_check_cache.db) and try submitting again." - ), - ) from exc - else: - raise - - def _check_and_write_local_manifests( - self, - merged_manifests_by_root: dict[str, BaseAssetManifest], - manifest_write_dir: str, - manifest_name_suffix: str = "manifest", - ) -> dict[str, str]: - """Write manifests to the directory and check disk capacity is sufficient for the assets. - - Args: - merged_manifests_by_root (dict[str, BaseAssetManifest]): manifest file to its stored root. - manifest_write_dir (str): local directory to write to. - - Returns: - dict[str, str]: map of local root to file paths the manifests are written to. 
- """ - - total_input_size: int = 0 - manifest_paths_by_root: dict[str, str] = dict() - - for root, manifest in merged_manifests_by_root.items(): - _, manifest_name = S3AssetUploader._get_hashed_file_name_from_root_str( - manifest=manifest, - source_root=self._local_root_to_src_map[root], - manifest_name_suffix=manifest_name_suffix, - ) - - local_manifest_file = S3AssetUploader._write_local_input_manifest( - manifest_write_dir=manifest_write_dir, - manifest_name=manifest_name, - manifest=manifest, - ) - - total_input_size += manifest.totalSize # type: ignore[attr-defined] - manifest_paths_by_root[root] = local_manifest_file.as_posix() - - self._ensure_disk_capacity(Path(manifest_write_dir), total_input_size) - return manifest_paths_by_root - - def attachment_sync_inputs( - self, - s3_settings: Optional[JobAttachmentS3Settings], - attachments: Optional[Attachments], - queue_id: str, - job_id: str, - session_dir: Path, - fs_permission_settings: Optional[FileSystemPermissionSettings] = None, - storage_profiles_path_mapping_rules: dict[str, str] = {}, - step_dependencies: Optional[list[str]] = None, - on_downloading_files: Optional[Callable[[ProgressReportMetadata], bool]] = None, - os_env_vars: Dict[str, str] | None = None, - on_vfs_mount_complete: Optional[Callable[[bool], None]] = None, - ) -> Tuple[SummaryStatistics, List[Dict[str, str]]]: - """ - Depending on the fileSystem in the Attachments this will perform two - different behaviors: - COPIED / None : downloads a manifest file and corresponding input files, if found. - VIRTUAL: downloads a manifest file and mounts a Virtual File System at the - specified asset root corresponding to the manifest contents - - .. deprecated:: - attachment_sync_inputs is deprecated and will be removed in a future version. - Use other public APIs under job attachments instead. - - Args: - s3_settings: S3-specific Job Attachment settings. - attachments: an object that holds all input assets for the job. - queue_id: the ID of the queue. - job_id: the ID of the job. - session_dir: the directory that the session is going to use. - fs_permission_settings: An instance defining group ownership and permission modes - to be set on the downloaded (synchronized) input files and directories. - storage_profiles_path_mapping_rules: A dict of source path -> destination path mappings. - If this dict is not empty, it means that the Storage Profile set in the job is - different from the one configured in the Fleet performing the input-syncing. - step_dependencies: the list of Step IDs whose output should be downloaded over the input - job attachments. - on_downloading_files: a function that will be called with a ProgressReportMetadata object - for each file being downloaded. If the function returns False, the download will be - cancelled. If it returns True, the download will continue. - os_env_vars: environment variables to set for launched subprocesses - on_vfs_mount_complete: optional callback invoked with a bool indicating whether - each VFS mount succeeded. Callers can use this for telemetry or logging. - - Returns: - COPIED / None : a tuple of (1) final summary statistics for file downloads, - and (2) a list of local roots for each asset root, used for - path mapping. - VIRTUAL: same as COPIED, but the summary statistics will be empty since the - download hasn't started yet. - """ - _warnings.warn( - "attachment_sync_inputs is deprecated and will be removed in a future version. 
" - "Use other public APIs under job attachments instead.", - DeprecationWarning, - stacklevel=2, - ) - - if not s3_settings: - self.logger.info( - f"No Job Attachment settings configured for Queue {queue_id}, no inputs to sync." - ) - return (SummaryStatistics(), []) - if not attachments: - self.logger.info(f"No attachments configured for Job {job_id}, no inputs to sync.") - return (SummaryStatistics(), []) - - # Generate absolute Path Mapping to local session (no storage profile) - # returns root path to PathMappingRule mapping - dynamic_mapping_rules: dict[str, PathMappingRule] = AssetSync.generate_dynamic_path_mapping( - session_dir=session_dir, - attachments=attachments, - ) - - # Aggregate and merge manifests (with step step dependency handling) in each root into a single manifest - merged_manifests_by_root: dict[str, BaseAssetManifest] = ( - self._aggregate_asset_root_manifests( - session_dir=session_dir, - s3_settings=s3_settings, - queue_id=queue_id, - job_id=job_id, - attachments=attachments, - step_dependencies=step_dependencies, - dynamic_mapping_rules=dynamic_mapping_rules, - storage_profiles_path_mapping_rules=storage_profiles_path_mapping_rules, - ) - ) - - # Download - summary_statistics: SummaryStatistics = SummaryStatistics() - if ( - attachments.fileSystem == JobAttachmentsFileSystem.VIRTUAL.value - and sys.platform != "win32" - and fs_permission_settings is not None - and os_env_vars is not None - and "AWS_PROFILE" in os_env_vars - and isinstance(fs_permission_settings, PosixFileSystemPermissionSettings) - ): - # Virtual Download Flow - self._launch_vfs( - s3_settings=s3_settings, - session_dir=session_dir, - fs_permission_settings=fs_permission_settings, - merged_manifests_by_root=merged_manifests_by_root, - os_env_vars=os_env_vars, - on_mount_complete=on_vfs_mount_complete, - ) - else: - # Copied Download flow — always use OVERWRITE since the worker downloads - # into a fresh session directory. CREATE_COPY would create duplicate files - # with "(1)" suffixes when step dependency outputs overlap with inputs. - summary_statistics = self.copied_download( - s3_settings=s3_settings, - session_dir=session_dir, - fs_permission_settings=fs_permission_settings, - merged_manifests_by_root=merged_manifests_by_root, - on_downloading_files=on_downloading_files, - conflict_resolution=_FileConflictResolution.OVERWRITE, - ) - - self._record_attachment_mtimes(merged_manifests_by_root) - return ( - summary_statistics, - list(asdict(r) for r in dynamic_mapping_rules.values()), - ) - - def _upload_output_files_to_s3( - self, - s3_settings: JobAttachmentS3Settings, - output_files: List[OutputFile], - on_uploading_files: Optional[Callable[[ProgressReportMetadata], bool]], - ) -> SummaryStatistics: - """ - Uploads the given output files to the given S3 bucket. - Sets up `progress_tracker` to report upload progress back to the caller (i.e. worker.) - """ - # Sets up progress tracker to report upload progress back to the caller. 
- total_file_size = sum([file.file_size for file in output_files]) - progress_tracker = ProgressTracker( - status=ProgressStatus.UPLOAD_IN_PROGRESS, - total_files=len(output_files), - total_bytes=total_file_size, - on_progress_callback=on_uploading_files, - logger=self.logger, - ) - - start_time = time.perf_counter() - - for file in output_files: - if file.in_s3: - progress_tracker.increase_skipped(1, file.file_size) - continue - - self.s3_uploader.upload_file_to_s3( - local_path=Path(file.full_path), - s3_bucket=s3_settings.s3BucketName, - s3_upload_key=file.s3_key, - progress_tracker=progress_tracker, - base_dir_path=Path(file.base_dir) if file.base_dir else None, - ) - - progress_tracker.total_time = time.perf_counter() - start_time - return progress_tracker.get_summary_statistics() - - def _upload_output_manifest_to_s3( - self, - s3_settings: JobAttachmentS3Settings, - output_manifest: BaseAssetManifest, - full_output_prefix: str, - root_path: str, - file_system_location_name: Optional[str] = None, - ) -> None: - """Uploads the given output manifest to the given S3 bucket.""" - hash_alg = output_manifest.get_default_hash_alg() - manifest_bytes = output_manifest.encode().encode("utf-8") - manifest_name_prefix = hash_data( - f"{file_system_location_name or ''}{root_path}".encode(), hash_alg - ) - manifest_path = _join_s3_paths( - full_output_prefix, - f"{manifest_name_prefix}_output", - ) - metadata = {"Metadata": {"asset-root": json.dumps(root_path, ensure_ascii=True)}} - # S3 metadata must be ASCII, so use either 'asset-root' or 'asset-root-json' depending - # on whether the value is ASCII. - try: - # Add the 'asset-root' metadata if the path is ASCII - root_path.encode(encoding="ascii") - metadata["Metadata"]["asset-root"] = root_path - except UnicodeEncodeError: - # Add the 'asset-root-json' metadata encoded to ASCII as a JSON string - metadata["Metadata"]["asset-root-json"] = json.dumps(root_path, ensure_ascii=True) - if file_system_location_name: - metadata["Metadata"]["file-system-location-name"] = file_system_location_name - - self.logger.info(f"Uploading output manifest to {manifest_path}") - - self.s3_uploader.upload_bytes_to_s3( - BytesIO(manifest_bytes), - s3_settings.s3BucketName, - manifest_path, - extra_args=metadata, - ) - - def _generate_output_manifest(self, outputs: List[OutputFile]) -> BaseAssetManifest: - paths: list[RelativeFilePath] = [] - for output in outputs: - path_args: dict[str, Any] = { - "hash": output.file_hash, - "path": output.rel_path, - } - path_args["size"] = output.file_size - # stat().st_mtime_ns returns an int that represents the time in nanoseconds since the epoch. - # The asset manifest spec requires the mtime to be represented as an integer in microseconds. - path_args["mtime"] = trunc(Path(output.full_path).stat().st_mtime_ns // 1000) - paths.append(self.manifest_model.Path(**path_args)) - - asset_manifest_args: dict[str, Any] = { - "paths": paths, - "hash_alg": self.hash_alg, - } - asset_manifest_args["total_size"] = sum([output.file_size for output in outputs]) - - return self.manifest_model.AssetManifest(**asset_manifest_args) # type: ignore[call-arg] - - def _get_output_files( - self, - manifest_properties: ManifestProperties, - s3_settings: JobAttachmentS3Settings, - local_root: Path, - session_dir: Path, - ) -> List[OutputFile]: - """ - Walks the output directories for this asset root for any output files that have been created or modified - since the start time provided. Hashes and checks if the output files already exist in the CAS. 
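# ---------------------------------------------------------------------------
# Editorial illustration (not part of the removed file). The output-manifest
# upload above stores the asset root in S3 object metadata, which must be
# ASCII; non-ASCII roots fall back to a JSON-encoded 'asset-root-json' key.
# A self-contained sketch of that fallback:
import json

def build_asset_root_metadata(root_path: str) -> dict:
    metadata: dict = {"Metadata": {}}
    try:
        root_path.encode(encoding="ascii")
        metadata["Metadata"]["asset-root"] = root_path
    except UnicodeEncodeError:
        metadata["Metadata"]["asset-root-json"] = json.dumps(root_path, ensure_ascii=True)
    return metadata

print(build_asset_root_metadata("/projects/showA"))   # keeps 'asset-root'
print(build_asset_root_metadata("/projets/épisode"))  # falls back to 'asset-root-json'
# ---------------------------------------------------------------------------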
- """ - output_files: List[OutputFile] = [] - - source_path_format = manifest_properties.rootPathFormat - current_path_format = PathFormat.get_host_path_format() - - for output_dir in manifest_properties.outputRelativeDirectories or []: - if source_path_format != current_path_format: - if source_path_format == PathFormat.WINDOWS: - output_dir = output_dir.replace("\\", "/") - elif source_path_format == PathFormat.POSIX: - output_dir = output_dir.replace("/", "\\") - output_root: Path = local_root / output_dir - - total_file_count = 0 - total_file_size = 0 - - # Don't fail if output dir hasn't been created yet; another task might be working on it - if not output_root.is_dir(): - self.logger.info(f"Found 0 files (Output directory {output_root} does not exist.)") - continue - - # Get all files in this directory (includes sub-directories) - for file_path in output_root.glob("**/*"): - # Files that are new or have been modified since the last sync will be added to the output list. - mtime_when_synced = self.synced_assets_mtime.get(str(file_path), None) - file_mtime = file_path.stat().st_mtime_ns - is_modified = False - if mtime_when_synced: - if file_mtime > int(mtime_when_synced): - # This file has been modified during this session action. - is_modified = True - else: - # This is a new file created during this session action. - self.synced_assets_mtime[str(file_path)] = int(file_mtime) - is_modified = True - - # Resolve the real path to prevent time-of-check/time-of-use vulnerability - file_real_path = file_path.resolve() - - # validate that the file resolves inside of the session working directory. - is_file_path_under_session_dir = self._is_file_within_directory( - file_real_path, session_dir - ) - if is_file_path_under_session_dir is False: - self.logger.info( - f"Skipping file '{file_path}' as its resolved path '{file_real_path}' is" - f" outside the session directory '{session_dir}'" - ) - continue - - if ( - not file_real_path.is_dir() - and file_real_path.exists() - and is_modified - and is_file_path_under_session_dir - ): - file_size = file_real_path.resolve().lstat().st_size - file_hash = hash_file(str(file_real_path), self.hash_alg) - s3_key = f"{file_hash}.{self.hash_alg.value}" - - if s3_settings.full_cas_prefix(): - s3_key = _join_s3_paths(s3_settings.full_cas_prefix(), s3_key) - in_s3 = self.s3_uploader.file_already_uploaded(s3_settings.s3BucketName, s3_key) - - total_file_count += 1 - total_file_size += file_size - - output_files.append( - OutputFile( - file_size=file_size, - file_hash=file_hash, - rel_path=str(PurePosixPath(*file_path.relative_to(local_root).parts)), - full_path=str(file_real_path), - s3_key=s3_key, - in_s3=in_s3, - base_dir=str(session_dir), - ) - ) - - self.logger.info( - f"Found {total_file_count} file{'' if total_file_count == 1 else 's'}" - f" totaling {human_readable_file_size(total_file_size)}" - f" in output directory: {str(output_root)}" - ) - - return output_files - - def _is_file_within_directory(self, file_path: Path, directory_path: Path) -> bool: - """ - Checks if the given file path is within the given directory path. - """ - real_file_path = file_path.resolve() - real_directory_path = directory_path.resolve() - common_path = os.path.commonpath([real_file_path, real_directory_path]) - return common_path.startswith(str(real_directory_path)) - - def get_s3_settings(self, farm_id: str, queue_id: str) -> Optional[JobAttachmentS3Settings]: - """ - Gets Job Attachment S3 settings by calling the Deadline GetQueue API. 
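# ---------------------------------------------------------------------------
# Editorial illustration (not part of the removed file). The output scan above
# resolves every candidate path and keeps it only when it is still inside the
# session directory, guarding against symlinks that escape the session. A
# self-contained sketch of the same containment check:
import os
from pathlib import Path

def is_within(file_path: Path, directory_path: Path) -> bool:
    real_file = file_path.resolve()
    real_dir = directory_path.resolve()
    return os.path.commonpath([real_file, real_dir]).startswith(str(real_dir))

print(is_within(Path("/tmp/session/outputs/frame.exr"), Path("/tmp/session")))  # True on POSIX
print(is_within(Path("/etc/passwd"), Path("/tmp/session")))                     # False on POSIX
# ---------------------------------------------------------------------------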
- """ - queue = get_queue( - farm_id=farm_id, - queue_id=queue_id, - session=self.session, - deadline_endpoint_url=self.deadline_endpoint_url, - ) - return queue.jobAttachmentSettings if queue and queue.jobAttachmentSettings else None - - def get_attachments(self, farm_id: str, queue_id: str, job_id: str) -> Optional[Attachments]: - """ - Gets Job Attachment settings by calling the Deadline GetJob API. - """ - job = get_job( - farm_id=farm_id, - queue_id=queue_id, - job_id=job_id, - session=self.session, - deadline_endpoint_url=self.deadline_endpoint_url, - ) - return job.attachments if job and job.attachments else None - - def _record_attachment_mtimes( - self, merged_manifests_by_root: dict[str, BaseAssetManifest] - ) -> None: - # Record the mapping of downloaded files' absolute paths to their last modification time - # (in microseconds). This is used to later determine which files have been modified or - # newly created during the session and need to be uploaded as output. - for local_root, merged_manifest in merged_manifests_by_root.items(): - for manifest_path in merged_manifest.paths: - abs_path = str(Path(local_root) / manifest_path.path) - self.synced_assets_mtime[abs_path] = Path(abs_path).stat().st_mtime_ns - - def _ensure_disk_capacity(self, session_dir: Path, total_input_bytes: int) -> None: - """ - Raises an AssetSyncError if the given input bytes is larger than the available disk space. - """ - disk_free: int = shutil.disk_usage(session_dir).free - if total_input_bytes > disk_free: - input_size_readable = human_readable_file_size(total_input_bytes) - disk_free_readable = human_readable_file_size(disk_free) - raise AssetSyncError( - "Error occurred while attempting to sync input files: " - f"Total file size required for download ({input_size_readable}) is larger than available disk space ({disk_free_readable})" - ) - - def sync_inputs( - self, - s3_settings: Optional[JobAttachmentS3Settings], - attachments: Optional[Attachments], - queue_id: str, - job_id: str, - session_dir: Path, - fs_permission_settings: Optional[FileSystemPermissionSettings] = None, - storage_profiles_path_mapping_rules: dict[str, str] = {}, - step_dependencies: Optional[list[str]] = None, - on_downloading_files: Optional[Callable[[ProgressReportMetadata], bool]] = None, - os_env_vars: Dict[str, str] | None = None, - on_vfs_mount_complete: Optional[Callable[[bool], None]] = None, - ) -> Tuple[SummaryStatistics, List[Dict[str, str]]]: - """ - Depending on the fileSystem in the Attachments this will perform two - different behaviors: - COPIED / None : downloads a manifest file and corresponding input files, if found. - VIRTUAL: downloads a manifest file and mounts a Virtual File System at the - specified asset root corresponding to the manifest contents - - Args: - s3_settings: S3-specific Job Attachment settings. - attachments: an object that holds all input assets for the job. - queue_id: the ID of the queue. - job_id: the ID of the job. - session_dir: the directory that the session is going to use. - fs_permission_settings: An instance defining group ownership and permission modes - to be set on the downloaded (synchronized) input files and directories. - storage_profiles_path_mapping_rules: A dict of source path -> destination path mappings. - If this dict is not empty, it means that the Storage Profile set in the job is - different from the one configured in the Fleet performing the input-syncing. 
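# ---------------------------------------------------------------------------
# Editorial illustration (not part of the removed file). Before a copied
# download begins, the helper above compares the merged manifests' total size
# with the free space reported for the session directory. A self-contained
# sketch of that check:
import shutil
from pathlib import Path

def ensure_capacity(session_dir: Path, required_bytes: int) -> None:
    free_bytes = shutil.disk_usage(session_dir).free
    if required_bytes > free_bytes:
        raise RuntimeError(
            f"Download needs {required_bytes} bytes but only {free_bytes} bytes are free under {session_dir}"
        )

ensure_capacity(Path("."), 1024)  # raises only if less than 1 KiB is free here
# ---------------------------------------------------------------------------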
- step_dependencies: the list of Step IDs whose output should be downloaded over the input - job attachments. - on_downloading_files: a function that will be called with a ProgressReportMetadata object - for each file being downloaded. If the function returns False, the download will be - cancelled. If it returns True, the download will continue. - os_env_vars: environment variables to set for launched subprocesses - on_vfs_mount_complete: optional callback invoked with a bool indicating whether - each VFS mount succeeded. Callers can use this for telemetry or logging. - - Returns: - COPIED / None : a tuple of (1) final summary statistics for file downloads, - and (2) a list of local roots for each asset root, used for - path mapping. - VIRTUAL: same as COPIED, but the summary statistics will be empty since the - download hasn't started yet. - - .. deprecated:: - sync_inputs is deprecated and will be removed in a future version. - Use other public APIs under job attachments instead. - """ - _warnings.warn( - "sync_inputs is deprecated and will be removed in a future version. " - "Use other public APIs under job attachments instead.", - DeprecationWarning, - stacklevel=2, - ) - - if not s3_settings: - self.logger.info( - f"No Job Attachment settings configured for Queue {queue_id}, no inputs to sync." - ) - return (SummaryStatistics(), []) - if not attachments: - self.logger.info(f"No attachments configured for Job {job_id}, no inputs to sync.") - return (SummaryStatistics(), []) - - grouped_manifests_by_root: DefaultDict[str, list[BaseAssetManifest]] = DefaultDict(list) - pathmapping_rules: Dict[str, Dict[str, str]] = {} - - storage_profiles_source_paths = list(storage_profiles_path_mapping_rules.keys()) - - for manifest_properties in attachments.manifests: - local_root: str = "" - if ( - len(storage_profiles_path_mapping_rules) > 0 - and manifest_properties.fileSystemLocationName - ): - if manifest_properties.rootPath in storage_profiles_source_paths: - local_root = storage_profiles_path_mapping_rules[manifest_properties.rootPath] - else: - raise AssetSyncError( - "Error occurred while attempting to sync input files: " - f"No path mapping rule found for the source path {manifest_properties.rootPath}" - ) - else: - dir_name: str = _get_unique_dest_dir_name(manifest_properties.rootPath) - local_root = str(session_dir.joinpath(dir_name)) - pathmapping_rules[dir_name] = { - "source_path_format": manifest_properties.rootPathFormat.value, - "source_path": manifest_properties.rootPath, - "destination_path": local_root, - } - - if manifest_properties.inputManifestPath: - manifest_s3_key = s3_settings.add_root_and_manifest_folder_prefix( - manifest_properties.inputManifestPath - ) - manifest = get_manifest_from_s3( - manifest_key=manifest_s3_key, - s3_bucket=s3_settings.s3BucketName, - session=self.session, - ) - grouped_manifests_by_root[local_root].append(manifest) - - # Handle step-step dependencies. 
- if step_dependencies: - for step_id in step_dependencies: - manifests_by_root = get_output_manifests_by_asset_root( - s3_settings, - self.farm_id, - queue_id, - job_id, - step_id=step_id, - session=self.session, - ) - for root, manifests in manifests_by_root.items(): - dir_name = _get_unique_dest_dir_name(root) - local_root = str(session_dir.joinpath(dir_name)) - grouped_manifests_by_root[local_root].extend(manifests) - - # Merge the manifests in each root into a single manifest - merged_manifests_by_root: dict[str, BaseAssetManifest] = dict() - total_input_size: int = 0 - for root, manifests in grouped_manifests_by_root.items(): - merged_manifest = merge_asset_manifests(manifests) - - if merged_manifest: - merged_manifests_by_root[root] = merged_manifest - total_input_size += merged_manifest.totalSize # type: ignore[attr-defined] - - # Download - # Virtual Download Flow - if ( - attachments.fileSystem == JobAttachmentsFileSystem.VIRTUAL.value - and sys.platform != "win32" - and fs_permission_settings is not None - and os_env_vars is not None - and "AWS_PROFILE" in os_env_vars - and isinstance(fs_permission_settings, PosixFileSystemPermissionSettings) - ): - try: - VFSProcessManager.find_vfs() - mount_vfs_from_manifests( - s3_bucket=s3_settings.s3BucketName, - manifests_by_root=merged_manifests_by_root, - boto3_session=self.session, - session_dir=session_dir, - fs_permission_settings=fs_permission_settings, # type: ignore[arg-type] - os_env_vars=os_env_vars, # type: ignore[arg-type] - cas_prefix=s3_settings.full_cas_prefix(), - on_mount_complete=on_vfs_mount_complete, - ) - summary_statistics = SummaryStatistics() - self._record_attachment_mtimes(merged_manifests_by_root) - return (summary_statistics, list(pathmapping_rules.values())) - except VFSExecutableMissingError: - logger.error( - f"Virtual File System not found, falling back to {JobAttachmentsFileSystem.COPIED} for JobAttachmentsFileSystem." - ) - - # Copied Download flow - self._ensure_disk_capacity(session_dir, total_input_size) - try: - download_summary_statistics = download_files_from_manifests( - s3_bucket=s3_settings.s3BucketName, - manifests_by_root=merged_manifests_by_root, - cas_prefix=s3_settings.full_cas_prefix(), - fs_permission_settings=fs_permission_settings, - session=self.session, - on_downloading_files=on_downloading_files, - logger=self.logger, - ) - except JobAttachmentsS3ClientError as exc: - if exc.status_code == 404: - raise JobAttachmentsS3ClientError( - action=exc.action, - status_code=exc.status_code, - bucket_name=exc.bucket_name, - key_or_prefix=exc.key_or_prefix, - message=( - "This can happen if the S3 check cache on the submitting machine is out of date. " - "Please delete the cache file from the submitting machine, usually located in the " - "home directory (~/.deadline/cache/s3_check_cache.db) and try submitting again." 
- ), - ) from exc - else: - raise - - self._record_attachment_mtimes(merged_manifests_by_root) - - return ( - download_summary_statistics.convert_to_summary_statistics(), - list(pathmapping_rules.values()), - ) - - def sync_outputs( - self, - s3_settings: Optional[JobAttachmentS3Settings], - attachments: Optional[Attachments], - queue_id: str, - job_id: str, - step_id: str, - task_id: str, - session_action_id: str, - start_time: float, - session_dir: Path, - storage_profiles_path_mapping_rules: dict[str, str] = {}, - on_uploading_files: Optional[Callable[[ProgressReportMetadata], bool]] = None, - ) -> SummaryStatistics: - """ - Uploads any output files specified in the manifest, if found. - - .. deprecated:: - sync_outputs is deprecated and will be removed in a future version. - Use other public APIs under job attachments instead. - """ - _warnings.warn( - "sync_outputs is deprecated and will be removed in a future version. " - "Use other public APIs under job attachments instead.", - DeprecationWarning, - stacklevel=2, - ) - if not s3_settings: - self.logger.info( - f"No Job Attachment settings configured for Queue {queue_id}, no outputs to sync." - ) - return SummaryStatistics() - if not attachments: - self.logger.info(f"No attachments configured for Job {job_id}, no outputs to sync.") - return SummaryStatistics() - - all_output_files: List[OutputFile] = [] - - storage_profiles_source_paths = list(storage_profiles_path_mapping_rules.keys()) - - for manifest_properties in attachments.manifests: - session_root = session_dir - local_root: Path = Path() - if ( - len(storage_profiles_path_mapping_rules) > 0 - and manifest_properties.fileSystemLocationName - ): - if manifest_properties.rootPath in storage_profiles_source_paths: - local_root = Path( - storage_profiles_path_mapping_rules[manifest_properties.rootPath] - ) - # We use session_root to filter out any files resolved to a location outside - # of that directory. If storage profile's path mapping rules are available, - # we can consider the session_root to be the mapped-storage profile path. 
- session_root = local_root - else: - raise AssetSyncError( - "Error occurred while attempting to sync output files: " - f"No path mapping rule found for the source path {manifest_properties.rootPath}" - ) - else: - dir_name: str = _get_unique_dest_dir_name(manifest_properties.rootPath) - local_root = session_dir.joinpath(dir_name) - - output_files: List[OutputFile] = self._get_output_files( - manifest_properties, - s3_settings, - local_root, - session_root, - ) - if output_files: - output_manifest = self._generate_output_manifest(output_files) - session_action_id_with_time_stamp = ( - f"{_float_to_iso_datetime_string(start_time)}_{session_action_id}" - ) - full_output_prefix = s3_settings.full_output_prefix( - farm_id=self.farm_id, - queue_id=queue_id, - job_id=job_id, - step_id=step_id, - task_id=task_id, - session_action_id=session_action_id_with_time_stamp, - ) - self._upload_output_manifest_to_s3( - s3_settings=s3_settings, - output_manifest=output_manifest, - full_output_prefix=full_output_prefix, - root_path=manifest_properties.rootPath, - file_system_location_name=manifest_properties.fileSystemLocationName, - ) - all_output_files.extend(output_files) - - if all_output_files: - num_output_files = len(all_output_files) - self.logger.info( - f"Uploading {num_output_files} output file{'' if num_output_files == 1 else 's'}" - f" to S3: {s3_settings.s3BucketName}/{s3_settings.full_cas_prefix()}" - ) - summary_stats: SummaryStatistics = self._upload_output_files_to_s3( - s3_settings, all_output_files, on_uploading_files - ) - else: - summary_stats = SummaryStatistics() - return summary_stats - - def cleanup_session( - self, - session_dir: Path, - file_system: JobAttachmentsFileSystem, - os_user: Optional[str] = None, - ): - if file_system == JobAttachmentsFileSystem.COPIED.value: - return - if not os_user: - raise VFSOSUserNotSetError("No os user set - can't clean up vfs session") - try: - VFSProcessManager.find_vfs() - # Shutdown all running Deadline VFS processes since session is complete - VFSProcessManager.kill_all_processes(session_dir=session_dir, os_user=os_user) - except VFSExecutableMissingError: - logger.error("Virtual File System not found, no processes to kill.") diff --git a/src/deadline/job_attachments/caches/__init__.py b/src/deadline/job_attachments/caches/__init__.py deleted file mode 100644 index d2046340d..000000000 --- a/src/deadline/job_attachments/caches/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -from .cache_db import CacheDB, CONFIG_ROOT, COMPONENT_NAME -from .hash_cache import HashCache, HashCacheEntry, WHOLE_FILE_RANGE_END -from .s3_check_cache import S3CheckCache, S3CheckCacheEntry - -__all__ = [ - "CacheDB", - "CONFIG_ROOT", - "COMPONENT_NAME", - "HashCache", - "HashCacheEntry", - "WHOLE_FILE_RANGE_END", - "S3CheckCache", - "S3CheckCacheEntry", -] diff --git a/src/deadline/job_attachments/caches/cache_db.py b/src/deadline/job_attachments/caches/cache_db.py deleted file mode 100644 index b46b6013d..000000000 --- a/src/deadline/job_attachments/caches/cache_db.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -""" -Module for defining a local cache file. 
-""" - -import logging -import os -import threading as _threading -from abc import ABC -from threading import Lock -from typing import Optional - -from ..exceptions import JobAttachmentsError -from .._utils import _retry - -CONFIG_ROOT = ".deadline" -COMPONENT_NAME = "job_attachments" - -logger = logging.getLogger("Deadline") - - -class CacheDB(ABC): - """ - Abstract base class for connecting to a local SQLite cache database. - - This class is intended to always be used with a context manager to properly - close the connection to the cache database. - """ - - # Number of retry attempts for SQLite operational errors (e.g., database locks) - _RETRY_ATTEMPTS = 3 - - def __init__( - self, cache_name: str, table_name: str, create_query: str, cache_dir: Optional[str] = None - ) -> None: - if not cache_name or not table_name or not create_query: - raise JobAttachmentsError("Constructor strings for CacheDB cannot be empty.") - self.cache_name: str = cache_name - self.table_name: str = table_name - self.create_query: str = create_query - self._local = _threading.local() - self._local_connections: set = set() - - try: - # SQLite is included in Python installers, but might not exist if building python from source. - import sqlite3 # noqa - - self.enabled = True - except ImportError: - logger.warning(f"SQLite was not found, {cache_name} will not be used.") - self.enabled = False - return - - if cache_dir is None: - cache_dir = self.get_default_cache_db_file_dir() - if cache_dir is None: - raise JobAttachmentsError( - f"No default cache path found. Please provide a directory for {self.cache_name}." - ) - os.makedirs(cache_dir, exist_ok=True) - self.cache_dir: str = os.path.join(cache_dir, f"{self.cache_name}.db") - self.db_lock = Lock() - - def __enter__(self): - """Called when entering the context manager.""" - if self.enabled: - import sqlite3 - - @_retry( - ExceptionToCheck=sqlite3.OperationalError, - tries=self._RETRY_ATTEMPTS, - delay=(0.5, 1.5), # Jitter between 0.5 and 1.5 seconds - backoff=1.0, - logger=logger.warning, - ) - def _connect_to_db(): - """ - Connect to the SQLite database and ensure the table exists. - - Raises: - sqlite3.OperationalError: If there is an error connecting to the database. - """ - connection = sqlite3.connect(self.cache_dir, check_same_thread=False) - connection.execute("PRAGMA journal_mode=WAL") - try: - # Test the connection by trying to query the table - connection.execute(f"SELECT * FROM {self.table_name}") - except Exception: - # DB file doesn't have our table, so we need to create it - logger.info( - f"No cache entries for the current library version were found. 
Creating a new cache for {self.cache_name}" - ) - connection.execute(self.create_query) - return connection - - try: - self.db_connection = _connect_to_db() - except sqlite3.OperationalError as oe: - raise JobAttachmentsError( - f"Could not access cache file after {self._RETRY_ATTEMPTS} retry attempts: {self.cache_dir}" - ) from oe - return self - - def __exit__(self, exc_type, exc_value, exc_traceback): - """Called when exiting the context manager.""" - - if self.enabled: - import sqlite3 - - self.db_connection.close() - for conn in self._local_connections: - try: - conn.close() - except sqlite3.Error as e: - logger.warning(f"SQLite connection failed to close with error {e}") - - self._local_connections.clear() - - def get_local_connection(self): - """Create and/or returns a thread local connection to the SQLite database.""" - if not self.enabled: - return None - import sqlite3 - - if not hasattr(self._local, "connection"): - - @_retry( - ExceptionToCheck=sqlite3.OperationalError, - tries=self._RETRY_ATTEMPTS, - delay=(0.5, 1.5), # Jitter between 0.5 and 1.5 seconds - backoff=1.0, - logger=logger.warning, - ) - def _create_local_connection(): - """ - Create a local SQLite connection. - - Raises: - sqlite3.OperationalError: If there is an error connecting to the database. - """ - connection = sqlite3.connect(self.cache_dir, check_same_thread=False) - return connection - - try: - self._local.connection = _create_local_connection() - self._local_connections.add(self._local.connection) - except sqlite3.OperationalError as oe: - raise JobAttachmentsError( - f"Could not create connection to cache after {self._RETRY_ATTEMPTS} retry attempts: {self.cache_dir}" - ) from oe - - return self._local.connection - - @classmethod - def get_default_cache_db_file_dir(cls) -> Optional[str]: - """ - Gets the expected directory for the cache database file based on OS environment variables. - If a directory cannot be found, defaults to the working directory. - """ - default_path = os.path.expanduser("~") - if default_path and default_path != "~": - return os.path.join(default_path, CONFIG_ROOT, COMPONENT_NAME) - return None - - def remove_cache(self) -> None: - """ - Removes the underlying cache contents from the file system. - """ - - if self.enabled: - import sqlite3 - - self.db_connection.close() - conn_list = list(self._local_connections) - for conn in conn_list: - try: - conn.close() - self._local_connections.remove(conn) - except sqlite3.Error as e: - logger.warning(f"SQLite connection failed to close with error {e}") - - logger.debug(f"The cache {self.cache_dir} will be removed") - try: - os.remove(self.cache_dir) - except Exception as e: - logger.error(f"Error occurred while removing the cache file {self.cache_dir}: {e}") - - raise e diff --git a/src/deadline/job_attachments/caches/hash_cache.py b/src/deadline/job_attachments/caches/hash_cache.py deleted file mode 100644 index 5ed43a1af..000000000 --- a/src/deadline/job_attachments/caches/hash_cache.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -""" -Module for accessing the local file hash cache. - -Supports two types of hash entries: -1. Whole-file hashes: range_start=0, range_end=-1 (WHOLE_FILE_RANGE_END) -2. Byte-range hashes: range_start >= 0, range_end > 0, defining the range [start, end) - -The range parameters allow caching hashes for arbitrary byte ranges of files, -which is useful for caching hashes for any chunking scheme without modifying the cache. 
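# ---------------------------------------------------------------------------
# Editorial illustration (not part of the removed file). The connection logic
# above enables WAL journaling, probes for the version-specific table, and
# creates it on first use. A self-contained sketch of that probe-then-create
# pattern (table name and schema are simplified placeholders):
import sqlite3

def connect(db_path: str, table_name: str, create_query: str) -> sqlite3.Connection:
    connection = sqlite3.connect(db_path, check_same_thread=False)
    connection.execute("PRAGMA journal_mode=WAL")
    try:
        connection.execute(f"SELECT * FROM {table_name}")
    except sqlite3.OperationalError:
        # First use of this cache version: create the table.
        connection.execute(create_query)
    return connection

conn = connect(":memory:", "hashesV4", "CREATE TABLE hashesV4(file_path blob, file_hash text)")
# ---------------------------------------------------------------------------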
-""" - -import logging -from dataclasses import dataclass -from typing import Any, Dict, Optional - -from .cache_db import CacheDB -from ..asset_manifests.hash_algorithms import HashAlgorithm - - -logger = logging.getLogger("Deadline") - -# Sentinel value indicating a whole-file hash (no specific byte range) -WHOLE_FILE_RANGE_END = -1 - - -@dataclass -class HashCacheEntry: - """Represents an entry in the local hash-cache database. - - For whole-file hashes: range_start=0, range_end=-1 (WHOLE_FILE_RANGE_END) - For chunk hashes: range_start and range_end define the byte range [start, end) - """ - - # The file_path is stored as a BLOB in sqlite, encoded with utf-8 and the "surrogatepass" - # error handler, as file names encountered in practice require this. - file_path: str - hash_algorithm: HashAlgorithm - file_hash: str - last_modified_time: str - range_start: int = 0 - range_end: int = WHOLE_FILE_RANGE_END - - def __post_init__(self) -> None: - # Validate byte-range entries have range_end > range_start. - if self.range_end != WHOLE_FILE_RANGE_END and self.range_end <= self.range_start: - raise ValueError( - f"For byte-range entries, range_end ({self.range_end}) must be greater than " - f"range_start ({self.range_start})" - ) - - def is_whole_file(self) -> bool: - """Returns True if this entry represents a whole-file hash.""" - return self.range_start == 0 and self.range_end == WHOLE_FILE_RANGE_END - - def to_dict(self) -> Dict[str, Any]: - return { - "file_path": self.file_path, - "hash_algorithm": self.hash_algorithm.value, - "file_hash": self.file_hash, - "last_modified_time": self.last_modified_time, - "range_start": self.range_start, - "range_end": self.range_end, - } - - -class HashCache(CacheDB): - """ - Class used to store and retrieve entries in the local file hash cache. - - This class is intended to always be used with a context manager to properly - close the connection to the hash cache database. - - This class also automatically locks when doing writes, so it can be called - by multiple threads. - - Schema (hashesV4): - - file_path: blob (part of composite primary key) - - hash_algorithm: text (part of composite primary key) - - range_start: integer (part of composite primary key) - - range_end: integer (part of composite primary key) - - file_hash: text - - last_modified_time: timestamp - - For whole-file hashes, range_start=0 and range_end=-1. - For byte-range hashes, range_start and range_end define [start, end). - """ - - CACHE_NAME = "hash_cache" - CACHE_DB_VERSION = 4 - - def __init__(self, cache_dir: Optional[str] = None) -> None: - table_name: str = f"hashesV{self.CACHE_DB_VERSION}" - create_query: str = ( - f"CREATE TABLE {table_name}(" - "file_path blob, " - "hash_algorithm text, " - "range_start integer, " - "range_end integer, " - "file_hash text, " - "last_modified_time timestamp, " - "PRIMARY KEY (file_path, hash_algorithm, range_start, range_end))" - ) - super().__init__( - cache_name=self.CACHE_NAME, - table_name=table_name, - create_query=create_query, - cache_dir=cache_dir, - ) - - def get_connection_entry( - self, - file_path_key: str, - hash_algorithm: HashAlgorithm, - connection: Any, - range_start: int = 0, - range_end: int = WHOLE_FILE_RANGE_END, - ) -> Optional[HashCacheEntry]: - """ - Returns an entry from the hash cache, if it exists. 
- - This is the "lockless" version of get_entry which expects a connection - parameter for the connection which will be used to read from the DB - this can generally - be the thread local connection returned by get_local_connection() - - Args: - file_path_key: The file path to look up - hash_algorithm: The hash algorithm used - connection: SQLite connection to use - range_start: Start byte offset (0 for whole-file) - range_end: End byte offset (-1/WHOLE_FILE_RANGE_END for whole-file) - - Returns: - HashCacheEntry if found, None otherwise - """ - if not self.enabled: - return None - - encoded_path = file_path_key.encode(encoding="utf-8", errors="surrogatepass") - - entry_vals = connection.execute( - f"SELECT * FROM {self.table_name} " - "WHERE file_path=? AND hash_algorithm=? AND range_start=? AND range_end=?", - [encoded_path, hash_algorithm.value, range_start, range_end], - ).fetchone() - - if entry_vals: - return HashCacheEntry( - file_path=str(entry_vals[0], encoding="utf-8", errors="surrogatepass"), - hash_algorithm=HashAlgorithm(entry_vals[1]), - file_hash=entry_vals[4], - last_modified_time=str(entry_vals[5]), - range_start=entry_vals[2], - range_end=entry_vals[3], - ) - - return None - - def get_entry( - self, - file_path_key: str, - hash_algorithm: HashAlgorithm, - range_start: int = 0, - range_end: int = WHOLE_FILE_RANGE_END, - ) -> Optional[HashCacheEntry]: - """ - Returns an entry from the hash cache, if it exists. - - Args: - file_path_key: The file path to look up - hash_algorithm: The hash algorithm used - range_start: Start byte offset (0 for whole-file) - range_end: End byte offset (-1/WHOLE_FILE_RANGE_END for whole-file) - - Returns: - HashCacheEntry if found, None otherwise - """ - if not self.enabled: - return None - - with self.db_lock, self.db_connection: - return self.get_connection_entry( - file_path_key, hash_algorithm, self.db_connection, range_start, range_end - ) - - def put_entry(self, entry: HashCacheEntry) -> None: - """ - Inserts or replaces an entry into the hash cache database after acquiring the lock. - - The entry's range_start and range_end determine whether this is a whole-file - hash (range_start=0, range_end=-1) or a byte-range hash. - """ - if self.enabled: - with self.db_lock, self.db_connection: - encoded_path = entry.file_path.encode(encoding="utf-8", errors="surrogatepass") - - self.db_connection.execute( - f"INSERT OR REPLACE INTO {self.table_name} " - "VALUES(:file_path, :hash_algorithm, :range_start, :range_end, " - ":file_hash, :last_modified_time)", - { - "file_path": encoded_path, - "hash_algorithm": entry.hash_algorithm.value, - "range_start": entry.range_start, - "range_end": entry.range_end, - "file_hash": entry.file_hash, - "last_modified_time": entry.last_modified_time, - }, - ) diff --git a/src/deadline/job_attachments/caches/s3_check_cache.py b/src/deadline/job_attachments/caches/s3_check_cache.py deleted file mode 100644 index 1481f6eff..000000000 --- a/src/deadline/job_attachments/caches/s3_check_cache.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -""" -Module for accessing the local 'last seen on S3' cache. 
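# ---------------------------------------------------------------------------
# Editorial illustration (not part of the removed file). Typical use of the
# hash cache above: look up a previously computed whole-file hash and store a
# new entry when none is cached. Import paths follow the removed package's
# layout as shown in this diff; the file path and hash values are placeholders.
from deadline.job_attachments.asset_manifests import HashAlgorithm
from deadline.job_attachments.caches import HashCache, HashCacheEntry

xxh128 = HashAlgorithm("xxh128")
with HashCache() as hash_cache:
    cached = hash_cache.get_entry("/projects/showA/scene.blend", xxh128)
    if cached is None:
        hash_cache.put_entry(
            HashCacheEntry(
                file_path="/projects/showA/scene.blend",
                hash_algorithm=xxh128,
                file_hash="abc123",
                last_modified_time="1679000000.0",
            )
        )
# ---------------------------------------------------------------------------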
-""" - -import logging -from dataclasses import dataclass -from datetime import datetime -from typing import Any, Dict, Optional - -from .cache_db import CacheDB - - -logger = logging.getLogger("Deadline") - - -@dataclass -class S3CheckCacheEntry: - """Represents an entry in the local s3 check cache database""" - - s3_key: str - last_seen_time: str - - def to_dict(self) -> Dict[str, Any]: - return { - "s3_key": self.s3_key, - "last_seen_time": self.last_seen_time, - } - - -class S3CheckCache(CacheDB): - """ - Maintains a cache of 'last seen on S3' entries in a local database, which - specifies which full S3 object keys exist in the content-addressed storage - in the Job Attachments S3 bucket. - - This class is intended to always be used with a context manager to properly - close the connection to the hash cache database. - - This class also automatically locks when doing writes, so it can be called - by multiple threads. - """ - - CACHE_NAME = "s3_check_cache" - CACHE_DB_VERSION = 1 - ENTRY_EXPIRY_DAYS = 30 - - def __init__(self, cache_dir: Optional[str] = None) -> None: - table_name: str = f"s3checkV{self.CACHE_DB_VERSION}" - create_query: str = f"CREATE TABLE s3checkV{self.CACHE_DB_VERSION}(s3_key text primary key, last_seen_time timestamp)" - super().__init__( - cache_name=self.CACHE_NAME, - table_name=table_name, - create_query=create_query, - cache_dir=cache_dir, - ) - - def get_connection_entry(self, s3_key: str, connection) -> Optional[S3CheckCacheEntry]: - """ - Returns an entry from the hash cache, if it exists. This is the "lockless" (Doesn't take - the main db_lock protecting db_connection) version of get_entry which expects a connection - parameter for the connection which will be used to read from the DB - this can generally - be the thread local connection returned by get_local_connection() - """ - - entry_vals = connection.execute( - f"SELECT * FROM {self.table_name} WHERE s3_key=?", - [s3_key], - ).fetchone() - if entry_vals: - entry = S3CheckCacheEntry( - s3_key=entry_vals[0], - last_seen_time=str(entry_vals[1]), - ) - try: - last_seen = datetime.fromtimestamp(float(entry.last_seen_time)) - if (datetime.now() - last_seen).days < self.ENTRY_EXPIRY_DAYS: - return entry - except ValueError: - logger.warning(f"Timestamp for S3 key {s3_key} is not valid. Ignoring.") - - return None - - def get_entry(self, s3_key: str) -> Optional[S3CheckCacheEntry]: - """ - Checks if an entry exists in the cache, and returns it if it hasn't expired. - """ - if not self.enabled: - return None - - with self.db_lock, self.db_connection: - return self.get_connection_entry(s3_key, self.db_connection) - - def put_entry(self, entry: S3CheckCacheEntry) -> None: - """Inserts or replaces an entry into the cache database.""" - if self.enabled: - with self.db_lock, self.db_connection: - self.db_connection.execute( - f"INSERT OR REPLACE INTO {self.table_name} VALUES(:s3_key, :last_seen_time)", - entry.to_dict(), - ) diff --git a/src/deadline/job_attachments/download.py b/src/deadline/job_attachments/download.py deleted file mode 100644 index 7e33b93a6..000000000 --- a/src/deadline/job_attachments/download.py +++ /dev/null @@ -1,1487 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
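# ---------------------------------------------------------------------------
# Editorial illustration (not part of the removed file). The S3 check cache
# above records "last seen on S3" timestamps and ignores entries older than
# ENTRY_EXPIRY_DAYS. A usage sketch; the import path follows the removed
# package's layout and the S3 key is a placeholder.
import time

from deadline.job_attachments.caches import S3CheckCache, S3CheckCacheEntry

s3_key = "assetRoot/Data/abc123.xxh128"
with S3CheckCache() as s3_cache:
    if s3_cache.get_entry(s3_key) is None:
        # Not seen recently: the caller would verify or upload the object,
        # then record that it now exists in the content-addressed storage.
        s3_cache.put_entry(S3CheckCacheEntry(s3_key=s3_key, last_seen_time=str(time.time())))
# ---------------------------------------------------------------------------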
- -"""Functions for downloading output from the Job Attachment CAS.""" - -from __future__ import annotations - -import concurrent.futures -import json -import os -import re -import time -from collections import defaultdict -from datetime import datetime -from itertools import chain -from logging import Logger, LoggerAdapter, getLogger -from pathlib import Path -from tempfile import NamedTemporaryFile -from typing import Any, Callable, DefaultDict, List, Optional, Tuple, Union - -import boto3 -from boto3.s3.transfer import ProgressCallbackInvoker -from botocore.client import BaseClient -from botocore.exceptions import BotoCoreError, ClientError - -from .asset_manifests.base_manifest import ( - BaseAssetManifest, - BaseManifestPath as RelativeFilePath, -) -from .asset_manifests.hash_algorithms import HashAlgorithm -from .asset_manifests.decode import decode_manifest -from .exceptions import ( - COMMON_ERROR_GUIDANCE_FOR_S3, - AssetSyncError, - AssetSyncCancelledError, - JobAttachmentS3BotoCoreError, - JobAttachmentsS3ClientError, - PathOutsideDirectoryError, - JobAttachmentsError, - MissingAssetRootError, -) -from .vfs import ( - VFSProcessManager, - VFS_CACHE_REL_PATH_IN_SESSION, - VFS_MANIFEST_FOLDER_IN_SESSION, - VFS_LOGS_FOLDER_IN_SESSION, - VFS_MANIFEST_FOLDER_PERMISSIONS, -) - -from .models import ( - Attachments, - FileConflictResolution, - JobAttachmentS3Settings, - ManifestPathGroup, -) -from .progress_tracker import ( - DownloadSummaryStatistics, - ProgressReportMetadata, - ProgressStatus, - ProgressTracker, -) -from ._aws.aws_clients import ( - get_account_id, - get_s3_client, - get_s3_transfer_manager, -) -from .os_file_permission import ( - FileSystemPermissionSettings, - PosixFileSystemPermissionSettings, - WindowsFileSystemPermissionSettings, - _set_fs_group_for_posix, - _set_fs_permission_for_windows, -) -from ._utils import ( - _get_long_path_compatible_path, - _is_relative_to, - _join_s3_paths, -) -from threading import Lock - -download_logger = getLogger("deadline.job_attachments.download") - -S3_DOWNLOAD_MAX_CONCURRENCY = 10 -WINDOWS_MAX_PATH_LENGTH = 260 -TEMP_DOWNLOAD_ADDED_CHARS_LENGTH = 9 - - -def get_manifest_from_s3( - manifest_key: str, s3_bucket: str, session: Optional[boto3.Session] = None -) -> BaseAssetManifest: - _, manifest = get_asset_root_and_manifest_from_s3(manifest_key, s3_bucket, session) - return manifest - - -def get_asset_root_and_manifest_from_s3( - manifest_key: str, s3_bucket: str, session: Optional[boto3.Session] = None -) -> Tuple[Optional[str], BaseAssetManifest]: - asset_root, _, asset_manifest = _get_asset_root_and_manifest_from_s3_with_last_modified( - manifest_key, s3_bucket, session - ) - return (asset_root, asset_manifest) - - -def _get_asset_root_and_manifest_from_s3_with_last_modified( - manifest_key: str, s3_bucket: str, session: Optional[boto3.Session] = None -) -> Tuple[Optional[str], datetime, BaseAssetManifest]: - """ - Gets manifest with its asset root and last modified from s3 using the manifest key in s3 - :param manifest_key: key for searching in s3 - :param s3_bucket: s3 bucket - :param session: boto3 session - :return: Returns Tuple of asset root, manifest's last modified time and the manifest - """ - s3_client = get_s3_client(session=session) - try: - # Assumption: the manifest is less than 5GB. S3 objects larger than 5GB will be truncated. - # Using the assumption because it simplifies the code. A large manifest might be: - # 1 million files * 256 bytes per file path = 256MB so this assumption is safe. 
- res = s3_client.get_object( - Bucket=s3_bucket, - Key=manifest_key, - ExpectedBucketOwner=get_account_id(session=session), - ) - asset_root = _get_asset_root_from_metadata(metadata=res["Metadata"]) - contents = res["Body"].read().decode("utf-8") - asset_manifest = decode_manifest(contents) - last_modified = res["LastModified"] - - return (asset_root, last_modified, asset_manifest) - except ClientError as exc: - status_code = int(exc.response["ResponseMetadata"]["HTTPStatusCode"]) - status_code_guidance = { - **COMMON_ERROR_GUIDANCE_FOR_S3, - 403: ( - ( - "Forbidden or Access denied. Please check your AWS credentials, and ensure that " - "your AWS IAM Role or User has the 's3:GetObject' permission for this bucket. " - ) - if "kms:" not in str(exc) - else ( - "Forbidden or Access denied. Please check your AWS credentials and Job Attachments S3 bucket " - "encryption settings. If a customer-managed KMS key is set, confirm that your AWS IAM Role or " - "User has the 'kms:Decrypt' and 'kms:DescribeKey' permissions for the key used to encrypt the bucket." - ) - ), - 404: "Not found. Please check your bucket name and object key, and ensure that they exist in the AWS account.", - } - raise JobAttachmentsS3ClientError( - action="downloading binary file", - status_code=status_code, - bucket_name=s3_bucket, - key_or_prefix=manifest_key, - message=f"{status_code_guidance.get(status_code, '')} {str(exc)}", - ) from exc - except BotoCoreError as bce: - raise JobAttachmentS3BotoCoreError( - action="downloading binary file", - error_details=str(bce), - ) from bce - except Exception as e: - raise AssetSyncError(e) from e - - -def _get_asset_root_from_metadata(metadata: dict[str, str]) -> Optional[str]: - if "asset-root-json" in metadata: - return json.loads(metadata["asset-root-json"]) - else: - return metadata.get("asset-root", None) - - -def _get_output_manifest_prefix( - s3_settings: JobAttachmentS3Settings, - farm_id: str, - queue_id: str, - job_id: str, - step_id: Optional[str] = None, - task_id: Optional[str] = None, -) -> str: - """ - Get full prefix for output manifest with given farm id, queue id, job id, step id and task id - """ - manifest_prefix: str - if task_id: - if not step_id: - raise JobAttachmentsError( - "Task ID specified, but no Step ID. Job, Step, and Task ID are required to retrieve task outputs." - ) - manifest_prefix = s3_settings.full_task_output_prefix( - farm_id, queue_id, job_id, step_id, task_id - ) - elif step_id: - manifest_prefix = s3_settings.full_step_output_prefix(farm_id, queue_id, job_id, step_id) - else: - manifest_prefix = s3_settings.full_job_output_prefix(farm_id, queue_id, job_id) - - # Previous functions don't terminate the prefix with a '/'. So we'll do it here. - return f"{manifest_prefix}/" - - -def _list_s3_objects_with_error_handling( - s3_bucket: str, - manifest_prefix: str, - session: Optional[boto3.Session] = None, -) -> List[dict]: - """ - List S3 objects with standardized error handling for job attachments. - Returns a list of all S3 object contents under the given prefix. 
- """ - s3_client = get_s3_client(session=session) - all_contents = [] - - try: - paginator = s3_client.get_paginator("list_objects_v2") - page_iterator = paginator.paginate( - Bucket=s3_bucket, - Prefix=manifest_prefix, - ) - - for page in page_iterator: - contents = page.get("Contents", None) - if contents is None: - raise JobAttachmentsError( - f"Unable to find asset manifest in s3://{s3_bucket}/{manifest_prefix}" - ) - all_contents.extend(contents) - - return all_contents - - except ClientError as exc: - status_code = int(exc.response["ResponseMetadata"]["HTTPStatusCode"]) - status_code_guidance = { - **COMMON_ERROR_GUIDANCE_FOR_S3, - 403: ( - "Forbidden or Access denied. Please check your AWS credentials, and ensure that " - "your AWS IAM Role or User has the 's3:ListBucket' permission for this bucket." - ), - 404: "Not found. Please ensure that the bucket and key/prefix exists.", - } - raise JobAttachmentsS3ClientError( - action="listing bucket contents", - status_code=status_code, - bucket_name=s3_bucket, - key_or_prefix=manifest_prefix, - message=f"{status_code_guidance.get(status_code, '')} {str(exc)}", - ) from exc - except BotoCoreError as bce: - raise JobAttachmentS3BotoCoreError( - action="listing bucket contents", - error_details=str(bce), - ) from bce - except JobAttachmentsError: - raise # pass along JobAttachmentsErrors if we get them - except Exception as e: - raise AssetSyncError(e) from e - - -def _get_tasks_manifests_keys_from_s3( - manifest_prefix: str, - s3_bucket: str, - session: Optional[boto3.Session] = None, - *, - select_latest_per_task=True, -) -> List[str]: - """ - Retrieves manifest keys from S3 by listing objects in a given S3 prefix, usually job or step folders. - - Searches for manifest files matching the regex pattern that captures both: - - Chunked step manifests: .../job_id/step_id/timestamp_sessionaction_id/ - - Task-based manifests: .../job_id/step_id/task_id/timestamp_sessionaction_id/ - - For chunked steps (no task ID), all manifests are included. - For task-based manifests, behavior depends on select_latest_per_task: - - If True: selects only the latest session action per task (by timestamp_sessionaction_id alphabetical order) - - If False: includes all manifests - """ - manifests_keys = [] - - # Separate tracking for task-based vs chunked manifests - task_prefixes: dict[str, list] = defaultdict(list) - - all_contents = _list_s3_objects_with_error_handling(s3_bucket, manifest_prefix, session) - - step_pattern = re.compile(r"step-.*/.*/.*output.*") - for content in all_contents: - key = content["Key"] - if step_pattern.search(key): - if "task-" in key: - parts = key.split("/") - for i, part in enumerate(parts): - if "task-" in part: - task_folder = "/".join(parts[: i + 1]) - task_prefixes[task_folder].append(key) - break - else: - manifests_keys.append(key) - - # Handle task-based steps - if select_latest_per_task: - # TODO: Select all files in the last subfolder (alphabetically) under each "task-{any}" folder. - # This sorts by timestamp_sessionaction_id, but timestamp comes from WorkerAgent and shouldn't be relied on. - # Should use S3 LastModified instead. 
- for task_folder, files in task_prefixes.items(): - last_subfolder = sorted( - set(f.split("/")[len(task_folder.split("/"))] for f in files), - reverse=True, - )[0] - manifests_keys += [f for f in files if f.startswith(f"{task_folder}/{last_subfolder}/")] - else: - # Include all the keys, not just the latest per task - manifests_keys += [f for _, files in task_prefixes.items() for f in files] - - return manifests_keys - - -def get_job_input_paths_by_asset_root( - s3_settings: JobAttachmentS3Settings, - attachments: Attachments, - session: Optional[boto3.Session] = None, -) -> dict[str, ManifestPathGroup]: - """ - Gets dict of grouped paths of all input files of a given job. - The grouped paths are separated by asset root. - Returns a dict of ManifestPathGroups, with the root path as the key. - """ - inputs: dict[str, ManifestPathGroup] = {} - - for manifest_properties in attachments.manifests: - if manifest_properties.inputManifestPath: - key = _join_s3_paths(manifest_properties.inputManifestPath) - _, asset_manifest = get_asset_root_and_manifest_from_s3( - manifest_key=key, - s3_bucket=s3_settings.s3BucketName, - session=session, - ) - - root_path = manifest_properties.rootPath - if root_path not in inputs: - inputs[root_path] = ManifestPathGroup() - inputs[root_path].add_manifest_to_group(asset_manifest) - - return inputs - - -def get_job_input_output_paths_by_asset_root( - s3_settings: JobAttachmentS3Settings, - attachments: Attachments, - farm_id: str, - queue_id: str, - job_id: str, - step_id: Optional[str] = None, - task_id: Optional[str] = None, - session_action_id: Optional[str] = None, - session: Optional[boto3.Session] = None, -) -> dict[str, ManifestPathGroup]: - """ - With given IDs, gets the paths of all input and output files - of this job. The grouped paths are separated by asset root. - Returns a dict of ManifestPathGroups, with the root path as the key. 
- """ - input_files = get_job_input_paths_by_asset_root( - s3_settings=s3_settings, - attachments=attachments, - session=session, - ) - output_files = get_job_output_paths_by_asset_root( - s3_settings=s3_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id=job_id, - step_id=step_id, - task_id=task_id, - session_action_id=session_action_id, - session=session, - ) - - combined_path_groups: dict[str, ManifestPathGroup] = {} - for asset_root, path_group in chain(input_files.items(), output_files.items()): - if asset_root not in combined_path_groups: - combined_path_groups[asset_root] = path_group - else: - combined_path_groups[asset_root].combine_with_group(path_group) - - return combined_path_groups - - -def _get_new_copy_file_path( - local_file_name: Path, - collision_lock: Lock, - collision_file_dict: DefaultDict[str, int], -) -> Path: - with collision_lock: - file_str: str = str(local_file_name) - num: int = collision_file_dict[file_str] - new_file_name = local_file_name - - # Iterate until we find a number we don't conflict with - while True: - try: - # Handle multi-process locks with creating and/or opening file to verify if it exists - with open(new_file_name, "x"): - break - # If file exists we go here and increment num to find a unique path - except FileExistsError: - num += 1 - new_file_name = local_file_name.parent.joinpath( - f"{local_file_name.stem} ({num}){local_file_name.suffix}" - ) - - collision_file_dict[file_str] = num - local_file_name = new_file_name - return local_file_name - - -def download_files_in_directory( - s3_settings: JobAttachmentS3Settings, - attachments: Attachments, - farm_id: str, - queue_id: str, - job_id: str, - directory_path: str, - local_download_dir: str, - session: Optional[boto3.Session] = None, - on_downloading_files: Optional[Callable[[ProgressReportMetadata], bool]] = None, -) -> DownloadSummaryStatistics: - """ - From a given job's input and output files, downloads all files in - the given directory path. - (example of `directory_path`: "inputs/subdirectory1") - (example of `local_download_dir`: "/home/username") - """ - all_grouped_paths = get_job_input_output_paths_by_asset_root( - s3_settings=s3_settings, - attachments=attachments, - farm_id=farm_id, - queue_id=queue_id, - job_id=job_id, - session=session, - ) - - # Group by hash algorithm all the files that fall under the directory - files_to_download: DefaultDict[HashAlgorithm, list[RelativeFilePath]] = DefaultDict(list) - total_bytes = 0 - total_files = 0 - for path_group in all_grouped_paths.values(): - for hash_alg, path_list in path_group.files_by_hash_alg.items(): - files_list = [file for file in path_list if file.path.startswith(directory_path + "/")] - files_size = sum([file.size for file in files_list]) - total_bytes += files_size - total_files += len(files_list) - files_to_download[hash_alg].extend(files_list) - - # Sets up progress tracker to report download progress back to the caller. 
- progress_tracker = ProgressTracker( - status=ProgressStatus.DOWNLOAD_IN_PROGRESS, - total_files=total_files, - total_bytes=total_bytes, - on_progress_callback=on_downloading_files, - ) - - num_download_workers = _get_num_download_workers() - - start_time = time.perf_counter() - - for hash_alg, file_paths in files_to_download.items(): - downloaded_files_paths = _download_files_parallel( - file_paths, - hash_alg, - num_download_workers, - local_download_dir, - s3_settings.s3BucketName, - s3_settings.full_cas_prefix(), - progress_tracker=progress_tracker, - ) - - progress_tracker.total_time = time.perf_counter() - start_time - - return progress_tracker.get_download_summary_statistics( - {local_download_dir: downloaded_files_paths} - ) - - -def download_file( - file: RelativeFilePath, - hash_algorithm: HashAlgorithm, - local_download_dir: str, - collision_lock: Lock, - collision_file_dict: DefaultDict[str, int], - s3_bucket: str, - cas_prefix: Optional[str], - s3_client: Optional[BaseClient] = None, - session: Optional[boto3.Session] = None, - modified_time_override: Optional[float] = None, - progress_tracker: Optional[ProgressTracker] = None, - file_conflict_resolution: Optional[FileConflictResolution] = FileConflictResolution.CREATE_COPY, -) -> Tuple[int, Optional[Path]]: - """ - Downloads a file from the S3 bucket to the local directory. `modified_time_override` is ignored if the manifest - version used supports timestamps. - Returns a tuple of (size in bytes, filename) of the downloaded file. - - The file size of 0 means that this file comes from a manifest version that does not provide file sizes. - - The filename of None indicates that this file has been skipped or has not been downloaded. - """ - if not s3_client: - s3_client = get_s3_client(session=session) - - transfer_manager = get_s3_transfer_manager(s3_client=s3_client) - - # The modified time in the manifest is in microseconds, but utime requires the time be expressed in seconds. - modified_time_override = file.mtime / 1000000 # type: ignore[attr-defined] - - file_bytes = file.size - - # Python will handle the path separator '/' correctly on every platform. - local_file_path: Path = _get_long_path_compatible_path( - Path(local_download_dir).joinpath(file.path) - ) - - s3_key = ( - f"{cas_prefix}/{file.hash}.{hash_algorithm.value}" - if cas_prefix - else f"{file.hash}.{hash_algorithm.value}" - ) - - # If the file name already exists, resolve the conflict based on the file_conflict_resolution - if local_file_path.is_file(): - if file_conflict_resolution == FileConflictResolution.SKIP: - return (file_bytes, None) - elif file_conflict_resolution == FileConflictResolution.OVERWRITE: - pass - elif file_conflict_resolution == FileConflictResolution.CREATE_COPY: - copy_local_file_path = _get_new_copy_file_path( - local_file_path, collision_lock, collision_file_dict - ) - - # Re-run _get_long_path_compatible_path for updated file name after file conflict resolution - # _get_long_path_compatible_path is idempotent, so it doesn't re-process an existing long path - local_file_path = _get_long_path_compatible_path(copy_local_file_path) - else: - raise ValueError( - f"Unknown choice for file conflict resolution: {file_conflict_resolution}" - ) - - try: - local_file_path.parent.mkdir(parents=True, exist_ok=True) - except OSError as e: - raise OSError( - f"Failed to create download directory '{local_file_path.parent}': {e}. 
" - "If this path was configured on a different operating system, re-run the " - "download and choose a valid local path when prompted." - ) from e - - future: concurrent.futures.Future - - def handler(bytes_downloaded): - nonlocal progress_tracker - nonlocal future - - if progress_tracker: - should_continue = progress_tracker.track_progress_callback(bytes_downloaded) - if not should_continue: - future.cancel() - - subscribers = [ProgressCallbackInvoker(handler)] - - future = transfer_manager.download( - bucket=s3_bucket, - key=s3_key, - fileobj=str(local_file_path), - extra_args={"ExpectedBucketOwner": get_account_id(session=session)}, - subscribers=subscribers, - ) - - try: - future.result() - except concurrent.futures.CancelledError as ce: - if progress_tracker and progress_tracker.continue_reporting is False: - raise AssetSyncCancelledError("File download cancelled.") - else: - raise AssetSyncError("File download failed.", ce) from ce - except ClientError as exc: - - def process_client_error(exc: ClientError, status_code: int): - status_code_guidance = { - **COMMON_ERROR_GUIDANCE_FOR_S3, - 403: ( - ( - "Forbidden or Access denied. Please check your AWS credentials, and ensure that " - "your AWS IAM Role or User has the 's3:GetObject' permission for this bucket. " - ) - if "kms:" not in str(exc) - else ( - "Forbidden or Access denied. Please check your AWS credentials and Job Attachments S3 bucket " - "encryption settings. If a customer-managed KMS key is set, confirm that your AWS IAM Role or " - "User has the 'kms:Decrypt' and 'kms:DescribeKey' permissions for the key used to encrypt the bucket." - ) - ), - 404: ( - "Not found. Please check your bucket name and object key, and ensure that they exist in the AWS account." - ), - } - raise JobAttachmentsS3ClientError( - action="downloading file", - status_code=status_code, - bucket_name=s3_bucket, - key_or_prefix=s3_key, - message=f"{status_code_guidance.get(status_code, '')} {str(exc)} (Failed to download the file to {str(local_file_path)})", - ) from exc - - # TODO: Temporary to prevent breaking backwards-compatibility; if file not found, try again without hash alg postfix - status_code = int(exc.response["ResponseMetadata"]["HTTPStatusCode"]) - if status_code == 404: - s3_key = s3_key.rsplit(".", 1)[0] - future = transfer_manager.download( - bucket=s3_bucket, - key=s3_key, - fileobj=str(local_file_path), - extra_args={"ExpectedBucketOwner": get_account_id(session=session)}, - subscribers=subscribers, - ) - try: - future.result() - except concurrent.futures.CancelledError as ce: - if progress_tracker and progress_tracker.continue_reporting is False: - raise AssetSyncCancelledError("File download cancelled.") - else: - raise AssetSyncError("File download failed.", ce) from ce - except ClientError as secondExc: - status_code = int(exc.response["ResponseMetadata"]["HTTPStatusCode"]) - process_client_error(secondExc, status_code) - else: - process_client_error(exc, status_code) - except BotoCoreError as bce: - raise JobAttachmentS3BotoCoreError( - action="downloading file", - error_details=str(bce), - ) from bce - except Exception as e: - raise AssetSyncError(e) from e - - download_logger.debug(f"Downloaded {file.path} to {str(local_file_path)}") - os.utime(local_file_path, (modified_time_override, modified_time_override)) # type: ignore[arg-type] - - return (file_bytes, local_file_path) - - -def _download_files_parallel( - files: List[RelativeFilePath], - hash_algorithm: HashAlgorithm, - num_download_workers: int, - local_download_dir: 
str, - s3_bucket: str, - cas_prefix: Optional[str], - s3_client: Optional[BaseClient] = None, - session: Optional[boto3.Session] = None, - file_mod_time: Optional[float] = None, - progress_tracker: Optional[ProgressTracker] = None, - file_conflict_resolution: Optional[FileConflictResolution] = FileConflictResolution.CREATE_COPY, -) -> list[str]: - """ - Downloads files in parallel using thread pool. - Returns a list of local paths of downloaded files. - """ - downloaded_file_names: list[str] = [] - collision_lock: Lock = Lock() - collision_file_dict: DefaultDict[str, int] = DefaultDict(int) - - with concurrent.futures.ThreadPoolExecutor(max_workers=num_download_workers) as executor: - futures = { - executor.submit( - download_file, - file, - hash_algorithm, - local_download_dir, - collision_lock, - collision_file_dict, - s3_bucket, - cas_prefix, - s3_client, - session, - file_mod_time, - progress_tracker, - file_conflict_resolution, - ): file - for file in files - } - # surfaces any exceptions in the thread - for future in concurrent.futures.as_completed(futures): - file_bytes, local_file_name = future.result() - if local_file_name: - downloaded_file_names.append(str(local_file_name.resolve())) - if progress_tracker: - progress_tracker.increase_processed(1, 0) - progress_tracker.report_progress() - else: - if progress_tracker: - progress_tracker.increase_skipped(1, file_bytes) - progress_tracker.report_progress() - - # to report progress 100% at the end - if progress_tracker: - progress_tracker.report_progress() - - return downloaded_file_names - - -def download_files( - files: list[RelativeFilePath], - hash_algorithm: HashAlgorithm, - local_download_dir: str, - s3_settings: JobAttachmentS3Settings, - session: Optional[boto3.Session] = None, - progress_tracker: Optional[ProgressTracker] = None, - file_conflict_resolution: Optional[FileConflictResolution] = FileConflictResolution.CREATE_COPY, -) -> list[str]: - """ - Downloads all files from the S3 bucket in the Job Attachment settings to the specified directory. - Returns a list of local paths of downloaded files. - """ - s3_client = get_s3_client(session=session) - num_download_workers = _get_num_download_workers() - - file_mod_time: float = datetime.now().timestamp() - - return _download_files_parallel( - files, - hash_algorithm, - num_download_workers, - local_download_dir, - s3_settings.s3BucketName, - s3_settings.full_cas_prefix(), - s3_client, - session, - file_mod_time, - progress_tracker, - file_conflict_resolution, - ) - - -def get_job_output_paths_by_asset_root( - s3_settings: JobAttachmentS3Settings, - farm_id: str, - queue_id: str, - job_id: str, - step_id: Optional[str] = None, - task_id: Optional[str] = None, - session_action_id: Optional[str] = None, - session: Optional[boto3.Session] = None, -) -> dict[str, ManifestPathGroup]: - """ - Gets dict of grouped paths of all output files of a given job. - The grouped paths are separated by asset root. - Returns a dict of ManifestPathGroups, with the root path as the key. 
- """ - output_manifests_by_root = get_output_manifests_by_asset_root( - s3_settings, - farm_id, - queue_id, - job_id, - step_id, - task_id, - session_action_id, - session=session, - ) - - outputs: dict[str, ManifestPathGroup] = {} - for root, manifests in output_manifests_by_root.items(): - for manifest in manifests: - if root not in outputs: - outputs[root] = ManifestPathGroup() - outputs[root].add_manifest_to_group(manifest) - - return outputs - - -def get_output_manifests_by_asset_root( - s3_settings: JobAttachmentS3Settings, - farm_id: str, - queue_id: str, - job_id: str, - step_id: Optional[str] = None, - task_id: Optional[str] = None, - session_action_id: Optional[str] = None, - session: Optional[boto3.Session] = None, -) -> dict[str, list[BaseAssetManifest]]: - """ - Gets output manifests grouped by asset root for job, step, or task outputs, handling both chunked and non-chunked steps. - - When session_action_id is provided, retrieves outputs for that specific session action - by searching S3 paths containing the session action ID. Session action ID is typically - provided with task ID but is not required. - - When session_action_id is not provided, retrieves all output manifests for the specified - scope (job, step, or task) and merges them chronologically by asset root. This is used - for downloading complete job/step/task outputs or syncing step dependencies by WorkerAgent. - """ - # Handle specific session action ID requests - if session_action_id: - if not step_id or not task_id: - raise JobAttachmentsError( - "Session Action ID specified, but missing Step ID or Task ID. Job, Step, and Task ID are required to retrieve session action outputs." - ) - return _get_manifests_by_session_action_id( - s3_settings, - farm_id, - queue_id, - job_id, - step_id, - task_id, - session_action_id, - session, - ) - - outputs: DefaultDict[str, list[BaseAssetManifest]] = DefaultDict(list) - manifest_prefix: str = _get_output_manifest_prefix( - s3_settings, farm_id, queue_id, job_id, step_id, task_id - ) - try: - manifests_keys: list[str] = _get_tasks_manifests_keys_from_s3( - manifest_prefix, s3_settings.s3BucketName, session=session - ) - except JobAttachmentsError: - return outputs - - # Collect all manifests grouped by asset root - by_root: defaultdict[str, list[Tuple[datetime, BaseAssetManifest]]] = defaultdict(list) - - if manifests_keys: - # Download manifests with timestamps for chronological merging - with concurrent.futures.ThreadPoolExecutor( - max_workers=S3_DOWNLOAD_MAX_CONCURRENCY - ) as executor: - futures = [] - for manifest_key in manifests_keys: - future = executor.submit( - _get_asset_root_and_manifest_from_s3_with_last_modified, - manifest_key, - s3_settings.s3BucketName, - session, - ) - futures.append(future) - - for i, future in enumerate(futures): - asset_root, last_modified, manifest = future.result() - if not asset_root: - raise MissingAssetRootError( - f"Failed to get asset root from metadata of output manifest: {manifests_keys[i]}" - ) - by_root[asset_root].append((last_modified, manifest)) - - # Merge each asset root chronologically - # We must merge here while we have LastModified timestamps, since the returned manifests - # lose this metadata and downstream callers can't merge chronologically. 
- for asset_root, manifest_list in by_root.items(): - merged_manifest = _merge_asset_manifests_sorted_asc_by_last_modified(manifest_list) - if merged_manifest: - outputs[asset_root].append(merged_manifest) - - return outputs - - -def _get_output_manifest_files_by_asset_root_with_last_modified( - s3_settings: JobAttachmentS3Settings, - output_manifest_paths: List[str], - session: Optional[boto3.Session] = None, -) -> list[Tuple[str, datetime, BaseAssetManifest]]: - """ - For a given list of output manifest paths, returns a list of tuples containing - (asset_root, last_modified, manifest) that exactly mirrors the provided output_manifest_paths. - - Returns: - A list of tuples containing (asset_root, last_modified, manifest) in the same order as - the provided output_manifest_paths. - """ - outputs: List[Tuple[str, datetime, BaseAssetManifest]] = [None] * len(output_manifest_paths) # type: ignore[list-item] - - with concurrent.futures.ThreadPoolExecutor(max_workers=S3_DOWNLOAD_MAX_CONCURRENCY) as executor: - # Submit all tasks and store futures in a list that preserves the original order - futures = [] - for key in output_manifest_paths: - future = executor.submit( - _get_asset_root_and_manifest_from_s3_with_last_modified, - key, - s3_settings.s3BucketName, - session, - ) - futures.append(future) - - # Process results using explicit index-based iteration to ensure order preservation - for index in range(len(output_manifest_paths)): - asset_root, last_modified, asset_manifest = futures[index].result() - if not asset_root: - raise MissingAssetRootError( - f"Failed to get asset root from metadata of output manifest: {output_manifest_paths[index]}" - ) - outputs[index] = (asset_root, last_modified, asset_manifest) - - return outputs - - -def download_files_from_manifests( - s3_bucket: str, - manifests_by_root: dict[str, BaseAssetManifest], - cas_prefix: Optional[str] = None, - fs_permission_settings: Optional[FileSystemPermissionSettings] = None, - session: Optional[boto3.Session] = None, - on_downloading_files: Optional[Callable[[ProgressReportMetadata], bool]] = None, - logger: Optional[Union[Logger, LoggerAdapter]] = None, - conflict_resolution: FileConflictResolution = FileConflictResolution.CREATE_COPY, -) -> DownloadSummaryStatistics: - """ - Given manifests, downloads all files from a CAS in each manifest. - - Args: - s3_bucket: The name of the S3 bucket. - manifests_by_root: a map from each local root path to a corresponding list of tuples of manifest contents and their path. - cas_prefix: The CAS prefix of the files. - session: The boto3 session to use. - on_downloading_files: a callback to be called to periodically report progress to the caller. - The callback returns True if the operation should continue as normal, or False to cancel. - - Returns: - The download summary statistics. - """ - s3_client = get_s3_client(session=session) - num_download_workers = _get_num_download_workers() - file_mod_time = datetime.now().timestamp() - - # Sets up progress tracker to report download progress back to the caller. 
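# Note: the on_downloading_files callback (if provided) is invoked periodically with
# ProgressReportMetadata; returning False from it cancels the remaining downloads,
# which is surfaced to the caller as an AssetSyncCancelledError.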
- total_size = 0 - total_files = 0 - for manifest in manifests_by_root.values(): - total_files += len(manifest.paths) - total_size += manifest.totalSize # type: ignore[attr-defined] - progress_tracker = ProgressTracker( - status=ProgressStatus.DOWNLOAD_IN_PROGRESS, - total_files=total_files, - total_bytes=total_size, - on_progress_callback=on_downloading_files, - logger=logger, - ) - start_time = time.perf_counter() - - downloaded_files_paths_by_root: DefaultDict[str, list[str]] = DefaultDict(list) - - for local_download_dir, manifest in manifests_by_root.items(): - downloaded_files_paths = _download_files_parallel( - manifest.paths, - manifest.hashAlg, - num_download_workers, - local_download_dir, - s3_bucket, - cas_prefix, - s3_client, - session, - file_mod_time, - progress_tracker=progress_tracker, - file_conflict_resolution=conflict_resolution, - ) - - if fs_permission_settings is not None: - _set_fs_group( - file_paths=downloaded_files_paths, - local_root=local_download_dir, - fs_permission_settings=fs_permission_settings, - ) - - downloaded_files_paths_by_root[local_download_dir].extend(downloaded_files_paths) - - progress_tracker.total_time = time.perf_counter() - start_time - return progress_tracker.get_download_summary_statistics(downloaded_files_paths_by_root) - - -def _get_num_download_workers(s3_max_pool_connections: int = 50) -> int: - """ - Determines the max number of thread workers for downloading multiple files in parallel, - based on the allowed S3 max pool connections size. If the max worker count is calculated - to be 0 due to a small pool connections size limit, it returns 1. - """ - num_download_workers = int(s3_max_pool_connections / S3_DOWNLOAD_MAX_CONCURRENCY) - if num_download_workers <= 0: - num_download_workers = 1 - return num_download_workers - - -def _set_fs_group( - file_paths: list[str], - local_root: str, - fs_permission_settings: FileSystemPermissionSettings, -) -> None: - """ - Sets file system group ownership and permissions for all files and directories - in the given paths, starting from root. It is expected that all `file_paths` - point to files, not directories. - - Raises: - TypeError: If the `fs_permission_settings` are not specific to the underlying OS. - """ - if os.name == "posix": - if not isinstance(fs_permission_settings, PosixFileSystemPermissionSettings): - raise TypeError( - "The file system permission settings must be specific to Posix-based system." - ) - _set_fs_group_for_posix( - file_paths=file_paths, - local_root=local_root, - fs_permission_settings=fs_permission_settings, - ) - else: # if os.name is not "posix" - if not isinstance(fs_permission_settings, WindowsFileSystemPermissionSettings): - raise TypeError("The file system permission settings must be specific to Windows.") - _set_fs_permission_for_windows( - file_paths=file_paths, - local_root=local_root, - fs_permission_settings=fs_permission_settings, - ) - - -def merge_asset_manifests( - manifests: list[BaseAssetManifest], -) -> BaseAssetManifest | None: - """Merge files from multiple manifests into a single list, ensuring that each filename - is unique by keeping the one from the last encountered manifest. (Thus, the steps' - outputs are downloaded over the input job attachments.) - - Args: - manifests (list[AssetManifest]): A list of manifests to be merged. - - Raises: - NotImplementedError: When two manifests have different hash algorithms. All manifests must use the same hash algorithm. 
- - Returns: - AssetManifest | None: A single manifest containing the merged paths of all provided manifests or None if no manifests were provided - """ - if len(manifests) == 0: - return None - elif len(manifests) == 1: - return manifests[0] - - first_manifest = manifests[0] - - hash_alg: HashAlgorithm = first_manifest.hashAlg - merged_paths: dict[str, RelativeFilePath] = dict() - total_size: int = 0 - - # Loop each manifest - for manifest in manifests: - if manifest.hashAlg != hash_alg: - raise NotImplementedError( - f"Merging manifests with different hash algorithms is not supported. {manifest.hashAlg.value} does not match {hash_alg.value}" - ) - - for path in manifest.paths: - merged_paths[path.path] = path - - manifest_args: dict[str, Any] = { - "hash_alg": hash_alg, - "paths": list(merged_paths.values()), - } - - total_size = sum([path.size for path in merged_paths.values()]) # type: ignore - manifest_args["total_size"] = total_size - - output_manifest: BaseAssetManifest = first_manifest.__class__(**manifest_args) - - return output_manifest - - -def _merge_asset_manifests_sorted_asc_by_last_modified( - manifests_with_last_modified_timestamps: list[Tuple[datetime, BaseAssetManifest]], -) -> BaseAssetManifest | None: - """Merge files from multiple manifests into a single list, sorting them by last modified timestamp asc. - This function first sorts the manifests by their timestamps (oldest first) and then merges them, - ensuring that newer files overwrite older ones with the same path. - - Args: - manifests_with_last_modified_timestamps (list[Tuple[datetime, BaseAssetManifest]]): A list of tuples containing - (timestamp, manifest) to be sorted and merged. - - Raises: - NotImplementedError: When two manifests have different hash algorithms. - All manifests must use the same hash algorithm. - - Returns: - BaseAssetManifest | None: A single manifest containing the merged paths of all provided manifests - or None if no manifests were provided - """ - if not manifests_with_last_modified_timestamps: - return None - - # Sort manifests by timestamp (oldest first) - sorted_manifests_with_timestamps = sorted( - manifests_with_last_modified_timestamps, key=lambda x: x[0] - ) - - # Extract just the manifests in the sorted order - sorted_manifests = [manifest for _, manifest in sorted_manifests_with_timestamps] - - # Use the existing merge function with the sorted manifests - return merge_asset_manifests(sorted_manifests) - - -def _write_manifest_to_temp_file(manifest: BaseAssetManifest, dir: Path) -> str: - with NamedTemporaryFile( - suffix=".json", - prefix="deadline-merged-manifest-", - delete=False, - mode="w", - dir=dir, - ) as file: - file.write(manifest.encode()) - return file.name - - -def _read_manifest_file(input_manifest_path: Path): - """ - Given a manifest path, open the file at that location and decode - Args: - input_manifest_path: Path to manifest - Returns: - BaseAssetManifest : Single decoded manifest - """ - with open(input_manifest_path) as input_manifest_file: - return decode_manifest(input_manifest_file.read()) - - -def handle_existing_vfs( - manifest: BaseAssetManifest, session_dir: Path, mount_point: str, os_user: str -) -> BaseAssetManifest: - """ - Combines provided manifest with the input manifest of the running VFS at the - given mount_point if it exists. 
Then kills the running process at that mount so - it can be replaced - - Args: - manifest: The manifest for the new inputs to be mounted - mount_point: The local directory where the manifest is to be mounted - os_user: the user running the job. - Returns: - BaseAssetManifest: A single manifest containing the merged paths or the original manifest - """ - if not VFSProcessManager.is_mount(mount_point): - return manifest - - input_manifest_path: Optional[Path] = VFSProcessManager.get_manifest_path_for_mount( - session_dir=session_dir, mount_point=mount_point - ) - if input_manifest_path is not None: - input_manifest = _read_manifest_file(input_manifest_path) - - merged_input_manifest: Optional[BaseAssetManifest] = merge_asset_manifests( - [input_manifest, manifest] - ) - manifest = merged_input_manifest if merged_input_manifest is not None else manifest - else: - download_logger.error(f"input manifest not found for mount at {mount_point}") - return manifest - - VFSProcessManager.kill_process_at_mount( - session_dir=session_dir, mount_point=mount_point, os_user=os_user - ) - - return manifest - - -def mount_vfs_from_manifests( - s3_bucket: str, - manifests_by_root: dict[str, BaseAssetManifest], - boto3_session: boto3.Session, - session_dir: Path, - os_env_vars: dict[str, str], - fs_permission_settings: FileSystemPermissionSettings, - cas_prefix: Optional[str] = None, - on_mount_complete: Optional[Callable[[bool], None]] = None, -) -> None: - """ - Given manifests, downloads all files from a CAS in those manifests. - - Args: - s3_bucket: The name of the S3 bucket. - manifests_by_root: a map from each local root path to a corresponding list of tuples of manifest contents and their path. - boto3_session: The boto3 session to use. - session_dir: the directory that the session is going to use. - os_env_vars: environment variables to set for launched subprocesses - cas_prefix: The CAS prefix of the files. - on_mount_complete: optional callback invoked with a bool indicating whether - each VFS mount succeeded. Callers can use this for telemetry or logging. - - Returns: - None - """ - if not isinstance(fs_permission_settings, PosixFileSystemPermissionSettings): - raise TypeError("VFS can only be mounted from manifests on posix file systems.") - vfs_cache_dir: Path = session_dir / VFS_CACHE_REL_PATH_IN_SESSION - asset_cache_hash_path: Path = vfs_cache_dir - if cas_prefix is not None: - asset_cache_hash_path = vfs_cache_dir / cas_prefix - _ensure_paths_within_directory(str(vfs_cache_dir), [str(asset_cache_hash_path)]) - - asset_cache_hash_path.mkdir(parents=True, exist_ok=True) - - _set_fs_group([str(asset_cache_hash_path)], str(vfs_cache_dir), fs_permission_settings) - - manifest_dir: Path = session_dir / VFS_MANIFEST_FOLDER_IN_SESSION - manifest_dir.mkdir(parents=True, exist_ok=True) - manifest_dir_permissions = VFS_MANIFEST_FOLDER_PERMISSIONS - manifest_dir_permissions.os_user = fs_permission_settings.os_user - manifest_dir_permissions.os_group = fs_permission_settings.os_group - - _set_fs_group([str(manifest_dir)], str(manifest_dir), manifest_dir_permissions) - - vfs_logs_dir: Path = session_dir / VFS_LOGS_FOLDER_IN_SESSION - vfs_logs_dir.mkdir(parents=True, exist_ok=True) - - _set_fs_group([str(vfs_logs_dir)], str(vfs_logs_dir), fs_permission_settings) - - for mount_point, manifest in manifests_by_root.items(): - # Validate the file paths to see if they are under the given download directory. 
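# (Illustrative: a manifest entry such as "../outside/file.txt" would resolve to a
#  location outside the mount point, so the check below raises PathOutsideDirectoryError
#  instead of attempting to mount it.)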
- _ensure_paths_within_directory( - mount_point, - [path.path for path in manifest.paths], # type: ignore - ) - final_manifest: BaseAssetManifest = handle_existing_vfs( - manifest=manifest, - session_dir=session_dir, - mount_point=mount_point, - os_user=fs_permission_settings.os_user, - ) - - # Write out a temporary file with the contents of the newly merged manifest - manifest_path: str = _write_manifest_to_temp_file(final_manifest, dir=manifest_dir) - - vfs_manager: VFSProcessManager = VFSProcessManager( - s3_bucket, - boto3_session.region_name, - manifest_path, - mount_point, - fs_permission_settings.os_user, - os_env_vars, - getattr(fs_permission_settings, "os_group", ""), - cas_prefix, - str(vfs_cache_dir), - on_mount_complete=on_mount_complete, - ) - vfs_manager.start(session_dir=session_dir) - - -def _ensure_paths_within_directory(root_path: str, paths_relative_to_root: list[str]) -> None: - """ - Validates the given paths to ensure that they are within the given root path. - If the root path is not an absolute path, raises a ValueError. - If any path is not under the root directory, raises an PathOutsideDirectoryError. - """ - if not Path(root_path).is_absolute(): - raise ValueError(f"The provided root path is not an absolute path: {root_path}") - - for path in paths_relative_to_root: - resolved_path = Path(root_path, path).resolve() - if not _is_relative_to(resolved_path, Path(root_path).resolve()): - raise PathOutsideDirectoryError( - f"The provided path is not under the root directory: {path}" - ) - return - - -class OutputDownloader: - """ - Handler for downloading all output files from the given job, with optional step and task-level granularity. - If no session is provided the default credentials path will be used, see: - https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials - - Path mapping for cross-OS downloads is handled at the CLI layer via set_root_path(), - consistent with how queue sync-output applies path mapping externally. - """ - - def __init__( - self, - s3_settings: JobAttachmentS3Settings, - farm_id: str, - queue_id: str, - job_id: str, - step_id: Optional[str] = None, - task_id: Optional[str] = None, - session_action_id: Optional[str] = None, - session: Optional[boto3.Session] = None, - ) -> None: - self.s3_settings = s3_settings - self.session = session - self.outputs_by_root = get_job_output_paths_by_asset_root( - s3_settings=s3_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id=job_id, - step_id=step_id, - task_id=task_id, - session_action_id=session_action_id, - session=session, - ) - - def get_output_paths_by_root(self) -> dict[str, list[str]]: - """ - Returns a dict of asset root paths to lists of output paths. - """ - output_paths_by_root: dict[str, list[str]] = {} - - for root, path_group in self.outputs_by_root.items(): - output_paths_by_root[root] = path_group.get_all_paths() - return output_paths_by_root - - def set_root_path(self, original_root: str, new_root: str) -> None: - """ - Changes the root path for downloading output files, (which is the root path - saved in the S3 metadata for the output manifest by default,) with a custom path. - (It will store the new root path as an absolute path.) - """ - # Need to use absolute to not resolve symlinks, but need normpath to get rid of relative paths, i.e. '..' 
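# (Illustrative: with a working directory of /home/user, Path("../shared/out").absolute()
#  yields "/home/user/../shared/out" without resolving any symlinks, and
#  os.path.normpath() then collapses it to "/home/shared/out".)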
- new_root = str(os.path.normpath(Path(new_root).absolute())) - - if original_root not in self.outputs_by_root: - raise ValueError( - f"The root path {original_root} was not found in output manifests {self.outputs_by_root}." - ) - - if new_root == original_root: - return - - if new_root in self.outputs_by_root: - # If the new_root already exists, and the file path in the original_root already exists - # among the file paths of the new_root, then prefix the file path with the original_root path. - # This is to avoid duplicate file paths in the new_root. - paths_in_new_root = self.outputs_by_root[new_root].get_all_paths() - for manifest_paths in self.outputs_by_root[original_root].files_by_hash_alg.values(): - for manifest_path in manifest_paths: - if manifest_path.path in paths_in_new_root: - new_name_prefix = ( - original_root.replace("/", "_").replace("\\", "_").replace(":", "_") - ) - manifest_path.path = str( - Path(manifest_path.path).with_name( - f"{new_name_prefix}_{manifest_path.path}" - ) - ) - self.outputs_by_root[new_root].combine_with_group(self.outputs_by_root[original_root]) - del self.outputs_by_root[original_root] - else: - self.outputs_by_root = { - key if key != original_root else new_root: value - for key, value in self.outputs_by_root.items() - } - - def download_job_output( - self, - file_conflict_resolution: Optional[ - FileConflictResolution - ] = FileConflictResolution.CREATE_COPY, - on_downloading_files: Optional[Callable[[ProgressReportMetadata], bool]] = None, - ) -> DownloadSummaryStatistics: - """ - Downloads outputs files from S3 bucket to the asset root(s). - - Args: - file_conflict_resolution: resolution method for file conflicts. - on_downloading_files: a callback to be called to periodically report progress to the caller. - The callback returns True if the operation should continue as normal, or False to cancel. - - Returns: - The download summary statistics - """ - # Sets up progress tracker to report download progress back to the caller. - total_bytes: int = 0 - total_files: int = 0 - for path_group in self.outputs_by_root.values(): - total_bytes += path_group.total_bytes - total_files += len(path_group.get_all_paths()) - - progress_tracker = ProgressTracker( - status=ProgressStatus.DOWNLOAD_IN_PROGRESS, - total_files=total_files, - total_bytes=total_bytes, - on_progress_callback=on_downloading_files, - ) - - start_time = time.perf_counter() - downloaded_files_paths_by_root: DefaultDict[str, list[str]] = DefaultDict(list) - - try: - for root, output_path_group in self.outputs_by_root.items(): - for hash_alg, path_list in output_path_group.files_by_hash_alg.items(): - # Validate the file paths to see if they are under the given download directory. - _ensure_paths_within_directory(root, [file.path for file in path_list]) - - downloaded_files_paths = download_files( - files=path_list, - hash_algorithm=hash_alg, - local_download_dir=root, - s3_settings=self.s3_settings, - session=self.session, - progress_tracker=progress_tracker, - file_conflict_resolution=file_conflict_resolution, - ) - downloaded_files_paths_by_root[root].extend(downloaded_files_paths) - except AssetSyncCancelledError: - downloaded_files = progress_tracker.processed_files - raise AssetSyncCancelledError( - "Download cancelled. 
" - f"(Downloaded {downloaded_files} file{'' if downloaded_files == 1 else 's'} before cancellation.)" - ) - - progress_tracker.total_time = time.perf_counter() - start_time - - return progress_tracker.get_download_summary_statistics(downloaded_files_paths_by_root) - - -def _get_manifests_by_session_action_id( - s3_settings: JobAttachmentS3Settings, - farm_id: str, - queue_id: str, - job_id: str, - step_id: str, - task_id: str, - session_action_id: str, - session: Optional[boto3.Session], -) -> dict[str, list[BaseAssetManifest]]: - """ - Get manifests for a specific session action ID by searching S3 paths containing the session action ID. - - When session action ID is known, we don't need to search for the "latest" session action based on - timestamp prefix (which comes from WorkerAgent and can't be trusted). However, we still can't - directly access the outputs because the S3 path contains an unknown timestamp prefix before the - session action ID (e.g., .../step_id/task_id/20241225T120000_sessionaction_id/). - - For task-based paths, searches in the task folder first (...job_id/step_id/task_id/) for efficiency. - This approach is more correct than searching by timestamp prefix since latestSessionActionId is - typically obtained from GetTask API, guaranteeing the latest outputs. - For chunked steps, when no task-based manifests found, falls back to step folder (...job_id/step_id/). - """ - outputs: dict[str, list[BaseAssetManifest]] = defaultdict(list) - - def get_manifests_by_regex(manifest_prefix: str) -> List[str]: - """Get manifest keys matching the session action ID using regex search.""" - all_contents = _list_s3_objects_with_error_handling( - s3_settings.s3BucketName, manifest_prefix, session - ) - manifests_keys = [] - regex_pattern = re.compile(rf".*{re.escape(session_action_id)}.*output.*") - for content in all_contents: - if regex_pattern.search(content["Key"]): - manifests_keys.append(content["Key"]) - return manifests_keys - - # Try task-specific prefix first for efficiency - task_prefix = _get_output_manifest_prefix( - s3_settings, farm_id, queue_id, job_id, step_id, task_id - ) - - manifests_keys: Optional[List[str]] = None - try: - manifests_keys = get_manifests_by_regex(task_prefix) - except JobAttachmentsError: - pass # Unable to find manifests under task prefix. - - # If no manifests found at task level, fall back to step level - if not manifests_keys: - step_prefix = _get_output_manifest_prefix(s3_settings, farm_id, queue_id, job_id, step_id) - try: - manifests_keys = get_manifests_by_regex(step_prefix) - except JobAttachmentsError: - return outputs - - # Download all found manifests - with concurrent.futures.ThreadPoolExecutor(max_workers=S3_DOWNLOAD_MAX_CONCURRENCY) as executor: - futures = [ - executor.submit( - get_asset_root_and_manifest_from_s3, - key, - s3_settings.s3BucketName, - session, - ) - for key in manifests_keys - ] - for i, future in enumerate(futures): - asset_root, manifest = future.result() - if not asset_root: - raise MissingAssetRootError( - f"Failed to get asset root from metadata of output manifest: {manifests_keys[i]}" - ) - outputs[asset_root].append(manifest) - - return outputs diff --git a/src/deadline/job_attachments/exceptions.py b/src/deadline/job_attachments/exceptions.py deleted file mode 100644 index 9e2c3578d..000000000 --- a/src/deadline/job_attachments/exceptions.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
- -""" -Exceptions that the Deadline Job Attachments library can raise. -""" - -from typing import Optional - -COMMON_ERROR_GUIDANCE_FOR_S3 = { - 408: "Request timeout. Please consider retrying later, or ensure your network connection is stable.", - 500: "Internal server error. It might be an issue on AWS's side; please consider retrying later or contacting AWS support.", - 503: "Service unavailable. AWS S3 might be down or experiencing high traffic. Please consider retrying after some time.", -} - - -class AssetSyncError(Exception): - """ - Exception for errors related to synching files to/from S3. - """ - - -class JobAttachmentsError(Exception): - """ - Exception for errors related to the Deadline Service. - """ - - -class JobAttachmentsS3ClientError(AssetSyncError): - """ - Exception for errors related to the S3 client. - """ - - def __init__( - self, - action, - status_code, - bucket_name: str, - key_or_prefix: str, - message: Optional[str] = None, - ) -> None: - self.action = action - self.status_code = status_code - self.bucket_name = bucket_name - self.key_or_prefix = key_or_prefix - - message_parts = [ - f"Error {action} in bucket '{bucket_name}', Target key or prefix: '{key_or_prefix}'", - f"HTTP Status Code: {status_code}", - ] - if message: - message_parts.append(message) - - super().__init__(", ".join(message_parts)) - - -class JobAttachmentS3BotoCoreError(AssetSyncError): - """ - Exception to wrap any botocore.exceptions.BotoCoreError. - """ - - def __init__(self, action: str, error_details: str) -> None: - self.action = action - message = ( - f"An issue occurred with AWS service request while {action}: " - f"{error_details}\n" - "This could be due to temporary issues with AWS, internet connection, or your AWS credentials. " - "Please verify your credentials and network connection. If the problem persists, try again later" - " or contact support for further assistance." - ) - super().__init__(message) - - -class MissingJobAttachmentSettingsError(JobAttachmentsError): - """ - Exception raised when attempting to use Job Attachments but the settings are not set in Queue. - """ - - -class ManifestCreationException(Exception): - """ - Exception for errors related to Creating Manifests. - """ - - -class MissingS3BucketError(JobAttachmentsError): - """ - Exception raised when attempting to use Job Attachments but the S3 bucket is not set in Queue. - """ - - -class MissingS3RootPrefixError(JobAttachmentsError): - """ - Exception raised when attempting to use Job Attachments but the S3 root prefix is not set in Queue. - """ - - -class MalformedAttachmentSettingError(JobAttachmentsError): - """ - Exception raised when encountering error parsing input attachment settings. - """ - - -class AssetOutsideOfRootError(JobAttachmentsError): - """ - Exception for errors related to assets being outside of the asset root. - """ - - -class MisconfiguredInputsError(JobAttachmentsError): - """ - Exception for errors related to missing input directories, empty input directories, - missing input files, and input directories classified as files - """ - - -class ManifestDecodeValidationError(JobAttachmentsError): - """ - Exception for errors related to asset manifest decoding. - """ - - -class MissingManifestError(JobAttachmentsError): - """ - Exception for when trying to retrieve asset manifests that don't exist. - """ - - -class MissingAssetRootError(JobAttachmentsError): - """ - Exception for when trying to retrieve asset root from metadata (in S3) that doesn't exist. 
- """ - - -class AssetSyncCancelledError(JobAttachmentsError): - """ - Exception thrown when an operation (synching files to/from S3) has been cancelled. - """ - - def __init__(self, message, summary_statistics=None): - super().__init__(message) - self.summary_statistics = summary_statistics - - -class PathOutsideDirectoryError(JobAttachmentsError): - """ - Exception thrown in the _ensure_paths_within_directory function to signal that a given - file path, especially ones that may contain "..", does not reside in the specified root path. - """ - - -class VFSExecutableMissingError(JobAttachmentsError): - """ - Exception for when trying to retrieve VFS executable path doesn't exist. - """ - - -class VFSLaunchScriptMissingError(JobAttachmentsError): - """ - Exception for when trying to retrieve VFS launch script path doesn't exist. - """ - - -class VFSFailedToMountError(JobAttachmentsError): - """ - Exception for when trying to mount VFS at a given path. - """ - - -class VFSOSUserNotSetError(JobAttachmentsError): - """ - Exception attempting to use the vfs without an os user - """ - - -class UnsupportedHashingAlgorithmError(JobAttachmentsError): - """ - Exception for when an unsupported hashing algorithm is provided. - """ - - -class VFSRunPathNotSetError(JobAttachmentsError): - """ - Exception for when the run path hasn't been set for the vfs - """ - - -class NonValidInputError(JobAttachmentsError): - """ - Exception for when user input to a Job Attachments function is not valid. - """ diff --git a/src/deadline/job_attachments/models.py b/src/deadline/job_attachments/models.py deleted file mode 100644 index 396c5740b..000000000 --- a/src/deadline/job_attachments/models.py +++ /dev/null @@ -1,586 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -""" -Data classes for AWS objects. 
-""" - -from __future__ import annotations - -import json -import sys -from dataclasses import dataclass, field -from enum import Enum -from pathlib import Path -from typing import Any, List, Optional, Set -from urllib.parse import urlparse - -from deadline.job_attachments.asset_manifests import HashAlgorithm, hash_data -from deadline.job_attachments.asset_manifests.base_manifest import ( - BaseAssetManifest, - BaseManifestPath, -) -from deadline.job_attachments.exceptions import ( - MissingS3RootPrefixError, - MalformedAttachmentSettingError, -) - -from ._utils import ( - _generate_random_guid, - _join_s3_paths, - _float_to_iso_datetime_string, -) - -S3_DATA_FOLDER_NAME = "Data" -S3_MANIFEST_FOLDER_NAME = "Manifests" -S3_INPUT_MANIFEST_FOLDER_NAME = "Inputs" - - -@dataclass -class AssetRootManifest: - """Represents asset manifest and a list of output files grouped under the same root""" - - file_system_location_name: Optional[str] = None - root_path: str = "" - asset_manifest: Optional[BaseAssetManifest] = None - outputs: List[Path] = field(default_factory=list) - - -@dataclass -class AssetRootGroup: - """Represents lists of input files, output files and path references grouped under the same root""" - - file_system_location_name: Optional[str] = None - root_path: str = "" - inputs: Set[Path] = field(default_factory=set) - outputs: Set[Path] = field(default_factory=set) - references: Set[Path] = field(default_factory=set) - - -@dataclass -class AssetUploadGroup: - """Represents all of the information needed to prepare to upload assets""" - - asset_groups: List[AssetRootGroup] = field(default_factory=list) - known_asset_paths: List[Path] = field(default_factory=list) - """List of paths that should not generate warnings""" - total_input_files: int = 0 - total_input_bytes: int = 0 - - -@dataclass -class ManifestPathGroup: - """ - Represents paths combined from multiple manifests under the same root path, organized by hash algorithm. - """ - - total_bytes: int = 0 - files_by_hash_alg: dict[HashAlgorithm, List[BaseManifestPath]] = field(default_factory=dict) - - def add_manifest_to_group(self, manifest: BaseAssetManifest) -> None: - if manifest.hashAlg not in self.files_by_hash_alg: - self.files_by_hash_alg[manifest.hashAlg] = manifest.paths - else: - self.files_by_hash_alg[manifest.hashAlg].extend(manifest.paths) - self.total_bytes += manifest.totalSize # type: ignore[attr-defined] - - def combine_with_group(self, group: ManifestPathGroup) -> None: - """Adds the content of the given ManifestPathGroup to this ManifestPathGroup""" - for hash_alg, paths in group.files_by_hash_alg.items(): - if hash_alg not in self.files_by_hash_alg: - self.files_by_hash_alg[hash_alg] = paths - else: - self.files_by_hash_alg[hash_alg].extend(paths) - self.total_bytes += group.total_bytes - - def get_all_paths(self) -> list[str]: - """ - Get all paths in this group, regardless of hashing algorithm. - Note that this may include duplicates if the same path exists for multiple hashing algorithms. - - Returns a sorted list of paths represented as strings. 
- """ - path_list: List[str] = [] - for paths in self.files_by_hash_alg.values(): - path_list.extend([path.path for path in paths]) - return sorted(path_list) - - -@dataclass -class OutputFile: - """Files for output""" - - # File size in Bytes - file_size: int - file_hash: str - rel_path: str - full_path: str - s3_key: str - # If the file already exists in the CAS - in_s3: bool - # The base directory path against which file paths are containment-checked - base_dir: Optional[str] - - -class StorageProfileOperatingSystemFamily(str, Enum): - """Case-insensitive enum for the storage profile operating system family type.""" - - WINDOWS = "windows" - LINUX = "linux" - MACOS = "macos" - - @classmethod - def _missing_(cls, value): - value = value.lower() - for member in cls: - if member == value: - return member - return None - - @classmethod - def get_host_os_family(cls) -> StorageProfileOperatingSystemFamily: - """Get the current path format.""" - if sys.platform.startswith("win"): - return cls.WINDOWS - if sys.platform.startswith("darwin"): - return cls.MACOS - if sys.platform.startswith("linux"): - return cls.LINUX - else: - raise NotImplementedError(f"Operating system {sys.platform} is not supported.") - - -class AssetType(str, Enum): - INPUT = "input" - OUTPUT = "output" - ALL = "all" - - -class PathFormat(str, Enum): - WINDOWS = "windows" - POSIX = "posix" - - @classmethod - def get_host_path_format(cls) -> PathFormat: - """Get the current path format.""" - if sys.platform.startswith("win"): - return cls.WINDOWS - if sys.platform.startswith("darwin") or sys.platform.startswith("linux"): - return cls.POSIX - else: - raise NotImplementedError(f"Operating system {sys.platform} is not supported.") - - @classmethod - def get_host_path_format_string(cls) -> str: - """Get a string of the current path format.""" - return cls.get_host_path_format().value - - -# Behavior to adopt when loading job assets -class JobAttachmentsFileSystem(str, Enum): - # Load all assets at before execution of the job code - COPIED = "COPIED" - # Start job execution immediately and load assets as needed - VIRTUAL = "VIRTUAL" - - -@dataclass -class ManifestProperties: - """The assets for a Step under an asset root""" - - # The path that assets were relative to on submitting machine - rootPath: str - # Used for path mapping. - rootPathFormat: PathFormat - # If submitting machine has a 'Local' Storage Profile and files are relative - # to any of its 'Asset Roots', the Asset Root Path will be used below. - # Otherwise, the dynamic Job Attachments root path will be used and this will be empty. - fileSystemLocationName: Optional[str] = field(default=None) # type: ignore - # An S3 (object) key that points to a file manifest location. - # Optional as we may not need inputs if everything is embedded in the Job Template. - inputManifestPath: Optional[str] = field(default=None) # type: ignore - # The hash of the manifest, for data provenance - inputManifestHash: Optional[str] = field(default=None) # type: ignore - # The expected output directories to search for outputs. Relative to the rootPath. 
- outputRelativeDirectories: Optional[List[str]] = field(default=None) - - def to_dict(self) -> dict[str, Any]: - result: dict[str, Any] = {"rootPath": self.rootPath} - if self.fileSystemLocationName: - result["fileSystemLocationName"] = self.fileSystemLocationName - result["rootPathFormat"] = self.rootPathFormat.value - if self.inputManifestPath: - result["inputManifestPath"] = self.inputManifestPath - if self.inputManifestHash: - result["inputManifestHash"] = self.inputManifestHash - if self.outputRelativeDirectories: - result["outputRelativeDirectories"] = self.outputRelativeDirectories - return result - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> "ManifestProperties": - """Create ManifestProperties from a dictionary.""" - return cls( - rootPath=data["rootPath"], - rootPathFormat=PathFormat(data["rootPathFormat"]), - fileSystemLocationName=data.get("fileSystemLocationName"), - inputManifestPath=data.get("inputManifestPath"), - inputManifestHash=data.get("inputManifestHash"), - outputRelativeDirectories=data.get("outputRelativeDirectories"), - ) - - def as_output_metadata(self) -> dict[str, dict[str, str]]: - """ - Generate S3 metadata for output manifest uploads. - - Creates metadata dictionary containing asset root path and optional file system location. - Handles non-ASCII characters in paths by JSON-encoding them with ASCII-safe format. - - Returns: - dict[str, str]: S3 metadata dictionary with 'Metadata' key containing: - - 'asset-root': ASCII-compatible root path, or JSON-encoded root path for non-ASCII paths - - 'asset-root-json': JSON-encoded root path for non-ASCII paths - - 'file-system-location-name': Optional file system location name - """ - metadata: dict[str, str] = {} - try: - # Set 'asset-root' metadata as the path if the path is ASCII - self.rootPath.encode(encoding="ascii") - metadata["asset-root"] = self.rootPath - except UnicodeEncodeError: - # S3 metadata must be ASCII - # Add both 'asset-root' and 'asset-root-json' metadata encoded to ASCII as a JSON string - # Populate both fileds for backward compatibility - json_root_path = json.dumps(self.rootPath, ensure_ascii=True) - metadata["asset-root-json"] = json_root_path - metadata["asset-root"] = json_root_path - if self.fileSystemLocationName: - metadata["file-system-location-name"] = self.fileSystemLocationName - - return {"Metadata": metadata} - - -@dataclass -class Attachments: - """An object that holds the job attachments for a Job""" - - # The list of required assets per asset root - manifests: List[ManifestProperties] = field(default_factory=list) - # Method to use when loading assets required for a job - fileSystem: str = JobAttachmentsFileSystem.COPIED.value - - def to_dict(self) -> dict[str, Any]: - return { - "manifests": [manifest.to_dict() for manifest in self.manifests], - "fileSystem": self.fileSystem, - } - - -@dataclass -class JobAttachmentS3Settings: - """S3-specific Job Attachment settings, configured at the Queue level.""" - - # The S3 bucket all attachments are stored in. (required) - s3BucketName: str # pylint: disable=invalid-name - # The S3 bucket prefix all files are stored relative to. (required) - rootPrefix: str # pylint: disable=invalid-name - - @staticmethod - def from_root_path(root_path: str) -> JobAttachmentS3Settings: - path_split: list = root_path.split("/") - - if len(path_split) < 2: - raise MalformedAttachmentSettingError( - "Invalid root path format, should be s3BucketName/rootPrefix." 
- ) - - return JobAttachmentS3Settings(path_split[0], "/".join(path_split[1:])) - - @staticmethod - def from_s3_root_uri(uri: str) -> JobAttachmentS3Settings: - res = urlparse(uri) - - if not res.netloc or not res.path[1:] or res.scheme != "s3": - raise MalformedAttachmentSettingError( - "Invalid root uri format, should be s3://s3BucketName/rootPrefix." - ) - - return JobAttachmentS3Settings(res.netloc, res.path[1:]) - - def to_root_path(self) -> str: - return _join_s3_paths(self.s3BucketName, self.rootPrefix) - - def to_s3_root_uri(self) -> str: - return f"s3://{self.to_root_path()}" - - def full_cas_prefix(self) -> str: - self._validate_root_prefix() - return _join_s3_paths(self.rootPrefix, S3_DATA_FOLDER_NAME) - - def full_job_output_prefix(self, farm_id, queue_id, job_id) -> str: - self._validate_root_prefix() - return _join_s3_paths(self.rootPrefix, S3_MANIFEST_FOLDER_NAME, farm_id, queue_id, job_id) - - def full_step_output_prefix(self, farm_id, queue_id, job_id, step_id) -> str: - self._validate_root_prefix() - return _join_s3_paths( - self.rootPrefix, S3_MANIFEST_FOLDER_NAME, farm_id, queue_id, job_id, step_id - ) - - def full_task_output_prefix(self, farm_id, queue_id, job_id, step_id, task_id) -> str: - self._validate_root_prefix() - return _join_s3_paths( - self.rootPrefix, S3_MANIFEST_FOLDER_NAME, farm_id, queue_id, job_id, step_id, task_id - ) - - def full_output_prefix( - self, farm_id, queue_id, job_id, step_id, task_id, session_action_id - ) -> str: - self._validate_root_prefix() - return _join_s3_paths( - self.rootPrefix, - S3_MANIFEST_FOLDER_NAME, - farm_id, - queue_id, - job_id, - step_id, - task_id, - session_action_id, - ) - - @staticmethod - def partial_session_action_manifest_prefix( - farm_id: str, - queue_id: str, - job_id: str, - step_id: str, - task_id: str, - session_action_id: str, - time: float, - ) -> str: - """ - Constructs the partial S3 prefix for storing session action output manifests. - - This method creates a hierarchical path structure for organizing output manifests in S3, - following the pattern: farm_id/queue_id/job_id/step_id/task_id/timestamp_session_action_id. - The timestamp is converted from a float to an ISO datetime string format. - """ - return _join_s3_paths( - farm_id, - queue_id, - job_id, - step_id, - task_id, - f"{_float_to_iso_datetime_string(time)}_{session_action_id}", - ) - - @staticmethod - def partial_session_action_manifest_prefix_without_task( - farm_id: str, - queue_id: str, - job_id: str, - step_id: str, - session_action_id: str, - time: float, - ) -> str: - """ - Constructs the partial S3 prefix for storing session action output manifests. - - This method creates a hierarchical path structure for organizing output manifests in S3, - following the pattern: farm_id/queue_id/job_id/step_id/timestamp_session_action_id. - The timestamp is converted from a float to an ISO datetime string format. - """ - return _join_s3_paths( - farm_id, - queue_id, - job_id, - step_id, - f"{_float_to_iso_datetime_string(time)}_{session_action_id}", - ) - - def partial_manifest_prefix(self, farm_id, queue_id) -> str: - guid = _generate_random_guid() - return _join_s3_paths( - farm_id, - queue_id, - S3_INPUT_MANIFEST_FOLDER_NAME, - guid, - ) - - def add_root_and_manifest_folder_prefix(self, path: str) -> str: - """ - Adds “{self.rootPrefix}/{S3_MANIFEST_FOLDER_NAME}/” to the beginning - of the path and returns it. 
- """ - self._validate_root_prefix() - return _join_s3_paths(self.rootPrefix, S3_MANIFEST_FOLDER_NAME, path) - - def _validate_root_prefix(self) -> None: - if not self.rootPrefix: - raise MissingS3RootPrefixError("Missing S3 root prefix") - - -@dataclass -class Fleet: - """DataClass to store fleet objects""" - - fleetId: str # pylint: disable=invalid-name - priority: int - - -@dataclass -class Queue: - """DataClass to store queue objects""" - - queueId: str # pylint: disable=invalid-name - displayName: str - farmId: str # pylint: disable=invalid-name - status: str - defaultBudgetAction: str - jobAttachmentSettings: Optional[JobAttachmentS3Settings] = None # pylint: disable=invalid-name - - -@dataclass -class Job: - """A non-exhaustive DataClass to store job objects""" - - jobId: str - attachments: Optional[Attachments] = None # pylint: disable=invalid-name - - -@dataclass -class StorageProfile: - """DataClass to store Storage Profile For Queue objects""" - - storageProfileId: str - displayName: str - osFamily: StorageProfileOperatingSystemFamily - fileSystemLocations: List[FileSystemLocation] = field(default_factory=list) # type: ignore - - def to_dict(self) -> dict[str, Any]: - return { - "storageProfileId": self.storageProfileId, - "displayName": self.displayName, - "osFamily": self.osFamily.value, - "fileSystemLocations": [item.to_dict() for item in self.fileSystemLocations], - } - - -@dataclass -class FileSystemLocation: - """DataClass to store File System Location objects""" - - name: str - path: str - type: FileSystemLocationType - - def to_dict(self) -> dict[str, Any]: - return {"name": self.name, "path": self.path, "type": self.type.value} - - -class FileSystemLocationType(str, Enum): - SHARED = "SHARED" - LOCAL = "LOCAL" - - -class FileConflictResolution(Enum): - NOT_SELECTED = 0 - SKIP = 1 - OVERWRITE = 2 - CREATE_COPY = 3 - - -def default_glob_all() -> List[str]: - return ["**/*"] - - -@dataclass -class GlobConfig: - """Include and Exclude configuration for glob input files""" - - include_glob: List[str] = field(default_factory=default_glob_all) - exclude_glob: List[str] = field(default_factory=list) - - INCLUDE = "include" - EXCLUDE = "exclude" - - -@dataclass -class ManifestSnapshot: - """Data structure to store the results of a manifest snapshot""" - - root: str - manifest: str - - -@dataclass -class ManifestDiff: - """Data structure to store new, modified, or deleted files when comparing manifest to a local file system""" - - new: List[str] = field(default_factory=list) - modified: List[str] = field(default_factory=list) - deleted: List[str] = field(default_factory=list) - - -@dataclass -class ManifestDownload: - """Data structure to store the S3 and local paths of a manifest""" - - manifest_root: str = field(default_factory=str) - local_manifest_path: str = field(default_factory=str) - - -@dataclass -class ManifestMerge: - """Data structure to store the S3 and local paths of a manifest""" - - manifest_root: str = field(default_factory=str) - local_manifest_path: str = field(default_factory=str) - - -@dataclass -class ManifestDownloadResponse: - """Data structure to capture the response for manifest download""" - - downloaded: list[ManifestDownload] = field(default_factory=list) - - -@dataclass -class PathMappingRule: - source_path_format: str - """The path format associated with the source path (windows vs posix)""" - - source_path: str - """The path we're looking to change""" - - destination_path: str - """The path to transform the source path to""" - - def 
get_hashed_source_path(self, hash_alg: HashAlgorithm) -> str: - return hash_data(self.source_path.encode("utf-8"), hash_alg) - - -class FileStatus(Enum): - """ - Status of local files compared to manifest listed files, comparing hash and time modfied - """ - - UNCHANGED = 0 - NEW = 1 - MODIFIED = 2 - DELETED = 3 - - -@dataclass -class UploadManifestInfo: - """ - Structured class for output manifest information. - - Attributes: - output_manifest_path: The relative path to the uploaded output manifest without root prefix - output_manifest_hash: The hash of the output manifest content - source_path: Optional source path from the mapping rule - """ - - output_manifest_path: str - output_manifest_hash: str - source_path: Optional[str] = None diff --git a/src/deadline/job_attachments/os_file_permission.py b/src/deadline/job_attachments/os_file_permission.py deleted file mode 100644 index 47211fa45..000000000 --- a/src/deadline/job_attachments/os_file_permission.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -import os -from pathlib import Path -import shutil -import sys -from dataclasses import dataclass -from enum import Enum -from typing import List, Set, Union - -from .exceptions import AssetSyncError, PathOutsideDirectoryError -from ._utils import _is_relative_to, _normalize_windows_path - - -@dataclass -class PosixFileSystemPermissionSettings: - """ - A dataclass representing file system permission-related information - for Posix. The specified permission modes will be bitwise-OR'ed with - the directory or file's existing permissions. - - Attributes: - os_user (str): The target operating system user for ownership. - os_group (str): The target operating system group for ownership. - dir_mode (int): The permission mode to be added to directories. - file_mode (int): The permission mode to be added to files. - """ - - os_user: str - os_group: str - dir_mode: int - file_mode: int - - -@dataclass -class WindowsPermissionEnum(Enum): - """ - An enumeration of different Windows permission flags. - """ - - READ = "READ" - WRITE = "WRITE" - EXECUTE = "EXECUTE" - READ_WRITE = "READ_WRITE" - FULL_CONTROL = "FULL_CONTROL" - - -@dataclass -class WindowsFileSystemPermissionSettings: - """ - A dataclass representing file system permission-related information - for Windows. - - Attributes: - os_user (str): The target operating system user or ownership. - os_group (str): The target operating system group for ownership. - dir_mode (WindowsPermissionEnum): The permission mode to be added to directories. - file_mode (WindowsPermissionEnum): The permission mode to be added to files. - """ - - os_user: str - dir_mode: WindowsPermissionEnum - file_mode: WindowsPermissionEnum - - -# A union of different file system permission settings that are based on the underlying OS. -FileSystemPermissionSettings = Union[ - PosixFileSystemPermissionSettings, WindowsFileSystemPermissionSettings -] - - -def _set_fs_group_for_posix( - file_paths: List[str], - local_root: str, - fs_permission_settings: PosixFileSystemPermissionSettings, -) -> None: - os_group = fs_permission_settings.os_group - dir_mode = fs_permission_settings.dir_mode - file_mode = fs_permission_settings.file_mode - - # A set that stores the unique directory paths where permissions need to be changed. - dir_paths_to_change_fs_group: Set[Path] = set() - - # 1. Set group ownership and permissions for each file. - for file_path_str in file_paths: - # The file path must be relative to the root path (ie. 
local_root). - if not _is_relative_to(file_path_str, local_root): - raise PathOutsideDirectoryError( - f"The provided path '{file_path_str}' is not under the root directory: {local_root}" - ) - - _change_permission_for_posix(file_path_str, os_group, file_mode) - - # Add the parent directories of each file to the set of directories whose - # group ownership and permissions will be changed. - path_components = Path(file_path_str).relative_to(local_root).parents - for path_component in path_components: - path_to_change = Path(local_root).joinpath(path_component) - dir_paths_to_change_fs_group.add(path_to_change) - - # 2. Set group ownership and permissions for the directories in the path starting from root. - for dir_path in dir_paths_to_change_fs_group: - _change_permission_for_posix(str(dir_path), os_group, dir_mode) - - -def _set_fs_permission_for_windows( - file_paths: List[str], - local_root: str, - fs_permission_settings: WindowsFileSystemPermissionSettings, -) -> None: - os_user = fs_permission_settings.os_user - dir_mode = fs_permission_settings.dir_mode - file_mode = fs_permission_settings.file_mode - - # A set that stores the unique directory paths where permissions need to be changed. - dir_paths_to_change_fs_group: Set[Path] = set() - - # 1. Set permissions for each file. - for file_path_str in file_paths: - # The file path must be relative to the root path (ie. local_root). - if not _is_relative_to(file_path_str, local_root): - raise PathOutsideDirectoryError( - f"The provided path '{file_path_str}' is not under the root directory: {local_root}" - ) - - _change_permission_for_windows(file_path_str, os_user, file_mode) - - # Add the parent directories of each file to the set of directories whose - # permissions will be changed. - path_components = ( - _normalize_windows_path(file_path_str) - .relative_to(_normalize_windows_path(local_root)) - .parents - ) - for path_component in path_components: - path_to_change = Path(local_root).joinpath(path_component) - dir_paths_to_change_fs_group.add(path_to_change) - - # 2. Set permissions for the directories in the path starting from root. - for dir_path in dir_paths_to_change_fs_group: - _change_permission_for_windows(str(dir_path), os_user, dir_mode) - - -def _change_permission_for_posix( - path_str: str, - os_group: str, - mode: int, -) -> None: - if sys.platform == "win32": - raise EnvironmentError("This function can only be executed on POSIX systems.") - - path = Path(path_str) - shutil.chown(path, group=os_group) - os.chmod(path, path.stat().st_mode | mode) - - -def _change_permission_for_windows( - path: str, - os_user: str, - mode: WindowsPermissionEnum, -) -> None: - if sys.platform != "win32": - raise EnvironmentError("This function can only be executed on Windows systems.") - - import win32security - - try: - con_mode = _get_ntsecuritycon_mode(mode) - # Lookup the user's SID (Security Identifier) - user_sid = win32security.LookupAccountName(None, os_user)[0] - # Get existing DACL (Discretionary Access Control List). If dacl is none, create a new one. 
- sd = win32security.GetFileSecurity(path, win32security.DACL_SECURITY_INFORMATION) - dacl = sd.GetSecurityDescriptorDacl() - if dacl is None: - dacl = win32security.ACL() - # Add new ACE (Access Control Entry) - dacl.AddAccessAllowedAce(win32security.ACL_REVISION, con_mode, user_sid) - # Set the modified DACL to the security descriptor - sd.SetSecurityDescriptorDacl(1, dacl, 0) - win32security.SetFileSecurity(path, win32security.DACL_SECURITY_INFORMATION, sd) - except win32security.error as e: - raise AssetSyncError( - f"Failed to set permissions for file or directory ({path}): {e}" - ) from e - - -def _get_ntsecuritycon_mode(mode: WindowsPermissionEnum) -> int: - """ - Get the NTSecurityCon mode for a WindowsPermissionEnum. - """ - if sys.platform != "win32": - raise EnvironmentError("This function can only be executed on Windows systems.") - - import ntsecuritycon as con - - permission_mapping = { - WindowsPermissionEnum.READ.value: con.FILE_GENERIC_READ, - WindowsPermissionEnum.WRITE.value: con.FILE_GENERIC_WRITE, - WindowsPermissionEnum.EXECUTE.value: con.FILE_GENERIC_EXECUTE, - WindowsPermissionEnum.READ_WRITE.value: con.FILE_GENERIC_READ | con.FILE_GENERIC_WRITE, - WindowsPermissionEnum.FULL_CONTROL.value: con.FILE_ALL_ACCESS, - } - return permission_mapping[mode.value] diff --git a/src/deadline/job_attachments/progress_tracker.py b/src/deadline/job_attachments/progress_tracker.py deleted file mode 100644 index 22c10c973..000000000 --- a/src/deadline/job_attachments/progress_tracker.py +++ /dev/null @@ -1,388 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -from __future__ import annotations -from collections import Counter -from logging import Logger, LoggerAdapter - -import time -from dataclasses import asdict, dataclass, field -from enum import Enum -from threading import Lock -from typing import Callable, Dict, List, Optional, Union - -from ._path_summarization import human_readable_file_size - -CALLBACK_INTERVAL = 1 # in seconds -MAX_FILES_IN_CHUNK = 50 -LOG_INTERVAL = 300 # in seconds -LOG_PERCENTAGE_THRESHOLD = 10 # in percentage - - -@dataclass -class SummaryStatistics: - """ - A summary statistics metadata to be returned to the client when processing of files - (hashing or uploading) has completed. - The `skipped_files` refers to: - - if this statistics is for hashing operation: the number of files whose hashing is - skipped by the hash cache. - - if this statistics is for uploading operation: the number of files that have already - been uploaded to S3 bucket and thus skipped uploading. - """ - - total_time: float = 0.0 # time (in fractional seconds) taken to perform hashing or uploading - total_files: int = 0 - total_bytes: int = 0 - processed_files: int = 0 - processed_bytes: int = 0 - skipped_files: int = 0 - skipped_bytes: int = 0 - transfer_rate: float = 0.0 # bytes/second - - def aggregate(self, other: SummaryStatistics) -> SummaryStatistics: - """ - Aggregates other object of SummaryStatistics to this. 
- """ - if not isinstance(other, self.__class__): - raise TypeError("Only instances of the same type can be aggregated.") - self.total_time += other.total_time - self.total_files += other.total_files - self.total_bytes += other.total_bytes - self.processed_files += other.processed_files - self.processed_bytes += other.processed_bytes - self.skipped_files += other.skipped_files - self.skipped_bytes += other.skipped_bytes - self.transfer_rate = self.processed_bytes / self.total_time if self.total_time else 0.0 - - return self - - def __str__(self): - return ( - f"Processed {self.processed_files} file{'' if self.processed_files == 1 else 's'}" - + f" totaling {human_readable_file_size(self.processed_bytes)}.\n" - + f"Skipped re-processing {self.skipped_files} files totaling" - + f" {human_readable_file_size(self.skipped_bytes)}.\n" - + f"Total processing time of {round(self.total_time, ndigits=5)} seconds" - + f" at {human_readable_file_size(int(self.transfer_rate))}/s.\n" - ) - - -@dataclass -class DownloadSummaryStatistics(SummaryStatistics): - """ - A summary statistics metadata to be returned to the client when the downloading files has - completed. In addition to the general statistics, includes a dict mapping download locations - to the number of downloaded files in each of those locations. - """ - - file_counts_by_root_directory: Dict[str, int] = field(default_factory=dict) - downloaded_files: List[str] = field(default_factory=list) - - def aggregate(self, other: SummaryStatistics) -> SummaryStatistics: - """ - Aggregates other object of DownloadSummaryStatistics to this. - """ - super().aggregate(other) - if not hasattr(other, "file_counts_by_root_directory"): - raise TypeError( - f"{other.__class__.__name__} does not have a file_counts_by_root_directory field." - ) - else: - self.file_counts_by_root_directory = dict( - Counter(self.file_counts_by_root_directory) - + Counter(other.file_counts_by_root_directory) - ) - - return self - - def convert_to_summary_statistics(self) -> SummaryStatistics: - """ - Converts this DownloadSummaryStatistics to a SummaryStatistics. - """ - download_summary_statistics_dict = asdict(self) - del download_summary_statistics_dict["file_counts_by_root_directory"] - del download_summary_statistics_dict["downloaded_files"] - return SummaryStatistics(**download_summary_statistics_dict) - - -class ProgressStatus(Enum): - """ - Represents the current stage of asset/file processing - """ - - NONE = ("NONE", "") - """The asset manager is not assigned any work.""" - - PREPARING_IN_PROGRESS = ("PREPARING_IN_PROGRESS", "Processed") - """The asset manager is hashing files.""" - - UPLOAD_IN_PROGRESS = ("UPLOAD_IN_PROGRESS", "Uploaded") - """The asset manager is uploading files.""" - - DOWNLOAD_IN_PROGRESS = ("DOWNLOAD_IN_PROGRESS", "Downloaded") - """Downloading files""" - - SNAPSHOT_IN_PROGRESS = ("SNAPSHOT_IN_PROGRESS", "Snapshotted") - """Snapshotting files""" - - def __init__(self, title, verb_in_message): - self.title = title - self.verb_in_message = verb_in_message - - -@dataclass -class ProgressReportMetadata: - """ - A metadata (with defined key-value pairs) about the progress to be reported - back to client during file upload/downloads. Within this metadata will be - a status message and progress(%) of the hashing, uploads or downloads of files. 
- """ - - status: ProgressStatus - progress: float # percentage with one decimal place - transferRate: float # bytes/second - progressMessage: str # pylint: disable=invalid-name - processedFiles: int # number of files completed - - -@dataclass -class ProgressTracker: - """ - A class that records the progress of file processing, and reports the - progress data back to the client using callbacks passed from the client. - The process is one of the following - hashing, uploading, or downloading. - """ - - def __init__( - self, - status: ProgressStatus, - total_files: int, - total_bytes: int, - on_progress_callback: Optional[Callable[[ProgressReportMetadata], bool]] = None, - callback_interval: int = CALLBACK_INTERVAL, - max_files_in_chunk: int = MAX_FILES_IN_CHUNK, - logger: Optional[Union[Logger, LoggerAdapter]] = None, - log_interval: int = LOG_INTERVAL, - log_percentage_threshold: int = LOG_PERCENTAGE_THRESHOLD, - ) -> None: - def do_nothing(*args, **kwargs) -> bool: - return True - - if not on_progress_callback: - on_progress_callback = do_nothing - - self.on_progress_callback = on_progress_callback - self.continue_reporting = True - - self.reporting_interval = callback_interval - self.reporting_files_per_chunk = 1 - self.max_files_in_chunk = max_files_in_chunk - self.completed_files_in_chunk = 0 - self.last_report_time: Optional[float] = None - self.last_report_processed_bytes: int = 0 - - self.logger = logger - self.log_interval = log_interval - self.log_percentage_threshold = log_percentage_threshold - self.last_logged_time: Optional[float] = None - self.last_logged_completed_bytes: int = 0 - - self.status = status - self.total_files = total_files - self.total_bytes = total_bytes - if self.total_files >= self.max_files_in_chunk: - self.reporting_files_per_chunk = self.max_files_in_chunk - self.processed_files = 0 - self.processed_bytes = 0 - self.skipped_files = 0 - self.skipped_bytes = 0 - self.total_time = 0.0 # total time (in fractional seconds) taken for the process - - self._lock = Lock() - - def track_progress(bytes_amount: int, current_file_done: Optional[bool] = False) -> bool: - """ - When uploading or downloading files using boto3, pass this to the `Callback` argument - so that the progress can be updated with the amount of bytes processed. - """ - with self._lock: - self._initialize_timestamps_if_none() - self.processed_bytes += bytes_amount - if current_file_done: - self.processed_files += 1 - self.completed_files_in_chunk += 1 - # Logs progress message to the logger (if exists) - self._log_progress_message() - # Invokes the callback with current progress data - return self._report_progress() - - self.track_progress_callback = track_progress - - def set_total_files(self, total_files, total_bytes) -> None: - """ - Stores the number and size of files to be processed. - """ - self.total_files = total_files - self.total_bytes = total_bytes - if self.total_files >= self.max_files_in_chunk: - self.reporting_files_per_chunk = self.max_files_in_chunk - - def _initialize_timestamps_if_none(self) -> None: - """ - This is to initialize the `last_report_time` and `last_logged_time` to the - current time in alignment with the start of process (i.e., hashing, uploading, - or downloading.) These are used to calculate the current hashing or transfer - rate for the first progress report, which is the first callback invocation or - the first logging. 
- """ - current_time = time.perf_counter() - if self.last_report_time is None: - self.last_report_time = current_time - if self.last_logged_time is None: - self.last_logged_time = current_time - - def increase_processed(self, num_files: int = 1, file_bytes: int = 0) -> None: - """ - Adds the number and size of processed files. - """ - with self._lock: - self._initialize_timestamps_if_none() - self.processed_files += num_files - self.completed_files_in_chunk += num_files - self.processed_bytes += file_bytes - - def increase_skipped(self, num_files: int = 1, file_bytes: int = 0) -> None: - """ - Adds the number and size of skipped files. - """ - with self._lock: - self.skipped_files += num_files - self.completed_files_in_chunk += num_files - self.skipped_bytes += file_bytes - - def _report_progress(self) -> bool: - """ - Invokes the callback with current progress metadata in one of the following cases: - 1. when a specific time interval has passed since the last call, (or since the - process started,) or - 2. when a specific number of files (a chunk) has been processed, or - 3. when the progress is 100%, (including when all files were skipped during the process.) - - Sets the flag `continue_reporting` True if the operation should continue as normal, - or False to cancel, and returns the flag. - """ - if not self.continue_reporting: - return False - - current_time = time.perf_counter() - if ( - self.last_report_time is None - or current_time - self.last_report_time >= self.reporting_interval - or self.completed_files_in_chunk >= self.reporting_files_per_chunk - or self.processed_files + self.skipped_files == self.total_files - ): - self.continue_reporting = self.on_progress_callback( - self._get_progress_report_metadata() - ) - self.last_report_processed_bytes = self.processed_bytes - self.last_report_time = current_time - self.completed_files_in_chunk = 0 - return self.continue_reporting - - def report_progress(self) -> bool: - with self._lock: - return self._report_progress() - - def _get_progress_report_metadata(self) -> ProgressReportMetadata: - completed_bytes = self.processed_bytes + self.skipped_bytes - percentage = round( - completed_bytes / self.total_bytes * 100 if self.total_bytes > 0 else 0, 1 - ) - seconds_since_last_report = round( - time.perf_counter() - self.last_report_time if self.last_report_time else 0, 2 - ) - transfer_rate = ( - (self.processed_bytes - self.last_report_processed_bytes) / seconds_since_last_report - if seconds_since_last_report > 0 - else 0 - ) - transfer_rate_name = "Transfer rate" - if self.status == ProgressStatus.PREPARING_IN_PROGRESS: - transfer_rate_name = "Hashing speed" - - progress_message = ( - f"{self.status.verb_in_message}" - f" {human_readable_file_size(completed_bytes)} / {human_readable_file_size(self.total_bytes)}" - f" of {self.total_files} file{'' if self.total_files == 1 else 's'}" - f" ({transfer_rate_name}: {human_readable_file_size(int(transfer_rate))}/s)" - ) - - return ProgressReportMetadata( - status=self.status, - progress=percentage, - transferRate=transfer_rate, - progressMessage=progress_message, - processedFiles=self.processed_files, - ) - - def get_summary_statistics(self) -> SummaryStatistics: - """ - Returns the summary statistics of hashing or upload operation. 
- """ - transfer_rate = self.processed_bytes / self.total_time if self.total_time else 0.0 - - return SummaryStatistics( - total_time=self.total_time, - total_files=self.total_files, - total_bytes=self.total_bytes, - processed_files=self.processed_files, - processed_bytes=self.processed_bytes, - skipped_files=self.skipped_files, - skipped_bytes=self.skipped_bytes, - transfer_rate=transfer_rate, - ) - - def get_download_summary_statistics( - self, - downloaded_files_paths_by_root: dict[str, list[str]], - ) -> DownloadSummaryStatistics: - """ - Returns the summary statistics of download operation. - """ - summary_statistics_dict = asdict(self.get_summary_statistics()) - summary_statistics_dict["file_counts_by_root_directory"] = { - root: len(paths) for root, paths in downloaded_files_paths_by_root.items() - } - all_files = [] - for _, paths in downloaded_files_paths_by_root.items(): - all_files.extend(paths) - summary_statistics_dict["downloaded_files"] = sorted(all_files) - return DownloadSummaryStatistics(**summary_statistics_dict) - - def _log_progress_message(self) -> None: - """ - Logs progress message to the logger (if exists) on specific conditions: - 1. when the `log_interval` time has passed since the last call, (or since the - process started,) or, - 2. when the tracked progress percentage difference from the last log exceeds - the `log_percentage_threshold`, or - 3. when the process is fully completed (progress percentage reaches 100%). - """ - if self.logger is None: - return - - current_time = time.perf_counter() - current_completed_bytes = self.processed_bytes + self.skipped_bytes - progress_difference = ( - (current_completed_bytes - self.last_logged_completed_bytes) / self.total_bytes * 100 - ) - - if ( - self.last_logged_time is None - or current_time - self.last_logged_time >= self.log_interval - or progress_difference >= self.log_percentage_threshold - or self.processed_files + self.skipped_files == self.total_files - ): - self.logger.info(self._get_progress_report_metadata().progressMessage) - self.last_logged_completed_bytes = current_completed_bytes - self.last_logged_time = current_time diff --git a/src/deadline/job_attachments/py.typed b/src/deadline/job_attachments/py.typed deleted file mode 100644 index 7ef21167c..000000000 --- a/src/deadline/job_attachments/py.typed +++ /dev/null @@ -1 +0,0 @@ -# Marker file that indicates this package supports typing diff --git a/src/deadline/job_attachments/upload.py b/src/deadline/job_attachments/upload.py deleted file mode 100644 index d67e74e26..000000000 --- a/src/deadline/job_attachments/upload.py +++ /dev/null @@ -1,1681 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -""" -Classes for handling uploading of assets. 
-""" - -from __future__ import annotations - -import concurrent.futures -import random -from contextlib import contextmanager -import errno -import logging -import os -import stat -import sys -import time -from datetime import datetime -from functools import lru_cache -from io import BufferedReader, BytesIO -from math import trunc -from pathlib import Path, PurePath -from typing import Any, Callable, Generator, List, Optional, Tuple, Type, Union -import shutil - -import boto3 -from boto3.s3.transfer import ProgressCallbackInvoker -from botocore.exceptions import BotoCoreError, ClientError - -from .asset_manifests import ( - BaseAssetManifest, - BaseManifestModel, - HashAlgorithm, - hash_data, - hash_file, - ManifestModelRegistry, - ManifestVersion, - base_manifest, -) -from ._aws.aws_clients import ( - get_account_id, - get_boto3_session, - get_s3_client, - get_s3_transfer_manager, -) -from .exceptions import ( - COMMON_ERROR_GUIDANCE_FOR_S3, - AssetSyncCancelledError, - AssetSyncError, - JobAttachmentS3BotoCoreError, - JobAttachmentsError, - JobAttachmentsS3ClientError, - MisconfiguredInputsError, - MissingS3BucketError, - MissingS3RootPrefixError, -) -from .caches import HashCache, HashCacheEntry, S3CheckCache, S3CheckCacheEntry -from .models import ( - AssetRootGroup, - AssetRootManifest, - AssetUploadGroup, - Attachments, - FileStatus, - FileSystemLocationType, - JobAttachmentS3Settings, - ManifestProperties, - PathFormat, - StorageProfile, - S3_DATA_FOLDER_NAME, - S3_MANIFEST_FOLDER_NAME, -) -from .progress_tracker import ( - ProgressStatus, - ProgressTracker, - SummaryStatistics, -) -from ._utils import ( - _get_long_path_compatible_path, - _is_relative_to, - _join_s3_paths, -) - -logger = logging.getLogger("deadline.job_attachments.upload") - - -class _FileStatCache: - """Private cache for file stat results to avoid redundant filesystem calls""" - - @lru_cache(maxsize=1024) - def _get_stat(self, path_str: str) -> Optional[os.stat_result]: - """Get cached stat result for a path string""" - try: - return Path(path_str).stat() - except (FileNotFoundError, PermissionError, OSError): - return None - - def exists(self, path: Path) -> bool: - """Check if path exists, using cache when possible""" - stat_result = self._get_stat(str(path)) - if stat_result is not None: - return True - # Fall back to direct exists() call if stat failed - return path.exists() - - def is_dir(self, path: Path) -> bool: - """Check if path is directory, using cache when possible""" - stat_result = self._get_stat(str(path)) - if stat_result is not None: - return stat.S_ISDIR(stat_result.st_mode) - # Fall back to direct is_dir() call if stat failed - return path.is_dir() - - def get_size(self, path: Path) -> int: - """Get file size using cached stat""" - stat_result = self._get_stat(str(path)) - if stat_result is not None: - return stat_result.st_size - # Log warning for missing files and return 0 - logger.warning(f"Skipping file in size calculation: {path}") - return 0 - - -# The default multipart upload chunk size is 8 MB. We used this to determine the small file threshold, -# which is the chunk size multiplied by the small file threshold multiplier. -S3_MULTIPART_UPLOAD_CHUNK_SIZE: int = 8388608 # 8 MB -# The maximum number of concurrency for multipart uploads. This is used to determine the max number -# of thread workers for uploading multiple small files in parallel. -S3_UPLOAD_MAX_CONCURRENCY: int = 10 - - -class S3AssetUploader: - """ - Handler for uploading assets to S3 based off of an Asset Manifest. 
If no session is provided the default - credentials path will be used, see - https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials - """ - - def __init__( - self, - session: Optional[boto3.Session] = None, - *, - s3_max_pool_connections: int, - small_file_threshold_multiplier: int, - ) -> None: - if session is None: - self._session = get_boto3_session() - else: - self._session = session - - self.small_file_threshold = S3_MULTIPART_UPLOAD_CHUNK_SIZE * small_file_threshold_multiplier - - self.num_upload_workers = int( - s3_max_pool_connections - / min(small_file_threshold_multiplier, S3_UPLOAD_MAX_CONCURRENCY) - ) - if self.num_upload_workers <= 0: - self.num_upload_workers = 1 - - self._s3 = get_s3_client(self._session, s3_max_pool_connections=s3_max_pool_connections) - - # Confirm that the settings values are all positive. - error_msg = "" - if small_file_threshold_multiplier <= 0: - error_msg = f"'small_file_threshold_multiplier' ({small_file_threshold_multiplier}) must be positive integer." - elif s3_max_pool_connections <= 0: - error_msg = ( - f"'s3_max_pool_connections' ({s3_max_pool_connections}) must be positive integer." - ) - if error_msg: - raise AssetSyncError("Nonvalid value for configuration setting: " + error_msg) - - def upload_assets( - self, - job_attachment_settings: JobAttachmentS3Settings, - manifest: BaseAssetManifest, - source_root: Path, - partial_manifest_prefix: Optional[str] = None, - file_system_location_name: Optional[str] = None, - progress_tracker: Optional[ProgressTracker] = None, - s3_check_cache_dir: Optional[str] = None, - manifest_write_dir: Optional[str] = None, - manifest_name_suffix: str = "input", - manifest_metadata: dict[str, dict[str, str]] = dict(), - manifest_file_name: Optional[str] = None, - asset_root: Optional[Path] = None, - force_s3_check: Optional[bool] = None, - ) -> tuple[str, str]: - """ - Uploads assets based off of an asset manifest, uploads the asset manifest. - - Args: - manifest: The asset manifest to upload. - partial_manifest_prefix: The (partial) key prefix to use for uploading the manifest - to S3, excluding the initial section "/Manifest/". - e.g. "farm-1234/queue-1234/Inputs/" - source_root: The local root path of the assets. - job_attachment_settings: The settings for the job attachment configured in Queue. - progress_tracker: Optional progress tracker to track progress. - manifest_name_suffix: Suffix for given manifest naming. - manifest_metadata: File metadata for given manifest to be uploaded. - manifest_file_name: Optional file name for given manifest to be uploaded, otherwise use default name. - asset_root: The root in which asset actually in to facilitate path mapping. - force_s3_check: Controls S3 verification behavior: - - True: Skip the S3 check cache, always check whether uploads are already in S3. - - False/None: Use the S3 check cache, with periodic integrity sampling against S3 (default) - - Returns: - A tuple of (the partial key for the manifest on S3, the hash of input manifest). 
- """ - - # Upload asset manifest - hash_alg, manifest_bytes, manifest_name = S3AssetUploader._gather_upload_metadata( - manifest=manifest, - source_root=source_root, - file_system_location_name=file_system_location_name, - manifest_name_suffix=manifest_name_suffix, - ) - manifest_name = manifest_file_name if manifest_file_name else manifest_name - - if partial_manifest_prefix: - partial_manifest_key = _join_s3_paths(partial_manifest_prefix, manifest_name) - else: - partial_manifest_key = manifest_name - - full_manifest_key = job_attachment_settings.add_root_and_manifest_folder_prefix( - partial_manifest_key - ) - - if manifest_write_dir: - self._write_local_manifest( - manifest_write_dir, - manifest_name, - full_manifest_key, - manifest, - ) - - if partial_manifest_prefix: - self.upload_bytes_to_s3( - bytes=BytesIO(manifest_bytes), - bucket=job_attachment_settings.s3BucketName, - key=full_manifest_key, - extra_args=manifest_metadata, - ) - - # Verify S3 hash cache integrity, and reset cache if cached files are missing. - # Skip integrity check only when force_s3_check is True - we'll do S3 HEAD on every file anyway. - # When False or None, run the integrity check to catch stale cache entries. - if force_s3_check is not True and not self.verify_hash_cache_integrity( - s3_check_cache_dir, - manifest, - job_attachment_settings.full_cas_prefix(), - job_attachment_settings.s3BucketName, - ): - self.reset_s3_check_cache(s3_check_cache_dir) - - # Upload assets - self.upload_input_files( - manifest=manifest, - s3_bucket=job_attachment_settings.s3BucketName, - source_root=asset_root if asset_root else source_root, - s3_cas_prefix=job_attachment_settings.full_cas_prefix(), - progress_tracker=progress_tracker, - s3_check_cache_dir=s3_check_cache_dir, - force_s3_check=force_s3_check, - ) - - return (partial_manifest_key, hash_data(manifest_bytes, hash_alg)) - - def _snapshot_assets( - self, - snapshot_dir: Path, - manifest: BaseAssetManifest, - source_root: Path, - partial_manifest_prefix: Optional[str] = None, - file_system_location_name: Optional[str] = None, - progress_tracker: Optional[ProgressTracker] = None, - manifest_name_suffix: str = "input", - manifest_file_name: Optional[str] = None, - asset_root: Optional[Path] = None, - ) -> tuple[str, str]: - """ - Snapshots assets based off of an asset manifest, snapshots the asset manifest. The result - is a directory structure in snapshot_dir that matches the job attachments prefix layout. - - Args: - snapshot_dir: The directory in which to place the data and manifest snapshots. - manifest: The asset manifest to upload. - partial_manifest_prefix: The (partial) key prefix to use for uploading the manifest - to S3, excluding the initial section "/Manifest/". - e.g. "farm-1234/queue-1234/Inputs/" - source_root: The local root path of the assets. - progress_tracker: Optional progress tracker to track progress. - manifest_name_suffix: Suffix for given manifest naming. - manifest_metadata: File metadata for given manifest to be uploaded. - manifest_file_name: Optional file name for given manifest to be uploaded, otherwise use default name. - asset_root: The root in which asset actually in to facilitate path mapping. - - Returns: - A tuple of (the partial key for the manifest in the snapshot, the hash of input manifest). 
- """ - - # Snapshot asset manifest - hash_alg, manifest_bytes, manifest_name = S3AssetUploader._gather_upload_metadata( - manifest=manifest, - source_root=source_root, - file_system_location_name=file_system_location_name, - manifest_name_suffix=manifest_name_suffix, - ) - manifest_name = manifest_file_name if manifest_file_name else manifest_name - - if partial_manifest_prefix: - partial_manifest_key = _join_s3_paths(partial_manifest_prefix, manifest_name) - else: - partial_manifest_key = manifest_name - - manifest_file_path = snapshot_dir / S3_MANIFEST_FOLDER_NAME / partial_manifest_key - os.makedirs(_get_long_path_compatible_path(manifest_file_path.parent), exist_ok=True) - with open(_get_long_path_compatible_path(manifest_file_path), "wb") as fh: - fh.write(manifest_bytes) - - # Snapshot assets - self._snapshot_input_files( - snapshot_dir=snapshot_dir, - manifest=manifest, - source_root=asset_root if asset_root else source_root, - progress_tracker=progress_tracker, - ) - - return (partial_manifest_key, hash_data(manifest_bytes, hash_alg)) - - @staticmethod - def _gather_upload_metadata( - manifest: BaseAssetManifest, - source_root: Path, - manifest_name_suffix: str, - # TODO - remove file_system_location_name after ASSET_SYNC_JOB_USER_FEATURE completion - file_system_location_name: Optional[str] = None, - ) -> tuple[HashAlgorithm, bytes, str]: - """ - Gathers metadata information of manifest to be used for writing the local manifest - """ - hash_alg = manifest.get_default_hash_alg() - manifest_bytes = manifest.encode().encode("utf-8") - # Converting Path to str uses OS-specific separators (\ vs /), which can produce different hashes across OS - manifest_name_prefix = hash_data(str(source_root).encode(), hash_alg) - manifest_name = f"{manifest_name_prefix}_{manifest_name_suffix}" - - return (hash_alg, manifest_bytes, manifest_name) - - def _write_local_manifest( - self, - manifest_write_dir: str, - manifest_name: str, - full_manifest_key: str, - manifest: BaseAssetManifest, - root_dir_name: Optional[str] = None, - ) -> None: - """ - Writes a manifest file locally in a 'manifests' sub-directory. - Also creates/appends to a file mapping the local manifest name to the full S3 key in the same directory. 
- """ - self._write_local_input_manifest(manifest_write_dir, manifest_name, manifest, root_dir_name) - - self._write_local_manifest_s3_mapping(manifest_write_dir, manifest_name, full_manifest_key) - - @staticmethod - def _get_hashed_file_name_from_root_str( - manifest: BaseAssetManifest, - source_root: str, - manifest_name_suffix: str, - ) -> tuple[HashAlgorithm, str]: - """ - Gathers metadata information of manifest to be used for writing the local manifest - """ - hash_alg = manifest.get_default_hash_alg() - manifest_name_prefix = hash_data(source_root.encode(), hash_alg) - manifest_name = f"{manifest_name_prefix}_{manifest_name_suffix}" - - return (hash_alg, manifest_name) - - @staticmethod - def _write_local_input_manifest( - manifest_write_dir: str, - manifest_name: str, - manifest: BaseAssetManifest, - root_dir_name: Optional[str] = None, - ) -> Path: - """ - Creates 'manifests' sub-directory and writes a local input manifest file - """ - input_manifest_folder_name = "manifests" - if root_dir_name is not None: - input_manifest_folder_name = root_dir_name + "_" + input_manifest_folder_name - - local_manifest_file = Path(manifest_write_dir, input_manifest_folder_name, manifest_name) - logger.debug(f"Creating local manifest file: {local_manifest_file}") - local_manifest_file.parent.mkdir(parents=True, exist_ok=True) - with open(local_manifest_file, "w") as file: - file.write(manifest.encode()) - - return local_manifest_file - - def _write_local_manifest_s3_mapping( - self, - manifest_write_dir: str, - manifest_name: str, - full_manifest_key: str, - manifest_dir_name: Optional[str] = None, - ): - """ - Create or append to an existing mapping file. We use this since path lengths can go beyond the - file name length limit on Windows if we were to create the full S3 key path locally. - """ - manifest_map_file = Path( - manifest_write_dir, manifest_dir_name or "manifests", "manifest_s3_mapping" - ) - mapping = {"local_file": manifest_name, "s3_key": full_manifest_key} - with open(manifest_map_file, "a") as mapping_file: - mapping_file.write(f"{mapping}\n") - - def upload_input_files( - self, - manifest: BaseAssetManifest, - s3_bucket: str, - source_root: Path, - s3_cas_prefix: str, - progress_tracker: Optional[ProgressTracker] = None, - s3_check_cache_dir: Optional[str] = None, - force_s3_check: Optional[bool] = None, - ) -> None: - """ - Uploads all of the files listed in the given manifest to S3 if they don't exist in the - given S3 prefix already. - - The local 'S3 check cache' is used to note if we've seen an object in S3 before so we - can save the S3 API calls. - """ - - # Split into a separate 'large file' and 'small file' queues. - # Separate 'large' files from 'small' files so that we can process 'large' files serially. - # This wastes less bandwidth if uploads are cancelled, as it's better to use the multi-threaded - # multi-part upload for a single large file than multiple large files at the same time. - small_file_queue, large_file_queue = self._separate_files_by_size( - manifest.paths, self.small_file_threshold - ) - - with S3CheckCache(s3_check_cache_dir) as s3_cache: - # First, process the whole 'small file' queue with parallel object uploads. 
- with concurrent.futures.ThreadPoolExecutor( - max_workers=self.num_upload_workers - ) as executor: - futures = { - executor.submit( - self.upload_object_to_cas, - file, - manifest.hashAlg, - s3_bucket, - source_root, - s3_cas_prefix, - s3_cache, - progress_tracker, - force_s3_check, - ): file - for file in small_file_queue - } - # surfaces any exceptions in the thread - for future in concurrent.futures.as_completed(futures): - is_uploaded, file_size = future.result() - if progress_tracker and not is_uploaded: - progress_tracker.increase_skipped(1, file_size) - - # Now process the whole 'large file' queue with serial object uploads (but still parallel multi-part upload.) - for file in large_file_queue: - is_uploaded, file_size = self.upload_object_to_cas( - file, - manifest.hashAlg, - s3_bucket, - source_root, - s3_cas_prefix, - s3_cache, - progress_tracker, - force_s3_check, - ) - if progress_tracker and not is_uploaded: - progress_tracker.increase_skipped(1, file_size) - - # to report progress 100% at the end, and - # to check if the job submission was canceled in the middle of processing the last batch of files. - if progress_tracker: - progress_tracker.report_progress() - if not progress_tracker.continue_reporting: - raise AssetSyncCancelledError( - "File upload cancelled.", progress_tracker.get_summary_statistics() - ) - - def _snapshot_input_files( - self, - snapshot_dir: Path, - manifest: BaseAssetManifest, - source_root: Path, - progress_tracker: Optional[ProgressTracker] = None, - ) -> None: - """ - Snapshots all of the files listed in the given manifest to snapshot_dir. - """ - os.makedirs(snapshot_dir / S3_DATA_FOLDER_NAME, exist_ok=True) - - # Process all the paths with parallel copy calls. - with concurrent.futures.ThreadPoolExecutor(max_workers=self.num_upload_workers) as executor: - futures = { - executor.submit( - self._snapshot_object_to_cas, - file, - manifest.hashAlg, - snapshot_dir, - source_root, - progress_tracker, - ): file - for file in manifest.paths - } - # surfaces any exceptions in the thread - for future in concurrent.futures.as_completed(futures): - future.result() - - # to report progress 100% at the end, and - # to check if the job snapshot was canceled in the middle of processing the last batch of files. - if progress_tracker: - progress_tracker.report_progress() - if not progress_tracker.continue_reporting: - raise AssetSyncCancelledError( - "File snapshot cancelled.", - progress_tracker.get_summary_statistics(), - ) - - def reset_s3_check_cache(self, s3_check_cache_dir: Optional[str]) -> None: - """ - Resets the S3 check cache by removing the cache altogether. - """ - with S3CheckCache(s3_check_cache_dir) as s3_check_cache: - logger.debug( - f"The s3_check_cache.db file in {s3_check_cache_dir} will be deleted, " - f"as a mismatch between the cache and the actual hash in S3 was found" - ) - # Remove the cache file - s3_check_cache.remove_cache() - - def _check_hashes_exist_in_s3(self, cache_entries: List[S3CheckCacheEntry]) -> bool: - """ - checks if the hashes in the cache entries exist in S3 - """ - for cache_entry in cache_entries: - try: - # Split the S3 key into bucket and key parts - bucket, key = cache_entry.s3_key.split("/", 1) - - # Check if the object is already uploaded and exist in S3 bucket - if self.file_already_uploaded(bucket=bucket, key=key): - logger.debug(f"cache_entry: {cache_entry} exist in S3") - - # If a mismatch found, return False to reset the cache immediately. There's no need to check the rest. 
- else: - return False - except Exception as e: - # If error occurs, log warning and return False to indicate cache reset required - logger.warning(f"Error occurred while checking {cache_entry}. Exception: {e}") - return False - - # Otherwise all hashes exist in S3 - return True - - def verify_hash_cache_integrity( - self, - s3_check_cache_dir: Optional[str], - manifest: BaseAssetManifest, - s3_cas_prefix: str, - s3_bucket: str, - ) -> bool: - """ - Inspects a sampling of the assets provided in manifest that are present in the S3 check cache and - verifies if the cached assets exist in S3. Returns True if all sampled cached assets exist in S3, False - otherwise. - """ - # Find the list of s3 upload keys that have been cached - s3_upload_keys: List[str] = [ - self._generate_s3_upload_key(file, manifest.hashAlg, s3_cas_prefix) - for file in manifest.paths - ] - - random.shuffle(s3_upload_keys) - sampled_cache_entries: List[S3CheckCacheEntry] = [] - with S3CheckCache(s3_check_cache_dir) as s3_cache: - local_connection = s3_cache.get_local_connection() - for upload_key in s3_upload_keys: - this_entry = s3_cache.get_connection_entry( - s3_key=f"{s3_bucket}/{upload_key}", connection=local_connection - ) - if this_entry is not None: - sampled_cache_entries.append(this_entry) - if len(sampled_cache_entries) >= 30: - break - return self._check_hashes_exist_in_s3(sampled_cache_entries) - - def _separate_files_by_size( - self, - files_to_upload: list[base_manifest.BaseManifestPath], - size_threshold: int, - ) -> Tuple[list[base_manifest.BaseManifestPath], list[base_manifest.BaseManifestPath]]: - """ - Splits the given list of files into two queues: one for small files and one for large files. - """ - small_file_queue: list[base_manifest.BaseManifestPath] = [] - large_file_queue: list[base_manifest.BaseManifestPath] = [] - for file in files_to_upload: - if file.size <= size_threshold: - small_file_queue.append(file) - else: - large_file_queue.append(file) - return (small_file_queue, large_file_queue) - - def _get_current_timestamp(self) -> str: - return str(datetime.now().timestamp()) - - def _generate_s3_upload_key( - self, - file: base_manifest.BaseManifestPath, - hash_algorithm: HashAlgorithm, - s3_cas_prefix: str, - ) -> str: - s3_upload_key = f"{file.hash}.{hash_algorithm.value}" - if s3_cas_prefix: - s3_upload_key = _join_s3_paths(s3_cas_prefix, s3_upload_key) - return s3_upload_key - - def upload_object_to_cas( - self, - file: base_manifest.BaseManifestPath, - hash_algorithm: HashAlgorithm, - s3_bucket: str, - source_root: Path, - s3_cas_prefix: str, - s3_check_cache: S3CheckCache, - progress_tracker: Optional[ProgressTracker] = None, - force_s3_check: Optional[bool] = None, - ) -> Tuple[bool, int]: - """ - Uploads an object to the S3 content-addressable storage (CAS) prefix. Optionally, - does a head-object check and only uploads the file if it doesn't exist in S3 already. - Returns a tuple (whether it has been uploaded, the file size). - - Args: - force_s3_check: Controls S3 verification behavior: - - True: Skip the S3 check cache, always check whether uploads are already in S3. 
- - False/None: Use the S3 check cache, with periodic integrity sampling against S3 (default) - """ - local_path = source_root.joinpath(file.path) - s3_upload_key = self._generate_s3_upload_key(file, hash_algorithm, s3_cas_prefix) - is_uploaded = False - - # Check cache first unless force_s3_check is True (skip cache entirely) - if force_s3_check is not True: - if s3_check_cache.get_connection_entry( - s3_key=f"{s3_bucket}/{s3_upload_key}", - connection=s3_check_cache.get_local_connection(), - ): - logger.debug( - f"skipping {local_path} because {s3_bucket}/{s3_upload_key} exists in the cache" - ) - return (is_uploaded, file.size) - - if self.file_already_uploaded(s3_bucket, s3_upload_key): - logger.debug( - f"skipping {local_path} because it has already been uploaded to s3://{s3_bucket}/{s3_upload_key}" - ) - else: - self.upload_file_to_s3( - local_path=local_path, - s3_bucket=s3_bucket, - s3_upload_key=s3_upload_key, - progress_tracker=progress_tracker, - ) - is_uploaded = True - - s3_check_cache.put_entry( - S3CheckCacheEntry( - s3_key=f"{s3_bucket}/{s3_upload_key}", - last_seen_time=self._get_current_timestamp(), - ) - ) - - return (is_uploaded, file.size) - - def _snapshot_object_to_cas( - self, - file: base_manifest.BaseManifestPath, - hash_algorithm: HashAlgorithm, - snapshot_dir: Path, - source_root: Path, - progress_tracker: Optional[ProgressTracker] = None, - ): - """ - Snapshots an object to the snapshot directory content-addressable storage (CAS) prefix. - """ - local_path = source_root.joinpath(file.path) - s3_upload_key = self._generate_s3_upload_key(file, hash_algorithm, S3_DATA_FOLDER_NAME) - file_size = local_path.resolve().stat().st_size - - shutil.copy2(local_path, snapshot_dir / s3_upload_key) - if progress_tracker is not None: - progress_tracker.track_progress_callback(file_size) - - def upload_file_to_s3( - self, - local_path: Path, - s3_bucket: str, - s3_upload_key: str, - progress_tracker: Optional[ProgressTracker] = None, - base_dir_path: Optional[Path] = None, - ) -> None: - """ - Uploads a single file to an S3 bucket using TransferManager, allowing mid-way - cancellation. It monitors for upload progress through a callback, `handler`, - which also checks if the upload should continue or not. If the `progress_tracker` - signals to stop, the ongoing upload is cancelled. - """ - transfer_manager = get_s3_transfer_manager(s3_client=self._s3) - - future: concurrent.futures.Future - - def handler(bytes_uploaded): - nonlocal progress_tracker - nonlocal future - - if progress_tracker: - should_continue = progress_tracker.track_progress_callback(bytes_uploaded) - if not should_continue and future is not None: - future.cancel() - - subscribers = [ProgressCallbackInvoker(handler)] - real_path = local_path.resolve() - - if base_dir_path: - # If base_dir_path is given, check if the file is actually within the base directory - is_file_within_base_dir = self._is_file_within_directory(real_path, base_dir_path) - else: - # If base_dir_path is not set, assume the file is within the base directory. - is_file_within_base_dir = True - - # Skip the file if it's (1) a directory, 2. not existing, or 3. not within the base directory. 
- if real_path.is_dir() or not real_path.exists() or not is_file_within_base_dir:
- return
-
- with self._open_non_symlink_file_binary(str(real_path)) as file_obj:
- if file_obj is None:
- return
-
- future = transfer_manager.upload(
- fileobj=file_obj,
- bucket=s3_bucket,
- key=s3_upload_key,
- subscribers=subscribers,
- )
-
- try:
- future.result()
- is_uploaded = True
- if progress_tracker and is_uploaded:
- progress_tracker.increase_processed(1, 0)
- except concurrent.futures.CancelledError as ce:
- if progress_tracker and progress_tracker.continue_reporting is False:
- raise AssetSyncCancelledError(
- "File upload cancelled.",
- progress_tracker.get_summary_statistics(),
- )
- else:
- raise AssetSyncError("File upload failed.", ce) from ce
- except ClientError as exc:
- status_code = int(exc.response["ResponseMetadata"]["HTTPStatusCode"])
- status_code_guidance = {
- **COMMON_ERROR_GUIDANCE_FOR_S3,
- 403: (
- (
- "Forbidden or Access denied. Please check your AWS credentials, and ensure that "
- "your AWS IAM Role or User has the 's3:PutObject' permission for this bucket. "
- )
- if "kms:" not in str(exc)
- else (
- "Forbidden or Access denied. Please check your AWS credentials and Job Attachments S3 bucket "
- "encryption settings. If a customer-managed KMS key is set, confirm that your AWS IAM Role or "
- "User has the 'kms:GenerateDataKey' and 'kms:DescribeKey' permissions for the key used to encrypt the bucket."
- )
- ),
- 404: "Not found. Please check your bucket name and object key, and ensure that they exist in the AWS account.",
- }
- raise JobAttachmentsS3ClientError(
- action="uploading file",
- status_code=status_code,
- bucket_name=s3_bucket,
- key_or_prefix=s3_upload_key,
- message=f"{status_code_guidance.get(status_code, '')} {str(exc)} (Failed to upload {str(local_path)})",
- ) from exc
- except BotoCoreError as bce:
- raise JobAttachmentS3BotoCoreError(
- action="uploading file",
- error_details=str(bce),
- ) from bce
- except Exception as e:
- raise AssetSyncError(e) from e
-
- @contextmanager
- def _open_non_symlink_file_binary(
- self, path: str
- ) -> Generator[Optional[BufferedReader], None, None]:
- """
- Open a file in binary mode after verifying that it is not a symbolic link.
- Raises:
- OSError: If the given path is a symbolic link or doesn't match the actual file.
- """
- fd = None
- file_obj = None
- try:
- open_flags = os.O_RDONLY
- # Make sure the file isn’t following a symlink to a different path.
- if hasattr(os, "O_NOFOLLOW"):
- open_flags |= os.O_NOFOLLOW
- elif sys.platform != "win32" and os.path.islink(path):
- # We are on a non-Windows system that does not support O_NOFOLLOW. When we encounter a
- # symbolic link, we cannot guarantee security here, so log a warning and reject the file.
- logger.warning(
- f"Job Attachments does not support files referenced by symbolic links on this system ({sys.platform}). "
- "Please refrain from using symbolic links in Job Attachment asset roots and use real files instead. "
- f"The following file will be skipped: {path}."
- )
- yield None
- return
-
- fd = os.open(path, open_flags)
- if sys.platform == "win32":
- # Windows does not support O_NOFOLLOW. So, check the file handle with GetFinalPathNameByHandle
- # to verify it is actually pointing to the path that we verified to be safe to open.
- if not self._is_path_win32_final_path_of_file_descriptor(path, fd):
- # ELOOP is the error code that open with NOFOLLOW will return
- # if the path is a symlink. We raise the same error here for
- # the sake of consistency.
- raise OSError(errno.ELOOP, "Mismatch between path and its final path", path)
-
- if str(Path(path).resolve()) != path:
- raise OSError(errno.ELOOP, "Mismatch between path and its final path", path)
-
- with os.fdopen(fd, "rb", closefd=False) as file_obj:
- yield file_obj
- except OSError as e:
- logger.warning(f"Failed to open file. The following file will be skipped: {path}: {e}")
- yield None
- finally:
- if fd is not None:
- os.close(fd)
- if file_obj is not None:
- file_obj.close()
-
- def _is_path_win32_final_path_of_file_descriptor(self, path: str, fd: int):
- """
- Check if the normalized path from the file descriptor matches the specified path.
- """
- if sys.platform != "win32":
- raise EnvironmentError("This function can only be executed on Windows systems.")
-
- import ctypes
- import msvcrt
- from ._windows import file as win_file
-
- # Get the handle from the file descriptor
- try:
- h = msvcrt.get_osfhandle(fd)
- except OSError as e:
- logger.warning(f"Error resolving file descriptor ({fd}) to '{path}': {e}")
- return False
-
- # Get the final path name using Win32 API GetFinalPathNameByHandleW
- buffer_len = 4096
- buffer = ctypes.create_unicode_buffer(buffer_len)
- path_len = win_file.GetFinalPathNameByHandleW(
- h,
- buffer,
- buffer_len,
- win_file.VOLUME_NAME_DOS,
- )
- if path_len == 0:
- raise ctypes.WinError()
- elif path_len > buffer_len:
- # path_len has the required buffer length (returned by GetFinalPathNameByHandleW)
- # Create a buffer of this size and call the API again
- buffer_len = path_len
- buffer = ctypes.create_unicode_buffer(buffer_len)
- path_len = win_file.GetFinalPathNameByHandleW(
- h,
- buffer,
- buffer_len,
- win_file.VOLUME_NAME_DOS,
- )
-
- if path_len > buffer_len or path_len == 0:
- # MS documentation states that if GetFinalPathNameByHandleW returns a positive value
- # greater than the initial buffer length, it is the required buffer length to fit the
- # path name. This branch uses that value to create a new buffer, so this should
- # never fail unless GetFinalPathNameByHandleW behavior has changed.
- logger.error(
- "GetFinalPathNameByHandleW reported incorrect required buffer length. "
- f"Rejecting file at '{path}'"
- )
- return False
-
- final_path = ctypes.wstring_at(buffer)
-
- # GetFinalPathNameByHandleW() returns a path that starts with the \\?\
- # prefix, which pathlib.Path.resolve() removes. The following is intended
- # to match the behavior of resolve().
- prefix = r"\\?" "\\"
- unc_prefix = r"\\?\UNC" "\\"
-
- if final_path.startswith(prefix) and not path.startswith(prefix):
- if final_path.startswith(unc_prefix):
- simplified_path = "\\\\" + final_path[len(unc_prefix) :]
- else:
- simplified_path = final_path[len(prefix) :]
-
- final_path = simplified_path
-
- return path == final_path
-
- def _is_file_within_directory(self, file_path: Path, directory_path: Path) -> bool:
- """
- Checks if the given file path is within the given directory path.
- """
- real_file_path = file_path.resolve()
- real_directory_path = directory_path.resolve()
- common_path = os.path.commonpath([real_file_path, real_directory_path])
- return common_path.startswith(str(real_directory_path))
-
- def file_already_uploaded(self, bucket: str, key: str) -> bool:
- """
- Check whether the file has already been uploaded by doing a head-object call.
- """ - try: - self._s3.head_object( - Bucket=bucket, - Key=key, - ) - return True - except ClientError as exc: - error_code = int(exc.response["ResponseMetadata"]["HTTPStatusCode"]) - if error_code == 403: - message = ( - f"Access denied. Ensure that the bucket is in the account {get_account_id(session=self._session)}, " - "and your AWS IAM Role or User has the 's3:ListBucket' permission for this bucket." - ) - raise JobAttachmentsS3ClientError( - "checking if object exists", error_code, bucket, key, message - ) from exc - return False - except BotoCoreError as bce: - raise JobAttachmentS3BotoCoreError( - action="checking for the existence of an object in the S3 bucket", - error_details=str(bce), - ) from bce - except Exception as e: - raise AssetSyncError(e) from e - - def upload_bytes_to_s3( - self, - bytes: BytesIO, - bucket: str, - key: str, - progress_handler: Optional[Callable[[int], None]] = None, - extra_args: dict[str, Any] = dict(), - ) -> None: - try: - extra_args_merged: dict[str, Union[str, dict]] = { - "ExpectedBucketOwner": get_account_id(session=self._session), - **extra_args, - } - - self._s3.upload_fileobj( - bytes, - bucket, - key, - ExtraArgs=extra_args_merged, - Callback=progress_handler, - ) - except ClientError as exc: - status_code = int(exc.response["ResponseMetadata"]["HTTPStatusCode"]) - status_code_guidance = { - **COMMON_ERROR_GUIDANCE_FOR_S3, - 403: ( - ( - "Forbidden or Access denied. Please check your AWS credentials, and ensure that " - "your AWS IAM Role or User has the 's3:PutObject' permission for this bucket. " - ) - if "kms:" not in str(exc) - else ( - "Forbidden or Access denied. Please check your AWS credentials and Job Attachments S3 bucket " - "encryption settings. If a customer-managed KMS key is set, confirm that your AWS IAM Role or " - "User has the 'kms:GenerateDataKey' and 'kms:DescribeKey' permissions for the key used to encrypt the bucket." - ) - ), - 404: "Not found. Please check your bucket name, and ensure that it exists in the AWS account.", - } - raise JobAttachmentsS3ClientError( - action="uploading binary file", - status_code=status_code, - bucket_name=bucket, - key_or_prefix=key, - message=f"{status_code_guidance.get(status_code, '')} {str(exc)}", - ) from exc - except BotoCoreError as bce: - raise JobAttachmentS3BotoCoreError( - action="uploading binary file", - error_details=str(bce), - ) from bce - except Exception as e: - raise AssetSyncError(e) from e - - -class S3AssetManager: - """ - Asset handler that creates an asset manifest and uploads assets. Based on an S3 file system. 
- """ - - def __init__( - self, - farm_id: Optional[str] = None, - queue_id: Optional[str] = None, - job_attachment_settings: Optional[JobAttachmentS3Settings] = None, - asset_uploader: Optional[S3AssetUploader] = None, - session: Optional[boto3.Session] = None, - asset_manifest_version: ManifestVersion = ManifestVersion.v2023_03_03, - s3_max_pool_connections: int = 50, - small_file_threshold_multiplier: int = 20, - ) -> None: - self.farm_id = farm_id - self.queue_id = queue_id - self.job_attachment_settings: Optional[JobAttachmentS3Settings] = job_attachment_settings - - if self.job_attachment_settings: - if not self.job_attachment_settings.s3BucketName: - raise MissingS3BucketError( - "To use Job Attachments, the 's3BucketName' must be set in your queue's JobAttachmentSettings" - ) - if not self.job_attachment_settings.rootPrefix: - raise MissingS3RootPrefixError( - "To use Job Attachments, the 'rootPrefix' must be set in your queue's JobAttachmentSettings" - ) - - if asset_uploader is None: - asset_uploader = S3AssetUploader( - session=session, - s3_max_pool_connections=s3_max_pool_connections, - small_file_threshold_multiplier=small_file_threshold_multiplier, - ) - - self.asset_uploader = asset_uploader - self.session = session - - self.manifest_version: ManifestVersion = asset_manifest_version - self._stat_cache = _FileStatCache() - - def _process_input_path( - self, - path: Path, - root_path: str, - hash_cache: HashCache, - progress_tracker: Optional[ProgressTracker] = None, - update: bool = True, - ) -> Tuple[FileStatus, int, base_manifest.BaseManifestPath]: - # If it's cancelled, raise an AssetSyncCancelledError exception - if progress_tracker and not progress_tracker.continue_reporting: - raise AssetSyncCancelledError( - "File hashing cancelled.", progress_tracker.get_summary_statistics() - ) - - manifest_model: Type[BaseManifestModel] = ManifestModelRegistry.get_manifest_model( - version=self.manifest_version - ) - hash_alg: HashAlgorithm = manifest_model.AssetManifest.get_default_hash_alg() - - full_path = str(path.resolve()) - file_status: FileStatus = FileStatus.UNCHANGED - actual_modified_time = str(datetime.fromtimestamp(path.stat().st_mtime)) - - entry: Optional[HashCacheEntry] = hash_cache.get_connection_entry( - full_path, hash_alg, connection=hash_cache.get_local_connection() - ) - if entry is not None: - # If the file was modified, we need to rehash it - if actual_modified_time != entry.last_modified_time: - entry.last_modified_time = actual_modified_time - entry.file_hash = hash_file(full_path, hash_alg) - entry.hash_algorithm = hash_alg - file_status = FileStatus.MODIFIED - else: - entry = HashCacheEntry( - file_path=full_path, - hash_algorithm=hash_alg, - file_hash=hash_file(full_path, hash_alg), - last_modified_time=actual_modified_time, - ) - file_status = FileStatus.NEW - - if file_status != FileStatus.UNCHANGED and update: - hash_cache.put_entry(entry) - - file_size = path.resolve().stat().st_size - path_args: dict[str, Any] = { - "path": path.relative_to(root_path).as_posix(), - "hash": entry.file_hash, - } - - # stat().st_mtime_ns returns an int that represents the time in nanoseconds since the epoch. - # The asset manifest spec requires the mtime to be represented as an integer in microseconds. 
- path_args["mtime"] = trunc(path.stat().st_mtime_ns // 1000) - path_args["size"] = file_size - - return (file_status, file_size, manifest_model.Path(**path_args)) - - def _create_manifest_file( - self, - input_paths: list[Path], - root_path: str, - hash_cache: HashCache, - progress_tracker: Optional[ProgressTracker] = None, - ) -> BaseAssetManifest: - manifest_model: Type[BaseManifestModel] = ManifestModelRegistry.get_manifest_model( - version=self.manifest_version - ) - if manifest_model.manifest_version in { - ManifestVersion.v2023_03_03, - }: - paths: list[base_manifest.BaseManifestPath] = [] - - with concurrent.futures.ThreadPoolExecutor() as executor: - futures = { - executor.submit( - self._process_input_path, - path, - root_path, - hash_cache, - progress_tracker, - ): path - for path in input_paths - } - for future in concurrent.futures.as_completed(futures): - file_status, file_size, path_to_put_in_manifest = future.result() - paths.append(path_to_put_in_manifest) - if progress_tracker: - if file_status == FileStatus.NEW or file_status == FileStatus.MODIFIED: - progress_tracker.increase_processed(1, file_size) - else: - progress_tracker.increase_skipped(1, file_size) - progress_tracker.report_progress() - - # Need to sort the list to keep it canonical - paths.sort(key=lambda x: x.path, reverse=True) - - manifest_args: dict[str, Any] = { - "hash_alg": manifest_model.AssetManifest.get_default_hash_alg(), - "paths": paths, - } - - manifest_args["total_size"] = sum([path.size for path in paths]) - - return manifest_model.AssetManifest(**manifest_args) - else: - raise NotImplementedError( - f"Creation of manifest version {manifest_model.manifest_version} is not supported." - ) - - def _get_asset_groups( - self, - input_paths: set[str], - output_paths: set[str], - referenced_paths: set[str], - local_type_locations: dict[str, str] = {}, - shared_type_locations: dict[str, str] = {}, - require_paths_exist: bool = False, - ) -> list[AssetRootGroup]: - """ - For the given input paths and output paths, a list of groups is returned, where paths sharing - the same root path are grouped together. Note that paths can be files or directories. - - The returned list satisfies the following conditions: - - If a path is relative to any of the paths in the given `shared_type_locations` paths, it is - excluded from the list. - - The given `local_type_locations` paths can each form a group based on its root path. In other - words, if there are paths relative to any of the `local_type_locations` paths, they are grouped - together as one. - - The referenced paths may have no files or directories associated, but they always live - relative to one of the AssetRootGroup objects returned. - """ - groupings: dict[str, AssetRootGroup] = {} - missing_input_paths = set() - misconfigured_directories = set() - - # Resolve full path, then cast to pure path to get top-level directory - for _path in input_paths: - # Need to use absolute to not resolve symlinks, but need normpath to get rid of relative paths, i.e. '..' - abs_path = Path(os.path.normpath(Path(_path).absolute())) - if not self._stat_cache.exists(abs_path): - if require_paths_exist: - missing_input_paths.add(abs_path) - else: - logger.warning( - f"Input path '{_path}' resolving to '{abs_path}' does not exist. Adding to referenced paths." 
- ) - referenced_paths.add(_path) - continue - if self._stat_cache.is_dir(abs_path): - misconfigured_directories.add(abs_path) - continue - - # Skips the upload if the path is relative to any of the File System Location - # of SHARED type that was set in the Job. - if any(_is_relative_to(abs_path, shared) for shared in shared_type_locations): - continue - - # If the path is relative to any of the File System Location of LOCAL type, - # groups the files into a single group using the path of that location. - matched_root = self._find_matched_root_from_local_type_locations( - groupings=groupings, - abs_path=abs_path, - local_type_locations=local_type_locations, - ) - matched_group = self._get_matched_group(matched_root, groupings) - matched_group.inputs.add(abs_path) - - if missing_input_paths or misconfigured_directories: - all_misconfigured_inputs = "" - misconfigured_inputs_msg = ( - "Job submission contains missing input files or directories specified as files." - " All inputs must exist and be classified properly." - ) - if missing_input_paths: - missing_inputs_list: list[str] = sorted([str(i) for i in missing_input_paths]) - all_missing_inputs = "\n\t".join(missing_inputs_list) - all_misconfigured_inputs += f"\nMissing input files:\n\t{all_missing_inputs}" - if misconfigured_directories: - misconfigured_directories_list: list[str] = sorted( - [str(d) for d in misconfigured_directories] - ) - all_misconfigured_directories = "\n\t".join(misconfigured_directories_list) - all_misconfigured_inputs += ( - f"\nDirectories classified as files:\n\t{all_misconfigured_directories}" - ) - raise MisconfiguredInputsError(misconfigured_inputs_msg + all_misconfigured_inputs) - - for _path in output_paths: - abs_path = Path(os.path.normpath(Path(_path).absolute())) - - # Skips the upload if the path is relative to any of the File System Location - # of SHARED type that was set in the Job. - if any(_is_relative_to(abs_path, shared) for shared in shared_type_locations): - continue - - # If the path is relative to any of the File System Location of LOCAL type, - # groups the files into a single group using the path of that location. - matched_root = self._find_matched_root_from_local_type_locations( - groupings=groupings, - abs_path=abs_path, - local_type_locations=local_type_locations, - ) - matched_group = self._get_matched_group(matched_root, groupings) - matched_group.outputs.add(abs_path) - - for _path in referenced_paths: - abs_path = Path(os.path.normpath(Path(_path).absolute())) - - # Skips the reference if the path is relative to any of the File System Location - # of SHARED type that was set in the Job. - if any(_is_relative_to(abs_path, shared) for shared in shared_type_locations): - continue - # If the path is relative to any of the File System Location of LOCAL type, - # groups the references into a single group using the path of that location. 
- matched_root = self._find_matched_root_from_local_type_locations( - groupings=groupings, - abs_path=abs_path, - local_type_locations=local_type_locations, - ) - matched_group = self._get_matched_group(matched_root, groupings) - matched_group.references.add(abs_path) - - # Finally, build the list of asset root groups - for asset_group in groupings.values(): - common_path: Path = Path( - os.path.commonpath( - list(asset_group.inputs | asset_group.outputs | asset_group.references) - ) - ) - if common_path.is_file(): - common_path = common_path.parent - asset_group.root_path = str(common_path) - - return sorted(groupings.values(), key=lambda v: (v.root_path, v.file_system_location_name)) - - def _get_matched_group( - self, root_path: str, groupings: dict[str, AssetRootGroup] - ) -> AssetRootGroup: - root_normcase = os.path.normcase(root_path) - matched_group = next( - (group for key, group in groupings.items() if os.path.normcase(key) == root_normcase), - None, - ) - if matched_group is None: - raise ValueError( - f"No group found for the root path '{root_path}' in the groupings dictionary: {groupings}" - ) - return matched_group - - def _find_matched_root_from_local_type_locations( - self, - groupings: dict[str, AssetRootGroup], - abs_path: Path, - local_type_locations: dict[str, str] = {}, - ) -> str: - """ - Checks if the given `abs_path` is relative to any of the File System Locations of LOCAL type. - If it is, select the most specific File System Location, and add a new grouping keyed by that - matched root path (if the key does not exist.) Then, returns the matched root path. - If no match is found, returns the top directory of `abs_path` as the key used for grouping. - """ - matched_root = None - for root_path in local_type_locations.keys(): - if _is_relative_to(abs_path, root_path) and ( - matched_root is None or len(root_path) > len(matched_root) - ): - matched_root = root_path - - if matched_root is not None: - if matched_root not in groupings: - groupings[matched_root] = AssetRootGroup( - file_system_location_name=local_type_locations[matched_root], - ) - return matched_root - else: - keys_normcase = [os.path.normcase(key) for key in groupings.keys()] - top_directory = PurePath(abs_path).parts[0] - top_directory_normcase = os.path.normcase(top_directory) - if top_directory_normcase not in keys_normcase: - groupings[top_directory] = AssetRootGroup() - else: - return top_directory_normcase - return top_directory - - def _get_total_size_of_files(self, paths: list[str]) -> int: - def get_file_size(path_str: str) -> int: - return self._stat_cache.get_size(Path(path_str)) - - with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor: - sizes = list(executor.map(get_file_size, paths)) - - return sum(sizes) - - def _get_total_input_size_from_manifests( - self, manifests: list[AssetRootManifest] - ) -> tuple[int, int]: - total_files = 0 - total_bytes = 0 - for asset_root_manifest in manifests: - if asset_root_manifest.asset_manifest: - total_files += len(asset_root_manifest.asset_manifest.paths) - for path in asset_root_manifest.asset_manifest.paths: - total_bytes += path.size - return (total_files, total_bytes) - - def _get_total_input_size_from_asset_group( - self, groups: list[AssetRootGroup] - ) -> tuple[int, int]: - total_files = 0 - total_bytes = 0 - for group in groups: - input_paths = [str(input) for input in group.inputs] - total_bytes += self._get_total_size_of_files(input_paths) - total_files += len(input_paths) - return (total_files, total_bytes) - - def 
_get_file_system_locations_by_type( - self, - storage_profile_for_queue: StorageProfile, - ) -> Tuple[dict, dict]: - """ - Given the Storage Profile for Queue object, extracts and groups - path and name pairs from the File System Locations into two dicts, - LOCAL and SHARED type, respectively. Returns a tuple of two dicts. - """ - local_type_locations: dict[str, str] = {} - shared_type_locations: dict[str, str] = {} - for fs_loc in storage_profile_for_queue.fileSystemLocations: - if fs_loc.type == FileSystemLocationType.LOCAL: - local_type_locations[fs_loc.path] = fs_loc.name - elif fs_loc.type == FileSystemLocationType.SHARED: - shared_type_locations[fs_loc.path] = fs_loc.name - return local_type_locations, shared_type_locations - - def _group_asset_paths( - self, - input_paths: list[str], - output_paths: list[str], - referenced_paths: list[str], - storage_profile: Optional[StorageProfile], - require_paths_exist: bool, - ) -> list[AssetRootGroup]: - """ - Resolves all of the paths that will be uploaded, sorting by storage profile location. - """ - local_type_locations: dict[str, str] = {} - shared_type_locations: dict[str, str] = {} - if storage_profile: - ( - local_type_locations, - shared_type_locations, - ) = self._get_file_system_locations_by_type(storage_profile) - - # Group the paths by asset root, removing duplicates and empty strings - asset_groups: list[AssetRootGroup] = self._get_asset_groups( - {ip_path for ip_path in input_paths if ip_path}, - {op_path for op_path in output_paths if op_path}, - {rf_path for rf_path in referenced_paths if rf_path}, - local_type_locations, - shared_type_locations, - require_paths_exist, - ) - - return asset_groups - - def prepare_paths_for_upload( - self, - input_paths: list[str], - output_paths: list[str], - referenced_paths: list[str], - storage_profile: Optional[StorageProfile] = None, - require_paths_exist: bool = False, - ) -> AssetUploadGroup: - """ - Processes all of the paths required for upload, grouping them by asset root and local storage profile locations. - Returns an object containing the grouped paths, which also includes a dictionary of input directories and file counts - for files that were not under the root path or any local storage profile locations. - """ - asset_groups = self._group_asset_paths( - input_paths, - output_paths, - referenced_paths, - storage_profile, - require_paths_exist, - ) - input_file_count, input_bytes = self._get_total_input_size_from_asset_group(asset_groups) - return AssetUploadGroup( - asset_groups=asset_groups, - total_input_files=input_file_count, - total_input_bytes=input_bytes, - ) - - def hash_assets_and_create_manifest( - self, - asset_groups: list[AssetRootGroup], - total_input_files: int, - total_input_bytes: int, - hash_cache_dir: Optional[str] = None, - on_preparing_to_submit: Optional[Callable[[Any], bool]] = None, - ) -> tuple[SummaryStatistics, list[AssetRootManifest]]: - """ - Computes the hashes for input files, and creates manifests using the local hash cache. - - Args: - hash_cache_dir: a path to local hash cache directory. If it's None, use default path. - on_preparing_to_submit: a callback to be called to periodically report progress to the caller. - The callback returns True if the operation should continue as normal, or False to cancel. - - Returns: - a tuple with (1) the summary statistics of the hash operation, and - (2) a list of AssetRootManifest (a manifest and output paths for each asset root). 
- """ - start_time = time.perf_counter() - - # Sets up progress tracker to report upload progress back to the caller. - progress_tracker = ProgressTracker( - status=ProgressStatus.PREPARING_IN_PROGRESS, - total_files=total_input_files, - total_bytes=total_input_bytes, - on_progress_callback=on_preparing_to_submit, - ) - - asset_root_manifests: list[AssetRootManifest] = [] - for group in asset_groups: - # Might have output directories, but no inputs for this group - asset_manifest: Optional[BaseAssetManifest] = None - if group.inputs: - # Create manifest, using local hash cache - with HashCache(hash_cache_dir) as hash_cache: - asset_manifest = self._create_manifest_file( - sorted(list(group.inputs)), - group.root_path, - hash_cache, - progress_tracker, - ) - - asset_root_manifests.append( - AssetRootManifest( - file_system_location_name=group.file_system_location_name, - root_path=group.root_path, - asset_manifest=asset_manifest, - outputs=sorted(list(group.outputs)), - ) - ) - - progress_tracker.total_time = time.perf_counter() - start_time - - return (progress_tracker.get_summary_statistics(), asset_root_manifests) - - def upload_assets( - self, - manifests: list[AssetRootManifest], - on_uploading_assets: Optional[Callable[[Any], bool]] = None, - s3_check_cache_dir: Optional[str] = None, - manifest_write_dir: Optional[str] = None, - force_s3_check: Optional[bool] = None, - ) -> tuple[SummaryStatistics, Attachments]: - """ - Uploads all the files for provided manifests and manifests themselves to S3. - - Args: - manifests: a list of manifests that contain assets to be uploaded - on_uploading_assets: a callback to be called to periodically report progress to the caller. - The callback returns True if the operation should continue as normal, or False to cancel. - force_s3_check: Controls S3 verification behavior: - - True: Skip the S3 check cache, always check whether uploads are already in S3. - - False/None: Use the S3 check cache, with periodic integrity sampling against S3 (default) - - Returns: - a tuple with (1) the summary statistics of the upload operation, and - (2) the S3 path to the asset manifest file. - """ - # This is a programming error if the user did not construct the object with Farm and Queue IDs. - if not self.farm_id or not self.queue_id: - logger.error("upload_assets: Farm or Fleet ID is missing.") - raise JobAttachmentsError("upload_assets: Farm or Fleet ID is missing.") - - # Sets up progress tracker to report upload progress back to the caller. 
- input_files, input_bytes = self._get_total_input_size_from_manifests(manifests) - progress_tracker = ProgressTracker( - status=ProgressStatus.UPLOAD_IN_PROGRESS, - total_files=input_files, - total_bytes=input_bytes, - on_progress_callback=on_uploading_assets, - ) - - start_time = time.perf_counter() - - manifest_properties_list: list[ManifestProperties] = [] - - for asset_root_manifest in manifests: - output_rel_paths: list[str] = [ - str(path.relative_to(asset_root_manifest.root_path)) - for path in asset_root_manifest.outputs - ] - - manifest_properties = ManifestProperties( - fileSystemLocationName=asset_root_manifest.file_system_location_name, - rootPath=asset_root_manifest.root_path, - rootPathFormat=PathFormat.get_host_path_format(), - outputRelativeDirectories=output_rel_paths, - ) - - if asset_root_manifest.asset_manifest: - partial_manifest_key, asset_manifest_hash = self.asset_uploader.upload_assets( - job_attachment_settings=self.job_attachment_settings, # type: ignore[arg-type] - manifest=asset_root_manifest.asset_manifest, - partial_manifest_prefix=self.job_attachment_settings.partial_manifest_prefix( # type: ignore[union-attr] - self.farm_id, self.queue_id - ), - source_root=Path(asset_root_manifest.root_path), - file_system_location_name=asset_root_manifest.file_system_location_name, - progress_tracker=progress_tracker, - s3_check_cache_dir=s3_check_cache_dir, - manifest_write_dir=manifest_write_dir, - force_s3_check=force_s3_check, - ) - manifest_properties.inputManifestPath = partial_manifest_key - manifest_properties.inputManifestHash = asset_manifest_hash - - manifest_properties_list.append(manifest_properties) - - logger.debug("Asset manifests - locations in S3:") - logger.debug( - "\n".join( - filter( - None, - ( - manifest_properties.inputManifestPath - for manifest_properties in manifest_properties_list - ), - ) - ) - ) - - progress_tracker.total_time = time.perf_counter() - start_time - - return ( - progress_tracker.get_summary_statistics(), - Attachments(manifests=manifest_properties_list), - ) - - def snapshot_assets( - self, - snapshot_dir: str, - manifests: list[AssetRootManifest], - on_snapshotting_assets: Optional[Callable[[Any], bool]] = None, - ) -> tuple[SummaryStatistics, Attachments]: - """ - Copies all the files for provided manifests and manifests themselves into a snapshot directory - that matches the layout of a job attachments prefix in S3. - - Args: - snapshot_dir: A directory in which to place the snapshot. Data and manifest files will go in Data - and Manifest subdirectories, respectively. - manifests: A list of manifests that contain assets to be uploaded - on_snapshotting_assets: A callback to be called to periodically report progress to the caller. - The callback must return True if the operation should continue as normal, or False to cancel. - - Returns: - a tuple with (1) the summary statistics of the upload operation, and - (2) the S3 path to the asset manifest file. - """ - # This is a programming error if the user did not construct the object with Farm and Queue IDs. - if not self.farm_id or not self.queue_id: - logger.error("snapshot_assets: Farm or Fleet ID is missing.") - raise JobAttachmentsError("snapshot_assets: Farm or Fleet ID is missing.") - - # Sets up progress tracker to report upload progress back to the caller. 
- input_files, input_bytes = self._get_total_input_size_from_manifests(manifests) - progress_tracker = ProgressTracker( - status=ProgressStatus.SNAPSHOT_IN_PROGRESS, - total_files=input_files, - total_bytes=input_bytes, - on_progress_callback=on_snapshotting_assets, - ) - - start_time = time.perf_counter() - - manifest_properties_list: list[ManifestProperties] = [] - - for asset_root_manifest in manifests: - output_rel_paths: list[str] = [ - str(path.relative_to(asset_root_manifest.root_path)) - for path in asset_root_manifest.outputs - ] - - manifest_properties = ManifestProperties( - fileSystemLocationName=asset_root_manifest.file_system_location_name, - rootPath=asset_root_manifest.root_path, - rootPathFormat=PathFormat.get_host_path_format(), - outputRelativeDirectories=output_rel_paths, - ) - - if asset_root_manifest.asset_manifest: - partial_manifest_key, asset_manifest_hash = self.asset_uploader._snapshot_assets( - snapshot_dir=Path(snapshot_dir), - manifest=asset_root_manifest.asset_manifest, - partial_manifest_prefix=self.job_attachment_settings.partial_manifest_prefix( # type: ignore[union-attr] - self.farm_id, self.queue_id - ), - source_root=Path(asset_root_manifest.root_path), - file_system_location_name=asset_root_manifest.file_system_location_name, - progress_tracker=progress_tracker, - ) - manifest_properties.inputManifestPath = partial_manifest_key - manifest_properties.inputManifestHash = asset_manifest_hash - - manifest_properties_list.append(manifest_properties) - - progress_tracker.total_time = time.perf_counter() - start_time - - return ( - progress_tracker.get_summary_statistics(), - Attachments(manifests=manifest_properties_list), - ) diff --git a/src/deadline/job_attachments/vfs.py b/src/deadline/job_attachments/vfs.py deleted file mode 100644 index 92565bdb2..000000000 --- a/src/deadline/job_attachments/vfs.py +++ /dev/null @@ -1,531 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
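The vfs.py module removed below managed the lifecycle of a `deadline_vfs` FUSE mount on the worker. As a rough, illustrative sketch of how `VFSProcessManager` was typically driven (all argument values below are placeholders, not taken from this change):

```python
from pathlib import Path

from deadline.job_attachments.vfs import VFSProcessManager

session_dir = Path("/tmp/session-1234")  # placeholder session directory

manager = VFSProcessManager(
    asset_bucket="my-job-attachments-bucket",  # placeholder bucket name
    region="us-west-2",
    manifest_path="/tmp/session-1234/.vfs_manifests/manifest.json",  # placeholder
    mount_point="/tmp/session-1234/assetroot",
    os_user="jobuser",
    os_env_vars={"PATH": "/usr/bin"},
    cas_prefix="DeadlineCloud/Data",  # placeholder CAS prefix
)

# start() launches the VFS subprocess via sudo, waits for the mount to appear,
# and records "<mount_point>:<pid>:<manifest_path>" in the session's vfs_pids.txt.
manager.start(session_dir)

# ... run the job's tasks against the mounted asset root ...

# Tear down every mount recorded for this session once outputs are uploaded.
VFSProcessManager.kill_all_processes(session_dir, os_user="jobuser")
```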
- -import logging -import os -import shutil -import subprocess -import time -from pathlib import Path -import threading -from typing import Callable, Dict, Union, Optional - -from .exceptions import ( - VFSExecutableMissingError, - VFSFailedToMountError, - VFSLaunchScriptMissingError, - VFSRunPathNotSetError, -) - -from .os_file_permission import PosixFileSystemPermissionSettings - -log = logging.getLogger(__name__) - -DEADLINE_VFS_ENV_VAR = "DEADLINE_VFS_PATH" -DEADLINE_VFS_CACHE_ENV_VAR = "DEADLINE_VFS_CACHE" -DEADLINE_VFS_EXECUTABLE = "deadline_vfs" -DEADLINE_VFS_INSTALL_PATH = "/opt/deadline_vfs" -DEADLINE_VFS_EXECUTABLE_SCRIPT = "/scripts/production/al2/run_deadline_vfs_al2.sh" - -DEADLINE_VFS_PID_FILE_NAME = "vfs_pids.txt" -DEADLINE_MANIFEST_GROUP_READ_PERMS = 0o640 - -VFS_CACHE_REL_PATH_IN_SESSION = ".vfs_object_cache" -VFS_MANIFEST_FOLDER_IN_SESSION = ".vfs_manifests" -VFS_LOGS_FOLDER_IN_SESSION = ".vfs_logs" - -VFS_MANIFEST_FOLDER_PERMISSIONS = PosixFileSystemPermissionSettings( - os_user="", - os_group="", - dir_mode=0o31, - file_mode=0o64, -) - - -class VFSProcessManager(object): - exe_path: Optional[str] = None - launch_script_path: Optional[str] = None - library_path: Optional[str] = None - cwd_path: Optional[str] = None - - _mount_point: str - _vfs_proc: Optional[subprocess.Popen] - _vfs_thread: Optional[threading.Thread] - _mount_temp_directory: Optional[str] - _run_path: Optional[Union[os.PathLike, str]] - _asset_bucket: str - _region: str - _manifest_path: str - _os_user: str - _os_env_vars: Dict[str, str] - _os_group: Optional[str] - _cas_prefix: Optional[str] - _asset_cache_path: Optional[str] - - def __init__( - self, - asset_bucket: str, - region: str, - manifest_path: str, - mount_point: str, - os_user: str, - os_env_vars: Dict[str, str], - os_group: Optional[str] = None, - cas_prefix: Optional[str] = None, - asset_cache_path: Optional[str] = None, - on_mount_complete: Optional[Callable[[bool], None]] = None, - ): - self._mount_point = mount_point - self._vfs_proc = None - self._vfs_thread = None - self._mount_temp_directory = None - self._run_path = None - self._asset_bucket = asset_bucket - self._region = region - self._manifest_path = manifest_path - self._os_user = os_user - self._os_group = os_group - self._os_env_vars = os_env_vars - self._cas_prefix = cas_prefix - self._asset_cache_path = asset_cache_path - self._on_mount_complete = on_mount_complete - - @classmethod - def kill_all_processes(cls, session_dir: Path, os_user: str) -> None: - """ - Kill all existing VFS processes when outputs have been uploaded. - :param session_dir: tmp directory for session - :param os_user: the user running the job. - """ - log.info("Terminating all VFS processes.") - try: - pid_file_path = (session_dir / DEADLINE_VFS_PID_FILE_NAME).resolve() - with open(pid_file_path, "r") as file: - for line in file.readlines(): - line = line.strip() - mount_point, _, _ = line.split(":") - cls.shutdown_libfuse_mount(mount_point, os_user, session_dir) - os.remove(pid_file_path) - except FileNotFoundError: - log.warning(f"VFS pid file not found at {pid_file_path}") - - @classmethod - def get_shutdown_args(cls, mount_path: str, os_user: str): - """ - Return the argument list to provide the subprocess run command to shut down the mount - :param mount_path: path to mounted folder - :param os_user: the user running the job. 
- """ - fusermount3_path = os.path.join(cls.find_vfs_link_dir(), "fusermount3") - if not os.path.exists(fusermount3_path): - log.warning(f"fusermount3 not found at {cls.find_vfs_link_dir()}") - return None - return ["sudo", "-u", os_user, fusermount3_path, "-u", mount_path] - - @classmethod - def shutdown_libfuse_mount(cls, mount_path: str, os_user: str, session_dir: Path) -> bool: - """ - Shut down the mount at the provided path using the fusermount3 unmount option - as the provided user - :param mount_path: path to mounted folder - """ - log.info(f"Attempting to shut down {mount_path} as {os_user}") - shutdown_args = cls.get_shutdown_args(mount_path, os_user) - if not shutdown_args: - return False - try: - run_result = subprocess.run(shutdown_args, check=True) - except subprocess.CalledProcessError as e: - log.warning(f"Shutdown failed with error {e}") - # Don't reraise, check if mount is gone - log.info(f"Shutdown returns {run_result.returncode}") - return cls.wait_for_mount(mount_path, session_dir, expected=False) - - @classmethod - def kill_process_at_mount(cls, session_dir: Path, mount_point: str, os_user: str) -> bool: - """ - Kill the VFS instance running at the given mount_point and modify the VFS pid tracking - file to remove the entry. - - :param session_dir: tmp directory for session - :param mount_point: local directory to search for - :param os_user: user to attempt shut down as - """ - if not cls.is_mount(mount_point): - log.info(f"{mount_point} is not a mount, returning") - return False - log.info(f"Terminating deadline_vfs processes at {mount_point}.") - mount_point_found: bool = False - try: - pid_file_path = (session_dir / DEADLINE_VFS_PID_FILE_NAME).resolve() - with open(pid_file_path, "r") as file: - lines = file.readlines() - with open(pid_file_path, "w") as file: - for line in lines: - line = line.strip() - if mount_point_found: - file.write(line) - else: - mount_for_pid, _, _ = line.split(":") - if mount_for_pid == mount_point: - cls.shutdown_libfuse_mount(mount_point, os_user, session_dir) - mount_point_found = True - else: - file.write(line) - except FileNotFoundError: - log.warning(f"VFS pid file not found at {pid_file_path}") - return False - - return mount_point_found - - @classmethod - def get_manifest_path_for_mount(cls, session_dir: Path, mount_point: str) -> Optional[Path]: - """ - Given a mount_point this searches the pid file for the associated manifest path. 
- - :param session_dir: tmp directory for session - :param mount_point: local directory associated with the desired manifest - - :returns: Path to the manifest file for mount if there is one - """ - try: - pid_file_path = (session_dir / DEADLINE_VFS_PID_FILE_NAME).resolve() - with open(pid_file_path, "r") as file: - for line in file.readlines(): - line = line.strip() - mount_for_pid, _, manifest_path = line.split(":") - if mount_for_pid == mount_point: - if os.path.exists(manifest_path): - return Path(manifest_path) - else: - log.warning(f"Expected VFS input manifest at {manifest_path}") - return None - except FileNotFoundError: - log.warning(f"VFS pid file not found at {pid_file_path}") - - log.warning(f"No manifest found for mount {mount_point}") - return None - - @classmethod - def is_mount(cls, path) -> bool: - """ - os.path.ismount returns false for libfuse mounts owned by "other users", - use findmnt instead - """ - return subprocess.run(["findmnt", path]).returncode == 0 - - @classmethod - def wait_for_mount(cls, mount_path, session_dir, mount_wait_seconds=60, expected=True) -> bool: - """ - After we've launched the VFS subprocess we need to wait - for the OS to validate that the mount is in place before use - :param mount_path: Path to mount to watch for - :param session_dir: Session folder associated with mount - :param mount_wait_seconds: Duration to wait for mount state - :param expected: Wait for the mount to exist or no longer exist - """ - log.info(f"Waiting for is_mount at {mount_path} to return {expected}..") - wait_seconds = mount_wait_seconds - while wait_seconds >= 0: - if cls.is_mount(mount_path) == expected: - log.info(f"is_mount on {mount_path} returns {expected}, returning") - return True - wait_seconds -= 1 - if wait_seconds >= 0: - log.info(f"is_mount on {mount_path} not {expected}, sleeping...") - time.sleep(1) - log.info(f"Failed to find is_mount {expected} at {mount_path} after {mount_wait_seconds}") - cls.print_log_end(session_dir) - return False - - @classmethod - def logs_folder_path(cls, session_dir: Path) -> Union[os.PathLike, str]: - """ - Find the folder we expect VFS logs to be written to - """ - return session_dir / VFS_LOGS_FOLDER_IN_SESSION - - def get_logs_folder(self) -> Union[os.PathLike, str]: - """ - Find the folder we expect VFS logs to be written to - """ - if self._run_path: - return self.logs_folder_path(Path(self._run_path)) - raise VFSRunPathNotSetError("Attempted to find logs folder without run path") - - @classmethod - def print_log_end( - self, session_dir: Path, log_file_name="vfs_log.txt", lines=100, log_level=logging.WARNING - ): - """ - Print out the end of our VFS Log. Reads the full log file into memory. Our VFS logs are size - capped so this is not an issue for the intended use case. - :param session_dir: Session folder for mount - :param log_file_name: Name of file within the logs folder to read from. Defaults to vfs_log.txt which - is our "most recent" log file. 
- :param lines: Maximum number of lines from the end of the log to print - :param log_level: Level to print logging as - """ - log_file_path = self.logs_folder_path(session_dir) / log_file_name - log.log(log_level, f"Printing last {lines} lines from {log_file_path}") - if not os.path.exists(log_file_path): - log.warning(f"No log file found at {log_file_path}") - return - with open(log_file_path, "r") as log_file: - for this_line in log_file.readlines()[lines * -1 :]: - log.log(log_level, this_line) - - @classmethod - def find_vfs_link_dir(cls) -> str: - """ - Get the path where links to any necessary executables which should be added to the path should live - :returns: Path to the link folder - """ - return os.path.join(os.path.dirname(VFSProcessManager.find_vfs()), "..", "link") - - def build_launch_command(self, mount_point: Union[os.PathLike, str]) -> str: - """ - Build command to pass to Popen to launch VFS - :param mount_point: directory to mount which must be the first parameter seen by our executable - :return: command - """ - executable = VFSProcessManager.find_vfs_launch_script() - - command = ( - f"sudo -E -u {self._os_user}" - f" {executable} {mount_point} -f --clienttype=deadline" - f" --bucket={self._asset_bucket}" - f" --manifest={self._manifest_path}" - f" --region={self._region}" - f" -oallow_other" - ) - if self._cas_prefix is not None: - command += f" --casprefix={self._cas_prefix}" - if self._asset_cache_path is not None: - command += f" --cachedir={self._asset_cache_path}" - - log.info(f"Got launch command {command}") - return command - - @classmethod - def find_vfs_launch_script(cls) -> Union[os.PathLike, str]: - """ - Determine where the VFS launch script lives so we can build the launch command - :return: Path to VFS launch script - """ - if VFSProcessManager.launch_script_path is not None: - log.info(f"Using saved path {VFSProcessManager.launch_script_path} for launch script") - return VFSProcessManager.launch_script_path - - exe = DEADLINE_VFS_EXECUTABLE - # for exe in executables: - log.info(f"Searching for {exe} launch script") - exe_script = DEADLINE_VFS_EXECUTABLE_SCRIPT - # Look for env var to construct script path - if DEADLINE_VFS_ENV_VAR in os.environ: - log.info(f"{DEADLINE_VFS_ENV_VAR} found in environment") - environ_check = os.environ[DEADLINE_VFS_ENV_VAR] + exe_script - else: - log.warning(f"{DEADLINE_VFS_ENV_VAR} not found in environment") - environ_check = DEADLINE_VFS_INSTALL_PATH + exe_script - # Test if script path exists - if os.path.exists(environ_check): - log.info(f"Environ check found {exe} launch script at {environ_check}") - VFSProcessManager.launch_script_path = environ_check - return environ_check # type: ignore[return-value] - else: - log.error(f"Failed to find {exe} launch script!") - - log.error("Failed to find both executables scripts!") - raise VFSLaunchScriptMissingError - - @classmethod - def find_vfs(cls) -> Union[os.PathLike, str]: - """ - Determine where the VFS executable we'll be launching lives so we can - find the correct relative paths around it for LD_LIBRARY_PATH and config files - :return: Path to VFS executable - """ - if VFSProcessManager.exe_path is not None: - log.info(f"Using saved path {VFSProcessManager.exe_path}") - return VFSProcessManager.exe_path - - exe = DEADLINE_VFS_EXECUTABLE - # Use "which deadline_vfs" by default to find the executable location - found_path = shutil.which(exe) - if found_path is None: - log.info(f"Cwd when finding {exe} is {os.getcwd()}") - # If VFS executable isn't on the PATH, check if 
environment variable is set - if DEADLINE_VFS_ENV_VAR in os.environ: - log.info(f"{DEADLINE_VFS_ENV_VAR} set to {os.environ[DEADLINE_VFS_ENV_VAR]}") - environ_check = os.environ[DEADLINE_VFS_ENV_VAR] + f"/bin/{exe}" - else: - log.info(f"{DEADLINE_VFS_ENV_VAR} env var not set") - environ_check = DEADLINE_VFS_INSTALL_PATH + f"/bin/{exe}" - if os.path.exists(environ_check): - log.info(f"Environ check found {exe} at {environ_check}") - found_path = environ_check - else: - # Last attempt looks for deadline_vfs in bin - bin_check = os.path.join(os.getcwd(), f"bin/{exe}") - if os.path.exists(bin_check): - log.info(f"Bin check found VFS at {bin_check}") - found_path = bin_check - else: - log.error(f"Failed to find {exe}!") - - # Run final check to see if exe path was found - if found_path is not None: - log.info(f"Found {exe} at {found_path}") - VFSProcessManager.exe_path = found_path - return found_path # type: ignore[return-value] - - log.error("Failed to find both executables!") - raise VFSExecutableMissingError - - @classmethod - def get_library_path(cls) -> Union[os.PathLike, str]: - """ - Find our library dependencies which should be at ../lib relative to our executable - """ - if VFSProcessManager.library_path is None: - exe_path = VFSProcessManager.find_vfs() - VFSProcessManager.library_path = os.path.normpath( - os.path.join(os.path.dirname(exe_path), "../lib") - ) - log.info(f"Using library path {VFSProcessManager.library_path}") - return VFSProcessManager.library_path - - def get_file_path(self, relative_file_name: str) -> Union[os.PathLike, str]: - return os.path.join(self._mount_point, relative_file_name) - - @classmethod - def create_mount_point(cls, mount_point: Union[os.PathLike, str]) -> None: - """ - By default fuse won't create our mount folder, create it if it doesn't exist - """ - if os.path.exists(mount_point) is False: - log.info(f"Creating mount point at {mount_point}") - os.makedirs(mount_point, exist_ok=True) - log.info(f"Modifying permissions of mount point at {mount_point}") - os.chmod(path=mount_point, mode=0o777) - - @classmethod - def get_cwd(cls) -> Union[os.PathLike, str]: - """ - Determine the cwd we should hand to Popen. - We expect a config/logging.ini file to exist relative to this folder. - """ - if VFSProcessManager.cwd_path is None: - exe_path = VFSProcessManager.find_vfs() - # Use cwd one folder up from bin - VFSProcessManager.cwd_path = os.path.normpath( - os.path.join(os.path.dirname(exe_path), "..") - ) - return VFSProcessManager.cwd_path - - def get_launch_environ(self) -> dict: - """ - Get the environment variables we'll pass to the launch command. 
- :returns: dictionary of default environment variables with VFS changes applied - """ - my_env = {**self._os_env_vars} - my_env["PATH"] = f"{VFSProcessManager.find_vfs_link_dir()}{os.pathsep}{os.environ['PATH']}" - my_env["LD_LIBRARY_PATH"] = VFSProcessManager.get_library_path() # type: ignore[assignment] - if os.environ.get(DEADLINE_VFS_CACHE_ENV_VAR) is not None: - my_env[DEADLINE_VFS_CACHE_ENV_VAR] = os.environ.get(DEADLINE_VFS_CACHE_ENV_VAR) # type: ignore[assignment] - - return my_env - - def set_manifest_owner(self) -> None: - """ - Set the manifest path to be owned by _os_user - """ - log.info( - f"Attempting to set group ownership on {self._manifest_path} for {self._os_user} to {self._os_group}" - ) - if not os.path.exists(self._manifest_path): - log.error(f"Manifest not found at {self._manifest_path}") - return - if self._os_group is not None: - try: - shutil.chown(self._manifest_path, group=self._os_group) - os.chmod(self._manifest_path, DEADLINE_MANIFEST_GROUP_READ_PERMS) - except OSError as e: - log.error(f"Failed to set ownership with error {e}") - raise - - def start(self, session_dir: Path) -> None: - """ - Start our VFS process - :return: VFS process id - """ - self._run_path = session_dir - log.info(f"Using run_path {self._run_path}") - log.info(f"Using mount_point {self._mount_point}") - self.set_manifest_owner() - VFSProcessManager.create_mount_point(self._mount_point) - start_command = self.build_launch_command(self._mount_point) - launch_env = self.get_launch_environ() - log.info(f"Launching VFS with command {start_command}") - log.info(f"Launching with environment {launch_env}") - log.info(f"Launching as user {self._os_user}") - - try: - - def read_output_thread(pipe, log): - # Runs in a thread to redirect VFS output into our log - try: - for line in pipe: - log.info(line.decode("utf-8").strip()) - except Exception: - log.exception("Error reading VFS output") - - self._vfs_proc = subprocess.Popen( - args=start_command, - stdout=subprocess.PIPE, # Create a new pipe - stderr=subprocess.STDOUT, # Merge stderr into the stdout pipe - cwd=str(self._run_path), - env=launch_env, - shell=True, - executable="/bin/bash", - ) - - self._vfs_thread = threading.Thread( - target=read_output_thread, args=[self._vfs_proc.stdout, log], daemon=True - ) - self._vfs_thread.start() - - except Exception as e: - log.exception(f"Exception during launch with command {start_command} exception {e}") - raise e - log.info(f"Launched VFS as pid {self._vfs_proc.pid}") - - is_mounted = VFSProcessManager.wait_for_mount(self.get_mount_point(), session_dir) - if self._on_mount_complete is not None: - self._on_mount_complete(is_mounted) - - if not is_mounted: - log.error("Failed to mount, shutting down") - raise VFSFailedToMountError - - try: - # if the pid file exists, add the new VFS instance and remove any it replaced - pid_file_path = (session_dir / DEADLINE_VFS_PID_FILE_NAME).resolve() - with open(pid_file_path, "r") as file: - lines = file.readlines() - with open(pid_file_path, "w") as file: - file.write(f"{self._mount_point}:{self._vfs_proc.pid}:{self._manifest_path}\n") - for line in lines: - line = line.strip() - entry_mount_point, entry_pid, entry_manifest_path = line.split(":") - if self._mount_point != entry_mount_point: - file.write(f"{line}\n") - else: - log.warning(f"Pid {entry_pid} entry not removed at {entry_mount_point}") - except FileNotFoundError: - # if the pid file doesn't exist, this will create it - with open(pid_file_path, "a") as file: - 
file.write(f"{self._mount_point}:{self._vfs_proc.pid}:{self._manifest_path}") - - def get_mount_point(self) -> Union[os.PathLike, str]: - return self._mount_point diff --git a/test/integ/deadline_job_attachments/__init__.py b/test/integ/deadline_job_attachments/__init__.py deleted file mode 100644 index 8d929cc86..000000000 --- a/test/integ/deadline_job_attachments/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/test/integ/deadline_job_attachments/conftest.py b/test/integ/deadline_job_attachments/conftest.py deleted file mode 100644 index 768c78d70..000000000 --- a/test/integ/deadline_job_attachments/conftest.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -import getpass -import sys - - -def is_windows_non_admin(): - return sys.platform == "win32" and getpass.getuser() != "Administrator" diff --git a/test/integ/deadline_job_attachments/test_data/inputs/scene.ma b/test/integ/deadline_job_attachments/test_data/inputs/scene.ma deleted file mode 100644 index b2bad0ef0..000000000 --- a/test/integ/deadline_job_attachments/test_data/inputs/scene.ma +++ /dev/null @@ -1 +0,0 @@ -this is a scene file \ No newline at end of file diff --git a/test/integ/deadline_job_attachments/test_data/inputs/textures/brick.png b/test/integ/deadline_job_attachments/test_data/inputs/textures/brick.png deleted file mode 100644 index 90584e648..000000000 --- a/test/integ/deadline_job_attachments/test_data/inputs/textures/brick.png +++ /dev/null @@ -1 +0,0 @@ -this is a brick png \ No newline at end of file diff --git a/test/integ/deadline_job_attachments/test_data/inputs/textures/cloth.png b/test/integ/deadline_job_attachments/test_data/inputs/textures/cloth.png deleted file mode 100644 index 5b9f8b5c5..000000000 --- a/test/integ/deadline_job_attachments/test_data/inputs/textures/cloth.png +++ /dev/null @@ -1 +0,0 @@ -this is a png of a cloth \ No newline at end of file diff --git a/test/integ/deadline_job_attachments/test_data/outputs/not_for_sync_outputs.txt b/test/integ/deadline_job_attachments/test_data/outputs/not_for_sync_outputs.txt deleted file mode 100644 index d6cfdd1fd..000000000 --- a/test/integ/deadline_job_attachments/test_data/outputs/not_for_sync_outputs.txt +++ /dev/null @@ -1 +0,0 @@ -Although it is in the output directory, it is actually an input file. It should be downloaded (to the worker's session working directory) during sync_inputs, and should not be captured as an output file when sync_outputs. \ No newline at end of file diff --git a/test/integ/deadline_job_attachments/test_job_attachments.py b/test/integ/deadline_job_attachments/test_job_attachments.py deleted file mode 100644 index ebb0aba11..000000000 --- a/test/integ/deadline_job_attachments/test_job_attachments.py +++ /dev/null @@ -1,1608 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
- -"""Integration tests for Job Attachments.""" - -import logging -import math -import os -import shutil -import time -from dataclasses import dataclass -from pathlib import Path, WindowsPath -from unittest.mock import MagicMock -import sys - -from deadline.job_attachments.models import JobAttachmentS3Settings -from deadline.job_attachments.download import WINDOWS_MAX_PATH_LENGTH -import boto3 -import pytest -from deadline_test_fixtures.job_attachment_manager import JobAttachmentManager -from pytest import LogCaptureFixture, TempPathFactory - -from deadline.job_attachments import asset_sync, download, upload -from deadline.job_attachments.asset_manifests import ( - ManifestVersion, - HashAlgorithm, - hash_data, - hash_file, -) -from deadline.job_attachments._aws.deadline import get_queue -from deadline.job_attachments.exceptions import AssetSyncError, JobAttachmentsS3ClientError -from deadline.job_attachments.models import ( - Attachments, - ManifestProperties, - PathFormat, -) -from deadline.job_attachments.progress_tracker import SummaryStatistics -from deadline.job_attachments._utils import ( - WINDOWS_UNC_PATH_STRING_PREFIX, - _get_unique_dest_dir_name, -) -from .conftest import is_windows_non_admin - - -def notifier_callback(progress: float, message: str) -> None: - pass - - -class JobAttachmentTest: - """ - Hold information used across all job attachment integration tests. - """ - - ASSET_ROOT = Path(__file__).parent / "test_data" - OUTPUT_PATH = ASSET_ROOT / "outputs" - INPUT_PATH = ASSET_ROOT / "inputs" - SCENE_MA_PATH = INPUT_PATH / "scene.ma" - SCENE_MA_HASH = hash_file(str(SCENE_MA_PATH), HashAlgorithm.XXH128) - BRICK_PNG_PATH = INPUT_PATH / "textures" / "brick.png" - CLOTH_PNG_PATH = INPUT_PATH / "textures" / "cloth.png" - INPUT_IN_OUTPUT_DIR_PATH = OUTPUT_PATH / "not_for_sync_outputs.txt" - FIRST_RENDER_OUTPUT_PATH = Path("outputs/render0000.exr") - SECOND_RENDER_OUTPUT_PATH = Path("outputs/render0001.exr") - MOV_FILE_OUTPUT_PATH = Path("outputs/end.mov") - - def __init__( - self, - deploy_job_attachment_resources: JobAttachmentManager, - tmp_path_factory: TempPathFactory, - manifest_version: ManifestVersion, - ): - """ - Sets up resource that these integration tests will need. 
- """ - self.job_attachment_resources = deploy_job_attachment_resources - - if self.job_attachment_resources.farm_id is None: - raise TypeError("The Farm ID was not properly retrieved when initializing resources.") - if ( - self.job_attachment_resources.queue is None - or self.job_attachment_resources.queue_with_no_settings is None - ): - raise TypeError("The Queues were not properly created when initializing resources.") - - self.farm_id = self.job_attachment_resources.farm_id - self.queue_id = self.job_attachment_resources.queue.id - self.queue_with_no_settings_id = self.job_attachment_resources.queue_with_no_settings.id - - self.bucket = boto3.resource("s3").Bucket(self.job_attachment_resources.bucket_name) - self.deadline_client = self.job_attachment_resources.deadline_client - - self.bucket_root_prefix = self.job_attachment_resources.bucket_root_prefix - self.hash_cache_dir = tmp_path_factory.mktemp("hash_cache") - self.s3_cache_dir = tmp_path_factory.mktemp("s3_check_cache") - self.session = boto3.Session() - self.deadline_endpoint = os.getenv( - "AWS_ENDPOINT_URL_DEADLINE", - f"https://deadline.{self.session.region_name}.amazonaws.com", - ) - - self.manifest_version = manifest_version - - -@pytest.fixture(scope="session", params=[ManifestVersion.v2023_03_03]) -def job_attachment_test( - deploy_job_attachment_resources: JobAttachmentManager, - tmp_path_factory: TempPathFactory, - request: pytest.FixtureRequest, -): - """ - Fixture to get the session's JobAttachmentTest object. - """ - - return JobAttachmentTest( - deploy_job_attachment_resources, tmp_path_factory, manifest_version=request.param - ) - - -@pytest.fixture(scope="session") -def upload_input_files_assets_not_in_cas(job_attachment_test: JobAttachmentTest): - """ - When no assets are in the CAS, make sure all files are uploaded. 
- """ - # IF - - job_attachment_settings = get_queue( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - if job_attachment_settings is None: - raise TypeError("Job attachment settings must be set for this test.") - - asset_manager = upload.S3AssetManager( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - job_attachment_settings=job_attachment_settings, - asset_manifest_version=job_attachment_test.manifest_version, - ) - - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_files = MagicMock(return_value=True) - - # WHEN - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=[str(job_attachment_test.SCENE_MA_PATH)], - output_paths=[str(job_attachment_test.OUTPUT_PATH)], - referenced_paths=[], - ) - (_, manifests) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=str(job_attachment_test.hash_cache_dir), - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - asset_manager.upload_assets( - manifests, - on_uploading_assets=mock_on_uploading_files, - s3_check_cache_dir=str(job_attachment_test.s3_cache_dir), - ) - - # THEN - scene_ma_s3_path = ( - f"{job_attachment_settings.full_cas_prefix()}/{job_attachment_test.SCENE_MA_HASH}.xxh128" - ) - - object_summary_iterator = job_attachment_test.bucket.objects.filter( - Prefix=scene_ma_s3_path, - ) - - assert list(object_summary_iterator)[0].key == scene_ma_s3_path - - -@dataclass -class UploadInputFilesOneAssetInCasOutputs: - attachments: Attachments - - -@pytest.fixture(scope="session") -def upload_input_files_one_asset_in_cas( - job_attachment_test: JobAttachmentTest, upload_input_files_assets_not_in_cas: None -) -> UploadInputFilesOneAssetInCasOutputs: - """ - Test that when one asset is already in the CAS, that every file except for the one in the CAS is uploaded. 
- """ - # IF - job_attachment_settings = get_queue( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - if job_attachment_settings is None: - raise Exception("Job attachment settings must be set for this test.") - - asset_manager = upload.S3AssetManager( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - job_attachment_settings=job_attachment_settings, - asset_manifest_version=job_attachment_test.manifest_version, - ) - - input_paths = [ - str(job_attachment_test.SCENE_MA_PATH), - str(job_attachment_test.BRICK_PNG_PATH), - str(job_attachment_test.CLOTH_PNG_PATH), - str(job_attachment_test.INPUT_IN_OUTPUT_DIR_PATH), - ] - - scene_ma_s3_path = ( - f"{job_attachment_settings.full_cas_prefix()}/{job_attachment_test.SCENE_MA_HASH}.xxh128" - ) - - # This file has already been uploaded - scene_ma_upload_time = job_attachment_test.bucket.Object(scene_ma_s3_path).last_modified - - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_files = MagicMock(return_value=True) - - # WHEN - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=input_paths, - output_paths=[str(job_attachment_test.OUTPUT_PATH)], - referenced_paths=[], - ) - (_, manifests) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=str(job_attachment_test.hash_cache_dir), - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - - (_, attachments) = asset_manager.upload_assets( - manifests, - on_uploading_assets=mock_on_uploading_files, - s3_check_cache_dir=str(job_attachment_test.s3_cache_dir), - ) - - # THEN - brick_png_hash = hash_file(str(job_attachment_test.BRICK_PNG_PATH), HashAlgorithm.XXH128) - cloth_png_hash = hash_file(str(job_attachment_test.CLOTH_PNG_PATH), HashAlgorithm.XXH128) - input_in_output_dir_hash = hash_file( - str(job_attachment_test.INPUT_IN_OUTPUT_DIR_PATH), HashAlgorithm.XXH128 - ) - - brick_png_s3_path = f"{job_attachment_settings.full_cas_prefix()}/{brick_png_hash}.xxh128" - cloth_png_s3_path = f"{job_attachment_settings.full_cas_prefix()}/{cloth_png_hash}.xxh128" - input_in_output_dir_s3_path = ( - f"{job_attachment_settings.full_cas_prefix()}/{input_in_output_dir_hash}.xxh128" - ) - - object_summary_iterator = job_attachment_test.bucket.objects.filter( - Prefix=f"{job_attachment_settings.full_cas_prefix()}/", - ) - - s3_objects = {obj.key: obj for obj in object_summary_iterator} - - assert {brick_png_s3_path, cloth_png_s3_path, input_in_output_dir_s3_path} <= set( - map(lambda x: x.key, object_summary_iterator) - ) - - assert brick_png_s3_path in s3_objects - assert cloth_png_s3_path in s3_objects - assert input_in_output_dir_s3_path in s3_objects - # Make sure that the file hasn't been modified/reuploaded - assert s3_objects[scene_ma_s3_path].last_modified == scene_ma_upload_time - - return UploadInputFilesOneAssetInCasOutputs(attachments) - - -@pytest.mark.integ -def test_upload_input_files_all_assets_in_cas( - job_attachment_test: JobAttachmentTest, - upload_input_files_one_asset_in_cas: UploadInputFilesOneAssetInCasOutputs, -) -> None: - """ - Test that when all assets are already in the CAS, that no files are uploaded. 
- """ - # IF - job_attachment_settings = get_queue( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - if job_attachment_settings is None: - raise Exception("Job attachment settings must be set for this test.") - - asset_manager = upload.S3AssetManager( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - job_attachment_settings=job_attachment_settings, - asset_manifest_version=job_attachment_test.manifest_version, - ) - - input_paths = [ - str(job_attachment_test.SCENE_MA_PATH), - str(job_attachment_test.BRICK_PNG_PATH), - str(job_attachment_test.CLOTH_PNG_PATH), - str(job_attachment_test.INPUT_IN_OUTPUT_DIR_PATH), - ] - - # This file has already been uploaded - asset_upload_time = { - obj.key: obj.last_modified - for obj in job_attachment_test.bucket.objects.filter( - Prefix=f"{job_attachment_settings.full_cas_prefix()}/" - ) - } - - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_files = MagicMock(return_value=True) - - # WHEN - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=input_paths, - output_paths=[str(job_attachment_test.OUTPUT_PATH)], - referenced_paths=[], - ) - (_, manifests) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=str(job_attachment_test.hash_cache_dir), - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - (_, attachments) = asset_manager.upload_assets( - manifests, - on_uploading_assets=mock_on_uploading_files, - s3_check_cache_dir=str(job_attachment_test.s3_cache_dir), - ) - - # THEN - - assert attachments.manifests[0].inputManifestPath is not None - - # Confirm nothing was uploaded - for obj in job_attachment_test.bucket.objects.filter( - Prefix=f"{job_attachment_settings.full_cas_prefix()}/" - ): - if ( - f"{attachments.manifests[0].inputManifestPath}" - == f"s3://{job_attachment_test.bucket.name}/{obj.key}" - ): - # Skip checking the manifest file - continue - - assert obj.last_modified == asset_upload_time[obj.key] - - -@dataclass -class SyncInputsOutputs: - session_dir: Path - dest_dir: Path - asset_syncer: asset_sync.AssetSync - attachments: Attachments - job_id: str - - -@pytest.fixture(scope="session") -def sync_inputs( - job_attachment_test: JobAttachmentTest, - upload_input_files_one_asset_in_cas: UploadInputFilesOneAssetInCasOutputs, - tmp_path_factory: TempPathFactory, - default_job_template: str, -) -> SyncInputsOutputs: - """ - Test that all of the input files get synced locally. 
- """ - # IF - job_attachment_settings = get_queue( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - job_response = job_attachment_test.deadline_client.create_job( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_id, - attachments=upload_input_files_one_asset_in_cas.attachments.to_dict(), # type: ignore - targetTaskRunStatus="SUSPENDED", - template=default_job_template, - templateType="JSON", - priority=50, - ) - - syncer = asset_sync.AssetSync(job_attachment_test.farm_id) - session_dir = tmp_path_factory.mktemp("session_dir") - - def on_downloading_files(*args, **kwargs): - return True - - # WHEN - syncer.sync_inputs( - job_attachment_settings, - upload_input_files_one_asset_in_cas.attachments, - job_attachment_test.queue_id, - job_response["jobId"], - session_dir, - on_downloading_files=on_downloading_files, - ) - - dest_dir = _get_unique_dest_dir_name(str(job_attachment_test.ASSET_ROOT)) - - # THEN - assert Path(session_dir / dest_dir / job_attachment_test.SCENE_MA_PATH).exists() - assert Path(session_dir / dest_dir / job_attachment_test.BRICK_PNG_PATH).exists() - assert Path(session_dir / dest_dir / job_attachment_test.CLOTH_PNG_PATH).exists() - assert Path(session_dir / dest_dir / job_attachment_test.INPUT_IN_OUTPUT_DIR_PATH).exists() - - return SyncInputsOutputs( - session_dir=session_dir, - dest_dir=Path(dest_dir), - asset_syncer=syncer, - attachments=upload_input_files_one_asset_in_cas.attachments, - job_id=job_response["jobId"], - ) - - -@dataclass -class SyncInputsNoJobAttachmentS3SettingsOutput: - job_id: str - asset_syncer: asset_sync.AssetSync - session_dir: Path - - -@pytest.fixture() -def sync_inputs_no_job_attachment_s3_settings( - job_attachment_test: JobAttachmentTest, - upload_input_files_one_asset_in_cas: UploadInputFilesOneAssetInCasOutputs, - tmp_path_factory: TempPathFactory, - default_job_template_one_task_one_step: str, - caplog: LogCaptureFixture, -) -> SyncInputsNoJobAttachmentS3SettingsOutput: - """ - Test that when there are no job attachment settings on a queue, the input sync is skipped. - """ - # IF - caplog.set_level(logging.INFO) - - job_response = job_attachment_test.deadline_client.create_job( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_with_no_settings_id, - attachments=upload_input_files_one_asset_in_cas.attachments.to_dict(), # type: ignore - targetTaskRunStatus="SUSPENDED", - template=default_job_template_one_task_one_step, - templateType="JSON", - priority=50, - ) - - syncer = asset_sync.AssetSync( - farm_id=job_attachment_test.farm_id, - boto3_session=job_attachment_test.session, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ) - session_dir = tmp_path_factory.mktemp("session_dir") - - def on_downloading_files(*args, **kwargs): - return True - - # WHEN - assert syncer.sync_inputs( - syncer.get_s3_settings( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_with_no_settings_id, - ), - syncer.get_attachments( - job_attachment_test.farm_id, - job_attachment_test.queue_with_no_settings_id, - job_response["jobId"], - ), - job_attachment_test.queue_with_no_settings_id, - job_response["jobId"], - session_dir, - on_downloading_files=on_downloading_files, - ) == (SummaryStatistics(), []) - - assert ( - "No Job Attachment settings configured for Queue " - f"{job_attachment_test.queue_with_no_settings_id}, no inputs to sync." 
in caplog.text - ) - - return SyncInputsNoJobAttachmentS3SettingsOutput( - job_id=job_response["jobId"], asset_syncer=syncer, session_dir=session_dir - ) - - -@dataclass -class SyncInputsNoJobAttachmentSettingsInJobOutput: - job_id: str - asset_syncer: asset_sync.AssetSync - session_dir: Path - - -@pytest.fixture() -def sync_inputs_no_job_attachment_settings_in_job( - job_attachment_test: JobAttachmentTest, - upload_input_files_one_asset_in_cas: UploadInputFilesOneAssetInCasOutputs, - tmp_path_factory: TempPathFactory, - default_job_template_one_task_one_step: str, - caplog: LogCaptureFixture, -) -> SyncInputsNoJobAttachmentSettingsInJobOutput: - """ - Test that when there are no job attachment settings on a job, the input sync is skipped. - """ - # IF - caplog.set_level(logging.INFO) - - job_response = job_attachment_test.deadline_client.create_job( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_id, - targetTaskRunStatus="SUSPENDED", - template=default_job_template_one_task_one_step, - templateType="JSON", - priority=50, - ) - - syncer = asset_sync.AssetSync( - farm_id=job_attachment_test.farm_id, - boto3_session=job_attachment_test.session, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ) - session_dir = tmp_path_factory.mktemp("session_dir") - - def on_downloading_files(*args, **kwargs): - return True - - # WHEN - assert syncer.sync_inputs( - syncer.get_s3_settings( - farm_id=job_attachment_test.farm_id, queue_id=job_attachment_test.queue_id - ), - syncer.get_attachments( - job_attachment_test.farm_id, job_attachment_test.queue_id, job_response["jobId"] - ), - job_attachment_test.queue_id, - job_response["jobId"], - session_dir, - on_downloading_files=on_downloading_files, - ) == (SummaryStatistics(), []) - - assert ( - f"No attachments configured for Job {job_response['jobId']}, no inputs to sync." - in caplog.text - ) - - return SyncInputsNoJobAttachmentSettingsInJobOutput( - job_id=job_response["jobId"], asset_syncer=syncer, session_dir=session_dir - ) - - -@pytest.mark.integ -def test_sync_outputs_no_job_attachment_settings_in_job( - job_attachment_test: JobAttachmentTest, - sync_inputs_no_job_attachment_settings_in_job: SyncInputsNoJobAttachmentSettingsInJobOutput, - caplog: LogCaptureFixture, -) -> None: - """ - Test that syncing outputs is skipped when the queue has no job attachment settings. 
- """ - # IF - caplog.set_level(logging.INFO) - - waiter = job_attachment_test.deadline_client.get_waiter("job_create_complete") - waiter.wait( - jobId=sync_inputs_no_job_attachment_settings_in_job.job_id, - queueId=job_attachment_test.queue_id, - farmId=job_attachment_test.farm_id, - ) - - step_id = job_attachment_test.deadline_client.list_steps( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_id, - jobId=sync_inputs_no_job_attachment_settings_in_job.job_id, - )["steps"][0]["stepId"] - - task_id = job_attachment_test.deadline_client.list_tasks( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_id, - jobId=sync_inputs_no_job_attachment_settings_in_job.job_id, - stepId=step_id, - )["tasks"][0]["taskId"] - - # WHEN - sync_inputs_no_job_attachment_settings_in_job.asset_syncer.sync_outputs( - s3_settings=sync_inputs_no_job_attachment_settings_in_job.asset_syncer.get_s3_settings( - job_attachment_test.farm_id, job_attachment_test.queue_id - ), - attachments=sync_inputs_no_job_attachment_settings_in_job.asset_syncer.get_attachments( - job_attachment_test.farm_id, - job_attachment_test.queue_id, - sync_inputs_no_job_attachment_settings_in_job.job_id, - ), - queue_id=job_attachment_test.queue_id, - job_id=sync_inputs_no_job_attachment_settings_in_job.job_id, - step_id=step_id, - task_id=task_id, - session_action_id="session_action_id", - start_time=time.time(), - session_dir=sync_inputs_no_job_attachment_settings_in_job.session_dir, - ) - - # THEN - assert ( - "No attachments configured for Job " - f"{sync_inputs_no_job_attachment_settings_in_job.job_id}, no outputs to sync." - in caplog.text - ) - - -@pytest.mark.integ -def test_sync_outputs_no_job_attachment_s3_settings( - job_attachment_test: JobAttachmentTest, - sync_inputs_no_job_attachment_s3_settings: SyncInputsNoJobAttachmentS3SettingsOutput, - caplog: LogCaptureFixture, -) -> None: - """ - Test that syncing outputs is skipped when the job has no job attachment settings. 
- """ - # IF - caplog.set_level(logging.INFO) - - waiter = job_attachment_test.deadline_client.get_waiter("job_create_complete") - waiter.wait( - jobId=sync_inputs_no_job_attachment_s3_settings.job_id, - queueId=job_attachment_test.queue_with_no_settings_id, - farmId=job_attachment_test.farm_id, - ) - - step_id = job_attachment_test.deadline_client.list_steps( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_with_no_settings_id, - jobId=sync_inputs_no_job_attachment_s3_settings.job_id, - )["steps"][0]["stepId"] - - task_id = job_attachment_test.deadline_client.list_tasks( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_with_no_settings_id, - jobId=sync_inputs_no_job_attachment_s3_settings.job_id, - stepId=step_id, - )["tasks"][0]["taskId"] - - # WHEN - sync_inputs_no_job_attachment_s3_settings.asset_syncer.sync_outputs( - s3_settings=sync_inputs_no_job_attachment_s3_settings.asset_syncer.get_s3_settings( - job_attachment_test.farm_id, job_attachment_test.queue_with_no_settings_id - ), - attachments=sync_inputs_no_job_attachment_s3_settings.asset_syncer.get_attachments( - job_attachment_test.farm_id, - job_attachment_test.queue_with_no_settings_id, - sync_inputs_no_job_attachment_s3_settings.job_id, - ), - queue_id=job_attachment_test.queue_with_no_settings_id, - job_id=sync_inputs_no_job_attachment_s3_settings.job_id, - step_id=step_id, - task_id=task_id, - session_action_id="session_action_id", - start_time=time.time(), - session_dir=sync_inputs_no_job_attachment_s3_settings.session_dir, - ) - - # THEN - assert ( - "No Job Attachment settings configured for Queue " - f"{job_attachment_test.queue_with_no_settings_id}, no outputs to sync." in caplog.text - ) - - -@dataclass -class SyncOutputsOutput: - step0_task0_id: str - step0_task1_id: str - step1_task0_id: str - step0_id: str - step1_id: str - job_id: str - attachments: Attachments - step0_task0_output_file: Path - step0_task1_output_file: Path - step1_task0_output_file: Path - - -@pytest.fixture(scope="session") -def sync_outputs( - job_attachment_test: JobAttachmentTest, - sync_inputs: SyncInputsOutputs, -) -> SyncOutputsOutput: - """ - Test that all outputs from the job get synced to the JobAttachment S3 Bucket. 
- """ - # IF - job_attachment_settings = get_queue( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - if job_attachment_settings is None: - raise Exception("Job attachment settings must be set for this test.") - - waiter = job_attachment_test.deadline_client.get_waiter("job_create_complete") - waiter.wait( - jobId=sync_inputs.job_id, - queueId=job_attachment_test.queue_id, - farmId=job_attachment_test.farm_id, - ) - - list_steps_response = job_attachment_test.deadline_client.list_steps( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_id, - jobId=sync_inputs.job_id, - ) - - step_ids = {step["name"]: step["stepId"] for step in list_steps_response["steps"]} - - step0_id = step_ids["custom-step"] - step1_id = step_ids["custom-step-2"] - - list_tasks_response = job_attachment_test.deadline_client.list_tasks( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_id, - jobId=sync_inputs.job_id, - stepId=step0_id, - ) - - task_ids = { - task["parameters"]["frame"]["int"]: task["taskId"] for task in list_tasks_response["tasks"] - } - - step0_task0_id = task_ids["0"] - step0_task1_id = task_ids["1"] - - step1_task0_id = list_tasks_response = job_attachment_test.deadline_client.list_tasks( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_id, - jobId=sync_inputs.job_id, - stepId=step1_id, - )["tasks"][0]["taskId"] - - file_to_be_synced_step0_task0_base = job_attachment_test.FIRST_RENDER_OUTPUT_PATH - file_to_be_synced_step0_task1_base = job_attachment_test.SECOND_RENDER_OUTPUT_PATH - file_to_be_synced_step1_task0_base = job_attachment_test.MOV_FILE_OUTPUT_PATH - - file_to_be_synced_step0_task0 = ( - sync_inputs.session_dir / sync_inputs.dest_dir / file_to_be_synced_step0_task0_base - ) - file_to_be_synced_step0_task1 = ( - sync_inputs.session_dir / sync_inputs.dest_dir / file_to_be_synced_step0_task1_base - ) - file_to_be_synced_step1_task0 = ( - sync_inputs.session_dir / sync_inputs.dest_dir / file_to_be_synced_step1_task0_base - ) - - render_start_time = time.time() - - # WHEN - mock_on_uploading_files = MagicMock(return_value=True) - - # First step and task - # Create files after the render start time in the output dir, these should be synced - with open(file_to_be_synced_step0_task0, "w") as f: - f.write("this is the first render") - - summary_stats = sync_inputs.asset_syncer.sync_outputs( - s3_settings=job_attachment_settings, - attachments=sync_inputs.attachments, - queue_id=job_attachment_test.queue_id, - job_id=sync_inputs.job_id, - step_id=step0_id, - task_id=step0_task0_id, - session_action_id="session_action_id", - start_time=render_start_time, - session_dir=sync_inputs.session_dir, - on_uploading_files=mock_on_uploading_files, - ) - # There should be one synced output for this task, Step 0 - Task 0 - assert summary_stats.total_files == 1 - - render_start_time = time.time() - - # First step and second task - with open(file_to_be_synced_step0_task1, "w") as f: - f.write("this is a second render") - - summary_stats = sync_inputs.asset_syncer.sync_outputs( - s3_settings=job_attachment_settings, - attachments=sync_inputs.attachments, - queue_id=job_attachment_test.queue_id, - job_id=sync_inputs.job_id, - step_id=step0_id, - task_id=step0_task1_id, - session_action_id="session_action_id", - start_time=render_start_time, - session_dir=sync_inputs.session_dir, - on_uploading_files=mock_on_uploading_files, - 
) - # There should be one synced output for this task, Step 0 - Task 1 - assert summary_stats.total_files == 1 - - render_start_time = time.time() - - # Second step and first task - with open(file_to_be_synced_step1_task0, "w") as f: - f.write("this is a comp") - - summary_stats = sync_inputs.asset_syncer.sync_outputs( - s3_settings=job_attachment_settings, - attachments=sync_inputs.attachments, - queue_id=job_attachment_test.queue_id, - job_id=sync_inputs.job_id, - step_id=step1_id, - task_id=step1_task0_id, - session_action_id="session_action_id", - start_time=render_start_time, - session_dir=sync_inputs.session_dir, - on_uploading_files=mock_on_uploading_files, - ) - # There should be one synced output for this task, Step 1 - Task 0 - assert summary_stats.total_files == 1 - - # THEN - object_summary_iterator = job_attachment_test.bucket.objects.filter( - Prefix=f"{job_attachment_settings.full_cas_prefix()}/", - ) - - object_key_set = set(obj.key for obj in object_summary_iterator) - - assert ( - f"{job_attachment_settings.full_cas_prefix()}/{hash_file(str(file_to_be_synced_step0_task0), HashAlgorithm.XXH128)}.xxh128" - in object_key_set - ) - - return SyncOutputsOutput( - step0_id=step0_id, - step1_id=step1_id, - step0_task0_id=step0_task0_id, - step0_task1_id=step0_task1_id, - step1_task0_id=step1_task0_id, - job_id=sync_inputs.job_id, - attachments=sync_inputs.attachments, - step0_task0_output_file=file_to_be_synced_step0_task0_base, - step0_task1_output_file=file_to_be_synced_step0_task1_base, - step1_task0_output_file=file_to_be_synced_step1_task0_base, - ) - - -@pytest.mark.integ -@pytest.mark.skipif( - is_windows_non_admin(), - reason="Windows requires Admin to create symlinks, skipping this test.", -) -def test_sync_outputs_with_symlink( - job_attachment_test: JobAttachmentTest, - sync_inputs: SyncInputsOutputs, - tmp_path, -) -> None: - """ - Test that a symlink pointing to a file outside the session directory is not synced as output. - """ - job_attachment_settings = get_queue( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - if job_attachment_settings is None: - raise Exception("Job attachment settings must be set for this test.") - - waiter = job_attachment_test.deadline_client.get_waiter("job_create_complete") - waiter.wait( - jobId=sync_inputs.job_id, - queueId=job_attachment_test.queue_id, - farmId=job_attachment_test.farm_id, - ) - - # Get the Step ID - list_steps_response = job_attachment_test.deadline_client.list_steps( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_id, - jobId=sync_inputs.job_id, - ) - step_ids = {step["name"]: step["stepId"] for step in list_steps_response["steps"]} - step0_id = step_ids["custom-step"] - - # Get the Task ID - list_tasks_response = job_attachment_test.deadline_client.list_tasks( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_id, - jobId=sync_inputs.job_id, - stepId=step0_id, - ) - task_ids = { - task["parameters"]["frame"]["int"]: task["taskId"] for task in list_tasks_response["tasks"] - } - step0_task0_id = task_ids["0"] - - # Create a symlink, in the output directory, pointing to a file located outside the session directory. 
- symlink_output = Path("outputs/symlink") - symlink_path = sync_inputs.session_dir / sync_inputs.dest_dir / symlink_output - tmp_dir = tmp_path / "tmp_dir" - tmp_dir.mkdir() - symlink_target_path = tmp_dir / "symlink_target" - symlink_target_path.write_text( - "this is a symlink target, located outside the session directory" - ) - symlink_path.symlink_to(symlink_target_path) - assert symlink_path.is_symlink() - - mock_on_uploading_files = MagicMock(return_value=True) - - summary_stats = sync_inputs.asset_syncer.sync_outputs( - s3_settings=job_attachment_settings, - attachments=sync_inputs.attachments, - queue_id=job_attachment_test.queue_id, - job_id=sync_inputs.job_id, - step_id=step0_id, - task_id=step0_task0_id, - session_action_id="session_action_id", - start_time=time.time(), - session_dir=sync_inputs.session_dir, - on_uploading_files=mock_on_uploading_files, - ) - # The symlink should not be synced as output. - assert summary_stats.total_files == 0 - - -@pytest.mark.integ -def test_sync_inputs_with_step_dependencies( - job_attachment_test: JobAttachmentTest, - tmp_path_factory: TempPathFactory, - sync_outputs: SyncOutputsOutput, -): - """ - Test that sync_inputs() syncs the inputs specified in job settings, and the outputs from other steps - specified in step dependencies. - """ - # IF - job_attachment_settings = get_queue( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - list_steps_response = job_attachment_test.deadline_client.list_steps( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_id, - jobId=sync_outputs.job_id, - ) - step_ids = {step["name"]: step["stepId"] for step in list_steps_response["steps"]} - step0_id = step_ids["custom-step"] - - session_dir = tmp_path_factory.mktemp("session_dir") - - # WHEN - syncer = asset_sync.AssetSync(job_attachment_test.farm_id) - - def on_downloading_files(*args, **kwargs): - return True - - syncer.sync_inputs( - job_attachment_settings, - sync_outputs.attachments, - job_attachment_test.queue_id, - sync_outputs.job_id, - session_dir, - step_dependencies=[step0_id], - on_downloading_files=on_downloading_files, - ) - - dest_dir = _get_unique_dest_dir_name(str(job_attachment_test.ASSET_ROOT)) - - # THEN - # Check if the inputs specified in job settings were downloaded - assert Path(session_dir / dest_dir / job_attachment_test.SCENE_MA_PATH).exists() - assert Path(session_dir / dest_dir / job_attachment_test.BRICK_PNG_PATH).exists() - assert Path(session_dir / dest_dir / job_attachment_test.CLOTH_PNG_PATH).exists() - assert Path(session_dir / dest_dir / job_attachment_test.INPUT_IN_OUTPUT_DIR_PATH).exists() - # Check if the outputs from step0_id ("custom-step") were downloaded - assert Path(session_dir / dest_dir / job_attachment_test.FIRST_RENDER_OUTPUT_PATH).exists() - assert Path(session_dir / dest_dir / job_attachment_test.SECOND_RENDER_OUTPUT_PATH).exists() - # Check if the outputs from the other step ("custom-step-1") were not downloaded - assert not Path(session_dir / dest_dir / job_attachment_test.MOV_FILE_OUTPUT_PATH).exists() - - -@pytest.mark.integ -def test_download_outputs_with_job_id_step_id_task_id_and_download_directory( - job_attachment_test: JobAttachmentTest, tmp_path: Path, sync_outputs: SyncOutputsOutput -): - """ - Test that outputs for a task are downloaded to the correct location locally - """ - # GIVEN - job_attachment_settings = get_queue( - 
farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - if job_attachment_settings is None: - raise TypeError("Job attachment settings must be set for this test.") - - # WHEN - try: - job_output_downloader = download.OutputDownloader( - s3_settings=job_attachment_settings, - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - job_id=sync_outputs.job_id, - step_id=sync_outputs.step0_id, - task_id=sync_outputs.step0_task0_id, - ) - job_output_downloader.download_job_output() - - # THEN - assert Path(job_attachment_test.ASSET_ROOT / sync_outputs.step0_task0_output_file).exists() - finally: - _cleanup_outputs_dir(job_attachment_test) - - -@pytest.mark.integ -def test_download_outputs_with_job_id_step_id_and_download_directory( - job_attachment_test: JobAttachmentTest, tmp_path: Path, sync_outputs: SyncOutputsOutput -): - """ - Test that outputs for a step are downloaded to the correct location locally - """ - # GIVEN - job_attachment_settings = get_queue( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - if job_attachment_settings is None: - raise TypeError("Job attachment settings must be set for this test.") - - # WHEN - try: - job_output_downloader = download.OutputDownloader( - s3_settings=job_attachment_settings, - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - job_id=sync_outputs.job_id, - step_id=sync_outputs.step0_id, - task_id=None, - ) - job_output_downloader.download_job_output() - - # THEN - assert Path(job_attachment_test.ASSET_ROOT / sync_outputs.step0_task0_output_file).exists() - assert Path(job_attachment_test.ASSET_ROOT / sync_outputs.step0_task1_output_file).exists() - finally: - _cleanup_outputs_dir(job_attachment_test) - - -@pytest.mark.integ -def test_download_outputs_with_job_id_and_download_directory( - job_attachment_test: JobAttachmentTest, tmp_path: Path, sync_outputs: SyncOutputsOutput -): - """ - Test that outputs for a job are downloaded to the correct location locally - """ - # GIVEN - job_attachment_settings = get_queue( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - if job_attachment_settings is None: - raise TypeError("Job attachment settings must be set for this test.") - - # WHEN - try: - job_output_downloader = download.OutputDownloader( - s3_settings=job_attachment_settings, - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - job_id=sync_outputs.job_id, - step_id=None, - task_id=None, - ) - job_output_downloader.download_job_output() - - # THEN - assert Path(job_attachment_test.ASSET_ROOT / sync_outputs.step0_task0_output_file).exists() - assert Path(job_attachment_test.ASSET_ROOT / sync_outputs.step0_task1_output_file).exists() - assert Path(job_attachment_test.ASSET_ROOT / sync_outputs.step1_task0_output_file).exists() - finally: - _cleanup_outputs_dir(job_attachment_test) - - -def _cleanup_outputs_dir(job_attachment_test: JobAttachmentTest) -> None: - shutil.rmtree(job_attachment_test.OUTPUT_PATH) - # Revive the INPUT_IN_OUTPUT_DIR_PATH file. 
- job_attachment_test.OUTPUT_PATH.mkdir(parents=True, exist_ok=True) - with open(job_attachment_test.INPUT_IN_OUTPUT_DIR_PATH, "w") as f: - f.write( - "Although it is in the output directory, it is actually an input file. It should be" - " downloaded (to the worker's session working directory) during sync_inputs, and" - " should not be captured as an output file when sync_outputs." - ) - - -@dataclass -class UploadInputFilesNoInputPathsOutput: - attachments: Attachments - - -@pytest.fixture(scope="session") -def upload_input_files_no_input_paths( - job_attachment_test: JobAttachmentTest, -) -> UploadInputFilesNoInputPathsOutput: - """ - Test that the created job settings object doesn't have the requiredAssets field when there are no input files. - """ - # IF - - job_attachment_settings = get_queue( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - if job_attachment_settings is None: - raise TypeError("Job attachment settings must be set for this test.") - - asset_manager = upload.S3AssetManager( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - job_attachment_settings=job_attachment_settings, - asset_manifest_version=job_attachment_test.manifest_version, - ) - - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_files = MagicMock(return_value=True) - - # WHEN - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=[], - output_paths=[str(job_attachment_test.OUTPUT_PATH)], - referenced_paths=[], - ) - (_, manifests) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=str(job_attachment_test.hash_cache_dir), - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - (_, attachments) = asset_manager.upload_assets( - manifests, - on_uploading_assets=mock_on_uploading_files, - s3_check_cache_dir=str(job_attachment_test.s3_cache_dir), - ) - - # THEN - mock_host_path_format_name = PathFormat.get_host_path_format_string() - assert attachments.manifests == [ - ManifestProperties( - rootPath=str(job_attachment_test.OUTPUT_PATH), - rootPathFormat=PathFormat(mock_host_path_format_name), - outputRelativeDirectories=["."], - ) - ] - - return UploadInputFilesNoInputPathsOutput(attachments=attachments) - - -@pytest.mark.integ -def test_upload_input_files_no_download_paths(job_attachment_test: JobAttachmentTest) -> None: - """ - Test that if there are no output directories, when upload_assets is called, - then the resulting attachments object has no output directories in it. 
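# A short, hedged outline of the OutputDownloader usage exercised by the
# test_download_outputs_* tests above: scope by job, step, or task, optionally
# remap the manifest root, then download. API names are used exactly as in the
# tests; the settings, IDs, and paths below are hypothetical placeholders and
# a real farm/queue/job is required for the download to succeed.
from deadline.job_attachments import download
from deadline.job_attachments.models import JobAttachmentS3Settings

settings = JobAttachmentS3Settings(
    s3BucketName="example-bucket",  # hypothetical
    rootPrefix="DeadlineCloud",     # hypothetical
)
downloader = download.OutputDownloader(
    s3_settings=settings,
    farm_id="farm-0000",    # hypothetical
    queue_id="queue-0000",  # hypothetical
    job_id="job-0000",      # hypothetical
    step_id=None,           # None widens the scope from one step to the whole job
    task_id=None,           # None widens the scope from one task to the whole step
)
# Optionally redirect outputs recorded under one root to a local directory,
# as test_download_outputs_no_outputs_dir does with set_root_path().
downloader.set_root_path("/original/asset/root", "/local/download/root")  # hypothetical paths
downloader.download_job_output()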
- """ - # IF - - job_attachment_settings = get_queue( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - if job_attachment_settings is None: - raise TypeError("Job attachment settings must be set for this test.") - - asset_manager = upload.S3AssetManager( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - job_attachment_settings=job_attachment_settings, - asset_manifest_version=job_attachment_test.manifest_version, - ) - - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_files = MagicMock(return_value=True) - - # WHEN - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=[str(job_attachment_test.SCENE_MA_PATH)], - output_paths=[], - referenced_paths=[], - ) - (_, manifests) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=str(job_attachment_test.hash_cache_dir), - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - (_, attachments) = asset_manager.upload_assets( - manifests, - on_uploading_assets=mock_on_uploading_files, - s3_check_cache_dir=str(job_attachment_test.s3_cache_dir), - ) - - # THEN - if manifests[0].asset_manifest is None: - raise TypeError("Asset manifest must be set for this test.") - - mock_host_path_format_name = PathFormat.get_host_path_format_string() - asset_root_hash = hash_data(str(job_attachment_test.INPUT_PATH).encode(), HashAlgorithm.XXH128) - manifest_hash = hash_data( - bytes(manifests[0].asset_manifest.encode(), "utf-8"), HashAlgorithm.XXH128 - ) - - assert len(attachments.manifests) == 1 - assert attachments.manifests[0].fileSystemLocationName is None - assert attachments.manifests[0].rootPath == str(job_attachment_test.INPUT_PATH) - assert attachments.manifests[0].rootPathFormat == PathFormat(mock_host_path_format_name) - assert attachments.manifests[0].outputRelativeDirectories == [] - assert attachments.manifests[0].inputManifestPath is not None - assert attachments.manifests[0].inputManifestPath.startswith( - f"{job_attachment_test.farm_id}/{job_attachment_test.queue_id}/Inputs/" - ) - assert attachments.manifests[0].inputManifestPath.endswith(f"/{asset_root_hash}_input") - assert attachments.manifests[0].inputManifestHash == manifest_hash - - -@pytest.mark.integ -def test_sync_inputs_no_inputs( - job_attachment_test: JobAttachmentTest, - upload_input_files_no_input_paths: UploadInputFilesNoInputPathsOutput, - tmp_path: Path, - default_job_template_one_task_one_step: str, -) -> None: - """ - Test that all of the input files get synced locally. 
- """ - # IF - job_attachment_settings = get_queue( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - job_response = job_attachment_test.deadline_client.create_job( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_id, - attachments=upload_input_files_no_input_paths.attachments.to_dict(), # type: ignore - targetTaskRunStatus="SUSPENDED", - template=default_job_template_one_task_one_step, - templateType="JSON", - priority=50, - ) - - syncer = asset_sync.AssetSync(job_attachment_test.farm_id) - session_dir = tmp_path / "session_dir" - session_dir.mkdir() - - def on_downloading_files(*args, **kwargs): - return True - - # WHEN - syncer.sync_inputs( - job_attachment_settings, - upload_input_files_no_input_paths.attachments, - job_attachment_test.queue_id, - job_response["jobId"], - session_dir, - on_downloading_files=on_downloading_files, - ) - - # THEN - assert not any(Path(session_dir).iterdir()) - - -@pytest.mark.cross_account -@pytest.mark.integ -def test_upload_bucket_wrong_account(external_bucket: str, job_attachment_test: JobAttachmentTest): - """ - Test that if trying to upload to a bucket that isn't in the farm's AWS account, the correct error is thrown. - """ - # IF - job_attachment_settings = JobAttachmentS3Settings( - s3BucketName=external_bucket, - rootPrefix=job_attachment_test.bucket_root_prefix, - ) - - asset_manager = upload.S3AssetManager( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - job_attachment_settings=job_attachment_settings, - asset_manifest_version=job_attachment_test.manifest_version, - ) - - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_files = MagicMock(return_value=True) - - # WHEN - with pytest.raises( - # Note: This error is raised in this case when the s3:PutObject operation is denied - # due to the ExpectedBucketOwner check on our s3 operation. If the bucket is in the expected - # account, then the error is a different access denied error. - JobAttachmentsS3ClientError, - match=".*when calling the PutObject operation: Access Denied", - ): - # The attempt to upload the asset manifest should be blocked. - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=[str(job_attachment_test.SCENE_MA_PATH)], - output_paths=[str(job_attachment_test.OUTPUT_PATH)], - referenced_paths=[], - ) - (_, manifests) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=str(job_attachment_test.hash_cache_dir), - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - asset_manager.upload_assets( - manifests, - on_uploading_assets=mock_on_uploading_files, - s3_check_cache_dir=str(job_attachment_test.s3_cache_dir), - ) - - -@pytest.mark.cross_account -@pytest.mark.integ -def test_sync_inputs_bucket_wrong_account( - external_bucket: str, - job_attachment_test: JobAttachmentTest, - upload_input_files_one_asset_in_cas: UploadInputFilesOneAssetInCasOutputs, - default_job_template: str, - tmp_path_factory: TempPathFactory, -): - """ - Test that if trying to sync inputs to a bucket that isn't in the farm's AWS account, the correct error is thrown. 
- """ - # IF - job_attachment_settings = JobAttachmentS3Settings( - s3BucketName=external_bucket, - rootPrefix=job_attachment_test.bucket_root_prefix, - ) - - job_response = job_attachment_test.deadline_client.create_job( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_id, - attachments=upload_input_files_one_asset_in_cas.attachments.to_dict(), # type: ignore - targetTaskRunStatus="SUSPENDED", - template=default_job_template, - templateType="JSON", - priority=50, - ) - - syncer = asset_sync.AssetSync(job_attachment_test.farm_id) - session_dir = tmp_path_factory.mktemp("session_dir") - - def on_downloading_files(*args, **kwargs): - return True - - # WHEN - with pytest.raises( - JobAttachmentsS3ClientError, - match=f"Error downloading binary file in bucket '{external_bucket}'", - ): - syncer.sync_inputs( - job_attachment_settings, - upload_input_files_one_asset_in_cas.attachments, - job_attachment_test.queue_id, - job_response["jobId"], - session_dir, - on_downloading_files=on_downloading_files, - ) - - -@pytest.mark.cross_account -@pytest.mark.integ -def test_sync_outputs_bucket_wrong_account( - job_attachment_test: JobAttachmentTest, - sync_inputs: SyncInputsOutputs, - external_bucket: str, -) -> None: - """ - Test that if trying to sync outputs to a bucket that isn't in the farm's AWS account, the correct error is thrown. - This is ensuring that the S3 file upload is passing the ExpectedBucketOwner property and verifying that the returned - error is what we expect when using that property (rather than just plain not having access to the bucket). - """ - # IF - job_attachment_settings = JobAttachmentS3Settings( - s3BucketName=external_bucket, - rootPrefix=job_attachment_test.bucket_root_prefix, - ) - - waiter = job_attachment_test.deadline_client.get_waiter("job_create_complete") - waiter.wait( - jobId=sync_inputs.job_id, - queueId=job_attachment_test.queue_id, - farmId=job_attachment_test.farm_id, - ) - - list_steps_response = job_attachment_test.deadline_client.list_steps( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_id, - jobId=sync_inputs.job_id, - ) - - step_ids = {step["name"]: step["stepId"] for step in list_steps_response["steps"]} - - step0_id = step_ids["custom-step"] - - list_tasks_response = job_attachment_test.deadline_client.list_tasks( - farmId=job_attachment_test.farm_id, - queueId=job_attachment_test.queue_id, - jobId=sync_inputs.job_id, - stepId=step0_id, - ) - - task_ids = { - task["parameters"]["frame"]["int"]: task["taskId"] for task in list_tasks_response["tasks"] - } - - step0_task0_id = task_ids["0"] - - Path(sync_inputs.session_dir / sync_inputs.dest_dir / "outputs").mkdir(exist_ok=True) - - file_to_be_synced_step0_task0_base = job_attachment_test.FIRST_RENDER_OUTPUT_PATH - - file_to_be_synced_step0_task0 = ( - sync_inputs.session_dir / sync_inputs.dest_dir / file_to_be_synced_step0_task0_base - ) - - render_start_time = time.time() - - # WHEN - - # First step and task - # Create files after the render start time in the output dir, these should be synced - with open(file_to_be_synced_step0_task0, "w") as f: - f.write("this is the first render") - mock_on_uploading_files = MagicMock(return_value=True) - - # WHEN - with pytest.raises( - AssetSyncError, match=f"Error checking if object exists in bucket '{external_bucket}'" - ): - sync_inputs.asset_syncer.sync_outputs( - s3_settings=job_attachment_settings, - attachments=sync_inputs.attachments, - queue_id=job_attachment_test.queue_id, - job_id=sync_inputs.job_id, 
- step_id=step0_id, - task_id=step0_task0_id, - session_action_id="session_action_id", - start_time=render_start_time, - session_dir=sync_inputs.session_dir, - on_uploading_files=mock_on_uploading_files, - ) - - -@pytest.mark.cross_account -@pytest.mark.integ -def test_download_outputs_bucket_wrong_account( - job_attachment_test: JobAttachmentTest, - tmp_path: Path, - sync_outputs: SyncOutputsOutput, - external_bucket: str, -): - """ - Test that if trying to download outputs to a bucket - that isn't in the farm's AWS account, the correct error is thrown. - """ - # GIVEN - job_attachment_settings = JobAttachmentS3Settings( - s3BucketName=external_bucket, - rootPrefix=job_attachment_test.bucket_root_prefix, - ) - - # WHEN - with pytest.raises( - JobAttachmentsS3ClientError, - match=f"Error listing bucket contents in bucket '{external_bucket}'", - ): - job_output_downloader = download.OutputDownloader( - s3_settings=job_attachment_settings, - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - job_id=sync_outputs.job_id, - step_id=sync_outputs.step0_id, - task_id=sync_outputs.step0_task0_id, - ) - job_output_downloader.download_job_output() - - -@pytest.mark.integ -def test_download_outputs_no_outputs_dir( - job_attachment_test: JobAttachmentTest, - sync_outputs: SyncOutputsOutput, -): - """ - Test that if trying to download outputs but not specify a file path - Download will be saved to the current directory. - """ - - download_path = Path( - os.path.normpath(Path("").absolute()) / sync_outputs.step0_task0_output_file - ) - - job_attachment_settings = get_queue( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - if job_attachment_settings is None: - raise Exception("Job attachment settings must be set for this test.") - - job_output_downloader = download.OutputDownloader( - s3_settings=job_attachment_settings, - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - job_id=sync_outputs.job_id, - step_id=sync_outputs.step0_id, - task_id=sync_outputs.step0_task0_id, - ) - job_output_downloader.set_root_path(str(job_attachment_test.ASSET_ROOT), "") - - # WHEN - try: - job_output_downloader.download_job_output() - # THEN - # The output file should be downloaded to the current directory - assert download_path.exists() - finally: - shutil.rmtree(download_path.parent) - - -@pytest.mark.integ -@pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for Windows file path length UNC, skipping this if os not Windows", -) -def test_download_outputs_windows_long_file_path( - job_attachment_test: JobAttachmentTest, sync_outputs: SyncOutputsOutput, tmp_path: WindowsPath -): - """ - Test that when trying to download outputs to a file path that - longer than 260 chars in Windows, the download is successful. 
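# A minimal sketch, not the package's implementation: the Windows test below
# relies on the \\?\ extended-length prefix so that files whose full path
# exceeds the classic MAX_PATH limit can still be opened by Python's file
# APIs. The constants are imported exactly as the test imports them; the
# helper itself is a hypothetical illustration.
import sys
from pathlib import Path

from deadline.job_attachments.download import WINDOWS_MAX_PATH_LENGTH
from deadline.job_attachments._utils import WINDOWS_UNC_PATH_STRING_PREFIX


def openable_path(path: str) -> Path:
    """Return a Path that Windows can resolve even past the legacy length limit."""
    if sys.platform == "win32" and len(path) > WINDOWS_MAX_PATH_LENGTH:
        # Prepend the extended-length prefix for over-long Windows paths.
        return Path(WINDOWS_UNC_PATH_STRING_PREFIX + path)
    return Path(path)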
- """ - - tmp_path_len: int = len(str(tmp_path)) - long_root_path_remaining_length: int = WINDOWS_MAX_PATH_LENGTH - tmp_path_len - 20 - long_root_path: str = os.path.join( - tmp_path, - *["path"] - * math.floor( - long_root_path_remaining_length / 5 - ), # Create a temp path that barely does not exceed the windows path limit - ) - - os.makedirs(long_root_path) - assert len(long_root_path) <= WINDOWS_MAX_PATH_LENGTH - 10 - - job_attachment_settings = get_queue( - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - deadline_endpoint_url=job_attachment_test.deadline_endpoint, - ).jobAttachmentSettings - - if job_attachment_settings is None: - raise Exception("Job attachment settings must be set for this test.") - - job_output_downloader = download.OutputDownloader( - s3_settings=job_attachment_settings, - farm_id=job_attachment_test.farm_id, - queue_id=job_attachment_test.queue_id, - job_id=sync_outputs.job_id, - step_id=sync_outputs.step0_id, - task_id=sync_outputs.step0_task0_id, - ) - job_output_downloader.set_root_path(str(job_attachment_test.ASSET_ROOT), str(long_root_path)) - - # WHEN - try: - job_output_downloader.download_job_output() - # THEN - # The output file should be downloaded to the current directory - # Prepend \\?\ when checking the file exists, otherwise Python will not find it - output_file_path = Path( - WINDOWS_UNC_PATH_STRING_PREFIX + long_root_path, sync_outputs.step0_task0_output_file - ) - assert output_file_path.exists() - assert len(str(output_file_path)) > 260, ( - f"Expected full output file path to be over the windows path length limit of {WINDOWS_MAX_PATH_LENGTH}, got {len(str(output_file_path))}" - ) - finally: - shutil.rmtree(WINDOWS_UNC_PATH_STRING_PREFIX + long_root_path) diff --git a/test/unit/deadline/job_attachments/test_file_stat_cache.py b/test/unit/deadline/job_attachments/test_file_stat_cache.py deleted file mode 100644 index 088d3f185..000000000 --- a/test/unit/deadline/job_attachments/test_file_stat_cache.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
- -import stat -from pathlib import Path -from unittest.mock import patch, MagicMock - -from deadline.job_attachments.upload import _FileStatCache - - -class TestFileStatCache: - def test_get_stat_caches_result(self, tmp_path): - """Test that stat results are cached and not called multiple times""" - cache = _FileStatCache() - test_file = tmp_path / "test.txt" - test_file.write_text("test") - - with patch.object(Path, "stat") as mock_stat: - mock_stat.return_value = MagicMock() - - # First call should invoke stat - result1 = cache._get_stat(test_file) - assert mock_stat.call_count == 1 - - # Second call should use cache - result2 = cache._get_stat(test_file) - assert mock_stat.call_count == 1 - assert result1 is result2 - - def test_get_stat_handles_missing_file(self, tmp_path): - """Test that missing files return None and are cached""" - cache = _FileStatCache() - missing_file = tmp_path / "missing.txt" - - result1 = cache._get_stat(missing_file) - result2 = cache._get_stat(missing_file) - - assert result1 is None - assert result2 is None - - def test_exists_with_existing_file(self, tmp_path): - """Test exists() returns True for existing files""" - cache = _FileStatCache() - test_file = tmp_path / "test.txt" - test_file.write_text("test") - - assert cache.exists(test_file) is True - - def test_exists_with_missing_file(self, tmp_path): - """Test exists() returns False for missing files""" - cache = _FileStatCache() - missing_file = tmp_path / "missing.txt" - - assert cache.exists(missing_file) is False - - def test_is_dir_with_directory(self, tmp_path): - """Test is_dir() returns True for directories""" - cache = _FileStatCache() - test_dir = tmp_path / "testdir" - test_dir.mkdir() - - assert cache.is_dir(test_dir) is True - - def test_is_dir_with_file(self, tmp_path): - """Test is_dir() returns False for files""" - cache = _FileStatCache() - test_file = tmp_path / "test.txt" - test_file.write_text("test") - - assert cache.is_dir(test_file) is False - - def test_is_dir_with_missing_path(self, tmp_path): - """Test is_dir() returns False for missing paths""" - cache = _FileStatCache() - missing_path = tmp_path / "missing" - - assert cache.is_dir(missing_path) is False - - def test_get_size_with_file(self, tmp_path): - """Test get_size() returns correct file size""" - cache = _FileStatCache() - test_file = tmp_path / "test.txt" - content = "test content" - test_file.write_text(content) - - size = cache.get_size(test_file) - assert size == len(content.encode()) - - def test_get_size_with_missing_file(self, tmp_path, caplog): - """Test get_size() returns 0 for missing files and emits the expected message""" - cache = _FileStatCache() - missing_file = tmp_path / "missing.txt" - - assert cache.get_size(missing_file) == 0 - assert "Skipping file in size calculation" in caplog.text - - def test_cache_reuse_across_methods(self, tmp_path): - """Test that cache is shared across different methods""" - cache = _FileStatCache() - test_file = tmp_path / "test.txt" - test_file.write_text("test") - - with patch.object(Path, "stat") as mock_stat: - mock_stat.return_value = MagicMock(st_mode=stat.S_IFREG, st_size=4) - - # Call different methods - cache.exists(test_file) - cache.is_dir(test_file) - cache.get_size(test_file) - - # Should only call stat once - assert mock_stat.call_count == 1 diff --git a/test/unit/deadline_job_attachments/__init__.py b/test/unit/deadline_job_attachments/__init__.py deleted file mode 100644 index 8d929cc86..000000000 --- a/test/unit/deadline_job_attachments/__init__.py +++ 
/dev/null @@ -1 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/test/unit/deadline_job_attachments/api/__init__.py b/test/unit/deadline_job_attachments/api/__init__.py deleted file mode 100644 index 8d929cc86..000000000 --- a/test/unit/deadline_job_attachments/api/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/test/unit/deadline_job_attachments/api/conftest.py b/test/unit/deadline_job_attachments/api/conftest.py deleted file mode 100644 index 4612fcbf6..000000000 --- a/test/unit/deadline_job_attachments/api/conftest.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -""" -Common fixtures for deadline job attachments tests. -""" - -from unittest.mock import patch -import os -import time -import pytest -import deadline -import tempfile - - -@pytest.fixture(scope="function", autouse=True) -def session_hash_db_dir_mock(): - with tempfile.TemporaryDirectory() as tmpdir_path: - # We have to use time as a seed, otherwise pytest has fixed random for reproducibility. - tmpdir_path = os.path.join(tmpdir_path, str(int(time.time()))) - with patch( - f"{deadline.__package__}.client.config.config_file.get_cache_directory", - return_value=str(tmpdir_path), - ), patch( - f"{deadline.__package__}.job_attachments.caches.CacheDB.get_default_cache_db_file_dir", - return_value=str(tmpdir_path), - ): - yield diff --git a/test/unit/deadline_job_attachments/api/test_attachment.py b/test/unit/deadline_job_attachments/api/test_attachment.py deleted file mode 100644 index 45adb8438..000000000 --- a/test/unit/deadline_job_attachments/api/test_attachment.py +++ /dev/null @@ -1,494 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
- -""" -Test the deadline.client.api functions relating to attachment -""" - -from unittest.mock import patch -from typing import Dict, List -from pathlib import Path - -import os -import pytest -import json - -import deadline - -from deadline.client import api -from deadline.job_attachments.exceptions import NonValidInputError -from deadline.job_attachments.api.attachment import ( - _attachment_download, - _attachment_upload, -) -from deadline.job_attachments.exceptions import MalformedAttachmentSettingError -from deadline.job_attachments.progress_tracker import DownloadSummaryStatistics -from deadline.job_attachments.asset_manifests import HashAlgorithm, hash_data -from deadline.job_attachments.asset_manifests.decode import decode_manifest -from deadline.job_attachments.asset_manifests.base_manifest import BaseAssetManifest -from deadline.job_attachments.api.attachment import _process_path_mapping -from deadline.job_attachments.upload import S3AssetUploader -from deadline.job_attachments.models import ( - FileConflictResolution, - JobAttachmentS3Settings, - UploadManifestInfo, - PathMappingRule, -) - -PATH_MAPPING = { - "source_path_format": "posix", - "source_path": "/local/home/test", - "destination_path": "/local/home/test/output", -} - -OPENJD_PATH_MAPPING = { - "version": "pathmapping-1.0", - "path_mapping_rules": [ - { - "source_path_format": "posix", - "source_path": "/local/home/test", - "destination_path": "/local/home/test/output", - } - ], -} -PATH_MAPPING_HASH = "4ab97c97c825551aaa963888278ef9ec" - -MOCK_MANIFEST_CASE = { - "unmapped_file_name": { - "hashAlg": "xxh128", - "manifestVersion": "2023-03-03", - "paths": [ - { - "hash": "19a71beb47d7cc2d654ac4637e680c88", - "mtime": 1720199667787520, - "path": "files/file2.txt", - "size": 14, - } - ], - "totalSize": 14, - }, - PATH_MAPPING_HASH: { - "hashAlg": "xxh128", - "manifestVersion": "2023-03-03", - "paths": [ - { - "hash": "b03f20b08a76635964ab008a10cd20a8", - "mtime": 1720199667787520, - "path": "files/file1.txt", - "size": 14, - } - ], - "totalSize": 14, - }, -} - -TEST_S3_URI = "s3://bucket/root" - - -def test_process_path_mapping(temp_assets_dir): - mapping_file_path = os.path.join(temp_assets_dir, "mapping") - with open(mapping_file_path, "w", encoding="utf8") as f: - json.dump([PATH_MAPPING], f) - - path_mappings: List[PathMappingRule] = _process_path_mapping( - mapping_file_path, [temp_assets_dir] - ) - assert len(path_mappings) == 2 - - -def test_process_openjd_path_mapping(temp_assets_dir): - mapping_file_path = os.path.join(temp_assets_dir, "mapping") - with open(mapping_file_path, "w", encoding="utf8") as f: - json.dump(OPENJD_PATH_MAPPING, f) - - path_mappings: List[PathMappingRule] = _process_path_mapping( - mapping_file_path, [temp_assets_dir] - ) - assert len(path_mappings) == 2 - - -@pytest.fixture -def session_mock(): - with patch.object(api._session, "get_boto3_session") as session_mock: - yield session_mock - - -class TestAttachmentDownload: - @pytest.fixture - def mock_download_files_from_manifests(self): - with patch( - f"{deadline.__package__}.job_attachments.api.attachment.download_files_from_manifests", - return_value=DownloadSummaryStatistics(), - ) as mock_download_files_from_manifests: - yield mock_download_files_from_manifests - - def test_download_single_to_mapped_invalid_path_mapping(self, temp_assets_dir, session_mock): - with open( - os.path.join(temp_assets_dir, PATH_MAPPING_HASH), - "w", - encoding="utf8", - ) as f: - json.dump(MOCK_MANIFEST_CASE[PATH_MAPPING_HASH], f) - - mapping_file_path 
= os.path.join(temp_assets_dir, "mapping") - with open(mapping_file_path, "w", encoding="utf8") as f: - json.dump(PATH_MAPPING, f) - - with pytest.raises( - AssertionError, - match="Path mapping rules have to be a list of dict.", - ): - _attachment_download( - manifests=[os.path.join(temp_assets_dir, PATH_MAPPING_HASH)], - s3_root_uri="s3://bucket/assetRoot", - boto3_session=session_mock, - path_mapping_rules=mapping_file_path, - ) - - @pytest.mark.parametrize( - "conflict_resolution", - [ - FileConflictResolution.CREATE_COPY, - FileConflictResolution.OVERWRITE, - FileConflictResolution.SKIP, - None, - ], - ) - def test_download_conflict_resolution( - self, - temp_assets_dir, - session_mock, - mock_download_files_from_manifests, - conflict_resolution, - ): - with open( - os.path.join(temp_assets_dir, PATH_MAPPING_HASH), - "w", - encoding="utf8", - ) as f: - json.dump(MOCK_MANIFEST_CASE[PATH_MAPPING_HASH], f) - - mapping_file_path = os.path.join(temp_assets_dir, "mapping") - with open(mapping_file_path, "w", encoding="utf8") as f: - json.dump([PATH_MAPPING], f) - - if conflict_resolution: - result = _attachment_download( - manifests=[os.path.join(temp_assets_dir, PATH_MAPPING_HASH)], - s3_root_uri="s3://bucket/assetRoot", - boto3_session=session_mock, - path_mapping_rules=mapping_file_path, - conflict_resolution=conflict_resolution, - ) - else: - result = _attachment_download( - manifests=[os.path.join(temp_assets_dir, PATH_MAPPING_HASH)], - s3_root_uri="s3://bucket/assetRoot", - boto3_session=session_mock, - path_mapping_rules=mapping_file_path, - ) - - assert isinstance(result, DownloadSummaryStatistics) - - mock_download_files_from_manifests.assert_called_once_with( - s3_bucket="bucket", - manifests_by_root={ - PATH_MAPPING["destination_path"]: decode_manifest( - json.dumps(MOCK_MANIFEST_CASE[PATH_MAPPING_HASH]) - ), - }, - cas_prefix="assetRoot/Data", - session=session_mock, - conflict_resolution=( - conflict_resolution if conflict_resolution else FileConflictResolution.CREATE_COPY - ), - ) - - @pytest.mark.parametrize("manifest_case_key", MOCK_MANIFEST_CASE.keys()) - def test_download_single_to_current( - self, - temp_assets_dir, - session_mock, - mock_download_files_from_manifests, - manifest_case_key, - ): - with open( - os.path.join(temp_assets_dir, manifest_case_key), - "w", - encoding="utf8", - ) as f: - json.dump(MOCK_MANIFEST_CASE[manifest_case_key], f) - - result = _attachment_download( - manifests=[os.path.join(temp_assets_dir, manifest_case_key)], - s3_root_uri="s3://bucket/assetRoot", - boto3_session=session_mock, - ) - - assert isinstance(result, DownloadSummaryStatistics) - - mock_download_files_from_manifests.assert_called_once_with( - s3_bucket="bucket", - manifests_by_root={ - f"{os.getcwd()}/{manifest_case_key}": decode_manifest( - json.dumps(MOCK_MANIFEST_CASE[manifest_case_key]) - ), - }, - cas_prefix="assetRoot/Data", - session=session_mock, - conflict_resolution=FileConflictResolution.CREATE_COPY, - ) - - def test_download_multiple_to_current( - self, temp_assets_dir, session_mock, mock_download_files_from_manifests - ): - expected_merged: Dict[str, BaseAssetManifest] = dict() - - for manifest_case_key in MOCK_MANIFEST_CASE.keys(): - expected_merged[f"{os.getcwd()}/{manifest_case_key}"] = decode_manifest( - json.dumps(MOCK_MANIFEST_CASE[manifest_case_key]) - ) - with open( - os.path.join(temp_assets_dir, manifest_case_key), - "w", - encoding="utf8", - ) as f: - json.dump(MOCK_MANIFEST_CASE[manifest_case_key], f) - - result = _attachment_download( - 
manifests=[os.path.join(temp_assets_dir, key) for key in MOCK_MANIFEST_CASE.keys()], - s3_root_uri="s3://bucket/assetRoot", - boto3_session=session_mock, - ) - - assert isinstance(result, DownloadSummaryStatistics) - - mock_download_files_from_manifests.assert_called_once_with( - s3_bucket="bucket", - manifests_by_root=expected_merged, - cas_prefix="assetRoot/Data", - session=session_mock, - conflict_resolution=FileConflictResolution.CREATE_COPY, - ) - - def test_download_invalid_input_manifests(self, session_mock): - with pytest.raises(NonValidInputError): - _attachment_download( - manifests=["file-not-found"], - s3_root_uri=TEST_S3_URI, - boto3_session=session_mock, - ) - - def test_download_invalid_input_path_mapping_rules(self, session_mock): - with pytest.raises(NonValidInputError): - _attachment_download( - manifests=[], - s3_root_uri=TEST_S3_URI, - boto3_session=session_mock, - path_mapping_rules="file-not-found", - ) - - def test_download_invalid_input_s3_root_uri(self, session_mock): - with pytest.raises(MalformedAttachmentSettingError): - _attachment_download( - manifests=[], - s3_root_uri="MalformedPath", - boto3_session=session_mock, - ) - - -class TestAttachmentUpload: - @pytest.fixture - def mock_upload_assets(self): - with patch.object( - S3AssetUploader, "upload_assets", return_value=("key", "data") - ) as mock_upload_assets: - yield mock_upload_assets - - def test_upload_returns_manifest_info_list(self, temp_assets_dir, session_mock): - """Test that _attachment_upload returns a list of UploadManifestInfo objects corresponding to the input manifests.""" - # Create a path mapping file with two rules - path_mapping = [ - { - "source_path_format": "posix", - "source_path": "/local/home/test", - "destination_path": "/local/home/test1/output", - }, - { - "source_path_format": "posix", - "source_path": "/local/home/test2", - "destination_path": "/local/home/test2/output", - }, - ] - path_mapping_file = os.path.join(temp_assets_dir, "path_mapping.json") - with open(path_mapping_file, "w") as f: - json.dump(path_mapping, f) - - # Create a manifest file that only has changes for the first asset root - manifest_case_key = PATH_MAPPING_HASH - file_name = f"{PATH_MAPPING_HASH}.manifest" - with open(os.path.join(temp_assets_dir, file_name), "w") as f: - json.dump(MOCK_MANIFEST_CASE[manifest_case_key], f) - - # Mock asset_uploader.upload_assets to return known values - with patch( - "deadline.job_attachments.upload.S3AssetUploader.upload_assets" - ) as mock_upload_assets: - mock_upload_assets.return_value = ("key1", "hash1") - - # Call _attachment_upload - result = _attachment_upload( - manifests=[os.path.join(temp_assets_dir, file_name)], - s3_root_uri=TEST_S3_URI, - boto3_session=session_mock, - path_mapping_rules=path_mapping_file, - ) - - # Verify the result structure - assert isinstance(result, list) - assert len(result) == 1 # We only passed one manifest - - # Verify the UploadManifestInfo object has the correct values - assert isinstance(result[0], UploadManifestInfo) - assert result[0].output_manifest_path == "key1" - assert result[0].output_manifest_hash == "hash1" - assert result[0].source_path == "/local/home/test" - - def test_upload_invalid_input_manifests(self, session_mock): - with pytest.raises(NonValidInputError): - _attachment_upload( - manifests=["file-not-found"], - s3_root_uri=TEST_S3_URI, - boto3_session=session_mock, - ) - - def test_upload_invalid_input_path_mapping_rules(self, session_mock): - with pytest.raises(NonValidInputError): - _attachment_upload( - 
manifests=[], - s3_root_uri=TEST_S3_URI, - boto3_session=session_mock, - path_mapping_rules="file-not-found", - ) - - def test_upload_invalid_input_s3_root_uri(self, temp_assets_dir, session_mock): - with pytest.raises(MalformedAttachmentSettingError): - _attachment_upload( - manifests=[], - s3_root_uri="MalformedPath", - root_dirs=[temp_assets_dir], - boto3_session=session_mock, - ) - - def test_upload_single_from_mapped(self, temp_assets_dir, session_mock, mock_upload_assets): - file_name: str = f"{PATH_MAPPING_HASH}.manifest" - with open( - os.path.join(temp_assets_dir, file_name), - "w", - encoding="utf8", - ) as f: - json.dump(MOCK_MANIFEST_CASE[PATH_MAPPING_HASH], f) - - mapping_file_path = os.path.join(temp_assets_dir, "mapping") - with open(mapping_file_path, "w", encoding="utf8") as f: - json.dump([PATH_MAPPING], f) - - _attachment_upload( - manifests=[os.path.join(temp_assets_dir, file_name)], - s3_root_uri=TEST_S3_URI, - boto3_session=session_mock, - path_mapping_rules=mapping_file_path, - upload_manifest_path="test", - ) - - mock_upload_assets.assert_called_once_with( - job_attachment_settings=JobAttachmentS3Settings.from_s3_root_uri(TEST_S3_URI), - manifest=decode_manifest(json.dumps(MOCK_MANIFEST_CASE[PATH_MAPPING_HASH])), - partial_manifest_prefix="test", - manifest_file_name=file_name, - manifest_metadata={ - "Metadata": { - "asset-root": PATH_MAPPING["source_path"], - "file-system-location-name": PATH_MAPPING["source_path_format"], - } - }, - source_root=Path(PATH_MAPPING["source_path"]), - asset_root=Path(PATH_MAPPING["destination_path"]), - s3_check_cache_dir=None, - ) - - @pytest.mark.parametrize("manifest_case_key", MOCK_MANIFEST_CASE.keys()) - def test_upload_single_map_from_root( - self, temp_assets_dir, session_mock, mock_upload_assets, manifest_case_key - ): - file_name_prefix: str = hash_data(temp_assets_dir.encode("utf-8"), HashAlgorithm.XXH128) - file_name: str = f"{file_name_prefix}_output" - - with open( - os.path.join(temp_assets_dir, file_name), - "w", - encoding="utf8", - ) as f: - json.dump(MOCK_MANIFEST_CASE[manifest_case_key], f) - - _attachment_upload( - manifests=[os.path.join(temp_assets_dir, file_name)], - s3_root_uri=TEST_S3_URI, - boto3_session=session_mock, - root_dirs=[temp_assets_dir], - upload_manifest_path="test", - ) - - mock_upload_assets.assert_called_once_with( - job_attachment_settings=JobAttachmentS3Settings.from_s3_root_uri(TEST_S3_URI), - manifest=decode_manifest(json.dumps(MOCK_MANIFEST_CASE[manifest_case_key])), - partial_manifest_prefix="test", - manifest_file_name=file_name, - manifest_metadata={"Metadata": {"asset-root": temp_assets_dir}}, - source_root=Path(temp_assets_dir), - asset_root=Path(temp_assets_dir), - s3_check_cache_dir=None, - ) - - @pytest.mark.parametrize("manifest_case_key", MOCK_MANIFEST_CASE.keys()) - def test_upload_no_mapped_root( - self, temp_assets_dir, session_mock, mock_upload_assets, manifest_case_key - ): - with open( - os.path.join(temp_assets_dir, manifest_case_key), - "w", - encoding="utf8", - ) as f: - json.dump(MOCK_MANIFEST_CASE[manifest_case_key], f) - - # Test No valid root defined for given manifest - with pytest.raises(NonValidInputError) as error: - _attachment_upload( - manifests=[os.path.join(temp_assets_dir, manifest_case_key)], - s3_root_uri="s3://bucket/assetRoot", - root_dirs=[temp_assets_dir], - boto3_session=session_mock, - ) - - assert f"No valid root defined for given manifest {manifest_case_key}" in str(error.value) - - def test_upload_no_root_dir_or_mapping(self, temp_assets_dir, 
session_mock): - with pytest.raises(NonValidInputError) as error: - _attachment_upload( - manifests=[], - s3_root_uri="s3://bucketName/rootPrefix", - boto3_session=session_mock, - ) - - assert str(error.value) == "One of path mapping rule and root dir must exist, and not both." - - def test_upload_both_root_dir_and_mapping(self, temp_assets_dir, session_mock): - with pytest.raises(NonValidInputError) as error: - _attachment_upload( - manifests=[], - path_mapping_rules="fakefilepath", - root_dirs=[temp_assets_dir], - s3_root_uri="s3://bucketName/rootPrefix", - boto3_session=session_mock, - ) - - assert str(error.value) == "One of path mapping rule and root dir must exist, and not both." diff --git a/test/unit/deadline_job_attachments/api/test_manifest_diff.py b/test/unit/deadline_job_attachments/api/test_manifest_diff.py deleted file mode 100644 index 9104c1a0f..000000000 --- a/test/unit/deadline_job_attachments/api/test_manifest_diff.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -import json -import os -from pathlib import Path -import tempfile -from typing import Optional -from deadline.job_attachments.api.manifest import _manifest_diff, _manifest_snapshot -from deadline.job_attachments.models import ManifestDiff, ManifestSnapshot -import pytest - - -TEST_FILE = "test_file" - - -class TestDiffAPI: - @pytest.fixture - def temp_dir(self): - with tempfile.TemporaryDirectory() as tmpdir_path: - yield tmpdir_path - - def _snapshot_folder_helper(self, temp_dir, root_dir) -> str: - """ - Snapshot with a folder and a single file in it. Should generate a manifest containing 1 file. - """ - - # Given snapshot folder and 1 test file - test_file_name = TEST_FILE - test_file = os.path.join(root_dir, test_file_name) - os.makedirs(os.path.dirname(test_file), exist_ok=True) - with open(test_file, "w") as f: - f.write("testing123") - - # When - manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, destination=temp_dir, name="test" - ) - - # Then - assert manifest is not None - assert manifest.manifest is not None - with open(manifest.manifest, "r") as manifest_file: - manifest_payload = json.load(manifest_file) - assert len(manifest_payload["paths"]) == 1 - assert manifest_payload["paths"][0]["path"] == test_file_name - - # Return the tested manifest. - return manifest.manifest - - def test_diff_no_change(self, temp_dir): - """ - Diff with the same folder, no new files. Should return with all empty no diff result. - """ - # Given - root_dir = os.path.join(temp_dir, "snapshot") - manifest_file = self._snapshot_folder_helper(temp_dir=temp_dir, root_dir=root_dir) - - # When - manifest_diff: ManifestDiff = _manifest_diff(root=root_dir, manifest=manifest_file) - assert len(manifest_diff.deleted) == 0 - assert len(manifest_diff.modified) == 0 - assert len(manifest_diff.new) == 0 - - def test_diff_new_files(self, temp_dir): - """ - Diff with the same folder, new files. Should return with all empty no diff result. - """ - # Given - root_dir = os.path.join(temp_dir, "snapshot") - manifest_file = self._snapshot_folder_helper(temp_dir=temp_dir, root_dir=root_dir) - - # When - # Make 2 new files, one in the snapshot dir, another in a nested dir. 
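The attachment upload and download tests removed above exercise path-mapping files in two accepted shapes: a bare JSON list of rule objects, and an Open Job Description style document with a `version` field and a `path_mapping_rules` list. The helper below is a hypothetical sketch of normalizing both shapes (it is not the removed `_process_path_mapping`, which also folds in root directories), reusing the error message the tests assert on:

```python
import json
from typing import Any, Dict, List


def load_path_mapping_rules(mapping_file: str) -> List[Dict[str, Any]]:
    """Return a flat list of path-mapping rule dicts from either supported file shape."""
    with open(mapping_file, "r", encoding="utf8") as f:
        document = json.load(f)

    if isinstance(document, dict) and "path_mapping_rules" in document:
        # OpenJD-style: {"version": "pathmapping-1.0", "path_mapping_rules": [...]}
        rules = document["path_mapping_rules"]
    else:
        # Bare-list form: [{"source_path_format": ..., "source_path": ..., "destination_path": ...}]
        rules = document

    assert isinstance(rules, list) and all(
        isinstance(rule, dict) for rule in rules
    ), "Path mapping rules have to be a list of dict."
    return rules
```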
- new_file_name = "new_file" - new_file = os.path.join(root_dir, new_file_name) - Path(new_file).touch() - - new_dir = "new_dir" - new_file2_name = "new_file2" - new_file2 = os.path.join(root_dir, new_dir, new_file2_name) - os.makedirs(os.path.dirname(new_file2), exist_ok=True) - Path(new_file2).touch() - - # Then - manifest_diff: ManifestDiff = _manifest_diff(root=root_dir, manifest=manifest_file) - assert len(manifest_diff.deleted) == 0 - assert len(manifest_diff.modified) == 0 - assert len(manifest_diff.new) == 2 - assert new_file_name in manifest_diff.new - assert f"{new_dir}/{new_file2_name}" in manifest_diff.new - - def test_diff_deleted_file(self, temp_dir): - """ - Diff with the same folder, delete the test file. It should be found by delete. - """ - # Given - root_dir = os.path.join(temp_dir, "snapshot") - manifest_file = self._snapshot_folder_helper(temp_dir=temp_dir, root_dir=root_dir) - - # When - os.remove(os.path.join(root_dir, TEST_FILE)) - manifest_diff: ManifestDiff = _manifest_diff(root=root_dir, manifest=manifest_file) - - # Then - assert len(manifest_diff.modified) == 0 - assert len(manifest_diff.new) == 0 - assert len(manifest_diff.deleted) == 1 - assert TEST_FILE in manifest_diff.deleted - - def test_diff_modified_file_size(self, temp_dir): - """ - Diff with the same folder, modified the test file. It should be found by modified. - """ - # Given - root_dir = os.path.join(temp_dir, "snapshot") - manifest_file = self._snapshot_folder_helper(temp_dir=temp_dir, root_dir=root_dir) - - # When - test_file = os.path.join(root_dir, TEST_FILE) - with open(test_file, "w") as f: - f.write("something_different") - - manifest_diff: ManifestDiff = _manifest_diff(root=root_dir, manifest=manifest_file) - - # Then - assert len(manifest_diff.new) == 0 - assert len(manifest_diff.deleted) == 0 - assert len(manifest_diff.modified) == 1 - assert TEST_FILE in manifest_diff.modified diff --git a/test/unit/deadline_job_attachments/api/test_manifest_download.py b/test/unit/deadline_job_attachments/api/test_manifest_download.py deleted file mode 100644 index b6c7172d7..000000000 --- a/test/unit/deadline_job_attachments/api/test_manifest_download.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -import tempfile -from typing import List -from unittest.mock import MagicMock, patch -import pytest - -from deadline.job_attachments.api.manifest import _manifest_download -from deadline.job_attachments.models import ( - ManifestDownloadResponse, - JobAttachmentS3Settings, -) - - -class TestManifestDownload: - @pytest.fixture - def temp_dir(self): - with tempfile.TemporaryDirectory() as tmpdir_path: - yield tmpdir_path - - @patch("deadline.job_attachments.api.manifest.get_manifest_from_s3") - @patch("deadline.job_attachments.api.manifest.get_output_manifests_by_asset_root") - @pytest.mark.parametrize( - "job_manifests,step_manifests", - [ - pytest.param([], []), - pytest.param( - [{"inputManifestPath": "s3://hello/world", "rootPath": "/some/root"}], - [], - ), - pytest.param([], [{"stepId": "step-123456"}]), - pytest.param( - [{"inputManifestPath": "s3://hello/world", "rootPath": "/some/root"}], - [{"stepId": "step-123456"}], - ), - ], - ) - def test_download_job( - self, - mock_get_output_manifest: MagicMock, - mock_get_manifest_from_s3: MagicMock, - job_manifests: List, - step_manifests: List, - temp_dir: str, - ) -> None: - # This is heavily mocked, so return nothing. Integration tests tests full manifest merging. 
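The manifest-diff tests above classify paths as new, modified, or deleted relative to an earlier snapshot manifest. The sketch below illustrates that classification under the assumption that a change in size or mtime counts as modified; it is a simplification, not the removed `_manifest_diff`:

```python
import os
from dataclasses import dataclass, field
from typing import Dict, List, Tuple


@dataclass
class DiffResult:
    new: List[str] = field(default_factory=list)
    modified: List[str] = field(default_factory=list)
    deleted: List[str] = field(default_factory=list)


def diff_against_manifest(root: str, manifest_entries: Dict[str, Tuple[int, int]]) -> DiffResult:
    """manifest_entries maps a relative posix path to its recorded (size, mtime in microseconds)."""
    result = DiffResult()
    seen = set()
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            rel_path = os.path.relpath(full_path, root).replace(os.path.sep, "/")
            seen.add(rel_path)
            stat_result = os.stat(full_path)
            current = (stat_result.st_size, int(stat_result.st_mtime * 1_000_000))
            if rel_path not in manifest_entries:
                result.new.append(rel_path)
            elif manifest_entries[rel_path] != current:
                result.modified.append(rel_path)
    result.deleted = [path for path in manifest_entries if path not in seen]
    return result
```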
- mock_get_manifest_from_s3.return_value = None - mock_get_output_manifest.return_value = {} - - # Mock Boto - mock_boto_session = MagicMock() - - # Mock Get Queue Credentials - mock_queue_session = MagicMock() - - # Mock up Deadline. - mock_deadline_client = MagicMock() - mock_boto_session.client.return_value = mock_deadline_client - - # Mock the result of get_queue - mock_deadline_client.get_queue.return_value = { - "displayName": "queue", - "jobAttachmentSettings": { - "s3BucketName": "bucket", - "rootPrefix": "root_prefix", - }, - } - - queue_s3_settings = JobAttachmentS3Settings(s3BucketName="bucket", rootPrefix="root_prefix") - - # Mock the result of get_job - mock_deadline_client.get_job.return_value = { - "name": "Mock Job", - "attachments": { - "manifests": job_manifests, - }, - } - # Mock the result of list_step_dependencies - mock_deadline_client.list_step_dependencies.return_value = {"dependencies": step_manifests} - - output: ManifestDownloadResponse = _manifest_download( - download_dir=temp_dir, - farm_id="farm-12345", - queue_id="queue-12345", - job_id="job-12345", - step_id="step-12345", - queue_s3_settings=queue_s3_settings, - deadline_client=mock_deadline_client, - queue_role_session=mock_queue_session, - ) - assert output is not None - - # list_step_dependencies should have been called once as there is no pagination - assert mock_deadline_client.list_step_dependencies.call_count == 1 - - @patch("deadline.job_attachments.api.manifest.get_manifest_from_s3") - @patch("deadline.job_attachments.api.manifest.get_output_manifests_by_asset_root") - def test_download_job_paginate_through_step_dependencies( - self, - mock_get_output_manifest: MagicMock, - mock_get_manifest_from_s3: MagicMock, - temp_dir: str, - ): - # This is heavily mocked, so return nothing. Integration tests tests full manifest merging. - mock_get_manifest_from_s3.return_value = None - mock_get_output_manifest.return_value = {} - - # Mock Boto - mock_boto_session = MagicMock() - - # Mock Get Queue Credentials - mock_queue_session = MagicMock() - - # Mock up Deadline. 
- mock_deadline_client = MagicMock() - mock_boto_session.client.return_value = mock_deadline_client - - # Mock the result of get_queue - mock_deadline_client.get_queue.return_value = { - "displayName": "queue", - "jobAttachmentSettings": { - "s3BucketName": "bucket", - "rootPrefix": "root_prefix", - }, - } - queue_s3_settings = JobAttachmentS3Settings(s3BucketName="bucket", rootPrefix="root_prefix") - # Mock the result of get_job - mock_deadline_client.get_job.return_value = { - "name": "Mock Job", - "attachments": { - "manifests": [{"inputManifestPath": "s3://hello/world", "rootPath": "/some/root"}], - }, - } - # Mock the result of list_step_dependencies, have a nextToken to make sure that our code paginates - mock_deadline_client.list_step_dependencies.side_effect = [ - { - "dependencies": [{"stepId": f"step-{i}"} for i in range(100)], - "nextToken": "abcasd", - }, - {"dependencies": [{"stepId": f"step-{i}"} for i in range(100, 150)]}, - ] - - output: ManifestDownloadResponse = _manifest_download( - download_dir=temp_dir, - farm_id="farm-12345", - queue_id="queue-12345", - job_id="job-12345", - step_id="step-12345", - queue_s3_settings=queue_s3_settings, - deadline_client=mock_deadline_client, - queue_role_session=mock_queue_session, - ) - assert output is not None - - # list_step_dependencies should have been called twice to paginate - assert mock_deadline_client.list_step_dependencies.call_count == 2 diff --git a/test/unit/deadline_job_attachments/api/test_manifest_merge.py b/test/unit/deadline_job_attachments/api/test_manifest_merge.py deleted file mode 100644 index a014d30b7..000000000 --- a/test/unit/deadline_job_attachments/api/test_manifest_merge.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
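The second download test above checks that step dependencies are collected with pagination: the first `list_step_dependencies` response carries a `nextToken`, and calls must continue until no token is returned. A generic loop of that shape is sketched below; the response keys mirror the mocked payloads above, while the request parameter names are assumptions rather than a copy of the removed code:

```python
from typing import Any, Dict, List


def list_all_step_dependencies(
    deadline_client: Any, farm_id: str, queue_id: str, job_id: str, step_id: str
) -> List[Dict[str, Any]]:
    """Collect every page of step dependencies until no nextToken is returned."""
    dependencies: List[Dict[str, Any]] = []
    next_token: str = ""
    while True:
        kwargs: Dict[str, str] = dict(
            farmId=farm_id, queueId=queue_id, jobId=job_id, stepId=step_id
        )
        if next_token:
            kwargs["nextToken"] = next_token
        response = deadline_client.list_step_dependencies(**kwargs)
        dependencies.extend(response.get("dependencies", []))
        next_token = response.get("nextToken", "")
        if not next_token:
            return dependencies
```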
- -import os -import json - -from typing import Optional - -from deadline.job_attachments.api.manifest import _manifest_merge -from deadline.job_attachments.models import ManifestMerge - - -class TestMergeAPI: - def test_merge_same_file(self, temp_dir, test_manifest_one): - """ - Merge with one manifest file - """ - # Given - merge_dir = os.path.join(temp_dir, "merge") - manifest_one = os.path.join(temp_dir, "manifest_1") - - with open(manifest_one, "w", encoding="utf8") as f: - json.dump(test_manifest_one, f) - - # When - manifest_merge: Optional[ManifestMerge] = _manifest_merge( - root=temp_dir, manifest_files=[manifest_one], destination=merge_dir, name="merge" - ) - - # Then - assert manifest_merge is not None - assert manifest_merge.manifest_root == temp_dir - assert merge_dir in manifest_merge.local_manifest_path - - def test_merge_different_files(self, temp_dir, test_manifest_one, test_manifest_two): - """ - Merge two different manifest files - """ - # Given - manifest_one = os.path.join(temp_dir, "manifest_1") - manifest_two = os.path.join(temp_dir, "manifest_2") - merge_dir = os.path.join(temp_dir, "merge") - - with open(manifest_one, "w", encoding="utf8") as f: - json.dump(test_manifest_one, f) - - with open(manifest_two, "w", encoding="utf8") as f: - json.dump(test_manifest_two, f) - - # When - manifest_merge: Optional[ManifestMerge] = _manifest_merge( - root=temp_dir, - manifest_files=[manifest_one, manifest_two], - destination=merge_dir, - name="merge", - ) - - # Then - assert manifest_merge is not None - assert manifest_merge.manifest_root == temp_dir - assert merge_dir in manifest_merge.local_manifest_path diff --git a/test/unit/deadline_job_attachments/api/test_manifest_upload.py b/test/unit/deadline_job_attachments/api/test_manifest_upload.py deleted file mode 100644 index cdaf6dcd3..000000000 --- a/test/unit/deadline_job_attachments/api/test_manifest_upload.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - - -import os -import tempfile -from unittest.mock import ANY, MagicMock, patch -import pytest - -from deadline.job_attachments.api.manifest import _manifest_upload - - -TEST_MANIFEST = '{"foo":"bar"}' -TEST_BUCKET_NAME = "s3://foobarbucket" -TEST_CAS_PREFIX = "in/a/galaxy" -TEST_KEY_PREFIX = "far/far/away" - - -class TestManifestUpload: - @pytest.fixture - def temp_dir(self): - with tempfile.TemporaryDirectory() as tmpdir_path: - yield tmpdir_path - - @pytest.fixture - def mock_manifest_file(self, temp_dir) -> str: - """ - Create a Mock manifest file saved to the temp dir. - :return path to the test file. - """ - path = os.path.join(temp_dir, "test.manifest") - with open(path, "w") as manifest_file: - manifest_file.write(TEST_MANIFEST) - return path - - @patch("deadline.job_attachments.api.manifest.S3AssetUploader") - @patch("deadline.client.api.get_boto3_session") - def test_upload( - self, - mock_get_boto3_session: MagicMock, - mock_upload_assets: MagicMock, - mock_manifest_file: str, - ) -> None: - """ - Upload is really simple. It is a pass through to S3AssetUploader. Make sure it is called correctly. - """ - # Given - mock_boto_session = MagicMock() - mock_get_boto3_session.return_value = mock_boto_session - - # When the API is called.... 
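The merge tests above feed one or more manifest files that share a root into `_manifest_merge` and expect a single merged manifest written to the destination directory. Purely as an assumption about the merge semantics (later entries winning for a duplicated path), a document-level merge could be sketched as:

```python
import json
from typing import Dict, List


def merge_manifest_documents(manifest_files: List[str]) -> dict:
    """Merge decoded manifest documents; later files win when a relative path repeats."""
    header: dict = {}
    merged_paths: Dict[str, dict] = {}
    for manifest_file in manifest_files:
        with open(manifest_file, "r", encoding="utf8") as f:
            document = json.load(f)
        # Carry over the non-path header fields (hashAlg, manifestVersion, ...).
        header = {key: value for key, value in document.items() if key != "paths"}
        for entry in document.get("paths", []):
            merged_paths[entry["path"]] = entry
    merged = dict(header)
    merged["paths"] = sorted(merged_paths.values(), key=lambda entry: entry["path"])
    merged["totalSize"] = sum(entry.get("size", 0) for entry in merged["paths"])
    return merged
```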
- _manifest_upload( - manifest_file=mock_manifest_file, - s3_bucket_name=TEST_BUCKET_NAME, - s3_cas_prefix=TEST_CAS_PREFIX, - boto_session=mock_boto_session, - ) - - # Then - mock_upload_assets.return_value.upload_bytes_to_s3.assert_called_once_with( - bytes=ANY, - bucket=TEST_BUCKET_NAME, - key=TEST_CAS_PREFIX + "/Manifests/test.manifest", - progress_handler=ANY, - extra_args=ANY, - ) - - @patch("deadline.job_attachments.api.manifest.S3AssetUploader") - @patch("deadline.client.api.get_boto3_session") - def test_upload_with_prefix( - self, - mock_get_boto3_session: MagicMock, - mock_upload_assets: MagicMock, - mock_manifest_file: str, - ) -> None: - """ - Upload is really simple. It is a pass through to S3AssetUploader. Make sure it is called correctly with prefix - """ - # Given - mock_boto_session = MagicMock() - mock_get_boto3_session.return_value = mock_boto_session - - # When the API is called.... - _manifest_upload( - manifest_file=mock_manifest_file, - s3_bucket_name=TEST_BUCKET_NAME, - s3_cas_prefix=TEST_CAS_PREFIX, - s3_key_prefix=TEST_KEY_PREFIX, - boto_session=mock_boto_session, - ) - - # Then - mock_upload_assets.return_value.upload_bytes_to_s3.assert_called_once_with( - bytes=ANY, - bucket=TEST_BUCKET_NAME, - key=TEST_CAS_PREFIX + "/Manifests/" + TEST_KEY_PREFIX + "/test.manifest", - progress_handler=ANY, - extra_args=ANY, - ) diff --git a/test/unit/deadline_job_attachments/api/test_path_summarization.py b/test/unit/deadline_job_attachments/api/test_path_summarization.py deleted file mode 100644 index 6de41718b..000000000 --- a/test/unit/deadline_job_attachments/api/test_path_summarization.py +++ /dev/null @@ -1,700 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -import os - -import pytest - -from deadline.job_attachments.api import ( - PathSummary, - human_readable_file_size, - summarize_paths_by_sequence, - summarize_paths_by_nested_directory, - summarize_path_list, -) -from deadline.job_attachments.models import PathFormat - - -PARAMETRIZE_CASES: tuple = ( - (1000000000000000000, "1000.0 PB"), - (89234597823492938, "89.23 PB"), - (1000000000000001, "1.0 PB"), - (1000000000000000, "1.0 PB"), - (999999999999999, "1.0 PB"), - (999995000000000, "1.0 PB"), - (999994000000000, "999.99 TB"), - (8934587945678, "8.93 TB"), - (1000000000001, "1.0 TB"), - (1000000000000, "1.0 TB"), - (999999999999, "1.0 TB"), - (999995000000, "1.0 TB"), - (999994000000, "999.99 GB"), - (83748237582, "83.75 GB"), - (1000000001, "1.0 GB"), - (1000000000, "1.0 GB"), - (999999999, "1.0 GB"), - (999995000, "1.0 GB"), - (999994000, "999.99 MB"), - (500229150, "500.23 MB"), - (1000001, "1.0 MB"), - (1000000, "1.0 MB"), - (999999, "1.0 MB"), - (999995, "1.0 MB"), - (999994, "999.99 KB"), - (96771, "96.77 KB"), - (1001, "1.0 KB"), - (1000, "1.0 KB"), - (999, "999 B"), - (934, "934 B"), - (0, "0 B"), -) - - -@pytest.mark.parametrize( - ("file_size", "expected_output"), - PARAMETRIZE_CASES, -) -def test_human_readable_file_size(file_size: int, expected_output: str): - """ - Test that given a file size in bytes, the expected human readable file size is output. 
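The size cases above pin down a decimal (power-of-1000) formatter: two decimal places, half-up rounding, and promotion to the next unit once the rounded value reaches 1000. The sketch below reproduces those cases, but it is one reading of the table rather than the deleted implementation:

```python
from decimal import Decimal, ROUND_HALF_UP


def human_readable_file_size(size_in_bytes: int) -> str:
    """Format a byte count with decimal units, two decimals, and half-up rounding."""
    units = ["KB", "MB", "GB", "TB", "PB"]
    if size_in_bytes < 1000:
        return f"{size_in_bytes} B"
    value = Decimal(size_in_bytes)
    for unit in units:
        value = value / 1000
        rounded = value.quantize(Decimal("0.01"), rounding=ROUND_HALF_UP)
        if rounded < 1000 or unit == units[-1]:
            # float() drops trailing zeros, so Decimal("1.00") renders as "1.0".
            return f"{float(rounded)} {unit}"
    raise AssertionError("unreachable: the loop always returns at the last unit")
```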
- """ - assert human_readable_file_size(file_size) == expected_output - - -@pytest.mark.parametrize( - ("path_list", "expected_output"), - [ - ( - ["frame_1.png", "frame_3.png", "frame_20.png", "readme.txt"], - [PathSummary("frame_%d.png", index_set={1, 3, 20}), PathSummary("readme.txt")], - ), - ( - ["frame_01.png", "frame_1.png", "frame_30.png", "frame_09.png"], - [PathSummary("frame_%02d.png", index_set={1, 9, 30}), PathSummary("frame_1.png")], - ), - (["00", "02", "03", "07", "10"], [PathSummary("%02d", index_set={0, 2, 3, 7, 10})]), - (["0", "5", "99", "207"], [PathSummary("%d", index_set={0, 5, 99, 207})]), - ( - [ - "dataset_2.tar.gz", - "dataset_821.tar.gz", - "dataset_1.tar.gz", - "dataset_3.tar.gz", - "dataset_12345.tar.gz", - "dataset_23.tar.gz", - ], - [PathSummary("dataset_%d.tar.gz", index_set={1, 2, 3, 23, 821, 12345})], - ), - ( - ["frame_1.png", "frame_20.png", "7", "12", "000", "100", "789", "1000"], - [ - PathSummary("%03d", index_set={0, 100, 789, 1000}), - PathSummary("12"), - PathSummary("7"), - PathSummary("frame_1.png"), - PathSummary("frame_20.png"), - ], - ), - ], -) -def test_summarize_paths_without_nesting_without_sizes(path_list, expected_output): - """Given each list of paths, confirm the expected output.""" - assert summarize_paths_by_sequence(path_list) == expected_output - # Since there's no nesting in these cases, this is equivalent - assert summarize_paths_by_nested_directory(path_list) == expected_output - - -PARAMETRIZE_CASES = ( - ( - ["frame_1.png", "frame_3.png", "frame_20.png", "readme.txt"], - {"frame_1.png": 1, "frame_3.png": 2, "frame_20.png": 4, "readme.txt": 8}, - [ - PathSummary("frame_%d.png", index_set={1, 3, 20}, total_size=7), - PathSummary("readme.txt", total_size=8), - ], - ), - ( - ["frame_01.png", "frame_1.png", "frame_30.png", "frame_09.png"], - {"frame_01.png": 1, "frame_1.png": 2, "frame_30.png": 4, "frame_09.png": 8}, - [ - PathSummary("frame_%02d.png", index_set={1, 9, 30}, total_size=13), - PathSummary("frame_1.png", total_size=2), - ], - ), - ( - ["00", "02", "03", "07", "10"], - {"00": 1, "02": 2, "03": 4, "07": 8, "10": 16}, - [PathSummary("%02d", index_set={0, 2, 3, 7, 10}, total_size=31)], - ), - ( - ["0", "5", "99", "207"], - {"0": 1, "5": 2, "99": 4, "207": 8}, - [PathSummary("%d", index_set={0, 5, 99, 207}, total_size=15)], - ), - ( - [ - "dataset_2.tar.gz", - "dataset_821.tar.gz", - "dataset_1.tar.gz", - "dataset_3.tar.gz", - "dataset_12345.tar.gz", - "dataset_23.tar.gz", - ], - { - "dataset_2.tar.gz": 1, - "dataset_821.tar.gz": 2, - "dataset_1.tar.gz": 4, - "dataset_3.tar.gz": 8, - "dataset_12345.tar.gz": 16, - "dataset_23.tar.gz": 32, - }, - [PathSummary("dataset_%d.tar.gz", index_set={1, 2, 3, 23, 821, 12345}, total_size=63)], - ), - ( - ["frame_1.png", "frame_20.png", "7", "12", "000", "100", "789", "1000"], - { - "frame_1.png": 1, - "frame_20.png": 2, - "7": 4, - "12": 8, - "000": 16, - "100": 32, - "789": 64, - "1000": 128, - }, - [ - PathSummary("%03d", index_set={0, 100, 789, 1000}, total_size=16 + 32 + 64 + 128), - PathSummary("12", total_size=8), - PathSummary("7", total_size=4), - PathSummary("frame_1.png", total_size=1), - PathSummary("frame_20.png", total_size=2), - ], - ), -) - - -@pytest.mark.parametrize( - ("path_list", "sizes", "expected_output"), - PARAMETRIZE_CASES, -) -def test_summarize_paths_without_nesting_with_sizes(path_list, sizes, expected_output): - """Given each list of paths, confirm the expected output.""" - assert summarize_paths_by_sequence(path_list, total_size_by_path=sizes) == 
expected_output - # Since there's no nesting in these cases, this is equivalent - assert ( - summarize_paths_by_nested_directory(path_list, total_size_by_path=sizes) == expected_output - ) - - -PARAMETRIZE_CASES = ( - ( - [], - [], - [], - ), - ( - ["a/b/c/frame_1.png", "a/b/c/frame_3.png", "a/b/c/frame_20.png", "a/b/d/readme.txt"], - [ - PathSummary("a/b/c/frame_%d.png".replace("/", os.path.sep), index_set={1, 3, 20}), - PathSummary("a/b/d/readme.txt".replace("/", os.path.sep)), - ], - [ - PathSummary( - "a/b/".replace("/", os.path.sep), - file_count=4, - children={ - "c": PathSummary( - "a/b/c/".replace("/", os.path.sep), - file_count=3, - children={ - "frame_%d.png": PathSummary( - "a/b/c/frame_%d.png".replace("/", os.path.sep), - index_set={1, 3, 20}, - ) - }, - ), - "d": PathSummary( - "a/b/d/".replace("/", os.path.sep), - file_count=1, - children={ - "readme.txt": PathSummary("a/b/d/readme.txt".replace("/", os.path.sep)) - }, - ), - }, - ) - ], - ), - ( - ["seq/frame_01.png", "frame_1.png", "seq/frame_30.png", "seq/frame_09.png"], - [ - PathSummary("frame_1.png"), - PathSummary("seq/frame_%02d.png".replace("/", os.path.sep), index_set={1, 9, 30}), - ], - [ - PathSummary("frame_1.png"), - PathSummary("seq/frame_%02d.png".replace("/", os.path.sep), index_set={1, 9, 30}), - ], - ), - ( - ["/abc/def/ghi/00", "/abc/xyz/02", "/abc/def/jkl/mno/03", "/www/07", "/abc/def/10"], - [ - PathSummary("/abc/def/10".replace("/", os.path.sep)), - PathSummary("/abc/def/ghi/00".replace("/", os.path.sep)), - PathSummary("/abc/def/jkl/mno/03".replace("/", os.path.sep)), - PathSummary("/abc/xyz/02".replace("/", os.path.sep)), - PathSummary("/www/07".replace("/", os.path.sep)), - ], - [ - PathSummary( - os.path.sep, - file_count=5, - children={ - "abc": PathSummary( - "/abc/".replace("/", os.path.sep), - file_count=4, - children={ - "def": PathSummary( - "/abc/def/".replace("/", os.path.sep), - file_count=3, - children={ - "ghi": PathSummary( - "/abc/def/ghi/".replace("/", os.path.sep), - file_count=1, - children={ - "00": PathSummary( - "/abc/def/ghi/00".replace("/", os.path.sep) - ) - }, - ), - "jkl": PathSummary( - "/abc/def/jkl/".replace("/", os.path.sep), - file_count=1, - children={ - "mno": PathSummary( - "/abc/def/jkl/mno/".replace("/", os.path.sep), - file_count=1, - children={ - "03": PathSummary( - "/abc/def/jkl/mno/03".replace( - "/", os.path.sep - ) - ) - }, - ) - }, - ), - "10": PathSummary("/abc/def/10".replace("/", os.path.sep)), - }, - ), - "xyz": PathSummary( - "/abc/xyz/".replace("/", os.path.sep), - file_count=1, - children={ - "02": PathSummary("/abc/xyz/02".replace("/", os.path.sep)) - }, - ), - }, - ), - "www": PathSummary( - "/www/".replace("/", os.path.sep), - file_count=1, - children={"07": PathSummary("/www/07".replace("/", os.path.sep))}, - ), - }, - ) - ], - ), -) - - -@pytest.mark.parametrize( - ("path_list", "expected_seq_output", "expected_nest_output"), - PARAMETRIZE_CASES, -) -def test_summarize_paths_with_nesting_without_sizes( - path_list, expected_seq_output, expected_nest_output -): - """Given each list of paths, confirm the expected output.""" - assert summarize_paths_by_sequence(path_list) == expected_seq_output - assert summarize_paths_by_nested_directory(path_list) == expected_nest_output - - -PARAMETRIZE_CASES = ( - ([], {}, [], []), - ( - # Repeated paths will be deduplicated - ["a/b/c", "a/b/c", "a/b/c", "a/b/c"], - {"a/b/c": 1}, - [PathSummary("a/b/c".replace("/", os.path.sep), total_size=1)], - [PathSummary("a/b/c".replace("/", os.path.sep), total_size=1)], - 
), - ( - ["a/b/c/frame_1.png", "a/b/c/frame_3.png", "a/b/c/frame_20.png", "a/b/d/readme.txt"], - { - "a/b/c/frame_1.png": 1, - "a/b/c/frame_3.png": 2, - "a/b/c/frame_20.png": 4, - "a/b/d/readme.txt": 8, - }, - [ - PathSummary( - "a/b/c/frame_%d.png".replace("/", os.path.sep), index_set={1, 3, 20}, total_size=7 - ), - PathSummary("a/b/d/readme.txt".replace("/", os.path.sep), total_size=8), - ], - [ - PathSummary( - "a/b/".replace("/", os.path.sep), - file_count=4, - total_size=15, - children={ - "c": PathSummary( - "a/b/c/".replace("/", os.path.sep), - file_count=3, - total_size=7, - children={ - "frame_%d.png": PathSummary( - "a/b/c/frame_%d.png".replace("/", os.path.sep), - index_set={1, 3, 20}, - total_size=7, - ) - }, - ), - "d": PathSummary( - "a/b/d/".replace("/", os.path.sep), - file_count=1, - total_size=8, - children={ - "readme.txt": PathSummary( - "a/b/d/readme.txt".replace("/", os.path.sep), total_size=8 - ) - }, - ), - }, - ) - ], - ), - ( - ["seq/frame_01.png", "frame_1.png", "seq/frame_30.png", "seq/frame_09.png"], - {"seq/frame_01.png": 1, "frame_1.png": 2, "seq/frame_30.png": 4, "seq/frame_09.png": 8}, - [ - PathSummary("frame_1.png", total_size=2), - PathSummary( - "seq/frame_%02d.png".replace("/", os.path.sep), index_set={1, 9, 30}, total_size=13 - ), - ], - [ - PathSummary("frame_1.png", total_size=2), - PathSummary( - "seq/frame_%02d.png".replace("/", os.path.sep), index_set={1, 9, 30}, total_size=13 - ), - ], - ), - ( - ["/abc/def/ghi/00", "/abc/xyz/02", "/abc/def/jkl/mno/03", "/www/07", "/abc/def/10"], - { - "/abc/def/ghi/00": 1, - "/abc/xyz/02": 2, - "/abc/def/jkl/mno/03": 4, - "/www/07": 8, - "/abc/def/10": 16, - }, - [ - PathSummary("/abc/def/10".replace("/", os.path.sep), total_size=16), - PathSummary("/abc/def/ghi/00".replace("/", os.path.sep), total_size=1), - PathSummary("/abc/def/jkl/mno/03".replace("/", os.path.sep), total_size=4), - PathSummary("/abc/xyz/02".replace("/", os.path.sep), total_size=2), - PathSummary("/www/07".replace("/", os.path.sep), total_size=8), - ], - [ - PathSummary( - os.path.sep, - file_count=5, - total_size=31, - children={ - "abc": PathSummary( - "/abc/".replace("/", os.path.sep), - file_count=4, - total_size=23, - children={ - "def": PathSummary( - "/abc/def/".replace("/", os.path.sep), - file_count=3, - total_size=21, - children={ - "ghi": PathSummary( - "/abc/def/ghi/".replace("/", os.path.sep), - file_count=1, - total_size=1, - children={ - "00": PathSummary( - "/abc/def/ghi/00".replace("/", os.path.sep), - total_size=1, - ) - }, - ), - "jkl": PathSummary( - "/abc/def/jkl/".replace("/", os.path.sep), - file_count=1, - total_size=4, - children={ - "mno": PathSummary( - "/abc/def/jkl/mno/".replace("/", os.path.sep), - file_count=1, - total_size=4, - children={ - "03": PathSummary( - "/abc/def/jkl/mno/03".replace( - "/", os.path.sep - ), - total_size=4, - ) - }, - ) - }, - ), - "10": PathSummary( - "/abc/def/10".replace("/", os.path.sep), total_size=16 - ), - }, - ), - "xyz": PathSummary( - "/abc/xyz/".replace("/", os.path.sep), - file_count=1, - total_size=2, - children={ - "02": PathSummary( - "/abc/xyz/02".replace("/", os.path.sep), total_size=2 - ) - }, - ), - }, - ), - "www": PathSummary( - "/www/".replace("/", os.path.sep), - file_count=1, - total_size=8, - children={ - "07": PathSummary("/www/07".replace("/", os.path.sep), total_size=8) - }, - ), - }, - ) - ], - ), -) - - -@pytest.mark.parametrize( - ("path_list", "sizes", "expected_seq_output", "expected_nest_output"), PARAMETRIZE_CASES -) -def 
test_summarize_paths_with_nesting_with_sizes( - path_list, sizes, expected_seq_output, expected_nest_output -): - """Given each list of paths, confirm the expected output.""" - assert summarize_paths_by_sequence(path_list, total_size_by_path=sizes) == expected_seq_output - assert ( - summarize_paths_by_nested_directory(path_list, total_size_by_path=sizes) - == expected_nest_output - ) - - -PATH_LIST_DATASET_SEQ_TAR_GZ = [ - "dataset_2.tar.gz", - "dataset_821.tar.gz", - "dataset_1.tar.gz", - "dataset_3.tar.gz", - "dataset_12345.tar.gz", - "dataset_23.tar.gz", -] -PATH_LIST_DATASET_SEQ_TAR_GZ_SIZES = { - "dataset_2.tar.gz": 1, - "dataset_821.tar.gz": 2, - "dataset_1.tar.gz": 4, - "dataset_3.tar.gz": 8, - "dataset_12345.tar.gz": 16, - "dataset_23.tar.gz": 32, -} - -PATH_LIST_UNNUMBERED_FILES = [ - "file1.tar.gz", - "file2.tar.gz", - "sword.txt", - "stone.png", - "imagination.md", -] -PATH_LIST_UNNUMBERED_FILES_SIZES = { - "file1.tar.gz": 1, - "file2.tar.gz": 2, - "sword.txt": 4, - "stone.png": 8, - "imagination.md": 16, -} - -PATH_LIST_NESTED_FILES = [ - "seq/file1.tar.gz", - "seq/file2.tar.gz", - "seq/file3.tar.gz", - "doc/sword.txt", - "doc/images/stone.png", - "doc/imagination.md", - "README.md", -] -PATH_LIST_NESTED_FILES_SIZES = { - "seq/file1.tar.gz": 1, - "seq/file2.tar.gz": 2, - "seq/file3.tar.gz": 4, - "doc/sword.txt": 8, - "doc/images/stone.png": 16, - "doc/imagination.md": 32, - "README.md": 64, -} - -PARAMETRIZE_CASES = ( - ( - [], - dict(), - "", - ), - ( - PATH_LIST_DATASET_SEQ_TAR_GZ, - dict(), - "dataset_%d.tar.gz (6 files, sequence 1-3,23,821,12345)\n", - ), - ( - PATH_LIST_DATASET_SEQ_TAR_GZ, - dict(include_totals=False), - "dataset_%d.tar.gz (sequence indexes 1-3,23,821,12345)\n", - ), - ( - PATH_LIST_DATASET_SEQ_TAR_GZ, - dict(total_size_by_path=PATH_LIST_DATASET_SEQ_TAR_GZ_SIZES), - "dataset_%d.tar.gz (6 files, 63 B, sequence 1-3,23,821,12345)\n", - ), - ( - PATH_LIST_DATASET_SEQ_TAR_GZ, - dict(total_size_by_path=PATH_LIST_DATASET_SEQ_TAR_GZ_SIZES, include_totals=False), - "dataset_%d.tar.gz (sequence indexes 1-3,23,821,12345)\n", - ), - ( - PATH_LIST_UNNUMBERED_FILES, - dict(), - "file1.tar.gz (1 file)\nfile2.tar.gz (1 file)\nimagination.md (1 file)\nstone.png (1 file)\nsword.txt (1 file)\n", - ), - ( - PATH_LIST_UNNUMBERED_FILES, - dict(max_entries=5), - "file1.tar.gz (1 file)\nfile2.tar.gz (1 file)\nimagination.md (1 file)\nstone.png (1 file)\nsword.txt (1 file)\n", - ), - ( - PATH_LIST_UNNUMBERED_FILES, - dict( - max_entries=4 - ), # Special case, this shows 5 entries because the 5th line "..." would be less info than showing the actual file - "file1.tar.gz (1 file)\nfile2.tar.gz (1 file)\nimagination.md (1 file)\nstone.png (1 file)\nsword.txt (1 file)\n", - ), - ( - PATH_LIST_UNNUMBERED_FILES, - dict(include_totals=False), - "file1.tar.gz\nfile2.tar.gz\nimagination.md\nstone.png\nsword.txt\n", - ), - ( - PATH_LIST_UNNUMBERED_FILES, - dict(include_totals=False, max_entries=3), - "file1.tar.gz\nfile2.tar.gz\nimagination.md\n... and 2 more\n", - ), - ( - PATH_LIST_UNNUMBERED_FILES, - dict(total_size_by_path=PATH_LIST_UNNUMBERED_FILES_SIZES), - "imagination.md (1 file, 16 B)\nstone.png (1 file, 8 B)\nsword.txt (1 file, 4 B)\nfile2.tar.gz (1 file, 2 B)\nfile1.tar.gz (1 file, 1 B)\n", - ), - ( - PATH_LIST_UNNUMBERED_FILES, - dict(total_size_by_path=PATH_LIST_UNNUMBERED_FILES_SIZES, max_entries=3), - "imagination.md (1 file, 16 B)\nstone.png (1 file, 8 B)\nsword.txt (1 file, 4 B)\n... 
and 2 more (2 files, 3 B)\n", - ), - ( - PATH_LIST_UNNUMBERED_FILES, - dict(total_size_by_path=PATH_LIST_UNNUMBERED_FILES_SIZES, include_totals=False), - "imagination.md\nstone.png\nsword.txt\nfile2.tar.gz\nfile1.tar.gz\n", - ), - ( - PATH_LIST_UNNUMBERED_FILES, - dict( - total_size_by_path=PATH_LIST_UNNUMBERED_FILES_SIZES, include_totals=False, max_entries=3 - ), - "imagination.md\nstone.png\nsword.txt\n... and 2 more\n", - ), - ( - PATH_LIST_NESTED_FILES, - dict(), - "doc/ (3 files):\n images/ (1 file)\n imagination.md (1 file)\n sword.txt (1 file)\nseq/file%d.tar.gz (3 files, sequence 1-3)\nREADME.md (1 file)\n", - ), - ( - PATH_LIST_NESTED_FILES, - dict(max_entries=2), - "doc/ (3 files):\n images/ (1 file)\n ... and 2 more\n... and 2 more (4 files)\n".replace( - "/", os.path.sep - ), - ), - ( - PATH_LIST_NESTED_FILES, - dict(max_entries=2, total_size_by_path=PATH_LIST_NESTED_FILES_SIZES), - "README.md (1 file, 64 B)\n... and 2 more (6 files, 63 B)\n".replace("/", os.path.sep), - ), - ( - PATH_LIST_NESTED_FILES, - dict(max_entries=5, total_size_by_path=PATH_LIST_NESTED_FILES_SIZES), - "README.md (1 file, 64 B)\ndoc/ (3 files, 56 B):\n imagination.md (1 file, 32 B)\n images/ (1 file, 16 B)\n sword.txt (1 file, 8 B)\nseq/file%d.tar.gz (3 files, 7 B, sequence 1-3)\n".replace( - "/", os.path.sep - ), - ), - ( - PATH_LIST_NESTED_FILES, - dict(max_entries=5, total_size_by_path=PATH_LIST_NESTED_FILES_SIZES, include_totals=False), - "README.md\ndoc/:\n imagination.md\n images/\n sword.txt\nseq/file%d.tar.gz (sequence indexes 1-3)\n".replace( - "/", os.path.sep - ), - ), - ( - PATH_LIST_NESTED_FILES, - dict(max_entries=5, include_totals=False), - "doc/:\n images/\n imagination.md\n sword.txt\nseq/file%d.tar.gz (sequence indexes 1-3)\nREADME.md\n".replace( - "/", os.path.sep - ), - ), -) - - -@pytest.mark.parametrize(("path_list", "kwargs", "expected_output"), PARAMETRIZE_CASES) -def test_summarize_path_list(path_list, kwargs, expected_output): - if "total_size_by_path" in kwargs: - kwargs["total_size_by_path"] = { - path.replace("/", os.path.sep): size - for path, size in kwargs["total_size_by_path"].items() - } - assert summarize_path_list( - [path.replace("/", os.path.sep) for path in path_list], **kwargs - ) == expected_output.replace("/", os.path.sep) - - -PARAMETRIZE_CASES = ( - ( - PATH_LIST_NESTED_FILES, - dict(path_format=PathFormat.POSIX), - "doc/ (3 files):\n images/ (1 file)\n imagination.md (1 file)\n sword.txt (1 file)\nseq/file%d.tar.gz (3 files, sequence 1-3)\nREADME.md (1 file)\n", - ), - ( - PATH_LIST_NESTED_FILES, - dict(path_format=PathFormat.WINDOWS), - "doc\\ (3 files):\n images\\ (1 file)\n imagination.md (1 file)\n sword.txt (1 file)\nseq\\file%d.tar.gz (3 files, sequence 1-3)\nREADME.md (1 file)\n", - ), - ( - [path.replace("/", "\\") for path in PATH_LIST_NESTED_FILES], - dict(path_format=PathFormat.WINDOWS), - "doc\\ (3 files):\n images\\ (1 file)\n imagination.md (1 file)\n sword.txt (1 file)\nseq\\file%d.tar.gz (3 files, sequence 1-3)\nREADME.md (1 file)\n", - ), -) - - -@pytest.mark.parametrize(("path_list", "kwargs", "expected_output"), PARAMETRIZE_CASES) -def test_summarize_path_list_with_path_format(path_list, kwargs, expected_output): - assert summarize_path_list(path_list, **kwargs) == expected_output diff --git a/test/unit/deadline_job_attachments/api/test_snapshot.py b/test/unit/deadline_job_attachments/api/test_snapshot.py deleted file mode 100644 index 645c72f51..000000000 --- a/test/unit/deadline_job_attachments/api/test_snapshot.py +++ /dev/null @@ -1,509 
+0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -import json -import os -from pathlib import Path -import tempfile -from typing import List, Optional, Set -from deadline.job_attachments.api.manifest import _manifest_snapshot -from deadline.job_attachments.models import ManifestSnapshot -from deadline.job_attachments._utils import _retry -import pytest - - -class TestSnapshotAPI: - @pytest.fixture - def temp_dir(self): - with tempfile.TemporaryDirectory() as tmpdir_path: - yield tmpdir_path - - def get_manifest_files(self, manifest_path) -> Set[str]: - """Helper method to extract file paths from a manifest""" - with open(manifest_path, "r") as manifest_file: - manifest_payload = json.load(manifest_file) - return {item["path"] for item in manifest_payload["paths"]} - - def test_snapshot_empty_folder(self, temp_dir): - """ - Snapshot with an invalid folder. Should find nothing and no manifest. - """ - - # Given foobar folder - root_dir = os.path.join(temp_dir, "foobar") - os.makedirs(root_dir) - - # When - manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, destination=temp_dir, name="test" - ) - - # Then - assert manifest is None - - def test_snapshot_folder(self, temp_dir): - """ - Snapshot with a folder and a single file in it. Should generate a manifest containing 1 file. - """ - - # Given snapshot folder and 1 test file - root_dir = os.path.join(temp_dir, "snapshot") - - test_file_name = "test_file" - test_file = os.path.join(root_dir, test_file_name) - os.makedirs(os.path.dirname(test_file), exist_ok=True) - with open(test_file, "w") as f: - f.write("testing123") - - # When - manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, destination=temp_dir, name="test" - ) - - # Then - assert manifest is not None - assert manifest.root is not None - assert manifest.root == root_dir - assert manifest.manifest is not None - with open(manifest.manifest, "r") as manifest_file: - manifest_payload = json.load(manifest_file) - assert len(manifest_payload["paths"]) == 1 - assert manifest_payload["paths"][0]["path"] == test_file_name - - def test_snapshot_recursive_folder(self, temp_dir): - """ - Snapshot with a folder a file, a nested folder and a file in the nested folder. 
- """ - - # Given snapshot folder and 1 test file - root_dir = os.path.join(temp_dir, "snapshot") - - test_file_name = "test_file" - test_file = os.path.join(root_dir, test_file_name) - os.makedirs(os.path.dirname(test_file), exist_ok=True) - with open(test_file, "w") as f: - f.write("testing123") - - nested_test_file_name = "nested_file" - nested_folder = "nested" - nested_test_file = os.path.join(root_dir, nested_folder, nested_test_file_name) - os.makedirs(os.path.dirname(nested_test_file), exist_ok=True) - with open(nested_test_file, "w") as f: - f.write("testing123") - - # When - manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, destination=temp_dir, name="test" - ) - - # Then - assert manifest is not None - assert manifest.root is not None - assert manifest.root == root_dir - assert manifest.manifest is not None - with open(manifest.manifest, "r") as manifest_file: - manifest_payload = json.load(manifest_file) - assert len(manifest_payload["paths"]) == 2 - files = set() - for item in manifest_payload["paths"]: - files.add(item["path"]) - - assert test_file_name in files - assert f"{nested_folder}/{nested_test_file_name}" in files - - @pytest.mark.parametrize( - "includes, excludes, results", - [ - pytest.param( - ["test_file", "**/nested_file"], None, ["test_file", "nested/nested_file"] - ), - pytest.param( - ["nested/**"], None, ["nested/excluded_nested_file", "nested/nested_file"] - ), - pytest.param( - None, - ["excluded_test_file", "**/excluded_nested_file"], - ["test_file", "nested/nested_file"], - ), - pytest.param( - ["test_file"], ["excluded_test_file", "**/excluded_nested_file"], ["test_file"] - ), - pytest.param( - ["**/nested_file"], - ["excluded_test_file", "**/excluded_nested_file"], - ["nested/nested_file"], - ), - ], - ) - def test_snapshot_includes_excludes( - self, temp_dir, includes: List[str], excludes: List[str], results: List[str] - ): - """ - Snapshot with a folder a file, a nested folder and a file in the nested folder. - Include glob includes "test_file", "nested_file". - Should not pick up "excluded". 
- """ - - # Given snapshot folder and 1 test file - root_dir = os.path.join(temp_dir, "snapshot") - - test_file_name = "test_file" - test_file = os.path.join(root_dir, test_file_name) - os.makedirs(os.path.dirname(test_file), exist_ok=True) - with open(test_file, "w") as f: - f.write("testing123") - - excluded_test_file_name = "excluded_test_file" - excluded_test_file = os.path.join(root_dir, excluded_test_file_name) - with open(excluded_test_file, "w") as f: - f.write("testing123") - - nested_test_file_name = "nested_file" - nested_folder = "nested" - nested_test_file = os.path.join(root_dir, nested_folder, nested_test_file_name) - os.makedirs(os.path.dirname(nested_test_file), exist_ok=True) - with open(nested_test_file, "w") as f: - f.write("testing123") - - nested_excluded_test_file_name = "excluded_nested_file" - nested_excluded_test_file = os.path.join( - root_dir, nested_folder, nested_excluded_test_file_name - ) - with open(nested_excluded_test_file, "w") as f: - f.write("testing123") - - # When - manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, - destination=temp_dir, - name="test", - include=includes, - exclude=excludes, - ) - - # Then - assert manifest is not None - assert manifest.root is not None - assert manifest.root == root_dir - assert manifest.manifest is not None - with open(manifest.manifest, "r") as manifest_file: - manifest_payload = json.load(manifest_file) - assert len(manifest_payload["paths"]) == len(results) - files = set() - for item in manifest_payload["paths"]: - files.add(item["path"]) - - for result in results: - assert result in files - - def test_snapshot_diff(self, temp_dir): - """ - Create a snapshot with 1 file. Add a second file and make a diff manifest. - Only the second file should be found. - """ - # Given snapshot folder and 1 test file - root_dir = os.path.join(temp_dir, "snapshot") - - test_file_name = "test_file" - test_file = os.path.join(root_dir, test_file_name) - os.makedirs(os.path.dirname(test_file), exist_ok=True) - with open(test_file, "w") as f: - f.write("testing123") - - # When - manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, destination=temp_dir, name="test" - ) - - # Then - assert manifest is not None - assert manifest.root is not None - assert manifest.manifest is not None - - # Given a second new file. - second_test_file_name = "second_file" - second_test_file = os.path.join(root_dir, second_test_file_name) - os.makedirs(os.path.dirname(second_test_file), exist_ok=True) - with open(second_test_file, "w") as f: - f.write("second123") - - # When snapshot again. - diffed_manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, destination=temp_dir, name="test", diff=manifest.manifest - ) - - # Then. We should find only the second file. - assert diffed_manifest is not None - assert diffed_manifest.manifest is not None - with open(diffed_manifest.manifest, "r") as diff_manifest_file: - manifest_payload = json.load(diff_manifest_file) - assert len(manifest_payload["paths"]) == 1 - files = set() - for item in manifest_payload["paths"]: - files.add(item["path"]) - - assert second_test_file_name in files - - def test_snapshot_time_diff(self, temp_dir): - """ - Create a snapshot with 1 file. Change the time stamp of the file. - The diff manifest should contain the file again. 
- """ - - # Given snapshot folder and 1 test file - root_dir = os.path.join(temp_dir, "snapshot") - - test_file_name = "test_file" - test_file = os.path.join(root_dir, test_file_name) - os.makedirs(os.path.dirname(test_file), exist_ok=True) - with open(test_file, "w") as f: - f.write("testing123") - - # When - manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, destination=temp_dir, name="test" - ) - - # Then - assert manifest is not None - assert manifest.root is not None - assert manifest.manifest is not None - - # Given the file's timestamp is updated. - os.utime(test_file, (1234567890, 1234567890)) - - # When snapshot again. - diffed_manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, destination=temp_dir, name="test", diff=manifest.manifest - ) - - # Then. We should find the file again. - assert diffed_manifest is not None - assert diffed_manifest.manifest is not None - with open(diffed_manifest.manifest, "r") as diff_manifest_file: - manifest_payload = json.load(diff_manifest_file) - assert len(manifest_payload["paths"]) == 1 - files = set() - for item in manifest_payload["paths"]: - files.add(item["path"]) - - assert test_file_name in files - - def test_snapshot_size_diff(self, temp_dir): - """ - Create a snapshot with 1 file. Change the contents of the file. - The diff manifest should contain the file again. - """ - - # Given snapshot folder and 1 test file - root_dir = os.path.join(temp_dir, "snapshot") - - test_file_name = "test_file" - test_file = os.path.join(root_dir, test_file_name) - os.makedirs(os.path.dirname(test_file), exist_ok=True) - with open(test_file, "w") as f: - f.write("testing123") - - # When - manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, destination=temp_dir, name="test" - ) - - # Then - assert manifest is not None - assert manifest.root is not None - assert manifest.manifest is not None - - # Given the file's contents is updated. - with open(test_file, "w") as f: - f.write("testing123testing123testing123") - - # When snapshot again. - diffed_manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, destination=temp_dir, name="test", diff=manifest.manifest - ) - - # Then. We should find the file again. - assert diffed_manifest is not None - assert diffed_manifest.manifest is not None - with open(diffed_manifest.manifest, "r") as diff_manifest_file: - manifest_payload = json.load(diff_manifest_file) - assert len(manifest_payload["paths"]) == 1 - files = set() - for item in manifest_payload["paths"]: - files.add(item["path"]) - - assert test_file_name in files - - def test_snapshot_diff_no_diff(self, temp_dir): - """ - Create a snapshot with 1 file. Snapshot again and diff. It should have no manifest. - """ - # Given snapshot folder and 1 test file - root_dir = os.path.join(temp_dir, "snapshot") - - test_file_name = "test_file" - test_file = os.path.join(root_dir, test_file_name) - os.makedirs(os.path.dirname(test_file), exist_ok=True) - with open(test_file, "w") as f: - f.write("testing123") - - # When - manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, destination=temp_dir, name="test" - ) - - # Then - assert manifest is not None - assert manifest.root is not None - assert manifest.manifest is not None - - # When snapshot again. - diffed_manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, destination=temp_dir, name="test", diff=manifest.manifest - ) - - # Then. 
We should find no new manifest, there were no files to snapshot - assert diffed_manifest is None - - @_retry( - tries=2, delay=0.1, backoff=0.1 - ) # os.utime may take time for the file system to stablize. - def test_snapshot_diff_no_diff_modified_mtime(self, temp_dir): - """ - Create a snapshot with 1 file. Modify the mtime of the snapshot to simulate the attachment download operation. - Snapshot again and diff. It should have no manifest. - """ - # Given snapshot folder and 1 test file - root_dir = os.path.join(temp_dir, "snapshot") - - test_file_name = "test_file" - test_file = os.path.join(root_dir, test_file_name) - os.makedirs(os.path.dirname(test_file), exist_ok=True) - with open(test_file, "w") as f: - f.write("testing123") - - # When - manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, destination=temp_dir, name="test" - ) - - # Then - assert manifest is not None - assert manifest.root is not None - assert manifest.manifest is not None - - with open(manifest.manifest, "r") as manifest_file: - manifest_payload = json.load(manifest_file) - assert len(manifest_payload["paths"]) == 1 - modified_time_override = manifest_payload["paths"][0]["mtime"] / 1000000 - - # When simulate the file timestamp override from downloaded asset - os.utime(test_file, (modified_time_override, modified_time_override)) - assert Path(test_file).stat().st_mtime == modified_time_override - - # When snapshot again. - diffed_manifest: Optional[ManifestSnapshot] = _manifest_snapshot( - root=root_dir, destination=temp_dir, name="test", diff=manifest.manifest - ) - # Then. We should find no new manifest, there were no files to snapshot - assert diffed_manifest is None - - @pytest.mark.parametrize( - "test_case,initial_include,initial_exclude,diff_include,diff_exclude,modified_files,new_files,expected_diff_files", - [ - # Test case 1: Include filter with same filter in diff - ( - "include_filter", - None, # initial include - None, # initial exclude - ["subdir1/**"], # diff include - None, # diff exclude - ["subdir1/file1.txt"], # files to modify - [ - ("subdir1/file2.txt", "new txt"), - ("subdir1/file1.txt", "new dat"), - ], # new files to add - {"subdir1/file1.txt", "subdir1/file2.txt"}, # expected files in diff - ), - # Test case 2: Exclude filter with same filter in diff - ( - "exclude_filter", - None, # initial include - ["*.dat"], # initial exclude - None, # diff include - ["*.dat"], # diff exclude - ["file1.txt", "file1.dat"], # files to modify - [], # no new files - {"file1.txt"}, # expected files in diff (dat file excluded) - ), - ], - ) - def test_diff_with_includes_excludes( - self, - temp_dir, - test_case, - initial_include, - initial_exclude, - diff_include, - diff_exclude, - modified_files, - new_files, - expected_diff_files, - ): - """ - Parametrized test for different filter scenarios with diff: - 1. Create initial snapshot with specified include/exclude filters - 2. Modify specified files and add new files - 3. Create diff snapshot with specified include/exclude filters - 4. 
Verify the diff contains the expected files - """ - # Setup test directory - root_dir = os.path.join(temp_dir, "snapshot") - os.makedirs(root_dir, exist_ok=True) - - # Create initial files - subdir1 = os.path.join(root_dir, "subdir1") - subdir2 = os.path.join(root_dir, "subdir2") - os.makedirs(subdir1) - os.makedirs(subdir2) - Path(os.path.join(subdir1, "file1.txt")).touch() - Path(os.path.join(subdir2, "file2.txt")).touch() - - # Create initial snapshot with specified filters - initial_manifest = _manifest_snapshot( - root=root_dir, - destination=temp_dir, - name=f"initial_{test_case}", - include=initial_include, - exclude=initial_exclude, - ) - - assert initial_manifest is not None - initial_paths = self.get_manifest_files(initial_manifest.manifest) - assert len(initial_paths) == 2 - - # Modify specified files - for filename in modified_files: - with open(os.path.join(root_dir, filename), "w") as f: - f.write(f"modified {filename}") - - # Add new files - for filename, content in new_files: - with open(os.path.join(root_dir, filename), "w") as f: - f.write(content) - - # Create diff snapshot with specified filters - diff_manifest = _manifest_snapshot( - root=root_dir, - destination=temp_dir, - name=f"diff_{test_case}", - include=diff_include, - exclude=diff_exclude, - diff=initial_manifest.manifest, - ) - - assert diff_manifest is not None - diff_files = self.get_manifest_files(diff_manifest.manifest) - - # Verify diff manifest contains expected files - assert diff_files == expected_diff_files diff --git a/test/unit/deadline_job_attachments/api/test_utils.py b/test/unit/deadline_job_attachments/api/test_utils.py deleted file mode 100644 index 4dfc159db..000000000 --- a/test/unit/deadline_job_attachments/api/test_utils.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
- -import os -import json -from typing import Dict -import pytest -from unittest.mock import patch - -from deadline.job_attachments.exceptions import NonValidInputError -from deadline.job_attachments.asset_manifests.base_manifest import BaseAssetManifest -from deadline.job_attachments.api._utils import _read_manifests - - -class TestReadManifests: - def test_valid_manifests(self, temp_dir, test_manifest_one): - """Test valid manifest file for read - - Args: - temp_dir: a temporary directory - test_manifest_one: test manifest - """ - - # Given - manifest_file_name = "manifest_1" - file_path = os.path.join(temp_dir, manifest_file_name) - - with open(file_path, "w", encoding="utf8") as f: - json.dump(test_manifest_one, f) - - # When - result: Dict[str, BaseAssetManifest] = _read_manifests([file_path]) - - # Then - assert len(result) == 1 - assert result.get(manifest_file_name) is not None - - manifest = result.get(manifest_file_name) - assert isinstance(manifest, BaseAssetManifest) - assert len(manifest.paths) == 3 - - def test_invalid_file_path(self): - """ - Test with non-existent file - """ - - with patch("os.path.isfile", return_value=False): - with pytest.raises(NonValidInputError) as exc_info: - _read_manifests(["/path/to/nonexistent.json"]) - - assert "not valid" in str(exc_info.value) - - def test_empty_manifest_list(self): - """ - Test with empty input - """ - - # When - result = _read_manifests([]) - - # Then - assert isinstance(result, dict) - assert len(result) == 0 diff --git a/test/unit/deadline_job_attachments/asset_manifests/__init__.py b/test/unit/deadline_job_attachments/asset_manifests/__init__.py deleted file mode 100644 index 8d929cc86..000000000 --- a/test/unit/deadline_job_attachments/asset_manifests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/test/unit/deadline_job_attachments/asset_manifests/test_decode.py b/test/unit/deadline_job_attachments/asset_manifests/test_decode.py deleted file mode 100644 index 1823b838f..000000000 --- a/test/unit/deadline_job_attachments/asset_manifests/test_decode.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""Tests for the asset_manifests.decode module""" - -from __future__ import annotations -from enum import Enum - -import json -from dataclasses import dataclass -import re -from typing import Any -from unittest.mock import patch - -import pytest - -import deadline -from deadline.job_attachments.asset_manifests import decode, versions, HashAlgorithm -from deadline.job_attachments.asset_manifests.v2023_03_03 import ( - AssetManifest as AssetManifest_v2023_03_03, -) -from deadline.job_attachments.asset_manifests.v2023_03_03 import ManifestPath as Path_v2023_03_03 -from deadline.job_attachments.exceptions import ManifestDecodeValidationError - - -@dataclass -class ManifestParam: - manifest_str: str - manifest_version: versions.ManifestVersion - - -@pytest.fixture -def manifest_params(default_manifest_str_v2023_03_03: str) -> list[ManifestParam]: - return [ - ManifestParam(default_manifest_str_v2023_03_03, versions.ManifestVersion.v2023_03_03), - ] - - -def test_validate_manifest(manifest_params: list[ManifestParam]): - """ - Test the a valid manifest is correctly validated. 
- """ - for manifest_param in manifest_params: - manifest: dict[str, Any] = json.loads(manifest_param.manifest_str) - assert decode.validate_manifest(manifest, manifest_param.manifest_version) == (True, None) - - -def test_validate_manifest_manifest_not_valid_manifest(manifest_params: list[ManifestParam]): - """ - Test that a manifest is returned as not valid with an expected error string if the manifest isn't valid - """ - for manifest_param in manifest_params: - manifest: dict[str, Any] = json.loads(manifest_param.manifest_str) - del manifest["hashAlg"] - valid, error_str = decode.validate_manifest(manifest, manifest_param.manifest_version) - assert not valid - assert error_str is not None - assert error_str.startswith("manifest is missing required field(s) ['hashAlg']") - - -def test_decode_manifest_v2023_03_03(default_manifest_str_v2023_03_03: str): - """ - Test that a v2023-03-03 manifest string decodes to an AssetManifest object as expected. - """ - expected_manifest = AssetManifest_v2023_03_03( - hash_alg=HashAlgorithm.XXH128, - total_size=10, - paths=[ - Path_v2023_03_03(path="\r", hash="CarriageReturn", size=1, mtime=1679079744833848), - Path_v2023_03_03(path="1", hash="One", size=1, mtime=1679079344833868), - Path_v2023_03_03(path="another_test_file", hash="c", size=1, mtime=1675079344833848), - Path_v2023_03_03(path="test_dir/test_file", hash="b", size=1, mtime=1479079344833848), - Path_v2023_03_03(path="test_file", hash="a", size=1, mtime=167907934333848), - Path_v2023_03_03(path="\u0080", hash="Control", size=1, mtime=1679079344833348), - Path_v2023_03_03( - path="\u00c3\u00b1", hash="UserTestCase", size=1, mtime=1679579344833848 - ), - Path_v2023_03_03( - path="ö", hash="LatinSmallLetterOWithDiaeresis", size=1, mtime=1679079344833848 - ), - Path_v2023_03_03(path="€", hash="EuroSign", size=1, mtime=1679079344836848), - Path_v2023_03_03(path="😀", hash="EmojiGrinningFace", size=1, mtime=1679579344833848), - Path_v2023_03_03(path="\ude0a", hash="EmojiTestCase", size=1, mtime=1679579344833848), - Path_v2023_03_03( - path="דּ", hash="HebrewLetterDaletWithDagesh", size=1, mtime=1679039344833848 - ), - ], - ) - assert decode.decode_manifest(default_manifest_str_v2023_03_03) == expected_manifest - - -def test_decode_manifest_version_not_supported(): - """ - Test that a ManifestDecodeValidationError is raised if the manifest passed has a version that isn't valid. - """ - with pytest.raises( - ManifestDecodeValidationError, - match=re.escape( - "Unknown manifest version: 1900-06-06 (Currently supported Manifest versions: 2023-03-03)" - ), - ): - decode.decode_manifest('{"manifestVersion": "1900-06-06"}') - - -def test_decode_manifest_version_not_supported_when_multiple_versions_are_supported(): - """ - Test that a ManifestDecodeValidationError is raised with a descriptive error message if the manifest passed - has a version that isn't valid. In this test, the ManifestVersion class is mocked to simulate having multple - supported manifest versions. 
- """ - - class MockManifestVersion(str, Enum): - UNDEFINED = "UNDEFINED" - v2023_03_03 = "2023-03-03" - v2024_04_03 = "2024-04-03" - v2025_05_03 = "2025-05-03" - - with patch( - f"{deadline.__package__}.job_attachments.asset_manifests.decode.ManifestVersion", - new=MockManifestVersion, - ): - with pytest.raises( - ManifestDecodeValidationError, - match=re.escape( - "Unknown manifest version: 1900-06-06 " - "(Currently supported Manifest versions: 2023-03-03, 2024-04-03, 2025-05-03)" - ), - ): - decode.decode_manifest('{"manifestVersion": "1900-06-06"}') - - -def test_decode_manifest_not_valid_manifest(): - """ - Test that a ManifestDecodeValidationError is raised if the manifest passed in is not valid. - """ - with pytest.raises( - ManifestDecodeValidationError, match=r"manifest is missing required field\(s\).*" - ): - decode.decode_manifest('{"manifestVersion": "2023-03-03"}') - - -def test_decode_manifest_missing_manifest_version(): - """ - Test that a ManifestDecodeValidationError is raised if the manifest passed in is missing the manifestVersion field. - """ - with pytest.raises( - ManifestDecodeValidationError, - match='Manifest is missing the required "manifestVersion" field', - ): - decode.decode_manifest('{"hashAlg": "xxh128"}') - - -def test_decode_manifest_hash_not_alphanumeric(): - """ - Test that a ManifestDecodeValidationError is raised if the manifest contains non-alphanumeric hashes - """ - invalid_hashes: list[tuple[str, str]] = [ - ("no_dots", "O.o"), - ("no_foward_slash", "a/b"), - ("no_back_slash", "a\\\\b"), - ("no_tildas", "o~o"), - ] - - for path, hash in invalid_hashes: - with pytest.raises(ManifestDecodeValidationError, match=r".*is not alphanumeric"): - manifest_str = ( - "{" - '"hashAlg":"xxh128",' - '"manifestVersion":"2023-03-03",' - '"paths":[' - f'{{"hash":"{hash}","mtime":1679079744833848,"path":"{path}","size":1}}' - "]," - '"totalSize":10' - "}" - ) - decode.decode_manifest(manifest_str) diff --git a/test/unit/deadline_job_attachments/asset_manifests/test_manifest_model.py b/test/unit/deadline_job_attachments/asset_manifests/test_manifest_model.py deleted file mode 100644 index e4e38f00d..000000000 --- a/test/unit/deadline_job_attachments/asset_manifests/test_manifest_model.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""Tests for the asset manifest model""" - -import pytest - -from deadline.job_attachments.asset_manifests import ( - BaseManifestModel, - ManifestModelRegistry, - v2023_03_03, -) -from deadline.job_attachments.asset_manifests.versions import ManifestVersion - - -@pytest.mark.parametrize( - "version,expected_model", - [ - (ManifestVersion.v2023_03_03, v2023_03_03.ManifestModel), - ], -) -def test_get_manifest_model(version: ManifestVersion, expected_model: BaseManifestModel): - """ - Test to ensure that the appropriate manifest model is returned given a manifest version - """ - model = ManifestModelRegistry.get_manifest_model(version=version) - assert model == expected_model # type: ignore[comparison-overlap] - - -def test_get_manifest_model_no_manifest_for_version(): - """ - Test to ensure the correct error gets raised when there is no asset manifest model for a given version. 
- """ - with pytest.raises( - RuntimeError, match=r"No model for asset manifest version: (ManifestVersion.)?UNDEFINED" - ): - ManifestModelRegistry.get_manifest_model(version=ManifestVersion.UNDEFINED) diff --git a/test/unit/deadline_job_attachments/asset_manifests/v2023_03_03/__init__.py b/test/unit/deadline_job_attachments/asset_manifests/v2023_03_03/__init__.py deleted file mode 100644 index 8d929cc86..000000000 --- a/test/unit/deadline_job_attachments/asset_manifests/v2023_03_03/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/test/unit/deadline_job_attachments/asset_manifests/v2023_03_03/test_asset_manifest.py b/test/unit/deadline_job_attachments/asset_manifests/v2023_03_03/test_asset_manifest.py deleted file mode 100644 index d5c2d5efa..000000000 --- a/test/unit/deadline_job_attachments/asset_manifests/v2023_03_03/test_asset_manifest.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""Tests for the v2023-03-03 version of the manifest file.""" - -import json - -from deadline.job_attachments.asset_manifests.v2023_03_03.asset_manifest import ( - AssetManifest, - ManifestPath, -) -from deadline.job_attachments.asset_manifests import HashAlgorithm - - -def test_encode(): - """ - Ensure the expected JSON string is returned from the encode function. - """ - manifest = AssetManifest( - hash_alg=HashAlgorithm("xxh128"), - total_size=10, - paths=[ - ManifestPath(path="test_file", hash="a", size=1, mtime=167907934333848), - ManifestPath(path="test_dir/test_file", hash="b", size=1, mtime=1479079344833848), - ManifestPath(path="another_test_file", hash="c", size=1, mtime=1675079344833848), - ManifestPath(path="€", hash="EuroSign", size=1, mtime=1679079344836848), - ManifestPath(path="\r", hash="CarriageReturn", size=1, mtime=1679079744833848), - ManifestPath( - path="דּ", hash="HebrewLetterDaletWithDagesh", size=1, mtime=1679039344833848 - ), - ManifestPath(path="1", hash="One", size=1, mtime=1679079344833868), - ManifestPath(path="😀", hash="EmojiGrinningFace", size=1, mtime=1679579344833848), - ManifestPath(path="\u0080", hash="Control", size=1, mtime=1679079344833348), - ManifestPath( - path="ö", hash="LatinSmallLetterOWithDiaeresis", size=1, mtime=1679079344833848 - ), - ], - ) - - expected = ( - "{" - '"hashAlg":"xxh128",' - '"manifestVersion":"2023-03-03",' - '"paths":[' - r'{"hash":"CarriageReturn","mtime":1679079744833848,"path":"\r","size":1},' - '{"hash":"One","mtime":1679079344833868,"path":"1","size":1},' - '{"hash":"c","mtime":1675079344833848,"path":"another_test_file","size":1},' - '{"hash":"b","mtime":1479079344833848,"path":"test_dir/test_file","size":1},' - '{"hash":"a","mtime":167907934333848,"path":"test_file","size":1},' - r'{"hash":"Control","mtime":1679079344833348,"path":"\u0080","size":1},' - r'{"hash":"LatinSmallLetterOWithDiaeresis","mtime":1679079344833848,"path":"\u00f6","size":1},' - r'{"hash":"EuroSign","mtime":1679079344836848,"path":"\u20ac","size":1},' - r'{"hash":"EmojiGrinningFace","mtime":1679579344833848,"path":"\ud83d\ude00","size":1},' - r'{"hash":"HebrewLetterDaletWithDagesh","mtime":1679039344833848,"path":"\ufb33","size":1}' - "]," - '"totalSize":10' - "}" - ) - - a = manifest.encode() - assert a == expected - - -def test_decode(default_manifest_str_v2023_03_03: str): - """ - Ensure the expected AssetManifest is returned from the decode function. 
- """ - expected = AssetManifest( - hash_alg=HashAlgorithm("xxh128"), - total_size=10, - paths=[ - ManifestPath(path="\r", hash="CarriageReturn", size=1, mtime=1679079744833848), - ManifestPath(path="1", hash="One", size=1, mtime=1679079344833868), - ManifestPath(path="another_test_file", hash="c", size=1, mtime=1675079344833848), - ManifestPath(path="test_dir/test_file", hash="b", size=1, mtime=1479079344833848), - ManifestPath(path="test_file", hash="a", size=1, mtime=167907934333848), - ManifestPath(path="\u0080", hash="Control", size=1, mtime=1679079344833348), - ManifestPath(path="\u00c3\u00b1", hash="UserTestCase", size=1, mtime=1679579344833848), - ManifestPath( - path="ö", hash="LatinSmallLetterOWithDiaeresis", size=1, mtime=1679079344833848 - ), - ManifestPath(path="€", hash="EuroSign", size=1, mtime=1679079344836848), - ManifestPath(path="😀", hash="EmojiGrinningFace", size=1, mtime=1679579344833848), - ManifestPath(path="\ude0a", hash="EmojiTestCase", size=1, mtime=1679579344833848), - ManifestPath( - path="דּ", hash="HebrewLetterDaletWithDagesh", size=1, mtime=1679039344833848 - ), - ], - ) - assert ( - AssetManifest.decode(manifest_data=json.loads(default_manifest_str_v2023_03_03)) == expected - ) diff --git a/test/unit/deadline_job_attachments/aws/__init__.py b/test/unit/deadline_job_attachments/aws/__init__.py deleted file mode 100644 index 8d929cc86..000000000 --- a/test/unit/deadline_job_attachments/aws/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/test/unit/deadline_job_attachments/aws/test_aws_clients.py b/test/unit/deadline_job_attachments/aws/test_aws_clients.py deleted file mode 100644 index d8f0ca53c..000000000 --- a/test/unit/deadline_job_attachments/aws/test_aws_clients.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""Tests for aws clients""" - -import os -from unittest.mock import Mock, patch - -import pytest - -from deadline.job_attachments._aws.aws_clients import ( - get_boto3_session, - get_botocore_session, - get_deadline_client, - get_s3_client, - get_sts_client, -) -import deadline -from deadline.job_attachments._aws.aws_config import ( - S3_CONNECT_TIMEOUT_IN_SECS, - S3_READ_TIMEOUT_IN_SECS, -) - - -def _make_client(service_name, session=None): - """Create a client using the production factory functions with an optional fresh session.""" - if session is None: - session = get_boto3_session(get_botocore_session()) - factories = { - "s3": get_s3_client, - "sts": get_sts_client, - "deadline": get_deadline_client, - } - return factories[service_name](session=session) - - -def test_get_deadline_client(boto_config): - """ - Test that get_deadline_client returns the correct deadline client - """ - session_mock = Mock() - with patch( - f"{deadline.__package__}.job_attachments._aws.aws_clients.get_boto3_session" - ) as get_session: - get_session.return_value = session_mock - session_mock.client.return_value = Mock() - get_deadline_client() - - session_mock.client.assert_called_with("deadline", endpoint_url=None) - - -def test_get_deadline_client_non_default_endpoint(boto_config): - """ - Test that get_deadline_client returns the correct deadline client - and that the endpoint url is the given one when provided. 
- """ - test_endpoint = "https://test.com" - session_mock = Mock() - with patch( - f"{deadline.__package__}.job_attachments._aws.aws_clients.get_boto3_session" - ) as get_session: - get_session.return_value = session_mock - session_mock.client.return_value = Mock() - get_deadline_client(endpoint_url=test_endpoint) - - session_mock.client.assert_called_with("deadline", endpoint_url=test_endpoint) - - -def test_get_s3_client(boto_config): - """ - Test that get_s3_client returns a properly configured S3 client. - """ - s3_client = get_s3_client() - - assert s3_client.meta.config.signature_version == "s3v4" - assert s3_client.meta.config.connect_timeout == S3_CONNECT_TIMEOUT_IN_SECS - assert s3_client.meta.config.read_timeout == S3_READ_TIMEOUT_IN_SECS - - -def test_get_sts_client(boto_config): - sts_client = get_sts_client() - - assert sts_client.meta.service_model.service_name == "sts" - - -@pytest.mark.parametrize("service_name", ["s3", "sts", "deadline"]) -def test_default_regional_endpoint(boto_config, service_name): - """ - Test that S3 and STS clients (previously global by default) now use regional endpoints by default. - """ - region = os.environ["AWS_DEFAULT_REGION"] - client = _make_client(service_name) - assert client.meta.endpoint_url == f"https://{service_name}.{region}.amazonaws.com" - - -@pytest.mark.parametrize( - "service_name, env_var", - [ - ("s3", "AWS_ENDPOINT_URL_S3"), - ("sts", "AWS_ENDPOINT_URL_STS"), - ("deadline", "AWS_ENDPOINT_URL_DEADLINE"), - ], -) -def test_endpoint_url_override_via_env(boto_config, service_name, env_var): - """ - Test that clients respect service-specific AWS_ENDPOINT_URL_* environment variables. - """ - custom_endpoint = f"https://custom-{service_name}-env.example.com" - with patch.dict(os.environ, {env_var: custom_endpoint}): - client = _make_client(service_name) - assert client.meta.endpoint_url == custom_endpoint - - -@pytest.mark.parametrize( - "service_name", - ["s3", "sts", "deadline"], -) -def test_endpoint_url_override_via_config_profile(boto_config, tmp_path, service_name): - """ - Test that clients respect endpoint_url set in an AWS config profile. - """ - custom_endpoint = f"https://custom-{service_name}-config.example.com" - config_file = tmp_path / "config" - config_file.write_text(f""" -[profile testprofile] -services = testprofile-services - -[services testprofile-services] -{service_name} = - endpoint_url = {custom_endpoint} -""") - with patch.dict( - os.environ, - { - "AWS_CONFIG_FILE": str(config_file), - "AWS_PROFILE": "testprofile", - }, - ): - client = _make_client(service_name) - assert client.meta.endpoint_url == custom_endpoint diff --git a/test/unit/deadline_job_attachments/aws/test_deadline.py b/test/unit/deadline_job_attachments/aws/test_deadline.py deleted file mode 100644 index 9c58d3d26..000000000 --- a/test/unit/deadline_job_attachments/aws/test_deadline.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
- -"""Tests for Deadline AWS calls.""" - -import pytest -from unittest.mock import MagicMock, patch -from botocore.exceptions import ClientError - -import deadline -from deadline.job_attachments._aws.deadline import get_queue -from deadline.job_attachments.exceptions import JobAttachmentsError -from deadline.job_attachments.models import Queue - - -@patch(f"{deadline.__package__}.job_attachments._aws.aws_clients.get_boto3_session") -def test_get_queue(mock_get_boto3_session, default_queue: Queue, create_get_queue_response): - # Set up the mock session and mock deadline client - mock_session = MagicMock() - mock_get_boto3_session.return_value = mock_session - mock_deadline_client = MagicMock() - mock_session.client.return_value = mock_deadline_client - # Simulate a response from get_queue - mock_deadline_client.get_queue.return_value = create_get_queue_response(default_queue) - - result = get_queue(default_queue.farmId, default_queue.queueId) - - mock_get_boto3_session.assert_called_once() - mock_session.client.assert_called_with("deadline", endpoint_url=None) - mock_deadline_client.get_queue.assert_called_once_with( - farmId=default_queue.farmId, queueId=default_queue.queueId - ) - assert result == default_queue - - -@patch(f"{deadline.__package__}.job_attachments._aws.deadline.get_deadline_client") -def test_get_queue_client_error(mock_get_deadline_client, default_queue: Queue): - # Set up the mock deadline client - mock_client = mock_get_deadline_client.return_value - # Simulate a ClientError from get_queue - mock_client.get_queue.side_effect = ClientError( - {"Error": {"Code": "SomeErrorCode", "Message": "SomeErrorMessage"}}, - "GetQueue", - ) - - with pytest.raises(JobAttachmentsError) as exc_info: - get_queue(default_queue.farmId, default_queue.queueId) - - # Check that the correct exception is raised - assert 'Failed to get queue "queue-01234567890123456789012345678901" from Deadline: ' in str( - exc_info.value - ) diff --git a/test/unit/deadline_job_attachments/caches/__init__.py b/test/unit/deadline_job_attachments/caches/__init__.py deleted file mode 100644 index 8d929cc86..000000000 --- a/test/unit/deadline_job_attachments/caches/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/test/unit/deadline_job_attachments/caches/test_caches.py b/test/unit/deadline_job_attachments/caches/test_caches.py deleted file mode 100644 index 25978bf52..000000000 --- a/test/unit/deadline_job_attachments/caches/test_caches.py +++ /dev/null @@ -1,849 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
- -import os -import threading -from datetime import datetime -from sqlite3 import OperationalError -from unittest.mock import patch - -import pytest - -import deadline -from deadline.job_attachments.asset_manifests import HashAlgorithm -from deadline.job_attachments.exceptions import JobAttachmentsError -from deadline.job_attachments.caches import ( - CacheDB, - HashCache, - HashCacheEntry, - S3CheckCache, - S3CheckCacheEntry, - WHOLE_FILE_RANGE_END, -) - - -class TestCacheDB: - """ - Tests for the CacheDB abstract base class - """ - - def test_get_default_cache_db_file_dir_env_var_path_exists(self, tmpdir): - """ - Tests that when an environment variable exists, it uses that path for the hash cache - """ - expected_path = os.path.join(str(tmpdir), ".deadline", "job_attachments") - with patch( - "deadline.job_attachments.caches.cache_db.os.path.expanduser", return_value=str(tmpdir) - ): - assert CacheDB.get_default_cache_db_file_dir() == expected_path - - def test_init_empty_path_no_default_throws_error(self): - """ - Tests that when no cache file path is given and home dir cannot be resolved, an error is raised. - """ - with patch("deadline.job_attachments.caches.cache_db.os.path.expanduser", return_value="~"): - with pytest.raises(JobAttachmentsError): - CacheDB("name", "table", "query") - - def test_enter_bad_cache_path_throws_error(self, tmpdir): - """ - Tests that an error is raised when a bad path is provided to the CacheDB constructor - """ - with pytest.raises(JobAttachmentsError) as err: - cdb = CacheDB("name", "table", "query", tmpdir) - cdb.cache_dir = "/some/bad/path" - with cdb: - assert False, ( - "Context manager should throw a JobAttachmentsError, this assert should not be reached" - ) - assert isinstance(err.value.__cause__, OperationalError) - - @pytest.mark.parametrize( - "cache_name, table_name, create_query", - [ - pytest.param("", "table", "query"), - pytest.param("name", "", "query"), - pytest.param("name", "table", ""), - ], - ) - def test_init_throws_error_on_empty_strings(self, cache_name, table_name, create_query): - """Tests that a JobAttachmentsError is raised if init args are empty""" - with pytest.raises(JobAttachmentsError): - CacheDB(cache_name, table_name, create_query) - - def test_get_local_connection_same_thread(self, tmpdir): - """Tests that get_local_connection returns the same connection for a single thread""" - cache_dir = tmpdir.mkdir("cache") - - with CacheDB( - "test", "test_table", "CREATE TABLE test_table (id INTEGER)", cache_dir - ) as cdb: - # Get connection from main thread - conn1 = cdb.get_local_connection() - conn2 = cdb.get_local_connection() - - # Should return same connection for same thread - assert conn1 is conn2 - - def test_get_local_connection_different_threads(self, tmpdir): - """Tests that get_local_connection creates separate connections for different threads""" - cache_dir = tmpdir.mkdir("cache") - connections = {} - - # Create the cache and table first - with CacheDB( - "test", "test_table", "CREATE TABLE test_table (id INTEGER)", cache_dir - ) as cdb: - - def get_connection(thread_id): - connections[thread_id] = cdb.get_local_connection() - - # Create connections from different threads - thread1 = threading.Thread(target=get_connection, args=(1,)) - thread2 = threading.Thread(target=get_connection, args=(2,)) - - thread1.start() - thread2.start() - thread1.join() - thread2.join() - - # Connections should be different for different threads - assert connections[1] is not connections[2] - - def 
test_get_local_connection_handles_sqlite_error(self, tmpdir): - """Tests that get_local_connection raises JobAttachmentsError on SQLite errors""" - with CacheDB("test", "test_table", "CREATE TABLE test_table (id INTEGER)", tmpdir) as cdb: - # Mock sqlite3.connect to raise OperationalError - with patch("sqlite3.connect", side_effect=OperationalError("test error")): - with pytest.raises(JobAttachmentsError) as exc_info: - cdb.get_local_connection() - assert "Could not create connection to cache" in str(exc_info.value) - - def test_enter_retries_on_operational_error(self, tmpdir): - """Tests that __enter__ retries on OperationalError and succeeds on final attempt""" - from unittest.mock import MagicMock - - # Create a mock connection that will be returned on successful connect - mock_connection = MagicMock() - mock_connection.execute.return_value = None - - # Create side effect that fails twice then succeeds - connect_calls = 0 - - def connect_side_effect(*args, **kwargs): - nonlocal connect_calls - connect_calls += 1 - if connect_calls <= 2: - raise OperationalError("database is locked") - return mock_connection - - with patch("sqlite3.connect", side_effect=connect_side_effect): - # This should succeed after 2 retries - with CacheDB( - "test", "test_table", "CREATE TABLE test_table (id INTEGER)", tmpdir - ) as cdb: - # Verify the connection was established - assert cdb.db_connection == mock_connection - # Verify we made the expected number of connection attempts - assert connect_calls == CacheDB._RETRY_ATTEMPTS - - def test_enter_fails_after_max_retries(self, tmpdir): - """Tests that __enter__ fails with JobAttachmentsError after max retries""" - - # Track connection attempts - connect_calls = 0 - - def connect_side_effect(*args, **kwargs): - nonlocal connect_calls - connect_calls += 1 - raise OperationalError("database is locked") - - # Mock sqlite3.connect to always raise OperationalError - with patch("sqlite3.connect", side_effect=connect_side_effect): - with pytest.raises(JobAttachmentsError) as exc_info: - with CacheDB("test", "test_table", "CREATE TABLE test_table (id INTEGER)", tmpdir): - pass - - # Verify the error message indicates retry exhaustion - assert ( - f"Could not access cache file after {CacheDB._RETRY_ATTEMPTS} retry attempts" - in str(exc_info.value) - ) - # Verify we made the expected number of connection attempts - assert connect_calls == CacheDB._RETRY_ATTEMPTS - - def test_get_local_connection_retries_on_operational_error(self, tmpdir): - """Tests that get_local_connection retries on OperationalError and succeeds""" - from unittest.mock import MagicMock - - # Create a mock connection that will be returned on successful connect - mock_connection = MagicMock() - - # Create side effect that fails twice then succeeds - connect_calls = 0 - - def connect_side_effect(*args, **kwargs): - nonlocal connect_calls - connect_calls += 1 - if connect_calls <= 2: - raise OperationalError("database is locked") - return mock_connection - - with CacheDB("test", "test_table", "CREATE TABLE test_table (id INTEGER)", tmpdir) as cdb: - with patch("sqlite3.connect", side_effect=connect_side_effect): - # This should succeed after 2 retries - connection = cdb.get_local_connection() - - # Verify the connection was established - assert connection == mock_connection - # Verify we made the expected number of connection attempts - assert connect_calls == CacheDB._RETRY_ATTEMPTS - - -class TestHashCache: - """ - Tests for the local Hash Cache - """ - - def test_init_empty_path(self, tmpdir): - """ 
- Tests that when no cache file path is given, the default is used. - """ - with patch( - f"{deadline.__package__}.job_attachments.caches.CacheDB.get_default_cache_db_file_dir", - side_effect=[tmpdir], - ): - hc = HashCache() - assert hc.cache_dir == tmpdir.join(f"{HashCache.CACHE_NAME}.db") - - @pytest.mark.parametrize( - "file_path", - [ - # Simple ascii filename - pytest.param("file", id="ascii_name"), - # Name from test case that was failing on Windows for a user - pytest.param("ñ/\u00c3\u00b1.txt", id="regression_test_filename"), - # Name from a generated emoji filename on Windows - pytest.param("\ude0a.txt", id="surrogate_emoji_example"), - ], - ) - def test_get_entry_returns_valid_entry(self, tmpdir, file_path): - """ - Tests that a valid entry is returned when it exists in the cache already - """ - # GIVEN - cache_dir = tmpdir.mkdir("cache") - expected_entry = HashCacheEntry( - file_path=file_path, - hash_algorithm=HashAlgorithm.XXH128, - file_hash="hash", - last_modified_time="1234.5678", - ) - - # WHEN - with HashCache(cache_dir) as hc: - hc.put_entry(expected_entry) - actual_entry = hc.get_entry(file_path, HashAlgorithm.XXH128) - - # THEN - assert actual_entry == expected_entry - - def test_enter_sqlite_import_error(self, tmpdir): - """ - Tests that the cache doesn't throw errors when the SQLite module can't be found - """ - with patch.dict("sys.modules", {"sqlite3": None}): - new_dir = tmpdir.join("does_not_exist") - hc = HashCache(new_dir) - assert not os.path.exists(new_dir) - with hc: - assert hc.get_entry("/no/file", HashAlgorithm.XXH128) is None - hc.put_entry( - HashCacheEntry( - file_path="/no/file", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="abc", - last_modified_time="1234.56", - ) - ) - assert hc.get_entry("/no/file", HashAlgorithm.XXH128) is None - - def test_get_entry_with_byte_range(self, tmpdir): - """ - Tests that a byte range entry is returned when it exists in the cache - """ - # GIVEN - cache_dir = tmpdir.mkdir("cache") - expected_entry = HashCacheEntry( - file_path="large_file.bin", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="chunk_hash_1", - last_modified_time="1234.5678", - range_start=0, - range_end=268435456, # 256MB - ) - - # WHEN - with HashCache(cache_dir) as hc: - hc.put_entry(expected_entry) - actual_entry = hc.get_entry( - "large_file.bin", HashAlgorithm.XXH128, range_start=0, range_end=268435456 - ) - - # THEN - assert actual_entry == expected_entry - assert actual_entry.range_start == 0 - assert actual_entry.range_end == 268435456 - - def test_get_entry_multiple_byte_ranges_same_file(self, tmpdir): - """ - Tests that multiple byte range entries for the same file are stored and retrieved correctly - """ - # GIVEN - cache_dir = tmpdir.mkdir("cache") - chunk_size = 268435456 # 256MB - entries = [ - HashCacheEntry( - file_path="large_file.bin", - hash_algorithm=HashAlgorithm.XXH128, - file_hash=f"chunk_hash_{i}", - last_modified_time="1234.5678", - range_start=i * chunk_size, - range_end=(i + 1) * chunk_size, - ) - for i in range(4) # 4 chunks - ] - - # WHEN - with HashCache(cache_dir) as hc: - for entry in entries: - hc.put_entry(entry) - - # THEN - each chunk should be retrievable independently - for i, expected_entry in enumerate(entries): - actual_entry = hc.get_entry( - "large_file.bin", - HashAlgorithm.XXH128, - range_start=i * chunk_size, - range_end=(i + 1) * chunk_size, - ) - assert actual_entry == expected_entry - assert actual_entry.file_hash == f"chunk_hash_{i}" - - def test_get_entry_byte_range_not_found(self, tmpdir): - 
""" - Tests that None is returned when a specific byte range doesn't exist - """ - # GIVEN - cache_dir = tmpdir.mkdir("cache") - entry = HashCacheEntry( - file_path="file.bin", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="chunk_hash", - last_modified_time="1234.5678", - range_start=0, - range_end=1000, - ) - - # WHEN - with HashCache(cache_dir) as hc: - hc.put_entry(entry) - - # THEN - different range should return None - assert ( - hc.get_entry("file.bin", HashAlgorithm.XXH128, range_start=0, range_end=2000) - is None - ) - assert ( - hc.get_entry("file.bin", HashAlgorithm.XXH128, range_start=1000, range_end=2000) - is None - ) - - def test_get_entry_whole_file_vs_byte_range_independent(self, tmpdir): - """ - Tests that whole-file hashes and byte-range hashes are stored independently - """ - # GIVEN - cache_dir = tmpdir.mkdir("cache") - whole_file_entry = HashCacheEntry( - file_path="file.bin", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="whole_file_hash", - last_modified_time="1234.5678", - range_start=0, - range_end=WHOLE_FILE_RANGE_END, - ) - chunk_entry = HashCacheEntry( - file_path="file.bin", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="chunk_hash", - last_modified_time="1234.5678", - range_start=0, - range_end=1000, - ) - - # WHEN - with HashCache(cache_dir) as hc: - hc.put_entry(whole_file_entry) - hc.put_entry(chunk_entry) - - # THEN - both should be retrievable independently - actual_whole = hc.get_entry("file.bin", HashAlgorithm.XXH128) - actual_chunk = hc.get_entry( - "file.bin", HashAlgorithm.XXH128, range_start=0, range_end=1000 - ) - - assert actual_whole == whole_file_entry - assert actual_whole.file_hash == "whole_file_hash" - assert actual_chunk == chunk_entry - assert actual_chunk.file_hash == "chunk_hash" - - def test_get_connection_entry_with_byte_range(self, tmpdir): - """ - Tests that get_connection_entry works with byte range parameters - """ - # GIVEN - cache_dir = tmpdir.mkdir("cache") - expected_entry = HashCacheEntry( - file_path="file.bin", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="chunk_hash", - last_modified_time="1234.5678", - range_start=1000, - range_end=2000, - ) - - # WHEN - with HashCache(cache_dir) as hc: - hc.put_entry(expected_entry) - connection = hc.get_local_connection() - actual_entry = hc.get_connection_entry( - "file.bin", HashAlgorithm.XXH128, connection, range_start=1000, range_end=2000 - ) - - # THEN - assert actual_entry == expected_entry - - def test_hash_cache_entry_is_whole_file(self): - """ - Tests the is_whole_file() helper method on HashCacheEntry - """ - whole_file = HashCacheEntry( - file_path="file.txt", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="hash", - last_modified_time="1234.5678", - ) - assert whole_file.is_whole_file() is True - - chunk = HashCacheEntry( - file_path="file.txt", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="hash", - last_modified_time="1234.5678", - range_start=0, - range_end=1000, - ) - assert chunk.is_whole_file() is False - - # Edge case: range_start != 0 but range_end == -1 should not be whole file - weird_entry = HashCacheEntry( - file_path="file.txt", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="hash", - last_modified_time="1234.5678", - range_start=100, - range_end=WHOLE_FILE_RANGE_END, - ) - assert weird_entry.is_whole_file() is False - - def test_put_entry_replaces_existing_byte_range(self, tmpdir): - """ - Tests that put_entry replaces an existing entry with the same byte range - """ - # GIVEN - cache_dir = tmpdir.mkdir("cache") - original_entry 
= HashCacheEntry( - file_path="file.bin", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="original_hash", - last_modified_time="1234.5678", - range_start=0, - range_end=1000, - ) - updated_entry = HashCacheEntry( - file_path="file.bin", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="updated_hash", - last_modified_time="9999.9999", - range_start=0, - range_end=1000, - ) - - # WHEN - with HashCache(cache_dir) as hc: - hc.put_entry(original_entry) - hc.put_entry(updated_entry) - actual_entry = hc.get_entry( - "file.bin", HashAlgorithm.XXH128, range_start=0, range_end=1000 - ) - - # THEN - assert actual_entry.file_hash == "updated_hash" - assert actual_entry.last_modified_time == "9999.9999" - - def test_hash_cache_entry_to_dict_includes_range(self): - """ - Tests that to_dict() includes range_start and range_end - """ - entry = HashCacheEntry( - file_path="file.bin", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="hash", - last_modified_time="1234.5678", - range_start=100, - range_end=200, - ) - result = entry.to_dict() - - assert result["file_path"] == "file.bin" - assert result["hash_algorithm"] == "xxh128" - assert result["file_hash"] == "hash" - assert result["last_modified_time"] == "1234.5678" - assert result["range_start"] == 100 - assert result["range_end"] == 200 - - def test_hash_cache_entry_validates_byte_range(self): - """ - Tests that HashCacheEntry raises ValueError when range_end <= range_start for byte-range entries - """ - # Valid byte-range entry should work - HashCacheEntry( - file_path="file.bin", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="hash", - last_modified_time="1234.5678", - range_start=0, - range_end=100, - ) - - # Whole-file entry (range_end=-1) should work regardless of range_start - HashCacheEntry( - file_path="file.bin", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="hash", - last_modified_time="1234.5678", - range_start=0, - range_end=WHOLE_FILE_RANGE_END, - ) - - # Invalid: range_end == range_start - with pytest.raises(ValueError, match="range_end.*must be greater than.*range_start"): - HashCacheEntry( - file_path="file.bin", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="hash", - last_modified_time="1234.5678", - range_start=100, - range_end=100, - ) - - # Invalid: range_end < range_start - with pytest.raises(ValueError, match="range_end.*must be greater than.*range_start"): - HashCacheEntry( - file_path="file.bin", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="hash", - last_modified_time="1234.5678", - range_start=200, - range_end=100, - ) - - -class TestS3CheckCache: - """ - Tests for the local S3 Check Hash - """ - - def test_init_empty_path(self, tmpdir): - """ - Tests that when no cache file path is given, the default is used. 
- """ - with patch( - f"{deadline.__package__}.job_attachments.caches.CacheDB.get_default_cache_db_file_dir", - side_effect=[tmpdir], - ): - s3c = S3CheckCache() - assert s3c.cache_dir == tmpdir.join(f"{S3CheckCache.CACHE_NAME}.db") - - def test_get_entry_returns_valid_entry(self, tmpdir): - """ - Tests that a valid entry is returned when it exists in the cache already - """ - # GIVEN - cache_dir = tmpdir.mkdir("cache") - expected_entry = S3CheckCacheEntry( - s3_key="bucket/Data/somehash", - last_seen_time=str(datetime.now().timestamp()), - ) - - # WHEN - with S3CheckCache(cache_dir) as s3c: - s3c.put_entry(expected_entry) - actual_entry = s3c.get_entry("bucket/Data/somehash") - - # THEN - assert actual_entry == expected_entry - - def test_get_entry_returns_none_with_expired_entry(self, tmpdir): - """ - Tests that nothing is returned when an existing entry is expired - """ - # GIVEN - cache_dir = tmpdir.mkdir("cache") - expected_entry = S3CheckCacheEntry( - s3_key="bucket/Data/somehash", - last_seen_time="123.456", # a looong time ago - ) - - # WHEN - with S3CheckCache(cache_dir) as s3c: - s3c.put_entry(expected_entry) - actual_entry = s3c.get_entry("bucket/Data/somehash") - - # THEN - assert actual_entry is None - - def test_enter_sqlite_import_error(self, tmpdir): - """ - Tests that the cache doesn't throw errors when the SQLite module can't be found - """ - with patch.dict("sys.modules", {"sqlite3": None}): - new_dir = tmpdir.join("does_not_exist") - s3c = S3CheckCache(new_dir) - assert not os.path.exists(new_dir) - with s3c: - assert s3c.get_entry("bucket/Data/somehash") is None - s3c.put_entry( - S3CheckCacheEntry( - s3_key="bucket/Data/somehash", - last_seen_time=str(datetime.now().timestamp()), - ) - ) - assert s3c.get_entry("bucket/Data/somehash") is None - - def test_delete_cache(self, tmpdir): - """ - Tests if the cache file can be deleted when calling remove_cache - """ - cache_dir = tmpdir.mkdir("cache") - with S3CheckCache(cache_dir) as s3c: - file_name: str = os.path.join(cache_dir, "s3_check_cache.db") - assert os.path.exists(file_name) - s3c.remove_cache() - - # Test if the cache file was deleted - assert not os.path.exists(file_name) - - def test_get_connection_entry_returns_valid_entry(self, tmpdir): - """Tests that get_connection_entry returns a valid entry with provided connection""" - cache_dir = tmpdir.mkdir("cache") - expected_entry = S3CheckCacheEntry( - s3_key="bucket/Data/somehash", - last_seen_time=str(datetime.now().timestamp()), - ) - - with S3CheckCache(cache_dir) as s3c: - s3c.put_entry(expected_entry) - connection = s3c.get_local_connection() - actual_entry = s3c.get_connection_entry("bucket/Data/somehash", connection) - - assert actual_entry == expected_entry - - def test_get_connection_entry_returns_none_for_nonexistent_key(self, tmpdir): - """Tests that get_connection_entry returns None for non-existent key""" - cache_dir = tmpdir.mkdir("cache") - - with S3CheckCache(cache_dir) as s3c: - connection = s3c.get_local_connection() - actual_entry = s3c.get_connection_entry("nonexistent/key", connection) - - assert actual_entry is None - - def test_get_connection_entry_returns_none_for_expired_entry(self, tmpdir): - """Tests that get_connection_entry returns None for expired entries""" - cache_dir = tmpdir.mkdir("cache") - expired_entry = S3CheckCacheEntry( - s3_key="bucket/Data/somehash", - last_seen_time="123.456", # very old timestamp - ) - - with S3CheckCache(cache_dir) as s3c: - s3c.put_entry(expired_entry) - connection = s3c.get_local_connection() - 
actual_entry = s3c.get_connection_entry("bucket/Data/somehash", connection) - - assert actual_entry is None - - def test_concurrent_read_write_operations_should_not_lock(self, tmpdir): - """Test that concurrent reads and writes don't cause database locked errors""" - import threading - import time - - cache_dir = tmpdir.mkdir("cache") - errors = {} - results = {} - - with HashCache(cache_dir) as cache: - - def aggressive_writer_thread(thread_id): - """Thread that aggressively writes to cache with transactions""" - try: - for i in range(20): # More operations - entry = HashCacheEntry( - file_path=f"/test/file_{thread_id}_{i}.txt", - hash_algorithm=HashAlgorithm.XXH128, - file_hash=f"hash_{thread_id}_{i}", - last_modified_time=str(time.time()), - ) - cache.put_entry(entry) - except Exception as e: - errors[f"writer_{thread_id}"] = str(e) - - def aggressive_reader_thread(thread_id): - """Thread that aggressively reads from cache using thread-local connection""" - try: - conn = cache.get_local_connection() - for i in range(50): # Many more read operations - # Try to read - this should never get "database is locked" - entry = cache.get_connection_entry( - f"/test/file_{i % 4}_{i % 5}.txt", HashAlgorithm.XXH128, conn - ) - results[f"reader_{thread_id}_{i}"] = entry is not None - except Exception as e: - errors[f"reader_{thread_id}"] = str(e) - - # Start many more threads concurrently - threads = [] - - # Start 5 writer threads (more write contention) - for i in range(5): - t = threading.Thread(target=aggressive_writer_thread, args=(i,)) - threads.append(t) - t.start() - - # Start 10 reader threads (more read contention) - for i in range(10): - t = threading.Thread(target=aggressive_reader_thread, args=(i,)) - threads.append(t) - t.start() - - # Wait for all threads - for t in threads: - t.join() - - # Should have no "database is locked" errors - locked_errors = {k: v for k, v in errors.items() if "database is locked" in v} - assert len(locked_errors) == 0, f"Got database locked errors: {locked_errors}" - assert len(errors) == 0, f"Got other errors: {errors}" - - def test_large_db_concurrent_operations_expose_timeout_issues(self, tmpdir): - """Test that large database (~50MB) exposes timeout issues without proper SQLite configuration""" - import threading - import time - - cache_dir = tmpdir.mkdir("cache") - - # Prepopulate database to ~50MB (approximately 500K records for more realistic size) - print("Prepopulating database to ~50MB...") - with HashCache(cache_dir) as cache: - # Initialize cache with one entry to ensure table exists - init_entry = HashCacheEntry( - file_path="/init.txt", - hash_algorithm=HashAlgorithm.XXH128, - file_hash="init_hash", - last_modified_time=str(time.time()), - ) - cache.put_entry(init_entry) - - conn = cache.db_connection - batch_data = [] - batch_size = 10000 - - for i in range(500000): # Increased to 500K records - batch_data.append( - ( - f"/large/test/file_{i:06d}.txt", - HashAlgorithm.XXH128.value, - f"hash_{i:032x}" * 2, # Longer hash to increase record size - str(time.time() + i), - ) - ) - - if len(batch_data) >= batch_size: - conn.executemany( - f"INSERT OR REPLACE INTO {cache.table_name} (file_path, hash_algorithm, file_hash, last_modified_time) VALUES (?, ?, ?, ?)", - batch_data, - ) - conn.commit() - batch_data = [] - if i % 50000 == 0: - print(f" Added {i + 1} records...") - - # Insert remaining records - if batch_data: - conn.executemany( - f"INSERT OR REPLACE INTO {cache.table_name} (file_path, hash_algorithm, file_hash, last_modified_time) VALUES 
(?, ?, ?, ?)", - batch_data, - ) - conn.commit() - - print("Database prepopulated. Starting aggressive concurrency test...") - - errors = {} - results = {} - - with HashCache(cache_dir) as cache: - - def aggressive_writer_thread(thread_id): - try: - for i in range(50): # More write operations - entry = HashCacheEntry( - file_path=f"/concurrent/file_{thread_id}_{i}.txt", - hash_algorithm=HashAlgorithm.XXH128, - file_hash=f"concurrent_hash_{thread_id}_{i}", - last_modified_time=str(time.time()), - ) - cache.put_entry(entry) - except Exception as e: - errors[f"writer_{thread_id}"] = str(e) - - def aggressive_reader_thread(thread_id): - try: - conn = cache.get_local_connection() - for i in range(100): # Many more read operations - entry = cache.get_connection_entry( - f"/large/test/file_{i:06d}.txt", HashAlgorithm.XXH128, conn - ) - results[f"reader_{thread_id}_{i}"] = entry is not None - except Exception as e: - errors[f"reader_{thread_id}"] = str(e) - - # Start many concurrent operations on large database - threads = [] - for i in range(15): # More writer threads - t = threading.Thread(target=aggressive_writer_thread, args=(i,)) - threads.append(t) - t.start() - - for i in range(25): # More reader threads - t = threading.Thread(target=aggressive_reader_thread, args=(i,)) - threads.append(t) - t.start() - - for t in threads: - t.join() - - # With proper SQLite configuration (timeout + WAL), should have no lock errors - locked_errors = {k: v for k, v in errors.items() if "database is locked" in v} - assert len(locked_errors) == 0, f"Got database locked errors: {locked_errors}" - assert len(errors) == 0, f"Got other errors: {errors}" diff --git a/test/unit/deadline_job_attachments/conftest.py b/test/unit/deadline_job_attachments/conftest.py deleted file mode 100644 index 272e377c0..000000000 --- a/test/unit/deadline_job_attachments/conftest.py +++ /dev/null @@ -1,523 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -""" -Common fixtures for deadline tests. -""" - -from __future__ import annotations -import dataclasses -import json -import os -import tempfile -from datetime import datetime -from io import BytesIO -from typing import Any, Callable, Generator - -import pytest -from moto import mock_aws - -from botocore.client import BaseClient # noqa: E402 isort:skip - -from deadline.job_attachments._aws import aws_clients # noqa: E402 isort:skip -from deadline.job_attachments.asset_sync import AssetSync # noqa: E402 isort:skip -from deadline.job_attachments.models import ( # noqa: E402 isort:skip - JobAttachmentsFileSystem, - Attachments, - ManifestProperties, - Job, - JobAttachmentS3Settings, - PathFormat, - Queue, -) - - -@pytest.fixture(scope="function") -def temp_dir(): - """ - Fixture to provide a temporary directory. - """ - - with tempfile.TemporaryDirectory() as temp_dir: - yield temp_dir - - -@pytest.fixture(scope="function") -def temp_assets_dir(): - """ - Fixture to provide a temporary directory for asset files. 
- """ - - with tempfile.TemporaryDirectory() as assets_dir: - yield assets_dir - - -@pytest.fixture(scope="function", autouse=True) -def boto_config() -> Generator[None, None, None]: - os.environ["AWS_ACCESS_KEY_ID"] = "ACCESSKEY" - os.environ["AWS_SECRET_ACCESS_KEY"] = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" - os.environ["AWS_SECURITY_TOKEN"] = "testing" - os.environ["AWS_SESSION_TOKEN"] = "testing" - os.environ["AWS_DEFAULT_REGION"] = "us-west-2" - - os.environ.pop("AWS_ENDPOINT_URL_S3", None) - os.environ.pop("AWS_ENDPOINT_URL_STS", None) - os.environ.pop("AWS_ENDPOINT_URL_DEADLINE", None) - - mock = mock_aws() - mock.start() - yield - mock.stop() - - -@pytest.fixture(scope="function", name="s3") -def s3_fixture(boto_config) -> Generator[BaseClient, None, None]: - """ - Fixture to get a mock S3 client. - """ - yield aws_clients.get_s3_client() - - -@pytest.fixture(scope="function") -def create_s3_bucket(boto_config, s3) -> Callable[[str], None]: # pylint: disable=invalid-name - """ - Fixture that returns a function that creates moto S3 buckets. - """ - - def create_bucket(bucket_name): - s3.create_bucket( - Bucket=bucket_name, CreateBucketConfiguration={"LocationConstraint": "us-west-2"} - ) - - return create_bucket - - -@pytest.fixture(name="default_job_attachment_s3_settings") -def fixture_default_job_attachment_s3_settings(): - """ - Fixture returning default settings for an S3 bucket associated with a Queue - """ - return JobAttachmentS3Settings( - s3BucketName="test-bucket", - rootPrefix="assetRoot", - ) - - -@pytest.fixture(name="default_attachments") -def fixture_default_attachments(farm_id, queue_id): - """ - Fixture returning default settings for a Job - """ - return Attachments( - manifests=[ - ManifestProperties( - rootPath="/tmp", - rootPathFormat=PathFormat.POSIX, - inputManifestPath=f"assetRoot/Manifests/{farm_id}/{queue_id}/Inputs/0000/manifest_input", - inputManifestHash="manifesthash", - outputRelativeDirectories=["test/outputs"], - ) - ], - ) - - -@pytest.fixture(name="vfs_attachments") -def fixture_vfs_attachments(): - """ - Fixture returning default settings for a Job - """ - return Attachments( - manifests=[ - ManifestProperties( - rootPath="/tmp", - rootPathFormat=PathFormat.POSIX, - inputManifestPath="manifest.json", - inputManifestHash="manifesthash", - outputRelativeDirectories=["test/outputs"], - ) - ], - fileSystem=JobAttachmentsFileSystem.VIRTUAL, - ) - - -@pytest.fixture(name="windows_attachments") -def fixture_windows_attachments(): - """ - Fixture returning default settings for a Job submitted on a Windows machine - """ - return Attachments( - manifests=[ - ManifestProperties( - rootPath=r"C:\Users\temp", - rootPathFormat=PathFormat.WINDOWS, - inputManifestPath="manifest.json", - inputManifestHash="manifesthash", - outputRelativeDirectories=["test\\outputs"], - ) - ], - ) - - -@pytest.fixture(name="attachments_no_inputs") -def fixture_attachments_no_required_assets(): - """ - Fixture returning Job settings with no required assets (inputs) - """ - return Attachments( - manifests=[ - ManifestProperties( - rootPath="/tmp", - rootPathFormat=PathFormat.POSIX, - outputRelativeDirectories=["test/outputs"], - ) - ], - ) - - -@pytest.fixture(name="default_asset_sync") -def fixture_default_asset_sync(farm_id: str): - """ - Fixture returning a default AssetSync instance - """ - return AssetSync(farm_id) - - -@pytest.fixture -def assert_manifest(): - """ - Assert that a manifest file in a mock s3 bucket matches what's expected. 
- """ - - def __inner_func__(bucket, manifest_location, expected_manifest): - with BytesIO() as manifest: - bucket.download_fileobj(manifest_location, manifest) - manifest_json = json.loads(manifest.getvalue().decode("utf-8")) - - assert manifest_json == expected_manifest - - return __inner_func__ - - -@pytest.fixture -def assert_canonical_manifest(): - """ - Assert that a canonical manifest file in a mock s3 bucket matches what's expected. - """ - - def __inner_func__(bucket, manifest_location: str, expected_manifest: str): - with BytesIO() as manifest: - bucket.download_fileobj(manifest_location, manifest) - - assert manifest.getvalue().decode("utf-8") == expected_manifest - - return __inner_func__ - - -@pytest.fixture -def assert_expected_files_on_s3(): - """ - Assert that all provided files are in an S3 bucket. - """ - - def __inner_func__(bucket, expected_files): - actual_files = set() - - for bucket_object in bucket.objects.all(): - actual_files.add(bucket_object.key) - - assert actual_files == expected_files - - return __inner_func__ - - -@pytest.fixture -def variables(): - return { - "frame": 1, - } - - -@pytest.fixture -def default_manifest_str_v2023_03_03() -> str: - manifest_file = os.path.abspath( - os.path.join(os.path.dirname(__file__), "data", "manifest_v2023_03_03.json") - ) - with open(manifest_file) as f: - return f.read() - - -@pytest.fixture -def farm_id(): - return "farm-1234567890abcdefghijklmnopqrstuv" - - -@pytest.fixture -def queue_id(): - return "queue-01234567890123456789012345678901" - - -@pytest.fixture -def job_id(): - return "job-01234567890123456789012345678901" - - -@pytest.fixture -def session_action_id(): - return "session-action-1" - - -@pytest.fixture(name="default_job") -def fixture_default_job(job_id, default_attachments): - """ - Fixture returning a job that can be used for most tests. - """ - return Job( - jobId=job_id, - attachments=default_attachments, - ) - - -@pytest.fixture(name="vfs_job") -def fixture_vfs_job(job_id, vfs_attachments): - """ - Fixture returning a job that can be used for most tests. - """ - return Job( - jobId=job_id, - attachments=vfs_attachments, - ) - - -@pytest.fixture(name="default_queue") -def fixture_default_queue(farm_id, queue_id, default_job_attachment_s3_settings): - """ - Fixture returning a queue that can be used for most tests. 
- """ - return Queue( - displayName="queue_name", - queueId=queue_id, - farmId=farm_id, - status="ENABLED", - defaultBudgetAction="None", - jobAttachmentSettings=default_job_attachment_s3_settings, - ) - - -@pytest.fixture(scope="function") -def create_get_queue_response(response_metadata) -> Callable[[Queue], dict[str, Any]]: - """ - Fixture used to create get_queue responses - """ - - def _inner_func_(queue_info: Queue): - response = dict( - dataclasses.asdict( - queue_info, dict_factory=lambda x: {k: v for (k, v) in x if v is not None} - ), - **response_metadata, - ) - - response["createdAt"] = datetime.strptime( - "2023-07-13 14:35:26.123456", "%Y-%m-%d %H:%M:%S.%f" - ) - response["createdBy"] = "job attachments tests" - response["defaultBudgetAction"] = "None" - - return response - - return _inner_func_ - - -@pytest.fixture(scope="function") -def create_get_job_response(response_metadata) -> Callable[[Job], dict[str, Any]]: - """ - Fixture used to create get_job responses - """ - - def _inner_func_(job_info: Job): - now = datetime.now() - return dict( - dataclasses.asdict( - job_info, dict_factory=lambda x: {k: v for (k, v) in x if v is not None} - ), - **{ - "jobId": job_info.jobId, - "createdAt": now, - "lifecycleStatus": "READY", - "createdBy": "CreatedBy", - "taskRunStatusCounts": {"READY": 1}, - "priority": 50, - }, - **response_metadata, - ) - - return _inner_func_ - - -@pytest.fixture(name="response_metadata") -def fixture_response_metadata(): - """ - Fixture returning a ResponseMetadata to be included in get_queue, get_job response - """ - return { - "ResponseMetadata": { - "RequestId": "abc123", - "HTTPStatusCode": 200, - "HostId": "abc123", - } - } - - -@pytest.fixture(name="test_manifest_one") -def fixture_test_manifest_one(): - return { - "hashAlg": "xxh128", - "manifestVersion": "2023-03-03", - "paths": [ - { - "hash": "a96ddfc33590cd7d2391f1972f66a72a", - "mtime": 1111111111111111, - "path": "a.txt", - "size": 2, - }, - { - "hash": "b96ddfc33590cd7d2391f1972f66a72a", - "mtime": 2222222222222222, - "path": "b.txt", - "size": 4, - }, - { - "hash": "c96ddfc33590cd7d2391f1972f66a72a", - "mtime": 3333333333333333, - "path": "c.txt", - "size": 6, - }, - ], - "totalSize": 12, - } - - -@pytest.fixture(name="test_manifest_two") -def fixture_test_manifest_two(): - return { - "hashAlg": "xxh128", - "manifestVersion": "2023-03-03", - "paths": [ - { - "hash": "a20ddfc33590cd7d2391f1972f66a72a", - "mtime": 4444444444444444, - "path": "a.txt", - "size": 20, - }, - { - "hash": "d96ddfc33590cd7d2391f1972f66a72a", - "mtime": 5555555555555555, - "path": "d.txt", - "size": 40, - }, - ], - "totalSize": 60, - } - - -@pytest.fixture(name="merged_manifest") -def fixture_merged_manifest(): - return { - "hashAlg": "xxh128", - "manifestVersion": "2023-03-03", - "paths": [ - { - "hash": "a20ddfc33590cd7d2391f1972f66a72a", - "mtime": 4444444444444444, - "path": "a.txt", - "size": 20, - }, - { - "hash": "b96ddfc33590cd7d2391f1972f66a72a", - "mtime": 2222222222222222, - "path": "b.txt", - "size": 4, - }, - { - "hash": "c96ddfc33590cd7d2391f1972f66a72a", - "mtime": 3333333333333333, - "path": "c.txt", - "size": 6, - }, - { - "hash": "d96ddfc33590cd7d2391f1972f66a72a", - "mtime": 5555555555555555, - "path": "d.txt", - "size": 40, - }, - ], - "totalSize": 70, - } - - -@pytest.fixture(name="really_big_manifest") -def fixture_really_big_manifest(): - return { - "hashAlg": "xxh128", - "manifestVersion": "2023-03-03", - "paths": [ - { - "hash": "a20ddfc33590cd7d2391f1972f66a72a", - "mtime": 4444444444444444, 
- "path": "a.txt", - "size": 100000000000000000, # 100 Petabytes - }, - { - "hash": "b96ddfc33590cd7d2391f1972f66a72a", - "mtime": 2222222222222222, - "path": "b.txt", - "size": 200000000000000000, # 200 Petabytes - }, - ], - "totalSize": 300000000000000000, - } - - -def has_posix_target_user() -> bool: - """Returns if the testing environment exported the env variables for doing - cross-account posix target-user tests. - """ - return ( - os.environ.get("DEADLINE_JOB_ATTACHMENT_TEST_SUDO_TARGET_USER") is not None - and os.environ.get("DEADLINE_JOB_ATTACHMENT_TEST_SUDO_TARGET_GROUP") is not None - ) - - -def has_posix_disjoint_user() -> bool: - """Returns if the testing environment exported the env variables for doing - cross-account posix disjoint-user tests. - """ - return ( - os.environ.get("DEADLINE_JOB_ATTACHMENT_TEST_SUDO_DISJOINT_USER") is not None - and os.environ.get("DEADLINE_JOB_ATTACHMENT_TEST_SUDO_DISJOINT_GROUP") is not None - ) - - -@pytest.fixture(scope="function") -def posix_target_group() -> str: - # Intentionally fail if the var is not defined. - return os.environ["DEADLINE_JOB_ATTACHMENT_TEST_SUDO_TARGET_GROUP"] - - -@pytest.fixture(scope="function") -def posix_disjoint_group() -> str: - # Intentionally fail if the var is not defined. - return os.environ["DEADLINE_JOB_ATTACHMENT_TEST_SUDO_DISJOINT_GROUP"] - - -@pytest.fixture(scope="function") -def test_glob_folder() -> str: - glob_data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "data", "glob")) - return glob_data_dir - - -@pytest.fixture(scope="function") -def glob_config_file() -> str: - manifest_file = os.path.abspath( - os.path.join(os.path.dirname(__file__), "data", "glob_config.txt") - ) - return manifest_file diff --git a/test/unit/deadline_job_attachments/data/glob/exclude.txt b/test/unit/deadline_job_attachments/data/glob/exclude.txt deleted file mode 100644 index b1c53915d..000000000 --- a/test/unit/deadline_job_attachments/data/glob/exclude.txt +++ /dev/null @@ -1 +0,0 @@ -hello world - test file \ No newline at end of file diff --git a/test/unit/deadline_job_attachments/data/glob/include.txt b/test/unit/deadline_job_attachments/data/glob/include.txt deleted file mode 100644 index b1c53915d..000000000 --- a/test/unit/deadline_job_attachments/data/glob/include.txt +++ /dev/null @@ -1 +0,0 @@ -hello world - test file \ No newline at end of file diff --git a/test/unit/deadline_job_attachments/data/glob/nested/nested_exclude.txt b/test/unit/deadline_job_attachments/data/glob/nested/nested_exclude.txt deleted file mode 100644 index b1c53915d..000000000 --- a/test/unit/deadline_job_attachments/data/glob/nested/nested_exclude.txt +++ /dev/null @@ -1 +0,0 @@ -hello world - test file \ No newline at end of file diff --git a/test/unit/deadline_job_attachments/data/glob/nested/nested_include.txt b/test/unit/deadline_job_attachments/data/glob/nested/nested_include.txt deleted file mode 100644 index b1c53915d..000000000 --- a/test/unit/deadline_job_attachments/data/glob/nested/nested_include.txt +++ /dev/null @@ -1 +0,0 @@ -hello world - test file \ No newline at end of file diff --git a/test/unit/deadline_job_attachments/data/glob_config.txt b/test/unit/deadline_job_attachments/data/glob_config.txt deleted file mode 100644 index 397ff5012..000000000 --- a/test/unit/deadline_job_attachments/data/glob_config.txt +++ /dev/null @@ -1,8 +0,0 @@ -{ - "include": [ - "include.file" - ], - "exclude": [ - "exclude.file" - ] -} diff --git a/test/unit/deadline_job_attachments/data/manifest_bados.json 
b/test/unit/deadline_job_attachments/data/manifest_bados.json deleted file mode 100644 index eafb1f912..000000000 --- a/test/unit/deadline_job_attachments/data/manifest_bados.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "sourceRoot": "/tmp", - "sourceOS": "maclindows", - "s3bucket": "test-bucket", - "s3inputPrefix": "input-ca", - "s3outputPrefix": "output", - "inputFiles": [ - { - "relativePath": "test_inputs/input.txt", - "s3Path": "test-abc-123" - } - ], - "outputDirectories": [ - { - "relativePath": "test_outputs" - } - ] - } \ No newline at end of file diff --git a/test/unit/deadline_job_attachments/data/manifest_v2023_03_03.json b/test/unit/deadline_job_attachments/data/manifest_v2023_03_03.json deleted file mode 100644 index 47fcb487a..000000000 --- a/test/unit/deadline_job_attachments/data/manifest_v2023_03_03.json +++ /dev/null @@ -1 +0,0 @@ -{"hashAlg":"xxh128","manifestVersion":"2023-03-03","paths":[{"hash":"CarriageReturn","mtime":1679079744833848,"path":"\r","size":1},{"hash":"One","mtime":1679079344833868,"path":"1","size":1},{"hash":"c","mtime":1675079344833848,"path":"another_test_file","size":1},{"hash":"b","mtime":1479079344833848,"path":"test_dir/test_file","size":1},{"hash":"a","mtime":167907934333848,"path":"test_file","size":1},{"hash":"Control","mtime":1679079344833348,"path":"\u0080","size":1},{"hash":"UserTestCase","mtime":1679579344833848,"path":"\u00c3\u00b1","size":1},{"hash":"LatinSmallLetterOWithDiaeresis","mtime":1679079344833848,"path":"\u00f6","size":1},{"hash":"EuroSign","mtime":1679079344836848,"path":"\u20ac","size":1},{"hash":"EmojiGrinningFace","mtime":1679579344833848,"path":"\ud83d\ude00","size":1},{"hash":"EmojiTestCase","mtime":1679579344833848,"path":"\ude0a","size":1},{"hash":"HebrewLetterDaletWithDagesh","mtime":1679039344833848,"path":"\ufb33","size":1}],"totalSize":10} \ No newline at end of file diff --git a/test/unit/deadline_job_attachments/incremental_downloads/__init__.py b/test/unit/deadline_job_attachments/incremental_downloads/__init__.py deleted file mode 100644 index 8d929cc86..000000000 --- a/test/unit/deadline_job_attachments/incremental_downloads/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/test/unit/deadline_job_attachments/incremental_downloads/test_add_output_manifests_from_s3.py b/test/unit/deadline_job_attachments/incremental_downloads/test_add_output_manifests_from_s3.py deleted file mode 100644 index 3653d010c..000000000 --- a/test/unit/deadline_job_attachments/incremental_downloads/test_add_output_manifests_from_s3.py +++ /dev/null @@ -1,515 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -""" -Tests for the _add_output_manifests_from_s3 function in the incremental downloads module. - -This module contains comprehensive unit tests for the _add_output_manifests_from_s3 function, -which is responsible for populating missing manifest information in session actions by -retrieving output manifest paths from S3. 
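# The tests in this file hash each manifest root path with the manifest's
# default algorithm (xxh128 in the 2023-03-03 manifests used throughout this
# diff) to locate that root's output manifest in S3; see
# calculate_root_path_hashes just below. A rough standalone equivalent using
# the python xxhash package is sketched here; that hash_data is exactly this
# hex digest is an assumption made for illustration only.
import xxhash


def hash_root_path(root_path: str) -> str:
    # Hash the UTF-8 bytes of the root path and return the hex digest, which
    # plays the role of the root path hash in output manifest S3 keys.
    return xxhash.xxh128(root_path.encode("utf-8")).hexdigest()


if __name__ == "__main__":
    print(hash_root_path("/tmp/input1"))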
-""" - -from __future__ import annotations - -import json -from typing import Any -from unittest.mock import patch - -import boto3 -from moto import mock_aws - -from deadline.job_attachments._incremental_downloads._manifest_s3_downloads import ( - _add_output_manifests_from_s3, -) -from deadline.job_attachments.asset_manifests import hash_data as ja_hash_data -from deadline.job_attachments.asset_manifests.v2023_03_03.asset_manifest import DEFAULT_HASH_ALG - - -# Test data generation utilities - - -def generate_test_job(job_id: str, root_paths: list[str]) -> dict[str, Any]: - """Generate a test job with specified root paths for attachment manifests.""" - return { - "jobId": job_id, - "name": f"test-job-{job_id}", - "attachments": { - "manifests": [ - { - "rootPath": root_path, - "rootPathFormat": "POSIX", - "inputManifestPath": f"input_manifest_{i}", - "inputManifestHash": f"input_hash_{i}", - "outputRelativeDirectories": [f"output_{i}"], - } - for i, root_path in enumerate(root_paths) - ] - }, - } - - -def generate_session_actions( - session_action_ids: list[str], - farm_id: str = "farm-01234567890123456789012345678901", - queue_id: str = "queue-01234567890123456789012345678901", - job_id: str = "job-01234567890123456789012345678901", - root_path_hashes: list[str] | None = None, - *, - with_manifests: bool = False, -) -> list[dict[str, Any]]: - """Generate session actions with or without existing manifests following the correct S3 path pattern.""" - session_actions = [] - - # Default root path hashes if not provided - if root_path_hashes is None: - root_path_hashes = ["hash1", "hash2"] - - for session_action_id in session_action_ids: - session_action: dict[str, Any] = {"sessionActionId": session_action_id} - if with_manifests: - # Generate manifests following the pattern: - # /////_/_output - step_id = "step-123" - task_id = "task-456" - timestamp = "2023-01-01T12:00:00.000000Z" - - manifests = [] - for i, root_path_hash in enumerate(root_path_hashes): - manifest_path = ( - f"{farm_id}/{queue_id}/{job_id}/{step_id}/{task_id}/" - f"{timestamp}_{session_action_id}/{root_path_hash}_output" - ) - manifests.append({"outputManifestPath": manifest_path}) - session_action["manifests"] = manifests - session_actions.append(session_action) - return session_actions - - -def calculate_root_path_hashes(root_paths: list[str]) -> list[str]: - """Calculate hashes for root paths using the same algorithm as the function under test.""" - return [ja_hash_data(root_path.encode(), DEFAULT_HASH_ALG) for root_path in root_paths] - - -class ManifestKeyBuilder: - """Utility class to build S3 manifest keys following the documented pattern.""" - - def __init__(self, root_prefix: str, farm_id: str, queue_id: str, job_id: str): - self.root_prefix = root_prefix - self.farm_id = farm_id - self.queue_id = queue_id - self.job_id = job_id - - def build_key(self, session_action_id: str, root_path_hash: str) -> str: - """Build a manifest S3 key for testing. 
- - Pattern: /Manifests//////_/_output - """ - step_id = "step-123" - task_id = "task-456" - timestamp = "2023-01-01T12:00:00.000000Z" - return ( - f"{self.root_prefix}/Manifests/{self.farm_id}/{self.queue_id}/" - f"{self.job_id}/{step_id}/{task_id}/{timestamp}_{session_action_id}/" - f"{root_path_hash}_output" - ) - - -def create_manifest_s3_objects(s3_client, bucket_name: str, manifest_keys: list[str]): - """Create S3 objects for manifest keys with minimal valid content.""" - manifest_content = json.dumps( - {"manifestVersion": "2023-03-03", "hashAlg": "xxh128", "paths": [], "totalSize": 0} - ) - - for key in manifest_keys: - s3_client.put_object(Bucket=bucket_name, Key=key, Body=manifest_content.encode("utf-8")) - - -# Validation utility functions - - -def validate_manifest_structure(manifest_entry: dict[str, Any]) -> None: - """ - Validate that a manifest entry has the expected structure with proper "outputManifestPath" field format. - - Requirements: 4.1 - Verify that the populated manifest structure matches the expected format - """ - assert isinstance(manifest_entry, dict), "Manifest entry must be a dictionary" - assert "outputManifestPath" in manifest_entry, ( - "Manifest entry must contain 'outputManifestPath' field" - ) - assert isinstance(manifest_entry["outputManifestPath"], str), ( - "outputManifestPath must be a string" - ) - assert len(manifest_entry["outputManifestPath"]) > 0, "outputManifestPath cannot be empty" - - # Validate that the path doesn't contain the S3 prefix (should be removed) - output_path = manifest_entry["outputManifestPath"] - assert not output_path.startswith("test-prefix/Manifests/"), ( - "S3 prefix should be removed from outputManifestPath" - ) - assert not output_path.startswith("/Manifests/"), ( - "Manifest prefix should be removed from outputManifestPath" - ) - - -def validate_root_path_hash_matching( - job: dict[str, Any], session_actions: list[dict[str, Any]], expected_root_path_hashes: list[str] -) -> None: - """ - Validate that manifests are populated at correct indices based on root path hash matching. 
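# ManifestKeyBuilder above assembles output manifest keys laid out as
#   <root_prefix>/Manifests/<farm_id>/<queue_id>/<job_id>/<step_id>/<task_id>/<timestamp>_<session_action_id>/<root_path_hash>_output
# and the prefix validator further below checks that "<root_prefix>/Manifests/"
# has been stripped before the key is recorded as an outputManifestPath.
# A small sketch of that stripping step, with made-up values, follows; it is
# illustrative rather than the function under test.
def to_output_manifest_path(s3_key: str, root_prefix: str) -> str:
    manifest_prefix = f"{root_prefix}/Manifests/"
    if s3_key.startswith(manifest_prefix):
        return s3_key[len(manifest_prefix):]
    return s3_key


if __name__ == "__main__":
    key = (
        "test-prefix/Manifests/farm-0123/queue-0123/job-0123/"
        "step-123/task-456/2023-01-01T12:00:00.000000Z_sessionaction-abc123-0/hash1_output"
    )
    relative = to_output_manifest_path(key, "test-prefix")
    assert not relative.startswith("test-prefix/Manifests/")
    assert not relative.startswith("/")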
- - Requirements: 4.2 - Verify that each manifest is populated at the correct index in the manifests array - Requirements: 4.4 - Verify that manifest keys are correctly matched to job attachment manifests based on hash - """ - job_manifests = job.get("attachments", {}).get("manifests", []) - - for session_action in session_actions: - if "manifests" not in session_action: - continue - - manifests = session_action["manifests"] - assert len(manifests) == len(job_manifests), ( - "Number of manifests should match job attachment manifests" - ) - - for i, manifest in enumerate(manifests): - if manifest and "outputManifestPath" in manifest: - # Verify this manifest corresponds to the correct root path hash at index i - output_path = manifest["outputManifestPath"] - expected_hash = expected_root_path_hashes[i] - - # The output path should contain the root path hash for the corresponding job manifest - assert expected_hash in output_path, ( - f"Manifest at index {i} should contain root path hash {expected_hash} " - f"but path is {output_path}" - ) - - # Verify the session action ID is in the path - session_action_id = session_action["sessionActionId"] - assert session_action_id in output_path, ( - f"Output path should contain session action ID {session_action_id}" - ) - - -def validate_existing_session_action_manifests_are_unmodified( - original_session_actions: list[dict[str, Any]], processed_session_actions: list[dict[str, Any]] -) -> None: - """ - Validate that session actions with existing manifests are not modified during processing. - """ - for i, (original, processed) in enumerate( - zip(original_session_actions, processed_session_actions) - ): - if "manifests" in original: - # Session actions that already had manifests should remain unchanged - assert original == processed, ( - f"Session action {i} with existing manifests should not be modified. " - f"Original: {original}, Processed: {processed}" - ) - - -def validate_manifest_keys_do_not_have_prefix( - session_actions: list[dict[str, Any]], root_prefix: str -) -> None: - """ - Validate that output manifest paths do not start with the S3 prefix configured on the queue. - """ - manifest_prefix = f"{root_prefix}/Manifests/" - - for session_action in session_actions: - for manifest in session_action.get("manifests", []): - if manifest and "outputManifestPath" in manifest: - output_path = manifest["outputManifestPath"] - - # Verify the S3 prefix has been removed - assert not output_path.startswith(manifest_prefix), ( - f"Output manifest path should not start with S3 prefix '{manifest_prefix}'. " - f"Found: {output_path}" - ) - - # Verify it's a relative path (doesn't start with /) - assert not output_path.startswith("/"), ( - f"Output manifest path should be relative, not absolute. Found: {output_path}" - ) - - -@mock_aws -def test_add_output_manifests_from_s3_fill_in_missing_manifests(fresh_deadline_config): - """ - Test that _add_output_manifests_from_s3 correctly fills in missing manifest information from S3. - - This test uses moto to create a mocked S3 bucket structure containing S3 objects with appropriate key names - for the test, and a mock list_session_actions response where some manifests are already provided - and others are not. It confirms that the list_session_actions response is updated where the manifests are - missing, and is not modified where they are. 
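# validate_root_path_hash_matching above checks that the manifest stored at
# index i of a session action corresponds to the i-th job attachment manifest,
# matched by that manifest's root path hash appearing in the S3 key. A small
# sketch of that matching step over plain strings follows; the real code works
# against keys listed from S3, not an in-memory list.
from typing import List, Optional


def match_keys_to_roots(keys: List[str], root_path_hashes: List[str]) -> List[Optional[str]]:
    # Return one key (or None) per root path hash, in root order.
    return [
        next((key for key in keys if f"{root_hash}_output" in key), None)
        for root_hash in root_path_hashes
    ]


if __name__ == "__main__":
    keys = [
        "farm/queue/job/step/task/ts_sessionaction-1/hash2_output",
        "farm/queue/job/step/task/ts_sessionaction-1/hash1_output",
    ]
    assert match_keys_to_roots(keys, ["hash1", "hash2"]) == [keys[1], keys[0]]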
- """ - # Test constants - farm_id = "farm-01234567890123456789012345678901" - queue_id = "queue-01234567890123456789012345678901" - job_id = "job-01234567890123456789012345678901" - bucket_name = "test-bucket" - root_prefix = "test-prefix" - - # Create S3 client and bucket - s3_client = boto3.client("s3", region_name="us-west-2") - s3_client.create_bucket( - Bucket=bucket_name, CreateBucketConfiguration={"LocationConstraint": "us-west-2"} - ) - - # Create test job with multiple attachment manifests having different root paths - root_paths = ["/tmp/input1", "/tmp/input2"] - job = generate_test_job(job_id, root_paths) - - # Create queue structure - queue = { - "queueId": queue_id, - "jobAttachmentSettings": {"s3BucketName": bucket_name, "rootPrefix": root_prefix}, - } - - # Generate session actions missing "manifests" fields - session_action_ids = ["sessionaction-abc123-0", "sessionaction-def456-1"] - session_actions = generate_session_actions(session_action_ids, with_manifests=False) - - # Calculate root path hashes for job attachment manifests - root_path_hashes = calculate_root_path_hashes(root_paths) - - # Generate S3 manifest keys containing session action IDs and root path hashes - key_builder = ManifestKeyBuilder(root_prefix, farm_id, queue_id, job_id) - manifest_keys = [] - - for session_action_id in session_action_ids: - for root_path_hash in root_path_hashes: - manifest_key = key_builder.build_key(session_action_id, root_path_hash) - manifest_keys.append(manifest_key) - - # Create S3 objects for the manifest keys with minimal valid content - create_manifest_s3_objects(s3_client, bucket_name, manifest_keys) - - # Create boto3 session for the function call - boto3_session = boto3.Session() - - # Call the function under test - _add_output_manifests_from_s3( - farm_id=farm_id, - queue=queue, - job=job, - boto3_session=boto3_session, - session_action_list=session_actions, - ) - - # Validate manifest structure for all populated manifests - for session_action in session_actions: - assert "manifests" in session_action - assert len(session_action["manifests"]) == len(root_paths) - - for manifest in session_action["manifests"]: - validate_manifest_structure(manifest) - - # Validate root path hash matching - validate_root_path_hash_matching(job, session_actions, root_path_hashes) - - # Validate manifest prefix removal - validate_manifest_keys_do_not_have_prefix(session_actions, root_prefix) - - # Verify that all expected manifest keys were created and populated correctly - expected_manifest_count = len(session_action_ids) * len(root_paths) - actual_manifest_count = sum( - len([m for m in sa["manifests"] if "outputManifestPath" in m]) for sa in session_actions - ) - assert actual_manifest_count == expected_manifest_count - - -def test_add_output_manifests_from_s3_already_stored(fresh_deadline_config): - """ - Test that _add_output_manifests_from_s3 does not modify session actions with existing manifests. - - This test confirms that no S3 APIs are accessed, and that the manifests already in the mock - list_session_actions response are not modified when session actions already contain manifest data. 
- """ - # Test constants - farm_id = "farm-01234567890123456789012345678901" - queue_id = "queue-01234567890123456789012345678901" - job_id = "job-01234567890123456789012345678901" - bucket_name = "test-bucket" - root_prefix = "test-prefix" - - # Create test job with multiple attachment manifests having different root paths - root_paths = ["/tmp/input1", "/tmp/input2"] - job = generate_test_job(job_id, root_paths) - - # Create queue structure - queue = { - "queueId": queue_id, - "jobAttachmentSettings": {"s3BucketName": bucket_name, "rootPrefix": root_prefix}, - } - - # Calculate root path hashes for the job attachment manifests - root_path_hashes = calculate_root_path_hashes(root_paths) - - # Create session actions that already contain "manifests" fields - session_action_ids = ["sessionaction-abc123-0", "sessionaction-def456-1"] - session_actions = generate_session_actions( - session_action_ids, - farm_id=farm_id, - queue_id=queue_id, - job_id=job_id, - root_path_hashes=root_path_hashes, - with_manifests=True, - ) - - # Store original manifest data for comparison - original_session_actions = json.loads(json.dumps(session_actions)) - - # Create boto3 session for the function call - boto3_session = boto3.Session() - - # Mock _get_tasks_manifests_keys_from_s3 to track if it's called - with patch( - "deadline.job_attachments._incremental_downloads._manifest_s3_downloads._get_tasks_manifests_keys_from_s3" - ) as mock_get_keys: - # Call the function under test - _add_output_manifests_from_s3( - farm_id=farm_id, - queue=queue, - job=job, - boto3_session=boto3_session, - session_action_list=session_actions, - ) - - # Verify that _get_tasks_manifests_keys_from_s3 was never called - mock_get_keys.assert_not_called() - - # Existing manifests should be preserved - validate_existing_session_action_manifests_are_unmodified( - original_session_actions, session_actions - ) - - # Validate manifest prefix removal from S3 object keys - validate_manifest_keys_do_not_have_prefix(session_actions, root_prefix) - - # Assert that existing manifest data in session actions remains unchanged - assert session_actions == original_session_actions - - -@mock_aws -def test_add_output_manifests_from_s3_edge_cases(fresh_deadline_config): - """ - Test that _add_output_manifests_from_s3 handles edge cases and mixed scenarios correctly. - - This test validates edge cases including empty session action lists and mixed scenarios - where some session actions have manifests and others don't, ensuring the function - handles various data structures robustly and maintains data integrity. 
- """ - # Test constants - farm_id = "farm-01234567890123456789012345678901" - queue_id = "queue-01234567890123456789012345678901" - job_id = "job-01234567890123456789012345678901" - bucket_name = "test-bucket" - root_prefix = "test-prefix" - - # Create S3 client and bucket - s3_client = boto3.client("s3", region_name="us-west-2") - s3_client.create_bucket( - Bucket=bucket_name, CreateBucketConfiguration={"LocationConstraint": "us-west-2"} - ) - - # Create test job with multiple attachment manifests having different root paths - root_paths = ["/tmp/input1", "/tmp/input2"] - job = generate_test_job(job_id, root_paths) - - # Create queue structure - queue = { - "queueId": queue_id, - "jobAttachmentSettings": {"s3BucketName": bucket_name, "rootPrefix": root_prefix}, - } - - # Calculate root path hashes for job attachment manifests - root_path_hashes = calculate_root_path_hashes(root_paths) - - # Create boto3 session for the function call - boto3_session = boto3.Session() - - # Test Case 1: Empty session action list - empty_session_actions: list[dict[str, Any]] = [] - _add_output_manifests_from_s3( - farm_id=farm_id, - queue=queue, - job=job, - boto3_session=boto3_session, - session_action_list=empty_session_actions, - ) - - # List is still empty - assert empty_session_actions == [] - - # Test Case 2: Mixed scenario - some with manifests, some without - session_action_ids_with = ["sessionaction-with-1", "sessionaction-with-2"] - session_action_ids_without = ["sessionaction-without-1", "sessionaction-without-2"] - - # Create session actions with existing manifests - session_actions_with_manifests = generate_session_actions( - session_action_ids_with, - farm_id=farm_id, - queue_id=queue_id, - job_id=job_id, - root_path_hashes=root_path_hashes, - with_manifests=True, - ) - - # Create session actions without manifests - session_actions_without_manifests = generate_session_actions( - session_action_ids_without, with_manifests=False - ) - - # Combine both types for mixed scenario - mixed_session_actions = session_actions_with_manifests + session_actions_without_manifests - - # Store original data for comparison - original_mixed_session_actions = json.loads(json.dumps(mixed_session_actions)) - - # Create S3 objects for the session actions that need manifests - key_builder = ManifestKeyBuilder(root_prefix, farm_id, queue_id, job_id) - manifest_keys = [] - - for session_action_id in session_action_ids_without: - for root_path_hash in root_path_hashes: - manifest_key = key_builder.build_key(session_action_id, root_path_hash) - manifest_keys.append(manifest_key) - - create_manifest_s3_objects(s3_client, bucket_name, manifest_keys) - - # Call the function under test with mixed scenario - _add_output_manifests_from_s3( - farm_id=farm_id, - queue=queue, - job=job, - boto3_session=boto3_session, - session_action_list=mixed_session_actions, - ) - - # Validate that session actions with existing manifests were not modified - for i, session_action in enumerate(mixed_session_actions): - if session_action["sessionActionId"] in session_action_ids_with: - # Find corresponding original session action - original_sa = next( - sa - for sa in original_mixed_session_actions - if sa["sessionActionId"] == session_action["sessionActionId"] - ) - assert session_action == original_sa, ( - f"Session action with existing manifests should not be modified: {session_action['sessionActionId']}" - ) - - # Validate that session actions without manifests were populated - for session_action in mixed_session_actions: - if 
session_action["sessionActionId"] in session_action_ids_without: - assert "manifests" in session_action, ( - f"Session action without manifests should be populated: {session_action['sessionActionId']}" - ) - for manifest in session_action["manifests"]: - validate_manifest_structure(manifest) - - # Validate manifest prefix removal for all session actions - validate_manifest_keys_do_not_have_prefix(mixed_session_actions, root_prefix) - - # Validate root path hash matching for all populated manifests - validate_root_path_hash_matching(job, mixed_session_actions, root_path_hashes) diff --git a/test/unit/deadline_job_attachments/incremental_downloads/test_incremental_download_state.py b/test/unit/deadline_job_attachments/incremental_downloads/test_incremental_download_state.py deleted file mode 100644 index c86c95da9..000000000 --- a/test/unit/deadline_job_attachments/incremental_downloads/test_incremental_download_state.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -import os -import json -import pytest -import tempfile -from unittest.mock import MagicMock - -from deadline.job_attachments._incremental_downloads.incremental_download_state import ( - IncrementalDownloadJob, - IncrementalDownloadState, - EVENTUAL_CONSISTENCY_MAX_SECONDS, -) -from datetime import datetime - - -class TestIncrementalDownloadState: - @pytest.fixture - def mock_logger(self): - """ - Fixture to create a mock logger object. - """ - logger = MagicMock() - logger.echo = MagicMock() - return logger - - @pytest.fixture - def temp_dir(self): - """ - Fixture to provide a temporary directory that is cleaned up after tests. - """ - with tempfile.TemporaryDirectory() as tmpdir: - yield tmpdir - - @pytest.fixture - def test_paths(self, temp_dir): - """ - Fixture to provide test file paths using a real temporary directory. - """ - return { - "location": temp_dir, - "progress_file": os.path.join(temp_dir, "download_checkpoint.json"), - } - - @pytest.fixture - def sample_state_data(self): - """ - Fixture to provide sample state data. - """ - return { - "downloadsStartedTimestamp": "2023-01-01T00:00:00+00:00", - "downloadsCompletedTimestamp": "2023-01-02T00:00:00+00:00", - "eventualConsistencyMaxSeconds": EVENTUAL_CONSISTENCY_MAX_SECONDS, - "jobs": [{"jobId": "job-123", "name": "Job 1"}, {"jobId": "job-124", "name": "Job 2"}], - } - - @pytest.fixture - def state_file(self, test_paths, sample_state_data): - """ - Fixture to create a real state file with sample data. - """ - with open(test_paths["progress_file"], "w") as f: - json.dump(sample_state_data, f, indent=2) - yield test_paths["progress_file"] - # Cleanup is handled by the temp_dir fixture - - @pytest.fixture - def mock_download_state(self): - """ - Fixture to create a sample IncrementalDownloadState. This state matches the sample_state_data fixture. - """ - return IncrementalDownloadState( - "sp-123", - downloads_started_timestamp=datetime.fromisoformat("2023-01-01T00:00:00+00:00"), - downloads_completed_timestamp=datetime.fromisoformat("2023-01-02T00:00:00+00:00"), - jobs=[ - IncrementalDownloadJob({"jobId": "job-123", "name": "Job 1"}, None, {}), - IncrementalDownloadJob( - {"jobId": "job-124", "name": "Job 2"}, - datetime.fromisoformat("2023-01-02T00:00:00+00:00"), - {}, - ), - ], - ) - - def test_incremental_download_state_init(self): - """ - Test IncrementalDownloadState initialization. 
- """ - bootstrap_time = datetime.fromisoformat("2023-01-01T00:00:00") - completed_time = datetime.fromisoformat("2023-01-02T00:00:00") - - # Test with minimal bootstrapped construction - state = IncrementalDownloadState("sp-123", bootstrap_time) - assert state.downloads_started_timestamp == bootstrap_time - assert state.downloads_completed_timestamp == bootstrap_time - assert state.eventual_consistency_max_seconds == 120 - assert state.jobs == [] - - # Test with provided values - jobs = [IncrementalDownloadJob({"jobId": "job-123"}, None, {})] - state = IncrementalDownloadState( - "sp-123", - downloads_started_timestamp=bootstrap_time, - downloads_completed_timestamp=completed_time, - jobs=jobs, - ) - assert state.downloads_started_timestamp == bootstrap_time - assert state.downloads_completed_timestamp == completed_time - assert state.jobs == jobs - - def test_incremental_download_state_dict_roundtrip(self, mock_download_state): - """ - Test IncrementalDownloadState.from_dict and to_dict methods, by roundtripping. - """ - - dict_state = mock_download_state.to_dict() - - assert dict_state == IncrementalDownloadState.from_dict(dict_state).to_dict() - - def test_incremental_download_state_file_roundtrip( - self, temp_dir, mock_download_state: IncrementalDownloadState - ): - """ - Test IncrementalDownloadState.from_file and save_file methods, by roundtripping. - """ - - dict_state = mock_download_state.to_dict() - - file_path = os.path.join(temp_dir, "checkpoint.json") - mock_download_state.save_file(file_path) - roundtrip_state = IncrementalDownloadState.from_file(file_path) - - assert roundtrip_state.to_dict() == dict_state diff --git a/test/unit/deadline_job_attachments/incremental_downloads/test_manifest_download.py b/test/unit/deadline_job_attachments/incremental_downloads/test_manifest_download.py deleted file mode 100644 index 5fd7d98c1..000000000 --- a/test/unit/deadline_job_attachments/incremental_downloads/test_manifest_download.py +++ /dev/null @@ -1,277 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -from __future__ import annotations - -import uuid -import random -import os -import time -from pathlib import Path, PurePosixPath -from datetime import datetime -import sys - -import boto3 -from moto import mock_aws -import pytest - -from deadline.job_attachments.asset_manifests.hash_algorithms import HashAlgorithm, hash_data -from deadline.job_attachments.asset_manifests import BaseAssetManifest, BaseManifestPath -from deadline.job_attachments.asset_manifests.v2023_03_03 import ( - AssetManifest, - ManifestPath, -) -from deadline.job_attachments.progress_tracker import ( - ProgressReportMetadata, -) - -from deadline.job_attachments._incremental_downloads._manifest_s3_downloads import ( - _download_all_manifests_with_absolute_paths, - _merge_absolute_path_manifest_list, - _download_manifest_paths, -) -from deadline.job_attachments.models import FileConflictResolution - -""" -Tests the manifest and file download functionality used by the queue incremental download operation, -by generating fake job data and populating a moto mocked S3 bucket with actual manifests and files. 
-""" - - -def generate_random_path(): - """Generate a random path with a few subdirectories.""" - part_count = random.randrange(1, 3) - return str( - PurePosixPath( - *[f"part-{i}-{random.randrange(4)}" for i in range(part_count - 1)], - f"{uuid.uuid4()}.ext", - ) - ) - - -def generate_random_files(file_count, file_size_min, file_size_max): - random_files = {} - for i in range(file_count): - file_size = random.randrange(file_size_min, file_size_max) - file_contents = random.randbytes(file_size) # type: ignore - file_path = generate_random_path() - - random_files[file_path] = file_contents - - return random_files - - -def generate_fake_job_with_output_manifest( - tmp_path: Path, - queue: dict, - file_count: int, - file_size_min: int, - file_size_max: int, - out_jobs: list, - out_job_sessions: dict, - out_expected_download_files: dict, -): - """ - Given a fake queue and a moto session for s3, generates a fake job with corresponding - session and session actions. Puts files and manifests into S3, with paths under the tmp_path - directory. Adds all the files to expected_download_files as {abs_path: file_contents}. - """ - manifest_count = random.randrange(1, 5) - - s3 = boto3.resource("s3") - bucket = s3.Bucket(queue["jobAttachmentSettings"]["s3BucketName"]) - - # Create the fake job - job_id = f"job-{str(uuid.uuid4()).replace('-', '')}" - - # Initialize the manifests for the job - job_manifests: list = [ - { - "rootPath": str(tmp_path / f"{job_id}-root-path-{i}"), - "rootPathFormat": "POSIX", - "outputRelativeDirectories": ["."], - } - for i in range(manifest_count) - ] - - job = { - "jobId": job_id, - "name": f"test-job-{job_id}", - "attachments": {"manifests": job_manifests}, - } - out_jobs.append(job) - - # Generate random files - random_files = generate_random_files(file_count, file_size_min, file_size_max) - - # Divide the files randomly among the manifests, and add to the expected download files - files_in_manifests: list = [{} for i in range(manifest_count)] - for file, contents in random_files.items(): - manifest_index = random.randrange(manifest_count) - files_in_manifests[manifest_index][file] = contents - out_expected_download_files[str(Path(job_manifests[manifest_index]["rootPath"]) / file)] = ( - contents - ) - - # Put the files and manifests into S3, and record their locations - session_action_manifests = [] - for manifest_index, files in enumerate(files_in_manifests): - total_size = 0 - paths: list[BaseManifestPath] = [] - for file, contents in files.items(): - s3_key = f"{queue['jobAttachmentSettings']['rootPrefix']}/Data/{hash_data(contents, HashAlgorithm.XXH128)}.xxh128" - bucket.put_object( - Key=s3_key, - Body=contents, - ) - paths.append( - ManifestPath( - path=file, - hash=hash_data(contents, HashAlgorithm.XXH128), - size=len(contents), - mtime=int(time.time() * 1e6), - ) - ) - total_size += len(contents) - - if paths: - manifest = AssetManifest( - hash_alg=HashAlgorithm.XXH128, paths=paths, total_size=total_size - ) - manifest_bytes = manifest.encode().encode("utf-8") - manifest_hash = hash_data(manifest_bytes, HashAlgorithm.XXH128) - bucket.put_object( - Key=f"{queue['jobAttachmentSettings']['rootPrefix']}/Manifests/{manifest_hash}.xxh128", - Body=manifest_bytes, - ) - - session_action_manifests.append( - { - "outputManifestHash": manifest_hash, - "outputManifestPath": f"{manifest_hash}.xxh128", - } - ) - else: - session_action_manifests.append({}) - - # Use one session for the job - session: dict = { - "sessionId": f"session-{str(uuid.uuid4()).replace('-', '')}", - 
"fleetId": f"fleet-{str(uuid.uuid4()).replace('-', '')}", - "workerId": f"worker-{str(uuid.uuid4()).replace('-', '')}", - "startedAt": "2025-08-06T00:15:45.712000+00:00", - "endedAt": "2025-08-06T00:20:59.992000+00:00", - "lifecycleStatus": "ENDED", - } - out_job_sessions[job_id] = [session] - - # Use one session action in the session - session_action: dict = { - "sessionActionId": session["sessionId"].replace("session-", "sessionaction-") + "-0", - "status": "SUCCEEDED", - "startedAt": "2025-08-06T00:20:58.454000+00:00", - "endedAt": "2025-08-06T00:20:59.992000+00:00", - "progressPercent": 100.0, - "definition": { - "taskRun": { - "taskId": "task-b1764261dff54214aace3932bde8ae7e-0", - "stepId": "step-b1764261dff54214aace3932bde8ae7e", - } - }, - # This test doesn't go into the S3 object layer, so the manifests list is empty. - "manifests": session_action_manifests, - } - session["sessionActions"] = [session_action] - - -@pytest.mark.skipif( - sys.version_info < (3, 9), reason="test uses random.randbytes which is Python >= 3.9" -) -@mock_aws -def test_manifest_and_output_downloads(tmp_path, fresh_deadline_config): - """ - This test uses moto3 to mock a bunch of job attachment output data in S3, and then - calls the sequence of functions used in incremental downloads - """ - queue_id = "queue-01234567890123456789012345678901" - bucket_name = "test-bucket" - root_prefix = "test-prefix" - - # Create S3 client and bucket - boto3_session = boto3.Session(region_name="us-west-2") - s3_client = boto3_session.client("s3", region_name="us-west-2") - s3_client.create_bucket( - Bucket=bucket_name, CreateBucketConfiguration={"LocationConstraint": "us-west-2"} - ) - - # Create queue structure - queue = { - "queueId": queue_id, - "jobAttachmentSettings": {"s3BucketName": bucket_name, "rootPrefix": root_prefix}, - } - - jobs: list = [] - job_sessions: dict = {} - expected_download_files: dict = {} - # Generate fake jobs, mostly with small files and one job with files > 1MB so that the test runs both - # _download_file_with_get_object and _download_file_with_transfer_manager - generate_fake_job_with_output_manifest( - tmp_path, queue, 2, 2, 1024, jobs, job_sessions, expected_download_files - ) - generate_fake_job_with_output_manifest( - tmp_path, queue, 30, 2, 50, jobs, job_sessions, expected_download_files - ) - generate_fake_job_with_output_manifest( - tmp_path, queue, 2, 1500000, 2000000, jobs, job_sessions, expected_download_files - ) - - # WHEN: Download all the output manifests for all the jobs we made, and make their paths absolute - unmapped_paths: dict = {} - downloaded_manifests: list[tuple[datetime, BaseAssetManifest]] = ( - _download_all_manifests_with_absolute_paths( - queue, - {job["jobId"]: job for job in jobs}, - job_sessions, - {}, - unmapped_paths, - boto3_session, - print, - ) - ) - - # THEN: There should be no unmapped paths because we provided {} for the path mapping applier - assert unmapped_paths == {} - # All the manifest paths should be absolute - for _, manifest in downloaded_manifests: - for manifest_path in manifest.paths: - assert os.path.isabs(manifest_path.path) - - # WHEN: Merge all the manifests into one list of paths - manifest_paths_to_download: list[BaseManifestPath] = _merge_absolute_path_manifest_list( - downloaded_manifests - ) - - # THEN: The full set of paths should exactly match the keys of expected_download_files - assert {v.path for v in manifest_paths_to_download} == set(expected_download_files.keys()) - - # WHEN: Download all the paths from the manifests - def 
on_downloading_files( - download_metadata: ProgressReportMetadata, - ) -> bool: - return True - - _download_manifest_paths( - manifest_paths_to_download, - HashAlgorithm.XXH128, - queue, - boto3_session, - FileConflictResolution.OVERWRITE, - on_downloading_files=on_downloading_files, - print_function_callback=print, - ) - - # THEN: All the files should be downloaded, and match the randomly generated contents - for file, contents in expected_download_files.items(): - assert os.path.exists(file) - assert os.path.isfile(file) - with open(file, "rb") as fh: - assert fh.read() == contents diff --git a/test/unit/deadline_job_attachments/incremental_downloads/test_manifest_download_long_path.py b/test/unit/deadline_job_attachments/incremental_downloads/test_manifest_download_long_path.py deleted file mode 100644 index 4b25ff2cb..000000000 --- a/test/unit/deadline_job_attachments/incremental_downloads/test_manifest_download_long_path.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""Tests for long path handling in incremental downloads.""" - -from __future__ import annotations - -import sys -import tempfile -from collections import defaultdict -from pathlib import Path -from threading import Lock -from typing import DefaultDict -from unittest.mock import MagicMock, patch - -import pytest - -import deadline -from deadline.job_attachments._utils import ( - WINDOWS_MAX_PATH_LENGTH, - TEMP_DOWNLOAD_ADDED_CHARS_LENGTH, -) -from deadline.job_attachments.asset_manifests.hash_algorithms import HashAlgorithm -from deadline.job_attachments.asset_manifests.v2023_03_03 import ManifestPath -from deadline.job_attachments.models import FileConflictResolution -from deadline.job_attachments._incremental_downloads._manifest_s3_downloads import ( - _download_file, -) - - -class TestIncrementalDownloadLongPath: - """Tests for long path handling in incremental download operations.""" - - def _test_create_copy_long_path_scenario( - self, base_dir: Path, long_base_name: str, expect_unc_prefix: bool = False - ) -> None: - """ - Common test logic for CREATE_COPY long path scenarios in incremental downloads. - Tests that original path is not long but copy becomes long. 
- """ - original_file = Path(base_dir) / f"{long_base_name}.txt" - copy_file = Path(base_dir) / f"{long_base_name} (1).txt" - - # Verify our test scenario is correct - original_len = len(str(original_file)) + TEMP_DOWNLOAD_ADDED_CHARS_LENGTH - copy_len = len(str(copy_file)) + TEMP_DOWNLOAD_ADDED_CHARS_LENGTH - assert original_len < WINDOWS_MAX_PATH_LENGTH, ( - f"Original should NOT be long path: {original_len}" - ) - assert copy_len >= WINDOWS_MAX_PATH_LENGTH, f"Copy should become long path: {copy_len}" - - # Create test file path object for the manifest - path should be absolute - file_path = ManifestPath( - path=str(original_file), hash="testhash", size=100, mtime=1234000000 - ) - - # Mock S3 operations - mock_s3_client = MagicMock() - mock_boto3_session = MagicMock() - mock_progress_tracker = MagicMock() - mock_progress_tracker.continue_reporting = True - - mock_lock = Lock() - mock_collision_dict: DefaultDict[str, int] = defaultdict(int) - - with patch( - f"{deadline.__package__}.job_attachments._incremental_downloads._manifest_s3_downloads._download_file_with_get_object" - ) as mock_download_get_object, patch( - f"{deadline.__package__}.job_attachments._utils._is_windows_long_path_registry_enabled", - return_value=False, # Ensure UNC prefix is used for Windows - ), patch( - "pathlib.Path.is_file", - return_value=True, # Simulate that original file exists to force conflict - ), patch( - f"{deadline.__package__}.job_attachments._incremental_downloads._manifest_s3_downloads._get_new_copy_file_path", - return_value=copy_file, - ), patch("pathlib.Path.mkdir"), patch( - f"{deadline.__package__}.job_attachments._incremental_downloads._manifest_s3_downloads.os.utime" - ), patch( - f"{deadline.__package__}.job_attachments._incremental_downloads._manifest_s3_downloads.os.path.getsize", - return_value=100, # Return the same size as file.size to pass validation - ): - # Call _download_file with CREATE_COPY resolution - _download_file( - file=file_path, - hash_algorithm=HashAlgorithm.XXH128, - collision_lock=mock_lock, - collision_file_dict=mock_collision_dict, - s3_bucket="test-bucket", - cas_prefix="rootPrefix/Data", - s3_client=mock_s3_client, - boto3_session_for_s3=mock_boto3_session, - progress_tracker=mock_progress_tracker, - file_conflict_resolution=FileConflictResolution.CREATE_COPY, - ) - - # Verify the download was called - download_calls = mock_download_get_object.call_args_list - assert len(download_calls) == 1, "Should have made exactly one download call" - - download_call = download_calls[0] - # Get local_file_path from kwargs - local_file_path = download_call.kwargs.get("local_file_path", "") - fileobj_path = str(local_file_path) - - # Platform-specific path format validation - if expect_unc_prefix: - # Windows: verify UNC prefix is used - assert fileobj_path.startswith("\\\\?\\"), ( - f"Copy file path should use UNC prefix for long paths, got: {fileobj_path}" - ) - - # Verify the underlying path length that triggered the conversion - underlying_path = fileobj_path.replace("\\\\?\\", "") - assert ( - len(underlying_path) + TEMP_DOWNLOAD_ADDED_CHARS_LENGTH - >= WINDOWS_MAX_PATH_LENGTH - ), ( - f"The underlying path + temp chars should be at/over Windows limit: {len(underlying_path) + TEMP_DOWNLOAD_ADDED_CHARS_LENGTH}" - ) - - # Verify expected path components are present - assert str(base_dir).lstrip("\\\\?\\") in fileobj_path, ( - f"Path should contain base directory: {base_dir}" - ) - else: - # POSIX: verify no UNC prefix is used - assert not fileobj_path.startswith("\\\\?\\"), ( - 
f"POSIX systems should not use UNC prefix, got: {fileobj_path}" - ) - - # Verify the path is the expected copy path - expected_copy_path = str(copy_file) - assert fileobj_path == expected_copy_path, ( - f"Should use normal path format on POSIX: expected {expected_copy_path}, got {fileobj_path}" - ) - - # Verify expected path components are present - assert str(base_dir) in fileobj_path, ( - f"Path should contain base directory: {base_dir}" - ) - - # Verify it contains the copy filename pattern - assert f"{long_base_name} (1).txt" in fileobj_path, ( - "Should contain the copy filename pattern" - ) - - @pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for Windows long path handling only.", - ) - def test_download_file_create_copy_becomes_long_path_windows(self) -> None: - """ - Test that when CREATE_COPY conflict resolution creates a filename that becomes a Windows long path, - _download_file converts it to use the UNC prefix (\\?\\) format and successfully downloads the file. - - This tests the fix for GitHub issue #617 in the incremental downloads module. - """ - # Create a path that's just under the Windows limit, but becomes long with " (1)" - base_dir = Path("C:\\" + "a" * 100) # Directory part - long_base_name = "b" * 141 # Filename part - calculated to hit threshold - - # Use the common test logic with Windows-specific validation - self._test_create_copy_long_path_scenario(base_dir, long_base_name, expect_unc_prefix=True) - - @pytest.mark.skipif( - sys.platform == "win32", - reason="This test is for POSIX systems.", - ) - @pytest.mark.parametrize( - "dir_multiplier,filename_len", - [ - (14, 85) if sys.platform == "darwin" else (22, 72), - ], - ) - def test_download_file_create_copy_long_path_posix( - self, dir_multiplier: int, filename_len: int - ) -> None: - """ - Test that CREATE_COPY conflict resolution works correctly on POSIX systems - with long filenames and actually downloads the file. - - The variables have been decided such that they cross the max path length of 260. - They are different for MacOS and Linux because of different temp directory lengths. - """ - with tempfile.TemporaryDirectory() as tmp_dir: - tmp_path = Path(tmp_dir) - nested_dir = tmp_path / ("longdir" * dir_multiplier) - long_base_name = "a" * filename_len - - # Use the common test logic with POSIX-specific validation - self._test_create_copy_long_path_scenario( - nested_dir, long_base_name, expect_unc_prefix=False - ) diff --git a/test/unit/deadline_job_attachments/test_asset_sync.py b/test/unit/deadline_job_attachments/test_asset_sync.py deleted file mode 100644 index 2929a4fe7..000000000 --- a/test/unit/deadline_job_attachments/test_asset_sync.py +++ /dev/null @@ -1,1698 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
- -"""Tests for the Asset Synching class for task-level attachments.""" - -import json -from logging import getLogger -import os -import shutil -from math import trunc -from pathlib import Path -from typing import Optional, Dict -from unittest.mock import ANY, MagicMock, patch - -import boto3 -import pytest -from moto import mock_aws - -import deadline -from deadline.job_attachments.asset_manifests.decode import decode_manifest -from deadline.job_attachments.asset_sync import AssetSync -from deadline.job_attachments.os_file_permission import PosixFileSystemPermissionSettings - -from deadline.job_attachments.exceptions import ( - AssetSyncError, - VFSExecutableMissingError, - JobAttachmentsS3ClientError, - VFSOSUserNotSetError, -) -from deadline.job_attachments.models import ( - Attachments, - FileConflictResolution, - Job, - JobAttachmentsFileSystem, - JobAttachmentS3Settings, - ManifestProperties, - PathFormat, - Queue, -) -from deadline.job_attachments.progress_tracker import ( - DownloadSummaryStatistics, - ProgressStatus, - SummaryStatistics, -) -from deadline.job_attachments.api import human_readable_file_size -from ..conftest import is_windows_non_admin - - -class TestAssetSync: - @pytest.fixture(autouse=True) - def before_test( - self, - request, - create_s3_bucket, - default_job_attachment_s3_settings: JobAttachmentS3Settings, - default_asset_sync: AssetSync, - ): - """ - Setup the default queue and s3 bucket for all asset tests. - Mark test with `no_setup` if you don't want this setup to run. - """ - if "no_setup" in request.keywords: - return - - create_s3_bucket(bucket_name=default_job_attachment_s3_settings.s3BucketName) - self.default_asset_sync = default_asset_sync - - @pytest.fixture - def client(self) -> MagicMock: - return MagicMock() - - @pytest.fixture - def asset_sync(self, farm_id: str, client: MagicMock) -> AssetSync: - asset_sync = AssetSync(farm_id) - asset_sync.s3_uploader._s3 = client - return asset_sync - - def test_sync_inputs_no_inputs_successful( - self, - tmp_path: Path, - default_queue: Queue, - default_job: Job, - attachments_no_inputs: Attachments, - ): - """Asserts that sync_inputs is successful when no required assets exist for the Job""" - # GIVEN - default_job.attachments = attachments_no_inputs - session_dir = str(tmp_path) - dest_dir = "assetroot-27bggh78dd2b568ab123" - local_root = str(Path(session_dir) / dest_dir) - - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=[DownloadSummaryStatistics()], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir], - ): - mock_on_downloading_files = MagicMock(return_value=True) - - (summary_statistics, result_pathmap_rules) = self.default_asset_sync.sync_inputs( - default_queue.jobAttachmentSettings, - attachments_no_inputs, - default_queue.queueId, - default_job.jobId, - tmp_path, - on_downloading_files=mock_on_downloading_files, - ) - - # THEN - expected_source_path_format = ( - "windows" - if default_job.attachments.manifests[0].rootPathFormat == PathFormat.WINDOWS - else "posix" - ) - assert result_pathmap_rules == [ - { - "source_path_format": expected_source_path_format, - "source_path": default_job.attachments.manifests[0].rootPath, - "destination_path": local_root, - } - ] - - expected_summary_statistics = SummaryStatistics( - total_time=summary_statistics.total_time, - total_files=0, - total_bytes=0, - processed_files=0, - processed_bytes=0, - skipped_files=0, - 
skipped_bytes=0, - transfer_rate=0.0, - ) - assert summary_statistics == expected_summary_statistics - - @pytest.mark.parametrize( - ("job_fixture_name"), - [ - ("default_job"), - ("vfs_job"), - ], - ) - @pytest.mark.parametrize( - ("s3_settings_fixture_name"), - [ - ("default_job_attachment_s3_settings"), - ], - ) - def test_sync_inputs_successful( - self, - tmp_path: Path, - default_queue: Queue, - job_fixture_name: str, - s3_settings_fixture_name: str, - test_manifest_one: dict, - request: pytest.FixtureRequest, - ): - """Asserts that a valid manifest can be processed to download attachments from S3""" - # GIVEN - job: Job = request.getfixturevalue(job_fixture_name) - s3_settings: JobAttachmentS3Settings = request.getfixturevalue(s3_settings_fixture_name) - default_queue.jobAttachmentSettings = s3_settings - session_dir = str(tmp_path) - dest_dir = "assetroot-27bggh78dd2b568ab123" - local_root = str(Path(session_dir) / dest_dir) - test_manifest = decode_manifest(json.dumps(test_manifest_one)) - test_fs_permission_settings: PosixFileSystemPermissionSettings = ( - PosixFileSystemPermissionSettings( - os_user="test-user", - os_group="test-group", - dir_mode=0o20, - file_mode=0o20, - ) - ) - os_env_vars: Dict[str, str] = {"AWS_PROFILE": "test-profile"} - assert job.attachments - - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=test_manifest, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=[DownloadSummaryStatistics()], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.mount_vfs_from_manifests" - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.VFSProcessManager.find_vfs" - ), patch.object(Path, "stat", MagicMock(st_mtime_ns=1234512345123451)): - mock_on_downloading_files = MagicMock(return_value=True) - - (_, result_pathmap_rules) = self.default_asset_sync.sync_inputs( - s3_settings, - job.attachments, - default_queue.queueId, - job.jobId, - tmp_path, - on_downloading_files=mock_on_downloading_files, - fs_permission_settings=test_fs_permission_settings, - os_env_vars=os_env_vars, - ) - - # THEN - expected_source_path_format = ( - "windows" - if job.attachments.manifests[0].rootPathFormat == PathFormat.WINDOWS - else "posix" - ) - assert result_pathmap_rules == [ - { - "source_path_format": expected_source_path_format, - "source_path": job.attachments.manifests[0].rootPath, - "destination_path": local_root, - } - ] - - @pytest.mark.parametrize( - ("job_fixture_name"), - [ - ("default_job"), - ], - ) - @pytest.mark.parametrize( - ("s3_settings_fixture_name"), - [ - ("default_job_attachment_s3_settings"), - ], - ) - def test_sync_inputs_404_error( - self, - tmp_path: Path, - default_queue: Queue, - job_fixture_name: str, - s3_settings_fixture_name: str, - test_manifest_one: dict, - request: pytest.FixtureRequest, - ): - """Asserts that a specific error message is raised when getting 404 errors synching inputs""" - # GIVEN - download_exception = JobAttachmentsS3ClientError( - action="get-object", - status_code=404, - bucket_name="test bucket", - key_or_prefix="test-key.xxh128", - message="File not found", - ) - job: Job = request.getfixturevalue(job_fixture_name) - test_manifest = decode_manifest(json.dumps(test_manifest_one)) - s3_settings: JobAttachmentS3Settings = request.getfixturevalue(s3_settings_fixture_name) - 
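# The sync_inputs tests above and below assert that each attachment manifest
# yields one path-mapping rule: the submitter's root path and path format on
# the source side, and a per-root directory under the session directory on the
# destination side. A simplified sketch of building such a rule follows; the
# "assetroot-" plus truncated digest naming is an assumption modeled on the
# fixtures, not the library's _get_unique_dest_dir_name.
import hashlib
from pathlib import Path
from typing import Dict


def build_pathmap_rule(root_path: str, root_path_format: str, session_dir: Path) -> Dict[str, str]:
    dest_dir = "assetroot-" + hashlib.sha256(root_path.encode("utf-8")).hexdigest()[:20]
    return {
        "source_path_format": root_path_format.lower(),  # "POSIX" -> "posix", "WINDOWS" -> "windows"
        "source_path": root_path,
        "destination_path": str(session_dir / dest_dir),
    }


if __name__ == "__main__":
    rule = build_pathmap_rule("/tmp", "POSIX", Path("/sessions/session-1"))
    assert rule["source_path_format"] == "posix"
    assert rule["destination_path"].startswith("/sessions/session-1/assetroot-")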
default_queue.jobAttachmentSettings = s3_settings - dest_dir = "assetroot-27bggh78dd2b568ab123" - assert job.attachments - - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=test_manifest, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=download_exception, - ): - with pytest.raises(JobAttachmentsS3ClientError) as excinfo: - self.default_asset_sync.sync_inputs( - s3_settings, - job.attachments, - default_queue.queueId, - job.jobId, - tmp_path, - ) - - # THEN - assert "usually located in the home directory (~/.deadline/cache/s3_check_cache.db)" in str( - excinfo - ) - - @pytest.mark.parametrize( - ("s3_settings_fixture_name"), - [ - ("default_job_attachment_s3_settings"), - ], - ) - def test_sync_inputs_with_step_dependencies( - self, - tmp_path: Path, - default_queue: Queue, - default_job: Job, - s3_settings_fixture_name: str, - test_manifest_one: dict, - request: pytest.FixtureRequest, - ): - """Asserts that input syncing is done correctly when step dependencies are provided.""" - # GIVEN - s3_settings: JobAttachmentS3Settings = request.getfixturevalue(s3_settings_fixture_name) - default_queue.jobAttachmentSettings = s3_settings - session_dir = str(tmp_path) - dest_dir = "assetroot-27bggh78dd2b568ab123" - local_root = str(Path(session_dir) / dest_dir) - test_manifest = decode_manifest(json.dumps(test_manifest_one)) - assert default_job.attachments - - step_output_root = "/home/outputs_roots" - step_dest_dir = "assetroot-8a7d189e9c17186fb88b" - - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=test_manifest, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=[DownloadSummaryStatistics()], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir, step_dest_dir], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_output_manifests_by_asset_root", - side_effect=[{step_output_root: {}}], - ), patch.object(Path, "stat", MagicMock(st_mtime_ns=1234512345123451)): - mock_on_downloading_files = MagicMock(return_value=True) - - (_, result_pathmap_rules) = self.default_asset_sync.sync_inputs( - s3_settings, - default_job.attachments, - default_queue.queueId, - default_job.jobId, - tmp_path, - step_dependencies=["step-1"], - on_downloading_files=mock_on_downloading_files, - ) - - # THEN - expected_source_path_format = ( - "windows" - if default_job.attachments.manifests[0].rootPathFormat == PathFormat.WINDOWS - else "posix" - ) - assert result_pathmap_rules == [ - { - "source_path_format": expected_source_path_format, - "source_path": default_job.attachments.manifests[0].rootPath, - "destination_path": local_root, - }, - ] - - @pytest.mark.parametrize( - ("s3_settings_fixture_name"), - [ - ("default_job_attachment_s3_settings"), - ], - ) - def test_sync_inputs_with_step_dependencies_same_root_vfs_on_posix( - self, - tmp_path: Path, - default_queue: Queue, - vfs_job: Job, - s3_settings_fixture_name: str, - test_manifest_one: dict, - test_manifest_two: dict, - request: pytest.FixtureRequest, - ): - """Asserts that input syncing is done correctly when step dependencies are provided.""" - # GIVEN - job = vfs_job - s3_settings: JobAttachmentS3Settings = 
request.getfixturevalue(s3_settings_fixture_name) - default_queue.jobAttachmentSettings = s3_settings - session_dir = str(tmp_path) - dest_dir = "assetroot-27bggh78dd2b568ab123" - local_root = str(Path(session_dir) / dest_dir) - test_fs_permission_settings: PosixFileSystemPermissionSettings = ( - PosixFileSystemPermissionSettings( - os_user="test-user", - os_group="test-group", - dir_mode=0o20, - file_mode=0o20, - ) - ) - os_env_vars: Dict[str, str] = {"AWS_PROFILE": "test-profile"} - assert job.attachments - - test_manifest = decode_manifest(json.dumps(test_manifest_two)) - - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=json.dumps(test_manifest_one), - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=[DownloadSummaryStatistics()], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - return_value=dest_dir, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_output_manifests_by_asset_root", - return_value={"tmp/": [(test_manifest, "hello")]}, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.merge_asset_manifests", - ) as merge_manifests_mock, patch( - f"{deadline.__package__}.job_attachments.asset_sync.AssetSync._ensure_disk_capacity", - ) as disk_capacity_mock, patch( - f"{deadline.__package__}.job_attachments.download._write_manifest_to_temp_file", - return_value="tmp_manifest", - ), patch("sys.platform", "linux"), patch( - f"{deadline.__package__}.job_attachments.asset_sync.mount_vfs_from_manifests" - ), patch(f"{deadline.__package__}.job_attachments.asset_sync.VFSProcessManager.find_vfs"): - mock_on_downloading_files = MagicMock(return_value=True) - - (_, result_pathmap_rules) = self.default_asset_sync.sync_inputs( - s3_settings, - job.attachments, - default_queue.queueId, - job.jobId, - tmp_path, - step_dependencies=["step-1"], - on_downloading_files=mock_on_downloading_files, - fs_permission_settings=test_fs_permission_settings, - os_env_vars=os_env_vars, - ) - - # THEN - merge_manifests_mock.assert_called() - disk_capacity_mock.assert_not_called() - expected_source_path_format = ( - "windows" - if job.attachments.manifests[0].rootPathFormat == PathFormat.WINDOWS - else "posix" - ) - - assert result_pathmap_rules == [ - { - "source_path_format": expected_source_path_format, - "source_path": job.attachments.manifests[0].rootPath, - "destination_path": local_root, - }, - ] - - @pytest.mark.parametrize( - ("job_fixture_name"), - [ - ("default_job"), - ], - ) - @pytest.mark.parametrize( - ("s3_settings_fixture_name"), - [ - ("default_job_attachment_s3_settings"), - ], - ) - def test_sync_inputs_no_space_left( - self, - tmp_path: Path, - default_queue: Queue, - job_fixture_name: str, - s3_settings_fixture_name: str, - really_big_manifest: dict, - request: pytest.FixtureRequest, - ): - """Asserts that an AssetSyncError is thrown if there is not enough space left on the device to download all inputs.""" - # GIVEN - job: Job = request.getfixturevalue(job_fixture_name) - s3_settings: JobAttachmentS3Settings = request.getfixturevalue(s3_settings_fixture_name) - default_queue.jobAttachmentSettings = s3_settings - dest_dir = "assetroot-27bggh78dd2b568ab123" - test_manifest = decode_manifest(json.dumps(really_big_manifest)) - test_fs_permission_settings: PosixFileSystemPermissionSettings = ( - PosixFileSystemPermissionSettings( - os_user="test-user", - os_group="test-group", - dir_mode=0o20, - 
file_mode=0o20, - ) - ) - os_env_vars: Dict[str, str] = {"AWS_PROFILE": "test-profile"} - assert job.attachments - - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=test_manifest, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=[DownloadSummaryStatistics()], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir], - ), patch.object(Path, "stat", MagicMock(st_mtime_ns=1234512345123451)): - mock_on_downloading_files = MagicMock(return_value=True) - - with pytest.raises(AssetSyncError) as ase: - self.default_asset_sync.sync_inputs( - s3_settings, - job.attachments, - default_queue.queueId, - job.jobId, - tmp_path, - on_downloading_files=mock_on_downloading_files, - fs_permission_settings=test_fs_permission_settings, - os_env_vars=os_env_vars, - ) - - # THEN - assert ( - "Total file size required for download (300.0 PB) is larger than available disk space" - in str(ase) - ) - - @mock_aws - @pytest.mark.parametrize( - ( - "s3_settings_fixture_name", - "attachments_fixture_name", - "expected_cas_prefix", - "expected_output_prefix", - ), - [ - ( - "default_job_attachment_s3_settings", - "default_attachments", - "assetRoot/Data/", - "assetRoot/Manifests/farm-1234567890abcdefghijklmnopqrstuv/queue-01234567890123456789012345678901/job-01234567890123456789012345678901/test_step4/test_task4/2023-07-13T14:35:26.123456Z_session-action-1/", - ), - ( - "default_job_attachment_s3_settings", - "windows_attachments", - "assetRoot/Data/", - "assetRoot/Manifests/farm-1234567890abcdefghijklmnopqrstuv/queue-01234567890123456789012345678901/job-01234567890123456789012345678901/test_step4/test_task4/2023-07-13T14:35:26.123456Z_session-action-1/", - ), - ], - ) - def test_sync_outputs( - self, - tmp_path: Path, - default_queue: Queue, - default_job: Job, - session_action_id: str, - s3_settings_fixture_name: str, - attachments_fixture_name: str, - expected_cas_prefix: str, - expected_output_prefix: str, - request: pytest.FixtureRequest, - assert_expected_files_on_s3, - assert_canonical_manifest, - ): - """ - Test that output files get uploaded to the CAS, skipping upload for files that are already in the CAS, - and tests that an output manifest is uploaded to the Output prefix. 
- """ - # GIVEN - s3_settings: JobAttachmentS3Settings = request.getfixturevalue(s3_settings_fixture_name) - attachments: Attachments = request.getfixturevalue(attachments_fixture_name) - default_queue.jobAttachmentSettings = s3_settings - default_job.attachments = attachments - root_path = str(tmp_path) - local_root = Path(f"{root_path}/assetroot-15addf56bb1a568df964") - test_step = "test_step4" - test_task = "test_task4" - - expected_output_root = Path(local_root).joinpath("test/outputs") - expected_file_path = Path(expected_output_root).joinpath("test.txt") - expected_sub_file_path = Path(expected_output_root).joinpath("inner_dir/test2.txt") - - expected_file_rel_path = "test/outputs/test.txt" - expected_sub_file_rel_path = "test/outputs/inner_dir/test2.txt" - - # Add the files to S3 - s3 = boto3.Session(region_name="us-west-2").resource("s3") # pylint: disable=invalid-name - bucket = s3.Bucket(s3_settings.s3BucketName) - bucket.put_object( - Key=f"{expected_cas_prefix}hash1.xxh128", - Body="a", - ) - expected_metadata = s3.meta.client.head_object( - Bucket=s3_settings.s3BucketName, Key=f"{expected_cas_prefix}hash1.xxh128" - ) - - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.hash_file", - side_effect=["hash1", "hash2"], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.hash_data", side_effect=["hash3"] - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[local_root], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._float_to_iso_datetime_string", - side_effect=["2023-07-13T14:35:26.123456Z"], - ): - mock_on_uploading_files = MagicMock(return_value=True) - - try: - # Need to test having multiple files and subdirectories with files - Path(expected_file_path).parent.mkdir(parents=True, exist_ok=True) - with open(expected_file_path, "w") as test_file: - test_file.write("Test Output\n") - Path(expected_sub_file_path).parent.mkdir(parents=True, exist_ok=True) - with open(expected_sub_file_path, "w") as test_file: - test_file.write("Test Sub-Output\n") - - expected_processed_bytes = expected_sub_file_path.resolve().stat().st_size - expected_skipped_bytes = expected_file_path.resolve().stat().st_size - expected_total_bytes = expected_processed_bytes + expected_skipped_bytes - expected_file_mtime = trunc(expected_file_path.stat().st_mtime_ns // 1000) - expected_sub_file_mtime = trunc(expected_sub_file_path.stat().st_mtime_ns // 1000) - - # Actually run the test - summary_statistics = self.default_asset_sync.sync_outputs( - s3_settings=s3_settings, - attachments=attachments, - queue_id=default_queue.queueId, - job_id=default_job.jobId, - step_id=test_step, - task_id=test_task, - session_action_id=session_action_id, - start_time=1234.56, - session_dir=tmp_path, - on_uploading_files=mock_on_uploading_files, - ) - finally: - # Need to clean up after - if local_root.exists(): - shutil.rmtree(local_root) - - # THEN - actual_metadata = s3.meta.client.head_object( - Bucket=s3_settings.s3BucketName, Key=f"{expected_cas_prefix}hash1.xxh128" - ) - assert actual_metadata["LastModified"] == expected_metadata["LastModified"] - assert_expected_files_on_s3( - bucket, - expected_files={ - f"{expected_cas_prefix}hash1.xxh128", - f"{expected_cas_prefix}hash2.xxh128", - f"{expected_output_prefix}hash3_output", - }, - ) - - assert_canonical_manifest( - bucket, - f"{expected_output_prefix}hash3_output", - expected_manifest='{"hashAlg":"xxh128","manifestVersion":"2023-03-03",' - 
f'"paths":[{{"hash":"hash2","mtime":{expected_sub_file_mtime},"path":"{expected_sub_file_rel_path}",' - f'"size":{expected_processed_bytes}}},' - f'{{"hash":"hash1","mtime":{expected_file_mtime},"path":"{expected_file_rel_path}",' - f'"size":{expected_skipped_bytes}}}],' - f'"totalSize":{expected_total_bytes}}}', - ) - - readable_total_input_bytes = human_readable_file_size(expected_total_bytes) - - expected_summary_statistics = SummaryStatistics( - total_time=summary_statistics.total_time, - total_files=2, - total_bytes=expected_total_bytes, - processed_files=1, - processed_bytes=expected_processed_bytes, - skipped_files=1, - skipped_bytes=expected_skipped_bytes, - transfer_rate=expected_processed_bytes / summary_statistics.total_time, - ) - - actual_args, _ = mock_on_uploading_files.call_args - actual_last_progress_report = actual_args[0] - assert actual_last_progress_report.status == ProgressStatus.UPLOAD_IN_PROGRESS - assert actual_last_progress_report.progress == 100.0 - assert ( - f"Uploaded {readable_total_input_bytes} / {readable_total_input_bytes} of 2 files (Transfer rate: " - in actual_last_progress_report.progressMessage - ) - - assert summary_statistics == expected_summary_statistics - - @pytest.mark.parametrize( - "file_path, directory_path, expected", - [ - (Path("/path/to/directory/file.txt"), Path("/path/to/directory"), True), - (Path("/path/to/another/directory/file.txt"), Path("/path/to/directory"), False), - (Path("/path/to/directory/subdirectory/file.txt"), Path("/path/to/directory"), True), - (Path("/path/to/directory/file.txt"), Path("/"), True), - (Path("/path/to/directory/../file.txt"), Path("/path/to"), True), - (Path("directory/file.txt"), Path("directory"), True), - ], - ) - def test_is_file_within_directory(self, file_path, directory_path, expected): - assert ( - self.default_asset_sync._is_file_within_directory(file_path, directory_path) == expected - ) - - @pytest.mark.skipif( - is_windows_non_admin(), - reason="Windows requires Admin to create symlinks, skipping this test.", - ) - def test_is_file_within_directory_with_symlink(self, tmp_path: Path): - """ - Test the `_is_file_within_directory` method when dealing with symbolic links. - Ensures that it correctly identifies whether the target file of the given - symlink is within the specified directory or not. 
- """ - tmp_dir = tmp_path / "tmp_dir" - tmp_dir.mkdir() - - # Create a file inside the directory - inside_file_path = tmp_dir / "file.txt" - inside_file_path.touch() - # Create a file outside the directory - outside_file_path = tmp_path / "outside_file.txt" - outside_file_path.touch() - - # Create a symlink that points to a file inside the directory - symlink_path_inside = tmp_dir / "symlink_inside.txt" - os.symlink(inside_file_path, symlink_path_inside) - # Create a symlink that points to a file outside the directory - symlink_path_outside = tmp_dir / "symlink_outside.txt" - os.symlink(outside_file_path, symlink_path_outside) - - assert symlink_path_inside.is_symlink() - assert symlink_path_outside.is_symlink() - assert ( - self.default_asset_sync._is_file_within_directory(symlink_path_inside, tmp_dir) is True - ) - assert ( - self.default_asset_sync._is_file_within_directory(symlink_path_outside, tmp_dir) - is False - ) - - @pytest.mark.parametrize( - ("job", "expected_settings"), - [(Job(jobId="job-98765567890123456789012345678901"), None), (None, None)], - ) - def test_get_attachments_not_found_return_none( - self, job: Job, expected_settings: Optional[Attachments] - ): - """Tests that get_attachments returns the expected result if Job or settings are None""" - with patch(f"{deadline.__package__}.job_attachments.asset_sync.get_job", side_effect=[job]): - actual = self.default_asset_sync.get_attachments("test-farm", "test-queue", "test-job") - assert actual == expected_settings - - def test_get_attachments_successful( - self, default_job: Job, default_attachments: Optional[Attachments] - ): - """Tests that get_attachments returns the expected result""" - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_job", side_effect=[default_job] - ): - actual = self.default_asset_sync.get_attachments( - "test-farm", "test-queue", default_job.jobId - ) - assert actual == default_attachments - - @pytest.mark.parametrize( - ("queue", "expected_settings"), - [ - ( - Queue( - queueId="queue-98765567890123456789012345678901", - displayName="test-queue", - farmId="test-farm", - status="test", - defaultBudgetAction="NONE", - ), - None, - ), - (None, None), - ], - ) - def test_get_s3_settings_not_found_return_none( - self, queue: Queue, expected_settings: Optional[JobAttachmentS3Settings] - ): - """Tests that get_s3_settings returns the expected result if Queue or S3 settings are None""" - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_queue", side_effect=[queue] - ): - actual = self.default_asset_sync.get_s3_settings("test-farm", "test-queue") - assert actual == expected_settings - - def test_get_s3_settings_successful( - self, - default_queue: Queue, - default_job_attachment_s3_settings: Optional[JobAttachmentS3Settings], - ): - """Tests that get_s3_settings returns the expected result""" - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_queue", - side_effect=[default_queue], - ): - actual = self.default_asset_sync.get_s3_settings("test-farm", default_queue.queueId) - assert actual == default_job_attachment_s3_settings - - def test_sync_inputs_with_storage_profiles_path_mapping_rules( - self, - default_queue: Queue, - default_job: Job, - test_manifest_one: dict, - tmp_path: Path, - ): - """Tests when a non-empty `storage_profiles_path_mapping_rules` is passed to `sync_inputs`. 
- Check that, for input manifests with an `fileSystemLocationName`, if the root path - corresponding to it exists in the `storage_profiles_path_mapping_rules`, the download - is attempted to the correct destination path.""" - # GIVEN - default_job.attachments = Attachments( - manifests=[ - ManifestProperties( - rootPath="/tmp", - rootPathFormat=PathFormat.POSIX, - inputManifestPath="manifest_input", - inputManifestHash="manifesthash", - outputRelativeDirectories=["test/outputs"], - ), - ManifestProperties( - fileSystemLocationName="Movie 1", - rootPath="/home/user/movie1", - rootPathFormat=PathFormat.POSIX, - inputManifestPath="manifest-movie1_input", - inputManifestHash="manifestmovie1hash", - outputRelativeDirectories=["test/outputs"], - ), - ], - ) - test_manifest = decode_manifest(json.dumps(test_manifest_one)) - dest_dir = "assetroot-27bggh78dd2b568ab123" - local_root = str(tmp_path.joinpath(dest_dir)) - - storage_profiles_path_mapping_rules = { - "/home/user/movie1": "/tmp/movie1", - } - - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=test_manifest, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - return_value=DownloadSummaryStatistics(), - ) as mock_download_files_from_manifests, patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir], - ), patch.object(Path, "stat", MagicMock(st_mtime_ns=1234512345123451)): - mock_on_downloading_files = MagicMock(return_value=True) - - (summary_statistics, result_pathmap_rules) = self.default_asset_sync.sync_inputs( - s3_settings=default_queue.jobAttachmentSettings, - attachments=default_job.attachments, - queue_id=default_queue.queueId, - job_id=default_job.jobId, - session_dir=tmp_path, - storage_profiles_path_mapping_rules=storage_profiles_path_mapping_rules, - on_downloading_files=mock_on_downloading_files, - ) - - # THEN - assert result_pathmap_rules == [ - { - "source_path_format": "posix", - "source_path": default_job.attachments.manifests[0].rootPath, - "destination_path": local_root, - } - ] - - mock_download_files_from_manifests.assert_called_once_with( - s3_bucket="test-bucket", - manifests_by_root={ - f"{local_root}": test_manifest, - "/tmp/movie1": test_manifest, - }, - cas_prefix="assetRoot/Data", - fs_permission_settings=None, - session=ANY, - on_downloading_files=mock_on_downloading_files, - logger=getLogger("deadline.job_attachments"), - ) - - @pytest.mark.parametrize( - ("job_fixture_name"), - [ - ("default_job"), - ("vfs_job"), - ], - ) - @pytest.mark.parametrize( - ("s3_settings_fixture_name"), - [ - ("default_job_attachment_s3_settings"), - ], - ) - def test_sync_inputs_successful_using_vfs_fallback( - self, - tmp_path: Path, - default_queue: Queue, - job_fixture_name: str, - s3_settings_fixture_name: str, - test_manifest_one: dict, - request: pytest.FixtureRequest, - ): - """Asserts that a valid manifest can be processed to download attachments from S3""" - # GIVEN - job: Job = request.getfixturevalue(job_fixture_name) - s3_settings: JobAttachmentS3Settings = request.getfixturevalue(s3_settings_fixture_name) - default_queue.jobAttachmentSettings = s3_settings - session_dir = str(tmp_path) - dest_dir = "assetroot-27bggh78dd2b568ab123" - local_root = str(Path(session_dir) / dest_dir) - test_manifest = decode_manifest(json.dumps(test_manifest_one)) - assert job.attachments - - # WHEN - with patch( - 
f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=test_manifest, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=[DownloadSummaryStatistics()], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.VFSProcessManager.find_vfs", - side_effect=VFSExecutableMissingError, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.mount_vfs_from_manifests" - ) as mock_mount_vfs, patch("sys.platform", "linux"), patch.object( - Path, "stat", MagicMock(st_mtime_ns=1234512345123451) - ): - mock_on_downloading_files = MagicMock(return_value=True) - - (_, result_pathmap_rules) = self.default_asset_sync.sync_inputs( - s3_settings, - job.attachments, - default_queue.queueId, - job.jobId, - tmp_path, - on_downloading_files=mock_on_downloading_files, - ) - - # THEN - expected_source_path_format = ( - "windows" - if job.attachments.manifests[0].rootPathFormat == PathFormat.WINDOWS - else "posix" - ) - assert result_pathmap_rules == [ - { - "source_path_format": expected_source_path_format, - "source_path": job.attachments.manifests[0].rootPath, - "destination_path": local_root, - } - ] - mock_mount_vfs.assert_not_called() - - def test_cleanup_session_vfs_terminate_called(self, tmp_path): - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.VFSProcessManager.find_vfs", - ) as mock_find_vfs, patch( - f"{deadline.__package__}.job_attachments.asset_sync.VFSProcessManager.kill_all_processes", - ): - self.default_asset_sync.cleanup_session( - session_dir=tmp_path, - file_system=JobAttachmentsFileSystem.COPIED, - os_user="test-user", - ) - - mock_find_vfs.assert_not_called() - - self.default_asset_sync.cleanup_session( - session_dir=tmp_path, - file_system=JobAttachmentsFileSystem.VIRTUAL, - os_user="test-user", - ) - - mock_find_vfs.assert_called_once() - - def test_cleanup_session_virtual_witout_os_user_raises(self, tmp_path): - self.default_asset_sync.cleanup_session( - session_dir=tmp_path, - file_system=JobAttachmentsFileSystem.COPIED, - ) - - with pytest.raises(VFSOSUserNotSetError): - self.default_asset_sync.cleanup_session( - session_dir=tmp_path, - file_system=JobAttachmentsFileSystem.VIRTUAL, - ) - - def test_attachment_sync_inputs_no_inputs_successful( - self, - tmp_path: Path, - default_queue: Queue, - default_job: Job, - attachments_no_inputs: Attachments, - ): - """Asserts that sync_inputs is successful when no required assets exist for the Job""" - # GIVEN - default_job.attachments = attachments_no_inputs - session_dir = str(tmp_path) - dest_dir = "assetroot-27bggh78dd2b568ab123" - local_root = str(Path(session_dir) / dest_dir) - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=[DownloadSummaryStatistics()], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir], - ): - mock_on_downloading_files = MagicMock(return_value=True) - - (summary_statistics, result_pathmap_rules) = ( - self.default_asset_sync.attachment_sync_inputs( - default_queue.jobAttachmentSettings, - attachments_no_inputs, - default_queue.queueId, - default_job.jobId, - tmp_path, - on_downloading_files=mock_on_downloading_files, - ) - ) - - # THEN - expected_source_path_format = ( - "windows" - if 
default_job.attachments.manifests[0].rootPathFormat == PathFormat.WINDOWS - else "posix" - ) - assert result_pathmap_rules == [ - { - "source_path_format": expected_source_path_format, - "source_path": default_job.attachments.manifests[0].rootPath, - "destination_path": local_root, - } - ] - expected_summary_statistics = SummaryStatistics( - total_time=summary_statistics.total_time, - total_files=0, - total_bytes=0, - processed_files=0, - processed_bytes=0, - skipped_files=0, - skipped_bytes=0, - transfer_rate=0.0, - ) - assert summary_statistics == expected_summary_statistics - - @pytest.mark.parametrize( - ("job_fixture_name"), - [ - ("default_job"), - ("vfs_job"), - ], - ) - @pytest.mark.parametrize( - ("s3_settings_fixture_name"), - [ - ("default_job_attachment_s3_settings"), - ], - ) - def test_attachment_sync_inputs_successful( - self, - tmp_path: Path, - default_queue: Queue, - job_fixture_name: str, - s3_settings_fixture_name: str, - test_manifest_one: dict, - request: pytest.FixtureRequest, - ): - """Asserts that a valid manifest can be processed to download attachments from S3""" - # GIVEN - job: Job = request.getfixturevalue(job_fixture_name) - s3_settings: JobAttachmentS3Settings = request.getfixturevalue(s3_settings_fixture_name) - default_queue.jobAttachmentSettings = s3_settings - session_dir = str(tmp_path) - dest_dir = "assetroot-27bggh78dd2b568ab123" - local_root = str(Path(session_dir) / dest_dir) - test_manifest = decode_manifest(json.dumps(test_manifest_one)) - test_fs_permission_settings: PosixFileSystemPermissionSettings = ( - PosixFileSystemPermissionSettings( - os_user="test-user", - os_group="test-group", - dir_mode=0o20, - file_mode=0o20, - ) - ) - os_env_vars: Dict[str, str] = {"AWS_PROFILE": "test-profile"} - assert job.attachments - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=test_manifest, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=[DownloadSummaryStatistics()], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.mount_vfs_from_manifests" - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.VFSProcessManager.find_vfs" - ), patch.object(Path, "stat", MagicMock(st_mtime_ns=1234512345123451)): - mock_on_downloading_files = MagicMock(return_value=True) - - (_, result_pathmap_rules) = self.default_asset_sync.attachment_sync_inputs( - s3_settings, - job.attachments, - default_queue.queueId, - job.jobId, - tmp_path, - on_downloading_files=mock_on_downloading_files, - fs_permission_settings=test_fs_permission_settings, - os_env_vars=os_env_vars, - ) - # THEN - expected_source_path_format = ( - "windows" - if job.attachments.manifests[0].rootPathFormat == PathFormat.WINDOWS - else "posix" - ) - assert result_pathmap_rules == [ - { - "source_path_format": expected_source_path_format, - "source_path": job.attachments.manifests[0].rootPath, - "destination_path": local_root, - } - ] - - @pytest.mark.parametrize( - ("job_fixture_name"), - [ - ("default_job"), - ], - ) - @pytest.mark.parametrize( - ("s3_settings_fixture_name"), - [ - ("default_job_attachment_s3_settings"), - ], - ) - def test_attachment_sync_inputs_404_error( - self, - tmp_path: Path, - default_queue: Queue, - job_fixture_name: str, - s3_settings_fixture_name: str, - test_manifest_one: dict, - request: pytest.FixtureRequest, - 
): - """Asserts that a specific error message is raised when getting 404 errors synching inputs""" - # GIVEN - download_exception = JobAttachmentsS3ClientError( - action="get-object", - status_code=404, - bucket_name="test bucket", - key_or_prefix="test-key.xxh128", - message="File not found", - ) - job: Job = request.getfixturevalue(job_fixture_name) - test_manifest = decode_manifest(json.dumps(test_manifest_one)) - s3_settings: JobAttachmentS3Settings = request.getfixturevalue(s3_settings_fixture_name) - default_queue.jobAttachmentSettings = s3_settings - dest_dir = "assetroot-27bggh78dd2b568ab123" - assert job.attachments - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=test_manifest, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=download_exception, - ): - with pytest.raises(JobAttachmentsS3ClientError) as excinfo: - self.default_asset_sync.attachment_sync_inputs( - s3_settings, - job.attachments, - default_queue.queueId, - job.jobId, - tmp_path, - ) - # THEN - assert "usually located in the home directory (~/.deadline/cache/s3_check_cache.db)" in str( - excinfo - ) - - @pytest.mark.parametrize( - ("s3_settings_fixture_name"), - [ - ("default_job_attachment_s3_settings"), - ], - ) - def test_attachment_sync_inputs_with_step_dependencies( - self, - tmp_path: Path, - default_queue: Queue, - default_job: Job, - s3_settings_fixture_name: str, - test_manifest_one: dict, - request: pytest.FixtureRequest, - ): - """Asserts that input syncing is done correctly when step dependencies are provided.""" - # GIVEN - s3_settings: JobAttachmentS3Settings = request.getfixturevalue(s3_settings_fixture_name) - default_queue.jobAttachmentSettings = s3_settings - session_dir = str(tmp_path) - dest_dir = "assetroot-27bggh78dd2b568ab123" - local_root = str(Path(session_dir) / dest_dir) - test_manifest = decode_manifest(json.dumps(test_manifest_one)) - assert default_job.attachments - step_output_root = "/home/outputs_roots" - step_dest_dir = "assetroot-8a7d189e9c17186fb88b" - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=test_manifest, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=[DownloadSummaryStatistics()], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir, step_dest_dir], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_output_manifests_by_asset_root", - side_effect=[{step_output_root: {}}], - ), patch.object(Path, "stat", MagicMock(st_mtime_ns=1234512345123451)): - mock_on_downloading_files = MagicMock(return_value=True) - - (_, result_pathmap_rules) = self.default_asset_sync.attachment_sync_inputs( - s3_settings, - default_job.attachments, - default_queue.queueId, - default_job.jobId, - tmp_path, - step_dependencies=["step-1"], - on_downloading_files=mock_on_downloading_files, - ) - # THEN - expected_source_path_format = ( - "windows" - if default_job.attachments.manifests[0].rootPathFormat == PathFormat.WINDOWS - else "posix" - ) - assert result_pathmap_rules == [ - { - "source_path_format": expected_source_path_format, - "source_path": default_job.attachments.manifests[0].rootPath, - "destination_path": local_root, - }, - ] - - 
@pytest.mark.parametrize( - ("s3_settings_fixture_name"), - [ - ("default_job_attachment_s3_settings"), - ], - ) - def test_step_dependency_download_uses_overwrite_conflict_resolution( - self, - tmp_path: Path, - default_queue: Queue, - default_job: Job, - s3_settings_fixture_name: str, - test_manifest_one: dict, - request: pytest.FixtureRequest, - ): - """Downloads should always use OVERWRITE conflict resolution since the worker - downloads into a fresh session directory.""" - # GIVEN - s3_settings: JobAttachmentS3Settings = request.getfixturevalue(s3_settings_fixture_name) - default_queue.jobAttachmentSettings = s3_settings - dest_dir = "assetroot-27bggh78dd2b568ab123" - test_manifest = decode_manifest(json.dumps(test_manifest_one)) - assert default_job.attachments - - # WHEN — no step dependencies - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=test_manifest, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=[DownloadSummaryStatistics()], - ) as mock_download, patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir], - ), patch.object(Path, "stat", MagicMock(st_mtime_ns=1234512345123451)): - self.default_asset_sync.attachment_sync_inputs( - s3_settings, - default_job.attachments, - default_queue.queueId, - default_job.jobId, - tmp_path, - on_downloading_files=MagicMock(return_value=True), - ) - - # THEN — should always use OVERWRITE - mock_download.assert_called_once() - call_kwargs = mock_download.call_args - assert call_kwargs.kwargs.get("conflict_resolution") == FileConflictResolution.OVERWRITE - - @pytest.mark.parametrize( - ("s3_settings_fixture_name"), - [ - ("default_job_attachment_s3_settings"), - ], - ) - def test_attachment_sync_inputs_with_step_dependencies_same_root_vfs_on_posix( - self, - tmp_path: Path, - default_queue: Queue, - vfs_job: Job, - s3_settings_fixture_name: str, - test_manifest_one: dict, - test_manifest_two: dict, - request: pytest.FixtureRequest, - ): - """Asserts that input syncing is done correctly when step dependencies are provided.""" - # GIVEN - job = vfs_job - s3_settings: JobAttachmentS3Settings = request.getfixturevalue(s3_settings_fixture_name) - default_queue.jobAttachmentSettings = s3_settings - session_dir = str(tmp_path) - dest_dir = "assetroot-27bggh78dd2b568ab123" - local_root = str(Path(session_dir) / dest_dir) - test_fs_permission_settings: PosixFileSystemPermissionSettings = ( - PosixFileSystemPermissionSettings( - os_user="test-user", - os_group="test-group", - dir_mode=0o20, - file_mode=0o20, - ) - ) - os_env_vars: Dict[str, str] = {"AWS_PROFILE": "test-profile"} - assert job.attachments - test_manifest = decode_manifest(json.dumps(test_manifest_two)) - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=json.dumps(test_manifest_one), - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=[DownloadSummaryStatistics()], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - return_value=dest_dir, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_output_manifests_by_asset_root", - return_value={"tmp/": [(test_manifest, "hello")]}, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.merge_asset_manifests", - ) as merge_manifests_mock, patch( - 
f"{deadline.__package__}.job_attachments.asset_sync.AssetSync._ensure_disk_capacity", - ) as disk_capacity_mock, patch( - f"{deadline.__package__}.job_attachments.download._write_manifest_to_temp_file", - return_value="tmp_manifest", - ), patch("sys.platform", "linux"), patch( - f"{deadline.__package__}.job_attachments.asset_sync.mount_vfs_from_manifests" - ), patch(f"{deadline.__package__}.job_attachments.asset_sync.VFSProcessManager.find_vfs"): - mock_on_downloading_files = MagicMock(return_value=True) - - (_, result_pathmap_rules) = self.default_asset_sync.attachment_sync_inputs( - s3_settings, - job.attachments, - default_queue.queueId, - job.jobId, - tmp_path, - step_dependencies=["step-1"], - on_downloading_files=mock_on_downloading_files, - fs_permission_settings=test_fs_permission_settings, - os_env_vars=os_env_vars, - ) - # THEN - merge_manifests_mock.assert_called() - disk_capacity_mock.assert_not_called() - expected_source_path_format = ( - "windows" - if job.attachments.manifests[0].rootPathFormat == PathFormat.WINDOWS - else "posix" - ) - assert result_pathmap_rules == [ - { - "source_path_format": expected_source_path_format, - "source_path": job.attachments.manifests[0].rootPath, - "destination_path": local_root, - }, - ] - - @pytest.mark.parametrize( - ("job_fixture_name"), - [ - ("default_job"), - ], - ) - @pytest.mark.parametrize( - ("s3_settings_fixture_name"), - [ - ("default_job_attachment_s3_settings"), - ], - ) - def test_attachment_sync_inputs_no_space_left( - self, - tmp_path: Path, - default_queue: Queue, - job_fixture_name: str, - s3_settings_fixture_name: str, - really_big_manifest: dict, - request: pytest.FixtureRequest, - ): - """Asserts that an AssetSyncError is thrown if there is not enough space left on the device to download all inputs.""" - # GIVEN - job: Job = request.getfixturevalue(job_fixture_name) - s3_settings: JobAttachmentS3Settings = request.getfixturevalue(s3_settings_fixture_name) - default_queue.jobAttachmentSettings = s3_settings - dest_dir = "assetroot-27bggh78dd2b568ab123" - test_manifest = decode_manifest(json.dumps(really_big_manifest)) - test_fs_permission_settings: PosixFileSystemPermissionSettings = ( - PosixFileSystemPermissionSettings( - os_user="test-user", - os_group="test-group", - dir_mode=0o20, - file_mode=0o20, - ) - ) - os_env_vars: Dict[str, str] = {"AWS_PROFILE": "test-profile"} - assert job.attachments - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=test_manifest, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=[DownloadSummaryStatistics()], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir], - ), patch.object(Path, "stat", MagicMock(st_mtime_ns=1234512345123451)): - mock_on_downloading_files = MagicMock(return_value=True) - - with pytest.raises(AssetSyncError) as ase: - self.default_asset_sync.attachment_sync_inputs( - s3_settings, - job.attachments, - default_queue.queueId, - job.jobId, - tmp_path, - on_downloading_files=mock_on_downloading_files, - fs_permission_settings=test_fs_permission_settings, - os_env_vars=os_env_vars, - ) - - # THEN - assert ( - "Total file size required for download (300.0 PB) is larger than available disk space" - in str(ase) - ) - - def test_aggregate_asset_root_manifests_and_write( - self, - default_queue: Queue, - default_job: Job, - default_job_attachment_s3_settings: JobAttachmentS3Settings, - 
test_manifest_one: dict, - tmp_path: Path, - ): - test_manifest = decode_manifest(json.dumps(test_manifest_one)) - dest_dir = "assetroot" - - default_job.attachments = Attachments( - manifests=[ - ManifestProperties( - rootPath="/root/tmp", - rootPathFormat=PathFormat.POSIX, - inputManifestPath="manifest_input", - inputManifestHash="manifesthash", - outputRelativeDirectories=["test/outputs"], - ), - ManifestProperties( - fileSystemLocationName="Movie 1", - rootPath="/home/user/movie1", - rootPathFormat=PathFormat.POSIX, - inputManifestPath="manifest-movie1_input", - inputManifestHash="manifestmovie1hash", - outputRelativeDirectories=["test/outputs"], - ), - ], - ) - manifest_count = len(default_job.attachments.manifests) - storage_profiles_path_mapping_rules = { - "/home/user/movie1": "/root/tmp/movie1", - } - path_write_local_input_manifest = tmp_path.joinpath("manifest/hash_manifest") - - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=test_manifest, - ) as mock_get_manifest_from_s3, patch( - f"{deadline.__package__}.job_attachments.asset_sync.merge_asset_manifests", - return_value=test_manifest, - ) as mock_merge_asset_manifests, patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.S3AssetUploader._write_local_input_manifest", - return_value=path_write_local_input_manifest, - ) as mock__write_local_input_manifest: - merged_manifests_by_root = self.default_asset_sync._aggregate_asset_root_manifests( - session_dir=tmp_path, - s3_settings=default_job_attachment_s3_settings, - queue_id=default_queue.queueId, - job_id=default_job.jobId, - attachments=default_job.attachments, - dynamic_mapping_rules=self.default_asset_sync.generate_dynamic_path_mapping( - session_dir=tmp_path, attachments=default_job.attachments - ), - storage_profiles_path_mapping_rules=storage_profiles_path_mapping_rules, - ) - assert mock_merge_asset_manifests.call_count == manifest_count - assert mock_get_manifest_from_s3.call_count == manifest_count - - manifest_paths_by_root = self.default_asset_sync._check_and_write_local_manifests( - merged_manifests_by_root=merged_manifests_by_root, - manifest_write_dir=str(tmp_path), - manifest_name_suffix="test", - ) - assert mock__write_local_input_manifest.call_count == manifest_count - assert len(self.default_asset_sync._local_root_to_src_map) == len( - manifest_paths_by_root - ) - assert len(manifest_paths_by_root) == manifest_count - assert "/root/tmp/movie1" in manifest_paths_by_root - assert "test" in manifest_paths_by_root["/root/tmp/movie1"] - assert str(tmp_path.joinpath(dest_dir)) in manifest_paths_by_root - - def test_attachment_sync_inputs_with_storage_profiles_path_mapping_rules( - self, - default_queue: Queue, - default_job: Job, - test_manifest_one: dict, - tmp_path: Path, - ): - """Tests when a non-empty `storage_profiles_path_mapping_rules` is passed to `sync_inputs`. 
- Check that, for input manifests with an `fileSystemLocationName`, if the root path - corresponding to it exists in the `storage_profiles_path_mapping_rules`, the download - is attempted to the correct destination path.""" - # GIVEN - default_job.attachments = Attachments( - manifests=[ - ManifestProperties( - rootPath="/tmp", - rootPathFormat=PathFormat.POSIX, - inputManifestPath="manifest_input", - inputManifestHash="manifesthash", - outputRelativeDirectories=["test/outputs"], - ), - ManifestProperties( - fileSystemLocationName="Movie 1", - rootPath="/home/user/movie1", - rootPathFormat=PathFormat.POSIX, - inputManifestPath="manifest-movie1_input", - inputManifestHash="manifestmovie1hash", - outputRelativeDirectories=["test/outputs"], - ), - ], - ) - test_manifest = decode_manifest(json.dumps(test_manifest_one)) - dest_dir = "assetroot-27bggh78dd2b568ab123" - local_root = str(tmp_path.joinpath(dest_dir)) - - storage_profiles_path_mapping_rules = { - "/home/user/movie1": "/tmp/movie1", - } - - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=test_manifest, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - return_value=DownloadSummaryStatistics(), - ) as mock_download_files_from_manifests, patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir], - ), patch.object(Path, "stat", MagicMock(st_mtime_ns=1234512345123451)): - mock_on_downloading_files = MagicMock(return_value=True) - - (summary_statistics, result_pathmap_rules) = ( - self.default_asset_sync.attachment_sync_inputs( - s3_settings=default_queue.jobAttachmentSettings, - attachments=default_job.attachments, - queue_id=default_queue.queueId, - job_id=default_job.jobId, - session_dir=tmp_path, - storage_profiles_path_mapping_rules=storage_profiles_path_mapping_rules, - on_downloading_files=mock_on_downloading_files, - ) - ) - - # THEN - assert result_pathmap_rules == [ - { - "source_path_format": "posix", - "source_path": default_job.attachments.manifests[0].rootPath, - "destination_path": local_root, - } - ] - - mock_download_files_from_manifests.assert_called_once_with( - s3_bucket="test-bucket", - manifests_by_root={ - f"{local_root}": test_manifest, - "/tmp/movie1": test_manifest, - }, - cas_prefix="assetRoot/Data", - fs_permission_settings=None, - session=ANY, - on_downloading_files=mock_on_downloading_files, - logger=getLogger("deadline.job_attachments"), - conflict_resolution=FileConflictResolution.OVERWRITE, - ) - - @pytest.mark.parametrize( - ("job_fixture_name"), - [ - ("default_job"), - ("vfs_job"), - ], - ) - @pytest.mark.parametrize( - ("s3_settings_fixture_name"), - [ - ("default_job_attachment_s3_settings"), - ], - ) - def test_attachment_sync_inputs_successful_using_vfs_fallback( - self, - tmp_path: Path, - default_queue: Queue, - job_fixture_name: str, - s3_settings_fixture_name: str, - test_manifest_one: dict, - request: pytest.FixtureRequest, - ): - """Asserts that a valid manifest can be processed to download attachments from S3""" - # GIVEN - job: Job = request.getfixturevalue(job_fixture_name) - s3_settings: JobAttachmentS3Settings = request.getfixturevalue(s3_settings_fixture_name) - default_queue.jobAttachmentSettings = s3_settings - session_dir = str(tmp_path) - dest_dir = "assetroot-27bggh78dd2b568ab123" - local_root = str(Path(session_dir) / dest_dir) - test_manifest = decode_manifest(json.dumps(test_manifest_one)) - assert job.attachments 
- - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.asset_sync.get_manifest_from_s3", - return_value=test_manifest, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.download_files_from_manifests", - side_effect=[DownloadSummaryStatistics()], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync._get_unique_dest_dir_name", - side_effect=[dest_dir], - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.VFSProcessManager.find_vfs", - side_effect=VFSExecutableMissingError, - ), patch( - f"{deadline.__package__}.job_attachments.asset_sync.mount_vfs_from_manifests" - ) as mock_mount_vfs, patch("sys.platform", "linux"), patch.object( - Path, "stat", MagicMock(st_mtime_ns=1234512345123451) - ): - mock_on_downloading_files = MagicMock(return_value=True) - - (_, result_pathmap_rules) = self.default_asset_sync.attachment_sync_inputs( - s3_settings, - job.attachments, - default_queue.queueId, - job.jobId, - tmp_path, - on_downloading_files=mock_on_downloading_files, - ) - - # THEN - expected_source_path_format = ( - "windows" - if job.attachments.manifests[0].rootPathFormat == PathFormat.WINDOWS - else "posix" - ) - assert result_pathmap_rules == [ - { - "source_path_format": expected_source_path_format, - "source_path": job.attachments.manifests[0].rootPath, - "destination_path": local_root, - } - ] - mock_mount_vfs.assert_not_called() diff --git a/test/unit/deadline_job_attachments/test_download.py b/test/unit/deadline_job_attachments/test_download.py deleted file mode 100644 index 241e9dfb2..000000000 --- a/test/unit/deadline_job_attachments/test_download.py +++ /dev/null @@ -1,3123 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -"""Tests for downloading files from the Job Attachment CAS.""" - -from __future__ import annotations - -import os -import shutil - -from collections import Counter -from dataclasses import dataclass, fields -from datetime import datetime -from io import BytesIO -import json -from pathlib import Path -import sys -import tempfile -from threading import Lock -from typing import Any, Callable, DefaultDict, List -from unittest.mock import MagicMock, call, patch - -import boto3 -from botocore.exceptions import BotoCoreError, ClientError, ReadTimeoutError -from botocore.stub import Stubber - -import pytest - -import deadline -from deadline.job_attachments.asset_manifests import HashAlgorithm -from deadline.job_attachments.asset_manifests.base_manifest import ( - BaseAssetManifest, - BaseManifestPath as BaseManifestPath, -) -from deadline.job_attachments.asset_manifests.v2023_03_03 import ( - ManifestPath as ManifestPathv2023_03_03, -) -from deadline.job_attachments.asset_manifests.versions import ManifestVersion -from deadline.job_attachments.download import ( - OutputDownloader, - download_file, - download_files_from_manifests, - download_files_in_directory, - get_job_input_output_paths_by_asset_root, - get_job_input_paths_by_asset_root, - get_job_output_paths_by_asset_root, - get_output_manifests_by_asset_root, - get_manifest_from_s3, - handle_existing_vfs, - mount_vfs_from_manifests, - merge_asset_manifests, - _ensure_paths_within_directory, - _get_asset_root_from_metadata, - _get_manifests_by_session_action_id, - _get_new_copy_file_path, - _get_tasks_manifests_keys_from_s3, - _list_s3_objects_with_error_handling, - _merge_asset_manifests_sorted_asc_by_last_modified, - VFS_CACHE_REL_PATH_IN_SESSION, - VFS_MANIFEST_FOLDER_IN_SESSION, - VFS_MANIFEST_FOLDER_PERMISSIONS, - 
VFS_LOGS_FOLDER_IN_SESSION, - WINDOWS_MAX_PATH_LENGTH, - TEMP_DOWNLOAD_ADDED_CHARS_LENGTH, -) -from deadline.job_attachments.exceptions import ( - AssetSyncError, - JobAttachmentsError, - JobAttachmentsS3ClientError, - MissingAssetRootError, - PathOutsideDirectoryError, -) -from deadline.job_attachments.models import ( - Attachments, - FileConflictResolution, - Job, - JobAttachmentS3Settings, - ManifestPathGroup, - Queue, -) -from deadline.job_attachments.progress_tracker import ( - DownloadSummaryStatistics, - ProgressReportMetadata, - ProgressStatus, -) -from deadline.job_attachments.asset_manifests.decode import decode_manifest - -from deadline.job_attachments.os_file_permission import ( - PosixFileSystemPermissionSettings, - WindowsFileSystemPermissionSettings, - WindowsPermissionEnum, -) -from deadline.job_attachments.api import human_readable_file_size - -from .conftest import has_posix_target_user, has_posix_disjoint_user -from ..conftest import is_windows_non_admin - - -@dataclass -class Manifest: - prefix: str - manifests: bytes - - -MANIFESTS_v2022_03_03: List[Manifest] = [ - Manifest( - "job-1/step-1/task-1-1/sessionaction-9-9/manifest1v2023-03-03_output", - b'{"hashAlg":"xxh128","manifestVersion":"2023-03-03",' - b'"paths":[{"hash":"test1","mtime":1234000000,"path":"test1.txt","size":1},' - b'{"hash":"test2","mtime":1234000000,"path":"test/test2.txt","size":1}],' - b'"totalSize":2}', - ), - Manifest( - "job-1/step-1/task-1-1/sessionaction-9-9/manifest2v2023-03-03_output", - b'{"hashAlg":"xxh128","manifestVersion":"2023-03-03",' - b'"paths":[{"hash":"test3","mtime":1234000000,"path":"test/test3.txt","size":1},' - b'{"hash":"test4","mtime":1234000000,"path":"test4.txt","size":1}],' - b'"totalSize":2}', - ), - Manifest( - "job-1/step-1/task-1-1/session-action-1/manifest2v2023-03-03_output", - b'{"hashAlg":"xxh128","manifestVersion":"2023-03-03",' - b'"paths":[{"hash":"test3","mtime":1234000000,"path":"test/test33.txt","size":1},' - b'{"hash":"test4","mtime":1234000000,"path":"test44.txt","size":1}],' - b'"totalSize":2}', - ), - Manifest( - "job-1/step-1/task-1-11/session-action-9/manifest7v2023-03-03_output", - b'{"hashAlg":"xxh128","manifestVersion":"2023-03-03",' - b'"paths":[{"hash":"test13","mtime":1234000000,"path":"test13.txt","size":1},' - b'{"hash":"test14","mtime":1234000000,"path":"test/test14.txt","size":1}],' - b'"totalSize":2}', - ), - Manifest( - "job-1/step-1/task-1-2/session-action-9/manifest3v2023-03-03_output", - b'{"hashAlg":"xxh128","manifestVersion":"2023-03-03",' - b'"paths":[{"hash":"test5","mtime":1234000000,"path":"test5.txt","size":1},' - b'{"hash":"test6","mtime":1234000000,"path":"test/test6.txt","size":1}],' - b'"totalSize":2}', - ), - Manifest( - "job-1/step-2/task-2-3/session-action-9/manifest4v2023-03-03_output", - b'{"hashAlg":"xxh128","manifestVersion":"2023-03-03",' - b'"paths":[{"hash":"test7","mtime":1234000000,"path":"test7.txt","size":1},' - b'{"hash":"test8","mtime":1234000000,"path":"test/test8.txt","size":1}],' - b'"totalSize":2}', - ), - Manifest( - "job-1/step-2/task-2-3/session-action-9/manifest5v2023-03-03_output", - b'{"hashAlg":"xxh128","manifestVersion":"2023-03-03",' - b'"paths":[{"hash":"test9","mtime":1234000000,"path":"test/test9.txt","size":1},' - b'{"hash":"test10","mtime":1234000000,"path":"test10.txt","size":1}],' - b'"totalSize":2}', - ), - Manifest( - "job-1/step-2/task-2-3/session-action-1/manifest5v2023-03-03_output", - b'{"hashAlg":"xxh128","manifestVersion":"2023-03-03",' - 
b'"paths":[{"hash":"test9","mtime":1234000000,"path":"test/test99.txt","size":1},' - b'{"hash":"test100","mtime":1234000000,"path":"test10.txt","size":1}],' - b'"totalSize":2}', - ), - Manifest( - "job-1/step-2/task-2-4/session-action-9/manifest6v2023-03-03_output", - b'{"hashAlg":"xxh128","manifestVersion":"2023-03-03",' - b'"paths":[{"hash":"test11","mtime":1234000000,"path":"test11.txt","size":1},' - b'{"hash":"test12","mtime":1234000000,"path":"test/test12.txt","size":1}],' - b'"totalSize":2}', - ), -] - -MANIFEST_VERSION_TO_MANIFESTS: dict[ManifestVersion, List[Manifest]] = { - ManifestVersion.v2023_03_03: MANIFESTS_v2022_03_03, -} - -INPUT_ASSET_MANIFESTS_V2023_03_03: List[Manifest] = [ - Manifest( - "Inputs/0000/manifest_input", - b'{"hashAlg":"xxh128","manifestVersion":"2023-03-03",' - b'"paths":[' - b'{"hash":"input1","mtime":1234000000,"path":"inputs/input1.txt","size":1},' - b'{"hash":"input2","mtime":1234000000,"path":"inputs/subdir/input2.txt","size":1},' - b'{"hash":"input3","mtime":1234000000,"path":"inputs/subdir/input3.txt","size":1},' - b'{"hash":"input4","mtime":1234000000,"path":"inputs/subdir/subdir2/input4.txt","size":1},' - b'{"hash":"input5","mtime":1234000000,"path":"inputs/input5.txt","size":1}],' - b'"totalSize":5}', - ), -] - -MANIFEST_VERSION_TO_INPUT_ASSET_MANIFESTS: dict[ManifestVersion, List[Manifest]] = { - ManifestVersion.v2023_03_03: INPUT_ASSET_MANIFESTS_V2023_03_03, -} - - -def assert_download_task_output( - s3_settings: JobAttachmentS3Settings, - farm_id, - queue_id, - tmp_path: Path, - expected_files: dict[str, List[Path]], - expected_total_bytes: int, - manifest_version: ManifestVersion, -): - """ - Assert that the expected files are downloaded when download_job_output is called with a task id. - """ - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - return_value=str(tmp_path.resolve()), - ): - mock_on_downloading_files = MagicMock(return_value=True) - - output_downloader = OutputDownloader( - s3_settings=s3_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id="job-1", - step_id="step-1", - task_id="task-1-1", - session_action_id="sessionaction-9-9", - ) - - summary_statistics = output_downloader.download_job_output( - on_downloading_files=mock_on_downloading_files, - ) - - check_expected_files_present(expected_files, tmp_path) - - check_manifest_version_v2023_mtime(manifest_version, tmp_path) - - assert_progress_tracker_values( - manifest_version=manifest_version, - summary_statistics=summary_statistics, - expected_files=expected_files, - expected_total_bytes=expected_total_bytes, - mock_on_downloading_files=mock_on_downloading_files, - ) - - -def check_manifest_version_v2023_mtime(manifest_version, tmp_path): - # Ensure that all the files from the 2023-03-03 manifest have had the correct mtime set. - if manifest_version == ManifestVersion.v2023_03_03: - assert all(path.stat().st_mtime == 1234 for path in tmp_path.glob("**/*") if path.is_file()) - - -def assert_download_step_output( - s3_settings: JobAttachmentS3Settings, - farm_id, - queue_id, - tmp_path: Path, - expected_files: dict[str, List[Path]], - expected_total_bytes: int, - manifest_version: ManifestVersion, -): - """ - Assert that the expected files are downloaded when download_job_output is called with a step id. 
- """ - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - return_value=str(tmp_path.resolve()), - ): - mock_on_downloading_files = MagicMock(return_value=True) - - output_downloader = OutputDownloader( - s3_settings=s3_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id="job-1", - step_id="step-1", - task_id=None, - ) - - summary_statistics = output_downloader.download_job_output( - on_downloading_files=mock_on_downloading_files, - ) - - check_expected_files_present(expected_files, tmp_path) - - check_manifest_version_v2023_mtime(manifest_version, tmp_path) - - assert_progress_tracker_values( - manifest_version=manifest_version, - summary_statistics=summary_statistics, - expected_files=expected_files, - expected_total_bytes=expected_total_bytes, - mock_on_downloading_files=mock_on_downloading_files, - ) - - -def assert_download_job_output( - s3_settings: JobAttachmentS3Settings, - farm_id, - queue_id, - tmp_path: Path, - expected_files: dict[str, List[Path]], - expected_total_bytes: int, - manifest_version: ManifestVersion, -): - """ - Assert that the expected files are downloaded when download_job_output is called. - """ - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - return_value=str(tmp_path.resolve()), - ): - mock_on_downloading_files = MagicMock(return_value=True) - - output_downloader = OutputDownloader( - s3_settings=s3_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id="job-1", - step_id=None, - task_id=None, - ) - summary_statistics = output_downloader.download_job_output( - on_downloading_files=mock_on_downloading_files, - ) - - # Ensure that only the expected files are there and no extras. - expected_files_set = set().union(*expected_files.values()) - assert expected_files_set == set([path for path in tmp_path.glob("**/*") if path.is_file()]) - check_manifest_version_v2023_mtime(manifest_version, tmp_path) - - assert_progress_tracker_values( - manifest_version=manifest_version, - summary_statistics=summary_statistics, - expected_files=expected_files, - expected_total_bytes=expected_total_bytes, - mock_on_downloading_files=mock_on_downloading_files, - ) - - -def assert_download_files_in_directory( - s3_settings: JobAttachmentS3Settings, - attachments: Attachments, - farm_id: str, - queue_id: str, - directory_path: str, - tmp_path: Path, - expected_files: dict[str, List[Path]], - expected_total_bytes: int, - manifest_version: ManifestVersion, -): - """ - Assert that the expected files are downloaded when download_files_in_directory is called. 
- """ - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - return_value=str(tmp_path.resolve()), - ): - mock_on_downloading_files = MagicMock(return_value=True) - - summary_statistics = download_files_in_directory( - s3_settings=s3_settings, - attachments=attachments, - farm_id=farm_id, - queue_id=queue_id, - job_id="job-1", - directory_path=directory_path, - local_download_dir=str(tmp_path.resolve()), - on_downloading_files=mock_on_downloading_files, - ) - - check_expected_files_present(expected_files, tmp_path) - - assert_progress_tracker_values( - manifest_version=manifest_version, - summary_statistics=summary_statistics, - expected_files=expected_files, - expected_total_bytes=expected_total_bytes, - mock_on_downloading_files=mock_on_downloading_files, - ) - - -def check_expected_files_present(expected_files, tmp_path): - # Ensure that only the expected files are there and no extras. - expected_files_set = set().union(*expected_files.values()) - assert expected_files_set == set([path for path in tmp_path.glob("**/*") if path.is_file()]) - - -def assert_progress_tracker_values( - manifest_version: ManifestVersion, - summary_statistics: DownloadSummaryStatistics, - expected_files: dict[str, List[Path]], - expected_total_bytes: int, - mock_on_downloading_files: MagicMock, -): - readable_total_input_bytes = human_readable_file_size(expected_total_bytes) - expected_files_set = set().union(*expected_files.values()) - file_counts_by_root_directory = {root: len(paths) for root, paths in expected_files.items()} - - if manifest_version == ManifestVersion.v2023_03_03: - expected_progress_message_part = ( - f"Downloaded {readable_total_input_bytes} / {readable_total_input_bytes}" - f" of {len(expected_files_set)} files (Transfer rate: " - ) - expected_summary_statistics = DownloadSummaryStatistics( - total_time=summary_statistics.total_time, - total_files=len(expected_files_set), - total_bytes=expected_total_bytes, - processed_files=len(expected_files_set), - processed_bytes=expected_total_bytes, - skipped_files=0, - skipped_bytes=0, - transfer_rate=expected_total_bytes / summary_statistics.total_time, - file_counts_by_root_directory=file_counts_by_root_directory, - downloaded_files=sorted([str(path) for path in expected_files_set]), - ) - else: - # If the manifest version does not support `size` and `total_size` properties, - # the progress is tracked in the number of files instead of bytes. 
- expected_progress_message_part = ( - f"Downloaded {len(expected_files_set)}/{len(expected_files_set)} files" - ) - expected_summary_statistics = DownloadSummaryStatistics( - total_time=summary_statistics.total_time, - total_files=len(expected_files_set), - total_bytes=0, - processed_files=len(expected_files_set), - processed_bytes=0, - skipped_files=0, - skipped_bytes=0, - transfer_rate=0.0, - file_counts_by_root_directory=file_counts_by_root_directory, - downloaded_files=sorted([str(path) for path in expected_files_set]), - ) - - actual_args, _ = mock_on_downloading_files.call_args - actual_last_progress_report = actual_args[0] - assert actual_last_progress_report.status == ProgressStatus.DOWNLOAD_IN_PROGRESS - assert actual_last_progress_report.progress == 100.0 - assert expected_progress_message_part in actual_last_progress_report.progressMessage - - for attribute in fields(expected_summary_statistics): - assert getattr(summary_statistics, attribute.name) == getattr( - expected_summary_statistics, attribute.name - ) - - -def assert_download_job_output_with_task_id_and_no_step_id_throws_error( - s3_settings: JobAttachmentS3Settings, farm_id, queue_id -): - """ - Assert a JobAttachmentsError is thrown when a task id is provided but step id is not. - """ - with pytest.raises(JobAttachmentsError): - mock_on_downloading_files = MagicMock(return_value=True) - - output_downloader = OutputDownloader( - s3_settings=s3_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id="job-1", - step_id=None, - task_id="task-1-1", - ) - output_downloader.download_job_output( - on_downloading_files=mock_on_downloading_files, - ) - - -def assert_get_job_input_paths_by_asset_root( - s3_settings: JobAttachmentS3Settings, - attachments: Attachments, - expected_files: dict[str, List[BaseManifestPath]], - expected_total_bytes: int, - manifest_version: ManifestVersion, -): - """ - Assert that get_job_input_paths_by_asset_root returns a dict of (asset root, manifest path group) of all asset files. - """ - with patch( - f"{deadline.__package__}.job_attachments.download.get_job_output_paths_by_asset_root", - return_value={ - "/tmp": ManifestPathGroup( - total_bytes=100, - files_by_hash_alg={ - HashAlgorithm.XXH128: [ - ManifestPathv2023_03_03( - path="outputs/output.txt", hash="outputhash", size=100, mtime=1234567 - ) - ], - }, - ) - }, - ): - paths_by_root = get_job_input_paths_by_asset_root( - s3_settings=s3_settings, - attachments=attachments, - ) - assert len(paths_by_root) == len(expected_files) - total_bytes = 0 - for root, path_group in paths_by_root.items(): - assert len(path_group.files_by_hash_alg) == 1 # assume only one hash alg - assert path_group.files_by_hash_alg[HashAlgorithm.XXH128] == expected_files[root] - total_bytes += path_group.total_bytes - - assert total_bytes == expected_total_bytes - - -def assert_get_job_output_paths_by_asset_root( - s3_settings: JobAttachmentS3Settings, - farm_id: str, - queue_id: str, - expected_files: dict[str, List[BaseManifestPath]], - expected_total_bytes: int, - manifest_version: ManifestVersion, -): - """ - Assert that get_job_output_paths_by_asset_root returns a list of (hash, path) pairs of all output files. 
- """ - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - return_value="/test", - ): - paths_by_root = get_job_output_paths_by_asset_root( - s3_settings=s3_settings, farm_id=farm_id, queue_id=queue_id, job_id="job-1" - ) - - assert len(paths_by_root) == len(expected_files) - total_bytes = 0 - for root, path_group in paths_by_root.items(): - assert len(path_group.files_by_hash_alg) == 1 # assume only one hash alg - assert path_group.files_by_hash_alg[HashAlgorithm.XXH128] == expected_files[root] - total_bytes += path_group.total_bytes - - assert total_bytes == expected_total_bytes - - -def assert_get_job_output_paths_by_asset_root_when_no_asset_root_throws_error( - farm_id: str, - queue_id: str, - s3_settings: JobAttachmentS3Settings, -): - """ - Assert that get_job_output_paths_by_asset_root raises MissingAssetRootError when fail to get manifest. - """ - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - return_value=None, - ), pytest.raises(MissingAssetRootError) as raised_err: - get_job_output_paths_by_asset_root(s3_settings, farm_id, queue_id, "job-1") - assert "Failed to get asset root from metadata of output manifest:" in str(raised_err.value) - - -def assert_get_job_input_output_paths_by_asset_root( - s3_settings: JobAttachmentS3Settings, - attachments: Attachments, - farm_id: str, - queue_id: str, - expected_files: dict[str, List[BaseManifestPath]], - expected_total_bytes: int, - manifest_version: ManifestVersion, -): - """ - Assert that get_job_input_output_paths_by_asset_root returns a list of (hash, path) pairs of all - asset files and output files. - """ - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - return_value="/tmp", - ): - paths_by_root = get_job_input_output_paths_by_asset_root( - s3_settings, attachments, farm_id, queue_id, "job-1" - ) - - assert len(paths_by_root) == len(expected_files) - total_bytes = 0 - for root, path_group in paths_by_root.items(): - assert len(path_group.files_by_hash_alg) == 1 # assume only one hash alg - assert path_group.files_by_hash_alg[HashAlgorithm.XXH128] == expected_files[root] - if manifest_version == ManifestVersion.v2023_03_03: - total_bytes += path_group.total_bytes - - if manifest_version == ManifestVersion.v2023_03_03: - assert total_bytes == expected_total_bytes - - -@pytest.mark.docker -@pytest.mark.parametrize("manifest_version", [ManifestVersion.v2023_03_03]) -class TestFullDownload: - """ - Tests for downloads from cas. - """ - - @pytest.fixture(autouse=True) - def before_test( - self, - request, - create_s3_bucket: Callable[[str], None], - farm_id: str, - queue_id: str, - default_job_attachment_s3_settings: JobAttachmentS3Settings, - default_queue: Queue, - default_job: Job, - create_get_queue_response: Callable[[Queue], dict[str, Any]], - create_get_job_response: Callable[[Job], dict[str, Any]], - manifest_version: ManifestVersion, - ): - """ - Setup the default queue and s3 bucket for all asset tests. - Mark test with `no_setup` if you don't want this setup to run. 
- """ - if "no_setup" in request.keywords: - return - - self.job_attachment_settings = default_job_attachment_s3_settings - self.queue = default_queue - self.job = default_job - self.queue_response = create_get_queue_response(self.queue) - self.job_response = create_get_job_response(self.job) - create_s3_bucket(default_job_attachment_s3_settings.s3BucketName) - - s3 = boto3.Session(region_name="us-west-2").resource("s3") # pylint: disable=invalid-name - bucket = s3.Bucket(self.job_attachment_settings.s3BucketName) - - for i in range(1, 15): - bucket.upload_fileobj( - BytesIO(b"a"), - f"{self.job_attachment_settings.rootPrefix}/Data/test{i}.xxh128", - ) - - for i in range(1, 6): - bucket.upload_fileobj( - BytesIO(b"a"), - f"{self.job_attachment_settings.rootPrefix}/Data/input{i}.xxh128", - ) - - for manifest in MANIFEST_VERSION_TO_MANIFESTS[manifest_version]: - bucket.upload_fileobj( - BytesIO(manifest.manifests), - f"{self.job_attachment_settings.rootPrefix}/" - f"Manifests/{farm_id}/{queue_id}/{manifest.prefix}", - ) - - for manifest in MANIFEST_VERSION_TO_INPUT_ASSET_MANIFESTS[manifest_version]: - bucket.upload_fileobj( - BytesIO(manifest.manifests), - f"{self.job_attachment_settings.rootPrefix}/Manifests/{farm_id}/{queue_id}/{manifest.prefix}", - ) - - # Put random junk in the outputs prefix to make sure it isn't downloaded. - bucket.upload_fileobj( - BytesIO(b"a"), - f"{self.job_attachment_settings.rootPrefix}/" - f"Manifests/{farm_id}/{queue_id}/job-1/step-1/task-1-1/junk", - ) - - bucket.upload_fileobj( - BytesIO(b"a"), - f"{self.job_attachment_settings.rootPrefix}/" - f"Manifests/{farm_id}/{queue_id}/job-1/step-1/junk.json", - ) - - bucket.upload_fileobj( - BytesIO(b"a"), - f"{self.job_attachment_settings.rootPrefix}/" - f"Manifests/{farm_id}/{queue_id}/job-1/junk2.json", - ) - - INPUT_MANIFEST_PATHS_BY_ASSET_ROOT_v2023_03_03: list[BaseManifestPath] = [ - ManifestPathv2023_03_03(path="inputs/input1.txt", hash="input1", size=1, mtime=1234000000), - ManifestPathv2023_03_03( - path="inputs/subdir/input2.txt", hash="input2", size=1, mtime=1234000000 - ), - ManifestPathv2023_03_03( - path="inputs/subdir/input3.txt", hash="input3", size=1, mtime=1234000000 - ), - ManifestPathv2023_03_03( - path="inputs/subdir/subdir2/input4.txt", hash="input4", size=1, mtime=1234000000 - ), - ManifestPathv2023_03_03(path="inputs/input5.txt", hash="input5", size=1, mtime=1234000000), - ] - INPUT_MANIFEST_VERSION_TO_ASSET_ROOT_PATHS: dict[ManifestVersion, list[BaseManifestPath]] = { - ManifestVersion.v2023_03_03: INPUT_MANIFEST_PATHS_BY_ASSET_ROOT_v2023_03_03, - } - - def test_get_job_input_paths_by_asset_root(self, manifest_version: ManifestVersion): - assert self.job.attachments is not None - assert_get_job_input_paths_by_asset_root( - self.job_attachment_settings, - self.job.attachments, - {"/tmp": self.INPUT_MANIFEST_VERSION_TO_ASSET_ROOT_PATHS[manifest_version]}, - 5, - manifest_version, - ) - - MANIFEST_PATHS_BY_ASSET_ROOT_v2023_03_03: list[BaseManifestPath] = [ - ManifestPathv2023_03_03(path="test1.txt", hash="test1", size=1, mtime=1234000000), - ManifestPathv2023_03_03(path="test/test2.txt", hash="test2", size=1, mtime=1234000000), - ManifestPathv2023_03_03(path="test/test3.txt", hash="test3", size=1, mtime=1234000000), - ManifestPathv2023_03_03(path="test4.txt", hash="test4", size=1, mtime=1234000000), - ManifestPathv2023_03_03(path="test13.txt", hash="test13", size=1, mtime=1234000000), - ManifestPathv2023_03_03(path="test/test14.txt", hash="test14", size=1, mtime=1234000000), - 
ManifestPathv2023_03_03(path="test5.txt", hash="test5", size=1, mtime=1234000000), - ManifestPathv2023_03_03(path="test/test6.txt", hash="test6", size=1, mtime=1234000000), - ManifestPathv2023_03_03(path="test7.txt", hash="test7", size=1, mtime=1234000000), - ManifestPathv2023_03_03(path="test/test8.txt", hash="test8", size=1, mtime=1234000000), - ManifestPathv2023_03_03(path="test/test9.txt", hash="test9", size=1, mtime=1234000000), - ManifestPathv2023_03_03(path="test10.txt", hash="test10", size=1, mtime=1234000000), - ManifestPathv2023_03_03(path="test11.txt", hash="test11", size=1, mtime=1234000000), - ManifestPathv2023_03_03(path="test/test12.txt", hash="test12", size=1, mtime=1234000000), - ] - - MANIFEST_VERSION_TO_ASSET_ROOT_PATHS: dict[ManifestVersion, list[BaseManifestPath]] = { - ManifestVersion.v2023_03_03: MANIFEST_PATHS_BY_ASSET_ROOT_v2023_03_03, - } - - def test_get_job_output_paths_by_asset_root( - self, farm_id, queue_id, manifest_version: ManifestVersion - ): - assert_get_job_output_paths_by_asset_root( - self.job_attachment_settings, - farm_id, - queue_id, - {"/test": self.MANIFEST_VERSION_TO_ASSET_ROOT_PATHS[manifest_version]}, - 14, - manifest_version, - ) - - def test_get_job_outputs_paths_by_asset_root_when_no_asset_root(self, farm_id, queue_id): - assert_get_job_output_paths_by_asset_root_when_no_asset_root_throws_error( - farm_id, queue_id, self.job_attachment_settings - ) - - def test_get_job_input_output_paths_by_asset_root( - self, farm_id, queue_id, manifest_version: ManifestVersion - ): - assert self.job.attachments is not None - assert_get_job_input_output_paths_by_asset_root( - self.job_attachment_settings, - self.job.attachments, - farm_id, - queue_id, - { - "/tmp": self.INPUT_MANIFEST_VERSION_TO_ASSET_ROOT_PATHS[manifest_version] - + self.MANIFEST_VERSION_TO_ASSET_ROOT_PATHS[manifest_version], - }, - 19, - manifest_version, - ) - - EXPECTED_DOWNLOAD_FILE_PATHS_RELATIVE = [ - "inputs/input1.txt", - "inputs/subdir/input2.txt", - "inputs/subdir/input3.txt", - "inputs/subdir/subdir2/input4.txt", - "inputs/input5.txt", - ] - - TARGET_PERMISSION_CHANGE_PATHS_RELATIVE = [ - "inputs/input1.txt", - ".", - "inputs", - "inputs/subdir/input2.txt", - "inputs/subdir", - "inputs/subdir/input3.txt", - "inputs/subdir/subdir2/input4.txt", - "inputs/subdir/subdir2", - "inputs/input5.txt", - ] - - @pytest.mark.skipif( - sys.platform == "win32", - reason="This test is for testing file permission changes in Posix-based OS.", - ) - def test_download_files_from_manifests_with_fs_permission_settings_posix( - self, - tmp_path: Path, - manifest_version: ManifestVersion, - ): - """ - Tests whether the files listed in the given manifest are downloaded correctly from the - S3 bucket. Also, verifies that the functions for changing file ownership and permissions - (i.e., chown & chmod for POSIX) are correctly called with the given permission settings. 
- """ - manifest_str = MANIFEST_VERSION_TO_INPUT_ASSET_MANIFESTS[manifest_version][ - 0 - ].manifests.decode("utf-8") - manifest = decode_manifest(manifest_str) - manifests_by_root = {str(tmp_path): manifest} - - fs_permission_settings = PosixFileSystemPermissionSettings( - os_user="test-user", - os_group="test-group", - dir_mode=0o20, - file_mode=0o20, - ) - - mock_on_downloading_files = MagicMock(return_value=True) - - # IF - with patch("shutil.chown") as mock_chown, patch("os.chmod") as mock_chmod: - _ = download_files_from_manifests( - s3_bucket=self.job_attachment_settings.s3BucketName, - manifests_by_root=manifests_by_root, - cas_prefix=self.job_attachment_settings.full_cas_prefix(), - fs_permission_settings=fs_permission_settings, - on_downloading_files=mock_on_downloading_files, - ) - - # THEN - # Ensure that `chown` and `chmod` are properly called with the given permission settings - # for the downloaded files (and directory) paths. - expected_changed_paths = [ - tmp_path / rel_path for rel_path in self.TARGET_PERMISSION_CHANGE_PATHS_RELATIVE - ] - - chown_expected_calls = [ - str(call(path, group="test-group")) for path in expected_changed_paths - ] - chown_actual_calls = [str(call_args) for call_args in mock_chown.call_args_list] - assert Counter(chown_actual_calls) == Counter(chown_expected_calls) - - chmod_expected_calls = [ - str(call(path, path.stat().st_mode | 0o20)) for path in expected_changed_paths - ] - chmod_actual_calls = [str(call_args) for call_args in mock_chmod.call_args_list] - assert Counter(chmod_actual_calls) == Counter(chmod_expected_calls) - - # Ensure that only the expected files are there and no extras. - expected_files = [ - tmp_path / rel_path for rel_path in self.EXPECTED_DOWNLOAD_FILE_PATHS_RELATIVE - ] - assert set(expected_files) == set( - [path for path in tmp_path.glob("**/*") if path.is_file()] - ) - - @pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for testing file permission changes in Windows.", - ) - def test_download_files_from_manifests_with_fs_permission_settings_windows( - self, - tmp_path: Path, - manifest_version: ManifestVersion, - ): - """ - Tests whether the files listed in the given manifest are downloaded correctly from the - S3 bucket. Also, verifies that the function for changing file ownership and permissions - is correctly called with the given permission settings. - """ - manifest_str = MANIFEST_VERSION_TO_INPUT_ASSET_MANIFESTS[manifest_version][ - 0 - ].manifests.decode("utf-8") - manifest = decode_manifest(manifest_str) - manifests_by_root = {str(tmp_path): manifest} - - fs_permission_settings = WindowsFileSystemPermissionSettings( - os_user="test-user", - dir_mode=WindowsPermissionEnum.FULL_CONTROL, - file_mode=WindowsPermissionEnum.FULL_CONTROL, - ) - - mock_on_downloading_files = MagicMock(return_value=True) - - # IF - with patch( - f"{deadline.__package__}.job_attachments.os_file_permission._change_permission_for_windows" - ) as mock_change_permission: - _ = download_files_from_manifests( - s3_bucket=self.job_attachment_settings.s3BucketName, - manifests_by_root=manifests_by_root, - cas_prefix=self.job_attachment_settings.full_cas_prefix(), - fs_permission_settings=fs_permission_settings, - on_downloading_files=mock_on_downloading_files, - ) - - # THEN - # Ensure that `_change_permission_for_windows` are properly called with the given - # permission settings for the downloaded files (and directory) paths. 
- expected_changed_paths = [ - tmp_path / rel_path for rel_path in self.TARGET_PERMISSION_CHANGE_PATHS_RELATIVE - ] - - mock_change_permission_expected_calls = [ - str( - call( - str(path), - "test-user", - WindowsPermissionEnum.FULL_CONTROL, - ) - ) - for path in expected_changed_paths - ] - mock_change_permission_actual_calls = [ - str(call_args) for call_args in mock_change_permission.call_args_list - ] - assert Counter(mock_change_permission_actual_calls) == Counter( - mock_change_permission_expected_calls - ) - - # Ensure that only the expected files are there and no extras. - expected_files = [ - tmp_path / rel_path for rel_path in self.EXPECTED_DOWNLOAD_FILE_PATHS_RELATIVE - ] - assert set(expected_files) == set( - [path for path in tmp_path.glob("**/*") if path.is_file()] - ) - - @pytest.mark.skipif( - sys.platform == "win32", - reason="This test is for testing file permission changes in Posix-based OS.", - ) - @pytest.mark.xfail( - not (has_posix_target_user() and has_posix_disjoint_user()), - reason="Must be running inside of the sudo_environment testing container.", - ) - def test_download_files_from_manifests_have_correct_group_posix( - self, - tmp_path: Path, - manifest_version: ManifestVersion, - posix_target_group: str, - posix_disjoint_group: str, - ): - """ - Tests whether the file system ownership and permissions of the downloaded files - are correctly changed on POSIX-based environment. - """ - import grp - - # Creates some files in the root directory that were not downloaded by Job Attachment. - Path(tmp_path / "inputs/subdir/subdir2").mkdir(parents=True, exist_ok=True) - random_paths = [ - tmp_path / "not_asset.txt", - tmp_path / "inputs/not_asset.txt", - tmp_path / "inputs/subdir/not_asset.txt", - tmp_path / "inputs/subdir/subdir2/not_asset.txt", - ] - for path in random_paths: - with open(str(path), "w") as f: - f.write("I am a pre-existing file, not downloaded by Job Attachment.") - - manifest_str = MANIFEST_VERSION_TO_INPUT_ASSET_MANIFESTS[manifest_version][ - 0 - ].manifests.decode("utf-8") - manifest = decode_manifest(manifest_str) - manifests_by_root = {str(tmp_path): manifest} - - fs_permission_settings = PosixFileSystemPermissionSettings( - os_user="test-user", - os_group=posix_target_group, - dir_mode=0o20, - file_mode=0o20, - ) - - mock_on_downloading_files = MagicMock(return_value=True) - - # IF - _ = download_files_from_manifests( - s3_bucket=self.job_attachment_settings.s3BucketName, - manifests_by_root=manifests_by_root, - cas_prefix=self.job_attachment_settings.full_cas_prefix(), - fs_permission_settings=fs_permission_settings, - on_downloading_files=mock_on_downloading_files, - ) - - # THEN - expected_changed_paths = [ - tmp_path / rel_path for rel_path in self.TARGET_PERMISSION_CHANGE_PATHS_RELATIVE - ] - - # Verify that the group ownership and permissions of files downloaded through Job Attachment - # have been appropriately modified. - # Also, confirm that a permission error occurs when attempting to change the group ownership - # of those files to a group other than the target group. 
- for path in expected_changed_paths: - file_stat = os.stat(str(path)) - updated_mode = file_stat.st_mode - assert updated_mode == updated_mode | 0o20 - - updated_group_name = grp.getgrgid(file_stat.st_gid).gr_name # type: ignore - assert updated_group_name == posix_target_group - - with pytest.raises(PermissionError): - shutil.chown(path, group=posix_disjoint_group) - - # For the files that were not downloaded through Job Attachment, confirm that the group ownership - # has not been changed to the target group. - for path in random_paths: - group_name = grp.getgrgid(os.stat(str(path)).st_gid).gr_name # type: ignore - assert group_name != posix_target_group - - @pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for testing file permission changes in Windows.", - ) - def test_download_files_from_manifests_have_correct_group_windows( - self, - tmp_path: Path, - manifest_version: ManifestVersion, - ): - """ - Tests whether the file system ownership and permissions of the downloaded files - are correctly changed on Windows environment. - """ - import win32security - import ntsecuritycon - - # Creates some files in the root directory that were not downloaded by Job Attachment. - Path(tmp_path / "inputs/subdir/subdir2").mkdir(parents=True, exist_ok=True) - random_paths = [ - tmp_path / "not_asset.txt", - tmp_path / "inputs/not_asset.txt", - tmp_path / "inputs/subdir/not_asset.txt", - tmp_path / "inputs/subdir/subdir2/not_asset.txt", - ] - for path in random_paths: - with open(str(path), "w") as f: - f.write("I am a pre-existing file, not downloaded by Job Attachment.") - - manifest_str = MANIFEST_VERSION_TO_INPUT_ASSET_MANIFESTS[manifest_version][ - 0 - ].manifests.decode("utf-8") - manifest = decode_manifest(manifest_str) - manifests_by_root = {str(tmp_path): manifest} - - # Use a builtin user 'Guest', so we can expect it to exist on any Windows machine - fs_permission_settings = WindowsFileSystemPermissionSettings( - os_user="Guest", - dir_mode=WindowsPermissionEnum.FULL_CONTROL, - file_mode=WindowsPermissionEnum.FULL_CONTROL, - ) - - mock_on_downloading_files = MagicMock(return_value=True) - - # IF - _ = download_files_from_manifests( - s3_bucket=self.job_attachment_settings.s3BucketName, - manifests_by_root=manifests_by_root, - cas_prefix=self.job_attachment_settings.full_cas_prefix(), - fs_permission_settings=fs_permission_settings, - on_downloading_files=mock_on_downloading_files, - ) - - # THEN - expected_changed_paths = [ - tmp_path / rel_path for rel_path in self.TARGET_PERMISSION_CHANGE_PATHS_RELATIVE - ] - - # Verify that the user ownership and permissions of files downloaded through Job Attachment - # have been appropriately modified. 
- for path in expected_changed_paths: - # Get the file's security information - sd = win32security.GetFileSecurity(str(path), win32security.DACL_SECURITY_INFORMATION) - # Get the discretionary access control list (DACL) - dacl = sd.GetSecurityDescriptorDacl() - # Get the permissions info from ACE - permission_mapping: dict[str, int] = {} - for ace_no in range(dacl.GetAceCount()): - trustee_sid = dacl.GetAce(ace_no)[2] - trustee_name, _, _ = win32security.LookupAccountSid(None, trustee_sid) - if trustee_name: - trustee = { - "TrusteeForm": win32security.TRUSTEE_IS_SID, - "TrusteeType": win32security.TRUSTEE_IS_USER, - "Identifier": trustee_sid, - } - result = dacl.GetEffectiveRightsFromAcl(trustee) - permission_mapping[trustee_name] = result - assert "Guest" in permission_mapping - assert permission_mapping["Guest"] == ntsecuritycon.FILE_ALL_ACCESS - - def test_download_task_output( - self, farm_id, queue_id, tmp_path: Path, manifest_version: ManifestVersion - ): - assert_download_task_output( - self.job_attachment_settings, - farm_id, - queue_id, - tmp_path, - expected_files={ - str(tmp_path): [ - tmp_path / "test1.txt", - tmp_path / "test" / "test2.txt", - tmp_path / "test" / "test3.txt", - tmp_path / "test4.txt", - ] - }, - expected_total_bytes=4, - manifest_version=manifest_version, - ) - - def test_download_step_output( - self, farm_id, queue_id, tmp_path: Path, manifest_version: ManifestVersion - ): - assert_download_step_output( - self.job_attachment_settings, - farm_id, - queue_id, - tmp_path, - expected_files={ - str(tmp_path): [ - tmp_path / "test1.txt", - tmp_path / "test" / "test2.txt", - tmp_path / "test" / "test3.txt", - tmp_path / "test4.txt", - tmp_path / "test13.txt", - tmp_path / "test" / "test14.txt", - tmp_path / "test5.txt", - tmp_path / "test" / "test6.txt", - ] - }, - expected_total_bytes=8, - manifest_version=manifest_version, - ) - - def test_download_job_output( - self, farm_id, queue_id, tmp_path: Path, manifest_version: ManifestVersion - ): - assert_download_job_output( - self.job_attachment_settings, - farm_id, - queue_id, - tmp_path, - expected_files={ - str(tmp_path): [ - tmp_path / "test1.txt", - tmp_path / "test" / "test2.txt", - tmp_path / "test" / "test3.txt", - tmp_path / "test4.txt", - tmp_path / "test13.txt", - tmp_path / "test" / "test14.txt", - tmp_path / "test5.txt", - tmp_path / "test" / "test6.txt", - tmp_path / "test7.txt", - tmp_path / "test" / "test8.txt", - tmp_path / "test" / "test9.txt", - tmp_path / "test10.txt", - tmp_path / "test11.txt", - tmp_path / "test" / "test12.txt", - ] - }, - expected_total_bytes=14, - manifest_version=manifest_version, - ) - - def test_download_files_in_directory( - self, farm_id, queue_id, tmp_path: Path, manifest_version: ManifestVersion - ): - assert self.job.attachments is not None - assert_download_files_in_directory( - self.job_attachment_settings, - self.job.attachments, - farm_id, - queue_id, - "test", - tmp_path, - expected_files={ - str(tmp_path): [ - tmp_path / "test" / "test2.txt", - tmp_path / "test" / "test3.txt", - tmp_path / "test" / "test14.txt", - tmp_path / "test" / "test6.txt", - tmp_path / "test" / "test8.txt", - tmp_path / "test" / "test9.txt", - tmp_path / "test" / "test12.txt", - ] - }, - expected_total_bytes=7, - manifest_version=manifest_version, - ) - - def test_OutputDownloader_get_output_paths_by_root( - self, - farm_id, - queue_id, - tmp_path: Path, - ): - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - 
return_value=str(tmp_path.resolve()), - ): - output_downloader = OutputDownloader( - s3_settings=self.job_attachment_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id="job-1", - step_id=None, - task_id=None, - ) - - assert output_downloader.get_output_paths_by_root() == { - str(tmp_path.resolve()): [ - "test/test12.txt", - "test/test14.txt", - "test/test2.txt", - "test/test3.txt", - "test/test6.txt", - "test/test8.txt", - "test/test9.txt", - "test1.txt", - "test10.txt", - "test11.txt", - "test13.txt", - "test4.txt", - "test5.txt", - "test7.txt", - ] - } - - def test_OutputDownloader_set_root_path(self, farm_id, queue_id, tmp_path: Path): - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - return_value=str(tmp_path.resolve()), - ): - output_downloader = OutputDownloader( - s3_settings=self.job_attachment_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id="job-1", - step_id=None, - task_id=None, - ) - - new_root_path = "/new_root_path" if sys.platform != "win32" else "C:\\new_root_path" - - output_downloader.set_root_path( - original_root=str(tmp_path.resolve()), new_root=new_root_path - ) - - assert output_downloader.get_output_paths_by_root() == { - new_root_path: [ - "test/test12.txt", - "test/test14.txt", - "test/test2.txt", - "test/test3.txt", - "test/test6.txt", - "test/test8.txt", - "test/test9.txt", - "test1.txt", - "test10.txt", - "test11.txt", - "test13.txt", - "test4.txt", - "test5.txt", - "test7.txt", - ] - } - - @pytest.mark.skipif( - is_windows_non_admin(), - reason="Windows requires Admin to create symlinks, skipping this test.", - ) - def test_OutputDownloader_set_root_path_with_symlinks(self, farm_id, queue_id, tmp_path: Path): - """ - Test that when a symlink path containing '..' is used as a new root. Without - resolving the symlink target, the absolute path with ".." removed is stored. - """ - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - return_value=str(tmp_path.resolve()), - ): - output_downloader = OutputDownloader( - s3_settings=self.job_attachment_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id="job-1", - step_id="step-1", - task_id="task-1-1", - session_action_id="sessionaction-9-9", - ) - - target_path = tmp_path / "target" - target_path.mkdir() - sym_path = tmp_path / "subfolder/../symlink_folder" - sym_path.parent.mkdir(parents=True, exist_ok=True) - sym_path.symlink_to(target_path, target_is_directory=True) - output_downloader.set_root_path( - original_root=str(tmp_path.resolve()), new_root=str(sym_path) - ) - - assert output_downloader.get_output_paths_by_root() == { - str(tmp_path / "symlink_folder"): [ - "test/test2.txt", - "test/test3.txt", - "test1.txt", - "test4.txt", - ] - } - - def test_OutputDownloader_set_root_path_wrong_root_throws_exception( - self, farm_id, queue_id, tmp_path: Path - ): - """ - Assert a ValueError is thrown when given a non-existent root path. 
- """ - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - return_value=str(tmp_path.resolve()), - ): - output_downloader = OutputDownloader( - s3_settings=self.job_attachment_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id="job-1", - step_id=None, - task_id=None, - ) - - with pytest.raises(ValueError): - output_downloader.set_root_path(original_root="/wrong_root", new_root="/new_root_path") - - def test_OutputDownloader_download_job_output_when_skip( - self, farm_id, queue_id, tmp_path: Path - ): - """ - When path conflicts occur during file download and the resolution method is set to SKIP, - test whether the files has actually been skipped. - Note: This test relies on `st_ctime` for checking if a file has been skipped. On Linux, - `st_ctime` represents the time of the last metadata change, but on Windows, it represents - the file creation time. So the skipping verification is only available on Linux. - """ - expected_files, output_downloader = self.download_outputs_check_expected_files_exist( - farm_id, queue_id, tmp_path - ) - # Record the last metadata modification times for each file. - modified_time_before_second_trial = [path.stat().st_ctime for path in expected_files] - # Re-download the files with the SKIP option. - output_downloader.download_job_output(file_conflict_resolution=FileConflictResolution.SKIP) - # Check that no additional files were added during the second download. - assert set(expected_files) == set( - [path for path in tmp_path.glob("**/*") if path.is_file()] - ) - - # (Test only on Linux system) Record the last metadata modification times again. - # Since the second download with the SKIP option should have skipped the files, - # the modification times should be the same before and after the second download. - if sys.platform == "linux": - modified_time_after_second_trial = [path.stat().st_ctime for path in expected_files] - assert modified_time_before_second_trial == modified_time_after_second_trial - - def download_outputs_check_expected_files_exist(self, farm_id, queue_id, tmp_path): - expected_files = [ - tmp_path / "test1.txt", - tmp_path / "test" / "test2.txt", - tmp_path / "test" / "test3.txt", - tmp_path / "test4.txt", - tmp_path / "test13.txt", - tmp_path / "test" / "test14.txt", - tmp_path / "test5.txt", - tmp_path / "test" / "test6.txt", - tmp_path / "test7.txt", - tmp_path / "test" / "test8.txt", - tmp_path / "test" / "test9.txt", - tmp_path / "test10.txt", - tmp_path / "test11.txt", - tmp_path / "test" / "test12.txt", - ] - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - return_value=str(tmp_path.resolve()), - ): - output_downloader = OutputDownloader( - s3_settings=self.job_attachment_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id="job-1", - step_id=None, - task_id=None, - ) - # First download the files and check if the files are there. - # (Ensure that only the expected files are there and no extras.) - output_downloader.download_job_output() - assert set(expected_files) == set( - [path for path in tmp_path.glob("**/*") if path.is_file()] - ) - return expected_files, output_downloader - - def test_OutputDownloader_download_job_output_when_overwrite( - self, farm_id, queue_id, tmp_path: Path - ): - """ - When path conflicts occur during file download and the resolution method is set to OVERWRITE, - test whether the files has actually been overwritten. 
- Note: This test relies on `st_ctime` for checking if a file has been overwritten. On Linux, - `st_ctime` represents the time of the last metadata change, but on Windows, it represents - the file creation time. So the overwriting verification is only available on Linux. - """ - expected_files, output_downloader = self.download_outputs_check_expected_files_exist( - farm_id, queue_id, tmp_path - ) - # Record the last metadata modification times for each file. - modified_time_before_overwrite = [path.stat().st_ctime for path in expected_files] - # Re-download the files with the OVERWRITE option. - output_downloader.download_job_output( - file_conflict_resolution=FileConflictResolution.OVERWRITE - ) - # Check that no additional files were added during the second download. - assert set(expected_files) == set( - [path for path in tmp_path.glob("**/*") if path.is_file()] - ) - - # (Test only on Linux system) Record the last metadata modification times again. - # The modification times before and after the second download should be different. - if sys.platform == "linux": - modified_time_after_overwrite = [path.stat().st_ctime for path in expected_files] - for time_before, time_after in zip( - modified_time_before_overwrite, modified_time_after_overwrite - ): - assert time_before < time_after - - def test_OutputDownloader_download_job_output_when_create_copy( - self, farm_id, queue_id, tmp_path: Path - ): - expected_files = [ - tmp_path / "test1.txt", - tmp_path / "test" / "test2.txt", - tmp_path / "test" / "test3.txt", - tmp_path / "test4.txt", - tmp_path / "test13.txt", - tmp_path / "test" / "test14.txt", - tmp_path / "test5.txt", - tmp_path / "test" / "test6.txt", - tmp_path / "test7.txt", - tmp_path / "test" / "test8.txt", - tmp_path / "test" / "test9.txt", - tmp_path / "test10.txt", - tmp_path / "test11.txt", - tmp_path / "test" / "test12.txt", - ] - - expected_files_after_create_copy = [ - tmp_path / "test1 (1).txt", - tmp_path / "test" / "test2 (1).txt", - tmp_path / "test" / "test3 (1).txt", - tmp_path / "test4 (1).txt", - tmp_path / "test13 (1).txt", - tmp_path / "test" / "test14 (1).txt", - tmp_path / "test5 (1).txt", - tmp_path / "test" / "test6 (1).txt", - tmp_path / "test7 (1).txt", - tmp_path / "test" / "test8 (1).txt", - tmp_path / "test" / "test9 (1).txt", - tmp_path / "test10 (1).txt", - tmp_path / "test11 (1).txt", - tmp_path / "test" / "test12 (1).txt", - ] - - expected_files_after_create_copy.extend(expected_files) - - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - return_value=str(tmp_path.resolve()), - ): - output_downloader = OutputDownloader( - s3_settings=self.job_attachment_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id="job-1", - step_id=None, - task_id=None, - ) - - # First download the files and check if the files are there. - output_downloader.download_job_output() - assert set(expected_files) == set( - [path for path in tmp_path.glob("**/*") if path.is_file()] - ) - # Re-download the files with the CREATE_COPY option. 
- output_downloader.download_job_output( - file_conflict_resolution=FileConflictResolution.CREATE_COPY - ) - assert set(expected_files_after_create_copy) == set( - [path for path in tmp_path.glob("**/*") if path.is_file()] - ) - - def test_OutputDownloader_download_job_output_unknown_resolution_throws_exception( - self, farm_id, queue_id, tmp_path: Path - ): - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - return_value=str(tmp_path.resolve()), - ): - output_downloader = OutputDownloader( - s3_settings=self.job_attachment_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id="job-1", - step_id=None, - task_id=None, - ) - - output_downloader.download_job_output() - with pytest.raises(ValueError): - output_downloader.download_job_output( - file_conflict_resolution=FileConflictResolution(99) - ) - - def test_OutputDownloader_download_job_output_to_new_asset_root( - self, farm_id, queue_id, tmp_path: Path - ): - expected_files = [ - tmp_path / "test1.txt", - tmp_path / "test" / "test2.txt", - tmp_path / "test" / "test3.txt", - tmp_path / "test4.txt", - tmp_path / "test13.txt", - tmp_path / "test" / "test14.txt", - tmp_path / "test5.txt", - tmp_path / "test" / "test6.txt", - tmp_path / "test7.txt", - tmp_path / "test" / "test8.txt", - tmp_path / "test" / "test9.txt", - tmp_path / "test10.txt", - tmp_path / "test11.txt", - tmp_path / "test" / "test12.txt", - ] - - with patch( - f"{deadline.__package__}.job_attachments.download._get_asset_root_from_metadata", - return_value="/test_root", - ): - output_downloader = OutputDownloader( - s3_settings=self.job_attachment_settings, - farm_id=farm_id, - queue_id=queue_id, - job_id="job-1", - step_id=None, - task_id=None, - ) - - output_downloader.set_root_path("/test_root", str(tmp_path.resolve())) - output_downloader.download_job_output() - assert set(expected_files) == set( - [path for path in tmp_path.glob("**/*") if path.is_file()] - ) - - @pytest.mark.skipif( - sys.platform == "win32", - reason="This test is for paths in POSIX path format and will be skipped on Windows.", - ) - @pytest.mark.parametrize( - "outputs_by_root", - [ - { - "/local/home": ManifestPathGroup( - total_bytes=1, - files_by_hash_alg={ - HashAlgorithm.XXH128: [ - ManifestPathv2023_03_03( - path="../inputs/input1.txt", hash="a", size=1, mtime=1 - ) - ], - }, - ), - }, - { - "/local/home": ManifestPathGroup( - total_bytes=1, - files_by_hash_alg={ - HashAlgorithm.XXH128: [ - ManifestPathv2023_03_03( - path="/inputs/input1.txt", hash="a", size=1, mtime=1 - ) - ], - }, - ), - }, - { - "home": ManifestPathGroup( - total_bytes=1, - files_by_hash_alg={ - HashAlgorithm.XXH128: [ - ManifestPathv2023_03_03( - path="/inputs/input1.txt", hash="a", size=1, mtime=1 - ) - ], - }, - ), - }, - { - "/local/home": ManifestPathGroup( - total_bytes=1, - files_by_hash_alg={ - HashAlgorithm.XXH128: [ - ManifestPathv2023_03_03(path="////", hash="a", size=1, mtime=1) - ], - }, - ), - }, - ], - ) - def test_OutputDownloader_download_job_output_posix_invalid_file_path_fails( - self, farm_id, queue_id, outputs_by_root: dict[str, ManifestPathGroup] - ): - self.create_output_downloaded_and_validate_path(farm_id, outputs_by_root, queue_id) - - def create_output_downloaded_and_validate_path(self, farm_id, outputs_by_root, queue_id): - with patch( - f"{deadline.__package__}.job_attachments.download.get_job_output_paths_by_asset_root", - return_value=outputs_by_root, - ): - output_downloader = OutputDownloader( - s3_settings=self.job_attachment_settings, - 
farm_id=farm_id, - queue_id=queue_id, - job_id="job-1", - step_id=None, - task_id=None, - ) - with patch( - f"{deadline.__package__}.job_attachments.download.download_files", return_value=[] - ), pytest.raises((PathOutsideDirectoryError, ValueError)): - output_downloader.download_job_output() - - @pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for paths in Windows path format and will be skipped on non-Windows.", - ) - @pytest.mark.parametrize( - "outputs_by_root", - [ - { - "C:/Users": ManifestPathGroup( - total_bytes=1, - files_by_hash_alg={ - HashAlgorithm.XXH128: [ - ManifestPathv2023_03_03( - path="../inputs/input1.txt", hash="a", size=1, mtime=1 - ) - ], - }, - ), - }, - { - "C:/Users": ManifestPathGroup( - total_bytes=1, - files_by_hash_alg={ - HashAlgorithm.XXH128: [ - ManifestPathv2023_03_03( - path="C:/inputs/input1.txt", hash="a", size=1, mtime=1 - ) - ], - }, - ), - }, - { - "/C:": ManifestPathGroup( - total_bytes=1, - files_by_hash_alg={ - HashAlgorithm.XXH128: [ - ManifestPathv2023_03_03( - path="inputs/input1.txt", hash="a", size=1, mtime=1 - ) - ], - }, - ), - }, - { - "C:/Users": ManifestPathGroup( - total_bytes=1, - files_by_hash_alg={ - HashAlgorithm.XXH128: [ - ManifestPathv2023_03_03(path="////", hash="a", size=1, mtime=1) - ], - }, - ), - }, - ], - ) - def test_OutputDownloader_download_job_output_windows_invalid_file_path_fails( - self, farm_id, queue_id, outputs_by_root: dict[str, ManifestPathGroup] - ): - self.create_output_downloaded_and_validate_path(farm_id, outputs_by_root, queue_id) - - def test_get_asset_root_from_metadata_returns_none_if_not_found(self): - assert _get_asset_root_from_metadata(metadata={}) is None - - def test_get_manifest_from_s3_error_message_on_access_denied(self): - """ - Test if the function raises the expected exception with a proper error message - when S3 client's download_fileobj returns an Access Denied (403) error. - """ - s3_client = boto3.client("s3") - stubber = Stubber(s3_client) - stubber.add_client_error( - "get_object", - service_error_code="AccessDenied", - service_message="Access Denied", - http_status_code=403, - ) - - with stubber, patch( - f"{deadline.__package__}.job_attachments.download.get_s3_client", return_value=s3_client - ): - with pytest.raises(JobAttachmentsS3ClientError) as exc: - get_manifest_from_s3("test-key", "test-bucket") - assert isinstance(exc.value.__cause__, ClientError) - assert ( - exc.value.__cause__.response["ResponseMetadata"]["HTTPStatusCode"] == 403 - # type: ignore[attr-defined] - ) - assert ( - "Error downloading binary file in bucket 'test-bucket', Target key or prefix: 'test-key', " - "HTTP Status Code: 403, Forbidden or Access denied. " - ) in str(exc.value) - - def test_get_manifest_from_s3_error_message_on_timeout(self): - """ - Test that the appropriate error is raised when a ReadTimeoutError occurs - during an S3 client's download_fileobj call. - """ - mock_s3_client = MagicMock() - mock_s3_client.get_object.side_effect = ReadTimeoutError(endpoint_url="test_url") - - with patch( - f"{deadline.__package__}.job_attachments.download.get_s3_client", - return_value=mock_s3_client, - ): - with pytest.raises(AssetSyncError) as exc: - get_manifest_from_s3("test-key", "test-bucket") - assert isinstance(exc.value.__cause__, BotoCoreError) - assert ( - "An issue occurred with AWS service request while downloading binary file: " - 'Read timeout on endpoint URL: "test_url"\n' - "This could be due to temporary issues with AWS, internet connection, or your AWS credentials. 
" - "Please verify your credentials and network connection. If the problem persists, try again later" - " or contact support for further assistance." - ) in str(exc.value) - - def test_get_tasks_manifests_keys_from_s3_error_message_on_access_denied(self): - """ - Test if the function raises the expected exception with a proper error message - when S3 client's list_objects_v2 returns an Access Denied (403) error. - """ - s3_client = boto3.client("s3") - stubber = Stubber(s3_client) - stubber.add_client_error( - "list_objects_v2", - service_error_code="AccessDenied", - service_message="Access Denied", - http_status_code=403, - ) - - with stubber, patch( - f"{deadline.__package__}.job_attachments.download.get_s3_client", return_value=s3_client - ): - with pytest.raises(JobAttachmentsS3ClientError) as exc: - _get_tasks_manifests_keys_from_s3( - "assetRoot", - "test-bucket", - ) - assert isinstance(exc.value.__cause__, ClientError) - assert ( - exc.value.__cause__.response["ResponseMetadata"]["HTTPStatusCode"] == 403 - # type: ignore[attr-defined] - ) - assert ( - "Error listing bucket contents in bucket 'test-bucket', Target key or prefix: 'assetRoot', " - "HTTP Status Code: 403, Forbidden or Access denied. " - ) in str(exc.value) - - def test_get_tasks_manifests_keys_from_s3_error_message_on_timeout(self): - """ - Test that the appropriate error is raised when S3 client's get_paginator call triggers - a ReadTimeoutError while getting the keys of task output manifests from S3. - """ - mock_s3_client = MagicMock() - mock_s3_client.get_paginator.side_effect = ReadTimeoutError(endpoint_url="test_url") - - with patch( - f"{deadline.__package__}.job_attachments.download.get_s3_client", - return_value=mock_s3_client, - ): - with pytest.raises(AssetSyncError) as exc: - _get_tasks_manifests_keys_from_s3( - "assetRoot", - "test-bucket", - ) - assert isinstance(exc.value.__cause__, BotoCoreError) - assert ( - "An issue occurred with AWS service request while listing bucket contents: " - 'Read timeout on endpoint URL: "test_url"\n' - "This could be due to temporary issues with AWS, internet connection, or your AWS credentials. " - "Please verify your credentials and network connection. If the problem persists, try again later" - " or contact support for further assistance." - ) in str(exc.value) - - def test_download_file_error_message_on_access_denied(self): - """ - Test if the function raises the expected exception with a proper error message - when S3 client's download_file returns an Access Denied (403) error. 
- """ - s3_client = boto3.client("s3") - stubber = Stubber(s3_client) - stubber.add_client_error( - "head_object", - service_error_code="AccessDenied", - service_message="Access Denied", - http_status_code=403, - ) - - mock_lock = MagicMock() - mock_collision_dict = MagicMock() - - file_path = ManifestPathv2023_03_03( - path="inputs/input1.txt", hash="input1", size=1, mtime=1234000000 - ) - - with stubber, patch( - f"{deadline.__package__}.job_attachments.download.get_s3_client", return_value=s3_client - ), patch(f"{deadline.__package__}.job_attachments.download.Path.mkdir"): - with pytest.raises(JobAttachmentsS3ClientError) as exc: - download_file( - file_path, - HashAlgorithm.XXH128, - "/home/username/assets", - mock_lock, - mock_collision_dict, - "test-bucket", - "rootPrefix/Data", - s3_client, - ) - assert isinstance(exc.value.__cause__, ClientError) - assert ( - exc.value.__cause__.response["ResponseMetadata"]["HTTPStatusCode"] == 403 - # type: ignore[attr-defined] - ) - assert ( - "Error downloading file in bucket 'test-bucket', Target key or prefix: 'rootPrefix/Data/input1.xxh128', " - "HTTP Status Code: 403, Forbidden or Access denied. " - ) in str(exc.value) - failed_file_path = Path("/home/username/assets/inputs/input1.txt") - assert (f"(Failed to download the file to {str(failed_file_path)})") in str(exc.value) - mock_lock.assert_not_called() - mock_collision_dict.assert_not_called() - - def test_download_file_error_message_on_timeout(self): - """ - Test that the appropriate error is raised when a ReadTimeoutError occurs - during a transfer manager's download operation. - """ - mock_s3_client = MagicMock() - mock_future = MagicMock() - mock_transfer_manager = MagicMock() - mock_transfer_manager.download.return_value = mock_future - mock_future.result.side_effect = ReadTimeoutError(endpoint_url="test_url") - mock_lock = MagicMock() - mock_collision_dict = MagicMock() - - file_path = ManifestPathv2023_03_03( - path="inputs/input1.txt", hash="input1", size=1, mtime=1234000000 - ) - - with patch( - f"{deadline.__package__}.job_attachments.download.get_s3_client", - return_value=mock_s3_client, - ), patch( - f"{deadline.__package__}.job_attachments.download.get_s3_transfer_manager", - return_value=mock_transfer_manager, - ), patch(f"{deadline.__package__}.job_attachments.download.Path.mkdir"): - with pytest.raises(AssetSyncError) as exc: - download_file( - file_path, - HashAlgorithm.XXH128, - "/home/username/assets", - mock_lock, - mock_collision_dict, - "test-bucket", - "rootPrefix/Data", - mock_s3_client, - ) - assert isinstance(exc.value.__cause__, BotoCoreError) - assert ( - "An issue occurred with AWS service request while downloading file: " - 'Read timeout on endpoint URL: "test_url"\n' - "This could be due to temporary issues with AWS, internet connection, or your AWS credentials. " - "Please verify your credentials and network connection. If the problem persists, try again later" - " or contact support for further assistance." 
- ) in str(exc.value) - mock_lock.assert_not_called() - mock_collision_dict.assert_not_called() - - @pytest.mark.skipif( - sys.platform == "win32", - reason="This test is for Linux path only.", - ) - def test_windows_long_path_exception_PosixOS(self): - mock_s3_client = MagicMock() - mock_future = MagicMock() - mock_transfer_manager = MagicMock() - mock_transfer_manager.download.return_value = mock_future - mock_future.result.side_effect = Exception("Test exception") - mock_lock = MagicMock() - mock_collision_dict = MagicMock() - - file_path = ManifestPathv2023_03_03( - path="very/long/input/to/test/windows/max/file/path/for/error/handling/when/downloading/assest/from/job/attachment.txt", - hash="path", - size=1, - mtime=1234000000, - ) - - local_path = "Users/path/to/a/very/long/file/path/that/exceeds/the/windows/max/path/length/for/testing/max/file/path/error/handling/when/download/or/syncing/assest/using/job/attachment" - - self.download_file_and_check_exception( - file_path, - local_path, - mock_collision_dict, - mock_lock, - mock_s3_client, - mock_transfer_manager, - ) - - def download_file_and_check_exception( - self, - file_path, - local_path, - mock_collision_dict, - mock_lock, - mock_s3_client, - mock_transfer_manager, - ): - with patch( - f"{deadline.__package__}.job_attachments.download.get_s3_client", - return_value=mock_s3_client, - ), patch( - f"{deadline.__package__}.job_attachments.download.get_s3_transfer_manager", - return_value=mock_transfer_manager, - ), patch(f"{deadline.__package__}.job_attachments.download.Path.mkdir"): - with pytest.raises(AssetSyncError) as exc: - download_file( - file_path, - HashAlgorithm.XXH128, - local_path, - mock_lock, - mock_collision_dict, - "test-bucket", - "rootPrefix/Data", - mock_s3_client, - ) - expected_message = "Test exception" - assert str(exc.value) == expected_message - mock_lock.assert_not_called() - mock_collision_dict.assert_not_called() - - @pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for Windows path only.", - ) - def test_windows_long_path(self): - mock_s3_client = MagicMock() - mock_future = MagicMock() - mock_transfer_manager = MagicMock() - mock_transfer_manager.download.return_value = mock_future - mock_future.result.side_effect = Exception("Test exception") - mock_lock = MagicMock() - mock_collision_dict = MagicMock() - - file_path = ManifestPathv2023_03_03( - path="very/long/input/to/test/windows/max/file/path/for/error/handling/when/downloading/assest/from/job/attachment.txt", - hash="path", - size=1, - mtime=1234000000, - ) - - local_path = "C:\\path\\to\\a\\very\\long\\file\\path\\that\\exceeds\\the\\windows\\max\\path\\length\\for\\testing\\max\\file\\path\\error\\handling\\when\\download\\or\\syncing\\assest\\using\\job\\attachment" - - self.download_file_and_check_exception( - file_path, - local_path, - mock_collision_dict, - mock_lock, - mock_s3_client, - mock_transfer_manager, - ) - - @pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for Windows path only.", - ) - def test_windows_long_path_UNC_notation_WindowsOS(self): - ( - file_path, - local_path, - mock_collision_dict, - mock_lock, - mock_s3_client, - mock_transfer_manager, - ) = self.setup_mocks_and_file_path() - - with patch( - f"{deadline.__package__}.job_attachments.download.get_s3_client", - return_value=mock_s3_client, - ), patch( - f"{deadline.__package__}.job_attachments.download.get_s3_transfer_manager", - return_value=mock_transfer_manager, - ), patch( - 
f"{deadline.__package__}.job_attachments._utils._is_windows_long_path_registry_enabled", - return_value=False, - ), patch(f"{deadline.__package__}.job_attachments.download.Path.mkdir"): - with pytest.raises(AssetSyncError) as exc: - download_file( - file_path, - HashAlgorithm.XXH128, - local_path, - mock_lock, - mock_collision_dict, - "test-bucket", - "rootPrefix/Data", - mock_s3_client, - ) - expected_message = "Test exception" - assert str(exc.value) == expected_message - mock_lock.assert_not_called() - mock_collision_dict.assert_not_called() - - def setup_mocks_and_file_path(self): - mock_s3_client = MagicMock() - mock_future = MagicMock() - mock_transfer_manager = MagicMock() - mock_transfer_manager.download.return_value = mock_future - mock_future.result.side_effect = Exception("Test exception") - mock_lock = MagicMock() - mock_collision_dict = MagicMock() - file_path = ManifestPathv2023_03_03( - path="very/long/input/to/test/windows/max/file/path/for/error/handling/when/downloading/assest/from/job/attachment.txt", - hash="path", - size=1, - mtime=1234000000, - ) - local_path = "\\\\?\\C:\\path\\to\\a\\very\\long\\file\\path\\that\\exceeds\\the\\windows\\max\\path\\length\\for\\testing\\max\\file\\path\\error\\handling\\when\\download\\or\\syncing\\assest\\using\\job\\attachment" - return ( - file_path, - local_path, - mock_collision_dict, - mock_lock, - mock_s3_client, - mock_transfer_manager, - ) - - @pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for Windows path only.", - ) - def test_windows_long_path_UNC_notation_and_registry_WindowsOS(self): - ( - file_path, - local_path, - mock_collision_dict, - mock_lock, - mock_s3_client, - mock_transfer_manager, - ) = self.setup_mocks_and_file_path() - - with patch( - f"{deadline.__package__}.job_attachments.download.get_s3_client", - return_value=mock_s3_client, - ), patch( - f"{deadline.__package__}.job_attachments.download.get_s3_transfer_manager", - return_value=mock_transfer_manager, - ), patch( - f"{deadline.__package__}.job_attachments._utils._is_windows_long_path_registry_enabled", - return_value=True, - ), patch(f"{deadline.__package__}.job_attachments.download.Path.mkdir"): - with pytest.raises(AssetSyncError) as exc: - download_file( - file_path, - HashAlgorithm.XXH128, - local_path, - mock_lock, - mock_collision_dict, - "test-bucket", - "rootPrefix/Data", - mock_s3_client, - ) - - expected_message = "Test exception" - assert str(exc.value) == expected_message - mock_lock.assert_not_called() - mock_collision_dict.assert_not_called() - - def _test_create_copy_long_path_scenario( - self, base_dir, long_base_name, expect_unc_prefix=False - ): - """ - Common test logic for CREATE_COPY long path scenarios. - Tests that original path is not long but copy becomes long. 
- """ - original_file = Path(base_dir) / f"{long_base_name}.txt" - copy_file = Path(base_dir) / f"{long_base_name} (1).txt" - - # Verify our test scenario is correct - original_len = len(str(original_file)) + TEMP_DOWNLOAD_ADDED_CHARS_LENGTH - copy_len = len(str(copy_file)) + TEMP_DOWNLOAD_ADDED_CHARS_LENGTH - assert original_len < WINDOWS_MAX_PATH_LENGTH, ( - f"Original should NOT be long path: {original_len}" - ) - assert copy_len >= WINDOWS_MAX_PATH_LENGTH, f"Copy should become long path: {copy_len}" - - # Create test file path object for the manifest - file_path = ManifestPathv2023_03_03( - path=f"{long_base_name}.txt", hash="testhash", size=1, mtime=1234000000 - ) - - # Mock S3 operations to simulate successful download - mock_s3_client = MagicMock() - mock_future = MagicMock() - mock_transfer_manager = MagicMock() - mock_transfer_manager.download.return_value = mock_future - mock_future.result.return_value = None - - mock_lock = MagicMock() - mock_collision_dict: DefaultDict[str, int] = DefaultDict(int) - - with patch( - f"{deadline.__package__}.job_attachments.download.get_s3_client", - return_value=mock_s3_client, - ), patch( - f"{deadline.__package__}.job_attachments.download.get_s3_transfer_manager", - return_value=mock_transfer_manager, - ), patch( - f"{deadline.__package__}.job_attachments.download.get_account_id", - return_value="123456789012", - ), patch( - f"{deadline.__package__}.job_attachments._utils._is_windows_long_path_registry_enabled", - return_value=False, # Ensure UNC prefix is used for Windows - ), patch( - "pathlib.Path.is_file", - return_value=True, # Simulate that original file exists to force conflict - ), patch( - f"{deadline.__package__}.job_attachments.download._get_new_copy_file_path", - return_value=copy_file, - ), patch("pathlib.Path.mkdir"), patch("os.utime"): - # Call download_file with CREATE_COPY resolution - download_file( - file_path, - HashAlgorithm.XXH128, - str(base_dir), - mock_lock, - mock_collision_dict, - "test-bucket", - "rootPrefix/Data", - mock_s3_client, - file_conflict_resolution=FileConflictResolution.CREATE_COPY, - ) - - # Verify the download was called - download_calls = mock_transfer_manager.download.call_args_list - assert len(download_calls) == 1, "Should have made exactly one download call" - - download_call = download_calls[0] - # Get fileobj from positional args or kwargs - if len(download_call.args) >= 3: - fileobj_path = download_call.args[2] - else: - fileobj_path = download_call.kwargs.get("fileobj", "") - - # Platform-specific path format validation - if expect_unc_prefix: - # Windows: verify UNC prefix is used - assert fileobj_path.startswith("\\\\?\\"), ( - f"Copy file path should use UNC prefix for long paths, got: {fileobj_path}" - ) - - # Verify the underlying path length that triggered the conversion - underlying_path = fileobj_path.replace("\\\\?\\", "") - assert ( - len(underlying_path) + TEMP_DOWNLOAD_ADDED_CHARS_LENGTH - >= WINDOWS_MAX_PATH_LENGTH - ), ( - f"The underlying path + temp chars should be at/over Windows limit: {len(underlying_path) + 9}" - ) - else: - # POSIX: verify no UNC prefix is used - assert not fileobj_path.startswith("\\\\?\\"), ( - f"POSIX systems should not use UNC prefix, got: {fileobj_path}" - ) - - # Verify the path is the expected copy path - expected_copy_path = str(copy_file) - assert fileobj_path == expected_copy_path, ( - f"Should use normal path format on POSIX: expected {expected_copy_path}, got {fileobj_path}" - ) - - # Verify it contains the copy filename pattern - assert 
f"{long_base_name} (1).txt" in fileobj_path, ( - "Should contain the copy filename pattern" - ) - - @pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for Windows long path handling only.", - ) - def test_download_file_create_copy_becomes_long_path_windows(self): - """ - Test that when CREATE_COPY conflict resolution creates a filename that becomes a Windows long path, - download_file converts it to use the UNC prefix (\\?\\) format and successfully downloads the file. - """ - # Create a path that's just under the Windows limit, but becomes long with " (1)" - base_dir = "C:\\" + "a" * 100 # Directory part - long_base_name = "b" * 141 # Filename part - calculated to hit threshold - - # Use the common test logic with Windows-specific validation - self._test_create_copy_long_path_scenario(base_dir, long_base_name, expect_unc_prefix=True) - - @pytest.mark.skipif( - sys.platform == "win32", - reason="This test is for POSIX systems.", - ) - @pytest.mark.parametrize( - "dir_multiplier,filename_len", - [ - (14, 85) if sys.platform == "darwin" else (22, 72), - ], - ) - def test_download_file_create_copy_long_path_posix(self, dir_multiplier, filename_len): - """ - Test that CREATE_COPY conflict resolution works correctly on POSIX systems - with long filenames and actually downloads the file. - - The variables have been decided such that they cross the max path length of 260 - They are different for MacOS and Linux because of different temp directory lengths - """ - with tempfile.TemporaryDirectory() as tmp_dir: - tmp_path = Path(tmp_dir) - nested_dir = tmp_path / ("longdir" * dir_multiplier) - long_base_name = "a" * filename_len - - # Use the common test logic with POSIX-specific validation - self._test_create_copy_long_path_scenario( - nested_dir, long_base_name, expect_unc_prefix=False - ) - - -@pytest.mark.parametrize("manifest_version", [ManifestVersion.v2023_03_03]) -class TestFullDownloadPrefixesWithSlashes: - """ - Tests for downloads from cas when the queue prefixes are created. - """ - - @pytest.fixture(autouse=True) - def before_test( - self, - request, - create_s3_bucket: Callable[[str], None], - farm_id: str, - queue_id: str, - default_queue: Queue, - create_get_queue_response: Callable[[Queue], dict[str, Any]], - manifest_version: ManifestVersion, - ): - """ - Setup the default queue and s3 bucket for all asset tests. - Mark test with `no_setup` if you don't want this setup to run. - """ - if "no_setup" in request.keywords: - return - - self.queue = default_queue - assert self.queue.jobAttachmentSettings - self.queue.jobAttachmentSettings.rootPrefix = "test////////" - self.queue_response = create_get_queue_response(self.queue) - create_s3_bucket(self.queue.jobAttachmentSettings.s3BucketName) - - s3 = boto3.Session(region_name="us-west-2").resource("s3") # pylint: disable=invalid-name - bucket = s3.Bucket(self.queue.jobAttachmentSettings.s3BucketName) - - for i in range(1, 15): - bucket.upload_fileobj( - BytesIO(b"a"), - f"{self.queue.jobAttachmentSettings.rootPrefix}/Data/test{i}.xxh128", - ) - - for manifest in MANIFEST_VERSION_TO_MANIFESTS[manifest_version]: - bucket.upload_fileobj( - BytesIO(manifest.manifests), - f"{self.queue.jobAttachmentSettings.rootPrefix}/" - f"Manifests/{farm_id}/{queue_id}/{manifest.prefix}", - ) - - # Put random junk in the outputs prefix to make sure it isn't downloaded. 
- bucket.upload_fileobj( - BytesIO(b"a"), - f"{self.queue.jobAttachmentSettings.rootPrefix}/" - f"Manifests/{farm_id}/{queue_id}/job-1/step-1/task-1-1/junk", - ) - - bucket.upload_fileobj( - BytesIO(b"a"), - f"{self.queue.jobAttachmentSettings.rootPrefix}/" - f"Manifests/{farm_id}/{queue_id}/job-1/step-1/junk.json", - ) - - bucket.upload_fileobj( - BytesIO(b"a"), - f"{self.queue.jobAttachmentSettings.rootPrefix}/" - f"Manifests/{farm_id}/{queue_id}/job-1/junk2.json", - ) - - def test_download_task_output_prefixes_with_slashes( - self, farm_id, queue_id, tmp_path: Path, manifest_version: ManifestVersion - ): - assert self.queue.jobAttachmentSettings - assert_download_task_output( - self.queue.jobAttachmentSettings, - farm_id, - queue_id, - tmp_path, - expected_files={ - str(tmp_path): [ - tmp_path / "test1.txt", - tmp_path / "test" / "test2.txt", - tmp_path / "test" / "test3.txt", - tmp_path / "test4.txt", - ] - }, - expected_total_bytes=4, - manifest_version=manifest_version, - ) - - def test_download_step_prefixes_with_slashes( - self, farm_id, queue_id, tmp_path: Path, manifest_version: ManifestVersion - ): - assert self.queue.jobAttachmentSettings - assert_download_step_output( - self.queue.jobAttachmentSettings, - farm_id, - queue_id, - tmp_path, - expected_files={ - str(tmp_path): [ - tmp_path / "test1.txt", - tmp_path / "test" / "test2.txt", - tmp_path / "test" / "test3.txt", - tmp_path / "test4.txt", - tmp_path / "test13.txt", - tmp_path / "test" / "test14.txt", - tmp_path / "test5.txt", - tmp_path / "test" / "test6.txt", - ] - }, - expected_total_bytes=8, - manifest_version=manifest_version, - ) - - def test_download_job_prefixes_with_slashes( - self, farm_id, queue_id, tmp_path: Path, manifest_version: ManifestVersion - ): - assert self.queue.jobAttachmentSettings - assert_download_job_output( - self.queue.jobAttachmentSettings, - farm_id, - queue_id, - tmp_path, - expected_files={ - str(tmp_path): [ - tmp_path / "test1.txt", - tmp_path / "test" / "test2.txt", - tmp_path / "test" / "test3.txt", - tmp_path / "test4.txt", - tmp_path / "test13.txt", - tmp_path / "test" / "test14.txt", - tmp_path / "test5.txt", - tmp_path / "test" / "test6.txt", - tmp_path / "test7.txt", - tmp_path / "test" / "test8.txt", - tmp_path / "test" / "test9.txt", - tmp_path / "test10.txt", - tmp_path / "test11.txt", - tmp_path / "test" / "test12.txt", - ] - }, - expected_total_bytes=14, - manifest_version=manifest_version, - ) - - -@pytest.mark.skipif( - sys.platform == "win32", - reason="This test is for paths in POSIX path format and will be skipped on Windows.", -) -@pytest.mark.parametrize( - "root_path, output_paths", - [ - ("/local/home", ["test.png"]), - ("/local/home", ["outputs/test.png"]), - ("/local/home", ["../home/outputs/test.png"]), - ("/local/home/documents/..", ["outputs/test.png"]), - ("/local/home/documents/..", ["../home/outputs/test.png"]), - ("/////local/home", ["test.png"]), - ], -) -def test_ensure_paths_within_directory_posix_no_error(root_path: str, output_paths: list[str]): - _ensure_paths_within_directory(root_path, output_paths) - - -@pytest.mark.skipif( - sys.platform == "win32", - reason="This test is for paths in POSIX path format and will be skipped on Windows.", -) -@pytest.mark.parametrize( - "root_path, output_paths", - [ - ("/local/home", ["../test.png"]), - ("/local/home", ["outputs/../../test.png"]), - ("/local/home", ["../home/../outputs/test.png"]), - ("/local/home", ["/outputs/test.png"]), - ("local", ["local/outputs/test.png"]), - ("C:/Users", ["outputs/test.png"]), 
- ("", ["outputs/test.png"]), - ], -) -def test_ensure_paths_within_directory_posix_raises_error(root_path: str, output_paths: list[str]): - with pytest.raises((PathOutsideDirectoryError, ValueError)): - _ensure_paths_within_directory(root_path, output_paths) - - -@pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for paths in Windows path format and will be skipped on non-Windows.", -) -@pytest.mark.parametrize( - "root_path, output_paths", - [ - ("C:/Users", ["test.png"]), - ("C:/Users", ["outputs/test.png"]), - ("C:/Users", ["../Users/outputs/test.png"]), - ("C:/Users/Temp/..", ["outputs/test.png"]), - ("C:/Users/Temp/..", ["../Users/outputs/test.png"]), - ], -) -def test_ensure_paths_within_directory_windows_no_error(root_path: str, output_paths: list[str]): - _ensure_paths_within_directory(root_path, output_paths) - - -@pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for paths in Windows path format and will be skipped on non-Windows.", -) -@pytest.mark.parametrize( - "root_path, output_paths", - [ - ("C:/Users", ["../test.png"]), - ("C:/Users", ["test.png", "../test.png"]), - ("C:/Users", ["outputs/../../test.png"]), - ("C:/Users", ["../home/../outputs/test.png"]), - ("C:/Users", ["C:/Temp/outputs/test.png"]), - (":/Users", ["outputs/test.png"]), - ("/Users", ["outputs/test.png"]), - ("/local/home", ["outputs/test.png"]), - ("", ["outputs/test.png"]), - ], -) -def test_ensure_paths_within_directory_windows_raises_error( - root_path: str, output_paths: list[str] -): - with pytest.raises((PathOutsideDirectoryError, ValueError)): - _ensure_paths_within_directory(root_path, output_paths) - - -def test_merge_asset_manifests( - test_manifest_one: dict, test_manifest_two: dict, merged_manifest: dict -): - """ - Test that merging two manifests correctly overlays the 2nd on top of the 1st - """ - manifests = [ - decode_manifest(json.dumps(test_manifest_one)), - decode_manifest(json.dumps(test_manifest_two)), - ] - - actual_merged_manifest = merge_asset_manifests(manifests) - - assert decode_manifest(json.dumps(merged_manifest)) == actual_merged_manifest - - -def test_merge_asset_manifests_empty(): - """ - Test that merging an empty list returns None - """ - assert merge_asset_manifests([]) is None - - -def test_merge_asset_manifest_single(test_manifest_one: dict): - """ - Test that merging a single manifest returns the same manifest - """ - manifest = decode_manifest(json.dumps(test_manifest_one)) - actual_merged_manifest = merge_asset_manifests([manifest]) - - assert actual_merged_manifest == manifest - - -def on_downloading_files(progress: ProgressReportMetadata) -> bool: - return True - - -def test_download_files_from_manifests( - test_manifest_one: dict, - test_manifest_two: dict, -): - manifests: list[BaseAssetManifest] = [ - decode_manifest(json.dumps(test_manifest_one)), - decode_manifest(json.dumps(test_manifest_two)), - ] - - merged_manifest = merge_asset_manifests(manifests) - - assert merged_manifest - - downloaded_files: list[str] = [] - - def download_file(*args): - nonlocal downloaded_files - downloaded_files.append(args[0].path) - return (40, Path(args[0].path)) - - with patch( - f"{deadline.__package__}.job_attachments.download.download_file", side_effect=download_file - ), patch(f"{deadline.__package__}.job_attachments.download.get_s3_client"): - download_files_from_manifests( - s3_bucket="s3_settings.s3BucketName", - manifests_by_root={"/test": merged_manifest}, - cas_prefix="s3_settings.full_cas_prefix()", - 
session=boto3.Session(region_name="us-west-2"), - on_downloading_files=on_downloading_files, - ) - - assert sorted(downloaded_files) == ["a.txt", "b.txt", "c.txt", "d.txt"] - - -def test_handle_existing_vfs_no_mount_returns(test_manifest_one: dict): - """ - Test that handling an existing manifest for a non existent mount returns the manifest - """ - manifest = decode_manifest(json.dumps(test_manifest_one)) - with patch( - f"{deadline.__package__}.job_attachments.download.VFSProcessManager.is_mount", - return_value=False, - ) as mock_is_mount: - result_manifest = handle_existing_vfs( - manifest, Path("/some/session/dir"), "/not/a/mount", "test-user" - ) - mock_is_mount.assert_called_once_with("/not/a/mount") - assert manifest == result_manifest - - -def test_handle_existing_vfs_success( - test_manifest_one: dict, test_manifest_two: dict, merged_manifest: dict -): - """ - Test that handling an existing manifest for a mount which exists attempts to merge the manifests and - shut down the mount - """ - manifest_one = decode_manifest(json.dumps(test_manifest_one)) - manifest_two = decode_manifest(json.dumps(test_manifest_two)) - merged_decoded = decode_manifest(json.dumps(merged_manifest)) - session_path = Path("/some/session/dir") - with patch( - f"{deadline.__package__}.job_attachments.download.VFSProcessManager.is_mount", - return_value=True, - ) as mock_is_mount, patch( - f"{deadline.__package__}.job_attachments.download.VFSProcessManager.get_manifest_path_for_mount", - return_value="/some/manifest/path", - ) as mock_get_manifest_path, patch( - f"{deadline.__package__}.job_attachments.download._read_manifest_file", - return_value=manifest_one, - ) as mock_decode_manifest, patch( - f"{deadline.__package__}.job_attachments.download.VFSProcessManager.kill_process_at_mount", - ) as mock_kill_process: - result_manifest = handle_existing_vfs( - manifest_two, session_path, "/some/mount", "test-user" - ) - mock_is_mount.assert_called_once_with("/some/mount") - mock_get_manifest_path.assert_called_once_with( - session_dir=session_path, mount_point="/some/mount" - ) - mock_decode_manifest.assert_called_once_with("/some/manifest/path") - mock_kill_process.assert_called_once_with( - session_dir=session_path, mount_point="/some/mount", os_user="test-user" - ) - assert result_manifest == merged_decoded - - -@pytest.mark.skipif( - sys.platform == "win32", - reason="This VFS test is currently not valid for windows - VFS is a linux only feature currently.", -) -def test_mount_vfs_from_manifests( - test_manifest_one: dict, test_manifest_two: dict, merged_manifest: dict -): - """ - Test that handling an existing manifest for a mount which exists attempts to merge the manifests and - shut down the mount - """ - manifest_one = decode_manifest(json.dumps(test_manifest_one)) - manifest_two = decode_manifest(json.dumps(test_manifest_two)) - merged_decoded = decode_manifest(json.dumps(merged_manifest)) - temp_dir = tempfile.TemporaryDirectory() - temp_dir_path = Path(temp_dir.name) - manifests_by_root = {"/some/root/one": manifest_one, "/some/root/two": manifest_two} - fs_permissions = PosixFileSystemPermissionSettings("test-user", "test-group", 0o31, 0o66) - manifest_permissions = PosixFileSystemPermissionSettings( - fs_permissions.os_user, - fs_permissions.os_group, - VFS_MANIFEST_FOLDER_PERMISSIONS.dir_mode, - VFS_MANIFEST_FOLDER_PERMISSIONS.file_mode, - ) - - cache_path = temp_dir_path / VFS_CACHE_REL_PATH_IN_SESSION - manifest_path = temp_dir_path / VFS_MANIFEST_FOLDER_IN_SESSION - logs_path = temp_dir_path / 
VFS_LOGS_FOLDER_IN_SESSION - - with patch( - f"{deadline.__package__}.job_attachments.download._set_fs_group", - ) as mock_set_vs_group, patch( - f"{deadline.__package__}.job_attachments.download.handle_existing_vfs", - return_value=merged_decoded, - ) as mock_handle_existing, patch( - f"{deadline.__package__}.job_attachments.download._write_manifest_to_temp_file", - ) as mock_write_manifest, patch( - f"{deadline.__package__}.job_attachments.download.VFSProcessManager.start", - ) as mock_vfs_start: - mount_vfs_from_manifests( - "test-bucket", - manifests_by_root, - boto3_session=boto3.Session(region_name="us-west-2"), - session_dir=temp_dir_path, - os_env_vars={}, - fs_permission_settings=fs_permissions, - cas_prefix="cas/test", - ) - # Were the cache and manifest folders created - assert os.path.isdir(cache_path) - assert os.path.isdir(manifest_path) - - # - # Did we attempt to assign the expected permissions - mock_set_vs_group.assert_has_calls( - [ - call([str(cache_path / "cas/test")], str(cache_path), fs_permissions), - call([str(manifest_path)], str(manifest_path), manifest_permissions), - call([str(logs_path)], str(logs_path), fs_permissions), - ] - ) - - mock_handle_existing.assert_has_calls( - [ - call( - manifest=manifest_one, - session_dir=temp_dir_path, - mount_point="/some/root/one", - os_user="test-user", - ), - call( - manifest=manifest_two, - session_dir=temp_dir_path, - mount_point="/some/root/two", - os_user="test-user", - ), - ] - ) - - mock_write_manifest.assert_has_calls( - [call(merged_decoded, dir=manifest_path), call(merged_decoded, dir=manifest_path)] - ) - mock_vfs_start.assert_has_calls( - [call(session_dir=temp_dir_path), call(session_dir=temp_dir_path)] - ) - - -def test_get_manifests_by_session_action_id_task_based(): - s3_settings = JobAttachmentS3Settings(s3BucketName="test-bucket", rootPrefix="root") - - with patch( - "deadline.job_attachments.download._list_s3_objects_with_error_handling" - ) as mock_list: - mock_list.return_value = [ - { - "Key": "farm-0/queue-0/job-0/step-0/task-0/2025-05-06T02:58:03.824934Z_sessionaction-0-0/0_output" - } - ] - - with patch( - "deadline.job_attachments.download.get_asset_root_and_manifest_from_s3" - ) as mock_get: - mock_get.return_value = ("/test/root", MagicMock()) - - result = _get_manifests_by_session_action_id( - s3_settings, - "farm-0", - "queue-0", - "job-0", - "step-0", - "task-0", - "sessionaction-0-0", - None, - ) - - assert "/test/root" in result - mock_list.assert_called_once() - - -def test_get_manifests_by_session_action_id_chunked_fallback(): - s3_settings = JobAttachmentS3Settings(s3BucketName="test-bucket", rootPrefix="root") - - with patch( - "deadline.job_attachments.download._list_s3_objects_with_error_handling" - ) as mock_list: - mock_list.side_effect = [ - [], # Empty first call (task prefix) - [ - { - "Key": "farm-0/queue-0/job-0/step-0/2025-05-06T02:58:03.824934Z_sessionaction-0-0/0_output" - } - ], # Found on fallback (step prefix) - ] - - with patch( - "deadline.job_attachments.download.get_asset_root_and_manifest_from_s3" - ) as mock_get: - mock_get.return_value = ("/test/root", MagicMock()) - - result = _get_manifests_by_session_action_id( - s3_settings, - "farm-0", - "queue-0", - "job-0", - "step-0", - "task-0", - "sessionaction-0-0", - None, - ) - - assert "/test/root" in result - assert mock_list.call_count == 2 - - -def test_get_manifests_by_session_action_id_no_manifests_found(): - """Test _get_manifests_by_session_action_id when no manifests are found in either task or step prefix.""" 
- s3_settings = JobAttachmentS3Settings(s3BucketName="test-bucket", rootPrefix="root") - - with patch( - "deadline.job_attachments.download._list_s3_objects_with_error_handling" - ) as mock_list: - mock_list.side_effect = [JobAttachmentsError("Not found"), JobAttachmentsError("Not found")] - - result = _get_manifests_by_session_action_id( - s3_settings, "farm-0", "queue-0", "job-0", "step-0", "task-0", "sessionaction-0-0", None - ) - - assert result == {} - assert mock_list.call_count == 2 - - -def test_get_tasks_manifests_keys_chunked_and_task_based_with_latest(): - mock_s3_client = MagicMock() - mock_paginator = MagicMock() - mock_s3_client.get_paginator.return_value = mock_paginator - mock_paginator.paginate.return_value = [ - { - "Contents": [ - {"Key": "step-0/2025-05-06T02:58:03.824934Z_sessionaction-0-0/0_output"}, # Chunked - { - "Key": "step-0/task-0/2025-05-06T02:58:03.824934Z_sessionaction-0-1/0_output" - }, # Newer task-based - { - "Key": "step-0/task-0/2024-05-06T02:58:03.824934Z_sessionaction-1-1/0_output" - }, # Older task-based - ] - } - ] - - with patch("deadline.job_attachments.download.get_s3_client", return_value=mock_s3_client): - result = _get_tasks_manifests_keys_from_s3( - "prefix/", "bucket", None, select_latest_per_task=True - ) - - assert len(result) == 2 - assert "step-0/2025-05-06T02:58:03.824934Z_sessionaction-0-0/0_output" in result - assert "step-0/task-0/2025-05-06T02:58:03.824934Z_sessionaction-0-1/0_output" in result - - -def test_get_manifests_by_session_action_id_regex_matching(): - """Test that _get_manifests_by_session_action_id correctly matches session action IDs using regex.""" - s3_settings = JobAttachmentS3Settings(s3BucketName="test-bucket", rootPrefix="root") - - with patch( - "deadline.job_attachments.download._list_s3_objects_with_error_handling" - ) as mock_list: - # Mock S3 contents with various files, only one should match the session action ID - mock_list.return_value = [ - {"Key": "prefix/20241225T120000_sessionaction-1-0/manifest_output"}, # Should match - {"Key": "prefix/other_file"}, # Should not match - {"Key": "prefix/20241225T130000_sessionaction-2-0/manifest_output"}, # Should not match - {"Key": "prefix/no_output_suffix"}, # Should not match - ] - - with patch( - "deadline.job_attachments.download.get_asset_root_and_manifest_from_s3" - ) as mock_get: - mock_get.return_value = ("/test/root", MagicMock()) - - result = _get_manifests_by_session_action_id( - s3_settings, - "farm-0", - "queue-0", - "job-0", - "step-0", - "task-0", - "sessionaction-1-0", # This should only match the first file - None, - ) - - assert "/test/root" in result - # Should only call get_asset_root_and_manifest_from_s3 once for the matching file - mock_get.assert_called_once_with( - "prefix/20241225T120000_sessionaction-1-0/manifest_output", "test-bucket", None - ) - - -def test_get_tasks_manifests_keys_chunked_and_task_based_without_latest(): - mock_s3_client = MagicMock() - mock_paginator = MagicMock() - mock_s3_client.get_paginator.return_value = mock_paginator - mock_paginator.paginate.return_value = [ - { - "Contents": [ - {"Key": "step-0/2025-05-06T02:58:03.824934Z_sessionaction-0-0/0_output"}, # Chunked - { - "Key": "step-0/task-0/2025-05-06T02:58:03.824934Z_sessionaction-0-1/0_output" - }, # Newer task-based - { - "Key": "step-0/task-0/2024-05-06T02:58:03.824934Z_sessionaction-1-1/0_output" - }, # Older task-based - ] - } - ] - - with patch("deadline.job_attachments.download.get_s3_client", return_value=mock_s3_client): - result = 
_get_tasks_manifests_keys_from_s3( - "prefix/", "bucket", None, select_latest_per_task=False - ) - - assert len(result) == 3 - assert "step-0/2025-05-06T02:58:03.824934Z_sessionaction-0-0/0_output" in result - assert "step-0/task-0/2025-05-06T02:58:03.824934Z_sessionaction-0-1/0_output" in result - assert "step-0/task-0/2024-05-06T02:58:03.824934Z_sessionaction-1-1/0_output" in result - - -def test_get_output_manifests_by_asset_root_with_session_action_id(): - s3_settings = JobAttachmentS3Settings(s3BucketName="test-bucket", rootPrefix="root") - - with patch("deadline.job_attachments.download._get_manifests_by_session_action_id") as mock_get: - mock_get.return_value = {"/test/root": [MagicMock()]} - - result = get_output_manifests_by_asset_root( - s3_settings, "farm-1", "queue-1", "job-1", "step-1", "task-1", "session-1", None - ) - - assert "/test/root" in result - mock_get.assert_called_once_with( - s3_settings, "farm-1", "queue-1", "job-1", "step-1", "task-1", "session-1", None - ) - - -def test_get_output_manifests_by_asset_root_chronological_merge_chunked(): - """Test chronological merging of manifests for chunked-based paths with different timestamps.""" - s3_settings = JobAttachmentS3Settings(s3BucketName="test-bucket", rootPrefix="root") - - # Create two manifests with different content for the same asset root - older_manifest_dict = { - "hashAlg": "xxh128", - "manifestVersion": "2023-03-03", - "paths": [ - { - "hash": "a96ddfc33590cd7d2391f1972f66a72a", - "mtime": 1111111111111111, - "path": "file.txt", - "size": 10, - } - ], - "totalSize": 10, - } - newer_manifest_dict = { - "hashAlg": "xxh128", - "manifestVersion": "2023-03-03", - "paths": [ - { - "hash": "b96ddfc33590cd7d2391f1972f66a72b", - "mtime": 2222222222222222, - "path": "file.txt", - "size": 20, - } - ], - "totalSize": 20, - } - - older_manifest = decode_manifest(json.dumps(older_manifest_dict)) - newer_manifest = decode_manifest(json.dumps(newer_manifest_dict)) - - # Mock the S3 calls - with patch( - "deadline.job_attachments.download._get_tasks_manifests_keys_from_s3" - ) as mock_keys, patch( - "deadline.job_attachments.download._get_asset_root_and_manifest_from_s3_with_last_modified" - ) as mock_get_manifest: - # Two chunked session actions under same step with different timestamps - mock_keys.return_value = [ - "step-0/2024-05-06T02:58:03.824934Z_sessionaction-0-0/0_output", # Older - "step-0/2025-05-06T02:58:03.824934Z_sessionaction-1-0/0_output", # Newer - ] - - # Return manifests with timestamps - older first, then newer - mock_get_manifest.side_effect = [ - ("/test/root", datetime(2024, 5, 6, 2, 58, 3), older_manifest), - ("/test/root", datetime(2025, 5, 6, 2, 58, 3), newer_manifest), - ] - - result = get_output_manifests_by_asset_root( - s3_settings, "farm-1", "queue-1", "job-1", "step-1" - ) - - # Should have one asset root with one merged manifest - assert len(result) == 1 - assert "/test/root" in result - assert len(result["/test/root"]) == 1 - - # The merged manifest should contain the newer file (newer overwrites older) - merged_manifest = result["/test/root"][0] - assert len(merged_manifest.paths) == 1 - assert merged_manifest.paths[0].hash == "b96ddfc33590cd7d2391f1972f66a72b" - assert merged_manifest.paths[0].size == 20 - - -def test_get_output_manifests_by_asset_root_multiple_asset_roots(): - """Test chronological merging with multiple asset roots.""" - s3_settings = JobAttachmentS3Settings(s3BucketName="test-bucket", rootPrefix="root") - - # Create manifests for different asset roots - manifest_root1 
= decode_manifest( - json.dumps( - { - "hashAlg": "xxh128", - "manifestVersion": "2023-03-03", - "paths": [ - { - "hash": "a96ddfc33590cd7d2391f1972f66a72a", - "mtime": 1111111111111111, - "path": "file1.txt", - "size": 10, - } - ], - "totalSize": 10, - } - ) - ) - manifest_root2 = decode_manifest( - json.dumps( - { - "hashAlg": "xxh128", - "manifestVersion": "2023-03-03", - "paths": [ - { - "hash": "b96ddfc33590cd7d2391f1972f66a72b", - "mtime": 2222222222222222, - "path": "file2.txt", - "size": 20, - } - ], - "totalSize": 20, - } - ) - ) - - with patch( - "deadline.job_attachments.download._get_tasks_manifests_keys_from_s3" - ) as mock_keys, patch( - "deadline.job_attachments.download._get_asset_root_and_manifest_from_s3_with_last_modified" - ) as mock_get_manifest: - mock_keys.return_value = ["manifest1", "manifest2"] - mock_get_manifest.side_effect = [ - ("/root1", datetime(2024, 5, 6), manifest_root1), - ("/root2", datetime(2024, 5, 7), manifest_root2), - ] - - result = get_output_manifests_by_asset_root(s3_settings, "farm-1", "queue-1", "job-1") - - assert len(result) == 2 - assert "/root1" in result - assert "/root2" in result - assert len(result["/root1"]) == 1 - assert len(result["/root2"]) == 1 - - -def test_list_s3_objects_with_error_handling_no_contents(): - """Test _list_s3_objects_with_error_handling when S3 returns no Contents.""" - mock_s3_client = MagicMock() - mock_paginator = MagicMock() - mock_s3_client.get_paginator.return_value = mock_paginator - mock_paginator.paginate.return_value = [{}] # No Contents key - - with patch("deadline.job_attachments.download.get_s3_client", return_value=mock_s3_client): - with pytest.raises(JobAttachmentsError, match="Unable to find asset manifest"): - _list_s3_objects_with_error_handling("bucket", "prefix/", None) - - -def test_get_tasks_manifests_keys_chunked_only(): - """Test _get_tasks_manifests_keys_from_s3 with only chunked manifests (no task-based).""" - mock_s3_client = MagicMock() - mock_paginator = MagicMock() - mock_s3_client.get_paginator.return_value = mock_paginator - mock_paginator.paginate.return_value = [ - { - "Contents": [ - {"Key": "step-0/2025-05-06T02:58:03.824934Z_sessionaction-0-0/0_output"}, - {"Key": "step-0/2025-05-06T03:58:03.824934Z_sessionaction-1-0/0_output"}, - ] - } - ] - - with patch("deadline.job_attachments.download.get_s3_client", return_value=mock_s3_client): - result = _get_tasks_manifests_keys_from_s3( - "prefix/", "bucket", None, select_latest_per_task=True - ) - - assert len(result) == 2 - assert all("task-" not in key for key in result) - - -def test_merge_asset_manifests_sorted_same_timestamp(): - """Test _merge_asset_manifests_sorted_asc_by_last_modified with manifests having same timestamp.""" - # Create two manifests with same timestamp but different content - manifest1 = decode_manifest( - json.dumps( - { - "hashAlg": "xxh128", - "manifestVersion": "2023-03-03", - "paths": [ - { - "hash": "a96ddfc33590cd7d2391f1972f66a72a", - "mtime": 1111111111111111, - "path": "file.txt", - "size": 10, - } - ], - "totalSize": 10, - } - ) - ) - manifest2 = decode_manifest( - json.dumps( - { - "hashAlg": "xxh128", - "manifestVersion": "2023-03-03", - "paths": [ - { - "hash": "b96ddfc33590cd7d2391f1972f66a72b", - "mtime": 2222222222222222, - "path": "file.txt", - "size": 20, - } - ], - "totalSize": 20, - } - ) - ) - - same_timestamp = datetime(2024, 5, 6, 2, 58, 3) - manifests_with_timestamps = [ - (same_timestamp, manifest1), - (same_timestamp, manifest2), - ] - - result = 
_merge_asset_manifests_sorted_asc_by_last_modified(manifests_with_timestamps) - - # Should merge successfully even with same timestamps - assert result is not None - assert len(result.paths) == 1 - # The second manifest should overwrite the first (stable sort behavior) - assert result.paths[0].hash == "b96ddfc33590cd7d2391f1972f66a72b" - - -def test_get_new_copy_file_path_file_collisions(tmp_path: Path) -> None: - """Tests that copying files append the correct number""" - existing_files = [ - tmp_path / "test_col.txt", - tmp_path / "test_col (1).txt", - tmp_path / "test_col (2).txt", - tmp_path / "test_col (3).txt", - tmp_path / "test_skip.txt", - tmp_path / "test_skip (1).txt", - tmp_path / "test_skip (2).txt", - tmp_path / "test_skip (4).txt", - tmp_path / "test_original.txt", - tmp_path / "test_overlapping_path_but_original.txt", - tmp_path / "test_overlapping_path_but_original (1).txt", - ] - for path in existing_files: - with open(str(path), "w") as f: - f.write("I am a pre-existing file, not downloaded by Job Attachment.") - - assert set(existing_files) == set([path for path in tmp_path.glob("**/*") if path.is_file()]) - - expected_files = [ - tmp_path / "test_col (4).txt", - tmp_path / "test_skip (3).txt", - tmp_path / "test_original (1).txt", - tmp_path / "test_overlapping_path_but_original (2).txt", - tmp_path / "test_overlapping_path_but_original (1) (1).txt", - ] - - input_paths = [ - tmp_path / "test_col.txt", - tmp_path / "test_skip.txt", - tmp_path / "test_original.txt", - tmp_path / "test_overlapping_path_but_original.txt", - tmp_path / "test_overlapping_path_but_original (1).txt", - ] - - test_lock = Lock() - test_dict: DefaultDict[str, int] = DefaultDict(int) - results = [] - for input_path in input_paths: - results.append(_get_new_copy_file_path(input_path, test_lock, test_dict)) - - assert set(expected_files) == set(results) - assert test_dict[str(tmp_path / "test_col.txt")] == 4 - assert test_dict[str(tmp_path / "test_skip.txt")] == 3 - assert test_dict[str(tmp_path / "test_original.txt")] == 1 - assert test_dict[str(tmp_path / "test_overlapping_path_but_original.txt")] == 2 - assert test_dict[str(tmp_path / "test_overlapping_path_but_original (1).txt")] == 1 diff --git a/test/unit/deadline_job_attachments/test_glob.py b/test/unit/deadline_job_attachments/test_glob.py deleted file mode 100644 index 3eff70c59..000000000 --- a/test/unit/deadline_job_attachments/test_glob.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -import os -from deadline.job_attachments.exceptions import NonValidInputError -import pytest -from typing import List -from deadline.job_attachments._glob import _glob_paths, _process_glob_inputs - - -def test_glob_inputs_string(glob_config_file): - """ - Test case to test glob config as a string. - """ - glob: str - with open(glob_config_file) as f: - glob = f.read() - glob_config = _process_glob_inputs(glob) - assert "include.file" in glob_config.include_glob - assert "exclude.file" in glob_config.exclude_glob - - -def test_glob_inputs_file(glob_config_file): - """ - Test case to test glob config as a file. - """ - glob_config = _process_glob_inputs(glob_config_file) - assert "include.file" in glob_config.include_glob - assert "exclude.file" in glob_config.exclude_glob - - -def test_bad_glob_string(): - """ - Test case to test a bad glob config will raise an exception. 
- """ - glob: str = "This is not a json" - with pytest.raises(NonValidInputError): - _process_glob_inputs(glob) - - -def test_glob_path_default(test_glob_folder: str): - """ - Test case to glob all files. - """ - globbed_files: List[str] = _glob_paths(path=test_glob_folder) - - # There are 4 files - assert len(globbed_files) == 4 - assert os.path.join(os.sep, test_glob_folder, "include.txt") in globbed_files - assert os.path.join(os.sep, test_glob_folder, "exclude.txt") in globbed_files - assert os.path.join(os.sep, test_glob_folder, "nested", "nested_include.txt") in globbed_files - assert os.path.join(os.sep, test_glob_folder, "nested", "nested_exclude.txt") in globbed_files - - -def test_glob_path_default_include(test_glob_folder: str): - """ - Test case to glob all files. - """ - globbed_files: List[str] = _glob_paths( - path=test_glob_folder, include=["*include.txt", "*/*include.txt"] - ) - - # There are 2 files - assert len(globbed_files) == 2 - assert os.path.join(os.sep, test_glob_folder, "include.txt") in globbed_files - assert os.path.join(os.sep, test_glob_folder, "nested", "nested_include.txt") in globbed_files - - -def test_glob_path_exclude(test_glob_folder: str): - """ - Test case to glob all files and exclude some. - """ - globbed_files: List[str] = _glob_paths( - path=test_glob_folder, exclude=["*exclude.txt", "*/*exclude.txt"] - ) - - # There are 4 files - assert len(globbed_files) == 2 - assert os.path.join(os.sep, test_glob_folder, "include.txt") in globbed_files - assert os.path.join(os.sep, test_glob_folder, "nested", "nested_include.txt") in globbed_files - - -def test_glob_path_include_subdir(test_glob_folder: str): - """ - Test case to glob files only from the include sub directory. - """ - globbed_files: List[str] = _glob_paths(path=test_glob_folder, include=["nested/**"]) - - # There are 2 files - assert len(globbed_files) == 2 - assert os.path.join(os.sep, test_glob_folder, "nested", "nested_include.txt") in globbed_files - assert os.path.join(os.sep, test_glob_folder, "nested", "nested_exclude.txt") in globbed_files - - -def test_glob_path_include_nonexistent(test_glob_folder: str): - """ - Test case to glob files only from the include sub directory which does not exist. - """ - globbed_files: List[str] = _glob_paths(path=test_glob_folder, include=["nonexistent/**"]) - - # There are 0 files - assert len(globbed_files) == 0 - - -def test_glob_path_exclude_subdir(test_glob_folder: str): - """ - Test case to glob files and exclude sub directory. - """ - globbed_files: List[str] = _glob_paths(path=test_glob_folder, exclude=["nested/**"]) - - # There are 2 files - assert len(globbed_files) == 2 - assert os.path.join(os.sep, test_glob_folder, "include.txt") in globbed_files - assert os.path.join(os.sep, test_glob_folder, "exclude.txt") in globbed_files - - -def test_glob_path_exclude_nonexistent(test_glob_folder: str): - """ - Test case to glob files only exclude sub directory which does not exist. - """ - globbed_files: List[str] = _glob_paths(path=test_glob_folder, exclude=["nonexistent/**"]) - - # There are 2 files - assert len(globbed_files) == 4 diff --git a/test/unit/deadline_job_attachments/test_models.py b/test/unit/deadline_job_attachments/test_models.py deleted file mode 100644 index 5ff3e8fb8..000000000 --- a/test/unit/deadline_job_attachments/test_models.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-from unittest.mock import patch -from dataclasses import asdict - -from deadline.job_attachments.models import ( - PathFormat, - StorageProfileOperatingSystemFamily, - PathMappingRule, - JobAttachmentS3Settings, - ManifestSnapshot, - ManifestProperties, -) -from deadline.job_attachments.asset_manifests.hash_algorithms import HashAlgorithm -from deadline.job_attachments.exceptions import MalformedAttachmentSettingError - -import pytest -import json - - -class TestModels: - @pytest.mark.parametrize( - ("sys_os", "expected_output"), - [("win32", "windows"), ("darwin", "posix"), ("linux", "posix")], - ) - def test_get_host_path_format_string(self, sys_os: str, expected_output: str): - """ - Tests that the expected OS string is returned - """ - with patch("sys.platform", sys_os): - assert PathFormat.get_host_path_format_string() == expected_output - - @pytest.mark.parametrize( - ("input", "output"), - [ - ("windows", StorageProfileOperatingSystemFamily.WINDOWS), - ("WINDOWS", StorageProfileOperatingSystemFamily.WINDOWS), - ("wInDoWs", StorageProfileOperatingSystemFamily.WINDOWS), - ("linux", StorageProfileOperatingSystemFamily.LINUX), - ("LINUX", StorageProfileOperatingSystemFamily.LINUX), - ("LiNuX", StorageProfileOperatingSystemFamily.LINUX), - ("macos", StorageProfileOperatingSystemFamily.MACOS), - ("MACOS", StorageProfileOperatingSystemFamily.MACOS), - ("maCOs", StorageProfileOperatingSystemFamily.MACOS), - ], - ) - def test_storage_profile_operating_system_family_case( - self, input: str, output: StorageProfileOperatingSystemFamily - ) -> None: - """ - Tests that the correct enum types are created regardless of input string casing. - """ - assert StorageProfileOperatingSystemFamily(input) == output - - @pytest.mark.parametrize(("input"), [("linuxx"), ("darwin"), ("oSx"), ("MSDOS")]) - def test_storage_profile_operating_system_raises_type_error(self, input): - """ - Tests that a ValueError is raised when a non-valid string is given. - I.e. our case-insensitivity isn't causing false-positives. 
- """ - with pytest.raises(ValueError): - StorageProfileOperatingSystemFamily(input) - - def test_path_mapping_rules(self): - """ - Test rule construction and hashing the source attributes - """ - path_mapping = PathMappingRule( - source_path_format="posix", - source_path="/tmp", - destination_path="/local/home/test/output", - ) - assert "a0271fe0c8b1c1f99b82b442cd878122" == path_mapping.get_hashed_source_path( - HashAlgorithm.XXH128 - ) - - -class TestJobAttachmentS3SettingsModel: - @pytest.mark.parametrize( - ("input", "output"), - [ - ("s3BucketName/rootPrefix", JobAttachmentS3Settings("s3BucketName", "rootPrefix")), - ("s3BucketName/root/Prefix", JobAttachmentS3Settings("s3BucketName", "root/Prefix")), - ], - ) - def test_job_attachment_setting_root_path(self, input: str, output: JobAttachmentS3Settings): - """ - Test Job Attachment S3 Settings from and to S3 root path - """ - assert output == JobAttachmentS3Settings.from_root_path(input) - assert input == output.to_root_path() - - def test_job_attachment_setting_from_path_error(self): - """ - Test Job Attachment S3 Settings from malformed S3 root path - """ - with pytest.raises(MalformedAttachmentSettingError): - JobAttachmentS3Settings.from_root_path("s3BucketOnly") - - @pytest.mark.parametrize( - ("input", "output"), - [ - ("s3://BucketName/rootPrefix", JobAttachmentS3Settings("BucketName", "rootPrefix")), - ("s3://BucketName/root/Prefix", JobAttachmentS3Settings("BucketName", "root/Prefix")), - ], - ) - def test_job_attachment_setting_root_uri(self, input: str, output: JobAttachmentS3Settings): - """ - Test Job Attachment S3 Settings from and to S3 root uri - """ - assert output == JobAttachmentS3Settings.from_s3_root_uri(input) - assert input == output.to_s3_root_uri() - - def test_job_attachment_setting_from_s3_root_uri_error(self): - """ - Test Job Attachment S3 Settings from malformed S3 root uri - """ - with pytest.raises(MalformedAttachmentSettingError): - JobAttachmentS3Settings.from_s3_root_uri("s3://s3BucketOnly") - - def test_job_attachment_s3_settings_partial_session_action_manifest_prefix(self): - """ - Test JobAttachmentS3Settings partial_session_action_manifest_prefix method - """ - # Mock the _float_to_iso_datetime_string function to return a predictable value - with patch( - "deadline.job_attachments.models._float_to_iso_datetime_string", - return_value="2025-05-22T22:17:03.409012Z", - ): - # Call the partial_session_action_manifest_prefix method - result = JobAttachmentS3Settings.partial_session_action_manifest_prefix( - farm_id="farm1", - queue_id="queue1", - job_id="job1", - step_id="step1", - task_id="task1", - session_action_id="session1", - time=1747952223.4090126, # This is 2025-05-22T22:17:03.409012Z in timestamp - ) - - # Verify the result - expected = "farm1/queue1/job1/step1/task1/2025-05-22T22:17:03.409012Z_session1" - assert result == expected - - -class TestManifestSnapshotModel: - """Tests for the ManifestSnapshot class""" - - def test_manifest_snapshot_creation(self): - """ - Test ManifestSnapshot creation with required values - """ - # Test with specific values - snapshot = ManifestSnapshot(root="/path/to/root", manifest="manifest-path") - assert snapshot.root == "/path/to/root" - assert snapshot.manifest == "manifest-path" - - def test_manifest_snapshot_construct_from_json_missing_attribute(self): - """ - Test ManifestSnapshot error when missing attribute - """ - json_str = json.dumps({"manifest": "path/to/manifest"}) - assert isinstance(json_str, str) - - # Test deserialization - with 
pytest.raises(TypeError): - ManifestSnapshot(**json.loads(json_str)) - - def test_manifest_snapshot_json_serialization_special_characters(self): - """ - Test ManifestSnapshot serialization with special characters - """ - # Test with paths containing special characters - snapshot = ManifestSnapshot( - root='/path/with spaces/and"quotes"/and\\backslashes', - manifest="manifest-with-unicode-€-£-¥", - ) - - # Convert to JSON and back - json_str = json.dumps(asdict(snapshot)) - data = json.loads(json_str) - recreated = ManifestSnapshot(**data) - - # Verify the special characters are preserved - assert recreated.root == '/path/with spaces/and"quotes"/and\\backslashes' - assert recreated.manifest == "manifest-with-unicode-€-£-¥" - - -class TestManifestPropertiesModel: - """Tests for the ManifestProperties class""" - - def test_from_dict_minimal_required_fields(self): - """Test ManifestProperties.from_dict with only required fields""" - data = {"rootPath": "/test/path", "rootPathFormat": "posix"} - - manifest_props = ManifestProperties.from_dict(data) - - assert manifest_props.rootPath == "/test/path" - assert manifest_props.rootPathFormat == PathFormat.POSIX - assert manifest_props.fileSystemLocationName is None - assert manifest_props.inputManifestPath is None - assert manifest_props.inputManifestHash is None - assert manifest_props.outputRelativeDirectories is None - - def test_from_dict_all_fields_populated(self): - """Test ManifestProperties.from_dict with all fields populated""" - data = { - "rootPath": "/test/path", - "rootPathFormat": "windows", - "fileSystemLocationName": "test-location", - "inputManifestPath": "s3://bucket/manifest.json", - "inputManifestHash": "abc123hash", - "outputRelativeDirectories": ["output1", "output2", "subdir/output3"], - } - - manifest_props = ManifestProperties.from_dict(data) - - assert manifest_props.rootPath == "/test/path" - assert manifest_props.rootPathFormat == PathFormat.WINDOWS - assert manifest_props.fileSystemLocationName == "test-location" - assert manifest_props.inputManifestPath == "s3://bucket/manifest.json" - assert manifest_props.inputManifestHash == "abc123hash" - assert manifest_props.outputRelativeDirectories == ["output1", "output2", "subdir/output3"] - - @pytest.mark.parametrize( - ("path_format", "expected_enum"), - [ - ("posix", PathFormat.POSIX), - ("windows", PathFormat.WINDOWS), - ], - ) - def test_from_dict_path_format_variations(self, path_format: str, expected_enum: PathFormat): - """Test ManifestProperties.from_dict with different path format values""" - data = {"rootPath": "/test/path", "rootPathFormat": path_format} - - manifest_props = ManifestProperties.from_dict(data) - assert manifest_props.rootPathFormat == expected_enum - - def test_from_dict_missing_required_field_root_path(self): - """Test ManifestProperties.from_dict raises KeyError when rootPath is missing""" - data = {"rootPathFormat": "posix"} - - with pytest.raises(KeyError, match="rootPath"): - ManifestProperties.from_dict(data) - - def test_from_dict_missing_required_field_root_path_format(self): - """Test ManifestProperties.from_dict raises KeyError when rootPathFormat is missing""" - data = {"rootPath": "/test/path"} - - with pytest.raises(KeyError, match="rootPathFormat"): - ManifestProperties.from_dict(data) - - def test_from_dict_invalid_path_format(self): - """Test ManifestProperties.from_dict raises ValueError for invalid path format""" - invalid_root_path_format = "invalid_format" - data = {"rootPath": "/test/path", "rootPathFormat": 
invalid_root_path_format} - - with pytest.raises(ValueError, match=invalid_root_path_format): - ManifestProperties.from_dict(data) - - def test_from_dict_with_empty_optional_lists(self): - """Test ManifestProperties.from_dict with empty lists for optional fields""" - data = { - "rootPath": "/test/path", - "rootPathFormat": "posix", - "outputRelativeDirectories": [], - } - - manifest_props = ManifestProperties.from_dict(data) - assert manifest_props.outputRelativeDirectories == [] - - def test_from_dict_roundtrip_with_to_dict(self): - """Test that from_dict and to_dict are inverse operations""" - # Create a ManifestProperties instance with all fields - original = ManifestProperties( - rootPath="/original/path", - rootPathFormat=PathFormat.POSIX, - fileSystemLocationName="test-location", - inputManifestPath="s3://bucket/manifest.json", - inputManifestHash="hash123", - outputRelativeDirectories=["out1", "out2"], - ) - - # Convert to dict and back - data = original.to_dict() - recreated = ManifestProperties.from_dict(data) - - # Verify they are equal - assert recreated == original - - def test_from_dict_with_special_characters_in_paths(self): - """Test ManifestProperties.from_dict with special characters in paths""" - data = { - "rootPath": '/path/with spaces/and"quotes"/and\\backslashes', - "rootPathFormat": "posix", - "fileSystemLocationName": "location-with-unicode-€-£-¥", - "inputManifestPath": "s3://bucket-name/path with spaces/manifest.json", - "outputRelativeDirectories": ["output with spaces", "output/with/slashes"], - } - - manifest_props = ManifestProperties.from_dict(data) - - assert manifest_props.rootPath == '/path/with spaces/and"quotes"/and\\backslashes' - assert manifest_props.fileSystemLocationName == "location-with-unicode-€-£-¥" - assert manifest_props.inputManifestPath == "s3://bucket-name/path with spaces/manifest.json" - assert manifest_props.outputRelativeDirectories == [ - "output with spaces", - "output/with/slashes", - ] - - def test_from_dict_with_none_values_in_optional_fields(self): - """Test ManifestProperties.from_dict with explicit None values for optional fields""" - data = { - "rootPath": "/test/path", - "rootPathFormat": "posix", - "fileSystemLocationName": None, - "inputManifestPath": None, - "inputManifestHash": None, - "outputRelativeDirectories": None, - } - - manifest_props = ManifestProperties.from_dict(data) - - assert manifest_props.rootPath == "/test/path" - assert manifest_props.rootPathFormat == PathFormat.POSIX - assert manifest_props.fileSystemLocationName is None - assert manifest_props.inputManifestPath is None - assert manifest_props.inputManifestHash is None - assert manifest_props.outputRelativeDirectories is None - - def test_as_output_metadata_ascii_path(self): - """Test as_output_metadata with ASCII-only root path""" - manifest_props = ManifestProperties( - rootPath="/test/path", - rootPathFormat=PathFormat.POSIX, - fileSystemLocationName="test-location", - ) - - result = manifest_props.as_output_metadata() - - expected = { - "Metadata": {"asset-root": "/test/path", "file-system-location-name": "test-location"} - } - assert result == expected - - def test_as_output_metadata_non_ascii_path(self): - """Test as_output_metadata with non-ASCII root path""" - manifest_props = ManifestProperties( - rootPath="/test/café/测试", rootPathFormat=PathFormat.POSIX - ) - - result = manifest_props.as_output_metadata() - - expected = { - "Metadata": { - "asset-root": '"/test/caf\\u00e9/\\u6d4b\\u8bd5"', - "asset-root-json": 
'"/test/caf\\u00e9/\\u6d4b\\u8bd5"', - } - } - assert result == expected - - def test_as_output_metadata_no_file_system_location(self): - """Test as_output_metadata without file system location name""" - manifest_props = ManifestProperties(rootPath="/test/path", rootPathFormat=PathFormat.POSIX) - - result = manifest_props.as_output_metadata() - - expected = {"Metadata": {"asset-root": "/test/path"}} - assert result == expected diff --git a/test/unit/deadline_job_attachments/test_path_mapping.py b/test/unit/deadline_job_attachments/test_path_mapping.py deleted file mode 100644 index 18a9d3589..000000000 --- a/test/unit/deadline_job_attachments/test_path_mapping.py +++ /dev/null @@ -1,331 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -from pathlib import Path -import dataclasses - -import pytest - -from deadline.job_attachments._path_mapping import ( - _generate_path_mapping_rules, - _PathMappingRuleApplier, -) -from deadline.job_attachments.models import ( - PathFormat, - PathMappingRule, - StorageProfileOperatingSystemFamily, -) - - -@dataclasses.dataclass -class DestPaths: - path1: Path - path2: Path - path3: Path - - -# Fixtures for shared resources -@pytest.fixture -def dest_paths(tmp_path_factory: pytest.TempPathFactory): - """Create a set of local directory paths to use.""" - base_dir = tmp_path_factory.mktemp("checkpoint") - dest_path1 = base_dir / "path1" - dest_path2 = base_dir / "path2" / "with" / "some" / "nesting" - dest_path3 = base_dir / ("a" * 1000) - yield DestPaths(dest_path1, dest_path2, dest_path3) - - -# Sample storage profiles for testing -SAMPLE_WINDOWS_STORAGE_PROFILE = { - "storageProfileId": "sp-windows-123", - "osFamily": StorageProfileOperatingSystemFamily.WINDOWS.value, - "fileSystemLocations": [ - {"name": "shared", "path": "C:\\shared"}, - {"name": "temp", "path": "C:\\temp"}, - ], -} - -SAMPLE_LINUX_STORAGE_PROFILE = { - "storageProfileId": "sp-linux-456", - "osFamily": StorageProfileOperatingSystemFamily.LINUX.value, - "fileSystemLocations": [ - {"name": "shared", "path": "/mnt/shared"}, - {"name": "temp", "path": "/tmp"}, - ], -} - -SAMPLE_MACOS_STORAGE_PROFILE = { - "storageProfileId": "sp-macos-789", - "osFamily": StorageProfileOperatingSystemFamily.MACOS.value, - "fileSystemLocations": [ - {"name": "shared", "path": "/Volumes/shared"}, - {"name": "temp", "path": "/tmp"}, - ], -} - -# Storage profiles for edge cases -EMPTY_LOCATIONS_PROFILE = { - "storageProfileId": "sp-empty-001", - "osFamily": StorageProfileOperatingSystemFamily.LINUX.value, - "fileSystemLocations": [], -} - -NO_MATCHING_LOCATIONS_PROFILE = { - "storageProfileId": "sp-nomatch-002", - "osFamily": StorageProfileOperatingSystemFamily.LINUX.value, - "fileSystemLocations": [ - {"name": "different", "path": "/mnt/different"}, - {"name": "other", "path": "/mnt/other"}, - ], -} - -WINDOWS_DESTINATION_PROFILE = { - "storageProfileId": "sp-windows-dest-456", - "osFamily": StorageProfileOperatingSystemFamily.WINDOWS.value, - "fileSystemLocations": [ - {"name": "shared", "path": "D:\\shared"}, - {"name": "temp", "path": "D:\\temp"}, - ], -} - -LINUX_DESTINATION_PROFILE = { - "storageProfileId": "sp-linux-dest-789", - "osFamily": StorageProfileOperatingSystemFamily.LINUX.value, - "fileSystemLocations": [ - {"name": "shared", "path": "/opt/shared"}, - {"name": "temp", "path": "/var/tmp"}, - ], -} - -# Test cases for _generate_path_mapping_rules function -GENERATE_PATH_MAPPING_RULES_CASES: tuple = ( - # (source_profile, destination_profile, expected_rules) - 
pytest.param( - SAMPLE_WINDOWS_STORAGE_PROFILE, - SAMPLE_WINDOWS_STORAGE_PROFILE, - [], - id="identical profiles return empty list", - ), - pytest.param( - SAMPLE_LINUX_STORAGE_PROFILE, - NO_MATCHING_LOCATIONS_PROFILE, - [], - id="no matching locations return empty list", - ), - pytest.param( - EMPTY_LOCATIONS_PROFILE, - SAMPLE_LINUX_STORAGE_PROFILE, - [], - id="empty source locations return empty list", - ), - pytest.param( - SAMPLE_WINDOWS_STORAGE_PROFILE, - WINDOWS_DESTINATION_PROFILE, - [ - PathMappingRule(PathFormat.WINDOWS.value, "C:\\shared", "D:\\shared"), - PathMappingRule(PathFormat.WINDOWS.value, "C:\\temp", "D:\\temp"), - ], - id="Windows profiles generate Windows format rules", - ), - pytest.param( - SAMPLE_LINUX_STORAGE_PROFILE, - LINUX_DESTINATION_PROFILE, - [ - PathMappingRule(PathFormat.POSIX.value, "/mnt/shared", "/opt/shared"), - PathMappingRule(PathFormat.POSIX.value, "/tmp", "/var/tmp"), - ], - id="Linux profiles generate POSIX format rules", - ), - pytest.param( - SAMPLE_MACOS_STORAGE_PROFILE, - LINUX_DESTINATION_PROFILE, - [ - PathMappingRule(PathFormat.POSIX.value, "/Volumes/shared", "/opt/shared"), - PathMappingRule(PathFormat.POSIX.value, "/tmp", "/var/tmp"), - ], - id="macOS profiles generate POSIX format rules", - ), -) - - -@pytest.mark.parametrize( - ( - "source_profile", - "destination_profile", - "expected_rules", - ), - GENERATE_PATH_MAPPING_RULES_CASES, -) -def test_generate_path_mapping_rules( - source_profile, - destination_profile, - expected_rules, -): - """Test that _generate_path_mapping_rules generates correct path mapping rules.""" - rules = _generate_path_mapping_rules(source_profile, destination_profile) - - assert rules == expected_rules - - -def test_path_mapping_rule_applier_create_empty(): - applier = _PathMappingRuleApplier([]) - assert applier.path_mapping_rules == [] - assert applier._path_mapping_trie == {} - - assert applier.transform("/some/path") == "/some/path" - with pytest.raises(ValueError): - applier.strict_transform("/some/path") - - -def test_path_mapping_rule_applier_create_bad_source_path(dest_paths: DestPaths): - with pytest.raises(ValueError): - _PathMappingRuleApplier([PathMappingRule("xisop", "/mnt/shared", str(dest_paths.path1))]) - - -def test_path_mapping_rule_applier_create_posix(dest_paths: DestPaths): - rules = [ - PathMappingRule(PathFormat.POSIX.value, "/mnt/shared", str(dest_paths.path1)), - PathMappingRule(PathFormat.POSIX.value, "/mnt/projects", str(dest_paths.path2)), - PathMappingRule(PathFormat.POSIX.value, "/tmp", str(dest_paths.path3)), - ] - applier = _PathMappingRuleApplier(rules) - assert applier.path_mapping_rules == rules - assert applier.source_path_format == PathFormat.POSIX.value - assert set(applier._path_mapping_trie.keys()) == {"/"} - assert set(applier._path_mapping_trie["/"].keys()) == {"mnt", "tmp"} - assert set(applier._path_mapping_trie["/"]["mnt"].keys()) == {"shared", "projects"} - - -def test_path_mapping_rule_applier_create_windows(dest_paths: DestPaths): - rules = [ - PathMappingRule(PathFormat.WINDOWS.value, "C:\\Mnt\\Shared", str(dest_paths.path1)), - PathMappingRule(PathFormat.WINDOWS.value, "C:\\Mnt\\proJects", str(dest_paths.path2)), - PathMappingRule(PathFormat.WINDOWS.value, "D:\\tmp", str(dest_paths.path3)), - ] - applier = _PathMappingRuleApplier(rules) - assert applier.path_mapping_rules == rules - assert applier.source_path_format == PathFormat.WINDOWS.value - assert set(applier._path_mapping_trie.keys()) == {"c:\\", "d:\\"} - assert 
set(applier._path_mapping_trie["c:\\"].keys()) == {"mnt"} - assert set(applier._path_mapping_trie["d:\\"].keys()) == {"tmp"} - assert set(applier._path_mapping_trie["c:\\"]["mnt"].keys()) == {"shared", "projects"} - - -def test_path_mapping_rule_applier_create_mixed(dest_paths: DestPaths): - with pytest.raises(ValueError): - rules = [ - PathMappingRule(PathFormat.POSIX.value, "/mnt/shared", str(dest_paths.path1)), - PathMappingRule(PathFormat.WINDOWS.value, "D:\\tmp", str(dest_paths.path3)), - ] - _PathMappingRuleApplier(rules) - - -def test_source_posix_rule(dest_paths: DestPaths): - applier = _PathMappingRuleApplier( - [ - PathMappingRule(PathFormat.POSIX.value, "/mnt/shared", str(dest_paths.path1)), - PathMappingRule(PathFormat.POSIX.value, "/mnt/projects", str(dest_paths.path2)), - PathMappingRule(PathFormat.POSIX.value, "/tmp", str(dest_paths.path3)), - ] - ) - - # All three rules can be used with both regular and strict transform - assert applier.transform("/mnt/shared") == dest_paths.path1 - assert applier.transform("/mnt/projects") == dest_paths.path2 - assert applier.transform("/tmp") == dest_paths.path3 - assert applier.strict_transform("/mnt/shared") == dest_paths.path1 - assert applier.strict_transform("/mnt/projects") == dest_paths.path2 - assert applier.strict_transform("/tmp") == dest_paths.path3 - - # transform passes through other paths - assert applier.transform("/other/path") == "/other/path" - - # strict_transform raises for other paths - with pytest.raises(ValueError): - applier.strict_transform("/other/path") - - -def test_source_windows_rule(dest_paths: DestPaths): - applier = _PathMappingRuleApplier( - [ - PathMappingRule(PathFormat.WINDOWS.value, "C:\\Shared", str(dest_paths.path1)), - PathMappingRule(PathFormat.WINDOWS.value, "C:\\proJects", str(dest_paths.path2)), - PathMappingRule(PathFormat.WINDOWS.value, "D:\\tmp", str(dest_paths.path3)), - ] - ) - - # All three rules can be used with both regular and strict transform - assert applier.transform("C:\\Shared") == dest_paths.path1 - assert applier.transform("C:\\proJects") == dest_paths.path2 - assert applier.transform("D:\\tmp") == dest_paths.path3 - assert applier.strict_transform("C:\\Shared") == dest_paths.path1 - assert applier.strict_transform("C:\\proJects") == dest_paths.path2 - assert applier.strict_transform("D:\\tmp") == dest_paths.path3 - - # Windows is case insensitive but case preserving - assert applier.transform("C:\\ShArEd") == dest_paths.path1 - assert ( - applier.transform("C:\\PROJECTS\\Case\\Of\\tail\\PreServed") - == dest_paths.path2 / "Case" / "Of" / "tail" / "PreServed" - ) - - # transform passes through other paths - assert applier.transform("C:\\other\\path") == "C:\\other\\path" - - # strict_transform raises for other paths - with pytest.raises(ValueError): - applier.strict_transform("C:\\other\\path") - - -def test_source_posix_rule_edge_cases(dest_paths: DestPaths): - applier = _PathMappingRuleApplier( - [ - PathMappingRule(PathFormat.POSIX.value, "/mnt/shared", str(dest_paths.path1)), - PathMappingRule(PathFormat.POSIX.value, "/mnt/shared/projects", str(dest_paths.path2)), - PathMappingRule(PathFormat.POSIX.value, "/tmp", str(dest_paths.path3)), - ] - ) - - # Paths that are not transformed - assert applier.transform("") == "" - assert applier.transform("/") == "/" - assert applier.transform("/other/path") == "/other/path" - assert applier.transform("/mnt/other/path") == "/mnt/other/path" - assert applier.transform("/Mnt/shared") == "/Mnt/shared" - - # Edge cases with unicode and 
spaces - assert applier.transform("/mnt/shared/файл.txt") == dest_paths.path1 / "файл.txt" - assert ( - applier.transform("/mnt/shared/file with spaces.txt") - == dest_paths.path1 / "file with spaces.txt" - ) - - # The second rule applies because it is longer and more specific than the first rule - assert applier.transform("/mnt/shared/projects") == dest_paths.path2 - assert applier.transform("/mnt/shared/projects/file.txt") == dest_paths.path2 / "file.txt" - - -def test_source_windows_rule_edge_cases(dest_paths: DestPaths): - applier = _PathMappingRuleApplier( - [ - PathMappingRule(PathFormat.WINDOWS.value, "C:\\shared", str(dest_paths.path1)), - PathMappingRule( - PathFormat.WINDOWS.value, "C:\\shared\\projects", str(dest_paths.path2) - ), - PathMappingRule(PathFormat.WINDOWS.value, "D:\\temp", str(dest_paths.path3)), - ] - ) - - # Paths that are not transformed - assert applier.transform("") == "" - assert applier.transform("C:\\other\\path") == "C:\\other\\path" - - # Edge cases with unicode and spaces - assert applier.transform("C:\\shared\\файл.txt") == dest_paths.path1 / "файл.txt" - assert ( - applier.transform("C:\\shared\\file with spaces.txt") - == dest_paths.path1 / "file with spaces.txt" - ) - - # The second rule applies because it is longer and more specific than the first rule - assert applier.transform("C:\\shared\\projects") == dest_paths.path2 - assert applier.transform("C:\\shared\\projects\\file.txt") == dest_paths.path2 / "file.txt" diff --git a/test/unit/deadline_job_attachments/test_progress_tracker.py b/test/unit/deadline_job_attachments/test_progress_tracker.py deleted file mode 100644 index 706a18bca..000000000 --- a/test/unit/deadline_job_attachments/test_progress_tracker.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - - -from deadline.job_attachments.progress_tracker import ( - SummaryStatistics, - DownloadSummaryStatistics, - ProgressTracker, - ProgressStatus, -) -import pytest -import concurrent - - -# The += operator appears to be thread-safe in Python 3.10 and later, but it can be an issue in earlier versions.
-class TestProgressTracker: - """ - Tests for ProgressTracker class - """ - - def test_increment_race_condition(self): - progress_tracker = ProgressTracker(ProgressStatus.NONE, 0, 0) - - N = 10**5 - K = 10 - - def increment(): - for _ in range(N): - progress_tracker.increase_processed(1, 0) - - with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: - for _ in range(K): - executor.submit(increment) - - assert progress_tracker.processed_files == N * K - - -class TestSummaryStatistics: - """ - Tests for SummaryStatistics class - """ - - def test_aggregate_with_no_stats(self): - summary1 = SummaryStatistics() - summary2 = SummaryStatistics() - - expected_aggregated_stats = SummaryStatistics() - - aggregated = summary1.aggregate(summary2) - assert aggregated == expected_aggregated_stats - - def test_aggregate(self): - summary1 = SummaryStatistics( - total_time=10.0, - total_files=10, - total_bytes=1000, - processed_files=7, - processed_bytes=700, - skipped_files=3, - skipped_bytes=300, - transfer_rate=70.0, - ) - summary2 = SummaryStatistics( - total_time=10.0, - total_files=10, - total_bytes=1000, - processed_files=8, - processed_bytes=800, - skipped_files=2, - skipped_bytes=200, - transfer_rate=80.0, - ) - - expected_aggregated_stats = SummaryStatistics( - total_time=20.0, - total_files=20, - total_bytes=2000, - processed_files=15, - processed_bytes=1500, - skipped_files=5, - skipped_bytes=500, - transfer_rate=75.0, - ) - - aggregated = summary1.aggregate(summary2) - assert aggregated == expected_aggregated_stats - - def test_aggregate_summary_stats_and_download_summary_stats(self): - summary1 = SummaryStatistics( - total_time=10.0, - total_files=10, - total_bytes=1000, - processed_files=7, - processed_bytes=700, - skipped_files=3, - skipped_bytes=300, - transfer_rate=70.0, - ) - summary2 = DownloadSummaryStatistics( - total_time=10.0, - total_files=10, - total_bytes=1000, - processed_files=8, - processed_bytes=800, - skipped_files=2, - skipped_bytes=200, - transfer_rate=80.0, - file_counts_by_root_directory={ - "/home/username/outputs1": 1, - "/home/username/outputs2": 2, - "/home/username/outputs3": 5, - }, - ) - - expected_aggregated_stats = SummaryStatistics( - total_time=20.0, - total_files=20, - total_bytes=2000, - processed_files=15, - processed_bytes=1500, - skipped_files=5, - skipped_bytes=500, - transfer_rate=75.0, - ) - - aggregated = summary1.aggregate(summary2) - assert aggregated == expected_aggregated_stats - - -class TestDownloadSummaryStatistics: - """ - Tests for DownloadSummaryStatistics class - """ - - def test_aggregate_with_no_stats(self): - summary1 = DownloadSummaryStatistics() - summary2 = DownloadSummaryStatistics() - - expected_aggregated_stats = DownloadSummaryStatistics() - - aggregated = summary1.aggregate(summary2) - assert aggregated == expected_aggregated_stats - - def test_aggregate(self): - summary1 = DownloadSummaryStatistics( - total_time=10.0, - total_files=10, - total_bytes=1000, - processed_files=7, - processed_bytes=700, - skipped_files=3, - skipped_bytes=300, - transfer_rate=70.0, - file_counts_by_root_directory={ - "/home/username/outputs1": 1, - "/home/username/outputs2": 2, - "/home/username/outputs3": 4, - }, - ) - summary2 = DownloadSummaryStatistics( - total_time=10.0, - total_files=10, - total_bytes=1000, - processed_files=8, - processed_bytes=800, - skipped_files=2, - skipped_bytes=200, - transfer_rate=80.0, - file_counts_by_root_directory={ - "/home/username/outputs3": 5, - "/home/username/outputs4": 3, - }, - ) - - 
expected_aggregated_stats = DownloadSummaryStatistics( - total_time=20.0, - total_files=20, - total_bytes=2000, - processed_files=15, - processed_bytes=1500, - skipped_files=5, - skipped_bytes=500, - transfer_rate=75.0, - file_counts_by_root_directory={ - "/home/username/outputs1": 1, - "/home/username/outputs2": 2, - "/home/username/outputs3": 9, - "/home/username/outputs4": 3, - }, - ) - - aggregated = summary1.aggregate(summary2) - assert aggregated == expected_aggregated_stats - - def test_aggregate_with_summary_stats(self): - """ - Tests that an exception is raised when the aggregate function of DownloadSummaryStatistics - is called with a SummaryStatistics object. - """ - summary1 = DownloadSummaryStatistics() - summary2 = SummaryStatistics() - - with pytest.raises(TypeError): - summary1.aggregate(summary2) diff --git a/test/unit/deadline_job_attachments/test_upload.py b/test/unit/deadline_job_attachments/test_upload.py deleted file mode 100644 index 7312e2107..000000000 --- a/test/unit/deadline_job_attachments/test_upload.py +++ /dev/null @@ -1,3134 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -""" -Tests related to the uploading of assets. -""" - -import os -import sys -from copy import deepcopy -from datetime import datetime -from io import BytesIO -from logging import DEBUG, INFO -from pathlib import Path -from typing import Dict, List, Set, Tuple -from unittest.mock import MagicMock, patch - -import boto3 -import py.path -import pytest -from botocore.exceptions import BotoCoreError, ClientError, ReadTimeoutError -from botocore.stub import Stubber -from moto import mock_aws - -import deadline -from deadline.job_attachments.asset_manifests import ( - BaseManifestModel, - BaseManifestPath, - HashAlgorithm, - ManifestVersion, -) -from deadline.job_attachments.caches import HashCacheEntry, S3CheckCacheEntry -from deadline.job_attachments.exceptions import ( - AssetSyncError, - JobAttachmentsS3ClientError, - MisconfiguredInputsError, - MissingS3BucketError, - MissingS3RootPrefixError, -) -from deadline.job_attachments.models import ( - AssetRootGroup, - Attachments, - FileSystemLocation, - FileSystemLocationType, - ManifestProperties, - JobAttachmentS3Settings, - StorageProfileOperatingSystemFamily, - PathFormat, - StorageProfile, -) -from deadline.job_attachments.asset_manifests.v2023_03_03 import AssetManifest - -from deadline.job_attachments.progress_tracker import ( - ProgressStatus, - SummaryStatistics, -) -from deadline.job_attachments.upload import FileStatus, S3AssetManager, S3AssetUploader -from deadline.job_attachments.api import human_readable_file_size -from ..conftest import is_windows_non_admin - - -class TestUpload: - """ - Tests for handling uploading assets. - """ - - @pytest.fixture(autouse=True) - def before_test( - self, - request, - create_s3_bucket, - default_job_attachment_s3_settings: JobAttachmentS3Settings, - ): - """ - Set up the default queue and s3 bucket for all asset tests. - Mark test with `no_setup` if you don't want this setup to run.
- """ - if "no_setup" in request.keywords: - return - - self.job_attachment_s3_settings = default_job_attachment_s3_settings - create_s3_bucket(bucket_name=default_job_attachment_s3_settings.s3BucketName) - - @mock_aws - @pytest.mark.parametrize( - "manifest_version,expected_manifest", - [ - ( - ManifestVersion.v2023_03_03, - '{"hashAlg":"xxh128","manifestVersion":"2023-03-03",' - '"paths":[{"hash":"d","mtime":1234000000,"path":"meta.txt","size":1},' - '{"hash":"a","mtime":1234000000,"path":"scene/maya.ma","size":1},' - '{"hash":"c","mtime":1234000000,"path":"textures/normals/normal.png","size":1},' - '{"hash":"b","mtime":1234000000,"path":"textures/texture.png","size":1}],"totalSize":4}', - ), - ], - ) - def test_asset_management( - self, - tmpdir: py.path.local, - farm_id, - queue_id, - default_job_attachment_s3_settings, - assert_canonical_manifest, - assert_expected_files_on_s3, - caplog, - manifest_version: ManifestVersion, - expected_manifest: str, - ): - """ - Test that the correct files get uploaded to S3 and the asset manifest - is as expected when there are multiple input and output files. - """ - # Given - asset_root = str(tmpdir) - - scene_file = tmpdir.mkdir("scene").join("maya.ma") - scene_file.write("a") - os.utime(scene_file, (1234, 1234)) - - texture_file = tmpdir.mkdir("textures").join("texture.png") - texture_file.write("b") - os.utime(texture_file, (1234, 1234)) - - normal_file = tmpdir.join("textures").mkdir("normals").join("normal.png") - normal_file.write("c") - os.utime(normal_file, (1234, 1234)) - - meta_file = tmpdir.join("meta.txt") - meta_file.write("d") - os.utime(meta_file, (1234, 1234)) - - cache_dir = tmpdir.mkdir("cache") - output_dir1 = tmpdir.join("outputs") - output_dir2 = tmpdir.join("outputs").join("textures") - - history_dir = tmpdir.join("history") - expected_manifest_file = history_dir.join("manifests").join("e_input") - expected_mapping_file = history_dir.join("manifests").join("manifest_s3_mapping") - expected_mapping_contents = f"{{'local_file': 'e_input', 's3_key': '{default_job_attachment_s3_settings.rootPrefix}/Manifests/{farm_id}/{queue_id}/Inputs/0000/e_input'}}\n" - assert not os.path.exists(history_dir) - assert not os.path.exists(expected_manifest_file) - assert not os.path.exists(expected_mapping_file) - - expected_total_input_bytes = ( - scene_file.size() + texture_file.size() + normal_file.size() + meta_file.size() - ) - - with patch( - f"{deadline.__package__}.job_attachments.upload.PathFormat.get_host_path_format", - return_value=PathFormat.POSIX, - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_data", - side_effect=["e", "manifesthash"], - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_file", - side_effect={ - str(scene_file): "a", - str(texture_file): "b", - str(normal_file): "c", - str(meta_file): "d", - }.get, - ), patch( - f"{deadline.__package__}.job_attachments.models._generate_random_guid", - return_value="0000", - ): - caplog.set_level(DEBUG) - - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_assets = MagicMock(return_value=True) - - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=manifest_version, - ) - - # When - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=[ - str(scene_file), - str(texture_file), - str(normal_file), - str(meta_file), - str(meta_file), - "", - ], - output_paths=[ - str(asset_root), - str(output_dir1), - 
str(output_dir2), - str(output_dir2), - "", - ], - referenced_paths=[], - ) - ( - hash_summary_statistics, - asset_root_manifests, - ) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=str(cache_dir), - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - - upload_summary_statistics, attachments = asset_manager.upload_assets( - manifests=asset_root_manifests, - on_uploading_assets=mock_on_uploading_assets, - s3_check_cache_dir=str(cache_dir), - manifest_write_dir=str(history_dir), - ) - - # Then - expected_attachments = Attachments( - manifests=[ - ManifestProperties( - rootPath=asset_root, - rootPathFormat=PathFormat.POSIX, - inputManifestPath=f"{farm_id}/{queue_id}/Inputs/0000/e_input", - inputManifestHash="manifesthash", - outputRelativeDirectories=[ - ".", - "outputs", - os.path.join("outputs", "textures"), - ], - ) - ], - ) - - assert attachments == expected_attachments - assert attachments.to_dict() == { # type: ignore - "fileSystem": "COPIED", - "manifests": [ - { - "rootPath": f"{asset_root}", - "rootPathFormat": PathFormat("posix").value, - "inputManifestPath": f"{farm_id}/{queue_id}/Inputs/0000/e_input", - "inputManifestHash": "manifesthash", - "outputRelativeDirectories": [ - ".", - "outputs", - os.path.join("outputs", "textures"), - ], - } - ], - } - - assert f"assetRoot/Manifests/{farm_id}/{queue_id}/Inputs/0000/e_input" in caplog.text - - # Ensure we wrote our manifest file locally - assert os.path.exists(expected_manifest_file) - assert os.path.isfile(expected_manifest_file) - assert os.path.exists(expected_mapping_file) - assert os.path.isfile(expected_mapping_file) - with open(expected_mapping_file, "r") as mapping_file: - actual_contents = mapping_file.read() - assert actual_contents == expected_mapping_contents - - assert_progress_report_last_callback( - num_input_files=4, - expected_total_input_bytes=expected_total_input_bytes, - on_preparing_to_submit=mock_on_preparing_to_submit, - on_uploading_assets=mock_on_uploading_assets, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=hash_summary_statistics, - processed_files=4, - processed_bytes=expected_total_input_bytes, - skipped_files=0, - skipped_bytes=0, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=upload_summary_statistics, - processed_files=4, - processed_bytes=expected_total_input_bytes, - skipped_files=0, - skipped_bytes=0, - ) - - s3 = boto3.Session(region_name="us-west-2").resource("s3") # pylint: disable=invalid-name - bucket = s3.Bucket(self.job_attachment_s3_settings.s3BucketName) - - assert_expected_files_on_s3( - bucket, - expected_files={ - f"assetRoot/Manifests/{farm_id}/{queue_id}/Inputs/0000/e_input", - f"{self.job_attachment_s3_settings.full_cas_prefix()}/a.xxh128", - f"{self.job_attachment_s3_settings.full_cas_prefix()}/b.xxh128", - f"{self.job_attachment_s3_settings.full_cas_prefix()}/c.xxh128", - f"{self.job_attachment_s3_settings.full_cas_prefix()}/d.xxh128", - }, - ) - - assert_canonical_manifest( - bucket, - f"assetRoot/Manifests/{farm_id}/{queue_id}/Inputs/0000/e_input", - expected_manifest=expected_manifest, - ) - - @mock_aws - @pytest.mark.skipif( - sys.platform != "win32", - reason="Requires Windows to test resolving paths completely with multiple drives", - ) - @pytest.mark.parametrize( - "manifest_version,expected_manifest", - [ - ( - ManifestVersion.v2023_03_03, - 
'{"hashAlg":"xxh128","manifestVersion":"2023-03-03",' - '"paths":[{"hash":"a","mtime":1234000000,"path":"input.txt","size":1}],"totalSize":1}', - ), - ], - ) - def test_asset_management_windows_multi_root( - self, - tmpdir, - farm_id, - queue_id, - assert_canonical_manifest, - assert_expected_files_on_s3, - caplog, - manifest_version, - expected_manifest, - ): - """ - Test that the correct files get uploaded to S3 and the asset manifest - is as expected when there are multiple input and output files. - """ - # Given - root_c = tmpdir.mkdir("c-drive-inputs") - input_c = root_c.join("input.txt") - input_c.write("a") - os.utime(input_c, (1234, 1234)) - root_d = r"D:\my\awesome" - input_d = r"D:\my\awesome\input2.txt" # doesn't exist, shouldn't get included - output_d = r"D:\my\awesome\outputdir" - cache_dir = tmpdir.mkdir("cache") - - with patch( - f"{deadline.__package__}.job_attachments.upload.hash_data", - side_effect=["b", "manifesthash"], - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_file", - side_effect={str(input_c): "a"}.get, - ), patch( - f"{deadline.__package__}.job_attachments.models._generate_random_guid", - return_value="0000", - ): - caplog.set_level(DEBUG) - - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_assets = MagicMock(return_value=True) - - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=manifest_version, - ) - - # When - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=[input_c, input_d], - output_paths=[output_d], - referenced_paths=[], - ) - ( - hash_summary_statistics, - asset_root_manifests, - ) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=cache_dir, - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - - upload_summary_statistics, attachments = asset_manager.upload_assets( - manifests=asset_root_manifests, - on_uploading_assets=mock_on_uploading_assets, - s3_check_cache_dir=cache_dir, - ) - - # Then - expected_attachments = Attachments( - manifests=[ - ManifestProperties( - rootPath=root_c, - rootPathFormat=PathFormat.WINDOWS, - inputManifestPath=f"{farm_id}/{queue_id}/Inputs/0000/b_input", - inputManifestHash="manifesthash", - outputRelativeDirectories=[], - ), - ManifestProperties( - rootPath=root_d, - rootPathFormat=PathFormat.WINDOWS, - outputRelativeDirectories=["outputdir"], - ), - ], - ) - expected_total_input_bytes = input_c.size() - - assert attachments == expected_attachments - assert attachments.to_dict() == { # type: ignore - "fileSystem": "COPIED", - "manifests": [ - { - "rootPath": f"{root_c}", - "rootPathFormat": PathFormat("windows").value, - "inputManifestPath": f"{farm_id}/{queue_id}/Inputs/0000/b_input", - "inputManifestHash": "manifesthash", - }, - { - "rootPath": f"{root_d}", - "rootPathFormat": PathFormat("windows").value, - "outputRelativeDirectories": [ - "outputdir", - ], - }, - ], - } - - assert f"assetRoot/Manifests/{farm_id}/{queue_id}/Inputs/0000/b_input" in caplog.text - - assert_progress_report_last_callback( - num_input_files=1, - expected_total_input_bytes=expected_total_input_bytes, - on_preparing_to_submit=mock_on_preparing_to_submit, - on_uploading_assets=mock_on_uploading_assets, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=hash_summary_statistics, 
- processed_files=1, - processed_bytes=expected_total_input_bytes, - skipped_files=0, - skipped_bytes=0, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=upload_summary_statistics, - processed_files=1, - processed_bytes=expected_total_input_bytes, - skipped_files=0, - skipped_bytes=0, - ) - - s3 = boto3.Session(region_name="us-west-2").resource("s3") # pylint: disable=invalid-name - bucket = s3.Bucket(self.job_attachment_s3_settings.s3BucketName) - - assert_expected_files_on_s3( - bucket, - expected_files={ - f"{self.job_attachment_s3_settings.rootPrefix}/Manifests/{farm_id}/{queue_id}/Inputs/0000/b_input", - f"{self.job_attachment_s3_settings.full_cas_prefix()}/a.xxh128", - }, - ) - - assert_canonical_manifest( - bucket, - f"assetRoot/Manifests/{farm_id}/{queue_id}/Inputs/0000/b_input", - expected_manifest=expected_manifest, - ) - - @mock_aws - @pytest.mark.parametrize( - "num_input_files", - [ - 1, - 100, - 200, - ], - ) - @pytest.mark.parametrize( - "manifest_version", - [ - ManifestVersion.v2023_03_03, - ], - ) - def test_asset_management_many_inputs( - self, - tmpdir, - farm_id, - queue_id, - assert_canonical_manifest, - assert_expected_files_on_s3, - caplog, - manifest_version: ManifestVersion, - num_input_files: int, - ): - """ - Test that the correct files get uploaded to S3 and the asset manifest - is as expected when there are multiple input and output files. - """ - # Given - asset_root = str(tmpdir) - - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=manifest_version, - ) - - with patch( - f"{deadline.__package__}.job_attachments.upload.PathFormat.get_host_path_format", - return_value=PathFormat.POSIX, - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_data", - side_effect=["c", "manifesthash"], - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_file", - side_effect=[str(i) for i in range(num_input_files)], - ), patch( - f"{deadline.__package__}.job_attachments.models._generate_random_guid", - return_value="0000", - ): - caplog.set_level(DEBUG) - - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_assets = MagicMock(return_value=True) - - input_files = [] - expected_total_input_bytes = 0 - test_dir = tmpdir.mkdir("large_submit") - for i in range(num_input_files): - test_file = test_dir.join(f"test{i}.txt") - test_file.write(f"test {i}") - expected_total_input_bytes += test_file.size() - input_files.append(test_file) - - cache_dir = tmpdir.mkdir("cache") - - # When - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=input_files, - output_paths=[str(Path(asset_root).joinpath("outputs"))], - referenced_paths=[], - ) - ( - hash_summary_statistics, - asset_root_manifests, - ) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=cache_dir, - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - - upload_summary_statistics, attachments = asset_manager.upload_assets( - manifests=asset_root_manifests, - on_uploading_assets=mock_on_uploading_assets, - s3_check_cache_dir=cache_dir, - ) - - # Then - expected_attachments = Attachments( - manifests=[ - ManifestProperties( - rootPath=asset_root, - rootPathFormat=PathFormat.POSIX, - inputManifestPath=f"{farm_id}/{queue_id}/Inputs/0000/c_input", - 
inputManifestHash="manifesthash", - outputRelativeDirectories=["outputs"], - ) - ], - ) - - assert attachments == expected_attachments - assert attachments.to_dict() == { # type: ignore - "fileSystem": "COPIED", - "manifests": [ - { - "rootPath": f"{asset_root}", - "rootPathFormat": PathFormat("posix").value, - "inputManifestPath": f"{farm_id}/{queue_id}/Inputs/0000/c_input", - "inputManifestHash": "manifesthash", - "outputRelativeDirectories": ["outputs"], - } - ], - } - - assert f"assetRoot/Manifests/{farm_id}/{queue_id}/Inputs/0000/c_input" in caplog.text - - assert_progress_report_last_callback( - num_input_files=num_input_files, - expected_total_input_bytes=expected_total_input_bytes, - on_preparing_to_submit=mock_on_preparing_to_submit, - on_uploading_assets=mock_on_uploading_assets, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=hash_summary_statistics, - processed_files=num_input_files, - processed_bytes=expected_total_input_bytes, - skipped_files=0, - skipped_bytes=0, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=upload_summary_statistics, - processed_files=num_input_files, - processed_bytes=expected_total_input_bytes, - skipped_files=0, - skipped_bytes=0, - ) - - s3 = boto3.Session(region_name="us-west-2").resource("s3") # pylint: disable=invalid-name - bucket = s3.Bucket(self.job_attachment_s3_settings.s3BucketName) - - expected_files = set( - [ - f"{self.job_attachment_s3_settings.full_cas_prefix()}/{i}.xxh128" - for i in range(num_input_files) - ] - ) - expected_files.add( - f"assetRoot/Manifests/{farm_id}/{queue_id}/Inputs/0000/c_input", - ) - assert_expected_files_on_s3(bucket, expected_files=expected_files) - - @mock_aws - @pytest.mark.parametrize( - "num_input_files", - [ - 1, - 100, - 200, - ], - ) - @pytest.mark.parametrize( - "manifest_version", - [ - ManifestVersion.v2023_03_03, - ], - ) - def test_asset_management_many_inputs_with_same_hash( - self, - tmpdir, - farm_id, - queue_id, - manifest_version: ManifestVersion, - num_input_files: int, - ): - """ - Test that the asset management can handle many input files with the same hash. - If files with different paths have the same content (and thus the same hash), - they should be counted as skipped files. 
- """ - asset_root = str(tmpdir) - - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=manifest_version, - ) - # Change the number of thread workers to 1 to get consistent tests - asset_manager.asset_uploader.num_upload_workers = 1 - - # Given - with patch( - f"{deadline.__package__}.job_attachments.upload.PathFormat.get_host_path_format", - return_value=PathFormat.POSIX, - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_data", - side_effect=["c", "manifesthash"], - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_file", - side_effect=lambda *args, **kwargs: "samehash", - ): - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_assets = MagicMock(return_value=True) - - input_files = [] - expected_total_input_bytes = 0 - test_dir = tmpdir.mkdir("large_submit") - for i in range(num_input_files): - test_file = test_dir.join(f"test{i}.txt") - test_file.write("same content") - expected_total_input_bytes += test_file.size() - input_files.append(test_file) - expected_total_downloaded_bytes = test_file.size() - - cache_dir = tmpdir.mkdir("cache") - - # When - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=input_files, - output_paths=[str(Path(asset_root).joinpath("outputs"))], - referenced_paths=[], - ) - ( - hash_summary_statistics, - asset_root_manifests, - ) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=cache_dir, - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - - upload_summary_statistics, _ = asset_manager.upload_assets( - manifests=asset_root_manifests, - on_uploading_assets=mock_on_uploading_assets, - s3_check_cache_dir=cache_dir, - ) - - # Then - assert_progress_report_last_callback( - num_input_files=num_input_files, - expected_total_input_bytes=expected_total_input_bytes, - on_preparing_to_submit=mock_on_preparing_to_submit, - on_uploading_assets=mock_on_uploading_assets, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=hash_summary_statistics, - processed_files=num_input_files, - processed_bytes=expected_total_input_bytes, - skipped_files=0, - skipped_bytes=0, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=upload_summary_statistics, - processed_files=1, - processed_bytes=expected_total_downloaded_bytes, - skipped_files=num_input_files - 1, - skipped_bytes=expected_total_input_bytes - expected_total_downloaded_bytes, - ) - - @mock_aws - @pytest.mark.parametrize( - "manifest_version", - [ - ManifestVersion.v2023_03_03, - ], - ) - def test_asset_management_no_outputs_inputs_already_uploaded( - self, - tmpdir, - farm_id, - queue_id, - assert_expected_files_on_s3, - caplog, - manifest_version: ManifestVersion, - ): - """ - Test the input files that have already been uploaded to S3 are skipped. 
- """ - already_uploaded_file = tmpdir.mkdir("scene").join("maya_scene.ma") - already_uploaded_file.write("cool scene with lots of spheres") - - not_yet_uploaded_file = tmpdir.mkdir("textures").join("cool_texture.png") - not_yet_uploaded_file.write("the best texture you've ever seen") - - expected_total_skipped_bytes = already_uploaded_file.size() - expected_total_uploaded_bytes = not_yet_uploaded_file.size() - expected_total_input_bytes = expected_total_skipped_bytes + expected_total_uploaded_bytes - - def mock_hash_file(file_path: str, hash_alg: HashAlgorithm): - if file_path == already_uploaded_file: - return "existinghash" - elif file_path == not_yet_uploaded_file: - return "somethingnew" - - # Given - with patch( - f"{deadline.__package__}.job_attachments.upload.PathFormat.get_host_path_format", - return_value=PathFormat.POSIX, - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_data", - side_effect=["manifest", "manifesthash"], - ), patch( - f"{deadline.__package__}.job_attachments.upload.PathFormat.get_host_path_format", - return_value=PathFormat.POSIX, - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_file", - side_effect=mock_hash_file, - ), patch( - f"{deadline.__package__}.job_attachments.models._generate_random_guid", - return_value="0000", - ): - s3 = boto3.Session(region_name="us-west-2").resource("s3") # pylint: disable=invalid-name - bucket = s3.Bucket(self.job_attachment_s3_settings.s3BucketName) - - caplog.set_level(DEBUG) - - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_assets = MagicMock(return_value=True) - - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=manifest_version, - ) - - # mock pre-uploading the file - bucket.put_object( - Key=f"{self.job_attachment_s3_settings.full_cas_prefix()}/existinghash.xxh128", - Body="a", - ) - - cache_dir = tmpdir.mkdir("cache") - - # When - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=[already_uploaded_file, not_yet_uploaded_file], - output_paths=[], - referenced_paths=[], - ) - ( - hash_summary_statistics, - asset_root_manifests, - ) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=cache_dir, - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - - upload_summary_statistics, _ = asset_manager.upload_assets( - manifests=asset_root_manifests, - on_uploading_assets=mock_on_uploading_assets, - s3_check_cache_dir=cache_dir, - ) - - # Then - assert "maya_scene.ma because it has already been uploaded to s3" in caplog.text - assert ( - f"assetRoot/Manifests/{farm_id}/{queue_id}/Inputs/0000/manifest_input" - in caplog.text - ) - - assert_progress_report_last_callback( - num_input_files=2, - expected_total_input_bytes=expected_total_input_bytes, - on_preparing_to_submit=mock_on_preparing_to_submit, - on_uploading_assets=mock_on_uploading_assets, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=hash_summary_statistics, - processed_files=2, - processed_bytes=expected_total_input_bytes, - skipped_files=0, - skipped_bytes=0, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=upload_summary_statistics, - processed_files=1, - processed_bytes=expected_total_uploaded_bytes, - skipped_files=1, - 
skipped_bytes=expected_total_skipped_bytes, - ) - - assert_expected_files_on_s3( - bucket, - expected_files={ - f"{self.job_attachment_s3_settings.rootPrefix}/Manifests/{farm_id}/{queue_id}/Inputs/0000/manifest_input", - f"{self.job_attachment_s3_settings.full_cas_prefix()}/existinghash.xxh128", - f"{self.job_attachment_s3_settings.full_cas_prefix()}/somethingnew.xxh128", - }, - ) - - @mock_aws - @pytest.mark.parametrize( - "num_input_files", - [ - 1, - 100, - 200, - ], - ) - @pytest.mark.parametrize( - "manifest_version", - [ - ManifestVersion.v2023_03_03, - ], - ) - def test_asset_management_no_outputs_large_number_of_inputs_already_uploaded( - self, - tmpdir, - farm_id, - queue_id, - assert_expected_files_on_s3, - caplog, - manifest_version: ManifestVersion, - num_input_files: int, - ): - """ - Test the input files that have already been uploaded to S3 are skipped. - """ - # Given - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=manifest_version, - ) - - with patch( - f"{deadline.__package__}.job_attachments.upload.PathFormat.get_host_path_format", - return_value=PathFormat.POSIX, - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_data", - side_effect=["manifesto", "manifesthash"], - ), patch( - f"{deadline.__package__}.job_attachments.upload.PathFormat.get_host_path_format", - return_value=PathFormat.POSIX, - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_file", - side_effect=[str(i) for i in range(num_input_files)], - ), patch( - f"{deadline.__package__}.job_attachments.models._generate_random_guid", - return_value="0000", - ): - s3 = boto3.Session(region_name="us-west-2").resource("s3") # pylint: disable=invalid-name - bucket = s3.Bucket(self.job_attachment_s3_settings.s3BucketName) - - caplog.set_level(DEBUG) - - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_assets = MagicMock(return_value=True) - - input_files = [] - expected_total_input_bytes = 0 - test_dir = tmpdir.mkdir("large_submit") - for i in range(num_input_files): - test_file = test_dir.join(f"test{i}.txt") - test_file.write(f"test {i}") - expected_total_input_bytes += test_file.size() - input_files.append(test_file) - # mock pre-uploading the file - bucket.put_object( - Key=f"{self.job_attachment_s3_settings.full_cas_prefix()}/{i}.xxh128", - Body=f"test {i}", - ) - - not_yet_uploaded_file = tmpdir.mkdir("textures").join("texture.png") - not_yet_uploaded_file.write("b") - - cache_dir = tmpdir.mkdir("cache") - - # When - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=input_files, - output_paths=[], - referenced_paths=[], - ) - ( - hash_summary_statistics, - asset_root_manifests, - ) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=cache_dir, - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - - upload_summary_statistics, _ = asset_manager.upload_assets( - manifests=asset_root_manifests, - on_uploading_assets=mock_on_uploading_assets, - s3_check_cache_dir=cache_dir, - ) - - # Then - assert ( - f"assetRoot/Manifests/{farm_id}/{queue_id}/Inputs/0000/manifesto_input" - in caplog.text - ) - - assert_progress_report_last_callback( - num_input_files=num_input_files, - expected_total_input_bytes=expected_total_input_bytes, - 
on_preparing_to_submit=mock_on_preparing_to_submit, - on_uploading_assets=mock_on_uploading_assets, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=hash_summary_statistics, - processed_files=num_input_files, - processed_bytes=expected_total_input_bytes, - skipped_files=0, - skipped_bytes=0, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=upload_summary_statistics, - processed_files=0, - processed_bytes=0, - skipped_files=num_input_files, - skipped_bytes=expected_total_input_bytes, - ) - - expected_files = set( - [ - f"{self.job_attachment_s3_settings.full_cas_prefix()}/{i}.xxh128" - for i in range(num_input_files) - ] - ) - expected_files.add( - f"assetRoot/Manifests/{farm_id}/{queue_id}/Inputs/0000/manifesto_input", - ) - assert_expected_files_on_s3(bucket, expected_files=expected_files) - - @mock_aws - @pytest.mark.parametrize( - "manifest_version", - [ - ManifestVersion.v2023_03_03, - ], - ) - def test_asset_management_no_inputs( - self, - tmpdir, - farm_id, - queue_id, - assert_canonical_manifest, - assert_expected_files_on_s3, - caplog, - manifest_version: ManifestVersion, - ): - """ - Test that only the manifest file gets uploaded to S3 and the asset manifest is as expected - when there are no input files and multiple output files. - """ - output_dir = str(tmpdir.join("outputs")) - - # Given - with patch( - f"{deadline.__package__}.job_attachments.upload.PathFormat.get_host_path_format", - return_value=PathFormat.POSIX, - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_data", - side_effect=["a", "manifesthash"], - ), patch( - f"{deadline.__package__}.job_attachments.models._generate_random_guid", - return_value="0000", - ): - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_assets = MagicMock(return_value=True) - - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=manifest_version, - ) - - cache_dir = tmpdir.mkdir("cache") - - # When - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=[], - output_paths=[output_dir], - referenced_paths=[], - ) - ( - hash_summary_statistics, - asset_root_manifests, - ) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=cache_dir, - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - - upload_summary_statistics, attachments = asset_manager.upload_assets( - manifests=asset_root_manifests, - on_uploading_assets=mock_on_uploading_assets, - s3_check_cache_dir=cache_dir, - ) - - # Then - expected_attachments = Attachments( - manifests=[ - ManifestProperties( - rootPath=output_dir, - rootPathFormat=PathFormat.POSIX, - outputRelativeDirectories=["."], - ) - ], - ) - - assert attachments == expected_attachments - assert attachments.to_dict() == { # type: ignore - "fileSystem": "COPIED", - "manifests": [ - { - "rootPath": f"{output_dir}", - "rootPathFormat": PathFormat("posix").value, - "outputRelativeDirectories": ["."], - } - ], - } - - assert_progress_report_summary_statistics( - actual_summary_statistics=hash_summary_statistics, - processed_files=0, - processed_bytes=0, - skipped_files=0, - skipped_bytes=0, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=upload_summary_statistics, - processed_files=0, - processed_bytes=0, - 
skipped_files=0, - skipped_bytes=0, - ) - - @pytest.mark.parametrize( - "manifest_version", - [ - ManifestVersion.v2023_03_03, - ], - ) - def test_asset_management_no_s3_bucket_set( - self, - farm_id, - queue_id, - manifest_version: ManifestVersion, - ): - """ - Test that the appropriate error is raised when no s3 bucket is provided. - """ - missing_s3_job_attachment_settings = deepcopy(self.job_attachment_s3_settings) - - del missing_s3_job_attachment_settings.s3BucketName - - with pytest.raises(AttributeError): - S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=missing_s3_job_attachment_settings, - asset_manifest_version=manifest_version, - ) - - @pytest.mark.parametrize( - "manifest_version", - [ - ManifestVersion.v2023_03_03, - ], - ) - def test_asset_management_with_s3_bucket_empty( - self, - farm_id, - queue_id, - manifest_version: ManifestVersion, - ): - """ - Test that the appropriate error is raised when the s3 bucket name is empty. - """ - s3_job_attachment_settings_with_s3_bucket_empty = deepcopy(self.job_attachment_s3_settings) - s3_job_attachment_settings_with_s3_bucket_empty.s3BucketName = "" - - with pytest.raises(MissingS3BucketError): - S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=s3_job_attachment_settings_with_s3_bucket_empty, - asset_manifest_version=manifest_version, - ) - - @pytest.mark.parametrize( - "manifest_version", - [ - ManifestVersion.v2023_03_03, - ], - ) - def test_asset_management_no_s3_root_prefix_set( - self, - farm_id, - queue_id, - manifest_version: ManifestVersion, - ): - """ - Test that the appropriate error is raised when no s3 root prefix is provided. - """ - missing_s3_job_attachment_settings = deepcopy(self.job_attachment_s3_settings) - - del missing_s3_job_attachment_settings.rootPrefix - - with pytest.raises(AttributeError): - S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=missing_s3_job_attachment_settings, - asset_manifest_version=manifest_version, - ) - - @pytest.mark.parametrize( - "manifest_version", - [ - ManifestVersion.v2023_03_03, - ], - ) - def test_asset_management_with_root_prefix_empty( - self, - farm_id, - queue_id, - manifest_version: ManifestVersion, - ): - """ - Test that the appropriate error is raised when the s3 root prefix is empty. - """ - s3_job_attachment_settings_with_root_prefix_empty = deepcopy( - self.job_attachment_s3_settings - ) - s3_job_attachment_settings_with_root_prefix_empty.rootPrefix = "" - - with pytest.raises(MissingS3RootPrefixError): - S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=s3_job_attachment_settings_with_root_prefix_empty, - asset_manifest_version=manifest_version, - ) - - def test_asset_management_manifest_version_not_implemented(self, farm_id, queue_id, tmpdir): - """ - Test that the appropriate error is raised when the library doesn't support an asset manifest version.
- """ - with patch( - f"{deadline.__package__}.job_attachments.upload.ManifestModelRegistry.get_manifest_model", - return_value=BaseManifestModel, - ): - with pytest.raises( - NotImplementedError, - match=r"Creation of manifest version (ManifestVersion.)?UNDEFINED is not supported.", - ): - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=ManifestVersion.UNDEFINED, - ) - cache_dir = tmpdir.mkdir("cache") - test_file = tmpdir.join("test.txt") - test_file.write("test") - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=[test_file], - output_paths=[], - referenced_paths=[], - ) - asset_manager.hash_assets_and_create_manifest( - upload_group.asset_groups, - upload_group.total_input_files, - upload_group.total_input_bytes, - hash_cache_dir=cache_dir, - ) - - def test_asset_uploader_constructor(self, fresh_deadline_config): - """ - Test that when the asset uploader is created, the instance variables are correctly set. - """ - uploader = S3AssetUploader(s3_max_pool_connections=50, small_file_threshold_multiplier=20) - assert uploader.num_upload_workers == 5 - assert uploader.small_file_threshold == 20 * 8 * (1024**2) - - def test_asset_uploader_constructor_with_non_integer_config_settings( - self, fresh_deadline_config - ): - """ - Tests that the asset uploader works correctly with valid integer params. - (Non-integer config parsing is now the caller's responsibility.) - """ - uploader = S3AssetUploader(s3_max_pool_connections=50, small_file_threshold_multiplier=20) - assert uploader.num_upload_workers >= 1 - - @pytest.mark.parametrize( - "setting_name, nonvalid_value, expected_error_msg", - [ - pytest.param( - "s3_max_pool_connections", - "-100", - "'s3_max_pool_connections' (-100) must be positive integer.", - id="s3_max_pool_connections value is negative.", - ), - pytest.param( - "s3_max_pool_connections", - "0", - "'s3_max_pool_connections' (0) must be positive integer.", - id="s3_max_pool_connections value is 0.", - ), - pytest.param( - "s3_max_pool_connections", - "some string", - "Failed to parse configuration settings. Please ensure that the following settings in the config file are integers", - id="s3_max_pool_connections value is not a number.", - ), - pytest.param( - "small_file_threshold_multiplier", - "-12", - "'small_file_threshold_multiplier' (-12) must be positive integer.", - id="small_file_threshold_multiplier value is negative.", - ), - pytest.param( - "small_file_threshold_multiplier", - "some string", - "Failed to parse configuration settings. Please ensure that the following settings in the config file are integers", - id="small_file_threshold_multiplier value is not a number.", - ), - ], - ) - def test_asset_uploader_constructor_with_nonvalid_config_settings( - self, setting_name, nonvalid_value, expected_error_msg, fresh_deadline_config - ): - """ - Tests that when the asset uploader is created with nonvalid settings, an AssetSyncError is raised. 
- """ - kwargs = {"s3_max_pool_connections": 50, "small_file_threshold_multiplier": 20} - try: - kwargs[setting_name] = int(nonvalid_value) - except ValueError: - # Non-integer values can't be passed as int params, so this test case - # is no longer applicable (caller is responsible for parsing) - return - with pytest.raises(AssetSyncError) as err: - _ = S3AssetUploader(**kwargs) - assert expected_error_msg in str(err.value) - - @mock_aws - def test_file_already_uploaded_bucket_in_different_account(self): - """ - Test that the appropriate error is raised when checking if a file has already been uploaded, but the bucket - is in an account that is different from the uploader's account. - """ - s3 = boto3.client("s3") - stubber = Stubber(s3) - stubber.add_client_error( - "head_object", - service_error_code="AccessDenied", - service_message="Access Denied", - http_status_code=403, - ) - - uploader = S3AssetUploader(s3_max_pool_connections=50, small_file_threshold_multiplier=20) - - uploader._s3 = s3 - - with stubber: - with pytest.raises(JobAttachmentsS3ClientError) as err: - uploader.file_already_uploaded( - self.job_attachment_s3_settings.s3BucketName, "test_key" - ) - assert isinstance(err.value.__cause__, ClientError) - assert ( - err.value.__cause__.response["ResponseMetadata"]["HTTPStatusCode"] == 403 # type: ignore[attr-defined] - ) - assert ( - "Error checking if object exists in bucket 'test-bucket', Target key or prefix: 'test_key', " - "HTTP Status Code: 403, Access denied. Ensure that the bucket is in the account 123456789012, " - "and your AWS IAM Role or User has the 's3:ListBucket' permission for this bucket." - ) in str(err.value) - - @mock_aws - def test_file_already_uploaded_timeout(self): - """ - Test that the appropriate error is raised when a ReadTimeoutError occurs - during an S3 request to check file existence in an S3 bucket. - """ - mock_s3_client = MagicMock() - mock_s3_client.head_object.side_effect = ReadTimeoutError(endpoint_url="test_url") - - uploader = S3AssetUploader(s3_max_pool_connections=50, small_file_threshold_multiplier=20) - uploader._s3 = mock_s3_client - - with pytest.raises(AssetSyncError) as err: - uploader.file_already_uploaded(self.job_attachment_s3_settings.s3BucketName, "test_key") - assert isinstance(err.value.__cause__, BotoCoreError) - assert ( - "An issue occurred with AWS service request while checking for the existence of an object in the S3 bucket: " - 'Read timeout on endpoint URL: "test_url"\n' - "This could be due to temporary issues with AWS, internet connection, or your AWS credentials. " - "Please verify your credentials and network connection. If the problem persists, try again later" - " or contact support for further assistance." - ) in str(err.value) - - @mock_aws - def test_upload_bytes_to_s3_bucket_in_different_account(self): - """ - Test that the appropriate error is raised when uploading bytes, but the bucket - is in an account that is different from the uploader's account. - """ - s3 = boto3.client("s3") - stubber = Stubber(s3) - - # This is the error that's surfaced when a bucket is in a different account than expected. 
- stubber.add_client_error( - "put_object", - service_error_code="AccessDenied", - service_message="Access Denied", - http_status_code=403, - ) - - uploader = S3AssetUploader(s3_max_pool_connections=50, small_file_threshold_multiplier=20) - - uploader._s3 = s3 - - with stubber: - with pytest.raises(JobAttachmentsS3ClientError) as err: - uploader.upload_bytes_to_s3( - BytesIO(), self.job_attachment_s3_settings.s3BucketName, "test_key" - ) - assert isinstance(err.value.__cause__, ClientError) - assert ( - err.value.__cause__.response["ResponseMetadata"]["HTTPStatusCode"] == 403 # type: ignore[attr-defined] - ) - assert ( - "Error uploading binary file in bucket 'test-bucket', Target key or prefix: 'test_key', " - "HTTP Status Code: 403, Forbidden or Access denied. " - ) in str(err.value) - - @mock_aws - def test_upload_bytes_to_s3_timeout(self): - """ - Test that the appropriate error is raised when a ReadTimeoutError occurs - during an S3 request to upload a binary file to an S3 bucket. - """ - mock_s3_client = MagicMock() - mock_s3_client.upload_fileobj.side_effect = ReadTimeoutError(endpoint_url="test_url") - - uploader = S3AssetUploader(s3_max_pool_connections=50, small_file_threshold_multiplier=20) - uploader._s3 = mock_s3_client - - with pytest.raises(AssetSyncError) as err: - uploader.upload_bytes_to_s3( - BytesIO(), self.job_attachment_s3_settings.s3BucketName, "test_key" - ) - assert isinstance(err.value.__cause__, BotoCoreError) - assert ( - "An issue occurred with AWS service request while uploading binary file: " - 'Read timeout on endpoint URL: "test_url"\n' - "This could be due to temporary issues with AWS, internet connection, or your AWS credentials. " - "Please verify your credentials and network connection. If the problem persists, try again later" - " or contact support for further assistance." - ) in str(err.value) - - @mock_aws - def test_upload_file_to_s3_bucket_in_different_account(self, tmp_path: Path): - """ - Test that the appropriate error is raised when uploading files, but the bucket - is in an account that is different from the uploader's account. - """ - s3 = boto3.client("s3") - stubber = Stubber(s3) - - # This is the error that's surfaced when a bucket is in a different account than expected. - stubber.add_client_error( - "put_object", - service_error_code="AccessDenied", - service_message="Access Denied", - http_status_code=403, - ) - - uploader = S3AssetUploader(s3_max_pool_connections=50, small_file_threshold_multiplier=20) - - uploader._s3 = s3 - - file = tmp_path / "test_file" - file.write_text("") - - with stubber: - with pytest.raises(JobAttachmentsS3ClientError) as err: - uploader.upload_file_to_s3( - file, self.job_attachment_s3_settings.s3BucketName, "test_key" - ) - assert isinstance(err.value.__cause__, ClientError) - assert ( - err.value.__cause__.response["ResponseMetadata"]["HTTPStatusCode"] == 403 # type: ignore[attr-defined] - ) - assert ( - "Error uploading file in bucket 'test-bucket', Target key or prefix: 'test_key', " - "HTTP Status Code: 403, Forbidden or Access denied. " - ) in str(err.value) - assert (f"(Failed to upload {str(file)})") in str(err.value) - - @mock_aws - def test_upload_file_to_s3_bucket_has_kms_permissions_error(self, tmp_path: Path): - """ - Test that the appropriate error is raised when uploading files, but the bucket - is encrypted with a KMS key and the user doesn't have access to the key. 
- """ - s3 = boto3.client("s3") - stubber = Stubber(s3) - - # This is the error that's surfaced when a bucket is in a different account than expected. - stubber.add_client_error( - "put_object", - service_error_code="AccessDenied", - service_message="An error occurred (AccessDenied) when calling the PutObject operation: User: arn:aws:sts:::assumed-role/ is not authorized to perform: kms:GenerateDataKey on resource: arn:aws:kms:us-west-2::key/ because no identity-based policy allows the kms:GenerateDataKey action", - http_status_code=403, - ) - - uploader = S3AssetUploader(s3_max_pool_connections=50, small_file_threshold_multiplier=20) - - uploader._s3 = s3 - - file = tmp_path / "test_file" - file.write_text("") - - with stubber: - with pytest.raises(JobAttachmentsS3ClientError) as err: - uploader.upload_file_to_s3( - file, self.job_attachment_s3_settings.s3BucketName, "test_key" - ) - assert isinstance(err.value.__cause__, ClientError) - assert ( - err.value.__cause__.response["ResponseMetadata"]["HTTPStatusCode"] == 403 # type: ignore[attr-defined] - ) - assert ( - "If a customer-managed KMS key is set, confirm that your AWS IAM Role or " - "User has the 'kms:GenerateDataKey' and 'kms:DescribeKey' permissions for the key used to encrypt the bucket." - ) in str(err.value) - assert (f"(Failed to upload {str(file)})") in str(err.value) - - @mock_aws - def test_upload_file_to_s3_timeout(self, tmp_path: Path): - """ - Test that the appropriate error is raised when a ReadTimeoutError occurs - during an S3 request to upload a file to an S3 bucket. - """ - mock_future = MagicMock() - mock_transfer_manager = MagicMock() - mock_transfer_manager.upload.return_value = mock_future - mock_future.result.side_effect = ReadTimeoutError(endpoint_url="test_url") - - s3 = boto3.client("s3") - uploader = S3AssetUploader(s3_max_pool_connections=50, small_file_threshold_multiplier=20) - uploader._s3 = s3 - - file = tmp_path / "test_file" - file.write_text("") - - with patch( - f"{deadline.__package__}.job_attachments.upload.get_s3_transfer_manager", - return_value=mock_transfer_manager, - ): - with pytest.raises(AssetSyncError) as err: - uploader.upload_file_to_s3( - file, self.job_attachment_s3_settings.s3BucketName, "test_key" - ) - assert isinstance(err.value.__cause__, BotoCoreError) - assert ( - "An issue occurred with AWS service request while uploading file: " - 'Read timeout on endpoint URL: "test_url"\n' - "This could be due to temporary issues with AWS, internet connection, or your AWS credentials. " - "Please verify your credentials and network connection. If the problem persists, try again later" - " or contact support for further assistance." - ) in str(err.value) - - @pytest.mark.parametrize( - "manifest_version", - [ - ManifestVersion.v2023_03_03, - ], - ) - def test_process_input_path_cached_file_is_updated( - self, farm_id, queue_id, tmpdir, manifest_version: ManifestVersion - ): - """ - Test that a file that exists in the hash cache, but has been modified, will be hashed again. 
- """ - # GIVEN - root_dir = tmpdir.mkdir("root") - test_file = root_dir.join("test.txt") - test_file.write("test") - file_time = os.stat(test_file).st_mtime - expected_entry = HashCacheEntry( - test_file, - HashAlgorithm.XXH128, - "b", - str(datetime.fromtimestamp((file_time))), - ) - - # WHEN - test_entry = HashCacheEntry(test_file, HashAlgorithm.XXH128, "a", "123.45") - hash_cache = MagicMock() - hash_cache.get_connection_entry.return_value = test_entry - - with patch( - f"{deadline.__package__}.job_attachments.upload.hash_file", - side_effect=["b"], - ): - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=manifest_version, - ) - - is_hashed, _, man_path = asset_manager._process_input_path( - Path(test_file), root_dir, hash_cache - ) - - # THEN - assert is_hashed == FileStatus.NEW or is_hashed == FileStatus.MODIFIED - assert man_path.path == "test.txt" - assert man_path.hash == "b" - hash_cache.put_entry.assert_called_with(expected_entry) - - def test_process_input_path_skip_file_already_in_hash_cache(self, farm_id, queue_id, tmpdir): - """ - Test the input files that already exists in the hash cache are skipped hashing. - """ - # GIVEN - root_dir = tmpdir.mkdir("root") - test_file = root_dir.join("test.txt") - test_file.write("test") - file_time = str(datetime.fromtimestamp(os.stat(test_file).st_mtime)) - file_bytes = test_file.size() - - # WHEN - test_entry = HashCacheEntry(test_file, HashAlgorithm.XXH128, "a", file_time) - hash_cache = MagicMock() - hash_cache.get_connection_entry.return_value = test_entry - - with patch( - f"{deadline.__package__}.job_attachments.upload.hash_file", - side_effect=["a"], - ): - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=ManifestVersion.v2023_03_03, - ) - - is_hashed, size, man_path = asset_manager._process_input_path( - Path(test_file), root_dir, hash_cache - ) - _ = asset_manager._create_manifest_file( - [Path(test_file)], root_dir, hash_cache=hash_cache - ) - - # THEN - assert is_hashed == FileStatus.UNCHANGED - assert size == file_bytes - assert man_path.path == "test.txt" - assert man_path.hash == "a" - hash_cache.put_entry.assert_not_called() - - @mock_aws - def test_asset_management_misconfigured_inputs(self, farm_id, queue_id, tmpdir): - """ - Ensure that when directories are classified as files the submission is prevented with a MisconfiguredInputsError. 
- """ - asset_root = str(tmpdir) - - # GIVEN - scene_file = tmpdir.mkdir("scene").join("maya.ma") - scene_file.write("a") - input_not_exist = "/texture/that/doesnt/exist.anywhere" - directory_as_file = str(Path(scene_file).parent) - - with patch( - f"{deadline.__package__}.job_attachments.upload.PathFormat.get_host_path_format", - return_value=PathFormat.POSIX, - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_data", - side_effect=["c", "manifesthash"], - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_file", - side_effect=["a"], - ): - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=ManifestVersion.v2023_03_03, - ) - - # WHEN / THEN - with pytest.raises(MisconfiguredInputsError, match="scene"): - asset_manager.prepare_paths_for_upload( - input_paths=[input_not_exist, directory_as_file, scene_file], - output_paths=[str(Path(asset_root).joinpath("outputs"))], - referenced_paths=[], - ) - - @mock_aws - def test_asset_management_input_not_exists(self, farm_id, queue_id, tmpdir, caplog): - """Test that input paths that do not exist are added to referenced files.""" - asset_root = str(tmpdir) - - # GIVEN - scene_file = tmpdir.mkdir("scene").join("maya.ma") - scene_file.write("a") - input_not_exist = tmpdir.join("/texture/that/does/notexist.anywhere") - - cache_dir = tmpdir.mkdir("cache") - - expected_total_input_bytes = scene_file.size() - - with patch( - f"{deadline.__package__}.job_attachments.upload.PathFormat.get_host_path_format", - return_value=PathFormat.POSIX, - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_data", - side_effect=["c", "manifesthash"], - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_file", - side_effect=["a"], - ): - caplog.set_level(INFO) - - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_assets = MagicMock(return_value=True) - - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=ManifestVersion.v2023_03_03, - ) - - # When - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=[input_not_exist, scene_file], - output_paths=[str(Path(asset_root).joinpath("outputs"))], - referenced_paths=[], - ) - ( - hash_summary_statistics, - asset_root_manifests, - ) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=cache_dir, - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - - upload_summary_statistics, _ = asset_manager.upload_assets( - manifests=asset_root_manifests, - on_uploading_assets=mock_on_uploading_assets, - s3_check_cache_dir=cache_dir, - ) - - # Then - assert "notexist.anywhere' does not exist. Adding to referenced paths." 
in caplog.text - assert len(upload_group.asset_groups) == 1 - assert len(upload_group.asset_groups[0].references) == 1 - assert Path(input_not_exist) in upload_group.asset_groups[0].references - - assert_progress_report_last_callback( - num_input_files=1, - expected_total_input_bytes=expected_total_input_bytes, - on_preparing_to_submit=mock_on_preparing_to_submit, - on_uploading_assets=mock_on_uploading_assets, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=hash_summary_statistics, - processed_files=1, - processed_bytes=expected_total_input_bytes, - skipped_files=0, - skipped_bytes=0, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=upload_summary_statistics, - processed_files=1, - processed_bytes=expected_total_input_bytes, - skipped_files=0, - skipped_bytes=0, - ) - - @mock_aws - def test_asset_management_input_not_exists_require_fails(self, farm_id, queue_id, tmpdir): - """Test that input paths that do not exist raise a MisconfiguredInputsError if the `require_paths_exist` flag is true.""" - asset_root = str(tmpdir) - - # GIVEN - scene_file = tmpdir.mkdir("scene").join("maya.ma") - scene_file.write("a") - input_not_exist = "/texture/that/does/notexist.anywhere" - - with patch( - f"{deadline.__package__}.job_attachments.upload.PathFormat.get_host_path_format", - return_value=PathFormat.POSIX, - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_data", - side_effect=["c", "manifesthash"], - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_file", - side_effect=["a"], - ): - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=ManifestVersion.v2023_03_03, - ) - - # When - with pytest.raises(MisconfiguredInputsError, match="Missing input files") as execinfo: - asset_manager.prepare_paths_for_upload( - input_paths=[input_not_exist, scene_file], - output_paths=[str(Path(asset_root).joinpath("outputs"))], - referenced_paths=[], - require_paths_exist=True, - ) - - assert "notexist.anywhere" in str(execinfo) - - @mock_aws - @pytest.mark.parametrize( - "manifest_version,expected_manifest", - [ - ( - ManifestVersion.v2023_03_03, - '{"hashAlg":"xxh128","manifestVersion":"2023-03-03",' - '"paths":[{"hash":"a","mtime":1234000000,"path":"sym_ip_test.txt","size":1}],"totalSize":1}', - ), - ], - ) - @pytest.mark.skipif( - is_windows_non_admin(), - reason="Windows requires Admin to create symlinks, skipping this test.", - ) - def test_manage_assets_with_symlinks( - self, - tmpdir: py.path.local, - farm_id, - queue_id, - assert_canonical_manifest, - assert_expected_files_on_s3, - manifest_version: ManifestVersion, - expected_manifest: str, - ): - """ - Test that symlink paths that contain '..' expand the full path without - resolving the symlink target, but also hash the symlink target and not - the link. 
- - /tmp/source_folder/test.txt - /symlink-folder - """ - # Given - test_file = tmpdir.mkdir("source_folder").join("test.txt") - test_file.write("a") - - expected_total_input_bytes = test_file.size() - os.utime(test_file, (1234, 1234)) - - source_path = Path(tmpdir.join("source_folder")) - symlink_input_path = Path( - tmpdir.mkdir("symlink_folder").join("sub_folder").join("..").join("sym_ip_test.txt") - ) - symlink_input_path.symlink_to(str(test_file)) - symlink_output_path = Path( - tmpdir.join("symlink_folder").join("sub_folder").join("..").join("sym_op_test_dir") - ) - symlink_output_path.symlink_to(source_path, target_is_directory=True) - - cache_dir = tmpdir.mkdir("cache") - - # WHEN - with patch( - f"{deadline.__package__}.job_attachments.upload.PathFormat.get_host_path_format", - return_value=PathFormat.POSIX, - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_data", - side_effect=["manifest", "manifesthash"], - ), patch( - f"{deadline.__package__}.job_attachments.upload.hash_file", - side_effect=["a"], - ), patch( - f"{deadline.__package__}.job_attachments.models._generate_random_guid", - return_value="0000", - ): - mock_on_preparing_to_submit = MagicMock(return_value=True) - mock_on_uploading_assets = MagicMock(return_value=True) - - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=manifest_version, - ) - - upload_group = asset_manager.prepare_paths_for_upload( - input_paths=[str(symlink_input_path)], - output_paths=[str(symlink_output_path)], - referenced_paths=[], - ) - ( - hash_summary_statistics, - asset_root_manifests, - ) = asset_manager.hash_assets_and_create_manifest( - asset_groups=upload_group.asset_groups, - total_input_files=upload_group.total_input_files, - total_input_bytes=upload_group.total_input_bytes, - hash_cache_dir=str(cache_dir), - on_preparing_to_submit=mock_on_preparing_to_submit, - ) - - upload_summary_statistics, attachments = asset_manager.upload_assets( - manifests=asset_root_manifests, - on_uploading_assets=mock_on_uploading_assets, - s3_check_cache_dir=str(cache_dir), - ) - - # THEN - expected_root = str(tmpdir.join("symlink_folder")) - expected_attachments = Attachments( - manifests=[ - ManifestProperties( - rootPath=expected_root, - rootPathFormat=PathFormat.POSIX, - inputManifestPath=f"{farm_id}/{queue_id}/Inputs/0000/manifest_input", - inputManifestHash="manifesthash", - outputRelativeDirectories=["sym_op_test_dir"], - ) - ], - ) - - assert attachments == expected_attachments - - assert_progress_report_last_callback( - num_input_files=1, - expected_total_input_bytes=expected_total_input_bytes, - on_preparing_to_submit=mock_on_preparing_to_submit, - on_uploading_assets=mock_on_uploading_assets, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=hash_summary_statistics, - processed_files=1, - processed_bytes=expected_total_input_bytes, - skipped_files=0, - skipped_bytes=0, - ) - - assert_progress_report_summary_statistics( - actual_summary_statistics=upload_summary_statistics, - processed_files=1, - processed_bytes=expected_total_input_bytes, - skipped_files=0, - skipped_bytes=0, - ) - - s3 = boto3.Session(region_name="us-west-2").resource("s3") # pylint: disable=invalid-name - bucket = s3.Bucket(self.job_attachment_s3_settings.s3BucketName) - - assert_expected_files_on_s3( - bucket, - expected_files={ - f"assetRoot/Manifests/{farm_id}/{queue_id}/Inputs/0000/manifest_input", - 
f"{self.job_attachment_s3_settings.full_cas_prefix()}/a.xxh128", - }, - ) - - assert_canonical_manifest( - bucket, - f"assetRoot/Manifests/{farm_id}/{queue_id}/Inputs/0000/manifest_input", - expected_manifest=expected_manifest, - ) - - @pytest.mark.parametrize( - "mock_file_system_locations, expected_result", - [ - ( - [], - ({}, {}), - ), - ( - [ - FileSystemLocation( - name="location-1", - type=FileSystemLocationType.LOCAL, - path="C:\\User\\Movie1", - ), - ], - ({"C:\\User\\Movie1": "location-1"}, {}), - ), - ( - [ - FileSystemLocation( - name="location-1", - type=FileSystemLocationType.SHARED, - path="/mnt/shared/movie1", - ), - ], - ({}, {"/mnt/shared/movie1": "location-1"}), - ), - ( - [ - FileSystemLocation( - name="location-1", - type=FileSystemLocationType.LOCAL, - path="C:\\User\\Movie1", - ), - FileSystemLocation( - name="location-2", - type=FileSystemLocationType.LOCAL, - path="/home/user1/movie1", - ), - FileSystemLocation( - name="location-3", - type=FileSystemLocationType.SHARED, - path="/mnt/shared/movie1", - ), - FileSystemLocation( - name="location-4", - type=FileSystemLocationType.SHARED, - path="/mnt/shared/etc", - ), - ], - ( - { - "C:\\User\\Movie1": "location-1", - "/home/user1/movie1": "location-2", - }, - { - "/mnt/shared/movie1": "location-3", - "/mnt/shared/etc": "location-4", - }, - ), - ), - ], - ) - def test_get_file_system_locations_by_type( - self, - farm_id: str, - queue_id: str, - mock_file_system_locations: List[FileSystemLocation], - expected_result: Tuple[Dict[str, str], Dict[str, str]], - ): - mock_storage_profile_for_queue = StorageProfile( - storageProfileId="sp-0123456789", - displayName="Storage profile 1", - osFamily=StorageProfileOperatingSystemFamily.WINDOWS, - fileSystemLocations=mock_file_system_locations, - ) - - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - ) - - result = asset_manager._get_file_system_locations_by_type( - storage_profile_for_queue=mock_storage_profile_for_queue - ) - - assert result == expected_result - - @pytest.mark.skipif( - sys.platform == "win32", - reason="This test is for paths in POSIX path format and will be skipped on Windows.", - ) - @patch.object(Path, "exists", return_value=True) - @pytest.mark.parametrize( - "input_paths, output_paths, referenced_paths, local_type_locations, shared_type_locations, expected_result", - [ - ( - set(), # input paths - set(), # output paths - set(), # referenced paths - {}, # File System Location (LOCAL type) - {}, # File System Location (SHARED type) - [], - ), - ( - { - "/home/username/DOCS/inputs/input1.txt", - "/HOME/username/DOCS/inputs/input2.txt", - }, # input paths - {"/home/username/docs/outputs"}, # output paths - set(), # referenced paths - {}, # File System Location (LOCAL type) - {}, # File System Location (SHARED type) - [ - AssetRootGroup( - root_path="/", - inputs={ - Path("/home/username/DOCS/inputs/input1.txt"), - Path("/HOME/username/DOCS/inputs/input2.txt"), - }, - outputs={ - Path("/home/username/docs/outputs"), - }, - ), - ], - ), - ( - {"/home/username/docs/inputs/input1.txt"}, # input paths - {"/home/username/docs/outputs"}, # output paths - set(), # referenced paths - {"/home/username/movie1": "Movie 1 - Local"}, # File System Location (LOCAL type) - {}, # File System Location (SHARED type) - [ - AssetRootGroup( - root_path="/home/username/docs", - inputs={ - Path("/home/username/docs/inputs/input1.txt"), - }, - outputs={ - Path("/home/username/docs/outputs"), - }, - ), - 
], - ), - ( - {"/home/username/movie1/inputs/input1.txt"}, # input paths - {"/home/username/movie1/outputs"}, # output paths - set(), # referenced paths - {"/home/username/movie1": "Movie 1 - Local"}, # File System Location (LOCAL type) - {}, # File System Location (SHARED type) - [ - AssetRootGroup( - file_system_location_name="Movie 1 - Local", - root_path="/home/username/movie1", - inputs={ - Path("/home/username/movie1/inputs/input1.txt"), - }, - outputs={ - Path("/home/username/movie1/outputs"), - }, - ), - ], - ), - ( - {"/mnt/shared/movie1/something.txt"}, # input paths - {"/home/username/movie1/outputs"}, # output paths - set(), # referenced paths - {"/home/username/movie1": "Movie 1 - Local"}, # File System Location (LOCAL type) - {"/mnt/shared/movie1": "Movie 1 - Shared"}, # File System Location (SHARED type) - [ - AssetRootGroup( - file_system_location_name="Movie 1 - Local", - root_path="/home/username/movie1/outputs", - inputs=set(), - outputs={ - Path("/home/username/movie1/outputs"), - }, - ), - ], - ), - ( - { - "/home/username/movie1/inputs/input1.txt", - "/home/username/movie1/inputs/input2.txt", - "/home/username/docs/doc1.txt", - "/home/username/docs/doc2.txt", - "/home/username/extra1.txt", - "/mnt/shared/movie1/something.txt", - }, # input paths - { - "/home/username/movie1/outputs1", - "/home/username/movie1/outputs2", - }, # output paths - {"/home/username/movie1/outputs1/referenced/path"}, # referenced paths - {"/home/username/movie1": "Movie 1 - Local"}, # File System Location (LOCAL type) - {"/mnt/shared/movie1": "Movie 1 - Shared"}, # File System Location (SHARED type) - [ - AssetRootGroup( - file_system_location_name="Movie 1 - Local", - root_path="/home/username/movie1", - inputs={ - Path("/home/username/movie1/inputs/input1.txt"), - Path("/home/username/movie1/inputs/input2.txt"), - }, - outputs={ - Path("/home/username/movie1/outputs1"), - Path("/home/username/movie1/outputs2"), - }, - references={Path("/home/username/movie1/outputs1/referenced/path")}, - ), - AssetRootGroup( - root_path="/home/username", - inputs={ - Path("/home/username/docs/doc1.txt"), - Path("/home/username/docs/doc2.txt"), - Path("/home/username/extra1.txt"), - }, - outputs=set(), - ), - ], - ), - ], - ) - def test_get_asset_groups( - self, - farm_id: str, - queue_id: str, - input_paths: Set[str], - output_paths: Set[str], - referenced_paths: Set[str], - local_type_locations: Dict[str, str], - shared_type_locations: Dict[str, str], - expected_result: List[AssetRootGroup], - ): - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - ) - result = asset_manager._get_asset_groups( - input_paths, - output_paths, - referenced_paths, - local_type_locations, - shared_type_locations, - ) - - sorted_result = sorted(result, key=lambda x: x.root_path) - sorted_expected_result = sorted(expected_result, key=lambda x: x.root_path) - - assert sorted_result == sorted_expected_result - - @pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for paths in Windows path format and will be skipped on POSIX-based system.", - ) - @patch.object(Path, "exists", return_value=True) - @pytest.mark.parametrize( - "input_paths, output_paths, referenced_paths, local_type_locations, shared_type_locations, expected_result", - [ - ( - set(), # input paths - set(), # output paths - set(), # referenced paths - {}, # File System Location (LOCAL type) - {}, # File System Location (SHARED type) - [], - ), - ( - 
{"d:\\USERNAME\\DOCS\\inputs\\input1.txt"}, # input paths - {"D:\\username\\docs\\outputs"}, # output paths - set(), # referenced paths - {}, # File System Location (LOCAL type) - {}, # File System Location (SHARED type) - [ - AssetRootGroup( - root_path="D:\\username\\docs", - inputs={ - Path("d:\\USERNAME\\DOCS\\inputs\\input1.txt"), - }, - outputs={ - Path("D:\\username\\docs\\outputs"), - }, - ), - ], - ), - ( - {"D:\\username\\docs\\inputs\\input1.txt"}, # input paths - {"d:\\USERNAME\\DOCS\\outputs"}, # output paths - set(), # referenced paths - {}, # File System Location (LOCAL type) - {}, # File System Location (SHARED type) - [ - AssetRootGroup( - root_path="D:\\username\\docs", - inputs={ - Path("D:\\username\\docs\\inputs\\input1.txt"), - }, - outputs={ - Path("d:\\USERNAME\\DOCS\\outputs"), - }, - ), - ], - ), - ( - {"C:\\username\\docs\\inputs\\input1.txt"}, # input paths - {"C:\\username\\docs\\outputs"}, # output paths - set(), # referenced paths - {"C:\\username\\movie1": "Movie 1 - Local"}, # File System Location (LOCAL type) - {}, # File System Location (SHARED type) - [ - AssetRootGroup( - root_path="C:\\username\\docs", - inputs={ - Path("C:\\username\\docs\\inputs\\input1.txt"), - }, - outputs={ - Path("C:\\username\\docs\\outputs"), - }, - ), - ], - ), - ( - {"C:\\username\\movie1\\inputs\\input1.txt"}, # input paths - {"C:\\username\\movie1\\outputs"}, # output paths - set(), # referenced paths - {"C:\\username\\movie1": "Movie 1 - Local"}, # File System Location (LOCAL type) - {}, # File System Location (SHARED type) - [ - AssetRootGroup( - file_system_location_name="Movie 1 - Local", - root_path="C:\\username\\movie1", - inputs={ - Path("C:\\username\\movie1\\inputs\\input1.txt"), - }, - outputs={ - Path("C:\\username\\movie1\\outputs"), - }, - ), - ], - ), - ( - {"X:\\mnt\\shared\\movie1\\something.txt"}, # input paths - {"C:\\username\\movie1\\outputs"}, # output paths - set(), # referenced paths - {"C:\\username\\movie1": "Movie 1 - Local"}, # File System Location (LOCAL type) - { - "X:\\mnt\\shared\\movie1": "Movie 1 - Shared" - }, # File System Location (SHARED type) - [ - AssetRootGroup( - file_system_location_name="Movie 1 - Local", - root_path="C:\\username\\movie1\\outputs", - inputs=set(), - outputs={ - Path("C:\\username\\movie1\\outputs"), - }, - ), - ], - ), - ( - { - "C:\\username\\movie1\\inputs\\input1.txt", - "C:\\username\\movie1\\inputs\\input2.txt", - "C:\\username\\docs\\doc1.txt", - "C:\\username\\docs\\doc2.txt", - "C:\\username\\extra1.txt", - "X:\\mnt\\shared\\movie1\\something.txt", - }, # input paths - { - "C:\\username\\movie1\\outputs1", - "C:\\username\\movie1\\outputs2", - }, # output paths - {"C:\\username\\movie1\\outputs1\\referenced\\path"}, # referenced paths - {"C:\\username\\movie1": "Movie 1 - Local"}, # File System Location (LOCAL type) - { - "X:\\mnt\\shared\\movie1": "Movie 1 - Shared" - }, # File System Location (SHARED type) - [ - AssetRootGroup( - file_system_location_name="Movie 1 - Local", - root_path="C:\\username\\movie1", - inputs={ - Path("C:\\username\\movie1\\inputs\\input1.txt"), - Path("C:\\username\\movie1\\inputs\\input2.txt"), - }, - outputs={ - Path("C:\\username\\movie1\\outputs1"), - Path("C:\\username\\movie1\\outputs2"), - }, - references={Path("C:\\username\\movie1\\outputs1\\referenced\\path")}, - ), - AssetRootGroup( - root_path="C:\\username", - inputs={ - Path("C:\\username\\docs\\doc1.txt"), - Path("C:\\username\\docs\\doc2.txt"), - Path("C:\\username\\extra1.txt"), - }, - outputs=set(), - ), - ], 
- ), - ], - ) - def test_get_asset_groups_for_windows( - self, - farm_id: str, - queue_id: str, - input_paths: Set[str], - output_paths: Set[str], - referenced_paths: Set[str], - local_type_locations: Dict[str, str], - shared_type_locations: Dict[str, str], - expected_result: List[AssetRootGroup], - ): - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - ) - result = asset_manager._get_asset_groups( - input_paths, - output_paths, - referenced_paths, - local_type_locations, - shared_type_locations, - ) - - sorted_result = sorted(result, key=lambda x: x.root_path) - sorted_expected_result = sorted(expected_result, key=lambda x: x.root_path) - - assert len(sorted_result) == len(sorted_expected_result) - for i in range(len(sorted_result)): - assert sorted_result[i].root_path.upper() == sorted_expected_result[i].root_path.upper() - assert sorted_result[i].inputs == sorted_expected_result[i].inputs - assert sorted_result[i].outputs == sorted_expected_result[i].outputs - - @pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for paths in Windows path format and will be skipped on POSIX-based system.", - ) - @patch.object(Path, "exists", return_value=True) - def test_get_asset_groups_for_windows_case_insensitive( - self, - farm_id: str, - queue_id: str, - ): - """ - Tests that the asset manager can handle Windows paths and ignore case. - (Verifies if two paths are treated as the same when they differ only in letter case.) - """ - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - ) - - input_paths = { - "C:\\username\\docs\\inputs\\input1.txt", - "C:\\username\\DOCS\\inputs\\input1.txt", - } - output_paths = {"C:\\username\\docs\\outputs"} - - result = asset_manager._get_asset_groups( - input_paths, - output_paths, - referenced_paths=set(), - local_type_locations={}, - shared_type_locations={}, - ) - - assert result[0].root_path == "C:\\username\\docs" or "C:\\username\\DOCS" - assert result[0].inputs == {Path("C:\\username\\docs\\inputs\\input1.txt")} or { - Path("C:\\username\\DOCS\\inputs\\input1.txt") - } - assert result[0].outputs == {Path("C:\\username\\docs\\outputs")} - - @pytest.mark.parametrize( - "input_files, size_threshold, expected_queues", - [ - ( - [], - 100 * (1024**2), # 100 MB - ([], []), - ), - ( - [ - BaseManifestPath(path="", hash="", size=10 * (1024**2), mtime=1), - BaseManifestPath(path="", hash="", size=100 * (1024**2), mtime=1), - BaseManifestPath(path="", hash="", size=1000 * (1024**2), mtime=1), - ], - 100 * (1024**2), # 100 MB - ( - [ - BaseManifestPath(path="", hash="", size=10 * (1024**2), mtime=1), - BaseManifestPath(path="", hash="", size=100 * (1024**2), mtime=1), - ], - [ - BaseManifestPath(path="", hash="", size=1000 * (1024**2), mtime=1), - ], - ), - ), - ( - [ - BaseManifestPath(path="", hash="", size=10 * (1024**2), mtime=1), - BaseManifestPath(path="", hash="", size=100 * (1024**2), mtime=1), - ], - 800 * (1024**2), # 800 MB - ( - [ - BaseManifestPath(path="", hash="", size=10 * (1024**2), mtime=1), - BaseManifestPath(path="", hash="", size=100 * (1024**2), mtime=1), - ], - [], - ), - ), - ], - ) - def test_separate_files_by_size( - self, - input_files: List[BaseManifestPath], - size_threshold: int, - expected_queues: Tuple[List[BaseManifestPath], List[BaseManifestPath]], - ): - """ - Tests that a helper method `_separate_files_by_size` is working as expected. 
- """ - a3_asset_uploader = S3AssetUploader( - s3_max_pool_connections=50, small_file_threshold_multiplier=20 - ) - actual_queues = a3_asset_uploader._separate_files_by_size( - files_to_upload=input_files, - size_threshold=size_threshold, - ) - assert actual_queues == expected_queues - - @pytest.mark.parametrize( - "cache_entry", - [ - S3CheckCacheEntry( - s3_key="bucket/Data/test-hash", - last_seen_time=str(datetime.now().timestamp()), - ), - None, - ], - ) - def test_verify_hash_cache_integrity_returns_true_when_cache_and_s3_match(self, cache_entry): - # Given - mock_s3_check_cache_impl = MagicMock() - mock_s3_check_cache_impl.get_connection_entry.return_value = cache_entry - mock_s3_check_cache = MagicMock() - mock_s3_check_cache.__enter__.return_value = mock_s3_check_cache_impl - - mock_s3_client = MagicMock() - mock_s3_client.head_object.return_value = {} - - s3_asset_uploader = S3AssetUploader( - s3_max_pool_connections=50, small_file_threshold_multiplier=20 - ) - s3_asset_uploader._s3 = mock_s3_client - - # When - with patch( - f"{deadline.__package__}.job_attachments.upload.S3CheckCache", - return_value=mock_s3_check_cache, - ): - with patch.object( - s3_asset_uploader, - "file_already_uploaded", - wraps=s3_asset_uploader.file_already_uploaded, - ) as file_already_uploaded_spy: - # Then - # Execute verify_hash_cache_integrity where hash exists in S3 - assert s3_asset_uploader.verify_hash_cache_integrity( - s3_check_cache_dir="cache-dir", - manifest=AssetManifest( - hash_alg=HashAlgorithm.XXH128, - paths=[ - BaseManifestPath( - path="test-file.txt", hash="test-hash", size=5, mtime=1 - ) - ], - total_size=1, - ), - s3_cas_prefix="Data", - s3_bucket="bucket", - ) - if cache_entry: - file_already_uploaded_spy.assert_called_once_with( - bucket="bucket", key="Data/test-hash" - ) - else: - file_already_uploaded_spy.assert_not_called() - - def test_verify_hash_cache_integrity_returns_false_when_cache_and_s3_mismatch(self): - # Given - mock_s3_check_cache_impl = MagicMock() - mock_s3_check_cache_impl.get_connection_entry.return_value = S3CheckCacheEntry( - s3_key="bucket/Data/test-hash", - last_seen_time=str(datetime.now().timestamp()), - ) - mock_s3_check_cache = MagicMock() - mock_s3_check_cache.__enter__.return_value = mock_s3_check_cache_impl - - mock_s3_client = MagicMock() - mock_s3_client.head_object.side_effect = ClientError( - {"ResponseMetadata": {"HTTPStatusCode": 404}}, "HeadObject" - ) - - s3_asset_uploader = S3AssetUploader( - s3_max_pool_connections=50, small_file_threshold_multiplier=20 - ) - s3_asset_uploader._s3 = mock_s3_client - - # When - with patch( - f"{deadline.__package__}.job_attachments.upload.S3CheckCache", - return_value=mock_s3_check_cache, - ): - with patch.object( - s3_asset_uploader, - "file_already_uploaded", - wraps=s3_asset_uploader.file_already_uploaded, - ) as file_already_uploaded_spy: - # Execute verify_hash_cache_integrity where hash does not exist in S3 - # Then - assert not s3_asset_uploader.verify_hash_cache_integrity( - s3_check_cache_dir="cache-dir", - manifest=AssetManifest( - hash_alg=HashAlgorithm.XXH128, - paths=[ - BaseManifestPath( - path="test-file.txt", hash="test-hash", size=5, mtime=1 - ) - ], - total_size=1, - ), - s3_cas_prefix="Data", - s3_bucket="bucket", - ) - - file_already_uploaded_spy.assert_called_once_with( - bucket="bucket", key="Data/test-hash" - ) - - def test_reset_s3_check_cache_removes_cache(self, tmpdir): - # Given - cache_dir: str = "mock_dir" - mock_s3_check_cache_impl = MagicMock() - - mock_s3_check_cache = 
MagicMock() - mock_s3_check_cache.__enter__.return_value = mock_s3_check_cache_impl - - # When - with patch( - f"{deadline.__package__}.job_attachments.upload.S3CheckCache", - return_value=mock_s3_check_cache, - ): - # Execute reset_s3_check_cache where the cached hash does not exist in S3 - s3_asset_uploader = S3AssetUploader( - s3_max_pool_connections=50, small_file_threshold_multiplier=20 - ) - s3_asset_uploader.reset_s3_check_cache(cache_dir) - - # Then - mock_s3_check_cache_impl.remove_cache.assert_called_once() - - @mock_aws - @pytest.mark.parametrize( - "manifest_version", - [ - ManifestVersion.v2023_03_03, - ], - ) - def test_upload_object_to_cas_skips_upload_with_cache( - self, - tmpdir, - farm_id, - queue_id, - manifest_version, - default_job_attachment_s3_settings, - ): - """ - Tests that objects are not uploaded to S3 if there is a corresponding entry in the S3CheckCache - """ - # Given - asset_root = tmpdir.mkdir("test-root") - test_file = asset_root.join("test-file.txt") - test_file.write("stuff") - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=manifest_version, - ) - s3_key = f"{default_job_attachment_s3_settings.s3BucketName}/prefix/test-hash.xxh128" - test_entry = S3CheckCacheEntry(s3_key, "123.45") - s3_cache = MagicMock() - s3_cache.get_connection_entry.return_value = test_entry - - # When - with patch.object( - asset_manager.asset_uploader, - "_get_current_timestamp", - side_effect=["345.67"], - ): - is_uploaded, file_size = asset_manager.asset_uploader.upload_object_to_cas( - file=BaseManifestPath(path="test-file.txt", hash="test-hash", size=5, mtime=1), - hash_algorithm=HashAlgorithm.XXH128, - s3_bucket=default_job_attachment_s3_settings.s3BucketName, - source_root=Path(asset_root), - s3_cas_prefix="prefix", - s3_check_cache=s3_cache, - ) - - # Then - assert not is_uploaded - assert file_size == 5 - s3_cache.put_entry.assert_not_called() - - @mock_aws - @pytest.mark.parametrize( - "manifest_version", - [ - ManifestVersion.v2023_03_03, - ], - ) - @pytest.mark.parametrize( - "file_exists_in_s3, expected_upload", - [ - pytest.param(True, False, id="file-exists-skip-upload"), - pytest.param(False, True, id="file-missing-do-upload"), - ], - ) - def test_upload_object_to_cas_force_s3_check_bypasses_cache( - self, - tmpdir, - farm_id, - queue_id, - manifest_version, - default_job_attachment_s3_settings, - file_exists_in_s3, - expected_upload, - ): - """ - Tests that when force_s3_check=True, the S3CheckCache is bypassed and S3 HEAD is always performed. - Verifies that upload happens only when file doesn't exist in S3. 
- """ - # Given - asset_root = tmpdir.mkdir("test-root") - test_file = asset_root.join("test-file.txt") - test_file.write("stuff") - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=manifest_version, - ) - s3_key = f"{default_job_attachment_s3_settings.s3BucketName}/prefix/test-hash.xxh128" - test_entry = S3CheckCacheEntry(s3_key, "123.45") - s3_cache = MagicMock() - # Cache has an entry, but it should be bypassed - s3_cache.get_connection_entry.return_value = test_entry - - # When - with patch.object( - asset_manager.asset_uploader, - "_get_current_timestamp", - side_effect=["345.67"], - ), patch.object( - asset_manager.asset_uploader, - "file_already_uploaded", - return_value=file_exists_in_s3, - ) as mock_file_already_uploaded, patch.object( - asset_manager.asset_uploader, - "upload_file_to_s3", - ) as mock_upload_file_to_s3: - is_uploaded, file_size = asset_manager.asset_uploader.upload_object_to_cas( - file=BaseManifestPath(path="test-file.txt", hash="test-hash", size=5, mtime=1), - hash_algorithm=HashAlgorithm.XXH128, - s3_bucket=default_job_attachment_s3_settings.s3BucketName, - source_root=Path(asset_root), - s3_cas_prefix="prefix", - s3_check_cache=s3_cache, - force_s3_check=True, - ) - - # Then - assert is_uploaded == expected_upload - assert file_size == 5 - # Cache lookup should NOT have been called (bypassed due to force_s3_check) - s3_cache.get_connection_entry.assert_not_called() - # S3 HEAD should always be called when force_s3_check=True - mock_file_already_uploaded.assert_called_once_with( - default_job_attachment_s3_settings.s3BucketName, - "prefix/test-hash.xxh128", - ) - # Upload should only happen if file doesn't exist in S3 - if expected_upload: - mock_upload_file_to_s3.assert_called_once() - else: - mock_upload_file_to_s3.assert_not_called() - # Cache should always be updated after HEAD/upload - expected_new_entry = S3CheckCacheEntry(s3_key, "345.67") - s3_cache.put_entry.assert_called_once_with(expected_new_entry) - - def test_open_non_symlink_file_binary(self, tmp_path: Path): - temp_file = tmp_path / "temp_file.txt" - temp_file.write_text("this is test file") - - a3_asset_uploader = S3AssetUploader( - s3_max_pool_connections=50, small_file_threshold_multiplier=20 - ) - with a3_asset_uploader._open_non_symlink_file_binary(str(temp_file)) as file_obj: - assert file_obj is not None - assert file_obj.read() == b"this is test file" - - def test_open_non_symlink_file_binary_posix_fail(self, tmp_path: Path, caplog): - caplog.set_level(DEBUG) - - # IF - target_file = tmp_path / "target_file.txt" - target_file.write_text(("This is target")) - symlink_path = tmp_path / "symlink" - os.symlink(target_file, symlink_path) - - # WHEN - a3_asset_uploader = S3AssetUploader( - s3_max_pool_connections=50, small_file_threshold_multiplier=20 - ) - with a3_asset_uploader._open_non_symlink_file_binary(str(symlink_path)) as file_obj: - # THEN - assert file_obj is None - assert ( - f"Failed to open file. 
The following file will be skipped: {symlink_path}" - in caplog.text - ) - if hasattr(os, "O_NOFOLLOW") is False: - # Windows or other platforms that don't support O_NOFOLLOW - assert "Mismatch between path and its final path" in caplog.text - else: - # Posix - assert "Too many levels of symbolic links:" in caplog.text - - @mock_aws - @pytest.mark.parametrize( - "manifest_version", - [ - ManifestVersion.v2023_03_03, - ], - ) - def test_upload_object_to_cas_adds_cache_entry( - self, - tmpdir, - farm_id, - queue_id, - manifest_version, - default_job_attachment_s3_settings, - assert_expected_files_on_s3, - ): - """ - Tests that when an object is added to the CAS, an S3 cache entry is added. - """ - # Given - asset_root = tmpdir.mkdir("test-root") - test_file = asset_root.join("test-file.txt") - test_file.write("stuff") - asset_manager = S3AssetManager( - farm_id=farm_id, - queue_id=queue_id, - job_attachment_settings=self.job_attachment_s3_settings, - asset_manifest_version=manifest_version, - ) - s3_key = f"{default_job_attachment_s3_settings.s3BucketName}/prefix/test-hash.xxh128" - s3_cache = MagicMock() - s3_cache.get_connection_entry.return_value = None - expected_new_entry = S3CheckCacheEntry(s3_key, "345.67") - - # When - with patch.object( - asset_manager.asset_uploader, - "_get_current_timestamp", - side_effect=["345.67"], - ): - is_uploaded, file_size = asset_manager.asset_uploader.upload_object_to_cas( - file=BaseManifestPath(path="test-file.txt", hash="test-hash", size=5, mtime=1), - hash_algorithm=HashAlgorithm.XXH128, - s3_bucket=default_job_attachment_s3_settings.s3BucketName, - source_root=Path(asset_root), - s3_cas_prefix="prefix", - s3_check_cache=s3_cache, - ) - - # Then - assert is_uploaded - assert file_size == 5 - s3_cache.put_entry.assert_called_once_with(expected_new_entry) - - s3 = boto3.Session(region_name="us-west-2").resource("s3") # pylint: disable=invalid-name - bucket = s3.Bucket(self.job_attachment_s3_settings.s3BucketName) - - assert_expected_files_on_s3( - bucket, - expected_files={"prefix/test-hash.xxh128"}, - ) - - def test_gather_upload_metadata(self): - # Given - manifest = AssetManifest( - hash_alg=HashAlgorithm("xxh128"), - total_size=10, - paths=[ - BaseManifestPath(path="output_file", hash="a", size=1, mtime=167907934333848), - BaseManifestPath( - path="output/nested_output_file", - hash="b", - size=1, - mtime=1479079344833848, - ), - ], - ) - # When - hash_alg, _, manifest_name = S3AssetUploader._gather_upload_metadata( - manifest, Path("mocksourcerootpath"), "suffix" - ) - # Then - assert hash_alg == HashAlgorithm.XXH128 - assert manifest_name == "73addc7c69ddec53bd8d9df653add3c4_suffix" - - def test_get_hashed_file_name_from_root_str(self): - # Given - manifest = AssetManifest( - hash_alg=HashAlgorithm("xxh128"), - total_size=10, - paths=[ - BaseManifestPath(path="output_file", hash="a", size=1, mtime=167907934333848), - BaseManifestPath( - path="output/nested_output_file", - hash="b", - size=1, - mtime=1479079344833848, - ), - ], - ) - # When - hash_alg, manifest_name = S3AssetUploader._get_hashed_file_name_from_root_str( - manifest, "C:\\Program Files\\My App\\data.txt", "suffix" - ) - # Then - assert hash_alg == HashAlgorithm.XXH128 - assert manifest_name == "da20652d1ff9f1dc55050b28b3ab6d36_suffix" - - def test_get_total_size_of_files_with_valid_files(self, tmp_path: Path): - """Test that _get_total_size_of_files correctly calculates total size for valid files.""" - # Create test files with known sizes - file1 = tmp_path / "file1.txt" - file2 = 
tmp_path / "file2.txt" - file1.write_text("hello") # 5 bytes - file2.write_text("world!") # 6 bytes - - manager = S3AssetManager( - farm_id="farm-123", - queue_id="queue-456", - job_attachment_settings=JobAttachmentS3Settings( - s3BucketName="test-bucket", rootPrefix="test-prefix" - ), - ) - - total_size = manager._get_total_size_of_files([str(file1), str(file2)]) - assert total_size == 11 - - def test_get_total_size_of_files_with_missing_files(self, tmp_path: Path, caplog): - """Test that _get_total_size_of_files handles missing files gracefully.""" - # Create one real file and reference non-existent files - real_file = tmp_path / "real.txt" - real_file.write_text("content") # 7 bytes - missing_file = tmp_path / "missing.txt" - - manager = S3AssetManager( - farm_id="farm-123", - queue_id="queue-456", - job_attachment_settings=JobAttachmentS3Settings( - s3BucketName="test-bucket", rootPrefix="test-prefix" - ), - ) - - total_size = manager._get_total_size_of_files([str(real_file), str(missing_file)]) - assert total_size == 7 - assert "Skipping file in size calculation" in caplog.text - - def test_get_total_size_of_files_uses_threadpool(self, tmp_path: Path): - """Test that _get_total_size_of_files uses ThreadPoolExecutor with correct configuration.""" - test_file = tmp_path / "test.txt" - test_file.write_text("test") - - manager = S3AssetManager( - farm_id="farm-123", - queue_id="queue-456", - job_attachment_settings=JobAttachmentS3Settings( - s3BucketName="test-bucket", rootPrefix="test-prefix" - ), - ) - - with patch( - "deadline.job_attachments.upload.concurrent.futures.ThreadPoolExecutor" - ) as mock_executor: - mock_executor.return_value.__enter__.return_value.map.return_value = [4] - - manager._get_total_size_of_files([str(test_file)]) - - mock_executor.assert_called_once_with(max_workers=8) - - def test_get_total_size_of_files_empty_list(self): - """Test that _get_total_size_of_files returns 0 for empty file list.""" - manager = S3AssetManager( - farm_id="farm-123", - queue_id="queue-456", - job_attachment_settings=JobAttachmentS3Settings( - s3BucketName="test-bucket", rootPrefix="test-prefix" - ), - ) - - total_size = manager._get_total_size_of_files([]) - assert total_size == 0 - - def test_get_total_input_size_from_manifests_uses_manifest_sizes(self): - """Test that _get_total_input_size_from_manifests uses manifest path sizes directly.""" - from deadline.job_attachments.models import AssetRootManifest - - manager = S3AssetManager( - farm_id="farm-123", - queue_id="queue-456", - job_attachment_settings=JobAttachmentS3Settings( - s3BucketName="test-bucket", rootPrefix="test-prefix" - ), - ) - - # Create manifests with known file sizes - manifest1 = AssetManifest( - hash_alg=HashAlgorithm.XXH128, - total_size=150, - paths=[ - BaseManifestPath(path="file1.txt", hash="hash1", size=100, mtime=1234567890), - BaseManifestPath(path="file2.txt", hash="hash2", size=50, mtime=1234567891), - ], - ) - - manifest2 = AssetManifest( - hash_alg=HashAlgorithm.XXH128, - total_size=75, - paths=[ - BaseManifestPath(path="file3.txt", hash="hash3", size=75, mtime=1234567892), - ], - ) - - root_manifests = [ - AssetRootManifest(root_path="/path1", asset_manifest=manifest1), - AssetRootManifest(root_path="/path2", asset_manifest=manifest2), - AssetRootManifest(root_path="/path3", asset_manifest=None), # No manifest - ] - - total_files, total_bytes = manager._get_total_input_size_from_manifests(root_manifests) - - assert total_files == 3 # 2 files from manifest1 + 1 file from manifest2 - assert total_bytes 
== 225 # 100 + 50 + 75 - - def test_get_total_input_size_from_manifests_empty_manifests(self): - """Test that _get_total_input_size_from_manifests handles empty manifest list.""" - manager = S3AssetManager( - farm_id="farm-123", - queue_id="queue-456", - job_attachment_settings=JobAttachmentS3Settings( - s3BucketName="test-bucket", rootPrefix="test-prefix" - ), - ) - - total_files, total_bytes = manager._get_total_input_size_from_manifests([]) - - assert total_files == 0 - assert total_bytes == 0 - - def test_get_total_input_size_from_manifests_no_asset_manifests(self): - """Test that _get_total_input_size_from_manifests handles root manifests with no asset manifests.""" - from deadline.job_attachments.models import AssetRootManifest - - manager = S3AssetManager( - farm_id="farm-123", - queue_id="queue-456", - job_attachment_settings=JobAttachmentS3Settings( - s3BucketName="test-bucket", rootPrefix="test-prefix" - ), - ) - - root_manifests = [ - AssetRootManifest(root_path="/path1", asset_manifest=None), - AssetRootManifest(root_path="/path2", asset_manifest=None), - ] - - total_files, total_bytes = manager._get_total_input_size_from_manifests(root_manifests) - - assert total_files == 0 - assert total_bytes == 0 - - -def assert_progress_report_last_callback( - num_input_files: int, - expected_total_input_bytes: int, - on_preparing_to_submit: MagicMock, - on_uploading_assets: MagicMock, -): - """ - Assert that the argument of the last callback (when the progress is 100%) is as expected. - """ - readable_total_input_bytes = human_readable_file_size(expected_total_input_bytes) - actual_args, _ = on_preparing_to_submit.call_args - actual_last_hashing_progress_report = actual_args[0] - assert actual_last_hashing_progress_report.status == ProgressStatus.PREPARING_IN_PROGRESS - assert actual_last_hashing_progress_report.progress == 100.0 - assert ( - f"Processed {readable_total_input_bytes} / {readable_total_input_bytes}" - f" of {num_input_files} file{'' if num_input_files == 1 else 's'}" - " (Hashing speed: " - ) in actual_last_hashing_progress_report.progressMessage - - actual_args, _ = on_uploading_assets.call_args - actual_last_upload_progress_report = actual_args[0] - assert actual_last_upload_progress_report.status == ProgressStatus.UPLOAD_IN_PROGRESS - assert actual_last_upload_progress_report.progress == 100.0 - assert ( - f"Uploaded {readable_total_input_bytes} / {readable_total_input_bytes}" - f" of {num_input_files} file{'' if num_input_files == 1 else 's'}" - " (Transfer rate: " - ) in actual_last_upload_progress_report.progressMessage - - -def assert_progress_report_summary_statistics( - actual_summary_statistics: SummaryStatistics, - processed_files: int, - processed_bytes: int, - skipped_files: int, - skipped_bytes: int, -): - """ - Assert that the reported summary statistics are as expected. 
- """ - expected_summary_statistics = SummaryStatistics( - total_time=actual_summary_statistics.total_time, - total_files=processed_files + skipped_files, - total_bytes=processed_bytes + skipped_bytes, - processed_files=processed_files, - processed_bytes=processed_bytes, - skipped_files=skipped_files, - skipped_bytes=skipped_bytes, - transfer_rate=processed_bytes / actual_summary_statistics.total_time, - ) - assert actual_summary_statistics == expected_summary_statistics diff --git a/test/unit/deadline_job_attachments/test_utils.py b/test/unit/deadline_job_attachments/test_utils.py deleted file mode 100644 index 58d1b71ae..000000000 --- a/test/unit/deadline_job_attachments/test_utils.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -from pathlib import Path -import sys - -import pytest - -from deadline.job_attachments._utils import ( - _normalize_windows_path, - _is_relative_to, - _retry, -) - - -class TestUtils: - @pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for paths in Windows format and will be skipped on non-Windows systems.", - ) - @pytest.mark.parametrize( - ("input_path", "expected"), - [ - (r"\\?\C:\path\to\file.txt", Path(r"C:\path\to\file.txt")), - (r"\\?\D:\another\long\path", Path(r"D:\another\long\path")), - (r"C:\normal\path.txt", Path(r"C:\normal\path.txt")), - (r"Z:\already\normal\path", Path(r"Z:\already\normal\path")), - ], - ) - def test_normalize_windows_path(self, input_path, expected): - """ - Tests if _normalize_windows_path correctly strips the \\?\\ prefix - from Windows extended-length paths. - """ - assert _normalize_windows_path(Path(input_path)) == expected - - @pytest.mark.skipif( - sys.platform == "win32", - reason="This test is for paths in POSIX path format and will be skipped on Windows.", - ) - @pytest.mark.parametrize( - ("path1", "path2", "expected"), - [ - ("/a/b/c", "/a/b", True), - (Path("/a/b/c.txt"), "/a", True), - ("a/b/c", "a/b", True), - (Path("a/b/c.txt"), "a", True), - ("/a/b/c", "a/b", False), - ("a/b/c", "/a/b", False), - ("/a/b/c", "/d", False), - ("a/b/c", "b", False), - ("a/b/c", "d", False), - ], - ) - def test_is_relative_to_on_posix(self, path1, path2, expected): - """ - Tests if the is_relative_to() works correctly when using Posix paths. 
- """ - assert _is_relative_to(path1, path2) == expected - - @pytest.mark.skipif( - sys.platform != "win32", - reason="This test is for paths in Windows path format and will be skipped on non-Windows.", - ) - @pytest.mark.parametrize( - ("path1", "path2", "expected"), - [ - ("C:/a/b/c", "C:/a/b", True), - (Path("C:/a/b/c.txt"), "C:/a", True), - ("C:\\a\\b\\c", "C:\\a\\b", True), - (Path("C:\\a\\b\\c.txt"), "C:\\a", True), - ("a/b/c", "a/b", True), - (Path("a/b/c.txt"), "a", True), - ("C:/a/b/c", "a/b", False), - ("a/b/c", "C:/a/b", False), - ("C:/a/b/c", "C:/d", False), - ("a/b/c", "b", False), - ("a/b/c", "d", False), - ( - "\\\\?\\C:\\path\\to\\a\\very\\long\\file\\path\\that\\exceeds\\the\\windows\\max\\path\\length\\for\\testing\\max\\file\\path\\error\\handling\\when\\comparing\\path\\relativity\\using\\job\\attachments", - "C:\\path\\to\\", - True, - ), - ( - "\\\\?\\C:\\path\\to\\a\\very\\long\\file\\path\\that\\exceeds\\the\\windows\\max\\path\\length\\for\\testing\\max\\file\\path\\error\\handling\\when\\comparing\\path\\relativity\\using\\job\\attachments", - "C:\\path\\doesnt\\exist\\", - False, - ), - ( - "\\\\?\\C:\\ProgramData\\Amazon\\OpenJD\\session-612345a668724122b6949a232cb4583e1234567d\\assetroot-777691d8674399c12345\\Desktop\\resources\\isolated-black-tree-silhouettes-white-background-shade-trees-used-product-design-isolated-black-tree-silhouettes-1270.jpg", - Path( - "C:\\ProgramData\\Amazon\\OpenJD\\session-612345a668724122b6949a232cb4583e1234567d\\assetroot-777691d8674399c12345" - ), - True, - ), - ], - ) - def test_is_relative_to_on_windows(self, path1, path2, expected): - """ - Tests if the is_relative_to() works correctly when using Windows paths. - """ - assert _is_relative_to(path1, path2) == expected - - def test_retry(self): - """ - Test a function that throws an exception is retried. - """ - call_count = 0 - - # Given - @_retry(ExceptionToCheck=NotImplementedError, tries=2, delay=0.1, backoff=0.1) - def test_bad_function(): - nonlocal call_count - call_count = call_count + 1 - if call_count == 1: - raise NotImplementedError() - - # When - test_bad_function() - - # Then - assert call_count == 2 diff --git a/test/unit/deadline_job_attachments/test_vfs.py b/test/unit/deadline_job_attachments/test_vfs.py deleted file mode 100644 index ddb0c5818..000000000 --- a/test/unit/deadline_job_attachments/test_vfs.py +++ /dev/null @@ -1,896 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
- -"""Tests for the Asset Synching class for task-level attachments.""" - -import os -import stat -import sys -from pathlib import Path -import subprocess -import threading -from typing import Union -from unittest.mock import Mock, patch, call, MagicMock - -import pytest - -import deadline -from deadline.job_attachments.asset_sync import AssetSync -from deadline.job_attachments.exceptions import ( - VFSExecutableMissingError, - VFSLaunchScriptMissingError, -) -from deadline.job_attachments.models import JobAttachmentS3Settings -from deadline.job_attachments.vfs import ( - VFSProcessManager, - DEADLINE_VFS_ENV_VAR, - DEADLINE_VFS_CACHE_ENV_VAR, - DEADLINE_VFS_EXECUTABLE, - DEADLINE_VFS_EXECUTABLE_SCRIPT, - DEADLINE_VFS_INSTALL_PATH, - DEADLINE_VFS_PID_FILE_NAME, - DEADLINE_MANIFEST_GROUP_READ_PERMS, - VFS_LOGS_FOLDER_IN_SESSION, -) - - -# TODO: Remove the skip once we support Windows for AssetSync -@pytest.mark.skipif(sys.platform == "win32", reason="VFS doesn't currently support Windows") -class TestVFSProcessmanager: - @pytest.fixture(autouse=True) - def setup_and_teardown( - self, - request, - create_s3_bucket, - default_job_attachment_s3_settings: JobAttachmentS3Settings, - default_asset_sync: AssetSync, - ): - """ - Setup the default queue and s3 bucket for all asset tests. - Mark test with `no_setup` if you don't want this setup to run. - After test completes, reset all static VFSProcessManager fields - """ - if "no_setup" in request.keywords: - return - - create_s3_bucket(bucket_name=default_job_attachment_s3_settings.s3BucketName) - self.default_asset_sync = default_asset_sync - self.s3_settings = default_job_attachment_s3_settings - - yield - - # reset VFSProcessManager fields - VFSProcessManager.exe_path = None - VFSProcessManager.launch_script_path = None - VFSProcessManager.library_path = None - VFSProcessManager.cwd_path = None - - def test_build_launch_command( - self, - tmp_path: Path, - ): - os.environ[DEADLINE_VFS_ENV_VAR] = str((Path(__file__) / "deadline_vfs").resolve()) - session_dir: str = str(tmp_path) - dest_dir: str = "assetroot-27bggh78dd2b568ab123" - local_root: str = f"{session_dir}/{dest_dir}" - manifest_path: str = f"{local_root}/manifest.json" - test_os_user = "test-user" - # Create process manager without CAS prefix - process_manager: VFSProcessManager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path, - mount_point=local_root, - os_user=test_os_user, - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - - test_executable = os.environ[DEADLINE_VFS_ENV_VAR] + DEADLINE_VFS_EXECUTABLE_SCRIPT - - expected_launch_command = ( - f"sudo -E -u {test_os_user}" - f" {test_executable} {local_root} -f --clienttype=deadline" - f" --bucket={self.s3_settings.s3BucketName}" - f" --manifest={manifest_path}" - f" --region={os.environ['AWS_DEFAULT_REGION']}" - f" -oallow_other" - ) - with patch( - f"{deadline.__package__}.job_attachments.vfs.os.path.exists", - return_value=True, - ): - assert ( - process_manager.build_launch_command(mount_point=local_root) - == expected_launch_command - ) - - # Create process manager with CAS prefix - test_CAS_prefix: str = "test_prefix" - process_manager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path, - mount_point=local_root, - os_user=test_os_user, - os_env_vars={"AWS_PROFILE": "test-profile"}, - cas_prefix=test_CAS_prefix, - ) - - # intermediate cleanup - 
VFSProcessManager.launch_script_path = None - - expected_launch_command = ( - f"sudo -E -u {test_os_user}" - f" {test_executable} {local_root} -f --clienttype=deadline" - f" --bucket={self.s3_settings.s3BucketName}" - f" --manifest={manifest_path}" - f" --region={os.environ['AWS_DEFAULT_REGION']}" - f" -oallow_other" - f" --casprefix={test_CAS_prefix}" - ) - with patch( - f"{deadline.__package__}.job_attachments.vfs.os.path.exists", - return_value=True, - ): - assert ( - process_manager.build_launch_command(mount_point=local_root) - == expected_launch_command - ) - - def test_find_vfs_with_env_set( - self, - tmp_path: Path, - ): - session_dir: str = str(tmp_path) - dest_dir: str = "assetroot-27bggh78dd2b568ab123" - local_root: str = f"{session_dir}/{dest_dir}" - manifest_path: str = f"{local_root}/manifest.json" - os.environ[DEADLINE_VFS_ENV_VAR] = str((Path(__file__) / "deadline_vfs").resolve()) - - # Create process manager without CAS prefix - process_manager: VFSProcessManager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path, - mount_point=local_root, - os_user="test-user", - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - - # verify which is only called when class path is not set - with patch(f"{deadline.__package__}.job_attachments.vfs.shutil.which") as mock_which: - mock_which.return_value = "/test/path" - test_path: Union[os.PathLike, str] = process_manager.find_vfs() - - assert str(test_path) == "/test/path" - test_path = process_manager.find_vfs() - mock_which.assert_called_once() - - # Reset VFS path and remove from PATH so other methods are checked - mock_which.return_value = None - VFSProcessManager.exe_path = None - - with patch( - f"{deadline.__package__}.job_attachments.vfs.os.path.exists" - ) as mock_path_exists: - mock_path_exists.return_value = False - - with pytest.raises(VFSExecutableMissingError): - process_manager.find_vfs() - - # Verify DEADLINE_VFS_ENV_VAR location is checked - # Verify bin folder is checked as a last resort - mock_path_exists.assert_has_calls( - [ - call(os.environ[DEADLINE_VFS_ENV_VAR] + f"/bin/{DEADLINE_VFS_EXECUTABLE}"), - call(os.path.join(os.getcwd(), f"bin/{DEADLINE_VFS_EXECUTABLE}")), - ] - ) - - def test_find_vfs_with_deadline_env_set( - self, - tmp_path: Path, - ): - session_dir: str = str(tmp_path) - dest_dir: str = "assetroot-27bggh78dd2b568ab123" - local_root: str = f"{session_dir}/{dest_dir}" - manifest_path: str = f"{local_root}/manifest.json" - os.environ[DEADLINE_VFS_ENV_VAR] = str((Path(__file__) / "deadline_vfs").resolve()) - - # Create process manager without CAS prefix - process_manager: VFSProcessManager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path, - mount_point=local_root, - os_user="test-user", - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - - # verify which is only called when class path is not set - with patch(f"{deadline.__package__}.job_attachments.vfs.shutil.which") as mock_which: - mock_which.return_value = "/test/path" - test_path: Union[os.PathLike, str] = process_manager.find_vfs() - - assert str(test_path) == "/test/path" - test_path = process_manager.find_vfs() - mock_which.assert_called_once() - - # Reset VFS path and remove from PATH so other methods are checked - mock_which.return_value = None - VFSProcessManager.exe_path = None - - with patch( - f"{deadline.__package__}.job_attachments.vfs.os.path.exists" - ) as 
mock_path_exists: - mock_path_exists.return_value = False - - with pytest.raises(VFSExecutableMissingError): - process_manager.find_vfs() - - # Verify DEADLINE_VFS_ENV_VAR location is checked - # Verify bin folder is checked as a last resort - mock_path_exists.assert_has_calls( - [ - call(os.environ[DEADLINE_VFS_ENV_VAR] + f"/bin/{DEADLINE_VFS_EXECUTABLE}"), - call(os.path.join(os.getcwd(), f"bin/{DEADLINE_VFS_EXECUTABLE}")), - ] - ) - - def find_vfs_with_env_not_set( - self, - tmp_path: Path, - ): - session_dir: str = str(tmp_path) - dest_dir: str = "assetroot-27bggh78dd2b568ab123" - local_root: str = f"{session_dir}/{dest_dir}" - manifest_path: str = f"{local_root}/manifest.json" - - # Create process manager without CAS prefix - process_manager: VFSProcessManager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path, - mount_point=local_root, - os_user="test-user", - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - - bin_check = os.path.join(os.getcwd(), f"bin/{DEADLINE_VFS_EXECUTABLE}") - - # verify which is only called when class path is not set - with patch( - f"{deadline.__package__}.job_attachments.vfs.shutil.which", - return_value=None, - ) as mock_which, patch( - f"{deadline.__package__}.job_attachments.vfs.os.path.exists", - side_effect=lambda x: True if x == bin_check else False, - ) as mock_path_exists: - test_path: Union[os.PathLike, str] = process_manager.find_vfs() - assert str(test_path) == bin_check - - test_path = process_manager.find_vfs() - mock_which.assert_called_once() - assert mock_path_exists.call_count == 2 - - def test_find_library_path( - self, - tmp_path: Path, - ): - session_dir: str = str(tmp_path) - dest_dir: str = "assetroot-27bggh78dd2b568ab123" - local_root: str = f"{session_dir}/{dest_dir}" - manifest_path: str = f"{local_root}/manifest.json" - os.environ[DEADLINE_VFS_ENV_VAR] = str((Path(__file__) / "deadline_vfs").resolve()) - - # Create process manager without CAS prefix - process_manager: VFSProcessManager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path, - mount_point=local_root, - os_user="test-user", - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - - with patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.find_vfs" - ) as mock_find_vfs: - mock_find_vfs.return_value = "/test/directory/path" - library_path: Union[os.PathLike, str] = process_manager.get_library_path() - - assert str(library_path) == "/test/lib" - - process_manager.get_library_path() - - mock_find_vfs.assert_called_once() - - def test_find_vfs_launch_script_with_env_set( - self, - tmp_path: Path, - ): - session_dir: str = str(tmp_path) - dest_dir: str = "assetroot-27bggh78dd2b568ab123" - local_root: str = f"{session_dir}/{dest_dir}" - manifest_path: str = f"{local_root}/manifest.json" - vfs_test_path = str((Path(__file__) / "deadline_vfs").resolve()) - os.environ[DEADLINE_VFS_ENV_VAR] = vfs_test_path - - # Create process manager without CAS prefix - process_manager: VFSProcessManager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path, - mount_point=local_root, - os_user="test-user", - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - - with patch( - f"{deadline.__package__}.job_attachments.vfs.os.path.exists" - ) as mock_os_path_exists: - mock_os_path_exists.return_value = True - 
deadline_vfs_launch_script_path: Union[os.PathLike, str] = ( - process_manager.find_vfs_launch_script() - ) - assert ( - str(deadline_vfs_launch_script_path) - == vfs_test_path + DEADLINE_VFS_EXECUTABLE_SCRIPT - ) - - process_manager.find_vfs_launch_script() - - mock_os_path_exists.assert_called_once() - - VFSProcessManager.launch_script_path = None - mock_os_path_exists.return_value = False - - with pytest.raises(VFSLaunchScriptMissingError): - process_manager.find_vfs_launch_script() - - def test_find_vfs_launch_script_with_env_not_set( - self, - tmp_path: Path, - ): - session_dir: str = str(tmp_path) - dest_dir: str = "assetroot-27bggh78dd2b568ab123" - local_root: str = f"{session_dir}/{dest_dir}" - manifest_path: str = f"{local_root}/manifest.json" - # Note that env variable not set - if DEADLINE_VFS_ENV_VAR in os.environ: - os.environ.pop(DEADLINE_VFS_ENV_VAR) - - # Create process manager without CAS prefix - process_manager: VFSProcessManager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path, - mount_point=local_root, - os_user="test-user", - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - - with patch( - f"{deadline.__package__}.job_attachments.vfs.os.path.exists" - ) as mock_os_path_exists: - mock_os_path_exists.return_value = True - deadline_vfs_launch_script_path: Union[os.PathLike, str] = ( - process_manager.find_vfs_launch_script() - ) - - # Will return preset vfs install path with exe script path appended since env is not set - assert ( - str(deadline_vfs_launch_script_path) - == DEADLINE_VFS_INSTALL_PATH + DEADLINE_VFS_EXECUTABLE_SCRIPT - ) - - process_manager.find_vfs_launch_script() - mock_os_path_exists.assert_called_once() - - VFSProcessManager.launch_script_path = None - mock_os_path_exists.return_value = False - - with pytest.raises(VFSLaunchScriptMissingError): - process_manager.find_vfs_launch_script() - - def test_create_mount_point( - self, - tmp_path: Path, - ): - session_dir: str = str(tmp_path) - dest_dir: str = "assetroot-27bggh78dd2b568ab123" - local_root: str = f"{session_dir}/{dest_dir}" - manifest_path: str = f"{local_root}/manifest.json" - os.environ[DEADLINE_VFS_ENV_VAR] = str((Path(__file__) / "deadline_vfs").resolve()) - - # Create process manager without CAS prefix - process_manager: VFSProcessManager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path, - mount_point=local_root, - os_user="test-user", - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - - # Verify mount point is created and others have rwx access to it - process_manager.create_mount_point(local_root) - assert os.path.exists(local_root) - assert bool(os.stat(local_root).st_mode & stat.S_IROTH) - assert bool(os.stat(local_root).st_mode & stat.S_IWOTH) - assert bool(os.stat(local_root).st_mode & stat.S_IXOTH) - - def test_pids_recorded_and_killed( - self, - tmp_path: Path, - ): - session_dir: str = str(tmp_path) - dest_dir1: str = "assetroot-27bggh78dd2b568ab123" - local_root1: str = f"{session_dir}/{dest_dir1}" - manifest_path1: str = f"{local_root1}/manifest.json" - dest_dir2: str = "assetroot-27bggh78dd23131d221" - local_root2: str = f"{session_dir}/{dest_dir2}" - manifest_path2: str = f"{local_root2}/manifest.json" - os.environ[DEADLINE_VFS_ENV_VAR] = str((Path(__file__) / "deadline_vfs").resolve()) - test_pid1 = 12345 - test_pid2 = 67890 - test_os_user = "test-user" - # Create process managers - 
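The launch-script tests just above describe a simpler two-stop lookup: <DEADLINE_VFS_ENV_VAR> plus the script suffix when the variable is set, otherwise a preset install path plus the same suffix, with VFSLaunchScriptMissingError when the resulting file does not exist. The class-level launch_script_path cache that the tests reset is left out of this sketch, and all three constants below are assumed placeholder values.

import os

DEADLINE_VFS_ENV_VAR = "DEADLINE_VFS_PATH"                            # placeholder name
DEADLINE_VFS_INSTALL_PATH = "/opt/deadline_vfs"                       # placeholder default install root
DEADLINE_VFS_EXECUTABLE_SCRIPT = "/scripts/deadline_vfs_launch.sh"    # placeholder suffix


class VfsLaunchScriptMissing(RuntimeError):
    """Stand-in for the real VFSLaunchScriptMissingError."""


def find_vfs_launch_script_sketch() -> str:
    """Resolve the launch script from the env-var root, else the default install path."""
    root = os.environ.get(DEADLINE_VFS_ENV_VAR, DEADLINE_VFS_INSTALL_PATH)
    script = root + DEADLINE_VFS_EXECUTABLE_SCRIPT   # plain concatenation, matching the assertions above
    if not os.path.exists(script):
        raise VfsLaunchScriptMissing(script)
    return script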
process_manager1: VFSProcessManager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path1, - mount_point=local_root1, - os_user=test_os_user, - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - process_manager2: VFSProcessManager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path2, - mount_point=local_root2, - os_user=test_os_user, - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - - with patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.find_vfs", - return_value="/test/directory/path", - ), patch( - f"{deadline.__package__}.job_attachments.vfs.subprocess.Popen", - ) as mock_popen, patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.wait_for_mount", - return_value=True, - ), patch( - f"{deadline.__package__}.job_attachments.vfs.os.path.exists", - return_value=True, - ), patch( - f"{deadline.__package__}.job_attachments.vfs.subprocess.run" - ) as mock_subprocess_run, patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.get_launch_environ", - return_value=os.environ, - ): - # start first mock VFS process - mock_subprocess = MagicMock() - mock_subprocess.pid = test_pid1 - mock_popen.return_value = mock_subprocess - process_manager1.start(tmp_path) - - # start second mock VFS process - mock_subprocess.pid = test_pid2 - process_manager2.start(tmp_path) - - # verify the pids were written to the correct location - pid_file_path = (tmp_path / DEADLINE_VFS_PID_FILE_NAME).resolve() - with open(pid_file_path, "r") as pid_file: - pid_file_contents = pid_file.readlines() - assert f"{local_root1}:{test_pid1}:{manifest_path1}\n" in pid_file_contents - assert f"{local_root2}:{test_pid2}:{manifest_path2}\n" in pid_file_contents - - assert os.path.exists(local_root1) - assert os.path.exists(local_root2) - - VFSProcessManager.kill_all_processes(tmp_path, os_user=test_os_user) - # Verify all mounts were killed - mock_subprocess_run.assert_has_calls( - [ - call( - VFSProcessManager.get_shutdown_args(local_root1, test_os_user), check=True - ), - call( - VFSProcessManager.get_shutdown_args(local_root2, test_os_user), check=True - ), - ], - any_order=True, - ) - with pytest.raises(FileNotFoundError): - open(pid_file_path, "r") - - def test_process_output_captured( - self, - tmp_path: Path, - ): - # Test to verify the spawned process output is captured and redirected to log.info - - session_dir: str = str(tmp_path) - dest_dir1: str = "assetroot-27bggh78dd2b568ab123" - local_root1: str = f"{session_dir}/{dest_dir1}" - manifest_path1: str = f"{local_root1}/manifest.json" - os.environ[DEADLINE_VFS_ENV_VAR] = str((Path(__file__) / "deadline_vfs").resolve()) - test_pid1 = 12345 - - # Create process managers - process_manager1: VFSProcessManager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path1, - mount_point=local_root1, - os_user="test-user", - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - - with patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.find_vfs", - return_value="/test/directory/path", - ), patch( - f"{deadline.__package__}.job_attachments.vfs.subprocess.Popen", - ) as mock_popen, patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.wait_for_mount", - return_value=True, - ), patch( - 
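test_pids_recorded_and_killed fixes the on-disk bookkeeping: one "<mount_point>:<pid>:<manifest_path>" line per mount in a pid file under the session directory, and kill_all_processes running a per-mount shutdown command before removing that file, so a later open raises FileNotFoundError. A minimal sketch of that bookkeeping; the file name is a placeholder and the shutdown command below is a dummy stand-in for whatever get_shutdown_args really returns.

import os
import subprocess
from pathlib import Path

VFS_PID_FILE_NAME = "vfs_pids.txt"   # placeholder for the real DEADLINE_VFS_PID_FILE_NAME value


def record_mount_sketch(session_dir: Path, mount_point: str, pid: int, manifest_path: str) -> None:
    """Append one "<mount>:<pid>:<manifest>" record, the format the test reads back."""
    with open(session_dir / VFS_PID_FILE_NAME, "a", encoding="utf-8") as pid_file:
        pid_file.write(f"{mount_point}:{pid}:{manifest_path}\n")


def kill_all_sketch(session_dir: Path, os_user: str) -> None:
    """Shut down every recorded mount, then drop the pid file entirely."""
    pid_file_path = session_dir / VFS_PID_FILE_NAME
    if not pid_file_path.exists():
        return
    with open(pid_file_path, "r", encoding="utf-8") as pid_file:
        for line in pid_file:
            mount_point = line.split(":", 1)[0]
            # Dummy placeholder for the real shutdown command
            # (VFSProcessManager.get_shutdown_args(mount_point, os_user) in the test):
            subprocess.run(["echo", "shutting down", mount_point, "as", os_user], check=True)
    os.remove(pid_file_path)   # later opens raise FileNotFoundError, as asserted above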
f"{deadline.__package__}.job_attachments.vfs.os.path.exists", - return_value=True, - ), patch(f"{deadline.__package__}.job_attachments.vfs.log") as mock_logger, patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.get_launch_environ", - return_value=os.environ, - ): - call_count = 0 - exception_count = 0 - signal = threading.Semaphore(0) - - # Intercept the logging - def mock_log(str): - nonlocal call_count - if str == "a" or str == "b" or str == "c": - call_count += 1 - - def mock_exception(str): - nonlocal exception_count - nonlocal signal - exception_count += 1 - signal.release() - - # Create a series of mock outputs and signal completion at the end - def mock_output(): - yield "a".encode("utf-8") - yield "b".encode("utf-8") - yield "c".encode("utf-8") - yield Exception("Test Exception") - - mock_logger.info = Mock(side_effect=mock_log) - mock_logger.exception = Mock(side_effect=mock_exception) - - mock_subprocess = MagicMock() - mock_subprocess.pid = test_pid1 - mock_subprocess.stdout = mock_output() - mock_popen.return_value = mock_subprocess - - # Start the process and wait for our signal to indicate all the outputs have been read - process_manager1.start(tmp_path) - - # Wait for *up to* 60 seconds at most for the mock outputs to be read by the thread. - # This should never take that long and failing the timeout should indicate something is wrong. - assert signal.acquire(blocking=True, timeout=60) - - # Verify all output was logged - assert call_count == 3 - assert exception_count == 1 - - def test_pids_file_behavior( - self, - tmp_path: Path, - ): - # Test to verify the spawned process output is captured and redirected to log.info - - session_dir: str = str(tmp_path) - dest_dir1: str = "assetroot-27bggh78dd2b568ab123" - local_root1: str = f"{session_dir}/{dest_dir1}" - manifest_path1: str = f"{local_root1}/manifest.json" - dest_dir2: str = "assetroot-27bggh78dd23131d221" - local_root2: str = f"{session_dir}/{dest_dir2}" - manifest_path2: str = f"{local_root2}/manifest.json" - os.environ[DEADLINE_VFS_ENV_VAR] = str((Path(__file__) / "deadline_vfs").resolve()) - test_pid1 = 12345 - test_pid2 = 67890 - test_os_user = "test-user" - - # Create process managers - process_manager1: VFSProcessManager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path1, - mount_point=local_root1, - os_user=test_os_user, - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - process_manager2: VFSProcessManager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path2, - mount_point=local_root2, - os_user=test_os_user, - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - - with patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.find_vfs", - return_value="/test/directory/path", - ), patch( - f"{deadline.__package__}.job_attachments.vfs.subprocess.Popen", - ) as mock_popen, patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.wait_for_mount", - return_value=True, - ), patch( - f"{deadline.__package__}.job_attachments.vfs.os.path.exists", - return_value=True, - ), patch( - f"{deadline.__package__}.job_attachments.vfs.subprocess.run" - ) as mock_subprocess_run, patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.get_launch_environ", - return_value=os.environ, - ), patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.is_mount", - return_value=True, - 
): - # start first mock VFS process - mock_subprocess = MagicMock() - mock_subprocess.pid = test_pid1 - mock_popen.return_value = mock_subprocess - process_manager1.start(tmp_path) - - # Verify only the first processes' pid is written - assert VFSProcessManager.get_manifest_path_for_mount( - session_dir=tmp_path, mount_point=local_root1 - ) == Path(manifest_path1) - assert not VFSProcessManager.get_manifest_path_for_mount( - session_dir=tmp_path, mount_point=local_root2 - ) - - # start second mock VFS process - mock_subprocess.pid = test_pid2 - process_manager2.start(tmp_path) - - # Verify both pids are written - assert VFSProcessManager.get_manifest_path_for_mount( - session_dir=tmp_path, mount_point=local_root1 - ) == Path(manifest_path1) - assert VFSProcessManager.get_manifest_path_for_mount( - session_dir=tmp_path, mount_point=local_root2 - ) == Path(manifest_path2) - - # Verify killing process 1 removes pid entry - assert VFSProcessManager.kill_process_at_mount( - session_dir=tmp_path, mount_point=local_root1, os_user=test_os_user - ) - mock_subprocess_run.assert_called_with( - VFSProcessManager.get_shutdown_args(local_root1, test_os_user), check=True - ) - assert not VFSProcessManager.get_manifest_path_for_mount( - session_dir=tmp_path, mount_point=local_root1 - ) - assert VFSProcessManager.get_manifest_path_for_mount( - session_dir=tmp_path, mount_point=local_root2 - ) == Path(manifest_path2) - - VFSProcessManager.kill_all_processes(tmp_path, os_user=test_os_user) - - mock_subprocess_run.assert_has_calls( - [ - call( - VFSProcessManager.get_shutdown_args(local_root1, test_os_user), check=True - ), - call( - VFSProcessManager.get_shutdown_args(local_root2, test_os_user), check=True - ), - ], - any_order=True, - ) - - def test_manifest_group_set( - self, - tmp_path: Path, - ): - # Test to verify group ownership of the manifest is set properly on startup - - session_dir: str = str(tmp_path) - dest_dir: str = "assetroot-27bggh78dd2b568ab123" - local_root: str = f"{session_dir}/{dest_dir}" - manifest_path: str = f"{local_root}/manifest.json" - os.environ[DEADLINE_VFS_ENV_VAR] = str((Path(__file__) / "deadline_vfs").resolve()) - test_os_user = "test-user" - test_os_group = "test-group" - - # Create process manager - process_manager1: VFSProcessManager = VFSProcessManager( - asset_bucket=self.s3_settings.s3BucketName, - region=os.environ["AWS_DEFAULT_REGION"], - manifest_path=manifest_path, - mount_point=local_root, - os_user=test_os_user, - os_group=test_os_group, - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - - with patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.find_vfs", - return_value="/test/directory/path", - ), patch( - f"{deadline.__package__}.job_attachments.vfs.subprocess.Popen", - ), patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.wait_for_mount", - return_value=True, - ), patch( - f"{deadline.__package__}.job_attachments.vfs.os.path.exists", - return_value=True, - ), patch( - f"{deadline.__package__}.job_attachments.vfs.shutil.chown", - ) as mock_chown, patch( - f"{deadline.__package__}.job_attachments.vfs.os.chmod", - ) as mock_chmod, patch(f"{deadline.__package__}.job_attachments.vfs.subprocess.run"), patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.get_launch_environ", - return_value=os.environ, - ), patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.is_mount", - return_value=True, - ): - process_manager1.start(tmp_path) - - mock_chown.assert_called_with(manifest_path, 
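test_manifest_group_set asserts exactly two calls on startup when an os_group is supplied: shutil.chown(manifest_path, group=...) followed by os.chmod(manifest_path, DEADLINE_MANIFEST_GROUP_READ_PERMS). A small sketch of that hand-off; the permission constant here is an assumed stand-in, since the real value is not shown in this diff.

import os
import shutil
import stat

# Assumed stand-in for DEADLINE_MANIFEST_GROUP_READ_PERMS: owner read/write plus group read.
MANIFEST_GROUP_READ_PERMS = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP


def share_manifest_with_group_sketch(manifest_path: str, os_group: str) -> None:
    """Hand the manifest to the job user's group and restrict it to group-read access."""
    shutil.chown(manifest_path, group=os_group)
    os.chmod(manifest_path, MANIFEST_GROUP_READ_PERMS)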
group=test_os_group) - - mock_chmod.assert_called_with(manifest_path, DEADLINE_MANIFEST_GROUP_READ_PERMS) - - -@pytest.mark.parametrize("vfs_cache_enabled", [True, False]) -def test_launch_environment_has_expected_settings( - tmp_path: Path, - vfs_cache_enabled: bool, -): - # Test to verify when retrieving the launch environment it does not contain os.environ variables (Unless passed in), - # it DOES contain the VFSProcessManager's environment variables, and aws configuration variables aren't modified - session_dir: str = str(tmp_path) - test_mount: str = f"{session_dir}/test_mount" - manifest_path1: str = f"{session_dir}/manifests/some_manifest.json" - os.environ[DEADLINE_VFS_ENV_VAR] = str((Path(__file__) / "deadline_vfs").resolve()) - if vfs_cache_enabled: - os.environ[DEADLINE_VFS_CACHE_ENV_VAR] = "V0" - else: - os.environ.pop(DEADLINE_VFS_CACHE_ENV_VAR, None) - - provided_vars = { - "VFS_ENV_VAR": "test-vfs-env-var", - "AWS_PROFILE": "test-profile", - "AWS_CONFIG_FILE": "test-config", - "AWS_SHARED_CREDENTIALS_FILE": "test-credentials", - } - # Create process managers - process_manager: VFSProcessManager = VFSProcessManager( - asset_bucket="test-bucket", - region="test-region", - manifest_path=manifest_path1, - mount_point=test_mount, - os_user="test-user", - os_env_vars=provided_vars, - ) - - # Provided environment variables are passed through - with patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.find_vfs", - return_value="/test/directory/path", - ): - launch_env = process_manager.get_launch_environ() - - for key, value in provided_vars.items(): - assert launch_env.get(key) == value - - if vfs_cache_enabled: - assert launch_env.get(DEADLINE_VFS_CACHE_ENV_VAR) == "V0" - else: - assert launch_env.get(DEADLINE_VFS_CACHE_ENV_VAR) is None - - # Base environment variables are not passed through - assert not launch_env.get(DEADLINE_VFS_ENV_VAR) - - -def test_vfs_launched_in_session_folder( - tmp_path: Path, -): - # Test to verify the cwd of launched vfs is the session folder - - session_dir: str = str(tmp_path) - dest_dir: str = "assetroot-cwdtest" - local_root: str = f"{session_dir}/{dest_dir}" - manifest_path: str = f"{local_root}/manifest.json" - os.environ[DEADLINE_VFS_ENV_VAR] = str((Path(__file__) / "deadline_vfs").resolve()) - - # Create process manager - process_manager: VFSProcessManager = VFSProcessManager( - asset_bucket="test-bucket", - region="test-region", - manifest_path=manifest_path, - mount_point=local_root, - os_user="test-user", - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - - with patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.find_vfs", - return_value="/test/directory/path", - ), patch( - f"{deadline.__package__}.job_attachments.vfs.subprocess.Popen", - ) as mock_popen, patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.wait_for_mount", - return_value=True, - ), patch( - f"{deadline.__package__}.job_attachments.vfs.os.path.exists", - return_value=True, - ), patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.get_launch_environ", - return_value=os.environ, - ): - process_manager.start(tmp_path) - - launch_command = process_manager.build_launch_command(mount_point=local_root) - launch_env = process_manager.get_launch_environ() - - mock_popen.assert_called_once_with( - args=launch_command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - cwd=session_dir, # Was the session folder used as cwd - env=launch_env, - shell=True, - executable="/bin/bash", - ) - - -def 
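The parametrized test above describes how the launch environment is assembled: the variables passed in via os_env_vars are present, DEADLINE_VFS_CACHE_ENV_VAR is forwarded only when the parent process has it set, and the rest of os.environ (including DEADLINE_VFS_ENV_VAR) is not inherited. A sketch under those constraints, with a placeholder variable name.

import os
from typing import Dict

DEADLINE_VFS_CACHE_ENV_VAR = "DEADLINE_VFS_CACHE"   # placeholder name, for illustration only


def get_launch_environ_sketch(os_env_vars: Dict[str, str]) -> Dict[str, str]:
    """Build a child environment from the provided variables, not from os.environ wholesale."""
    launch_env: Dict[str, str] = dict(os_env_vars)
    cache_setting = os.environ.get(DEADLINE_VFS_CACHE_ENV_VAR)
    if cache_setting is not None:              # forwarded only when set in the parent
        launch_env[DEADLINE_VFS_CACHE_ENV_VAR] = cache_setting
    return launch_env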
test_vfs_has_expected_logs_folder( - tmp_path: Path, -): - # Test to verify the expected logs folder is returned - - session_dir: str = str(tmp_path) - dest_dir: str = "assetroot-logsdirtest" - local_root: str = f"{session_dir}/{dest_dir}" - manifest_path: str = f"{local_root}/manifest.json" - os.environ[DEADLINE_VFS_ENV_VAR] = str((Path(__file__) / "deadline_vfs").resolve()) - expected_logs_folder = tmp_path / VFS_LOGS_FOLDER_IN_SESSION - - # Create process manager - process_manager: VFSProcessManager = VFSProcessManager( - asset_bucket="test-bucket", - region="test-region", - manifest_path=manifest_path, - mount_point=local_root, - os_user="test-user", - os_env_vars={"AWS_PROFILE": "test-profile"}, - ) - - assert VFSProcessManager.logs_folder_path(tmp_path) == expected_logs_folder - - with patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.find_vfs", - return_value="/test/directory/path", - ), patch( - f"{deadline.__package__}.job_attachments.vfs.subprocess.Popen", - ), patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.wait_for_mount", - return_value=True, - ), patch( - f"{deadline.__package__}.job_attachments.vfs.os.path.exists", - return_value=True, - ), patch( - f"{deadline.__package__}.job_attachments.vfs.VFSProcessManager.get_launch_environ", - return_value=os.environ, - ): - process_manager.start(tmp_path) - - assert process_manager.get_logs_folder() == expected_logs_folder diff --git a/testing_containers/localuser_sudo_environment/Dockerfile b/testing_containers/localuser_sudo_environment/Dockerfile deleted file mode 100644 index 828821245..000000000 --- a/testing_containers/localuser_sudo_environment/Dockerfile +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -FROM python:3.13-bullseye - -# Set environment variables, and let our tests know that we are in an -# environment that can run the sudo tests -ENV DEADLINE_JOB_ATTACHMENT_TEST_SUDO_TARGET_USER=targetuser -ENV DEADLINE_JOB_ATTACHMENT_TEST_SUDO_TARGET_GROUP=targetgroup -ENV DEADLINE_JOB_ATTACHMENT_TEST_SUDO_DISJOINT_USER=disjointuser -ENV DEADLINE_JOB_ATTACHMENT_TEST_SUDO_DISJOINT_GROUP=disjointgroup - -# Use a docker volume to mount the root of the repo to this directory -WORKDIR /code - -# We set up three users for our tests: -# 1) hostuser -- the user that will be running the pytests. -# 2) targetuser -- the user assumed to be running the job in the tests. -# 3) disjointuser -- a user to be used in cross-account testing. 
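test_vfs_launched_in_session_folder pins the exact Popen call: the built command string runs through /bin/bash with shell=True, stdout and stderr share one pipe, and the session folder, the same directory test_vfs_has_expected_logs_folder expects the logs folder to live under, is the working directory. Restated as a small helper; a sketch, not the module's code.

import subprocess
from pathlib import Path
from typing import Dict


def launch_vfs_sketch(launch_command: str, launch_env: Dict[str, str], session_dir: Path) -> subprocess.Popen:
    """Start the VFS process the way the test asserts Popen is called."""
    return subprocess.Popen(
        args=launch_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,      # stderr merged into the same pipe the output reader drains
        cwd=session_dir,               # the session folder, not the mount point, is the cwd
        env=launch_env,
        shell=True,
        executable="/bin/bash",
    )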
-# These accounts belong to the following groups: -# hostuser: hostuser, targetgroup -# targetuser: targetuser, targetgroup -# disjointuser: disjointuser, disjointgroup -RUN apt-get update && apt-get install sudo && \ - rm -rf /var/lib/apt/lists/* && \ - addgroup ${DEADLINE_JOB_ATTACHMENT_TEST_SUDO_TARGET_GROUP} && \ - useradd -ms /bin/bash -G ${DEADLINE_JOB_ATTACHMENT_TEST_SUDO_TARGET_GROUP} ${DEADLINE_JOB_ATTACHMENT_TEST_SUDO_TARGET_USER} && \ - useradd -ms /bin/bash -G ${DEADLINE_JOB_ATTACHMENT_TEST_SUDO_TARGET_GROUP} hostuser && \ - echo "hostuser ALL=(${DEADLINE_JOB_ATTACHMENT_TEST_SUDO_TARGET_USER},hostuser) NOPASSWD: ALL" > /etc/sudoers.d/hostuser && \ - addgroup ${DEADLINE_JOB_ATTACHMENT_TEST_SUDO_DISJOINT_GROUP} && \ - useradd -ms /bin/bash -G ${DEADLINE_JOB_ATTACHMENT_TEST_SUDO_DISJOINT_GROUP} ${DEADLINE_JOB_ATTACHMENT_TEST_SUDO_DISJOINT_USER} && \ - chmod 777 /code - -WORKDIR /home/hostuser - -COPY --chown=hostuser:hostuser run_tests.sh /home/hostuser/ - -USER hostuser - -CMD ["/bin/sh", "-c", "./run_tests.sh"] diff --git a/testing_containers/localuser_sudo_environment/README.md b/testing_containers/localuser_sudo_environment/README.md deleted file mode 100644 index 539597dc6..000000000 --- a/testing_containers/localuser_sudo_environment/README.md +++ /dev/null @@ -1,6 +0,0 @@ -## Docker Environment for Testing File and Directory Permissions - -This Docker environment is set up to test code related to modifying OS group ownership and permissions of files and directories. It adds two different OS groups (and a user in each group,) allowing us to manipulate system-level settings of files and directories. When the container starts, it executes the test script `run_tests.sh`, which only runs unit tests marked with `docker`. - -### Usage -To use this Docker environment and run the related tests, navigate to the root of this repository and run the `./scripts/run_sudo_tests.sh` script. diff --git a/testing_containers/localuser_sudo_environment/run_tests.sh b/testing_containers/localuser_sudo_environment/run_tests.sh deleted file mode 100755 index 6332087f2..000000000 --- a/testing_containers/localuser_sudo_environment/run_tests.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - -set -eux - -mkdir -p /home/hostuser/code/ -cp -r /code/* /home/hostuser/code/ -cp -r /code/.git /home/hostuser/code/ - -cd code -python -m venv .venv -source .venv/bin/activate -pip install hatch -hatch run pytest --cov=src/deadline --cov-report=html:build/coverage --cov-report=xml:build/coverage/coverage.xml --cov-report=term-missing --cov-fail-under=25 -m docker
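Per the deleted README above and the -m docker flag in run_tests.sh, the removed container only collected tests carrying the docker marker. A hypothetical example of how a test opted into that environment:

import pytest


@pytest.mark.docker
def test_requires_sudo_capable_container():
    # Collected only when pytest runs with "-m docker", e.g. inside the removed container.
    assert True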