diff --git a/cirro/cli/__init__.py b/cirro/cli/__init__.py index cd7004a..28f0cad 100644 --- a/cirro/cli/__init__.py +++ b/cirro/cli/__init__.py @@ -1,5 +1,6 @@ from cirro.cli.controller import run_ingest, run_download, run_configure, run_list_datasets from cirro.cli.controller import run_create_pipeline_config, run_validate_folder +from cirro.cli.controller import run_list_projects, run_list_files __all__ = [ 'run_ingest', @@ -7,5 +8,7 @@ 'run_configure', 'run_list_datasets', 'run_create_pipeline_config', - 'run_validate_folder' + 'run_validate_folder', + 'run_list_projects', + 'run_list_files', ] diff --git a/cirro/cli/cli.py b/cirro/cli/cli.py index 91cc6f7..f6bf6c6 100644 --- a/cirro/cli/cli.py +++ b/cirro/cli/cli.py @@ -6,7 +6,7 @@ from cirro.cli import run_create_pipeline_config, run_validate_folder from cirro.cli import run_ingest, run_download, run_configure, run_list_datasets -from cirro.cli.controller import handle_error, run_upload_reference +from cirro.cli.controller import handle_error, run_upload_reference, run_list_projects, run_list_files from cirro.cli.interactive.utils import InputError @@ -25,6 +25,27 @@ def run(): pass # Print out help text, nothing to do +@run.command(help='List projects') +def list_projects(): + run_list_projects() + + +@run.command(help='List files in a dataset', no_args_is_help=True) +@click.option('--project', + help='Name or ID of the project') +@click.option('--dataset', + help='Name or ID of the dataset') +@click.option('--file-limit', + help='Maximum number of files to list', + default=100000, show_default=True) +@click.option('-i', '--interactive', + help='Gather arguments interactively', + is_flag=True, default=False) +def list_files(**kwargs): + check_required_args(kwargs) + run_list_files(kwargs, interactive=kwargs.get('interactive')) + + @run.command(help='List datasets', no_args_is_help=True) @click.option('--project', help='Name or ID of the project') @@ -47,6 +68,9 @@ def list_datasets(**kwargs): multiple=True) @click.option('--data-directory', help='Directory to store the files') +@click.option('--file-limit', + help='Maximum number of files to enumerate from the dataset', + default=100000, show_default=True) @click.option('-i', '--interactive', help='Gather arguments interactively', is_flag=True, default=False) @@ -89,6 +113,9 @@ def upload(**kwargs): help='Name or ID of the project') @click.option('--data-directory', help='Local directory you wish to validate') +@click.option('--file-limit', + help='Maximum number of files to enumerate from the dataset', + default=100000, show_default=True) @click.option('-i', '--interactive', help='Gather arguments interactively', is_flag=True, default=False) diff --git a/cirro/cli/controller.py b/cirro/cli/controller.py index ef55e20..ab05611 100644 --- a/cirro/cli/controller.py +++ b/cirro/cli/controller.py @@ -17,7 +17,7 @@ from cirro.cli.interactive.utils import get_id_from_name, get_item_from_name_or_id, InputError, validate_files from cirro.cli.interactive.validate_args import gather_validate_arguments, gather_validate_arguments_dataset from cirro.cli.models import ListArguments, UploadArguments, DownloadArguments, CreatePipelineConfigArguments, \ - UploadReferenceArguments, ValidateArguments + UploadReferenceArguments, ValidateArguments, ListFilesArguments from cirro.config import UserConfig, save_user_config, load_user_config from cirro.file_utils import get_files_in_directory from cirro.models.process import PipelineDefinition, ConfigAppStatus, CONFIG_APP_URL @@ -119,7 +119,10 @@ def run_validate_folder(input_params: ValidateArguments, interactive=False): # Filter out datasets that are not complete datasets = [d for d in datasets if d.status == Status.COMPLETED] input_params = gather_validate_arguments_dataset(input_params, datasets) - files = cirro.datasets.get_assets_listing(input_params['project'], input_params['dataset']).files + files = cirro.datasets.get_assets_listing( + input_params['project'], input_params['dataset'], + file_limit=input_params['file_limit'] + ).files if len(files) == 0: raise InputError('There are no files in this dataset to validate against') @@ -137,7 +140,8 @@ def run_validate_folder(input_params: ValidateArguments, interactive=False): results = cirro.datasets.validate_folder( project_id=project_id, dataset_id=dataset_id, - local_folder=input_params['data_directory'] + local_folder=input_params['data_directory'], + file_limit=input_params['file_limit'] ) for file_list, label, log_level in [ @@ -165,7 +169,10 @@ def run_download(input_params: DownloadArguments, interactive=False): # Filter out datasets that are not complete datasets = [d for d in datasets if d.status == Status.COMPLETED] input_params = gather_download_arguments_dataset(input_params, datasets) - files = cirro.datasets.get_assets_listing(input_params['project'], input_params['dataset']).files + files = cirro.datasets.get_assets_listing( + input_params['project'], input_params['dataset'], + file_limit=input_params['file_limit'] + ).files if len(files) == 0: raise InputError('There are no files in this dataset to download') @@ -180,7 +187,9 @@ def run_download(input_params: DownloadArguments, interactive=False): dataset_id = get_id_from_name(datasets, input_params['dataset']) if input_params['file']: - all_files = cirro.datasets.get_assets_listing(project_id, dataset_id).files + all_files = cirro.datasets.get_assets_listing( + project_id, dataset_id, file_limit=input_params['file_limit'] + ).files files_to_download = [] for filepath in input_params['file']: @@ -198,7 +207,48 @@ def run_download(input_params: DownloadArguments, interactive=False): cirro.datasets.download_files(project_id=project_id, dataset_id=dataset_id, download_location=input_params['data_directory'], - files=files_to_download) + files=files_to_download, + file_limit=input_params['file_limit']) + + +def run_list_projects(): + """List all available projects.""" + cirro = _init_cirro_client() + projects = _get_projects(cirro) + + import pandas as pd + df = pd.DataFrame([{'id': p.id, 'name': p.name} for p in projects]) + print(df.to_string(index=False)) + + +def run_list_files(input_params: ListFilesArguments, interactive=False): + """List files available in a dataset.""" + cirro = _init_cirro_client() + projects = _get_projects(cirro) + + if interactive: + from cirro.cli.interactive.common_args import ask_project, ask_dataset + from cirro.services.service_helpers import list_all_datasets + project_name = ask_project(projects, input_params.get('project')) + project_id = get_id_from_name(projects, project_name) + datasets = list_all_datasets(project_id=project_id, client=cirro) + dataset_id = ask_dataset(datasets, input_params.get('dataset'), msg_action='list files for') + else: + project_id = get_id_from_name(projects, input_params['project']) + datasets = cirro.datasets.list(project_id) + dataset_id = get_id_from_name(datasets, input_params['dataset']) + + files = cirro.datasets.get_assets_listing( + project_id, dataset_id, file_limit=input_params['file_limit'] + ).files + + if len(files) == 0: + logger.info("No files found in this dataset") + return + + import pandas as pd + df = pd.DataFrame([{'path': f.normalized_path, 'size': f.size} for f in files]) + print(df.to_string(index=False)) def run_upload_reference(input_params: UploadReferenceArguments, interactive=False): diff --git a/cirro/cli/models.py b/cirro/cli/models.py index 3fd701c..1acc469 100644 --- a/cirro/cli/models.py +++ b/cirro/cli/models.py @@ -7,6 +7,7 @@ class DownloadArguments(TypedDict): data_directory: str interactive: bool file: Optional[list[str]] + file_limit: int class UploadArguments(TypedDict): @@ -25,6 +26,7 @@ class ValidateArguments(TypedDict): project: str data_directory: str interactive: bool + file_limit: int class ListArguments(TypedDict): @@ -45,3 +47,10 @@ class UploadReferenceArguments(TypedDict): project: str reference_file: list[str] interactive: bool + + +class ListFilesArguments(TypedDict): + project: str + dataset: str + interactive: bool + file_limit: int diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index 205a14d..c4a7a6a 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -276,11 +276,20 @@ def get_file(self, relative_path: str) -> DataPortalFile: msg = '\n'.join([f"No file found with path '{relative_path}'."]) raise DataPortalAssetNotFound(msg) - def list_files(self) -> DataPortalFiles: + def list_files(self, file_limit: int = 100000) -> DataPortalFiles: """ Return the list of files which make up the dataset. + + Args: + file_limit (int): Maximum number of files to return (default 100,000) """ - files = self._get_assets().files + assets = self._client.datasets.get_assets_listing( + project_id=self.project_id, + dataset_id=self.id, + file_limit=file_limit + ) + files = assets.files + return DataPortalFiles( [ DataPortalFile(file=file, client=self._client) diff --git a/cirro/services/dataset.py b/cirro/services/dataset.py index 3d58063..12094e5 100644 --- a/cirro/services/dataset.py +++ b/cirro/services/dataset.py @@ -7,7 +7,6 @@ from cirro_api_client.v1.api.sharing import get_shared_datasets from cirro_api_client.v1.models import ImportDataRequest, UploadDatasetRequest, UpdateDatasetRequest, Dataset, \ DatasetDetail, CreateResponse, UploadDatasetCreateResponse, FileEntry - from cirro.file_utils import is_hidden_file from cirro.models.assets import DatasetAssets, Artifact from cirro.models.dataset import DatasetValidationResponse @@ -309,12 +308,13 @@ def validate_folder( self, project_id: str, dataset_id: str, - local_folder: PathLike + local_folder: PathLike, + file_limit: int = 100000 ) -> DatasetValidationResponse: """ Validates that the contents of a dataset match that of a local folder. """ - ds_files = self.get_assets_listing(project_id, dataset_id).files + ds_files = self.get_assets_listing(project_id, dataset_id, file_limit=file_limit).files local_folder = Path(local_folder) if not local_folder.is_dir(): @@ -367,7 +367,8 @@ def download_files( project_id: str, dataset_id: str, download_location: str, - files: Union[List[File], List[str]] = None + files: Union[List[File], List[str]] = None, + file_limit: int = 100000 ) -> None: """ Downloads files from a dataset @@ -380,9 +381,10 @@ def download_files( dataset_id (str): ID of the Dataset download_location (str): Local destination for downloaded files files (typing.List[str]): Optional list of files to download + file_limit (int): Maximum number of files to get (default 100,000) """ if files is None: - files = self.get_assets_listing(project_id, dataset_id).files + files = self.get_assets_listing(project_id, dataset_id, file_limit=file_limit).files if len(files) == 0: return diff --git a/pyproject.toml b/pyproject.toml index 37ac663..1093b99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "cirro" -version = "1.10.4" +version = "1.11.0" description = "CLI tool and SDK for interacting with the Cirro platform" authors = ["Cirro Bio "] license = "MIT"