From e23674b7874bf58e03d31f0adb395b2a240cb1eb Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Wed, 1 Apr 2026 11:36:33 -0700 Subject: [PATCH 1/7] Add list-projects and list-files --- cirro/cli/__init__.py | 5 ++++- cirro/cli/cli.py | 21 +++++++++++++++++++- cirro/cli/controller.py | 41 ++++++++++++++++++++++++++++++++++++++- cirro/cli/models.py | 6 ++++++ cirro/services/dataset.py | 15 ++++++++++++++ pyproject.toml | 2 +- 6 files changed, 86 insertions(+), 4 deletions(-) diff --git a/cirro/cli/__init__.py b/cirro/cli/__init__.py index cd7004a..28f0cad 100644 --- a/cirro/cli/__init__.py +++ b/cirro/cli/__init__.py @@ -1,5 +1,6 @@ from cirro.cli.controller import run_ingest, run_download, run_configure, run_list_datasets from cirro.cli.controller import run_create_pipeline_config, run_validate_folder +from cirro.cli.controller import run_list_projects, run_list_files __all__ = [ 'run_ingest', @@ -7,5 +8,7 @@ 'run_configure', 'run_list_datasets', 'run_create_pipeline_config', - 'run_validate_folder' + 'run_validate_folder', + 'run_list_projects', + 'run_list_files', ] diff --git a/cirro/cli/cli.py b/cirro/cli/cli.py index 91cc6f7..68cb004 100644 --- a/cirro/cli/cli.py +++ b/cirro/cli/cli.py @@ -6,7 +6,7 @@ from cirro.cli import run_create_pipeline_config, run_validate_folder from cirro.cli import run_ingest, run_download, run_configure, run_list_datasets -from cirro.cli.controller import handle_error, run_upload_reference +from cirro.cli.controller import handle_error, run_upload_reference, run_list_projects, run_list_files from cirro.cli.interactive.utils import InputError @@ -25,6 +25,25 @@ def run(): pass # Print out help text, nothing to do +@run.command(help='List projects') +def list_projects(): + run_list_projects() + + +@run.command(help='List files in a dataset', no_args_is_help=True) +@click.option('--project', + help='Name or ID of the project') +@click.option('--dataset', + help='Name or ID of the dataset') +@click.option('-i', '--interactive', + help='Gather arguments interactively', + is_flag=True, default=False) +def list_files(**kwargs): + check_required_args(kwargs) + run_list_files(kwargs, interactive=kwargs.get('interactive')) + + + @run.command(help='List datasets', no_args_is_help=True) @click.option('--project', help='Name or ID of the project') diff --git a/cirro/cli/controller.py b/cirro/cli/controller.py index ef55e20..a26c5be 100644 --- a/cirro/cli/controller.py +++ b/cirro/cli/controller.py @@ -17,7 +17,7 @@ from cirro.cli.interactive.utils import get_id_from_name, get_item_from_name_or_id, InputError, validate_files from cirro.cli.interactive.validate_args import gather_validate_arguments, gather_validate_arguments_dataset from cirro.cli.models import ListArguments, UploadArguments, DownloadArguments, CreatePipelineConfigArguments, \ - UploadReferenceArguments, ValidateArguments + UploadReferenceArguments, ValidateArguments, ListFilesArguments from cirro.config import UserConfig, save_user_config, load_user_config from cirro.file_utils import get_files_in_directory from cirro.models.process import PipelineDefinition, ConfigAppStatus, CONFIG_APP_URL @@ -201,6 +201,45 @@ def run_download(input_params: DownloadArguments, interactive=False): files=files_to_download) +def run_list_projects(): + """List all available projects.""" + cirro = _init_cirro_client() + projects = _get_projects(cirro) + + import pandas as pd + df = pd.DataFrame([{'id': p.id, 'name': p.name} for p in projects]) + print(df.to_string(index=False)) + + +def run_list_files(input_params: ListFilesArguments, interactive=False): + """List files available in a dataset.""" + cirro = _init_cirro_client() + projects = _get_projects(cirro) + + if interactive: + from cirro.cli.interactive.common_args import ask_project, ask_dataset + from cirro.services.service_helpers import list_all_datasets + project_name = ask_project(projects, input_params.get('project')) + project_id = get_id_from_name(projects, project_name) + datasets = list_all_datasets(project_id=project_id, client=cirro) + dataset_id = ask_dataset(datasets, input_params.get('dataset'), msg_action='list files for') + else: + project_id = get_id_from_name(projects, input_params['project']) + datasets = cirro.datasets.list(project_id) + dataset_id = get_id_from_name(datasets, input_params['dataset']) + + files = cirro.datasets.get_assets_listing(project_id, dataset_id).files + + if len(files) == 0: + logger.info("No files found in this dataset") + return + + import pandas as pd + df = pd.DataFrame([{'path': f.normalized_path, 'size': f.size} for f in files]) + print(df.to_string(index=False)) + + + def run_upload_reference(input_params: UploadReferenceArguments, interactive=False): cirro = _init_cirro_client() projects = _get_projects(cirro) diff --git a/cirro/cli/models.py b/cirro/cli/models.py index 3fd701c..aefe696 100644 --- a/cirro/cli/models.py +++ b/cirro/cli/models.py @@ -45,3 +45,9 @@ class UploadReferenceArguments(TypedDict): project: str reference_file: list[str] interactive: bool + + +class ListFilesArguments(TypedDict): + project: str + dataset: str + interactive: bool diff --git a/cirro/services/dataset.py b/cirro/services/dataset.py index 3d58063..85f3530 100644 --- a/cirro/services/dataset.py +++ b/cirro/services/dataset.py @@ -1,4 +1,5 @@ import logging +from collections.abc import Mapping as _Mapping from pathlib import Path from typing import List, Optional, Union, Dict @@ -7,6 +8,20 @@ from cirro_api_client.v1.api.sharing import get_shared_datasets from cirro_api_client.v1.models import ImportDataRequest, UploadDatasetRequest, UpdateDatasetRequest, Dataset, \ DatasetDetail, CreateResponse, UploadDatasetCreateResponse, FileEntry +from cirro_api_client.v1.models.dataset_viz_config import DatasetVizConfig as _DatasetVizConfig + +# Patch DatasetVizConfig.from_dict to handle the case where the API returns a string +# (a path to the config file) instead of a config dict. +_original_dviz_from_dict = _DatasetVizConfig.from_dict.__func__ + + +def _safe_dviz_from_dict(cls, src_dict): + if not isinstance(src_dict, _Mapping): + return cls() + return _original_dviz_from_dict(cls, src_dict) + + +_DatasetVizConfig.from_dict = classmethod(_safe_dviz_from_dict) from cirro.file_utils import is_hidden_file from cirro.models.assets import DatasetAssets, Artifact diff --git a/pyproject.toml b/pyproject.toml index 37ac663..1093b99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "cirro" -version = "1.10.4" +version = "1.11.0" description = "CLI tool and SDK for interacting with the Cirro platform" authors = ["Cirro Bio "] license = "MIT" From 402c19176d120aba51b0ec4791093b634899c5f6 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Wed, 1 Apr 2026 11:40:31 -0700 Subject: [PATCH 2/7] Resolve flake8 errors --- cirro/services/dataset.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cirro/services/dataset.py b/cirro/services/dataset.py index 85f3530..a515fb0 100644 --- a/cirro/services/dataset.py +++ b/cirro/services/dataset.py @@ -10,6 +10,13 @@ DatasetDetail, CreateResponse, UploadDatasetCreateResponse, FileEntry from cirro_api_client.v1.models.dataset_viz_config import DatasetVizConfig as _DatasetVizConfig +from cirro.file_utils import is_hidden_file +from cirro.models.assets import DatasetAssets, Artifact +from cirro.models.dataset import DatasetValidationResponse +from cirro.models.file import FileAccessContext, File, PathLike +from cirro.services.base import get_all_records +from cirro.services.file import FileEnabledService + # Patch DatasetVizConfig.from_dict to handle the case where the API returns a string # (a path to the config file) instead of a config dict. _original_dviz_from_dict = _DatasetVizConfig.from_dict.__func__ @@ -23,13 +30,6 @@ def _safe_dviz_from_dict(cls, src_dict): _DatasetVizConfig.from_dict = classmethod(_safe_dviz_from_dict) -from cirro.file_utils import is_hidden_file -from cirro.models.assets import DatasetAssets, Artifact -from cirro.models.dataset import DatasetValidationResponse -from cirro.models.file import FileAccessContext, File, PathLike -from cirro.services.base import get_all_records -from cirro.services.file import FileEnabledService - logger = logging.getLogger() From edeb4b9cb3a4ff354a5b15547079eda3e078e29a Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Wed, 1 Apr 2026 11:40:40 -0700 Subject: [PATCH 3/7] Drop blank lines --- cirro/cli/cli.py | 1 - cirro/cli/controller.py | 1 - 2 files changed, 2 deletions(-) diff --git a/cirro/cli/cli.py b/cirro/cli/cli.py index 68cb004..a60bfce 100644 --- a/cirro/cli/cli.py +++ b/cirro/cli/cli.py @@ -43,7 +43,6 @@ def list_files(**kwargs): run_list_files(kwargs, interactive=kwargs.get('interactive')) - @run.command(help='List datasets', no_args_is_help=True) @click.option('--project', help='Name or ID of the project') diff --git a/cirro/cli/controller.py b/cirro/cli/controller.py index a26c5be..0b29e65 100644 --- a/cirro/cli/controller.py +++ b/cirro/cli/controller.py @@ -239,7 +239,6 @@ def run_list_files(input_params: ListFilesArguments, interactive=False): print(df.to_string(index=False)) - def run_upload_reference(input_params: UploadReferenceArguments, interactive=False): cirro = _init_cirro_client() projects = _get_projects(cirro) From e77d2cc22c983b1017547c7790321fb2daaf54a0 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Wed, 1 Apr 2026 13:28:06 -0700 Subject: [PATCH 4/7] Let the user increase the file limit --- cirro/cli/cli.py | 9 +++++++++ cirro/cli/controller.py | 24 ++++++++++++++++++------ cirro/cli/models.py | 3 +++ cirro/sdk/dataset.py | 15 +++++++++++++-- cirro/services/dataset.py | 11 +++++++---- 5 files changed, 50 insertions(+), 12 deletions(-) diff --git a/cirro/cli/cli.py b/cirro/cli/cli.py index a60bfce..f6bf6c6 100644 --- a/cirro/cli/cli.py +++ b/cirro/cli/cli.py @@ -35,6 +35,9 @@ def list_projects(): help='Name or ID of the project') @click.option('--dataset', help='Name or ID of the dataset') +@click.option('--file-limit', + help='Maximum number of files to list', + default=100000, show_default=True) @click.option('-i', '--interactive', help='Gather arguments interactively', is_flag=True, default=False) @@ -65,6 +68,9 @@ def list_datasets(**kwargs): multiple=True) @click.option('--data-directory', help='Directory to store the files') +@click.option('--file-limit', + help='Maximum number of files to enumerate from the dataset', + default=100000, show_default=True) @click.option('-i', '--interactive', help='Gather arguments interactively', is_flag=True, default=False) @@ -107,6 +113,9 @@ def upload(**kwargs): help='Name or ID of the project') @click.option('--data-directory', help='Local directory you wish to validate') +@click.option('--file-limit', + help='Maximum number of files to enumerate from the dataset', + default=100000, show_default=True) @click.option('-i', '--interactive', help='Gather arguments interactively', is_flag=True, default=False) diff --git a/cirro/cli/controller.py b/cirro/cli/controller.py index 0b29e65..ab05611 100644 --- a/cirro/cli/controller.py +++ b/cirro/cli/controller.py @@ -119,7 +119,10 @@ def run_validate_folder(input_params: ValidateArguments, interactive=False): # Filter out datasets that are not complete datasets = [d for d in datasets if d.status == Status.COMPLETED] input_params = gather_validate_arguments_dataset(input_params, datasets) - files = cirro.datasets.get_assets_listing(input_params['project'], input_params['dataset']).files + files = cirro.datasets.get_assets_listing( + input_params['project'], input_params['dataset'], + file_limit=input_params['file_limit'] + ).files if len(files) == 0: raise InputError('There are no files in this dataset to validate against') @@ -137,7 +140,8 @@ def run_validate_folder(input_params: ValidateArguments, interactive=False): results = cirro.datasets.validate_folder( project_id=project_id, dataset_id=dataset_id, - local_folder=input_params['data_directory'] + local_folder=input_params['data_directory'], + file_limit=input_params['file_limit'] ) for file_list, label, log_level in [ @@ -165,7 +169,10 @@ def run_download(input_params: DownloadArguments, interactive=False): # Filter out datasets that are not complete datasets = [d for d in datasets if d.status == Status.COMPLETED] input_params = gather_download_arguments_dataset(input_params, datasets) - files = cirro.datasets.get_assets_listing(input_params['project'], input_params['dataset']).files + files = cirro.datasets.get_assets_listing( + input_params['project'], input_params['dataset'], + file_limit=input_params['file_limit'] + ).files if len(files) == 0: raise InputError('There are no files in this dataset to download') @@ -180,7 +187,9 @@ def run_download(input_params: DownloadArguments, interactive=False): dataset_id = get_id_from_name(datasets, input_params['dataset']) if input_params['file']: - all_files = cirro.datasets.get_assets_listing(project_id, dataset_id).files + all_files = cirro.datasets.get_assets_listing( + project_id, dataset_id, file_limit=input_params['file_limit'] + ).files files_to_download = [] for filepath in input_params['file']: @@ -198,7 +207,8 @@ def run_download(input_params: DownloadArguments, interactive=False): cirro.datasets.download_files(project_id=project_id, dataset_id=dataset_id, download_location=input_params['data_directory'], - files=files_to_download) + files=files_to_download, + file_limit=input_params['file_limit']) def run_list_projects(): @@ -228,7 +238,9 @@ def run_list_files(input_params: ListFilesArguments, interactive=False): datasets = cirro.datasets.list(project_id) dataset_id = get_id_from_name(datasets, input_params['dataset']) - files = cirro.datasets.get_assets_listing(project_id, dataset_id).files + files = cirro.datasets.get_assets_listing( + project_id, dataset_id, file_limit=input_params['file_limit'] + ).files if len(files) == 0: logger.info("No files found in this dataset") diff --git a/cirro/cli/models.py b/cirro/cli/models.py index aefe696..1acc469 100644 --- a/cirro/cli/models.py +++ b/cirro/cli/models.py @@ -7,6 +7,7 @@ class DownloadArguments(TypedDict): data_directory: str interactive: bool file: Optional[list[str]] + file_limit: int class UploadArguments(TypedDict): @@ -25,6 +26,7 @@ class ValidateArguments(TypedDict): project: str data_directory: str interactive: bool + file_limit: int class ListArguments(TypedDict): @@ -51,3 +53,4 @@ class ListFilesArguments(TypedDict): project: str dataset: str interactive: bool + file_limit: int diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index 205a14d..567e97f 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -276,11 +276,22 @@ def get_file(self, relative_path: str) -> DataPortalFile: msg = '\n'.join([f"No file found with path '{relative_path}'."]) raise DataPortalAssetNotFound(msg) - def list_files(self) -> DataPortalFiles: + def list_files(self, file_limit: int = 100000) -> DataPortalFiles: """ Return the list of files which make up the dataset. + + Args: + file_limit (int): Maximum number of files to return (default 100,000) """ - files = self._get_assets().files + if file_limit != 100000: + assets = self._client.datasets.get_assets_listing( + project_id=self.project_id, + dataset_id=self.id, + file_limit=file_limit + ) + files = assets.files + else: + files = self._get_assets().files return DataPortalFiles( [ DataPortalFile(file=file, client=self._client) diff --git a/cirro/services/dataset.py b/cirro/services/dataset.py index a515fb0..cd4ab0c 100644 --- a/cirro/services/dataset.py +++ b/cirro/services/dataset.py @@ -324,12 +324,13 @@ def validate_folder( self, project_id: str, dataset_id: str, - local_folder: PathLike + local_folder: PathLike, + file_limit: int = 100000 ) -> DatasetValidationResponse: """ Validates that the contents of a dataset match that of a local folder. """ - ds_files = self.get_assets_listing(project_id, dataset_id).files + ds_files = self.get_assets_listing(project_id, dataset_id, file_limit=file_limit).files local_folder = Path(local_folder) if not local_folder.is_dir(): @@ -382,7 +383,8 @@ def download_files( project_id: str, dataset_id: str, download_location: str, - files: Union[List[File], List[str]] = None + files: Union[List[File], List[str]] = None, + file_limit: int = 100000 ) -> None: """ Downloads files from a dataset @@ -395,9 +397,10 @@ def download_files( dataset_id (str): ID of the Dataset download_location (str): Local destination for downloaded files files (typing.List[str]): Optional list of files to download + file_limit (int): Maximum number of files to get (default 100,000) """ if files is None: - files = self.get_assets_listing(project_id, dataset_id).files + files = self.get_assets_listing(project_id, dataset_id, file_limit=file_limit).files if len(files) == 0: return From f843b8f5ff7dd669ee47312b99955ae38dc04a31 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Wed, 1 Apr 2026 13:28:42 -0700 Subject: [PATCH 5/7] Remove the bugfix --- cirro/services/dataset.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/cirro/services/dataset.py b/cirro/services/dataset.py index cd4ab0c..820706b 100644 --- a/cirro/services/dataset.py +++ b/cirro/services/dataset.py @@ -17,19 +17,6 @@ from cirro.services.base import get_all_records from cirro.services.file import FileEnabledService -# Patch DatasetVizConfig.from_dict to handle the case where the API returns a string -# (a path to the config file) instead of a config dict. -_original_dviz_from_dict = _DatasetVizConfig.from_dict.__func__ - - -def _safe_dviz_from_dict(cls, src_dict): - if not isinstance(src_dict, _Mapping): - return cls() - return _original_dviz_from_dict(cls, src_dict) - - -_DatasetVizConfig.from_dict = classmethod(_safe_dviz_from_dict) - logger = logging.getLogger() From 9dc20f7866e90c94fb97acaa70c2148f002aa593 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Wed, 1 Apr 2026 13:32:00 -0700 Subject: [PATCH 6/7] Also remove the imports --- cirro/services/dataset.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cirro/services/dataset.py b/cirro/services/dataset.py index 820706b..12094e5 100644 --- a/cirro/services/dataset.py +++ b/cirro/services/dataset.py @@ -1,5 +1,4 @@ import logging -from collections.abc import Mapping as _Mapping from pathlib import Path from typing import List, Optional, Union, Dict @@ -8,8 +7,6 @@ from cirro_api_client.v1.api.sharing import get_shared_datasets from cirro_api_client.v1.models import ImportDataRequest, UploadDatasetRequest, UpdateDatasetRequest, Dataset, \ DatasetDetail, CreateResponse, UploadDatasetCreateResponse, FileEntry -from cirro_api_client.v1.models.dataset_viz_config import DatasetVizConfig as _DatasetVizConfig - from cirro.file_utils import is_hidden_file from cirro.models.assets import DatasetAssets, Artifact from cirro.models.dataset import DatasetValidationResponse From bb6f29aec543398a0be994c4fda747909fefd86c Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Wed, 1 Apr 2026 15:13:49 -0700 Subject: [PATCH 7/7] Simplify logic --- cirro/sdk/dataset.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cirro/sdk/dataset.py b/cirro/sdk/dataset.py index 567e97f..c4a7a6a 100644 --- a/cirro/sdk/dataset.py +++ b/cirro/sdk/dataset.py @@ -283,15 +283,13 @@ def list_files(self, file_limit: int = 100000) -> DataPortalFiles: Args: file_limit (int): Maximum number of files to return (default 100,000) """ - if file_limit != 100000: - assets = self._client.datasets.get_assets_listing( - project_id=self.project_id, - dataset_id=self.id, - file_limit=file_limit - ) - files = assets.files - else: - files = self._get_assets().files + assets = self._client.datasets.get_assets_listing( + project_id=self.project_id, + dataset_id=self.id, + file_limit=file_limit + ) + files = assets.files + return DataPortalFiles( [ DataPortalFile(file=file, client=self._client)