diff --git a/cosmotech/coal/cosmotech_api/apis/dataset.py b/cosmotech/coal/cosmotech_api/apis/dataset.py index a137b2f4..680f0bca 100644 --- a/cosmotech/coal/cosmotech_api/apis/dataset.py +++ b/cosmotech/coal/cosmotech_api/apis/dataset.py @@ -79,7 +79,7 @@ def _download_part(self, dataset_id, dataset_part, destination): ) @staticmethod - def path_to_parts(_path, part_type) -> list[tuple[str, Path, DatasetPartTypeEnum]]: + def path_to_parts(_path, part_type) -> list[tuple[str, Path, DatasetPartTypeEnum]]: if (_path := Path(_path)).is_dir(): return list((str(_p.relative_to(_path)), _p, part_type) for _p in _path.rglob("*") if _p.is_file()) return list(((_path.name, _path, part_type),)) @@ -118,7 +118,7 @@ def upload_dataset( additional_data=additional_data, parts=list( DatasetPartCreateRequest( - name=_p_name, + name=Path(_p_name).stem, description=_p_name, sourceName=_p_name, type=_type, @@ -195,7 +195,7 @@ def upload_dataset_parts( # Create new part part_request = DatasetPartCreateRequest( - name=_p_name, + name=Path(_p_name).stem, description=_p_name, sourceName=_p_name, type=_type, diff --git a/cosmotech/coal/postgresql/runner.py b/cosmotech/coal/postgresql/runner.py index 34497bcd..9236f3e6 100644 --- a/cosmotech/coal/postgresql/runner.py +++ b/cosmotech/coal/postgresql/runner.py @@ -49,19 +49,25 @@ def send_runner_metadata_to_postgresql( CREATE TABLE IF NOT EXISTS {schema_table} ( id varchar(32) PRIMARY KEY, name varchar(256), - last_csm_run_id varchar(32), + last_csm_run_id varchar(32) UNIQUE, run_template_id varchar(32) ); """ LOGGER.info(T("coal.services.postgresql.creating_table").format(schema_table=schema_table)) curs.execute(sql_create_table) conn.commit() + + runner_id = runner.get("id") + sql_delete_from_metatable = f""" + DELETE FROM {schema_table} + WHERE id= $1; + """ + curs.execute(sql_delete_from_metatable, (runner_id,)) + conn.commit() + + sql_upsert = f""" INSERT INTO {schema_table} (id, name, last_csm_run_id, run_template_id) - VALUES ($1, 
$2, $3, $4) - ON CONFLICT (id) - DO - UPDATE SET name = EXCLUDED.name, last_csm_run_id = EXCLUDED.last_csm_run_id; + VALUES ($1, $2, $3, $4) """ LOGGER.debug(runner) curs.execute( diff --git a/cosmotech/coal/postgresql/store.py b/cosmotech/coal/postgresql/store.py index 8db8fb72..95d6c422 100644 --- a/cosmotech/coal/postgresql/store.py +++ b/cosmotech/coal/postgresql/store.py @@ -115,7 +115,7 @@ def dump_store_to_postgresql_from_conf( ) if fk_id and _psql.is_metadata_exists(): metadata_table = f"{_psql.metadata_table_name}" - _psql.add_fk_constraint(table_name, "csm_run_id", metadata_table, "last_csm_run_id") + _psql.add_fk_constraint(target_table_name, "csm_run_id", metadata_table, "last_csm_run_id") total_rows += rows _up_time = perf_counter() diff --git a/cosmotech/coal/postgresql/utils.py b/cosmotech/coal/postgresql/utils.py index 4d863608..a95e8d60 100644 --- a/cosmotech/coal/postgresql/utils.py +++ b/cosmotech/coal/postgresql/utils.py @@ -155,13 +155,20 @@ def add_fk_constraint( to_table: str, to_col: str, ) -> None: - # Connect to PostgreSQL and remove runner metadata row - with dbapi.connect(self.full_uri, autocommit=True) as conn: + # Connect to PostgreSQL and add a foreign key constraint + with dbapi.connect(self.full_uri, autocommit=False) as conn: with conn.cursor() as curs: + sql_drop_fk = f""" + ALTER TABLE {self.db_schema}.{from_table} + DROP CONSTRAINT IF EXISTS metadata; + """ sql_add_fk = f""" ALTER TABLE {self.db_schema}.{from_table} - CONSTRAINT metadata FOREIGN KEY ({from_col}) REFERENCES {to_table}({to_col}) + ADD CONSTRAINT metadata FOREIGN KEY ({from_col}) + REFERENCES {self.db_schema}.{to_table}({to_col}) + ON DELETE CASCADE; """ + curs.execute(sql_drop_fk) curs.execute(sql_add_fk) conn.commit() diff --git a/cosmotech/coal/store/parquet.py b/cosmotech/coal/store/parquet.py new file mode 100644 index 00000000..76f1283d --- /dev/null +++ b/cosmotech/coal/store/parquet.py @@ -0,0 +1,45 @@ +# Copyright (C) - 2023 - 2025 - Cosmo Tech +# This 
document and all information contained herein is the exclusive property - +# including all intellectual property rights pertaining thereto - of Cosmo Tech. +# Any use, reproduction, translation, broadcasting, transmission, distribution, +# etc., to any person is prohibited unless it has been previously and +# specifically authorized by written means by Cosmo Tech. + +import pathlib + +import pyarrow as pa +import pyarrow.parquet as pq + +from cosmotech.coal.store.store import Store + + +def store_parquet_file( + table_name: str, + parquet_path: pathlib.Path, + replace_existing_file: bool = False, + store=Store(), +): + if not parquet_path.exists(): + raise FileNotFoundError(f"File {parquet_path} does not exist") + + data: pa.Table = pq.ParquetFile(parquet_path).read() + _c = data.column_names + data = data.rename_columns([Store.sanitize_column(_column) for _column in _c]) + + store.add_table(table_name=table_name, data=data, replace=replace_existing_file) + + +def convert_store_table_to_parquet( + table_name: str, + parquet_path: pathlib.Path, + replace_existing_file: bool = False, + store=Store(), +): + if parquet_path.name.endswith(".parquet") and parquet_path.exists() and not replace_existing_file: + raise FileExistsError(f"File {parquet_path} already exists") + if not parquet_path.name.endswith(".parquet"): + parquet_path = parquet_path / f"{table_name}.parquet" + folder = parquet_path.parent + folder.mkdir(parents=True, exist_ok=True) + + pq.write_table(store.get_table(table_name), parquet_path) diff --git a/cosmotech/coal/utils/configuration.py b/cosmotech/coal/utils/configuration.py index 55a6d400..3bd88761 100644 --- a/cosmotech/coal/utils/configuration.py +++ b/cosmotech/coal/utils/configuration.py @@ -76,6 +76,8 @@ class Configuration(Dotdict): "api": {"url": "CSM_API_URL", "scope": "CSM_API_SCOPE"}, "dataset_absolute_path": "CSM_DATASET_ABSOLUTE_PATH", "parameters_absolute_path": "CSM_PARAMETERS_ABSOLUTE_PATH", + "output_absolute_path": 
"CSM_OUTPUT_ABSOLUTE_PATH", + "tmp_absolute_path": "CSM_TEMP_ABSOLUTE_PATH", "organization_id": "CSM_ORGANIZATION_ID", "workspace_id": "CSM_WORKSPACE_ID", "runner_id": "CSM_RUNNER_ID", diff --git a/cosmotech/coal/utils/input_collector.py b/cosmotech/coal/utils/input_collector.py new file mode 100644 index 00000000..25684960 --- /dev/null +++ b/cosmotech/coal/utils/input_collector.py @@ -0,0 +1,121 @@ +import json +import os +from pathlib import Path + +from cosmotech.coal.utils.configuration import ENVIRONMENT_CONFIGURATION as EC +from cosmotech.coal.utils.logger import LOGGER + + +class InputCollector: + def __init__(self): + self.dataset_collector = DatasetCollector() + self.parameter_collector = ParameterCollector() + self.workspace_collector = WorkspaceCollector() + + def fetch_dataset(self, dataset_name: str) -> Path: + return self.dataset_collector.fetch(dataset_name) + + def fetch_parameter(self, param_name: str) -> Path: + return self.parameter_collector.fetch(param_name) + + def fetch_workspace_file(self, file_name: str) -> Path: + return self.workspace_collector.fetch(file_name) + + def fetch(self, name: str) -> Path: + try: + return self.fetch_parameter(name) + except (KeyError, FileNotFoundError): + LOGGER.debug(f"Parameter {name} not found, trying workspace files.") + try: + return self.fetch_workspace_file(name) + except FileNotFoundError: + LOGGER.debug(f"Workspace file {name} not found, trying dataset files.") + return self.fetch_dataset(name) + + +class DatasetCollector: + def __init__(self): + self.paths: dict[str, Path] = {} + + def collect(self): + for dataset_id in os.listdir(EC.cosmotech.dataset_absolute_path): + for r, d, f in os.walk(Path(EC.cosmotech.dataset_absolute_path) / dataset_id): + for dataset_name in f: + path = Path(r) / dataset_name + self.paths[dataset_name] = path + self.paths[path.stem] = path + + def fetch(self, dataset_name: str) -> Path: + # lazy collection to avoid unnecessary os.walk calls + if not self.paths: + 
self.collect() + if dataset_name in self.paths: + return self.paths[dataset_name] + raise FileNotFoundError(f"File for {dataset_name} not found in {EC.cosmotech.dataset_absolute_path}.") + + +class WorkspaceCollector: + def __init__(self): + self.paths: dict[str, Path] = {} + + def collect(self): + workspace_path = Path(EC.cosmotech.dataset_absolute_path) / "workspace_files" + if workspace_path.exists(): + for r, d, f in os.walk(workspace_path): + for file_name in f: + path = Path(r) / file_name + self.paths[file_name] = path + self.paths[path.stem] = path + + def fetch(self, file_name: str) -> Path: + if not self.paths: + self.collect() + if file_name in self.paths: + return self.paths[file_name] + raise FileNotFoundError(f"File {file_name} not found in workspace_files.") + + +class ParameterCollector: + def __init__(self): + self.paths: dict[str, Path] = {} + self.parameters: dict[str, str] = {} + + def read_parameters_json(self): + parameter_file = Path(EC.cosmotech.parameters_absolute_path) / "parameters.json" + if parameter_file.exists(): + with open(parameter_file) as f: + parameters = json.load(f) + for parameter in parameters: + self.parameters[parameter["parameterId"]] = parameter["value"] + + def collect(self): + for dataset_id in os.listdir(EC.cosmotech.parameters_absolute_path): + for r, d, f in os.walk(Path(EC.cosmotech.parameters_absolute_path) / dataset_id): + for file_name in f: + path = Path(r) / file_name + param_name = path.parent.name + self.paths[param_name] = path + self.paths[path.stem] = path + + def fetch_parameter(self, param_name: str) -> str: + # lazy collection to avoid unnecessary json loading + if not self.parameters: + self.read_parameters_json() + return self.parameters[param_name] + + def fetch_file_path(self, param_name: str) -> Path: + # lazy collection to avoid unnecessary os.walk calls + if not self.paths: + self.collect() + if param_name in self.paths: + return self.paths[param_name] + raise FileNotFoundError(f"File for 
{param_name} not found in {EC.cosmotech.parameters_absolute_path}.") + + def fetch(self, param_name: str) -> Path: + try: + return self.fetch_parameter(param_name) + except KeyError: + return self.fetch_file_path(param_name) + + +ENVIRONMENT_INPUT_COLLECTOR = InputCollector() diff --git a/cosmotech/csm_data/commands/api/wsf_load_file.py b/cosmotech/csm_data/commands/api/wsf_load_file.py index 1f84d1e1..2f26eed0 100644 --- a/cosmotech/csm_data/commands/api/wsf_load_file.py +++ b/cosmotech/csm_data/commands/api/wsf_load_file.py @@ -35,7 +35,7 @@ "--workspace-path", help=T("csm_data.commands.api.wsf_load_file.parameters.workspace_path"), metavar="PATH", - default="/", + default="", type=str, ) @click.option( diff --git a/cosmotech/csm_data/commands/store/load_parquet_folder.py b/cosmotech/csm_data/commands/store/load_parquet_folder.py new file mode 100644 index 00000000..08cb19ab --- /dev/null +++ b/cosmotech/csm_data/commands/store/load_parquet_folder.py @@ -0,0 +1,49 @@ +# Copyright (C) - 2023 - 2025 - Cosmo Tech +# This document and all information contained herein is the exclusive property - +# including all intellectual property rights pertaining thereto - of Cosmo Tech. +# Any use, reproduction, translation, broadcasting, transmission, distribution, +# etc., to any person is prohibited unless it has been previously and +# specifically authorized by written means by Cosmo Tech. 
+from cosmotech.orchestrator.utils.translate import T + +from cosmotech.csm_data.utils.click import click +from cosmotech.csm_data.utils.decorators import translate_help, web_help + + +@click.command() +@web_help("csm-data/store/load-parquet-folder") +@translate_help("csm_data.commands.store.load_parquet_folder.description") +@click.option( + "--store-folder", + envvar="CSM_PARAMETERS_ABSOLUTE_PATH", + help=T("csm_data.commands.store.load_parquet_folder.parameters.store_folder"), + metavar="PATH", + type=str, + show_envvar=True, + required=True, +) +@click.option( + "--parquet-folder", + envvar="CSM_OUTPUT_ABSOLUTE_PATH", + help=T("csm_data.commands.store.load_parquet_folder.parameters.parquet_folder"), + metavar="PATH", + type=str, + show_envvar=True, + required=True, +) +def load_parquet_folder(store_folder, parquet_folder): + # Import the modules and functions at the start of the command + import pathlib + + from cosmotech.coal.store.parquet import store_parquet_file + from cosmotech.coal.store.store import Store + from cosmotech.coal.utils.configuration import Configuration + from cosmotech.coal.utils.logger import LOGGER + + _conf = Configuration() + + _conf.coal.store = store_folder + + for parquet_path in pathlib.Path(parquet_folder).glob("*.parquet"): + LOGGER.info(T("coal.services.azure_storage.found_file").format(file=parquet_path.name)) + store_parquet_file(parquet_path.name[:-8], parquet_path, store=Store(False, _conf)) diff --git a/cosmotech/csm_data/commands/store/store.py b/cosmotech/csm_data/commands/store/store.py index 47be8957..e17b2623 100644 --- a/cosmotech/csm_data/commands/store/store.py +++ b/cosmotech/csm_data/commands/store/store.py @@ -14,6 +14,7 @@ from cosmotech.csm_data.commands.store.load_from_singlestore import ( load_from_singlestore_command, ) +from cosmotech.csm_data.commands.store.load_parquet_folder import load_parquet_folder from cosmotech.csm_data.commands.store.output import output from cosmotech.csm_data.commands.store.reset 
import reset from cosmotech.csm_data.utils.click import click @@ -30,6 +31,7 @@ def store(): store.add_command(reset, "reset") store.add_command(list_tables, "list-tables") store.add_command(load_csv_folder, "load-csv-folder") +store.add_command(load_parquet_folder, "load-parquet-folder") store.add_command(load_from_singlestore_command, "load-from-singlestore") store.add_command(dump_to_postgresql, "dump-to-postgresql") store.add_command(dump_to_s3, "dump-to-s3") diff --git a/cosmotech/translation/csm_data/en-US/csm_data/commands/store/load_parquet_folder.yml b/cosmotech/translation/csm_data/en-US/csm_data/commands/store/load_parquet_folder.yml new file mode 100644 index 00000000..4e869074 --- /dev/null +++ b/cosmotech/translation/csm_data/en-US/csm_data/commands/store/load_parquet_folder.yml @@ -0,0 +1,5 @@ +description: | + Running this command will find all parquet files in the given folder and put them in the store +parameters: + store_folder: The folder containing the store files + parquet_folder: The folder containing the parquet files to store diff --git a/tests/unit/coal/test_cosmotech_api/test_apis/test_dataset.py b/tests/unit/coal/test_cosmotech_api/test_apis/test_dataset.py index 8ac14f48..9d493e16 100644 --- a/tests/unit/coal/test_cosmotech_api/test_apis/test_dataset.py +++ b/tests/unit/coal/test_cosmotech_api/test_apis/test_dataset.py @@ -543,13 +543,13 @@ def test_update_dataset_mixed_files(self, mock_cosmotech_config, mock_api_client assert len(args_list) == 2 # check first call used to create csv part dpcr = args_list[0].kwargs.get("dataset_part_create_request") - assert dpcr.name == "data.csv" + assert dpcr.name == "data" assert dpcr.source_name == "data.csv" assert dpcr.description == "data.csv" assert dpcr.type == DatasetPartTypeEnum.FILE # check second call used to create db part dpcr = args_list[1].kwargs.get("dataset_part_create_request") - assert dpcr.name == "data.db" + assert dpcr.name == "data" assert dpcr.source_name == "data.db" assert 
dpcr.description == "data.db" assert dpcr.type == DatasetPartTypeEnum.DB diff --git a/tests/unit/coal/test_postgresql/test_postgresql_runner.py b/tests/unit/coal/test_postgresql/test_postgresql_runner.py index 870a1f89..33c3f4a8 100644 --- a/tests/unit/coal/test_postgresql/test_postgresql_runner.py +++ b/tests/unit/coal/test_postgresql/test_postgresql_runner.py @@ -72,14 +72,19 @@ def test_send_runner_metadata_to_postgresql(self, mock_connect, mock_postgres_ut mock_connect.assert_called_once_with("postgresql://user:password@localhost:5432/testdb", autocommit=True) # Check that SQL statements were executed - assert mock_cursor.execute.call_count == 2 + assert mock_cursor.execute.call_count == 3 # Verify the SQL statements (partially, since the exact SQL is complex) create_table_call = mock_cursor.execute.call_args_list[0] assert "CREATE TABLE IF NOT EXISTS" in create_table_call[0][0] assert "public.test_runnermetadata" in create_table_call[0][0] - upsert_call = mock_cursor.execute.call_args_list[1] + delete_call = mock_cursor.execute.call_args_list[1] + assert "DELETE FROM" in delete_call[0][0] + assert "public.test_runnermetadata" in delete_call[0][0] + assert delete_call[0][1] == ("test-runner-id",) + + upsert_call = mock_cursor.execute.call_args_list[2] assert "INSERT INTO" in upsert_call[0][0] assert "public.test_runnermetadata" in upsert_call[0][0] assert upsert_call[0][1] == ( @@ -90,7 +95,7 @@ def test_send_runner_metadata_to_postgresql(self, mock_connect, mock_postgres_ut ) # Check that commits were called - assert mock_conn.commit.call_count == 2 + assert mock_conn.commit.call_count == 3 # Verify the function returns the lastRunId assert result == "test-run-id" diff --git a/tests/unit/coal/test_store/test_store_parquet.py b/tests/unit/coal/test_store/test_store_parquet.py new file mode 100644 index 00000000..e003fd5b --- /dev/null +++ b/tests/unit/coal/test_store/test_store_parquet.py @@ -0,0 +1,200 @@ +# Copyright (C) - 2023 - 2025 - Cosmo Tech +# This 
document and all information contained herein is the exclusive property - +# including all intellectual property rights pertaining thereto - of Cosmo Tech. +# Any use, reproduction, translation, broadcasting, transmission, distribution, +# etc., to any person is prohibited unless it has been previously and +# specifically authorized by written means by Cosmo Tech. + +import pathlib +from unittest.mock import MagicMock, patch + +import pyarrow as pa +import pytest + +from cosmotech.coal.store.parquet import ( + convert_store_table_to_parquet, + store_parquet_file, +) +from cosmotech.coal.store.store import Store + + +class TestParquetFunctions: + """Tests for top-level functions in the parquet module.""" + + @patch("pyarrow.parquet.ParquetFile") + @patch("pathlib.Path.exists") + def test_store_parquet_file_success(self, mock_exists, mock_parquet_file): + """Test the store_parquet_file function with a valid Parquet file.""" + # Arrange + table_name = "test_table" + parquet_path = pathlib.Path("/path/to/test.parquet") + mock_exists.return_value = True + + # Mock Parquet data + mock_data = pa.Table.from_arrays([pa.array([1, 2, 3]), pa.array(["a", "b", "c"])], names=["id", "name"]) + mock_parquet_file.return_value.read.return_value = mock_data + + # Mock store + mock_store = MagicMock(spec=Store) + + # Act + store_parquet_file(table_name, parquet_path, False, mock_store) + + # Assert + mock_exists.assert_called_once_with() + mock_parquet_file.assert_called_once_with(parquet_path) + mock_parquet_file.return_value.read.assert_called_once_with() + mock_store.add_table.assert_called_once() + args, kwargs = mock_store.add_table.call_args + assert kwargs["table_name"] == table_name + assert kwargs["replace"] is False + + @patch("pathlib.Path.exists") + def test_store_parquet_file_not_found(self, mock_exists): + """Test the store_parquet_file function with a non-existent Parquet file.""" + # Arrange + table_name = "test_table" + parquet_path = 
pathlib.Path("/path/to/nonexistent.parquet") + mock_exists.return_value = False + + # Mock store + mock_store = MagicMock(spec=Store) + + # Act & Assert + with pytest.raises(FileNotFoundError): + store_parquet_file(table_name, parquet_path, False, mock_store) + + mock_exists.assert_called_once_with() + + @patch("pyarrow.parquet.ParquetFile") + @patch("pathlib.Path.exists") + def test_store_parquet_file_with_column_sanitization(self, mock_exists, mock_parquet_file): + """Test the store_parquet_file function with column sanitization.""" + # Arrange + table_name = "test_table" + parquet_path = pathlib.Path("/path/to/test.parquet") + mock_exists.return_value = True + + # Mock Parquet data with columns that need sanitization + mock_data = pa.Table.from_arrays( + [pa.array([1, 2, 3]), pa.array(["a", "b", "c"])], names=["id with space", "name-with-dash"] + ) + mock_parquet_file.return_value.read.return_value = mock_data + + # Mock store and sanitize_column + mock_store = MagicMock(spec=Store) + Store.sanitize_column = MagicMock(side_effect=lambda x: x.replace(" ", "_")) + + # Act + store_parquet_file(table_name, parquet_path, False, mock_store) + + # Assert + assert Store.sanitize_column.call_count == 2 + Store.sanitize_column.assert_any_call("id with space") + Store.sanitize_column.assert_any_call("name-with-dash") + mock_store.add_table.assert_called_once() + + @patch("pyarrow.parquet.ParquetFile") + @patch("pathlib.Path.exists") + def test_store_parquet_file_with_replace(self, mock_exists, mock_parquet_file): + """Test the store_parquet_file function with replace_existing_file=True.""" + # Arrange + table_name = "test_table" + parquet_path = pathlib.Path("/path/to/test.parquet") + mock_exists.return_value = True + + mock_data = pa.Table.from_arrays([pa.array([1, 2, 3])], names=["id"]) + mock_parquet_file.return_value.read.return_value = mock_data + + mock_store = MagicMock(spec=Store) + + # Act + store_parquet_file(table_name, parquet_path, True, mock_store) + + # 
Assert + args, kwargs = mock_store.add_table.call_args + assert kwargs["replace"] is True + + @patch("pyarrow.parquet.write_table") + @patch("pathlib.Path.exists") + def test_convert_store_table_to_parquet_success(self, mock_exists, mock_write_table): + """Test the convert_store_table_to_parquet function with a valid table.""" + # Arrange + table_name = "test_table" + parquet_path = pathlib.Path("/path/to/output.parquet") + mock_exists.return_value = False + + mock_store = MagicMock(spec=Store) + mock_table = pa.Table.from_arrays([pa.array([1, 2, 3]), pa.array(["a", "b", "c"])], names=["id", "name"]) + mock_store.get_table.return_value = mock_table + + with patch.object(pathlib.Path, "mkdir") as mock_mkdir: + # Act + convert_store_table_to_parquet(table_name, parquet_path, False, mock_store) + + # Assert + mock_store.get_table.assert_called_once_with(table_name) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + mock_write_table.assert_called_once_with(mock_table, parquet_path) + + @patch("pathlib.Path.exists") + def test_convert_store_table_to_parquet_file_exists(self, mock_exists): + """Test the convert_store_table_to_parquet function when the output file already exists.""" + # Arrange + table_name = "test_table" + parquet_path = pathlib.Path("/path/to/output.parquet") + mock_exists.return_value = True + + mock_store = MagicMock(spec=Store) + + # Act & Assert + with pytest.raises(FileExistsError): + convert_store_table_to_parquet(table_name, parquet_path, False, mock_store) + + mock_exists.assert_called_once_with() + mock_store.get_table.assert_not_called() + + @patch("pyarrow.parquet.write_table") + @patch("pathlib.Path.exists") + def test_convert_store_table_to_parquet_replace_existing(self, mock_exists, mock_write_table): + """Test the convert_store_table_to_parquet function with replace_existing_file=True.""" + # Arrange + table_name = "test_table" + parquet_path = pathlib.Path("/path/to/output.parquet") + mock_exists.return_value = True + + 
mock_store = MagicMock(spec=Store) + mock_table = pa.Table.from_arrays([pa.array([1, 2, 3]), pa.array(["a", "b", "c"])], names=["id", "name"]) + mock_store.get_table.return_value = mock_table + + with patch.object(pathlib.Path, "mkdir") as mock_mkdir: + # Act + convert_store_table_to_parquet(table_name, parquet_path, True, mock_store) + + # Assert + mock_store.get_table.assert_called_once_with(table_name) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + mock_write_table.assert_called_once_with(mock_table, parquet_path) + + @patch("pyarrow.parquet.write_table") + @patch("pathlib.Path.exists") + def test_convert_store_table_to_parquet_directory_path(self, mock_exists, mock_write_table): + """Test the convert_store_table_to_parquet function with a directory path.""" + # Arrange + table_name = "test_table" + parquet_path = pathlib.Path("/path/to/directory") # Not ending with .parquet + mock_exists.return_value = False + + mock_store = MagicMock(spec=Store) + mock_table = pa.Table.from_arrays([pa.array([1, 2, 3]), pa.array(["a", "b", "c"])], names=["id", "name"]) + mock_store.get_table.return_value = mock_table + + with patch.object(pathlib.Path, "mkdir") as mock_mkdir: + # Act + convert_store_table_to_parquet(table_name, parquet_path, False, mock_store) + + # Assert + mock_store.get_table.assert_called_once_with(table_name) + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + expected_path = parquet_path / f"{table_name}.parquet" + mock_write_table.assert_called_once_with(mock_table, expected_path) diff --git a/tests/unit/coal/test_utils/test_utils_input_collector.py b/tests/unit/coal/test_utils/test_utils_input_collector.py new file mode 100644 index 00000000..12116c9e --- /dev/null +++ b/tests/unit/coal/test_utils/test_utils_input_collector.py @@ -0,0 +1,243 @@ +# Copyright (C) - 2023 - 2025 - Cosmo Tech +# This document and all information contained herein is the exclusive property - +# including all intellectual property rights 
pertaining thereto - of Cosmo Tech. +# Any use, reproduction, translation, broadcasting, transmission, distribution, +# etc., to any person is prohibited unless it has been previously and +# specifically authorized by written means by Cosmo Tech. + +import json +from unittest.mock import MagicMock, patch + +import pytest + +from cosmotech.coal.utils.input_collector import ( + DatasetCollector, + InputCollector, + ParameterCollector, +) + + +@pytest.fixture +def mock_ec(tmp_path): + """Fixture that patches EC with a MagicMock whose dataset and parameters paths + both point to tmp_path. Override the attributes in individual tests as needed.""" + ec = MagicMock() + ec.cosmotech.dataset_absolute_path = str(tmp_path) + ec.cosmotech.parameters_absolute_path = str(tmp_path) + with patch("cosmotech.coal.utils.input_collector.EC", ec): + yield ec + + +class TestDatasetCollector: + def test_fetch_existing_file(self, tmp_path, mock_ec): + dataset_dir = tmp_path / "ds1" + dataset_dir.mkdir() + file = dataset_dir / "mydata.csv" + file.write_text("a,b\n1,2") + + mock_ec.cosmotech.dataset_absolute_path = str(tmp_path) + + collector = DatasetCollector() + result = collector.fetch("mydata.csv") + + assert result == file + + def test_fetch_triggers_lazy_collection(self, tmp_path, mock_ec): + dataset_dir = tmp_path / "ds1" + dataset_dir.mkdir() + (dataset_dir / "file.csv").write_text("") + + mock_ec.cosmotech.dataset_absolute_path = str(tmp_path) + + collector = DatasetCollector() + assert collector.paths == {} + collector.fetch("file.csv") + assert "file.csv" in collector.paths + + def test_fetch_missing_file_raises(self, tmp_path, mock_ec): + dataset_dir = tmp_path / "ds1" + dataset_dir.mkdir() + + mock_ec.cosmotech.dataset_absolute_path = str(tmp_path) + + collector = DatasetCollector() + with pytest.raises(FileNotFoundError): + collector.fetch("nonexistent.csv") + + def test_collect_indexes_nested_files(self, tmp_path, mock_ec): + sub = tmp_path / "ds1" / "sub" + 
sub.mkdir(parents=True) + f = sub / "nested.csv" + f.write_text("") + + mock_ec.cosmotech.dataset_absolute_path = str(tmp_path) + + collector = DatasetCollector() + collector.collect() + + assert "nested.csv" in collector.paths + assert collector.paths["nested.csv"] == f + + +class TestParameterCollector: + def test_init_starts_with_empty_dicts(self, tmp_path, mock_ec): + """parameters.json is no longer read at init — loading is now lazy.""" + params = [{"parameterId": "alpha", "value": "42"}] + (tmp_path / "parameters.json").write_text(json.dumps(params)) + + collector = ParameterCollector() + + assert collector.parameters == {} + assert collector.paths == {} + + def test_read_parameters_json_populates_parameters(self, tmp_path, mock_ec): + params = [{"parameterId": "alpha", "value": "42"}, {"parameterId": "beta", "value": "hello"}] + (tmp_path / "parameters.json").write_text(json.dumps(params)) + + collector = ParameterCollector() + collector.read_parameters_json() + + assert collector.parameters["alpha"] == "42" + assert collector.parameters["beta"] == "hello" + + def test_read_parameters_json_no_file_keeps_empty(self, tmp_path, mock_ec): + collector = ParameterCollector() + collector.read_parameters_json() + + assert collector.parameters == {} + + def test_fetch_parameter_triggers_lazy_load(self, tmp_path, mock_ec): + params = [{"parameterId": "myparam", "value": "myvalue"}] + (tmp_path / "parameters.json").write_text(json.dumps(params)) + + collector = ParameterCollector() + assert collector.parameters == {} + + result = collector.fetch_parameter("myparam") + + assert result == "myvalue" + assert "myparam" in collector.parameters + + def test_fetch_parameter_does_not_reload_if_already_loaded(self, tmp_path, mock_ec): + params = [{"parameterId": "key", "value": "first"}] + (tmp_path / "parameters.json").write_text(json.dumps(params)) + + collector = ParameterCollector() + collector.read_parameters_json() + # Mutate to confirm no second load overwrites it + 
collector.parameters["key"] = "modified" + + result = collector.fetch_parameter("key") + + assert result == "modified" + + def test_fetch_parameter_raises_key_error_for_unknown(self, tmp_path, mock_ec): + (tmp_path / "parameters.json").write_text(json.dumps([])) + + collector = ParameterCollector() + with pytest.raises(KeyError): + collector.fetch_parameter("nonexistent") + + def test_fetch_file_path_returns_file(self, tmp_path, mock_ec): + param_dir = tmp_path / "myparam" + param_dir.mkdir() + f = param_dir / "data.csv" + f.write_text("") + + collector = ParameterCollector() + result = collector.fetch_file_path("myparam") + + assert result == f + + def test_fetch_file_path_missing_raises(self, tmp_path, mock_ec): + collector = ParameterCollector() + with pytest.raises(FileNotFoundError): + collector.fetch_file_path("nonexistent") + + def test_fetch_returns_value_if_already_in_parameters(self, tmp_path, mock_ec): + """fetch() checks self.parameters directly — no lazy load triggered.""" + collector = ParameterCollector() + collector.parameters["preloaded"] = "value" + + result = collector.fetch("preloaded") + + assert result == "value" + + def test_fetch_lazy_loads_parameters_json_first(self, tmp_path, mock_ec): + """fetch() lazily loads parameters.json via fetch_parameter, so the JSON value wins over the file on disk.""" + param_dir = tmp_path / "myparam" + param_dir.mkdir() + f = param_dir / "data.csv" + f.write_text("") + + # parameters.json exists, so fetch() lazily loads it and returns the JSON value + (tmp_path / "parameters.json").write_text(json.dumps([{"parameterId": "myparam", "value": "json_val"}])) + + collector = ParameterCollector() + result = collector.fetch("myparam") + + assert result == "json_val" + + +class TestInputCollector: + def test_fetch_parameter_returns_preloaded_value(self, tmp_path, mock_ec): + """InputCollector.fetch_parameter calls parameter_collector.fetch(), + which checks self.parameters directly without lazy-loading JSON.""" + collector = InputCollector() + 
collector.parameter_collector.parameters["key"] = "val" + + assert collector.fetch_parameter("key") == "val" + + def test_fetch_parameter_falls_back_to_file(self, tmp_path, mock_ec): + param_dir = tmp_path / "myparam" + param_dir.mkdir() + f = param_dir / "data.csv" + f.write_text("") + + collector = InputCollector() + result = collector.fetch_parameter("myparam") + + assert result == f + + def test_fetch_dataset_delegates_to_dataset_collector(self, tmp_path, mock_ec): + ds_dir = tmp_path / "ds" / "ds1" + ds_dir.mkdir(parents=True) + f = ds_dir / "myfile.csv" + f.write_text("") + + mock_ec.cosmotech.dataset_absolute_path = str(tmp_path / "ds") + + collector = InputCollector() + result = collector.fetch_dataset("myfile.csv") + + assert result == f + + def test_fetch_tries_parameter_first(self, tmp_path, mock_ec): + """fetch() catches KeyError/FileNotFoundError from fetch_parameter and falls back to dataset.""" + ds_dir = tmp_path / "ds" / "ds1" + ds_dir.mkdir(parents=True) + f = ds_dir / "fallback.csv" + f.write_text("") + + mock_ec.cosmotech.dataset_absolute_path = str(tmp_path / "ds") + + collector = InputCollector() + # Pre-load a parameter so fetch_parameter returns it without touching files + collector.parameter_collector.parameters["fallback.csv"] = "param_value" + + result = collector.fetch("fallback.csv") + + assert result == "param_value" + + def test_fetch_falls_back_to_dataset(self, tmp_path, mock_ec): + ds_dir = tmp_path / "ds" / "ds1" + ds_dir.mkdir(parents=True) + f = ds_dir / "fallback.csv" + f.write_text("") + + mock_ec.cosmotech.dataset_absolute_path = str(tmp_path / "ds") + + collector = InputCollector() + result = collector.fetch("fallback.csv") + + assert result == f