Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions cosmotech/coal/cosmotech_api/apis/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def _download_part(self, dataset_id, dataset_part, destination):
)

@staticmethod
def path_to_parts(_path, part_type) -> list[tuple[str, Path, DatasetPartTypeEnum]]:
def path_to_parts(_path, part_type) -> list[tuple[str, str, Path, DatasetPartTypeEnum]]:
if (_path := Path(_path)).is_dir():
return list((str(_p.relative_to(_path)), _p, part_type) for _p in _path.rglob("*") if _p.is_file())
return list(((_path.name, _path, part_type),))
Expand Down Expand Up @@ -118,7 +118,7 @@ def upload_dataset(
additional_data=additional_data,
parts=list(
DatasetPartCreateRequest(
name=_p_name,
name=Path(_p_name).stem,
description=_p_name,
sourceName=_p_name,
type=_type,
Expand Down Expand Up @@ -195,7 +195,7 @@ def upload_dataset_parts(

# Create new part
part_request = DatasetPartCreateRequest(
name=_p_name,
name=Path(_p_name).stem,
description=_p_name,
sourceName=_p_name,
type=_type,
Expand Down
16 changes: 11 additions & 5 deletions cosmotech/coal/postgresql/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,19 +49,25 @@ def send_runner_metadata_to_postgresql(
CREATE TABLE IF NOT EXISTS {schema_table} (
id varchar(32) PRIMARY KEY,
name varchar(256),
last_csm_run_id varchar(32),
last_csm_run_id varchar(32) UNIQUE,
run_template_id varchar(32)
);
"""
LOGGER.info(T("coal.services.postgresql.creating_table").format(schema_table=schema_table))
curs.execute(sql_create_table)
conn.commit()

runner_id = runner.get("id")
sql_delete_from_metatable = f"""
DELETE FROM {schema_table}
WHERE id= $1;
"""
curs.execute(sql_delete_from_metatable, (runner_id,))
conn.commit()

sql_upsert = f"""
INSERT INTO {schema_table} (id, name, last_csm_run_id, run_template_id)
VALUES ($1, $2, $3, $4)
ON CONFLICT (id)
DO
UPDATE SET name = EXCLUDED.name, last_csm_run_id = EXCLUDED.last_csm_run_id;
VALUES ($1, $2, $3, $4)
"""
LOGGER.debug(runner)
curs.execute(
Expand Down
2 changes: 1 addition & 1 deletion cosmotech/coal/postgresql/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def dump_store_to_postgresql_from_conf(
)
if fk_id and _psql.is_metadata_exists():
metadata_table = f"{_psql.metadata_table_name}"
_psql.add_fk_constraint(table_name, "csm_run_id", metadata_table, "last_csm_run_id")
_psql.add_fk_constraint(target_table_name, "csm_run_id", metadata_table, "last_csm_run_id")

total_rows += rows
_up_time = perf_counter()
Expand Down
13 changes: 10 additions & 3 deletions cosmotech/coal/postgresql/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,13 +155,20 @@ def add_fk_constraint(
to_table: str,
to_col: str,
) -> None:
# Connect to PostgreSQL and remove runner metadata row
with dbapi.connect(self.full_uri, autocommit=True) as conn:
# Connect to PostgreSQL and add a foreign key constraint
with dbapi.connect(self.full_uri, autocommit=False) as conn:
with conn.cursor() as curs:
sql_drop_fk = f"""
ALTER TABLE {self.db_schema}.{from_table}
DROP CONSTRAINT IF EXISTS metadata;
"""
sql_add_fk = f"""
ALTER TABLE {self.db_schema}.{from_table}
CONSTRAINT metadata FOREIGN KEY ({from_col}) REFERENCES {to_table}({to_col})
ADD CONSTRAINT metadata FOREIGN KEY ({from_col})
REFERENCES {self.db_schema}.{to_table}({to_col})
ON DELETE CASCADE;
"""
curs.execute(sql_drop_fk)
curs.execute(sql_add_fk)
conn.commit()

Expand Down
45 changes: 45 additions & 0 deletions cosmotech/coal/store/parquet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Copyright (C) - 2023 - 2025 - Cosmo Tech
# This document and all information contained herein is the exclusive property -
# including all intellectual property rights pertaining thereto - of Cosmo Tech.
# Any use, reproduction, translation, broadcasting, transmission, distribution,
# etc., to any person is prohibited unless it has been previously and
# specifically authorized by written means by Cosmo Tech.

import pathlib

import pyarrow as pa
import pyarrow.parquet as pq

from cosmotech.coal.store.store import Store


def store_parquet_file(
    table_name: str,
    parquet_path: pathlib.Path,
    replace_existsing_file: bool = False,
    store=None,
):
    """Read a parquet file and register its contents as a table in the store.

    Column names are sanitized through ``Store.sanitize_column`` before the
    table is added.

    Args:
        table_name: Name under which the table is registered in the store.
        parquet_path: Path to an existing ``.parquet`` file.
        replace_existsing_file: If True, replace a table of the same name
            already present in the store.
        store: Target store; a fresh ``Store()`` is created when omitted.
            (``None`` default avoids the shared mutable-default pitfall of
            the previous ``store=Store()`` signature, which built one shared
            instance at import time.)

    Raises:
        FileNotFoundError: If ``parquet_path`` does not exist.
    """
    if store is None:
        store = Store()
    if not parquet_path.exists():
        raise FileNotFoundError(f"File {parquet_path} does not exist")

    data: pa.Table = pq.ParquetFile(parquet_path).read()
    sanitized_names = [Store.sanitize_column(_column) for _column in data.column_names]
    data = data.rename_columns(sanitized_names)

    store.add_table(table_name=table_name, data=data, replace=replace_existsing_file)


def convert_store_table_to_parquet(
    table_name: str,
    parquet_path: pathlib.Path,
    replace_existsing_file: bool = False,
    store=None,
):
    """Write a store table to a parquet file.

    ``parquet_path`` may be a full ``*.parquet`` file path or a directory; in
    the latter case the file is named ``<table_name>.parquet`` inside it.

    Args:
        table_name: Name of the table to export from the store.
        parquet_path: Target file or target directory.
        replace_existsing_file: If True, overwrite an existing target file.
        store: Source store; a fresh ``Store()`` is created when omitted
            (``None`` default avoids a shared mutable default argument).

    Raises:
        FileExistsError: If the target file already exists and
            ``replace_existsing_file`` is False.
    """
    if store is None:
        store = Store()
    # Resolve the final file path FIRST so the overwrite check also covers
    # the "directory + generated file name" case (previously only paths that
    # already ended in ".parquet" were checked, so folder/<table>.parquet
    # could be silently overwritten).
    if not parquet_path.name.endswith(".parquet"):
        parquet_path = parquet_path / f"{table_name}.parquet"
    if parquet_path.exists() and not replace_existsing_file:
        raise FileExistsError(f"File {parquet_path} already exists")
    parquet_path.parent.mkdir(parents=True, exist_ok=True)

    pq.write_table(store.get_table(table_name), parquet_path)
2 changes: 2 additions & 0 deletions cosmotech/coal/utils/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class Configuration(Dotdict):
"api": {"url": "CSM_API_URL", "scope": "CSM_API_SCOPE"},
"dataset_absolute_path": "CSM_DATASET_ABSOLUTE_PATH",
"parameters_absolute_path": "CSM_PARAMETERS_ABSOLUTE_PATH",
"output_absolute_path": "CSM_OUTPUT_ABSOLUTE_PATH",
"tmp_absolute_path": "CSM_TEMP_ABSOLUTE_PATH",
"organization_id": "CSM_ORGANIZATION_ID",
"workspace_id": "CSM_WORKSPACE_ID",
"runner_id": "CSM_RUNNER_ID",
Expand Down
121 changes: 121 additions & 0 deletions cosmotech/coal/utils/input_collector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import json
import os
from pathlib import Path

from cosmotech.coal.utils.configuration import ENVIRONMENT_CONFIGURATION as EC
from cosmotech.coal.utils.logger import LOGGER


class InputCollector:
    """Single entry point to resolve a named run input, whatever its origin.

    Delegates to dedicated collectors for parameters, workspace files and
    dataset files; :meth:`fetch` tries those sources in that order.
    """

    def __init__(self):
        self.dataset_collector = DatasetCollector()
        self.parameter_collector = ParameterCollector()
        self.workspace_collector = WorkspaceCollector()

    def fetch(self, name: str) -> Path:
        """Resolve ``name`` against parameters, then workspace files, then
        dataset files; the error of the last lookup propagates."""
        try:
            return self.fetch_parameter(name)
        except (KeyError, FileNotFoundError):
            LOGGER.debug(f"Parameter {name} not found, trying workspace files.")
            try:
                return self.fetch_workspace_file(name)
            except FileNotFoundError:
                LOGGER.debug(f"Workspace file {name} not found, trying dataset files.")
                return self.fetch_dataset(name)

    def fetch_parameter(self, param_name: str) -> Path:
        """Resolve a runner parameter via the parameter collector."""
        return self.parameter_collector.fetch(param_name)

    def fetch_workspace_file(self, file_name: str) -> Path:
        """Resolve a file via the workspace-files collector."""
        return self.workspace_collector.fetch(file_name)

    def fetch_dataset(self, dataset_name: str) -> Path:
        """Resolve a file via the dataset collector."""
        return self.dataset_collector.fetch(dataset_name)


class DatasetCollector:
    """Lazily indexes every file found under the dataset download folder.

    Each file is reachable by two keys: its full file name and its stem
    (extension-less name). On key collision the last file walked wins.
    """

    def __init__(self):
        # file name / stem -> absolute file path
        self.paths: dict[str, Path] = {}

    def collect(self):
        """Walk each dataset sub-folder and index the files it contains."""
        root = Path(EC.cosmotech.dataset_absolute_path)
        for dataset_id in os.listdir(root):
            for current_dir, _dirs, files in os.walk(root / dataset_id):
                for file_name in files:
                    file_path = Path(current_dir) / file_name
                    self.paths[file_name] = file_path
                    self.paths[file_path.stem] = file_path

    def fetch(self, dataset_name: str) -> Path:
        # Populate the index on first use only (lazy, avoids os.walk calls
        # when the collector is never queried).
        if not self.paths:
            self.collect()
        found = self.paths.get(dataset_name)
        if found is not None:
            return found
        raise FileNotFoundError(f"File for {dataset_name} not found in {EC.cosmotech.dataset_absolute_path}.")


class WorkspaceCollector:
    """Lazily indexes files under the ``workspace_files`` sub-folder.

    NOTE(review): workspace files are looked up under the *dataset* absolute
    path — confirm this matches how the runner lays out its inputs.
    Each file is reachable by its full file name and by its stem.
    """

    def __init__(self):
        # file name / stem -> absolute file path
        self.paths: dict[str, Path] = {}

    def collect(self):
        """Walk the workspace_files folder (if any) and index its files."""
        base = Path(EC.cosmotech.dataset_absolute_path) / "workspace_files"
        if not base.exists():
            return
        for current_dir, _dirs, files in os.walk(base):
            for file_name in files:
                file_path = Path(current_dir) / file_name
                self.paths[file_name] = file_path
                self.paths[file_path.stem] = file_path

    def fetch(self, file_name: str) -> Path:
        # Lazy population: only walk the filesystem on first lookup.
        if not self.paths:
            self.collect()
        found = self.paths.get(file_name)
        if found is not None:
            return found
        raise FileNotFoundError(f"File {file_name} not found in workspace_files.")


class ParameterCollector:
    """Resolves runner parameters: raw values from ``parameters.json`` or
    file paths for file-type parameters."""

    def __init__(self):
        # parameter name (parent folder name) / file stem -> file path
        self.paths: dict[str, Path] = {}
        # parameterId -> raw value from parameters.json
        self.parameters: dict[str, str] = {}
        # Guards against re-reading parameters.json on every fetch when the
        # file is missing or empty (``if not self.parameters`` alone would
        # re-trigger the read each call).
        self._parameters_loaded = False

    def read_parameters_json(self):
        """Load parameterId -> value pairs from ``parameters.json`` if present."""
        parameter_file = Path(EC.cosmotech.parameters_absolute_path) / "parameters.json"
        if parameter_file.exists():
            with open(parameter_file) as f:
                parameters = json.load(f)
            for parameter in parameters:
                self.parameters[parameter["parameterId"]] = parameter["value"]
        self._parameters_loaded = True

    def collect(self):
        """Walk the parameters folder and index each file by its parent
        folder name (the parameter id) and by its stem."""
        for entry in os.listdir(EC.cosmotech.parameters_absolute_path):
            for r, _d, files in os.walk(Path(EC.cosmotech.parameters_absolute_path) / entry):
                for file_name in files:
                    path = Path(r) / file_name
                    self.paths[path.parent.name] = path
                    self.paths[path.stem] = path

    def fetch_parameter(self, param_name: str) -> str:
        # Returns the raw JSON value (fixed annotation: previously declared
        # ``-> Path`` although a string value is returned).
        # Lazy load to avoid unnecessary json reads.
        if not self._parameters_loaded:
            self.read_parameters_json()
        return self.parameters[param_name]

    def fetch_file_path(self, param_name: str) -> Path:
        # lazy collection to avoid unnecessary os.walk calls
        if not self.paths:
            self.collect()
        if param_name in self.paths:
            return self.paths[param_name]
        raise FileNotFoundError(f"File for {param_name} not found in {EC.cosmotech.parameters_absolute_path}.")

    def fetch(self, param_name: str) -> "str | Path":
        """Return the raw parameter value, falling back to the path of the
        parameter's file when no JSON value exists."""
        try:
            return self.fetch_parameter(param_name)
        except KeyError:
            return self.fetch_file_path(param_name)


ENVIRONMENT_INPUT_COLLECTOR = InputCollector()
2 changes: 1 addition & 1 deletion cosmotech/csm_data/commands/api/wsf_load_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
"--workspace-path",
help=T("csm_data.commands.api.wsf_load_file.parameters.workspace_path"),
metavar="PATH",
default="/",
default="",
type=str,
)
@click.option(
Expand Down
49 changes: 49 additions & 0 deletions cosmotech/csm_data/commands/store/load_parquet_folder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright (C) - 2023 - 2025 - Cosmo Tech
# This document and all information contained herein is the exclusive property -
# including all intellectual property rights pertaining thereto - of Cosmo Tech.
# Any use, reproduction, translation, broadcasting, transmission, distribution,
# etc., to any person is prohibited unless it has been previously and
# specifically authorized by written means by Cosmo Tech.
from cosmotech.orchestrator.utils.translate import T

from cosmotech.csm_data.utils.click import click
from cosmotech.csm_data.utils.decorators import translate_help, web_help


@click.command()
@web_help("csm-data/store/load-parquet-folder")
@translate_help("csm_data.commands.store.load_parquet_folder.description")
@click.option(
    "--store-folder",
    envvar="CSM_PARAMETERS_ABSOLUTE_PATH",
    help=T("csm_data.commands.store.load_parquet_folder.parameters.store_folder"),
    metavar="PATH",
    type=str,
    show_envvar=True,
    required=True,
)
@click.option(
    "--parquet-folder",
    envvar="CSM_OUTPUT_ABSOLUTE_PATH",
    help=T("csm_data.commands.store.load_parquet_folder.parameters.parquet_folder"),
    metavar="PATH",
    type=str,
    show_envvar=True,
    required=True,
)
def load_parquet_folder(store_folder, parquet_folder):
    """Load every ``*.parquet`` file of ``parquet_folder`` into the store at
    ``store_folder`` — one table per file, named after the file's stem."""
    # Import the modules and functions at the start of the command
    import pathlib

    from cosmotech.coal.store.parquet import store_parquet_file
    from cosmotech.coal.store.store import Store
    from cosmotech.coal.utils.configuration import Configuration
    from cosmotech.coal.utils.logger import LOGGER

    _conf = Configuration()

    # Point the store at the requested folder before opening it.
    _conf.coal.store = store_folder

    for parquet_path in pathlib.Path(parquet_folder).glob("*.parquet"):
        # NOTE(review): reuses the azure_storage translation key for this log
        # line; a dedicated load_parquet_folder key may be preferable.
        LOGGER.info(T("coal.services.azure_storage.found_file").format(file=parquet_path.name))
        # ``stem`` strips the ".parquet" suffix — clearer and safer than the
        # previous ``name[:-8]`` slice.
        store_parquet_file(parquet_path.stem, parquet_path, store=Store(False, _conf))
2 changes: 2 additions & 0 deletions cosmotech/csm_data/commands/store/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from cosmotech.csm_data.commands.store.load_from_singlestore import (
load_from_singlestore_command,
)
from cosmotech.csm_data.commands.store.load_parquet_folder import load_parquet_folder
from cosmotech.csm_data.commands.store.output import output
from cosmotech.csm_data.commands.store.reset import reset
from cosmotech.csm_data.utils.click import click
Expand All @@ -30,6 +31,7 @@ def store():
store.add_command(reset, "reset")
store.add_command(list_tables, "list-tables")
store.add_command(load_csv_folder, "load-csv-folder")
store.add_command(load_parquet_folder, "load-parquet-folder")
store.add_command(load_from_singlestore_command, "load-from-singlestore")
store.add_command(dump_to_postgresql, "dump-to-postgresql")
store.add_command(dump_to_s3, "dump-to-s3")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
description: |
  Find all parquet files in the given folder and load each one into the store as a table.
parameters:
  store_folder: The folder containing the store files
  parquet_folder: The folder containing the parquet files to load into the store
4 changes: 2 additions & 2 deletions tests/unit/coal/test_cosmotech_api/test_apis/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,13 +543,13 @@ def test_update_dataset_mixed_files(self, mock_cosmotech_config, mock_api_client
assert len(args_list) == 2
# check first call used to create csv part
dpcr = args_list[0].kwargs.get("dataset_part_create_request")
assert dpcr.name == "data.csv"
assert dpcr.name == "data"
assert dpcr.source_name == "data.csv"
assert dpcr.description == "data.csv"
assert dpcr.type == DatasetPartTypeEnum.FILE
# check second call used to create db part
dpcr = args_list[1].kwargs.get("dataset_part_create_request")
assert dpcr.name == "data.db"
assert dpcr.name == "data"
assert dpcr.source_name == "data.db"
assert dpcr.description == "data.db"
assert dpcr.type == DatasetPartTypeEnum.DB
11 changes: 8 additions & 3 deletions tests/unit/coal/test_postgresql/test_postgresql_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,19 @@ def test_send_runner_metadata_to_postgresql(self, mock_connect, mock_postgres_ut
mock_connect.assert_called_once_with("postgresql://user:password@localhost:5432/testdb", autocommit=True)

# Check that SQL statements were executed
assert mock_cursor.execute.call_count == 2
assert mock_cursor.execute.call_count == 3

# Verify the SQL statements (partially, since the exact SQL is complex)
create_table_call = mock_cursor.execute.call_args_list[0]
assert "CREATE TABLE IF NOT EXISTS" in create_table_call[0][0]
assert "public.test_runnermetadata" in create_table_call[0][0]

upsert_call = mock_cursor.execute.call_args_list[1]
delete_call = mock_cursor.execute.call_args_list[1]
assert "DELETE FROM" in delete_call[0][0]
assert "public.test_runnermetadata" in delete_call[0][0]
assert delete_call[0][1] == ("test-runner-id",)

upsert_call = mock_cursor.execute.call_args_list[2]
assert "INSERT INTO" in upsert_call[0][0]
assert "public.test_runnermetadata" in upsert_call[0][0]
assert upsert_call[0][1] == (
Expand All @@ -90,7 +95,7 @@ def test_send_runner_metadata_to_postgresql(self, mock_connect, mock_postgres_ut
)

# Check that commits were called
assert mock_conn.commit.call_count == 2
assert mock_conn.commit.call_count == 3

# Verify the function returns the lastRunId
assert result == "test-run-id"
Expand Down
Loading
Loading