44 changes: 18 additions & 26 deletions pyproject.toml
@@ -3,86 +3,78 @@ requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/DD_tools"]
packages = ["src/TreeOfLife_toolbox"]

[project]
name = "DD_tools"
name = "TreeOfLife_toolbox"
dynamic = ["version"]
authors = [
{ name = "Andrey Kopanev", email = "kopanev.1@osu.edu" },
{ name = "Elizabeth G. Campolongo", email = "e.campolongo479@gmail.com" },
{ name = "Matthew J. Thompson", email = "thompson.m.j@outlook.com" },
]
description = "A tool for downloading files from a list of URLs in parallel."
description = "A tool for processing datasets that was downloaded using the distributed-downloader package."
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.10, <3.12"
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dependencies = [
"attrs",
"brotli",
"certifi",
"charset-normalizer",
"cramjam",
"cython",
"exceptiongroup",
"fsspec",
"hatchling",
"idna",
"inflate64",
"iniconfig",
"mpi4py < 4",
"mpi4py",
"multivolumefile",
"numpy",
"opencv-python",
"packaging",
"pandas",
"pathspec",
"pillow",
"pip",
"pluggy",
"psutil",
"py4j",
"pyarrow",
"pybcj",
"pycryptodomex",
"pyppmd",
"pyspark",
"pytest",
"python-dateutil",
"python-dotenv",
"pytz",
"pyyaml",
"pyzstd",
"requests",
"setuptools",
"six",
"texttable",
"tomli",
"trove-classifiers",
"typing-extensions",
"tzdata",
"urllib3",
"wheel"
]

[project.optional-dependencies]
dev = ["pytest"]
dev = [
"pytest",
"ruff"
]

keywords = [
"parallel",
"distributed",
"download",
"url",
"mpi-applications",
"dataset-generation",
]

[project.urls]
Homepage = "https://github.com/Imageomics/distributed-downloader"
Repository = "https://github.com/Imageomics/distributed-downloader.git"
"Bug Tracker" = "https://github.com/Imageomics/distributed-downloader/issues"

[project.scripts]
tree_of_life_toolbox = "TreeOfLife_toolbox.main.main:main"

[tool.hatch.version]
path = "src/DD_tools/main/__about__.py"
path = "src/TreeOfLife_toolbox/main/__about__.py"
3 changes: 1 addition & 2 deletions scripts/tools_filter.slurm
@@ -19,11 +19,10 @@ executor_memory="64G"
module load spark/3.4.1
module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

pbs-spark-submit \
--driver-memory $driver_memory \
--executor-memory $executor_memory \
"${REPO_ROOT}/src/distributed_downloader/tools/filter.py" \
"${TOOLBOX_PATH}/main/filter.py" \
"${tool_name}" \
> "${logs_dir}/tool_filter.log"
3 changes: 1 addition & 2 deletions scripts/tools_scheduler.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_scheduler.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/scheduler.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/scheduler.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_verifier.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_verifier.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/verification.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/verification.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_worker.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task="$TOOLS_CPU_PER_WORKER" \
--mem=0 \
--output="${logs_dir}/tool_worker-%2t.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/runner.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/runner.py" "${tool_name}"
Empty file removed src/DD_tools/__init__.py
Empty file.
1 change: 1 addition & 0 deletions src/TreeOfLife_toolbox/__init__.py
@@ -0,0 +1 @@
from TreeOfLife_toolbox import column_name_change_lila_fix
69 changes: 69 additions & 0 deletions src/TreeOfLife_toolbox/column_name_change_lila_fix/README.md
@@ -0,0 +1,69 @@
# Column Name Change Lila Fix

A specialized tool built to correct column naming errors in Lila BC dataset parquet files.

## Overview

This tool fixes a specific issue where parquet files from the `storage.googleapis.com` server in the Lila BC dataset
have incorrect column names (`uuid_y` instead of `uuid` and `source_id_y` instead of `source_id`). The tool:

1. Filters for files only from the `storage.googleapis.com` server
2. Creates a schedule to distribute work across MPI workers
3. Processes each file by renaming the columns and saving to a new location

## Configuration Requirements

### Required Config Fields

- `uuid_table_path`: Path to the CSV file containing the UUID table with file paths to process

## Prerequisites

Before running this tool:

1. The UUID table must exist at the specified path
2. The table must contain at least the following columns:
- `server`: Used to filter for only `storage.googleapis.com` entries
- `path`: The full path to the parquet file to be processed
3. Original parquet files must be accessible at the paths specified in the UUID table
4. The worker nodes must have sufficient permissions to read source files and write to the destination folder
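
For illustration only, here is a minimal pandas sketch of the kind of filter described above; this is not the tool's actual implementation, and the CSV path is a placeholder:

```python
# Hypothetical sketch of the filtering step: read the UUID table and keep
# only rows whose `server` column is storage.googleapis.com.
import pandas as pd

uuid_table = pd.read_csv("/path/to/uuid_table.csv")  # placeholder path
paths_to_fix = uuid_table.loc[
    uuid_table["server"] == "storage.googleapis.com", "path"
]
print(f"{len(paths_to_fix)} parquet files queued for column renaming")
```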

## Process Flow

1. **Filtering**: The filter component extracts paths from the UUID table, keeping only those from the
`storage.googleapis.com` server
2. **Scheduling**: The scheduler distributes the paths across available worker nodes
3. **Processing**: Each worker:
- Loads the assigned parquet file
- Renames the columns according to the mapping:
- `uuid_y` → `uuid`
- `source_id_y` → `source_id`
- Saves the corrected file to a new location with zstd compression
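
For illustration only, a minimal sketch of the per-file rename-and-save step, assuming pandas with the pyarrow engine is available; the function name and the use of pandas are assumptions, not the tool's actual runner code:

```python
# Hypothetical sketch of the per-file processing step: rename the two
# misnamed columns and write the result with zstd compression.
import os

import pandas as pd

COLUMN_MAPPING = {"uuid_y": "uuid", "source_id_y": "source_id"}
OUTPUT_DIR = (
    "/fs/scratch/PAS2136/gbif/processed/lilabc/name_fix/"
    "server=storage.googleapis.com"
)


def fix_columns(src_path: str) -> None:
    df = pd.read_parquet(src_path)
    df = df.rename(columns=COLUMN_MAPPING)
    # Keep the original filename, writing into the fixed output location
    dst_path = os.path.join(OUTPUT_DIR, os.path.basename(src_path))
    df.to_parquet(dst_path, compression="zstd", index=False)
```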

## Output and Post-conditions

After successful execution:

1. Corrected parquet files will be saved to:
`/fs/scratch/PAS2136/gbif/processed/lilabc/name_fix/server=storage.googleapis.com/`

2. The directory structure of the output will preserve the original filenames

3. Each processed file will have correctly named columns:
- `uuid` (previously `uuid_y`)
- `source_id` (previously `source_id_y`)
- All other columns remain unchanged

4. A verification table will be created in the tool's directory, tracking which files were successfully processed

5. The tool's checkpoint will be marked as completed when all files have been processed

## Limitations

- This tool can only process files from the `storage.googleapis.com` server
- The column mapping is hardcoded to fix specifically `uuid_y` and `source_id_y`
- The output path is hardcoded to `/fs/scratch/PAS2136/gbif/processed/lilabc/name_fix/server=storage.googleapis.com/`
- There is a 150-second time limit for processing each file

> ⚠️ **Note**: This is a specialized tool built for a specific dataset issue. It should not be used for other cases
> without code modifications.
@@ -0,0 +1,5 @@
from .classes import (
ColumnNameChangeLilaFixFilter,
ColumnNameChangeLilaFixScheduleCreation,
ColumnNameChangeLilaFixRunner,
)