44 changes: 18 additions & 26 deletions pyproject.toml
@@ -3,86 +3,78 @@ requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/DD_tools"]
packages = ["src/TreeOfLife_toolbox"]

[project]
name = "DD_tools"
name = "TreeOfLife_toolbox"
dynamic = ["version"]
authors = [
{ name = "Andrey Kopanev", email = "kopanev.1@osu.edu" },
{ name = "Elizabeth G. Campolongo", email = "e.campolongo479@gmail.com" },
{ name = "Matthew J. Thompson", email = "thompson.m.j@outlook.com" },
]
description = "A tool for downloading files from a list of URLs in parallel."
description = "A tool for processing datasets that was downloaded using the distributed-downloader package."
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.10, <3.12"
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dependencies = [
"attrs",
"brotli",
"certifi",
"charset-normalizer",
"cramjam",
"cython",
"exceptiongroup",
"fsspec",
"hatchling",
"idna",
"inflate64",
"iniconfig",
"mpi4py < 4",
"mpi4py",
"multivolumefile",
"numpy",
"opencv-python",
"packaging",
"pandas",
"pathspec",
"pillow",
"pip",
"pluggy",
"psutil",
"py4j",
"pyarrow",
"pybcj",
"pycryptodomex",
"pyppmd",
"pyspark",
"pytest",
"python-dateutil",
"python-dotenv",
"pytz",
"pyyaml",
"pyzstd",
"requests",
"setuptools",
"six",
"texttable",
"tomli",
"trove-classifiers",
"typing-extensions",
"tzdata",
"urllib3",
"wheel"
]

[project.optional-dependencies]
dev = ["pytest"]
dev = [
"pytest",
"ruff"
]

keywords = [
"parallel",
"distributed",
"download",
"url",
"mpi-applications",
"dataset-generation",
]

[project.urls]
Homepage = "https://github.com/Imageomics/distributed-downloader"
Repository = "https://github.com/Imageomics/distributed-downloader.git"
"Bug Tracker" = "https://github.com/Imageomics/distributed-downloader/issues"

[project.scripts]
tree_of_life_toolbox = "TreeOfLife_toolbox.main.main:main"

[tool.hatch.version]
path = "src/DD_tools/main/__about__.py"
path = "src/TreeOfLife_toolbox/main/__about__.py"
3 changes: 1 addition & 2 deletions scripts/tools_filter.slurm
@@ -19,11 +19,10 @@ executor_memory="64G"
module load spark/3.4.1
module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

pbs-spark-submit \
--driver-memory $driver_memory \
--executor-memory $executor_memory \
"${REPO_ROOT}/src/distributed_downloader/tools/filter.py" \
"${TOOLBOX_PATH}/main/filter.py" \
"${tool_name}" \
> "${logs_dir}/tool_filter.log"
3 changes: 1 addition & 2 deletions scripts/tools_scheduler.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_scheduler.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/scheduler.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/scheduler.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_verifier.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_verifier.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/verification.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/verification.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_worker.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task="$TOOLS_CPU_PER_WORKER" \
--mem=0 \
--output="${logs_dir}/tool_worker-%2t.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/runner.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/runner.py" "${tool_name}"
Empty file removed src/DD_tools/__init__.py
Empty file.
1 change: 1 addition & 0 deletions src/TreeOfLife_toolbox/__init__.py
@@ -0,0 +1 @@
from TreeOfLife_toolbox import column_name_change_lila_fix
69 changes: 69 additions & 0 deletions src/TreeOfLife_toolbox/column_name_change_lila_fix/README.md
@@ -0,0 +1,69 @@
# Column Name Change Lila Fix

A specialized tool built to correct column naming errors in Lila BC dataset parquet files.

## Overview

This tool fixes a specific issue where parquet files from the `storage.googleapis.com` server in the Lila BC dataset
have incorrect column names (`uuid_y` instead of `uuid` and `source_id_y` instead of `source_id`). The tool:

1. Filters for files only from the `storage.googleapis.com` server
2. Creates a schedule to distribute work across MPI workers
3. Processes each file by renaming the columns and saving to a new location

## Configuration Requirements

### Required Config Fields

- `uuid_table_path`: Path to the CSV file containing the UUID table with file paths to process

## Prerequisites

Before running this tool:

1. The UUID table must exist at the specified path
2. The table must contain at least the following columns:
- `server`: Used to filter for only `storage.googleapis.com` entries
- `path`: The full path to the parquet file to be processed
3. Original parquet files must be accessible at the paths specified in the UUID table
4. The worker nodes must have sufficient permissions to read source files and write to the destination folder
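
For illustration only, here is a minimal pandas sketch of the kind of filter described above; this is not the tool's actual implementation, and the CSV path is a placeholder:

```python
# Hypothetical sketch of the filtering step: read the UUID table and keep
# only rows whose `server` column is storage.googleapis.com.
import pandas as pd

uuid_table = pd.read_csv("/path/to/uuid_table.csv")  # placeholder path
paths_to_fix = uuid_table.loc[
    uuid_table["server"] == "storage.googleapis.com", "path"
]
print(f"{len(paths_to_fix)} parquet files queued for column renaming")
```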

## Process Flow

1. **Filtering**: The filter component extracts paths from the UUID table, keeping only those from the
`storage.googleapis.com` server
2. **Scheduling**: The scheduler distributes the paths across available worker nodes
3. **Processing**: Each worker:
- Loads the assigned parquet file
- Renames the columns according to the mapping:
- `uuid_y` → `uuid`
- `source_id_y` → `source_id`
- Saves the corrected file to a new location with zstd compression
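
For illustration only, a minimal sketch of the per-file rename-and-save step, assuming pandas with the pyarrow engine is available; the function name and the use of pandas are assumptions, not the tool's actual runner code:

```python
# Hypothetical sketch of the per-file processing step: rename the two
# misnamed columns and write the result with zstd compression.
import os

import pandas as pd

COLUMN_MAPPING = {"uuid_y": "uuid", "source_id_y": "source_id"}
OUTPUT_DIR = (
    "/fs/scratch/PAS2136/gbif/processed/lilabc/name_fix/"
    "server=storage.googleapis.com"
)


def fix_columns(src_path: str) -> None:
    df = pd.read_parquet(src_path)
    df = df.rename(columns=COLUMN_MAPPING)
    # Keep the original filename, writing into the fixed output location
    dst_path = os.path.join(OUTPUT_DIR, os.path.basename(src_path))
    df.to_parquet(dst_path, compression="zstd", index=False)
```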

## Output and Post-conditions

After successful execution:

1. Corrected parquet files will be saved to:
`/fs/scratch/PAS2136/gbif/processed/lilabc/name_fix/server=storage.googleapis.com/`

2. The directory structure of the output will preserve the original filenames

3. Each processed file will have correctly named columns:
- `uuid` (previously `uuid_y`)
- `source_id` (previously `source_id_y`)
- All other columns remain unchanged

4. A verification table will be created in the tool's directory, tracking which files were successfully processed

5. The tool's checkpoint will be marked as completed when all files have been processed

## Limitations

- This tool can only process files from the `storage.googleapis.com` server
- The column mapping is hardcoded to fix specifically `uuid_y` and `source_id_y`
- The output path is hardcoded to `/fs/scratch/PAS2136/gbif/processed/lilabc/name_fix/server=storage.googleapis.com/`
- There is a 150-second time limit for processing each file

> ⚠️ **Note**: This is a specialized tool built for a specific dataset issue. It should not be used for other cases
> without code modifications.
@@ -0,0 +1,5 @@
from .classes import (
ColumnNameChangeLilaFixFilter,
ColumnNameChangeLilaFixScheduleCreation,
ColumnNameChangeLilaFixRunner,
)