From 10653306180f5212b91617a2842c6057e45a9918 Mon Sep 17 00:00:00 2001
From: Andrey170170 <andrey24122004@gmail.com>
Date: Sun, 16 Feb 2025 16:05:25 -0500
Subject: [PATCH 1/5] Added fathom_net_crop tool

---
 src/DD_tools/fathom_net_crop/__init__.py |   0
 src/DD_tools/fathom_net_crop/classes.py  | 123 +++++++++++++++++++++++
 2 files changed, 123 insertions(+)
 create mode 100644 src/DD_tools/fathom_net_crop/__init__.py
 create mode 100644 src/DD_tools/fathom_net_crop/classes.py

diff --git a/src/DD_tools/fathom_net_crop/__init__.py b/src/DD_tools/fathom_net_crop/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/DD_tools/fathom_net_crop/classes.py b/src/DD_tools/fathom_net_crop/classes.py
new file mode 100644
index 0000000..a3777bd
--- /dev/null
+++ b/src/DD_tools/fathom_net_crop/classes.py
@@ -0,0 +1,123 @@
+import hashlib
+import os
+import shutil
+from typing import List
+
+import numpy as np
+import pandas as pd
+
+from DD_tools.main.config import Config
+from DD_tools.main.filters import PythonFilterToolBase, FilterRegister
+from DD_tools.main.runners import MPIRunnerTool, RunnerRegister
+from DD_tools.main.schedulers import DefaultScheduler, SchedulerRegister
+
+
+@FilterRegister("fathom_net_crop")
+class FathomnetCropFilter(PythonFilterToolBase):
+    def __init__(self, cfg: Config):
+        super().__init__(cfg)
+        self.filter_name: str = "fathom_net_crop"
+
+
+@SchedulerRegister("fathom_net_crop")
+class FathomnetCropScheduleCreation(DefaultScheduler):
+    def __init__(self, cfg: Config):
+        super().__init__(cfg)
+        self.filter_name: str = "fathom_net_crop"
+
+
+@RunnerRegister("fathom_net_crop")
+class FathomnetCropRunner(MPIRunnerTool):
+    def __init__(self, cfg: Config):
+        super().__init__(cfg)
+        self.filter_name: str = "fathom_net_crop"
+        self.data_scheme: List[str] = ["server_name", "partition_id"]
+        self.verification_scheme: List[str] = ["server_name", "partition_id"]
+        self.total_time = 150
+        self.bb_df = pd.read_csv(
+            "/fs/scratch/PAS2136/gbif/processed/fathomNet/filtered_by_size.csv"
+        )
+        self.image_crop_path = os.path.join(
+            cfg.get_folder("path_to_output_folder"), "image_crop"
+        )
+
+    def apply_filter(
+            self, filtering_df: pd.DataFrame, server_name: str, partition_id: int
+    ) -> int:
+        self.is_enough_time()
+
+        input_path = os.path.join(
+            self.downloaded_images_path,
+            f"server_name={server_name}",
+            f"partition_id={partition_id}",
+        )
+        parquet_path = os.path.join(input_path, "successes.parquet")
+
+        if not os.path.exists(parquet_path):
+            self.logger.info(f"Path doesn't exists: {parquet_path}")
+            return 0
+
+        full_image = pd.read_parquet(
+            parquet_path, filters=[("source_id", "in", self.bb_df["image_uuid"])]
+        )
+
+        self.is_enough_time()
+
+        columns = full_image.columns
+        full_image = full_image.merge(
+            self.bb_df,
+            left_on="source_id",
+            right_on="image_uuid",
+            how="inner",
+            validate="1:m",
+        )
+        cropped_images = []
+        for _, row in full_image.iterrows():
+            cropped_entry = row[columns].to_dict()
+            image_binary = row["image"]
+            image_size = row["resized_size"]
+            image_np = np.frombuffer(image_binary, dtype=np.uint8).reshape(
+                [image_size[0], image_size[1], 3]
+            )
+            # fix
+            min_y = min(image_size[0], max(row["y"], 0))
+            min_x = min(image_size[1], max(row["x"], 0))
+            max_y = min(image_size[0], max(row["y"] + row["height"], 0))
+            max_x = min(image_size[1], max(row["x"] + row["width"], 0))
+
+            image_cropped = image_np[min_y:max_y, min_x:max_x]
+
+            cropped_entry["image"] = image_cropped.tobytes()
+            cropped_entry["resized_size"] = (max_y - min_y, max_x - min_x)
+            cropped_entry["hashsum_resized"] = hashlib.md5(
+                cropped_entry["image"]
+            ).hexdigest()
+            cropped_entry["uuid"] = row["tol_uuid"]
+            cropped_entry["source_id"] = row["bb_uuid"]
+
+            assert len(cropped_entry["image"]) == (
+                    cropped_entry["resized_size"][0] * cropped_entry["resized_size"][1] * 3
+            ), f"Size mismatch for {row['tol_uuid']}"
+
+            cropped_images.append(cropped_entry)
+
+        self.is_enough_time()
+        full_image = pd.DataFrame(cropped_images)
+        output_path = os.path.join(
+            self.image_crop_path,
+            f"server_name={server_name}",
+            f"partition_id={partition_id}",
+        )
+        os.makedirs(output_path, exist_ok=True)
+        full_image.to_parquet(
+            os.path.join(output_path, "successes.parquet"),
+            index=False,
+            compression="zstd",
+            compression_level=3,
+        )
+        for file in ["errors.parquet", "completed"]:
+            shutil.copyfile(
+                os.path.join(input_path, file), os.path.join(output_path, file)
+            )
+
+        return len(full_image)

From f41119261d8b8b6a8a7da29ce0cfbe37b604b389 Mon Sep 17 00:00:00 2001
From: Andrey170170 <andrey24122004@gmail.com>
Date: Mon, 12 May 2025 02:29:05 -0400
Subject: [PATCH 2/5] Rename project from 'DD_tools' to 'TreeOfLife_toolbox'.

Updated package structure, filenames, and references to reflect the new name. Adjusted `pyproject.toml` to rename the project, update dependencies, and modify supported Python versions. These changes ensure consistency and alignment with the new project branding.
---
 pyproject.toml                                | 28 ++++---------------
 .../__init__.py                               |  0
 .../main/__about__.py                         |  0
 .../main/checkpoint.py                        |  0
 .../main/config.py                            |  0
 .../main/config_templates/tools.yaml          |  0
 .../main/filter.py                            |  0
 .../main/filters.py                           |  0
 .../main/main.py                              |  0
 .../main/registry.py                          |  0
 .../main/runner.py                            |  0
 .../main/runners.py                           |  0
 .../main/scheduler.py                         |  0
 .../main/schedulers.py                        |  0
 .../main/utils.py                             |  0
 .../main/verification.py                      |  0
 16 files changed, 5 insertions(+), 23 deletions(-)
 rename src/{DD_tools => TreeOfLife_toolbox}/__init__.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/__about__.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/checkpoint.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/config.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/config_templates/tools.yaml (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/filter.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/filters.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/main.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/registry.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/runner.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/runners.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/scheduler.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/schedulers.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/utils.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/verification.py (100%)

diff --git a/pyproject.toml b/pyproject.toml
index 17be78f..cb76174 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,10 +3,10 @@ requires = ["hatchling"]
 build-backend = "hatchling.build"
 
 [tool.hatch.build.targets.wheel]
-packages = ["src/DD_tools"]
+packages = ["src/TreeOfLife_toolbox"]
 
 [project]
-name = "DD_tools"
+name = "TreeOfLife_toolbox"
 dynamic = ["version"]
 authors = [
     { name = "Andrey Kopanev", email = "kopanev.1@osu.edu" },
@@ -15,7 +15,7 @@ authors = [
 ]
 description = "A tool for downloading files from a list of URLs in parallel."
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.10, <3.12"
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
@@ -24,48 +24,30 @@ classifiers = [
 dependencies = [
     "attrs",
     "brotli",
-    "certifi",
-    "charset-normalizer",
     "cramjam",
     "cython",
-    "exceptiongroup",
     "fsspec",
-    "hatchling",
-    "idna",
     "inflate64",
-    "iniconfig",
-    "mpi4py < 4",
+    "mpi4py",
     "multivolumefile",
-    "numpy",
     "opencv-python",
-    "packaging",
     "pandas",
     "pathspec",
     "pillow",
-    "pip",
-    "pluggy",
     "psutil",
-    "py4j",
     "pyarrow",
     "pybcj",
     "pycryptodomex",
     "pyppmd",
     "pyspark",
-    "pytest",
-    "python-dateutil",
     "python-dotenv",
-    "pytz",
     "pyyaml",
     "pyzstd",
     "requests",
     "setuptools",
-    "six",
     "texttable",
-    "tomli",
     "trove-classifiers",
     "typing-extensions",
-    "tzdata",
-    "urllib3",
     "wheel"
 ]
 
@@ -85,4 +67,4 @@ Repository = "https://github.com/Imageomics/distributed-downloader.git"
 "Bug Tracker" = "https://github.com/Imageomics/distributed-downloader/issues"
 
 [tool.hatch.version]
-path = "src/DD_tools/main/__about__.py"
+path = "src/TreeOfLife_toolbox/main/__about__.py"
diff --git a/src/DD_tools/__init__.py b/src/TreeOfLife_toolbox/__init__.py
similarity index 100%
rename from src/DD_tools/__init__.py
rename to src/TreeOfLife_toolbox/__init__.py
diff --git a/src/DD_tools/main/__about__.py b/src/TreeOfLife_toolbox/main/__about__.py
similarity index 100%
rename from src/DD_tools/main/__about__.py
rename to src/TreeOfLife_toolbox/main/__about__.py
diff --git a/src/DD_tools/main/checkpoint.py b/src/TreeOfLife_toolbox/main/checkpoint.py
similarity index 100%
rename from src/DD_tools/main/checkpoint.py
rename to src/TreeOfLife_toolbox/main/checkpoint.py
diff --git a/src/DD_tools/main/config.py b/src/TreeOfLife_toolbox/main/config.py
similarity index 100%
rename from src/DD_tools/main/config.py
rename to src/TreeOfLife_toolbox/main/config.py
diff --git a/src/DD_tools/main/config_templates/tools.yaml b/src/TreeOfLife_toolbox/main/config_templates/tools.yaml
similarity index 100%
rename from src/DD_tools/main/config_templates/tools.yaml
rename to src/TreeOfLife_toolbox/main/config_templates/tools.yaml
diff --git a/src/DD_tools/main/filter.py b/src/TreeOfLife_toolbox/main/filter.py
similarity index 100%
rename from src/DD_tools/main/filter.py
rename to src/TreeOfLife_toolbox/main/filter.py
diff --git a/src/DD_tools/main/filters.py b/src/TreeOfLife_toolbox/main/filters.py
similarity index 100%
rename from src/DD_tools/main/filters.py
rename to src/TreeOfLife_toolbox/main/filters.py
diff --git a/src/DD_tools/main/main.py b/src/TreeOfLife_toolbox/main/main.py
similarity index 100%
rename from src/DD_tools/main/main.py
rename to src/TreeOfLife_toolbox/main/main.py
diff --git a/src/DD_tools/main/registry.py b/src/TreeOfLife_toolbox/main/registry.py
similarity index 100%
rename from src/DD_tools/main/registry.py
rename to src/TreeOfLife_toolbox/main/registry.py
diff --git a/src/DD_tools/main/runner.py b/src/TreeOfLife_toolbox/main/runner.py
similarity index 100%
rename from src/DD_tools/main/runner.py
rename to src/TreeOfLife_toolbox/main/runner.py
diff --git a/src/DD_tools/main/runners.py b/src/TreeOfLife_toolbox/main/runners.py
similarity index 100%
rename from src/DD_tools/main/runners.py
rename to src/TreeOfLife_toolbox/main/runners.py
diff --git a/src/DD_tools/main/scheduler.py b/src/TreeOfLife_toolbox/main/scheduler.py
similarity index 100%
rename from src/DD_tools/main/scheduler.py
rename to src/TreeOfLife_toolbox/main/scheduler.py
diff --git a/src/DD_tools/main/schedulers.py b/src/TreeOfLife_toolbox/main/schedulers.py
similarity index 100%
rename from src/DD_tools/main/schedulers.py
rename to src/TreeOfLife_toolbox/main/schedulers.py
diff --git a/src/DD_tools/main/utils.py b/src/TreeOfLife_toolbox/main/utils.py
similarity index 100%
rename from src/DD_tools/main/utils.py
rename to src/TreeOfLife_toolbox/main/utils.py
diff --git a/src/DD_tools/main/verification.py b/src/TreeOfLife_toolbox/main/verification.py
similarity index 100%
rename from src/DD_tools/main/verification.py
rename to src/TreeOfLife_toolbox/main/verification.py

From 4a982dd82b03ab2bd8e9a8fc1a7ea270e51b6f53 Mon Sep 17 00:00:00 2001
From: Andrey170170 <andrey24122004@gmail.com>
Date: Mon, 12 May 2025 02:34:30 -0400
Subject: [PATCH 3/5] Refactor import paths to use TreeOfLife_toolbox module.

Updated all import statements to reference `TreeOfLife_toolbox` instead of `DD_tools`. Adjusted the Slurm scripts to launch the tool entry points from `${TOOLBOX_PATH}/main/` (the `TOOLBOX_PATH` environment variable is now exported by `main.py`) and removed the `PYTHONPATH` export from those scripts.
---
 scripts/tools_filter.slurm                  |  3 +--
 scripts/tools_scheduler.slurm               |  3 +--
 scripts/tools_verifier.slurm                |  3 +--
 scripts/tools_worker.slurm                  |  3 +--
 src/TreeOfLife_toolbox/main/filter.py       |  8 ++++----
 src/TreeOfLife_toolbox/main/filters.py      |  8 ++++----
 src/TreeOfLife_toolbox/main/main.py         | 10 ++++++----
 src/TreeOfLife_toolbox/main/registry.py     |  4 ++--
 src/TreeOfLife_toolbox/main/runner.py       |  8 ++++----
 src/TreeOfLife_toolbox/main/runners.py      |  4 ++--
 src/TreeOfLife_toolbox/main/scheduler.py    |  8 ++++----
 src/TreeOfLife_toolbox/main/schedulers.py   |  4 ++--
 src/TreeOfLife_toolbox/main/verification.py | 10 +++++-----
 13 files changed, 37 insertions(+), 39 deletions(-)

diff --git a/scripts/tools_filter.slurm b/scripts/tools_filter.slurm
index 4642e34..6aee3f6 100644
--- a/scripts/tools_filter.slurm
+++ b/scripts/tools_filter.slurm
@@ -19,11 +19,10 @@ executor_memory="64G"
 module load spark/3.4.1
 module load miniconda3/23.3.1-py310
 source "${REPO_ROOT}/.venv/bin/activate"
-export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"
 
 pbs-spark-submit \
     --driver-memory $driver_memory \
     --executor-memory $executor_memory \
-    "${REPO_ROOT}/src/distributed_downloader/tools/filter.py" \
+    "${TOOLBOX_PATH}/main/filter.py" \
     "${tool_name}" \
     > "${logs_dir}/tool_filter.log"
diff --git a/scripts/tools_scheduler.slurm b/scripts/tools_scheduler.slurm
index e4fb6a2..ea35a32 100644
--- a/scripts/tools_scheduler.slurm
+++ b/scripts/tools_scheduler.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
 source "${REPO_ROOT}/.venv/bin/activate"
 export PYARROW_IGNORE_TIMEZONE=1
 export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
-export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"
 
 srun \
   --mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
   --cpus-per-task=1 \
   --mem=0 \
   --output="${logs_dir}/tool_scheduler.log" \
-  python "${REPO_ROOT}/src/distributed_downloader/tools/scheduler.py" "${tool_name}"
+  python "${TOOLBOX_PATH}/main/scheduler.py" "${tool_name}"
diff --git a/scripts/tools_verifier.slurm b/scripts/tools_verifier.slurm
index 98ca024..6a3b75e 100644
--- a/scripts/tools_verifier.slurm
+++ b/scripts/tools_verifier.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
 source "${REPO_ROOT}/.venv/bin/activate"
 export PYARROW_IGNORE_TIMEZONE=1
 export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
-export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"
 
 srun \
   --mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
   --cpus-per-task=1 \
   --mem=0 \
   --output="${logs_dir}/tool_verifier.log" \
-  python "${REPO_ROOT}/src/distributed_downloader/tools/verification.py" "${tool_name}"
+  python "${TOOLBOX_PATH}/main/verification.py" "${tool_name}"
diff --git a/scripts/tools_worker.slurm b/scripts/tools_worker.slurm
index 2ee2662..4856e62 100644
--- a/scripts/tools_worker.slurm
+++ b/scripts/tools_worker.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
 source "${REPO_ROOT}/.venv/bin/activate"
 export PYARROW_IGNORE_TIMEZONE=1
 export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
-export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"
 
 srun \
   --mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
   --cpus-per-task="$TOOLS_CPU_PER_WORKER" \
   --mem=0 \
   --output="${logs_dir}/tool_worker-%2t.log" \
-  python "${REPO_ROOT}/src/distributed_downloader/tools/runner.py" "${tool_name}"
+  python "${TOOLBOX_PATH}/main/runner.py" "${tool_name}"
diff --git a/src/TreeOfLife_toolbox/main/filter.py b/src/TreeOfLife_toolbox/main/filter.py
index 080e1a2..ed526c5 100644
--- a/src/TreeOfLife_toolbox/main/filter.py
+++ b/src/TreeOfLife_toolbox/main/filter.py
@@ -1,10 +1,10 @@
 import argparse
 import os
 
-from DD_tools.main.checkpoint import Checkpoint
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsRegistryBase
-from DD_tools.main.utils import init_logger
+from TreeOfLife_toolbox.main.checkpoint import Checkpoint
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
+from TreeOfLife_toolbox.main.utils import init_logger
 
 if __name__ == "__main__":
     config_path = os.environ.get("CONFIG_PATH")
diff --git a/src/TreeOfLife_toolbox/main/filters.py b/src/TreeOfLife_toolbox/main/filters.py
index 11c9426..385f18e 100644
--- a/src/TreeOfLife_toolbox/main/filters.py
+++ b/src/TreeOfLife_toolbox/main/filters.py
@@ -7,10 +7,10 @@
 from pyspark.sql import SparkSession
 from pyspark.sql.types import StructType
 
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsBase
-from DD_tools.main.registry import ToolsRegistryBase
-from DD_tools.main.utils import SuccessEntry
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsBase
+from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
+from TreeOfLife_toolbox.main.utils import SuccessEntry
 
 FilterRegister = partial(ToolsRegistryBase.register, "filter")
 
diff --git a/src/TreeOfLife_toolbox/main/main.py b/src/TreeOfLife_toolbox/main/main.py
index b3d5732..5272354 100644
--- a/src/TreeOfLife_toolbox/main/main.py
+++ b/src/TreeOfLife_toolbox/main/main.py
@@ -1,15 +1,16 @@
 import argparse
 import os
 from logging import Logger
+from pathlib import Path
 from typing import Dict, List, Optional, TextIO, Tuple
 
 import pandas as pd
 from attr import Factory, define, field
 
-from DD_tools.main.checkpoint import Checkpoint
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsRegistryBase
-from DD_tools.main.utils import (
+from TreeOfLife_toolbox.main.checkpoint import Checkpoint
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
+from TreeOfLife_toolbox.main.utils import (
     init_logger,
     ensure_created,
     truncate_paths,
@@ -78,6 +79,7 @@ def __attrs_post_init__(self):
 
     def __init_environment(self) -> None:
         os.environ["CONFIG_PATH"] = self.config.config_path
+        os.environ["TOOLBOX_PATH"] = str(Path(__file__).parent.parent.resolve())
 
         os.environ["ACCOUNT"] = self.config["account"]
         os.environ["PATH_TO_INPUT"] = self.config["path_to_input"]
diff --git a/src/TreeOfLife_toolbox/main/registry.py b/src/TreeOfLife_toolbox/main/registry.py
index 12774dd..03cf9d6 100644
--- a/src/TreeOfLife_toolbox/main/registry.py
+++ b/src/TreeOfLife_toolbox/main/registry.py
@@ -1,7 +1,7 @@
 from typing import Dict, Type, Optional
 
-from DD_tools.main.config import Config
-from DD_tools.main.utils import init_logger
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.utils import init_logger
 
 
 class ToolsRegistryBase(type):
diff --git a/src/TreeOfLife_toolbox/main/runner.py b/src/TreeOfLife_toolbox/main/runner.py
index 214237e..77dcefa 100644
--- a/src/TreeOfLife_toolbox/main/runner.py
+++ b/src/TreeOfLife_toolbox/main/runner.py
@@ -1,10 +1,10 @@
 import argparse
 import os
 
-from DD_tools.main.checkpoint import Checkpoint
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsRegistryBase
-from DD_tools.main.utils import init_logger
+from TreeOfLife_toolbox.main.checkpoint import Checkpoint
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
+from TreeOfLife_toolbox.main.utils import init_logger
 
 if __name__ == "__main__":
     config_path = os.environ.get("CONFIG_PATH")
diff --git a/src/TreeOfLife_toolbox/main/runners.py b/src/TreeOfLife_toolbox/main/runners.py
index cd875d3..bfb5d5e 100644
--- a/src/TreeOfLife_toolbox/main/runners.py
+++ b/src/TreeOfLife_toolbox/main/runners.py
@@ -6,8 +6,8 @@
 
 import pandas as pd
 
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsBase, ToolsRegistryBase
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsBase, ToolsRegistryBase
 
 RunnerRegister = partial(ToolsRegistryBase.register, "runner")
 
diff --git a/src/TreeOfLife_toolbox/main/scheduler.py b/src/TreeOfLife_toolbox/main/scheduler.py
index 707b656..d686ae6 100644
--- a/src/TreeOfLife_toolbox/main/scheduler.py
+++ b/src/TreeOfLife_toolbox/main/scheduler.py
@@ -1,10 +1,10 @@
 import argparse
 import os
 
-from DD_tools.main.checkpoint import Checkpoint
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsRegistryBase
-from DD_tools.main.utils import init_logger
+from TreeOfLife_toolbox.main.checkpoint import Checkpoint
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
+from TreeOfLife_toolbox.main.utils import init_logger
 
 if __name__ == "__main__":
     config_path = os.environ.get("CONFIG_PATH")
diff --git a/src/TreeOfLife_toolbox/main/schedulers.py b/src/TreeOfLife_toolbox/main/schedulers.py
index ed70a9c..6b2c6e2 100644
--- a/src/TreeOfLife_toolbox/main/schedulers.py
+++ b/src/TreeOfLife_toolbox/main/schedulers.py
@@ -5,8 +5,8 @@
 
 import pandas as pd
 
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsBase, ToolsRegistryBase
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsBase, ToolsRegistryBase
 
 SchedulerRegister = partial(ToolsRegistryBase.register, "scheduler")
 
diff --git a/src/TreeOfLife_toolbox/main/verification.py b/src/TreeOfLife_toolbox/main/verification.py
index 742bb86..31d2561 100644
--- a/src/TreeOfLife_toolbox/main/verification.py
+++ b/src/TreeOfLife_toolbox/main/verification.py
@@ -3,11 +3,11 @@
 
 import pandas as pd
 
-from DD_tools.main.checkpoint import Checkpoint
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsRegistryBase
-from DD_tools.main.runners import MPIRunnerTool
-from DD_tools.main.utils import init_logger
+from TreeOfLife_toolbox.main.checkpoint import Checkpoint
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
+from TreeOfLife_toolbox.main.runners import MPIRunnerTool
+from TreeOfLife_toolbox.main.utils import init_logger
 
 if __name__ == "__main__":
     config_path = os.environ.get("CONFIG_PATH")

From 99ca3b016b14c27482d7152bba47d6db71b46334 Mon Sep 17 00:00:00 2001
From: Andrey170170 <andrey24122004@gmail.com>
Date: Mon, 12 May 2025 02:39:08 -0400
Subject: [PATCH 4/5] Update metadata and dependencies in pyproject.toml

Revised the project description, added programming language classifiers, and enhanced optional dependencies with 'ruff'. Introduced new keywords and added a script entry point for better usability.
---
 pyproject.toml | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index cb76174..b3e3a5c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,11 +13,14 @@ authors = [
     { name = "Elizabeth G. Campolongo", email = "e.campolongo479@gmail.com" },
     { name = "Matthew J. Thompson", email = "thompson.m.j@outlook.com" },
 ]
-description = "A tool for downloading files from a list of URLs in parallel."
+description = "A tool for processing datasets that were downloaded using the distributed-downloader package."
 readme = "README.md"
 requires-python = ">=3.10, <3.12"
 classifiers = [
+    "Development Status :: 4 - Beta",
     "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
 ]
@@ -52,13 +55,17 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-dev = ["pytest"]
+dev = [
+    "pytest",
+    "ruff"
+]
 
 keywords = [
     "parallel",
     "distributed",
-    "download",
     "url",
+    "mpi-applications",
+    "dataset-generation",
 ]
 
 [project.urls]
@@ -66,5 +73,8 @@ Homepage = "https://github.com/Imageomics/distributed-downloader"
 Repository = "https://github.com/Imageomics/distributed-downloader.git"
 "Bug Tracker" = "https://github.com/Imageomics/distributed-downloader/issues"
 
+[project.scripts]
+tree_of_life_toolbox = "TreeOfLife_toolbox.main.main:main"
+
 [tool.hatch.version]
 path = "src/TreeOfLife_toolbox/main/__about__.py"

From f6d98aa5d50c389efb9acfd0b47318b39fb5e57d Mon Sep 17 00:00:00 2001
From: Andrey170170 <andrey24122004@gmail.com>
Date: Tue, 13 May 2025 01:12:34 -0400
Subject: [PATCH 5/5] Refactor and relocate FathomNet crop tools

Migrated FathomNet crop-related classes and utilities from `DD_tools` to `TreeOfLife_toolbox`, ensuring proper namespace adjustments and functionality preservation. Added a README to document the new module structure and enhanced docstrings for better clarity on cropping, scheduling, and filtering operations.
---
 src/DD_tools/fathom_net_crop/__init__.py      |   0
 src/DD_tools/fathom_net_crop/classes.py       | 123 -----------
 src/TreeOfLife_toolbox/__init__.py            |   1 +
 .../fathom_net_crop/README.md                 |  64 ++++++
 .../fathom_net_crop/__init__.py               |   5 +
 .../fathom_net_crop/classes.py                | 199 ++++++++++++++++++
 6 files changed, 269 insertions(+), 123 deletions(-)
 delete mode 100644 src/DD_tools/fathom_net_crop/__init__.py
 delete mode 100644 src/DD_tools/fathom_net_crop/classes.py
 create mode 100644 src/TreeOfLife_toolbox/fathom_net_crop/README.md
 create mode 100644 src/TreeOfLife_toolbox/fathom_net_crop/__init__.py
 create mode 100644 src/TreeOfLife_toolbox/fathom_net_crop/classes.py

diff --git a/src/DD_tools/fathom_net_crop/__init__.py b/src/DD_tools/fathom_net_crop/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/DD_tools/fathom_net_crop/classes.py b/src/DD_tools/fathom_net_crop/classes.py
deleted file mode 100644
index a3777bd..0000000
--- a/src/DD_tools/fathom_net_crop/classes.py
+++ /dev/null
@@ -1,123 +0,0 @@
-import hashlib
-import os
-import shutil
-from typing import List
-
-import numpy as np
-import pandas as pd
-
-from DD_tools.main.config import Config
-from DD_tools.main.filters import PythonFilterToolBase, FilterRegister
-from DD_tools.main.runners import MPIRunnerTool, RunnerRegister
-from DD_tools.main.schedulers import DefaultScheduler, SchedulerRegister
-
-
-@FilterRegister("fathom_net_crop")
-class FathomnetCropFilter(PythonFilterToolBase):
-    def __init__(self, cfg: Config):
-        super().__init__(cfg)
-        self.filter_name: str = "fathom_net_crop"
-
-
-@SchedulerRegister("fathom_net_crop")
-class FathomnetCropScheduleCreation(DefaultScheduler):
-    def __init__(self, cfg: Config):
-        super().__init__(cfg)
-        self.filter_name: str = "fathom_net_crop"
-
-
-@RunnerRegister("fathom_net_crop")
-class FathomnetCropRunner(MPIRunnerTool):
-    def __init__(self, cfg: Config):
-        super().__init__(cfg)
-        self.filter_name: str = "fathom_net_crop"
-        self.data_scheme: List[str] = ["server_name", "partition_id"]
-        self.verification_scheme: List[str] = ["server_name", "partition_id"]
-        self.total_time = 150
-        self.bb_df = pd.read_csv(
-            "/fs/scratch/PAS2136/gbif/processed/fathomNet/filtered_by_size.csv"
-        )
-        self.image_crop_path = os.path.join(
-            cfg.get_folder("path_to_output_folder"), "image_crop"
-        )
-
-    def apply_filter(
-            self, filtering_df: pd.DataFrame, server_name: str, partition_id: int
-    ) -> int:
-        self.is_enough_time()
-
-        input_path = os.path.join(
-            self.downloaded_images_path,
-            f"server_name={server_name}",
-            f"partition_id={partition_id}",
-        )
-        parquet_path = os.path.join(input_path, "successes.parquet")
-
-        if not os.path.exists(parquet_path):
-            self.logger.info(f"Path doesn't exists: {parquet_path}")
-            return 0
-
-        full_image = pd.read_parquet(
-            parquet_path, filters=[("source_id", "in", self.bb_df["image_uuid"])]
-        )
-
-        self.is_enough_time()
-
-        columns = full_image.columns
-        full_image = full_image.merge(
-            self.bb_df,
-            left_on="source_id",
-            right_on="image_uuid",
-            how="inner",
-            validate="1:m",
-        )
-        cropped_images = []
-        for _, row in full_image.iterrows():
-            cropped_entry = row[columns].to_dict()
-            image_binary = row["image"]
-            image_size = row["resized_size"]
-            image_np = np.frombuffer(image_binary, dtype=np.uint8).reshape(
-                [image_size[0], image_size[1], 3]
-            )
-            # fix
-            min_y = min(image_size[0], max(row["y"], 0))
-            min_x = min(image_size[1], max(row["x"], 0))
-            max_y = min(image_size[0], max(row["y"] + row["height"], 0))
-            max_x = min(image_size[1], max(row["x"] + row["width"], 0))
-
-            image_cropped = image_np[min_y:max_y, min_x:max_x]
-
-            cropped_entry["image"] = image_cropped.tobytes()
-            cropped_entry["resized_size"] = (max_y - min_y, max_x - min_x)
-            cropped_entry["hashsum_resized"] = hashlib.md5(
-                cropped_entry["image"]
-            ).hexdigest()
-            cropped_entry["uuid"] = row["tol_uuid"]
-            cropped_entry["source_id"] = row["bb_uuid"]
-
-            assert len(cropped_entry["image"]) == (
-                    cropped_entry["resized_size"][0] * cropped_entry["resized_size"][1] * 3
-            ), f"Size mismatch for {row['tol_uuid']}"
-
-            cropped_images.append(cropped_entry)
-
-        self.is_enough_time()
-        full_image = pd.DataFrame(cropped_images)
-        output_path = os.path.join(
-            self.image_crop_path,
-            f"server_name={server_name}",
-            f"partition_id={partition_id}",
-        )
-        os.makedirs(output_path, exist_ok=True)
-        full_image.to_parquet(
-            os.path.join(output_path, "successes.parquet"),
-            index=False,
-            compression="zstd",
-            compression_level=3,
-        )
-        for file in ["errors.parquet", "completed"]:
-            shutil.copyfile(
-                os.path.join(input_path, file), os.path.join(output_path, file)
-            )
-
-        return len(full_image)
diff --git a/src/TreeOfLife_toolbox/__init__.py b/src/TreeOfLife_toolbox/__init__.py
index e69de29..813bc69 100644
--- a/src/TreeOfLife_toolbox/__init__.py
+++ b/src/TreeOfLife_toolbox/__init__.py
@@ -0,0 +1 @@
+from TreeOfLife_toolbox import fathom_net_crop
diff --git a/src/TreeOfLife_toolbox/fathom_net_crop/README.md b/src/TreeOfLife_toolbox/fathom_net_crop/README.md
new file mode 100644
index 0000000..0a2583a
--- /dev/null
+++ b/src/TreeOfLife_toolbox/fathom_net_crop/README.md
@@ -0,0 +1,64 @@
+# FathomNet Image Cropping Tool
+
+This tool is used to crop images from the FathomNet dataset using the bounding box coordinates provided in an external
+CSV file. The cropping operation extracts regions of interest from full-sized images and preserves them as separate
+image entries with updated metadata.
+
+## How It Works
+
+The tool consists of three main components:
+
+1. **FathomnetCropFilter**: Identifies all valid image partitions in the dataset that can be processed. Creates a simple
+   list of server/partition pairs for the scheduler.
+
+2. **FathomnetCropScheduleCreation**: Creates a processing schedule by distributing server/partition pairs across
+   available worker nodes to balance the workload.
+
+3. **FathomnetCropRunner**: Performs the actual image cropping operation by:
+    - Loading images that have corresponding bounding box entries
+    - Cropping each image according to the specified coordinates (x, y, width, height)
+    - Computing new hash values and metadata for the cropped images
+    - Saving the results in a structure that mirrors the original dataset
+
+## Required Configuration
+
+Add these fields to your configuration file:
+
+- `bb_csv_path`: Path to the CSV file containing the bounding box information with columns:
+    - `image_uuid`: UUID of the original image
+    - `bb_uuid`: UUID for the bounding box entry
+    - `tol_uuid`: UUID to be assigned to the cropped image
+    - `x`, `y`: Top-left coordinates of the bounding box
+    - `width`, `height`: Dimensions of the bounding box
+
+- `image_crop_path`: Path where the cropped images will be stored
+
+## Pre-conditions
+
+- The source image directory follows the distributed-downloader structure:
+  ```
+  <path_to_output_folder>/<images_folder>/
+  ├── server_name=<server1>
+  │   ├── partition_id=<id>
+  │   │   ├── successes.parquet
+  │   │   ├── errors.parquet
+  │   │   └── completed
+  │   └── partition_id=<id2>
+  │       ├── ...
+  └── server_name=<server2>
+      └── ...
+  ```
+- The bounding box CSV file exists and contains the required columns
+- The original images are stored in parquet files with binary image data and metadata
+- Images are stored as raw 3-channel RGB pixel data (`uint8`), with their dimensions given in `resized_size`
+
+## Post-conditions
+
+- Cropped images are saved in the specified output path using the same directory structure as the source
+- Each cropped image has:
+    - Updated UUID and source_id based on values from the bounding box CSV
+    - Updated resized_size reflecting the dimensions of the cropped region
+    - New hashsum_resized value calculated from the cropped image data
+- If a bounding box extends beyond the image boundaries, it will be automatically clipped to fit within the image
+- All original metadata is preserved, except for the fields that need to be updated due to cropping
+- A verification structure is created to track which server/partition pairs have been successfully processed
diff --git a/src/TreeOfLife_toolbox/fathom_net_crop/__init__.py b/src/TreeOfLife_toolbox/fathom_net_crop/__init__.py
new file mode 100644
index 0000000..c83d68e
--- /dev/null
+++ b/src/TreeOfLife_toolbox/fathom_net_crop/__init__.py
@@ -0,0 +1,5 @@
+from .classes import (
+    FathomnetCropFilter,
+    FathomnetCropScheduleCreation,
+    FathomnetCropRunner,
+)
diff --git a/src/TreeOfLife_toolbox/fathom_net_crop/classes.py b/src/TreeOfLife_toolbox/fathom_net_crop/classes.py
new file mode 100644
index 0000000..8ab7d9b
--- /dev/null
+++ b/src/TreeOfLife_toolbox/fathom_net_crop/classes.py
@@ -0,0 +1,199 @@
+import hashlib
+import os
+import shutil
+from typing import List
+
+import numpy as np
+import pandas as pd
+
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.filters import PythonFilterToolBase, FilterRegister
+from TreeOfLife_toolbox.main.runners import MPIRunnerTool, RunnerRegister
+from TreeOfLife_toolbox.main.schedulers import DefaultScheduler, SchedulerRegister
+
+
+@FilterRegister("fathom_net_crop")
+class FathomnetCropFilter(PythonFilterToolBase):
+    """
+    Filter stage of the ``fathom_net_crop`` tool.
+
+    Registered with ``FilterRegister`` under the name "fathom_net_crop". All
+    filtering logic is inherited from ``PythonFilterToolBase``; this subclass
+    only sets ``filter_name``.
+    """
+    def __init__(self, cfg: Config):
+        """
+        Initialize the base filter and set the tool name.
+
+        Args:
+            cfg (Config): Configuration object passed on to the base class.
+        """
+        super().__init__(cfg)
+        self.filter_name: str = "fathom_net_crop"
+
+
+@SchedulerRegister("fathom_net_crop")
+class FathomnetCropScheduleCreation(DefaultScheduler):
+    """
+    Scheduling stage of the ``fathom_net_crop`` tool.
+
+    Registered with ``SchedulerRegister`` under the name "fathom_net_crop".
+    All scheduling logic is inherited unchanged from ``DefaultScheduler``;
+    this subclass only sets ``filter_name``.
+    """
+    def __init__(self, cfg: Config):
+        """
+        Initialize the base scheduler and set the tool name.
+
+        Args:
+            cfg (Config): Configuration object passed on to the base class.
+        """
+        super().__init__(cfg)
+        self.filter_name: str = "fathom_net_crop"
+
+
+@RunnerRegister("fathom_net_crop")
+class FathomnetCropRunner(MPIRunnerTool):
+    """
+    Runner that crops images to the bounding boxes listed in a CSV file.
+
+    For each scheduled server/partition pair it reads the partition's
+    ``successes.parquet``, cuts one crop per matching bounding box and writes
+    the crops under ``image_crop_path`` using the same directory layout.
+    """
+    def __init__(self, cfg: Config):
+        """
+        Initialize the runner and load the bounding box table.
+
+        Args:
+            cfg (Config): Configuration object with paths and settings.
+                          Must contain 'bb_csv_path' and 'image_crop_path' keys.
+        """
+        super().__init__(cfg)
+        self.filter_name: str = "fathom_net_crop"
+        self.data_scheme: List[str] = ["server_name", "partition_id"]
+        self.verification_scheme: List[str] = ["server_name", "partition_id"]
+        self.total_time = 150
+        # Bounding box table (image_uuid, bb_uuid, tol_uuid, x, y, width, height)
+        self.bb_df = pd.read_csv(
+            self.config["bb_csv_path"],
+        )
+        # Root directory under which the cropped partitions are written
+        self.image_crop_path = self.config["image_crop_path"]
+
+    def apply_filter(
+            self, filtering_df: pd.DataFrame, server_name: str, partition_id: int
+    ) -> int:
+        """
+        Crop the images of one server/partition pair to their bounding boxes.
+
+        Images are read from the partition's ``successes.parquet``, joined with
+        the bounding box table and cropped once per matching box. The crops are
+        written to ``successes.parquet`` under ``image_crop_path``, and the
+        partition's ``errors.parquet`` and ``completed`` files are copied along.
+
+        Args:
+            filtering_df (pd.DataFrame): Filtering information (unused here)
+            server_name (str): Name of the server containing the images
+            partition_id (int): ID of the partition to process
+
+        Returns:
+            int: Number of crops written (0 if ``successes.parquet`` is missing)
+
+        Notes:
+            - Box coordinates are truncated to integers and clipped to the
+              image boundaries
+            - Boxes that do not overlap the image are skipped with a warning
+              instead of being stored as empty images
+            - ``image`` is decoded as a raw uint8 buffer of shape
+              (resized_size[0], resized_size[1], 3)
+        """
+        self.is_enough_time()
+
+        input_path = os.path.join(
+            self.downloaded_images_path,
+            f"server_name={server_name}",
+            f"partition_id={partition_id}",
+        )
+        parquet_path = os.path.join(input_path, "successes.parquet")
+
+        if not os.path.exists(parquet_path):
+            self.logger.info(f"Path doesn't exists: {parquet_path}")
+            return 0
+
+        # Load only images that have corresponding bounding box information
+        full_image = pd.read_parquet(
+            parquet_path, filters=[("source_id", "in", self.bb_df["image_uuid"])]
+        )
+
+        self.is_enough_time()
+
+        columns = full_image.columns
+        # One image may match several boxes, hence the one-to-many validation
+        full_image = full_image.merge(
+            self.bb_df,
+            left_on="source_id",
+            right_on="image_uuid",
+            how="inner",
+            validate="1:m",
+        )
+        cropped_images = []
+        for _, row in full_image.iterrows():
+            image_size = row["resized_size"]
+            image_height, image_width = int(image_size[0]), int(image_size[1])
+            # Convert binary image data to numpy array with proper dimensions
+            image_np = np.frombuffer(row["image"], dtype=np.uint8).reshape(
+                [image_height, image_width, 3]
+            )
+            # CSV coordinates may be parsed as floats, which are not valid slice
+            # indices: truncate to int, then clip to the image boundaries
+            min_y = min(image_height, max(int(row["y"]), 0))
+            min_x = min(image_width, max(int(row["x"]), 0))
+            max_y = min(image_height, max(int(row["y"] + row["height"]), 0))
+            max_x = min(image_width, max(int(row["x"] + row["width"]), 0))
+
+            # A box outside the image (or with non-positive size) has no pixels
+            if max_y <= min_y or max_x <= min_x:
+                self.logger.warning(f"Empty crop skipped for {row['tol_uuid']}")
+                continue
+
+            # Update a copy of the original entry with the cropped image data
+            cropped_entry = row[columns].to_dict()
+            cropped_entry["image"] = image_np[min_y:max_y, min_x:max_x].tobytes()
+            cropped_entry["resized_size"] = (max_y - min_y, max_x - min_x)
+            cropped_entry["hashsum_resized"] = hashlib.md5(
+                cropped_entry["image"]
+            ).hexdigest()
+            cropped_entry["uuid"] = row["tol_uuid"]
+            cropped_entry["source_id"] = row["bb_uuid"]
+
+            # Validate that the image size matches the expected dimensions
+            assert len(cropped_entry["image"]) == (
+                    cropped_entry["resized_size"][0] * cropped_entry["resized_size"][1] * 3
+            ), f"Size mismatch for {row['tol_uuid']}"
+
+            cropped_images.append(cropped_entry)
+
+        self.is_enough_time()
+        # Create a DataFrame with all the cropped images
+        full_image = pd.DataFrame(cropped_images)
+        output_path = os.path.join(
+            self.image_crop_path,
+            f"server_name={server_name}",
+            f"partition_id={partition_id}",
+        )
+        os.makedirs(output_path, exist_ok=True)
+        # Save the cropped images as a parquet file
+        full_image.to_parquet(
+            os.path.join(output_path, "successes.parquet"),
+            index=False,
+            compression="zstd",
+            compression_level=3,
+        )
+        # Copy other necessary files to maintain the same structure
+        for file in ["errors.parquet", "completed"]:
+            shutil.copyfile(
+                os.path.join(input_path, file), os.path.join(output_path, file)
+            )
+
+        return len(full_image)