From 8176b7890f4c65a6fda49c20f3b8504ba7cef619 Mon Sep 17 00:00:00 2001
From: Andrey170170 <andrey24122004@gmail.com>
Date: Sun, 16 Feb 2025 16:08:51 -0500
Subject: [PATCH 1/5] Added mam_ansp_fix tool

---
 src/DD_tools/mam_ansp_fix/README.md   |  5 ++
 src/DD_tools/mam_ansp_fix/__init__.py |  0
 src/DD_tools/mam_ansp_fix/classes.py  | 94 +++++++++++++++++++++++++++
 3 files changed, 99 insertions(+)
 create mode 100644 src/DD_tools/mam_ansp_fix/README.md
 create mode 100644 src/DD_tools/mam_ansp_fix/__init__.py
 create mode 100644 src/DD_tools/mam_ansp_fix/classes.py

diff --git a/src/DD_tools/mam_ansp_fix/README.md b/src/DD_tools/mam_ansp_fix/README.md
new file mode 100644
index 0000000..84c6379
--- /dev/null
+++ b/src/DD_tools/mam_ansp_fix/README.md
@@ -0,0 +1,5 @@
+tool to fix the duplication issue with `mam.ansp.org` server from gbif
+
+Additional config fields:
+
+* `uuid_table_path` - path to uuid table with duplicated uuids
\ No newline at end of file
diff --git a/src/DD_tools/mam_ansp_fix/__init__.py b/src/DD_tools/mam_ansp_fix/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/DD_tools/mam_ansp_fix/classes.py b/src/DD_tools/mam_ansp_fix/classes.py
new file mode 100644
index 0000000..5ae7d8f
--- /dev/null
+++ b/src/DD_tools/mam_ansp_fix/classes.py
@@ -0,0 +1,94 @@
+import os
+from typing import List
+
+import pandas as pd
+
+from DD_tools.main.config import Config
+from DD_tools.main.filters import FilterRegister, SparkFilterToolBase
+from DD_tools.main.runners import MPIRunnerTool, RunnerRegister
+from DD_tools.main.schedulers import DefaultScheduler, SchedulerRegister
+
+
+@FilterRegister("mam_ansp_fix")
+class MamAnspFixFilter(SparkFilterToolBase):
+    def __init__(self, cfg: Config):
+        super().__init__(cfg)
+
+        self.filter_name: str = "mam_ansp_fix"
+
+    def run(self):
+        uuid_table_df = pd.read_csv(self.config["uuid_table_path"], low_memory=False)
+        uuid_table_df = uuid_table_df[uuid_table_df["server"] == "mam.ansp.org"][
+            ["path"]
+        ].drop_duplicates()
+
+        uuid_table_df.to_csv(
+            os.path.join(
+                self.tools_path, self.filter_name, "filter_table", "table.csv"
+            ),
+            index=False,
+        )
+
+
+@SchedulerRegister("mam_ansp_fix")
+class MamAnspFixScheduleCreation(DefaultScheduler):
+    def __init__(self, cfg: Config):
+        super().__init__(cfg)
+
+        self.filter_name: str = "mam_ansp_fix"
+        self.scheme = ["path"]
+
+
+@RunnerRegister("mam_ansp_fix")
+class MamAnspFixRunner(MPIRunnerTool):
+    def __init__(self, cfg: Config):
+        super().__init__(cfg)
+        self.filter_name: str = "mam_ansp_fix"
+        self.data_scheme: List[str] = ["path"]
+        self.verification_scheme: List[str] = ["path"]
+        self.total_time = 150
+        self.save_path_folder = (
+            "/fs/scratch/PAS2136/gbif/processed/mam_ansp_fix/server=mam.ansp.org"
+        )
+
+    def apply_filter(self, filtering_df: pd.DataFrame, file_path: str) -> int:
+        self.is_enough_time()
+
+        if not os.path.exists(file_path):
+            self.logger.info(f"Path doesn't exists: {file_path}")
+            return 0
+
+        filtered_parquet = pd.read_parquet(file_path)
+
+        self.is_enough_time()
+
+        if len(filtered_parquet) == 0:
+            self.logger.info(f"Fully filtered out: {file_path}")
+
+        filtered_parquet = filtered_parquet.drop_duplicates("uuid")
+        save_path = os.path.join(self.save_path_folder, os.path.basename(file_path))
+        os.makedirs(self.save_path_folder, exist_ok=True)
+
+        filtered_parquet.to_parquet(
+            save_path, index=False, compression="zstd", compression_level=3
+        )
+
+        return len(filtered_parquet)
+
+    def runner_fn(self, df_local: pd.DataFrame) -> int:
+        filtering_df = df_local.reset_index(drop=True)
+        file_path = filtering_df.iloc[0]["path"]
+        try:
+            filtered_parquet_length = self.apply_filter(filtering_df, file_path)
+        except NotImplementedError:
+            raise NotImplementedError("Filter function wasn't implemented")
+        except Exception as e:
+            self.logger.exception(e)
+            self.logger.error(f"Error occurred: {e}")
+            return 0
+        else:
+            print(f"{file_path}", end="\n", file=self.verification_IO)
+            self.logger.debug(
+                f"Completed filtering: {file_path} with {filtered_parquet_length}"
+            )
+            return 1

From f41119261d8b8b6a8a7da29ce0cfbe37b604b389 Mon Sep 17 00:00:00 2001
From: Andrey170170 <andrey24122004@gmail.com>
Date: Mon, 12 May 2025 02:29:05 -0400
Subject: [PATCH 2/5] Rename project from 'DD_tools' to 'TreeOfLife_toolbox'.

Updated package structure, filenames, and references to reflect the new name. Adjusted `pyproject.toml` to rename the project, update dependencies, and modify supported Python versions. These changes ensure consistency and alignment with the new project branding.
---
 pyproject.toml                                | 28 ++++---------------
 .../__init__.py                               |  0
 .../main/__about__.py                         |  0
 .../main/checkpoint.py                        |  0
 .../main/config.py                            |  0
 .../main/config_templates/tools.yaml          |  0
 .../main/filter.py                            |  0
 .../main/filters.py                           |  0
 .../main/main.py                              |  0
 .../main/registry.py                          |  0
 .../main/runner.py                            |  0
 .../main/runners.py                           |  0
 .../main/scheduler.py                         |  0
 .../main/schedulers.py                        |  0
 .../main/utils.py                             |  0
 .../main/verification.py                      |  0
 16 files changed, 5 insertions(+), 23 deletions(-)
 rename src/{DD_tools => TreeOfLife_toolbox}/__init__.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/__about__.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/checkpoint.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/config.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/config_templates/tools.yaml (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/filter.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/filters.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/main.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/registry.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/runner.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/runners.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/scheduler.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/schedulers.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/utils.py (100%)
 rename src/{DD_tools => TreeOfLife_toolbox}/main/verification.py (100%)

diff --git a/pyproject.toml b/pyproject.toml
index 17be78f..cb76174 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,10 +3,10 @@ requires = ["hatchling"]
 build-backend = "hatchling.build"
 
 [tool.hatch.build.targets.wheel]
-packages = ["src/DD_tools"]
+packages = ["src/TreeOfLife_toolbox"]
 
 [project]
-name = "DD_tools"
+name = "TreeOfLife_toolbox"
 dynamic = ["version"]
 authors = [
     { name = "Andrey Kopanev", email = "kopanev.1@osu.edu" },
@@ -15,7 +15,7 @@ authors = [
 ]
 description = "A tool for downloading files from a list of URLs in parallel."
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.10, <3.12"
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
@@ -24,48 +24,30 @@ classifiers = [
 dependencies = [
     "attrs",
     "brotli",
-    "certifi",
-    "charset-normalizer",
     "cramjam",
     "cython",
-    "exceptiongroup",
     "fsspec",
-    "hatchling",
-    "idna",
     "inflate64",
-    "iniconfig",
-    "mpi4py < 4",
+    "mpi4py",
     "multivolumefile",
-    "numpy",
     "opencv-python",
-    "packaging",
     "pandas",
     "pathspec",
     "pillow",
-    "pip",
-    "pluggy",
     "psutil",
-    "py4j",
     "pyarrow",
     "pybcj",
     "pycryptodomex",
     "pyppmd",
     "pyspark",
-    "pytest",
-    "python-dateutil",
     "python-dotenv",
-    "pytz",
     "pyyaml",
     "pyzstd",
     "requests",
     "setuptools",
-    "six",
     "texttable",
-    "tomli",
     "trove-classifiers",
     "typing-extensions",
-    "tzdata",
-    "urllib3",
     "wheel"
 ]
 
@@ -85,4 +67,4 @@ Repository = "https://github.com/Imageomics/distributed-downloader.git"
 "Bug Tracker" = "https://github.com/Imageomics/distributed-downloader/issues"
 
 [tool.hatch.version]
-path = "src/DD_tools/main/__about__.py"
+path = "src/TreeOfLife_toolbox/main/__about__.py"
diff --git a/src/DD_tools/__init__.py b/src/TreeOfLife_toolbox/__init__.py
similarity index 100%
rename from src/DD_tools/__init__.py
rename to src/TreeOfLife_toolbox/__init__.py
diff --git a/src/DD_tools/main/__about__.py b/src/TreeOfLife_toolbox/main/__about__.py
similarity index 100%
rename from src/DD_tools/main/__about__.py
rename to src/TreeOfLife_toolbox/main/__about__.py
diff --git a/src/DD_tools/main/checkpoint.py b/src/TreeOfLife_toolbox/main/checkpoint.py
similarity index 100%
rename from src/DD_tools/main/checkpoint.py
rename to src/TreeOfLife_toolbox/main/checkpoint.py
diff --git a/src/DD_tools/main/config.py b/src/TreeOfLife_toolbox/main/config.py
similarity index 100%
rename from src/DD_tools/main/config.py
rename to src/TreeOfLife_toolbox/main/config.py
diff --git a/src/DD_tools/main/config_templates/tools.yaml b/src/TreeOfLife_toolbox/main/config_templates/tools.yaml
similarity index 100%
rename from src/DD_tools/main/config_templates/tools.yaml
rename to src/TreeOfLife_toolbox/main/config_templates/tools.yaml
diff --git a/src/DD_tools/main/filter.py b/src/TreeOfLife_toolbox/main/filter.py
similarity index 100%
rename from src/DD_tools/main/filter.py
rename to src/TreeOfLife_toolbox/main/filter.py
diff --git a/src/DD_tools/main/filters.py b/src/TreeOfLife_toolbox/main/filters.py
similarity index 100%
rename from src/DD_tools/main/filters.py
rename to src/TreeOfLife_toolbox/main/filters.py
diff --git a/src/DD_tools/main/main.py b/src/TreeOfLife_toolbox/main/main.py
similarity index 100%
rename from src/DD_tools/main/main.py
rename to src/TreeOfLife_toolbox/main/main.py
diff --git a/src/DD_tools/main/registry.py b/src/TreeOfLife_toolbox/main/registry.py
similarity index 100%
rename from src/DD_tools/main/registry.py
rename to src/TreeOfLife_toolbox/main/registry.py
diff --git a/src/DD_tools/main/runner.py b/src/TreeOfLife_toolbox/main/runner.py
similarity index 100%
rename from src/DD_tools/main/runner.py
rename to src/TreeOfLife_toolbox/main/runner.py
diff --git a/src/DD_tools/main/runners.py b/src/TreeOfLife_toolbox/main/runners.py
similarity index 100%
rename from src/DD_tools/main/runners.py
rename to src/TreeOfLife_toolbox/main/runners.py
diff --git a/src/DD_tools/main/scheduler.py b/src/TreeOfLife_toolbox/main/scheduler.py
similarity index 100%
rename from src/DD_tools/main/scheduler.py
rename to src/TreeOfLife_toolbox/main/scheduler.py
diff --git a/src/DD_tools/main/schedulers.py b/src/TreeOfLife_toolbox/main/schedulers.py
similarity index 100%
rename from src/DD_tools/main/schedulers.py
rename to src/TreeOfLife_toolbox/main/schedulers.py
diff --git a/src/DD_tools/main/utils.py b/src/TreeOfLife_toolbox/main/utils.py
similarity index 100%
rename from src/DD_tools/main/utils.py
rename to src/TreeOfLife_toolbox/main/utils.py
diff --git a/src/DD_tools/main/verification.py b/src/TreeOfLife_toolbox/main/verification.py
similarity index 100%
rename from src/DD_tools/main/verification.py
rename to src/TreeOfLife_toolbox/main/verification.py

From 4a982dd82b03ab2bd8e9a8fc1a7ea270e51b6f53 Mon Sep 17 00:00:00 2001
From: Andrey170170 <andrey24122004@gmail.com>
Date: Mon, 12 May 2025 02:34:30 -0400
Subject: [PATCH 3/5] Refactor import paths to use TreeOfLife_toolbox module.

Updated all import statements to reference TreeOfLife_toolbox instead of DD_tools for consistency and clarity. Adjusted slurm scripts to align with the new module structure and standardized environment variables for toolbox path configuration.
---
 scripts/tools_filter.slurm                  |  3 +--
 scripts/tools_scheduler.slurm               |  3 +--
 scripts/tools_verifier.slurm                |  3 +--
 scripts/tools_worker.slurm                  |  3 +--
 src/TreeOfLife_toolbox/main/filter.py       |  8 ++++----
 src/TreeOfLife_toolbox/main/filters.py      |  8 ++++----
 src/TreeOfLife_toolbox/main/main.py         | 10 ++++++----
 src/TreeOfLife_toolbox/main/registry.py     |  4 ++--
 src/TreeOfLife_toolbox/main/runner.py       |  8 ++++----
 src/TreeOfLife_toolbox/main/runners.py      |  4 ++--
 src/TreeOfLife_toolbox/main/scheduler.py    |  8 ++++----
 src/TreeOfLife_toolbox/main/schedulers.py   |  4 ++--
 src/TreeOfLife_toolbox/main/verification.py | 10 +++++-----
 13 files changed, 37 insertions(+), 39 deletions(-)

diff --git a/scripts/tools_filter.slurm b/scripts/tools_filter.slurm
index 4642e34..6aee3f6 100644
--- a/scripts/tools_filter.slurm
+++ b/scripts/tools_filter.slurm
@@ -19,11 +19,10 @@ executor_memory="64G"
 module load spark/3.4.1
 module load miniconda3/23.3.1-py310
 source "${REPO_ROOT}/.venv/bin/activate"
-export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"
 
 pbs-spark-submit \
     --driver-memory $driver_memory \
     --executor-memory $executor_memory \
-    "${REPO_ROOT}/src/distributed_downloader/tools/filter.py" \
+    "${TOOLBOX_PATH}/main/filter.py" \
     "${tool_name}" \
     > "${logs_dir}/tool_filter.log"
diff --git a/scripts/tools_scheduler.slurm b/scripts/tools_scheduler.slurm
index e4fb6a2..ea35a32 100644
--- a/scripts/tools_scheduler.slurm
+++ b/scripts/tools_scheduler.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
 source "${REPO_ROOT}/.venv/bin/activate"
 export PYARROW_IGNORE_TIMEZONE=1
 export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
-export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"
 
 srun \
   --mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
   --cpus-per-task=1 \
   --mem=0 \
   --output="${logs_dir}/tool_scheduler.log" \
-  python "${REPO_ROOT}/src/distributed_downloader/tools/scheduler.py" "${tool_name}"
+  python "${TOOLBOX_PATH}/main/scheduler.py" "${tool_name}"
diff --git a/scripts/tools_verifier.slurm b/scripts/tools_verifier.slurm
index 98ca024..6a3b75e 100644
--- a/scripts/tools_verifier.slurm
+++ b/scripts/tools_verifier.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
 source "${REPO_ROOT}/.venv/bin/activate"
 export PYARROW_IGNORE_TIMEZONE=1
 export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
-export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"
 
 srun \
   --mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
   --cpus-per-task=1 \
   --mem=0 \
   --output="${logs_dir}/tool_verifier.log" \
-  python "${REPO_ROOT}/src/distributed_downloader/tools/verification.py" "${tool_name}"
+  python "${TOOLBOX_PATH}/main/verification.py" "${tool_name}"
diff --git a/scripts/tools_worker.slurm b/scripts/tools_worker.slurm
index 2ee2662..4856e62 100644
--- a/scripts/tools_worker.slurm
+++ b/scripts/tools_worker.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
 source "${REPO_ROOT}/.venv/bin/activate"
 export PYARROW_IGNORE_TIMEZONE=1
 export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
-export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"
 
 srun \
   --mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
   --cpus-per-task="$TOOLS_CPU_PER_WORKER" \
   --mem=0 \
   --output="${logs_dir}/tool_worker-%2t.log" \
-  python "${REPO_ROOT}/src/distributed_downloader/tools/runner.py" "${tool_name}"
+  python "${TOOLBOX_PATH}/main/runner.py" "${tool_name}"
diff --git a/src/TreeOfLife_toolbox/main/filter.py b/src/TreeOfLife_toolbox/main/filter.py
index 080e1a2..ed526c5 100644
--- a/src/TreeOfLife_toolbox/main/filter.py
+++ b/src/TreeOfLife_toolbox/main/filter.py
@@ -1,10 +1,10 @@
 import argparse
 import os
 
-from DD_tools.main.checkpoint import Checkpoint
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsRegistryBase
-from DD_tools.main.utils import init_logger
+from TreeOfLife_toolbox.main.checkpoint import Checkpoint
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
+from TreeOfLife_toolbox.main.utils import init_logger
 
 if __name__ == "__main__":
     config_path = os.environ.get("CONFIG_PATH")
diff --git a/src/TreeOfLife_toolbox/main/filters.py b/src/TreeOfLife_toolbox/main/filters.py
index 11c9426..385f18e 100644
--- a/src/TreeOfLife_toolbox/main/filters.py
+++ b/src/TreeOfLife_toolbox/main/filters.py
@@ -7,10 +7,10 @@
 from pyspark.sql import SparkSession
 from pyspark.sql.types import StructType
 
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsBase
-from DD_tools.main.registry import ToolsRegistryBase
-from DD_tools.main.utils import SuccessEntry
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsBase
+from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
+from TreeOfLife_toolbox.main.utils import SuccessEntry
 
 FilterRegister = partial(ToolsRegistryBase.register, "filter")
 
diff --git a/src/TreeOfLife_toolbox/main/main.py b/src/TreeOfLife_toolbox/main/main.py
index b3d5732..5272354 100644
--- a/src/TreeOfLife_toolbox/main/main.py
+++ b/src/TreeOfLife_toolbox/main/main.py
@@ -1,15 +1,16 @@
 import argparse
 import os
 from logging import Logger
+from pathlib import Path
 from typing import Dict, List, Optional, TextIO, Tuple
 
 import pandas as pd
 from attr import Factory, define, field
 
-from DD_tools.main.checkpoint import Checkpoint
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsRegistryBase
-from DD_tools.main.utils import (
+from TreeOfLife_toolbox.main.checkpoint import Checkpoint
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
+from TreeOfLife_toolbox.main.utils import (
     init_logger,
     ensure_created,
     truncate_paths,
@@ -78,6 +79,7 @@ def __attrs_post_init__(self):
 
     def __init_environment(self) -> None:
         os.environ["CONFIG_PATH"] = self.config.config_path
+        os.environ["TOOLBOX_PATH"] = str(Path(__file__).parent.parent.resolve())
 
         os.environ["ACCOUNT"] = self.config["account"]
         os.environ["PATH_TO_INPUT"] = self.config["path_to_input"]
diff --git a/src/TreeOfLife_toolbox/main/registry.py b/src/TreeOfLife_toolbox/main/registry.py
index 12774dd..03cf9d6 100644
--- a/src/TreeOfLife_toolbox/main/registry.py
+++ b/src/TreeOfLife_toolbox/main/registry.py
@@ -1,7 +1,7 @@
 from typing import Dict, Type, Optional
 
-from DD_tools.main.config import Config
-from DD_tools.main.utils import init_logger
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.utils import init_logger
 
 
 class ToolsRegistryBase(type):
diff --git a/src/TreeOfLife_toolbox/main/runner.py b/src/TreeOfLife_toolbox/main/runner.py
index 214237e..77dcefa 100644
--- a/src/TreeOfLife_toolbox/main/runner.py
+++ b/src/TreeOfLife_toolbox/main/runner.py
@@ -1,10 +1,10 @@
 import argparse
 import os
 
-from DD_tools.main.checkpoint import Checkpoint
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsRegistryBase
-from DD_tools.main.utils import init_logger
+from TreeOfLife_toolbox.main.checkpoint import Checkpoint
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
+from TreeOfLife_toolbox.main.utils import init_logger
 
 if __name__ == "__main__":
     config_path = os.environ.get("CONFIG_PATH")
diff --git a/src/TreeOfLife_toolbox/main/runners.py b/src/TreeOfLife_toolbox/main/runners.py
index cd875d3..bfb5d5e 100644
--- a/src/TreeOfLife_toolbox/main/runners.py
+++ b/src/TreeOfLife_toolbox/main/runners.py
@@ -6,8 +6,8 @@
 
 import pandas as pd
 
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsBase, ToolsRegistryBase
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsBase, ToolsRegistryBase
 
 RunnerRegister = partial(ToolsRegistryBase.register, "runner")
 
diff --git a/src/TreeOfLife_toolbox/main/scheduler.py b/src/TreeOfLife_toolbox/main/scheduler.py
index 707b656..d686ae6 100644
--- a/src/TreeOfLife_toolbox/main/scheduler.py
+++ b/src/TreeOfLife_toolbox/main/scheduler.py
@@ -1,10 +1,10 @@
 import argparse
 import os
 
-from DD_tools.main.checkpoint import Checkpoint
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsRegistryBase
-from DD_tools.main.utils import init_logger
+from TreeOfLife_toolbox.main.checkpoint import Checkpoint
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
+from TreeOfLife_toolbox.main.utils import init_logger
 
 if __name__ == "__main__":
     config_path = os.environ.get("CONFIG_PATH")
diff --git a/src/TreeOfLife_toolbox/main/schedulers.py b/src/TreeOfLife_toolbox/main/schedulers.py
index ed70a9c..6b2c6e2 100644
--- a/src/TreeOfLife_toolbox/main/schedulers.py
+++ b/src/TreeOfLife_toolbox/main/schedulers.py
@@ -5,8 +5,8 @@
 
 import pandas as pd
 
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsBase, ToolsRegistryBase
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsBase, ToolsRegistryBase
 
 SchedulerRegister = partial(ToolsRegistryBase.register, "scheduler")
 
diff --git a/src/TreeOfLife_toolbox/main/verification.py b/src/TreeOfLife_toolbox/main/verification.py
index 742bb86..31d2561 100644
--- a/src/TreeOfLife_toolbox/main/verification.py
+++ b/src/TreeOfLife_toolbox/main/verification.py
@@ -3,11 +3,11 @@
 
 import pandas as pd
 
-from DD_tools.main.checkpoint import Checkpoint
-from DD_tools.main.config import Config
-from DD_tools.main.registry import ToolsRegistryBase
-from DD_tools.main.runners import MPIRunnerTool
-from DD_tools.main.utils import init_logger
+from TreeOfLife_toolbox.main.checkpoint import Checkpoint
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
+from TreeOfLife_toolbox.main.runners import MPIRunnerTool
+from TreeOfLife_toolbox.main.utils import init_logger
 
 if __name__ == "__main__":
     config_path = os.environ.get("CONFIG_PATH")

From 99ca3b016b14c27482d7152bba47d6db71b46334 Mon Sep 17 00:00:00 2001
From: Andrey170170 <andrey24122004@gmail.com>
Date: Mon, 12 May 2025 02:39:08 -0400
Subject: [PATCH 4/5] Update metadata and dependencies in pyproject.toml

Revised the project description, added programming language classifiers, and enhanced optional dependencies with 'ruff'. Introduced new keywords and added a script entry point for better usability.
---
 pyproject.toml | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index cb76174..b3e3a5c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,11 +13,14 @@ authors = [
     { name = "Elizabeth G. Campolongo", email = "e.campolongo479@gmail.com" },
     { name = "Matthew J. Thompson", email = "thompson.m.j@outlook.com" },
 ]
-description = "A tool for downloading files from a list of URLs in parallel."
+description = "A tool for processing datasets that were downloaded using the distributed-downloader package."
 readme = "README.md"
 requires-python = ">=3.10, <3.12"
 classifiers = [
+    "Development Status :: 4 - Beta",
     "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
 ]
@@ -52,13 +55,17 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-dev = ["pytest"]
+dev = [
+    "pytest",
+    "ruff"
+]
 
 keywords = [
     "parallel",
     "distributed",
-    "download",
     "url",
+    "mpi-applications",
+    "dataset-generation",
 ]
 
 [project.urls]
@@ -66,5 +73,8 @@ Homepage = "https://github.com/Imageomics/distributed-downloader"
 Repository = "https://github.com/Imageomics/distributed-downloader.git"
 "Bug Tracker" = "https://github.com/Imageomics/distributed-downloader/issues"
 
+[project.scripts]
+tree_of_life_toolbox = "TreeOfLife_toolbox.main.main:main"
+
 [tool.hatch.version]
 path = "src/TreeOfLife_toolbox/main/__about__.py"

From 0f78284652a171c1d17bbe9351667e99b7e77341 Mon Sep 17 00:00:00 2001
From: Andrey170170 <andrey24122004@gmail.com>
Date: Wed, 14 May 2025 04:11:19 -0400
Subject: [PATCH 5/5] Refactor and migrate mam_ansp_fix tool to
 TreeOfLife_toolbox

Migrated the mam_ansp_fix tool from DD_tools to TreeOfLife_toolbox. Updated imports, classes, and documentation to reflect the new module structure. Enhanced clarity with detailed docstrings and improved configuration handling, maintaining duplication fix functionality for the mam.ansp.org server.
---
 src/DD_tools/mam_ansp_fix/README.md           |   5 -
 src/DD_tools/mam_ansp_fix/__init__.py         |   0
 src/DD_tools/mam_ansp_fix/classes.py          |  94 ---------
 src/TreeOfLife_toolbox/__init__.py            |   1 +
 src/TreeOfLife_toolbox/mam_ansp_fix/README.md |  39 ++++
 .../mam_ansp_fix/__init__.py                  |   1 +
 .../mam_ansp_fix/classes.py                   | 182 ++++++++++++++++++
 7 files changed, 223 insertions(+), 99 deletions(-)
 delete mode 100644 src/DD_tools/mam_ansp_fix/README.md
 delete mode 100644 src/DD_tools/mam_ansp_fix/__init__.py
 delete mode 100644 src/DD_tools/mam_ansp_fix/classes.py
 create mode 100644 src/TreeOfLife_toolbox/mam_ansp_fix/README.md
 create mode 100644 src/TreeOfLife_toolbox/mam_ansp_fix/__init__.py
 create mode 100644 src/TreeOfLife_toolbox/mam_ansp_fix/classes.py

diff --git a/src/DD_tools/mam_ansp_fix/README.md b/src/DD_tools/mam_ansp_fix/README.md
deleted file mode 100644
index 84c6379..0000000
--- a/src/DD_tools/mam_ansp_fix/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-tool to fix the duplication issue with `mam.ansp.org` server from gbif
-
-Additional config fields:
-
-* `uuid_table_path` - path to uuid table with duplicated uuids
\ No newline at end of file
diff --git a/src/DD_tools/mam_ansp_fix/__init__.py b/src/DD_tools/mam_ansp_fix/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/DD_tools/mam_ansp_fix/classes.py b/src/DD_tools/mam_ansp_fix/classes.py
deleted file mode 100644
index 5ae7d8f..0000000
--- a/src/DD_tools/mam_ansp_fix/classes.py
+++ /dev/null
@@ -1,94 +0,0 @@
-import os
-from typing import List
-
-import pandas as pd
-
-from DD_tools.main.config import Config
-from DD_tools.main.filters import FilterRegister, SparkFilterToolBase
-from DD_tools.main.runners import MPIRunnerTool, RunnerRegister
-from DD_tools.main.schedulers import DefaultScheduler, SchedulerRegister
-
-
-@FilterRegister("mam_ansp_fix")
-class MamAnspFixFilter(SparkFilterToolBase):
-    def __init__(self, cfg: Config):
-        super().__init__(cfg)
-
-        self.filter_name: str = "mam_ansp_fix"
-
-    def run(self):
-        uuid_table_df = pd.read_csv(self.config["uuid_table_path"], low_memory=False)
-        uuid_table_df = uuid_table_df[uuid_table_df["server"] == "mam.ansp.org"][
-            ["path"]
-        ].drop_duplicates()
-
-        uuid_table_df.to_csv(
-            os.path.join(
-                self.tools_path, self.filter_name, "filter_table", "table.csv"
-            ),
-            index=False,
-        )
-
-
-@SchedulerRegister("mam_ansp_fix")
-class MamAnspFixScheduleCreation(DefaultScheduler):
-    def __init__(self, cfg: Config):
-        super().__init__(cfg)
-
-        self.filter_name: str = "mam_ansp_fix"
-        self.scheme = ["path"]
-
-
-@RunnerRegister("mam_ansp_fix")
-class MamAnspFixRunner(MPIRunnerTool):
-    def __init__(self, cfg: Config):
-        super().__init__(cfg)
-        self.filter_name: str = "mam_ansp_fix"
-        self.data_scheme: List[str] = ["path"]
-        self.verification_scheme: List[str] = ["path"]
-        self.total_time = 150
-        self.save_path_folder = (
-            "/fs/scratch/PAS2136/gbif/processed/mam_ansp_fix/server=mam.ansp.org"
-        )
-
-    def apply_filter(self, filtering_df: pd.DataFrame, file_path: str) -> int:
-        self.is_enough_time()
-
-        if not os.path.exists(file_path):
-            self.logger.info(f"Path doesn't exists: {file_path}")
-            return 0
-
-        filtered_parquet = pd.read_parquet(file_path)
-
-        self.is_enough_time()
-
-        if len(filtered_parquet) == 0:
-            self.logger.info(f"Fully filtered out: {file_path}")
-
-        filtered_parquet = filtered_parquet.drop_duplicates("uuid")
-        save_path = os.path.join(self.save_path_folder, os.path.basename(file_path))
-        os.makedirs(self.save_path_folder, exist_ok=True)
-
-        filtered_parquet.to_parquet(
-            save_path, index=False, compression="zstd", compression_level=3
-        )
-
-        return len(filtered_parquet)
-
-    def runner_fn(self, df_local: pd.DataFrame) -> int:
-        filtering_df = df_local.reset_index(drop=True)
-        file_path = filtering_df.iloc[0]["path"]
-        try:
-            filtered_parquet_length = self.apply_filter(filtering_df, file_path)
-        except NotImplementedError:
-            raise NotImplementedError("Filter function wasn't implemented")
-        except Exception as e:
-            self.logger.exception(e)
-            self.logger.error(f"Error occurred: {e}")
-            return 0
-        else:
-            print(f"{file_path}", end="\n", file=self.verification_IO)
-            self.logger.debug(
-                f"Completed filtering: {file_path} with {filtered_parquet_length}"
-            )
-            return 1
diff --git a/src/TreeOfLife_toolbox/__init__.py b/src/TreeOfLife_toolbox/__init__.py
index e69de29..1a4696b 100644
--- a/src/TreeOfLife_toolbox/__init__.py
+++ b/src/TreeOfLife_toolbox/__init__.py
@@ -0,0 +1 @@
+from TreeOfLife_toolbox import mam_ansp_fix
diff --git a/src/TreeOfLife_toolbox/mam_ansp_fix/README.md b/src/TreeOfLife_toolbox/mam_ansp_fix/README.md
new file mode 100644
index 0000000..3b7b947
--- /dev/null
+++ b/src/TreeOfLife_toolbox/mam_ansp_fix/README.md
@@ -0,0 +1,39 @@
+# MAM ANSP Duplication Fix Tool
+
+## Overview
+
+This tool addresses a specific duplication issue found in data from the "mam.ansp.org" server (from the GBIF source) in
+the Tree of Life dataset. It identifies, processes, and removes duplicate UUID entries within parquet files, ensuring
+data integrity and consistency.
+
+The tool consists of three main components:
+
+1. **Filter (MamAnspFixFilter)**: Identifies files from the mam.ansp.org server that need deduplication based on a
+   provided UUID table.
+2. **Scheduler (MamAnspFixScheduleCreation)**: Distributes the workload of file processing across available workers.
+3. **Runner (MamAnspFixRunner)**: Performs the actual deduplication process by reading each file, removing duplicate
+   UUIDs, and saving the cleaned data to a specified location.
+
+## Configuration Requirements
+
+The following fields must be included in the configuration file:
+
+* `uuid_table_path`: Path to the CSV file containing the table of UUIDs with information about duplicated entries. This
+  file must include "server" and "path" columns.
+* `save_path_folder`: Directory where the deduplicated parquet files will be saved.
+
+## Prerequisites (Pre-conditions)
+
+Before running this tool, ensure:
+
+- The dataset follows the Tree of Life format structure
+- The UUIDs table contains accurate information about mam.ansp.org server entries
+- The `uuid_table_path` CSV file contains at minimum these columns: "server" and "path"
+
+## Guarantees (Post-conditions)
+
+After successful execution:
+
+- The dataset maintains the Tree of Life format
+- Duplicate UUID entries in files from mam.ansp.org server have been removed
+- The original files remain untouched
diff --git a/src/TreeOfLife_toolbox/mam_ansp_fix/__init__.py b/src/TreeOfLife_toolbox/mam_ansp_fix/__init__.py
new file mode 100644
index 0000000..a2f1bcf
--- /dev/null
+++ b/src/TreeOfLife_toolbox/mam_ansp_fix/__init__.py
@@ -0,0 +1 @@
+from .classes import MamAnspFixFilter, MamAnspFixScheduleCreation, MamAnspFixRunner
diff --git a/src/TreeOfLife_toolbox/mam_ansp_fix/classes.py b/src/TreeOfLife_toolbox/mam_ansp_fix/classes.py
new file mode 100644
index 0000000..e4bc9cb
--- /dev/null
+++ b/src/TreeOfLife_toolbox/mam_ansp_fix/classes.py
@@ -0,0 +1,182 @@
+import os
+from typing import List
+
+import pandas as pd
+
+from TreeOfLife_toolbox.main.config import Config
+from TreeOfLife_toolbox.main.filters import FilterRegister, SparkFilterToolBase
+from TreeOfLife_toolbox.main.runners import MPIRunnerTool, RunnerRegister
+from TreeOfLife_toolbox.main.schedulers import DefaultScheduler, SchedulerRegister
+
+
+@FilterRegister("mam_ansp_fix")
+class MamAnspFixFilter(SparkFilterToolBase):
+    """
+    Filter step of the mam_ansp_fix tool: selects the files that need deduplication.
+
+    Reads the UUID table named by the ``uuid_table_path`` config entry, keeps the rows whose
+    ``server`` column equals "mam.ansp.org", and stores their distinct ``path`` values as the
+    filter table of this tool.
+
+    Attributes:
+        filter_name (str): The name of the filter, set to "mam_ansp_fix".
+    """
+    def __init__(self, cfg: Config):
+        """
+        Initialize the MamAnspFixFilter.
+
+        Args:
+            cfg (Config): Configuration object, forwarded to the base-class initializer.
+        """
+        super().__init__(cfg)
+        self.filter_name: str = "mam_ansp_fix"
+
+    def run(self):
+        """
+        Build the filter table of files to deduplicate.
+
+        Loads the CSV at ``self.config["uuid_table_path"]`` with pandas, keeps the rows whose
+        ``server`` is "mam.ansp.org", reduces them to the distinct ``path`` values, and writes
+        that single column to ``<tools_path>/<filter_name>/filter_table/table.csv``.
+        """
+        uuid_table_df = pd.read_csv(self.config["uuid_table_path"], low_memory=False)
+        uuid_table_df = uuid_table_df[uuid_table_df["server"] == "mam.ansp.org"][
+            ["path"]
+        ].drop_duplicates()
+
+        uuid_table_df.to_csv(
+            os.path.join(
+                self.tools_path, self.filter_name, "filter_table", "table.csv"
+            ),
+            index=False,
+        )
+
+
+@SchedulerRegister("mam_ansp_fix")
+class MamAnspFixScheduleCreation(DefaultScheduler):
+    """
+    Scheduler step of the mam_ansp_fix tool.
+
+    Adds no scheduling logic of its own: this subclass only sets ``filter_name`` and
+    ``scheme``. How the schedule is actually built is defined by ``DefaultScheduler``,
+    which lives in another module.
+
+    Attributes:
+        filter_name (str): The name of the filter, set to "mam_ansp_fix".
+        scheme (List[str]): The column scheme for scheduling, set to ["path"].
+    """
+    def __init__(self, cfg: Config):
+        """
+        Initialize the MamAnspFixScheduleCreation.
+
+        Args:
+            cfg (Config): Configuration object, forwarded to the base-class initializer.
+        """
+        super().__init__(cfg)
+        self.filter_name: str = "mam_ansp_fix"
+        self.scheme = ["path"]
+
+
+@RunnerRegister("mam_ansp_fix")
+class MamAnspFixRunner(MPIRunnerTool):
+    """
+    Runner step of the mam_ansp_fix tool.
+
+    For each scheduled path it reads the parquet file, keeps one row per ``uuid``, and writes
+    the result to ``save_path_folder`` under the input's base name. Inputs that share a base
+    name therefore map to the same output file.
+
+    Attributes:
+        filter_name (str): The name of the filter, set to "mam_ansp_fix".
+        data_scheme (List[str]): The column scheme for the data, set to ["path"].
+        verification_scheme (List[str]): The column scheme for verification, set to ["path"].
+        total_time (int): Set to 150; presumably the time budget in seconds - TODO confirm.
+        save_path_folder (str): Path where deduplicated files will be saved.
+    """
+    def __init__(self, cfg: Config):
+        """
+        Initialize the MamAnspFixRunner.
+
+        Args:
+            cfg (Config): Configuration object; must provide ``save_path_folder``, the
+                          directory the deduplicated files are written to.
+        """
+        super().__init__(cfg)
+        self.filter_name: str = "mam_ansp_fix"
+        self.data_scheme: List[str] = ["path"]
+        self.verification_scheme: List[str] = ["path"]
+        self.total_time = 150
+        self.save_path_folder = cfg["save_path_folder"]
+
+    def apply_filter(self, filtering_df: pd.DataFrame, file_path: str) -> int:
+        """
+        Deduplicate one parquet file by ``uuid`` and save the result.
+
+        Rows are deduplicated with ``drop_duplicates("uuid")`` (the first row of each ``uuid`` is
+        kept) and written to ``save_path_folder`` under the input's base name (zstd, level 3).
+
+        Args:
+            filtering_df (pd.DataFrame): Schedule rows for this file; not used by this method.
+            file_path (str): Path to the parquet file that needs deduplication.
+
+        Returns:
+            int: Row count after deduplication, or 0 if ``file_path`` does not exist.
+
+        Raises:
+            Exception: Errors from reading, ``os.makedirs`` or writing propagate to the caller.
+        """
+        self.is_enough_time()
+
+        if not os.path.exists(file_path):
+            self.logger.info(f"Path doesn't exists: {file_path}")
+            return 0
+
+        filtered_parquet = pd.read_parquet(file_path)
+
+        self.is_enough_time()
+
+        if len(filtered_parquet) == 0:
+            self.logger.info(f"Fully filtered out: {file_path}")
+        # No early return for an empty file: an empty output is still written below.
+        filtered_parquet = filtered_parquet.drop_duplicates("uuid")
+        save_path = os.path.join(self.save_path_folder, os.path.basename(file_path))
+        os.makedirs(self.save_path_folder, exist_ok=True)
+
+        filtered_parquet.to_parquet(
+            save_path, index=False, compression="zstd", compression_level=3
+        )
+
+        return len(filtered_parquet)
+
+    def runner_fn(self, df_local: pd.DataFrame) -> int:
+        """
+        Process one schedule entry: deduplicate the file named in its first row.
+
+        Only ``df_local.iloc[0]["path"]`` is processed. When ``apply_filter`` returns normally
+        (including for a missing input file) the path is written to ``verification_IO``.
+
+        Args:
+            df_local (pd.DataFrame): Rows with a ``path`` column; must contain at least one row.
+
+        Returns:
+            int: 1 if ``apply_filter`` returned normally, 0 if it raised (the error is logged).
+
+        Raises:
+            NotImplementedError: Raised anew if ``apply_filter`` raises it.
+        """
+        filtering_df = df_local.reset_index(drop=True)
+        file_path = filtering_df.iloc[0]["path"]
+        try:
+            filtered_parquet_length = self.apply_filter(filtering_df, file_path)
+        except NotImplementedError:
+            raise NotImplementedError("Filter function wasn't implemented")
+        except Exception as e:
+            self.logger.exception(e)
+            self.logger.error(f"Error occurred: {e}")
+            return 0
+        else:
+            print(f"{file_path}", end="\n", file=self.verification_IO)
+            self.logger.debug(
+                f"Completed filtering: {file_path} with {filtered_parquet_length}"
+            )
+            return 1