Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 18 additions & 26 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,86 +3,78 @@ requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/DD_tools"]
packages = ["src/TreeOfLife_toolbox"]

[project]
name = "DD_tools"
name = "TreeOfLife_toolbox"
dynamic = ["version"]
authors = [
{ name = "Andrey Kopanev", email = "kopanev.1@osu.edu" },
{ name = "Elizabeth G. Campolongo", email = "e.campolongo479@gmail.com" },
{ name = "Matthew J. Thompson", email = "thompson.m.j@outlook.com" },
]
description = "A tool for downloading files from a list of URLs in parallel."
description = "A tool for processing datasets that were downloaded using the distributed-downloader package."
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.10, <3.12"
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dependencies = [
"attrs",
"brotli",
"certifi",
"charset-normalizer",
"cramjam",
"cython",
"exceptiongroup",
"fsspec",
"hatchling",
"idna",
"inflate64",
"iniconfig",
"mpi4py < 4",
"mpi4py",
"multivolumefile",
"numpy",
"opencv-python",
"packaging",
"pandas",
"pathspec",
"pillow",
"pip",
"pluggy",
"psutil",
"py4j",
"pyarrow",
"pybcj",
"pycryptodomex",
"pyppmd",
"pyspark",
"pytest",
"python-dateutil",
"python-dotenv",
"pytz",
"pyyaml",
"pyzstd",
"requests",
"setuptools",
"six",
"texttable",
"tomli",
"trove-classifiers",
"typing-extensions",
"tzdata",
"urllib3",
"wheel"
]

# `keywords` is core [project] metadata, so it must be declared before the
# [project.optional-dependencies] header. Placed after that header, TOML
# parses it as an extra named "keywords" whose entries are read as
# requirement strings instead of package keywords.
keywords = [
"parallel",
"distributed",
"download",
"url",
"mpi-applications",
"dataset-generation",
]

[project.optional-dependencies]
dev = ["pytest"]
dev = [
"pytest",
"ruff"
]

[project.urls]
Homepage = "https://github.com/Imageomics/distributed-downloader"
Repository = "https://github.com/Imageomics/distributed-downloader.git"
"Bug Tracker" = "https://github.com/Imageomics/distributed-downloader/issues"

[project.scripts]
tree_of_life_toolbox = "TreeOfLife_toolbox.main.main:main"

[tool.hatch.version]
path = "src/DD_tools/main/__about__.py"
path = "src/TreeOfLife_toolbox/main/__about__.py"
3 changes: 1 addition & 2 deletions scripts/tools_filter.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,10 @@ executor_memory="64G"
module load spark/3.4.1
module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

pbs-spark-submit \
--driver-memory $driver_memory \
--executor-memory $executor_memory \
"${REPO_ROOT}/src/distributed_downloader/tools/filter.py" \
"${TOOLBOX_PATH}/main/filter.py" \
"${tool_name}" \
> "${logs_dir}/tool_filter.log"
3 changes: 1 addition & 2 deletions scripts/tools_scheduler.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
Expand All @@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_scheduler.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/scheduler.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/scheduler.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_verifier.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
Expand All @@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_verifier.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/verification.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/verification.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_worker.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
Expand All @@ -28,4 +27,4 @@ srun \
--cpus-per-task="$TOOLS_CPU_PER_WORKER" \
--mem=0 \
--output="${logs_dir}/tool_worker-%2t.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/runner.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/runner.py" "${tool_name}"
Empty file removed src/DD_tools/__init__.py
Empty file.
1 change: 1 addition & 0 deletions src/TreeOfLife_toolbox/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from TreeOfLife_toolbox import mam_ansp_fix
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import argparse
import os

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import init_logger

if __name__ == "__main__":
config_path = os.environ.get("CONFIG_PATH")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsBase
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import SuccessEntry
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsBase
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import SuccessEntry

FilterRegister = partial(ToolsRegistryBase.register, "filter")

Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
import argparse
import os
from logging import Logger
from pathlib import Path
from typing import Dict, List, Optional, TextIO, Tuple

import pandas as pd
from attr import Factory, define, field

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import (
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import (
init_logger,
ensure_created,
truncate_paths,
Expand Down Expand Up @@ -78,6 +79,7 @@ def __attrs_post_init__(self):

def __init_environment(self) -> None:
os.environ["CONFIG_PATH"] = self.config.config_path
os.environ["TOOLBOX_PATH"] = str(Path(__file__).parent.parent.resolve())

os.environ["ACCOUNT"] = self.config["account"]
os.environ["PATH_TO_INPUT"] = self.config["path_to_input"]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Dict, Type, Optional

from DD_tools.main.config import Config
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.utils import init_logger


class ToolsRegistryBase(type):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import argparse
import os

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import init_logger

if __name__ == "__main__":
config_path = os.environ.get("CONFIG_PATH")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@

import pandas as pd

from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsBase, ToolsRegistryBase
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsBase, ToolsRegistryBase

RunnerRegister = partial(ToolsRegistryBase.register, "runner")

Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import argparse
import os

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import init_logger

if __name__ == "__main__":
config_path = os.environ.get("CONFIG_PATH")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

import pandas as pd

from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsBase, ToolsRegistryBase
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsBase, ToolsRegistryBase

SchedulerRegister = partial(ToolsRegistryBase.register, "scheduler")

Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@

import pandas as pd

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.runners import MPIRunnerTool
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.runners import MPIRunnerTool
from TreeOfLife_toolbox.main.utils import init_logger

if __name__ == "__main__":
config_path = os.environ.get("CONFIG_PATH")
Expand Down
39 changes: 39 additions & 0 deletions src/TreeOfLife_toolbox/mam_ansp_fix/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# MAM ANSP Duplication Fix Tool

## Overview

This tool addresses a specific duplication issue found in data from the "mam.ansp.org" server (from the GBIF source) in
the Tree of Life dataset. It identifies, processes, and removes duplicate UUID entries within parquet files, ensuring
data integrity and consistency.

The tool consists of three main components:

1. **Filter (MamAnspFixFilter)**: Identifies files from the mam.ansp.org server that need deduplication based on a
provided UUID table.
2. **Scheduler (MamAnspFixScheduleCreation)**: Distributes the workload of file processing across available workers.
3. **Runner (MamAnspFixRunner)**: Performs the actual deduplication process by reading each file, removing duplicate
UUIDs, and saving the cleaned data to a specified location.

## Configuration Requirements

The following fields must be included in the configuration file:

* `uuid_table_path`: Path to the CSV file containing the table of UUIDs with information about duplicated entries. This
file must include "server" and "path" columns.
* `save_path_folder`: Directory where the deduplicated parquet files will be saved.

## Prerequisites (Pre-conditions)

Before running this tool, ensure:

- The dataset follows the Tree of Life format structure
- The UUID table contains accurate information about mam.ansp.org server entries
- The `uuid_table_path` CSV file contains at minimum these columns: "server" and "path"

## Guarantees (Post-conditions)

After successful execution:

- The dataset maintains the Tree of Life format
- Duplicate UUID entries in files from mam.ansp.org server have been removed
- The original files remain untouched
1 change: 1 addition & 0 deletions src/TreeOfLife_toolbox/mam_ansp_fix/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .classes import MamAnspFixFilter, MamAnspFixScheduleCreation, MamAnspFixRunner
Loading