44 changes: 18 additions & 26 deletions pyproject.toml
@@ -3,86 +3,78 @@ requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/DD_tools"]
packages = ["src/TreeOfLife_toolbox"]

[project]
name = "DD_tools"
name = "TreeOfLife_toolbox"
dynamic = ["version"]
authors = [
{ name = "Andrey Kopanev", email = "kopanev.1@osu.edu" },
{ name = "Elizabeth G. Campolongo", email = "e.campolongo479@gmail.com" },
{ name = "Matthew J. Thompson", email = "thompson.m.j@outlook.com" },
]
description = "A tool for downloading files from a list of URLs in parallel."
description = "A tool for processing datasets that was downloaded using the distributed-downloader package."
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.10, <3.12"
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dependencies = [
"attrs",
"brotli",
"certifi",
"charset-normalizer",
"cramjam",
"cython",
"exceptiongroup",
"fsspec",
"hatchling",
"idna",
"inflate64",
"iniconfig",
"mpi4py < 4",
"mpi4py",
"multivolumefile",
"numpy",
"opencv-python",
"packaging",
"pandas",
"pathspec",
"pillow",
"pip",
"pluggy",
"psutil",
"py4j",
"pyarrow",
"pybcj",
"pycryptodomex",
"pyppmd",
"pyspark",
"pytest",
"python-dateutil",
"python-dotenv",
"pytz",
"pyyaml",
"pyzstd",
"requests",
"setuptools",
"six",
"texttable",
"tomli",
"trove-classifiers",
"typing-extensions",
"tzdata",
"urllib3",
"wheel"
]

[project.optional-dependencies]
dev = ["pytest"]
dev = [
"pytest",
"ruff"
]

keywords = [
"parallel",
"distributed",
"download",
"url",
"mpi-applications",
"dataset-generation",
]

[project.urls]
Homepage = "https://github.com/Imageomics/distributed-downloader"
Repository = "https://github.com/Imageomics/distributed-downloader.git"
"Bug Tracker" = "https://github.com/Imageomics/distributed-downloader/issues"

[project.scripts]
tree_of_life_toolbox = "TreeOfLife_toolbox.main.main:main"

[tool.hatch.version]
path = "src/DD_tools/main/__about__.py"
path = "src/TreeOfLife_toolbox/main/__about__.py"
3 changes: 1 addition & 2 deletions scripts/tools_filter.slurm
@@ -19,11 +19,10 @@ executor_memory="64G"
module load spark/3.4.1
module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

pbs-spark-submit \
--driver-memory $driver_memory \
--executor-memory $executor_memory \
"${REPO_ROOT}/src/distributed_downloader/tools/filter.py" \
"${TOOLBOX_PATH}/main/filter.py" \
"${tool_name}" \
> "${logs_dir}/tool_filter.log"
3 changes: 1 addition & 2 deletions scripts/tools_scheduler.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_scheduler.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/scheduler.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/scheduler.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_verifier.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_verifier.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/verification.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/verification.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_worker.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task="$TOOLS_CPU_PER_WORKER" \
--mem=0 \
--output="${logs_dir}/tool_worker-%2t.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/runner.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/runner.py" "${tool_name}"
Empty file removed src/DD_tools/__init__.py
Empty file.
1 change: 1 addition & 0 deletions src/TreeOfLife_toolbox/__init__.py
@@ -0,0 +1 @@
from TreeOfLife_toolbox import research_filtering
File renamed without changes.
@@ -1,10 +1,10 @@
import argparse
import os

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import init_logger

if __name__ == "__main__":
config_path = os.environ.get("CONFIG_PATH")
@@ -7,10 +7,10 @@
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsBase
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import SuccessEntry
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsBase
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import SuccessEntry

FilterRegister = partial(ToolsRegistryBase.register, "filter")

@@ -1,15 +1,16 @@
import argparse
import os
from logging import Logger
from pathlib import Path
from typing import Dict, List, Optional, TextIO, Tuple

import pandas as pd
from attr import Factory, define, field

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import (
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import (
init_logger,
ensure_created,
truncate_paths,
@@ -78,6 +79,7 @@ def __attrs_post_init__(self):

def __init_environment(self) -> None:
os.environ["CONFIG_PATH"] = self.config.config_path
os.environ["TOOLBOX_PATH"] = str(Path(__file__).parent.parent.resolve())

os.environ["ACCOUNT"] = self.config["account"]
os.environ["PATH_TO_INPUT"] = self.config["path_to_input"]
@@ -1,7 +1,7 @@
from typing import Dict, Type, Optional

from DD_tools.main.config import Config
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.utils import init_logger


class ToolsRegistryBase(type):
@@ -1,10 +1,10 @@
import argparse
import os

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import init_logger

if __name__ == "__main__":
config_path = os.environ.get("CONFIG_PATH")
@@ -6,8 +6,8 @@

import pandas as pd

from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsBase, ToolsRegistryBase
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsBase, ToolsRegistryBase

RunnerRegister = partial(ToolsRegistryBase.register, "runner")

@@ -1,10 +1,10 @@
import argparse
import os

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import init_logger

if __name__ == "__main__":
config_path = os.environ.get("CONFIG_PATH")
@@ -5,8 +5,8 @@

import pandas as pd

from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsBase, ToolsRegistryBase
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsBase, ToolsRegistryBase

SchedulerRegister = partial(ToolsRegistryBase.register, "scheduler")

File renamed without changes.
@@ -3,11 +3,11 @@

import pandas as pd

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.runners import MPIRunnerTool
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.runners import MPIRunnerTool
from TreeOfLife_toolbox.main.utils import init_logger

if __name__ == "__main__":
config_path = os.environ.get("CONFIG_PATH")
46 changes: 46 additions & 0 deletions src/TreeOfLife_toolbox/research_filtering/README.md
@@ -0,0 +1,46 @@
# Research Filtering Tool

## Overview

The Research Filtering Tool filters TreeOfLife datasets based on the `basisOfRecord` field from occurrence data. It
selectively removes entries from the dataset whose `basisOfRecord` matches criteria defined in the configuration.

## How It Works

The tool operates in three phases:

1. **Filtering Phase**: Reads occurrence data and the main dataset, filters occurrences based on the specified
`basisOfRecord` value, and creates a filter table identifying entries to be removed (see the sketch after this list).

2. **Scheduling Phase**: Creates a schedule for distributed processing of the filtered data, organized by file paths.

3. **Running Phase**: Applies the filtering operation across distributed nodes, removing entries from parquet files
based on UUIDs identified in the filtering step.
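
A minimal sketch of the filtering phase, assuming a PySpark implementation: the paths, the example `basisOfRecord`
value, and the join of `source_id` against `gbifID` are illustrative assumptions; the actual logic lives in
`research_filtering/classes.py`.

```python
# Hypothetical sketch of the filtering phase: build a table of UUIDs to remove.
# Paths, the example basisOfRecord value, and the join key are assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("research_filtering_sketch").getOrCreate()

# Occurrence table (gbifID, basisOfRecord) and the partitioned dataset.
occurrences = spark.read.parquet("/path/to/occurrences")
dataset = spark.read.parquet("/path/to/dataset/source=*/server=*/data_*.parquet")

# Select occurrences whose basisOfRecord matches the configured value.
to_remove = occurrences.filter(occurrences["basisOfRecord"] == "FOSSIL_SPECIMEN")

# Resolve them to dataset UUIDs via the source identifier to form the filter table.
filter_table = (
    dataset.join(to_remove, dataset["source_id"] == to_remove["gbifID"], "inner")
    .select("uuid", "source_id")
)
filter_table.write.mode("overwrite").parquet("/path/to/filter_table")
```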

## Configuration Requirements

The following fields must be specified in the configuration file (an example snippet follows this list):

* `occurrences_path`: Path to the occurrences table containing `gbifID` and `basisOfRecord` fields
* `data_path`: Path to the TreeOfLife dataset root directory
* `basis_of_record`: Value or pattern to match in the `basisOfRecord` field for entries that should be filtered out
* `save_path_folder`: Path where filtered data should be saved (if applicable)
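
For illustration, a hypothetical configuration snippet covering these fields; only the four key names above come from
this tool, while the values and surrounding file layout are assumptions:

```yaml
# Illustrative values only; the real configuration file may contain additional keys.
occurrences_path: /path/to/occurrences/
data_path: /path/to/TreeOfLife/dataset/
basis_of_record: FOSSIL_SPECIMEN
save_path_folder: /path/to/filtered_output/
```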

## Initial Assumptions / Preconditions

- The dataset must follow the TreeOfLife format structure
- The occurrences data must include `gbifID` and `basisOfRecord` fields
- The dataset must be organized in parquet files following the pattern: `/source=*/server=*/data_*.parquet`
- Each data file must contain at least `uuid` and `source_id` fields
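
A quick way to spot-check the schema precondition on a single data file, sketched with pyarrow (the file path is
hypothetical):

```python
# Sanity check (sketch): confirm one data file exposes the required fields.
import pyarrow.parquet as pq

schema = pq.read_schema("/path/to/dataset/source=gbif/server=example/data_0.parquet")
assert {"uuid", "source_id"} <= set(schema.names), "data file is missing required fields"
```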

## Post-conditions

After successful execution:

- Parquet files in the dataset will be filtered to exclude entries where the `basisOfRecord` matches the specified
criteria
- Original dataset files will be overwritten with filtered versions
- Entries with matching criteria will be completely removed from the dataset
- All files processed will be logged in the verification folder
- Compression will be applied using zstd level 3 to maintain efficient storage
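
The per-file rewrite implied by these post-conditions might look roughly like the pandas sketch below; the helper name
and the pandas-based approach are assumptions, while the zstd level 3 setting comes from this README.

```python
# Sketch of the per-file rewrite: drop matched UUIDs and overwrite in place.
# The helper name and pandas-based approach are illustrative assumptions.
import pandas as pd


def rewrite_without_uuids(parquet_path: str, uuids_to_drop: set) -> None:
    df = pd.read_parquet(parquet_path)
    kept = df[~df["uuid"].isin(uuids_to_drop)]
    # Overwrite the original file using zstd level 3, as documented above.
    kept.to_parquet(
        parquet_path,
        compression="zstd",
        compression_level=3,
        index=False,
    )
```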
5 changes: 5 additions & 0 deletions src/TreeOfLife_toolbox/research_filtering/__init__.py
@@ -0,0 +1,5 @@
from .classes import (
ResearchFilteringFilter,
ResearchFilteringRunner,
ResearchFilteringScheduleCreation,
)