44 changes: 18 additions & 26 deletions pyproject.toml
@@ -3,86 +3,78 @@ requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/DD_tools"]
packages = ["src/TreeOfLife_toolbox"]

[project]
name = "DD_tools"
name = "TreeOfLife_toolbox"
dynamic = ["version"]
authors = [
{ name = "Andrey Kopanev", email = "kopanev.1@osu.edu" },
{ name = "Elizabeth G. Campolongo", email = "e.campolongo479@gmail.com" },
{ name = "Matthew J. Thompson", email = "thompson.m.j@outlook.com" },
]
description = "A tool for downloading files from a list of URLs in parallel."
description = "A tool for processing datasets that was downloaded using the distributed-downloader package."
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.10, <3.12"
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dependencies = [
"attrs",
"brotli",
"certifi",
"charset-normalizer",
"cramjam",
"cython",
"exceptiongroup",
"fsspec",
"hatchling",
"idna",
"inflate64",
"iniconfig",
"mpi4py < 4",
"mpi4py",
"multivolumefile",
"numpy",
"opencv-python",
"packaging",
"pandas",
"pathspec",
"pillow",
"pip",
"pluggy",
"psutil",
"py4j",
"pyarrow",
"pybcj",
"pycryptodomex",
"pyppmd",
"pyspark",
"pytest",
"python-dateutil",
"python-dotenv",
"pytz",
"pyyaml",
"pyzstd",
"requests",
"setuptools",
"six",
"texttable",
"tomli",
"trove-classifiers",
"typing-extensions",
"tzdata",
"urllib3",
"wheel"
]

[project.optional-dependencies]
dev = ["pytest"]
dev = [
"pytest",
"ruff"
]

keywords = [
"parallel",
"distributed",
"download",
"url",
"mpi-applications",
"dataset-generation",
]

[project.urls]
Homepage = "https://github.com/Imageomics/distributed-downloader"
Repository = "https://github.com/Imageomics/distributed-downloader.git"
"Bug Tracker" = "https://github.com/Imageomics/distributed-downloader/issues"

[project.scripts]
tree_of_life_toolbox = "TreeOfLife_toolbox.main.main:main"

[tool.hatch.version]
path = "src/DD_tools/main/__about__.py"
path = "src/TreeOfLife_toolbox/main/__about__.py"
3 changes: 1 addition & 2 deletions scripts/tools_filter.slurm
@@ -19,11 +19,10 @@ executor_memory="64G"
module load spark/3.4.1
module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

pbs-spark-submit \
--driver-memory $driver_memory \
--executor-memory $executor_memory \
"${REPO_ROOT}/src/distributed_downloader/tools/filter.py" \
"${TOOLBOX_PATH}/main/filter.py" \
"${tool_name}" \
> "${logs_dir}/tool_filter.log"
3 changes: 1 addition & 2 deletions scripts/tools_scheduler.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_scheduler.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/scheduler.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/scheduler.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_verifier.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_verifier.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/verification.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/verification.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_worker.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task="$TOOLS_CPU_PER_WORKER" \
--mem=0 \
--output="${logs_dir}/tool_worker-%2t.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/runner.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/runner.py" "${tool_name}"
Empty file removed src/DD_tools/__init__.py
1 change: 1 addition & 0 deletions src/TreeOfLife_toolbox/__init__.py
@@ -0,0 +1 @@
from TreeOfLife_toolbox import lila_separation_single_label_filtering
@@ -0,0 +1,43 @@
# LILA Separation Single Label Filtering Tool

## Overview

This tool filters a dataset down to single-label images by removing multi-label images (images with more than one
labeled object). It processes datasets in the `distributed-downloader` format and filters them based on a provided
CSV file of UUID identifiers.

## Components

### Filter Component

`LilaSeparationSingleLabelFilteringFilter` copies the input CSV file containing single-label image information to the
appropriate filter table directory, setting up the filtering process.

### Scheduler Component

`LilaSeparationSingleLabelFilteringScheduleCreation` creates a distributed work schedule based on server names and
partition IDs to efficiently process the dataset across multiple workers.

### Runner Component

`LilaSeparationSingleLabelFilteringRunner` performs the actual filtering by reading the provided UUIDs and applying the
filter to each partition of the dataset, removing any images whose UUIDs are not listed in the filter table (a
conceptual sketch follows below).

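In concept, the per-partition step is a membership filter on `uuid`: keep a row if its UUID appears in the filter
table, drop it otherwise. The snippet below is a minimal sketch of that idea, not the toolbox's implementation; the
file names, paths, and Parquet format are assumptions, and the real runner operates on the `distributed-downloader`
partition layout through `FilterRunnerTool`.

```python
import pandas as pd

# Hypothetical paths; the real filter table lives at <tools_path>/<filter_name>/filter_table/table.csv
# and the dataset is organized by (server_name, partition_id).
filter_table = pd.read_csv("filter_table/table.csv")  # must contain a `uuid` column
partition = pd.read_parquet("partition_0.parquet")     # one dataset partition (format assumed)

# Keep only the images whose UUIDs are listed as single-label.
kept = partition[partition["uuid"].isin(filter_table["uuid"])]

# The actual tool rewrites partitions in place; this sketch writes a copy instead.
kept.to_parquet("partition_0_filtered.parquet", index=False)
```
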
## Configuration Requirements

The tool requires the following configuration fields:

- `data_path`: Path to the CSV table containing single-label images (must include a `uuid` column); see the sketch below for one way to prepare this table

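As a hedged example, such a table could be produced with pandas as below; the source annotation file, its columns,
and the output file name are hypothetical, and only the `uuid` column is required by the tool.

```python
import pandas as pd

# Hypothetical annotations table: one row per labeled object, tagged with the image's UUID.
annotations = pd.read_csv("annotations.csv")  # assumed columns: uuid, label

# Count labels per image and keep only images with exactly one label.
label_counts = annotations.groupby("uuid").size()
single_label_uuids = label_counts[label_counts == 1].index

# Write the single-column table that `data_path` should point to.
pd.DataFrame({"uuid": single_label_uuids}).to_csv("single_label_images.csv", index=False)
```
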
## Prerequisites

- The CSV table specified in `data_path` must contain entries identified by a `uuid` column
- The dataset must be in `distributed-downloader` format with appropriate server_name and partition_id organization
- Standard TreeOfLife toolbox environment and dependencies must be set up

## Post Conditions

- The resulting dataset will maintain the `distributed-downloader` format
- Filtering is performed in-place, modifying the original dataset
- The tool's checkpoint system tracks progress, allowing for resumption after interruptions
- Verification ensures all partitions are processed before marking the tool as completed
@@ -0,0 +1,5 @@
from .classes import (
    LilaSeparationSingleLabelFilteringFilter,
    LilaSeparationSingleLabelFilteringScheduleCreation,
    LilaSeparationSingleLabelFilteringRunner,
)
@@ -0,0 +1,101 @@
import os
import shutil
from typing import List

from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.filters import FilterRegister, SparkFilterToolBase
from TreeOfLife_toolbox.main.runners import FilterRunnerTool, RunnerRegister
from TreeOfLife_toolbox.main.schedulers import DefaultScheduler, SchedulerRegister


@FilterRegister("lila_separation_single_label_filtering")
class LilaSeparationSingleLabelFilteringFilter(SparkFilterToolBase):
    """
    Filter class for separating single-label images from a dataset.

    This class is responsible for the initial filtering step in the single-label
    filtering process. It copies a provided CSV file containing information about
    single-label images to the filter table directory. This CSV file will later be
    used by the runner to filter the dataset.

    Attributes:
        filter_name (str): The name of the filter used for registration and folder creation.
        data_path (str): Path to the input CSV file containing single-label image information.
    """

    def __init__(self, cfg: Config):
        """
        Initialize the filter with configuration settings.

        Args:
            cfg (Config): Configuration object containing necessary parameters,
                including the data_path for the input CSV.
        """
        super().__init__(cfg)

        self.filter_name: str = "lila_separation_single_label_filtering"
        self.data_path = cfg["data_path"]

    def run(self):
        """
        Execute the filtering process by copying the input CSV to the filter table directory.

        The method creates the necessary directory structure and copies the CSV file
        containing UUIDs of single-label images to be used in the subsequent steps.
        """
        filter_table_folder = os.path.join(
            self.tools_path, self.filter_name, "filter_table"
        )
        os.makedirs(filter_table_folder, exist_ok=True)
        filter_table_folder += "/table.csv"

        shutil.copyfile(self.data_path, filter_table_folder)


@SchedulerRegister("lila_separation_single_label_filtering")
class LilaSeparationSingleLabelFilteringScheduleCreation(DefaultScheduler):
    """
    Scheduler class for the single-label filtering process.

    This class creates a schedule for distributing the filtering work across multiple
    workers. It inherits from DefaultScheduler, which handles the standard scheduling
    logic of partitioning the data by server_name and partition_id.

    Attributes:
        filter_name (str): The name of the filter used for registration and folder creation.
    """

    def __init__(self, cfg: Config):
        """
        Initialize the scheduler with configuration settings.

        Args:
            cfg (Config): Configuration object containing parameters needed for scheduling.
        """
        super().__init__(cfg)

        self.filter_name: str = "lila_separation_single_label_filtering"


@RunnerRegister("lila_separation_single_label_filtering")
class LilaSeparationSingleLabelFilteringRunner(FilterRunnerTool):
    """
    Runner class that performs the actual filtering of images based on their UUIDs.

    This class implements the execution logic for filtering out images that don't have
    a single label. It reads the schedule created by the scheduler and processes the
    dataset to keep only single-label images based on the UUIDs in the filter table.

    Attributes:
        data_scheme (List[str]): The column schema for the filter data.
        filter_name (str): The name of the filter used for registration and folder creation.
    """

    def __init__(self, cfg: Config):
        """
        Initialize the runner with configuration settings.

        Args:
            cfg (Config): Configuration object containing parameters needed for execution.
        """
        super().__init__(cfg)
        self.data_scheme: List[str] = ["uuid", "server_name", "partition_id"]

        self.filter_name: str = "lila_separation_single_label_filtering"
File renamed without changes.
@@ -1,10 +1,10 @@
import argparse
import os

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import init_logger

if __name__ == "__main__":
config_path = os.environ.get("CONFIG_PATH")
@@ -7,10 +7,10 @@
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsBase
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import SuccessEntry
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsBase
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import SuccessEntry

FilterRegister = partial(ToolsRegistryBase.register, "filter")

@@ -1,15 +1,16 @@
import argparse
import os
from logging import Logger
from pathlib import Path
from typing import Dict, List, Optional, TextIO, Tuple

import pandas as pd
from attr import Factory, define, field

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import (
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import (
init_logger,
ensure_created,
truncate_paths,
@@ -78,6 +79,7 @@ def __attrs_post_init__(self):

def __init_environment(self) -> None:
os.environ["CONFIG_PATH"] = self.config.config_path
os.environ["TOOLBOX_PATH"] = str(Path(__file__).parent.parent.resolve())

os.environ["ACCOUNT"] = self.config["account"]
os.environ["PATH_TO_INPUT"] = self.config["path_to_input"]
@@ -1,7 +1,7 @@
from typing import Dict, Type, Optional

from DD_tools.main.config import Config
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.utils import init_logger


class ToolsRegistryBase(type):
@@ -1,10 +1,10 @@
import argparse
import os

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import init_logger

if __name__ == "__main__":
config_path = os.environ.get("CONFIG_PATH")