diff --git a/.circleci/config.yml b/.circleci/config.yml index e9f2313..a84e983 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -9,7 +9,7 @@ orbs: # Orb commands and jobs help you with common scripting around a language/tool # so you dont have to copy and paste it everywhere. # See the orb documentation here: https://circleci.com/developer/orbs/orb/circleci/python - python: circleci/python@1.5.0 + python: circleci/python@2.1.1 # Define a job to be invoked later in a workflow. # See: https://circleci.com/docs/2.0/configuration-reference/#jobs @@ -33,6 +33,7 @@ jobs: name: build clustpy command: | python -m pip install --upgrade pip + pip install build Cython numpy pip install pytest pip install -e . - run: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml deleted file mode 100644 index 43c4307..0000000 --- a/.github/workflows/lint.yml +++ /dev/null @@ -1,37 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a single version of Python -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python - -name: Check Lint - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - -permissions: - contents: read - -jobs: - lint: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 - with: - python-version: '3.12' - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install flake8 - - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 064a51a..698a487 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,27 +21,28 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 + - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: '3.12' - - name: Display Python version - run: python -c "import sys; print(sys.version)" - - name: Install dependencies run: | python -m pip install --upgrade pip pip install build + - name: Build package run: python -m build --sdist + - name: Publish package to Test PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: user: __token__ password: ${{ secrets.TEST_PYPI_API_TOKEN }} repository_url: https://test.pypi.org/legacy/ + - name: Publish package to PyPI if: startsWith(github.ref, 'refs/tags') uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/test-main.yml b/.github/workflows/test-main.yml index 38adfc8..f9b8ce0 100644 --- a/.github/workflows/test-main.yml +++ b/.github/workflows/test-main.yml @@ -13,25 +13,47 @@ permissions: contents: read jobs: - build: + lint: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' # Speeds up flake8 installation + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + + build: + needs: lint # This job only starts if 'lint' passes runs-on: ubuntu-latest strategy: - # You can use PyPy versions in python-version. - # For example, pypy-2.7 and pypy-3.8 + fail-fast: false # don't break 3.12 if 3.10 fails matrix: python-version: ["3.12", "3.10"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - # You can test your matrix by printing the current Python version - - name: Display Python version - run: python -c "import sys; print(sys.version)" + cache: 'pip' # Automatically caches your dependencies - name: Install dependencies run: | @@ -39,11 +61,19 @@ jobs: pip install pytest pytest-cov pip install -e .[full] - - name: Test with pytest + - name: Test with pytest (with codecov) + if: ${{ matrix.python-version == '3.10' }} + run: | + pytest -m "not largedata" --cov --cov-report=xml + + - name: Test with pytest (without codecov) + if: ${{ matrix.python-version != '3.10' }} run: | - pytest -m "not largedata" --cov + pytest -m "not largedata" - name: Upload coverage reports to Codecov - uses: codecov/codecov-action@v4.0.1 + if: ${{ matrix.python-version == '3.10' }} + uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} + fail_ci_if_error: true # Helpful to know if upload failed diff --git a/clustpy/data/_utils.py b/clustpy/data/_utils.py index 7a08782..70c4466 100644 --- a/clustpy/data/_utils.py +++ b/clustpy/data/_utils.py @@ -7,14 +7,17 @@ except: print( "[WARNING] Could not import nltk in clustpy.data.real_world_data to use the SnowballStemmer. Please install nltk by 'pip install nltk' if necessary") +try: + from PIL import Image +except: + print( + "[WARNING] Could not import PIL in clustpy.data.real_world_data. 
Please install PIL by 'pip install Pillow' if necessary") import numpy as np -import urllib.request import os from pathlib import Path -import ssl -from PIL import Image from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer from sklearn.feature_selection import VarianceThreshold +from sklearn.datasets import fetch_file DEFAULT_DOWNLOAD_PATH = str(Path.home() / "Downloads/clustpy_datafiles") @@ -63,11 +66,11 @@ def _download_file(file_url: str, filename_local: str) -> None: filename_local : str local name of the file after it has been downloaded """ + local_path = Path(filename_local) + local_dir = local_path.parent + local_filename = local_path.name print("Downloading data set from {0} to {1}".format(file_url, filename_local)) - default_ssl = ssl._create_default_https_context - ssl._create_default_https_context = ssl._create_unverified_context - urllib.request.urlretrieve(file_url, filename_local) - ssl._create_default_https_context = default_ssl + fetch_file(file_url, folder=local_dir, local_filename=local_filename) def _download_file_from_google_drive(file_id: str, filename_local: str, chunk_size: int = 32768) -> None: @@ -187,7 +190,7 @@ def _load_image_data(image: str, image_size: tuple, color_image: bool) -> np.nda image_data : np.ndarray The numpy array containing the image data """ - if type(image) is str: + if isinstance(image, str): pil_image = Image.open(image) else: pil_image = Image.fromarray(np.uint8(image)) @@ -196,7 +199,8 @@ def _load_image_data(image: str, image_size: tuple, color_image: bool) -> np.nda # Convert to coherent size if image_size is not None: pil_image = pil_image.resize(image_size) - image_data = np.asarray(pil_image) + image_data = np.array(pil_image).copy() + pil_image.close() assert image_size is None or image_data.shape == ( image_size[0], image_size[1], 3), "Size of image is not correct. 
Should be {0} but is {1}".format(image_size, image_data.shape) diff --git a/clustpy/data/real_torchvision_data.py b/clustpy/data/real_torchvision_data.py index 03c1eec..a3c0fb0 100644 --- a/clustpy/data/real_torchvision_data.py +++ b/clustpy/data/real_torchvision_data.py @@ -1,7 +1,6 @@ import torchvision import torch import numpy as np -import ssl from clustpy.data._utils import _get_download_dir, _load_image_data, flatten_images from sklearn.datasets._base import Bunch @@ -45,7 +44,7 @@ def _get_data_and_labels(dataset: torchvision.datasets.VisionDataset, image_size labels.append(label) image_data = _load_image_data(path, image_size, True) data_list.append(image_data) - # Convert data form list to numpy array + # Convert data from list to numpy array data = np.array(data_list) labels = np.array(labels) if type(data) is np.ndarray: @@ -89,8 +88,6 @@ def _load_torch_image_data(data_source: torchvision.datasets.VisionDataset, subs assert subset in ["all", "train", "test"], "subset must match 'all', 'train' or 'test'. 
Your input {0}".format(subset) # Get data from source - default_ssl = ssl._create_default_https_context - ssl._create_default_https_context = ssl._create_unverified_context if subset == "all" or subset == "train": # Load training data if uses_train_param: @@ -117,7 +114,6 @@ def _load_torch_image_data(data_source: torchvision.datasets.VisionDataset, subs # Convert data to float and labels to int data = data.float() labels = labels.int() - ssl._create_default_https_context = default_ssl # Check data dimensions if data.dim() < 3 or data.dim() > 5: raise Exception( @@ -137,7 +133,7 @@ def _load_torch_image_data(data_source: torchvision.datasets.VisionDataset, subs # Some dataset (e.g., SVHN) do not have the class information included if hasattr(dataset, "classes"): return Bunch(dataset_name=dataset.__class__.__name__, data=data_flatten, target=labels_numpy, - images=data_image, image_format=image_format, classes=dataset.classes) + images=data_image, image_format=image_format, classes=dataset.classes.copy()) else: return Bunch(dataset_name=dataset.__class__.__name__, data=data_flatten, target=labels_numpy, images=data_image, image_format=image_format) diff --git a/clustpy/data/real_uci_data.py b/clustpy/data/real_uci_data.py index f34c755..219e7e5 100644 --- a/clustpy/data/real_uci_data.py +++ b/clustpy/data/real_uci_data.py @@ -1,9 +1,4 @@ -try: - from PIL import Image -except: - print( - "[WARNING] Could not import PIL in clustpy.data.real_world_data. 
Please install PIL by 'pip install Pillow' if necessary") -from clustpy.data._utils import _download_file, _get_download_dir, _decompress_z_file, _load_data_file, flatten_images, _transform_text_data +from clustpy.data._utils import _download_file, _get_download_dir, _decompress_z_file, _load_data_file, flatten_images, _transform_text_data, _load_image_data import os import numpy as np import zipfile @@ -1167,8 +1162,7 @@ def load_cmu_faces(return_X_y: bool = False, downloads_path: str = None) -> Bunc if not image.endswith("_4.pgm"): continue # get image data - image_data = Image.open(path_images + "/" + image) - image_array = np.array(image_data) + image_array = _load_image_data(path_images + "/" + image, None, False) # Get labels name_parts = image.split("_") user_id = np.argwhere(names == name_parts[0])[0][0] @@ -1188,7 +1182,7 @@ def load_cmu_faces(return_X_y: bool = False, downloads_path: str = None) -> Bunc return data_flatten, labels else: return Bunch(dataset_name="CMUFace", data=data_flatten, target=labels, images=data_image, image_format="HW", - classes=[names, positions, expressions, eyes]) + classes=(names, positions, expressions, eyes)) def load_gene_expression_cancer_rna_seq(return_X_y: bool = False, downloads_path: str = None): diff --git a/clustpy/data/real_video_data.py b/clustpy/data/real_video_data.py index d58730b..8a3211b 100644 --- a/clustpy/data/real_video_data.py +++ b/clustpy/data/real_video_data.py @@ -2,7 +2,7 @@ import cv2 except: print("[WARNING] Could not import cv2 in clustpy.data.real_video_data. 
Please install cv2 by 'pip install opencv-python' if necessary") -from clustpy.data._utils import _download_file, _get_download_dir, _load_image_data, flatten_images +from clustpy.data._utils import _download_file, _get_download_dir, flatten_images import numpy as np import os import zipfile @@ -33,19 +33,26 @@ def _load_video(path: str, image_size: tuple) -> np.ndarray: """ # Load video vid = cv2.VideoCapture(path) + if not vid.isOpened(): + vid.release() + raise IOError(f"OpenCV could not open {path}. This usually indicates missing codecs (ffmpeg/libav).") video_array = [] # Iterate over frames - successful = True - while successful: - successful, frame_array = vid.read() - if successful: + try: + while True: + successful, frame_array = vid.read() + if not successful: + break is_color_image = frame_array.ndim == 3 and frame_array.shape[2] == 3 if is_color_image: frame_array = cv2.cvtColor(frame_array, cv2.COLOR_BGR2RGB) if image_size is not None: - frame_array = _load_image_data(frame_array, image_size, is_color_image) - video_array.append(frame_array) - vid.release() + frame_array = cv2.resize(frame_array, image_size, interpolation=cv2.INTER_AREA) + video_array.append(frame_array.copy()) + finally: + vid.release() + if len(video_array) == 0: + raise ValueError(f"Video at {path} yielded 0 frames. File might be corrupted.") # Transform list to numpy array video_array = np.array(video_array, dtype="uint8") return video_array @@ -90,7 +97,8 @@ def _downsample_frames(data: np.ndarray, labels: np.ndarray, frame_sampling_rati """ -def load_video_weizmann(image_size: tuple = None, frame_sampling_ratio: float = 1, return_X_y: bool = False, +def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None, + image_size: tuple = None, frame_sampling_ratio: float = 1, return_X_y: bool = False, downloads_path: str = None) -> Bunch: """ Load the Weizmann video data set. 
@@ -102,6 +110,10 @@ def load_video_weizmann(image_size: tuple = None, frame_sampling_ratio: float = Parameters ---------- + use_actions : tuple + Specify the actions. Can be None if all actions should be used (default: None) + use_persons : tuple + Specify the persons. Can be None if all persons should be used (default: None) image_size : tuple The single frames can be downsized. This is necessary for large datasets. The tuple equals (width, height) of the images. @@ -129,13 +141,17 @@ def load_video_weizmann(image_size: tuple = None, frame_sampling_ratio: float = """ directory = _get_download_dir(downloads_path) + "/Video_Weizmann/" all_actions = ["walk", "run", "jump", "side", "bend", "wave1", "wave2", "pjump", "jack", "skip"] + if use_actions is None: + use_actions = all_actions.copy() + assert all([action in all_actions for action in use_actions]) all_persons = ["daria", "denis", "eli", "ido", "ira", "lena", "lyova", "moshe", "shahar"] - all_data = np.zeros( - (0, 144 if image_size is None else image_size[0], 180 if image_size is None else image_size[1], 3), - dtype="uint8") - labels = np.zeros((0, 2), dtype="int32") + if use_persons is None: + use_persons = all_persons.copy() + assert all([person in all_persons for person in use_persons]) + all_data_list = [] + labels_list = [] # Download data - for action in all_actions: + for action in use_actions: my_zip_file = action + ".zip" filename = directory + my_zip_file if not os.path.isfile(filename): @@ -151,7 +167,6 @@ def load_video_weizmann(image_size: tuple = None, frame_sampling_ratio: float = for v_file in os.listdir(directory): # Ignore zip files if v_file.endswith(".avi"): - data_local = _load_video(directory + "/" + v_file, image_size) # Get name of person and type of activity relevant_parts = v_file.split(".")[0] person = relevant_parts.split("_")[0] @@ -161,15 +176,23 @@ def load_video_weizmann(image_size: tuple = None, frame_sampling_ratio: float = action = action[:-1] assert person in all_persons, 
"Wrong person. {0} is unknown".format(person) assert action in all_actions, "Wrong action. {0} is unknown".format(action) + if person not in use_persons or action not in use_actions: + continue + # Load video + data_local = _load_video(directory + "/" + v_file, image_size) # Transform string to label - label_person = all_persons.index(person) - label_action = all_actions.index(action) + label_person = use_persons.index(person) + label_action = use_actions.index(action) labels_local = np.array([[label_action, label_person]] * data_local.shape[0], dtype="int32") # Downsample frames data_local, labels_local = _downsample_frames(data_local, labels_local, frame_sampling_ratio) # Update data and labels - all_data = np.append(all_data, data_local, axis=0) - labels = np.append(labels, labels_local, axis=0) + all_data_list.append(data_local) + labels_list.append(labels_local) + all_data = np.concatenate(all_data_list, axis=0) + labels = np.concatenate(labels_list, axis=0) + del all_data_list + del labels_list # Flatten data data_flatten = flatten_images(all_data, "HWC") # Return values @@ -180,7 +203,7 @@ def load_video_weizmann(image_size: tuple = None, frame_sampling_ratio: float = data_image = np.transpose(all_data, [0, 3, 1, 2]) image_format = "CHW" return Bunch(dataset_name="VideoWeizmann", data=data_flatten, target=labels, images=data_image, - image_format=image_format) + image_format=image_format, classes=(use_actions, use_persons)) def load_video_keck_gesture(subset: str = "all", image_size: tuple = (200, 200), frame_sampling_ratio: float = 1, @@ -283,10 +306,8 @@ def parse_frames_file(frames_file: str) -> (dict, dict): # Get Relevant frames _download_file("http://www.zhuolin.umiacs.io/PrototypeTree/sequences.txt", frames_file) # Load data and labels - all_data = np.zeros( - (0, 480 if image_size is None else image_size[0], 640 if image_size is None else image_size[1], 3), - dtype="uint8") - labels = np.zeros((0, 2), dtype="int32") + all_data_list = [] + labels_list 
= [] # Get frame limits from sequences file frames_train_dict, frames_test_dict = parse_frames_file(frames_file) # Get necessary directories @@ -315,8 +336,12 @@ def parse_frames_file(frames_file: str) -> (dict, dict): # Downsample frames data_local, labels_local = _downsample_frames(data_local, labels_local, frame_sampling_ratio) # Update data and labels - all_data = np.append(all_data, data_local, axis=0) - labels = np.append(labels, labels_local, axis=0) + all_data_list.append(data_local) + labels_list.append(labels_local) + all_data = np.concatenate(all_data_list, axis=0) + labels = np.concatenate(labels_list, axis=0) + del all_data_list + del labels_list # Flatten data data_flatten = flatten_images(all_data, "HWC") # Return values diff --git a/clustpy/data/real_world_data.py b/clustpy/data/real_world_data.py index 347b0ec..f9637fd 100644 --- a/clustpy/data/real_world_data.py +++ b/clustpy/data/real_world_data.py @@ -704,4 +704,4 @@ def load_webkb(use_universities: tuple = ("cornell", "texas", "washington", "wis if return_X_y: return data, labels else: - return Bunch(dataset_name="WebKB", data=data, target=labels, classes=[use_categories, use_universities]) + return Bunch(dataset_name="WebKB", data=data, target=labels, classes=(use_categories, use_universities)) diff --git a/clustpy/data/tests/test_real_clustpy_data.py b/clustpy/data/tests/test_real_clustpy_data.py index a5f5114..192856f 100644 --- a/clustpy/data/tests/test_real_clustpy_data.py +++ b/clustpy/data/tests/test_real_clustpy_data.py @@ -1,23 +1,7 @@ from clustpy.data.tests._helpers_for_tests import _helper_test_data_loader from clustpy.data import load_aloi_small, load_fruit, load_nrletters, load_stickfigures -from pathlib import Path -import os -import shutil import pytest -TEST_DOWNLOAD_PATH = str(Path.home() / "Downloads/clustpy_testfiles_clustpy") - - -@pytest.fixture(autouse=True, scope='function') -def run_around_tests(): - # Code that will run before the tests - if not 
os.path.isdir(TEST_DOWNLOAD_PATH): - os.makedirs(TEST_DOWNLOAD_PATH) - # Test functions will be run at this point - yield - # Code that will run after the tests - shutil.rmtree(TEST_DOWNLOAD_PATH) - @pytest.mark.data def test_load_aloi_small(): diff --git a/clustpy/data/tests/test_real_medical_mnist_data.py b/clustpy/data/tests/test_real_medical_mnist_data.py index f77f0fb..c08139d 100644 --- a/clustpy/data/tests/test_real_medical_mnist_data.py +++ b/clustpy/data/tests/test_real_medical_mnist_data.py @@ -3,452 +3,447 @@ load_retina_mnist, load_breast_mnist, load_blood_mnist, load_tissue_mnist, load_organ_a_mnist, load_organ_c_mnist, \ load_organ_s_mnist, load_organ_mnist_3d, load_nodule_mnist_3d, load_adrenal_mnist_3d, load_fracture_mnist_3d, \ load_vessel_mnist_3d, load_synapse_mnist_3d -from pathlib import Path -import os -import shutil import pytest - -TEST_DOWNLOAD_PATH = str(Path.home() / "Downloads/clustpy_testfiles_medical_mnist") +import shutil @pytest.fixture(autouse=True, scope='function') -def run_around_tests(): +def my_tmp_dir(tmp_path): # Code that will run before the tests - if not os.path.isdir(TEST_DOWNLOAD_PATH): - os.makedirs(TEST_DOWNLOAD_PATH) + tmp_dir = str(tmp_path) # Test functions will be run at this point - yield + yield tmp_dir # Code that will run after the tests - shutil.rmtree(TEST_DOWNLOAD_PATH) + shutil.rmtree(tmp_dir) @pytest.mark.data -def test_load_path_mnist(): +def test_load_path_mnist(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_path_mnist, 107180, 2352, 9, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_path_mnist, 107180, 2352, 9, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (107180, 3, 28, 28) assert dataset.image_format == "CHW" # Train data set - dataset = _helper_test_data_loader(load_path_mnist, 89996, 2352, 9, dataloader_params={"subset": "train", 
"downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_path_mnist, 89996, 2352, 9, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (89996, 3, 28, 28) assert dataset.image_format == "CHW" # Validation data set - dataset = _helper_test_data_loader(load_path_mnist, 10004, 2352, 9, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_path_mnist, 10004, 2352, 9, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (10004, 3, 28, 28) assert dataset.image_format == "CHW" # Test data set - dataset = _helper_test_data_loader(load_path_mnist, 7180, 2352, 9, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_path_mnist, 7180, 2352, 9, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (7180, 3, 28, 28) assert dataset.image_format == "CHW" @pytest.mark.data -def test_load_chest_mnist(): +def test_load_chest_mnist(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_chest_mnist, 112120, 784, [2] * 14, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_chest_mnist, 112120, 784, [2] * 14, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (112120, 28, 28) assert dataset.image_format == "HW" # Train data set - dataset = _helper_test_data_loader(load_chest_mnist, 78468, 784, [2] * 14, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_chest_mnist, 78468, 784, [2] * 14, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (78468, 28, 28) assert dataset.image_format == "HW" # Validation 
data set - dataset = _helper_test_data_loader(load_chest_mnist, 11219, 784, [2] * 14, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_chest_mnist, 11219, 784, [2] * 14, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (11219, 28, 28) assert dataset.image_format == "HW" # Test data set - dataset = _helper_test_data_loader(load_chest_mnist, 22433, 784, [2] * 14, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_chest_mnist, 22433, 784, [2] * 14, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (22433, 28, 28) assert dataset.image_format == "HW" @pytest.mark.data -def test_load_derma_mnist(): +def test_load_derma_mnist(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_derma_mnist, 10015, 2352, 7, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_derma_mnist, 10015, 2352, 7, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (10015, 3, 28, 28) assert dataset.image_format == "CHW" # Train data set - dataset = _helper_test_data_loader(load_derma_mnist, 7007, 2352, 7, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_derma_mnist, 7007, 2352, 7, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (7007, 3, 28, 28) assert dataset.image_format == "CHW" # Validation data set - dataset = _helper_test_data_loader(load_derma_mnist, 1003, 2352, 7, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_derma_mnist, 1003, 2352, 7, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) 
# Non-flatten assert dataset.images.shape == (1003, 3, 28, 28) assert dataset.image_format == "CHW" # Test data set - dataset = _helper_test_data_loader(load_derma_mnist, 2005, 2352, 7, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_derma_mnist, 2005, 2352, 7, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (2005, 3, 28, 28) assert dataset.image_format == "CHW" @pytest.mark.data -def test_load_oct_mnist(): +def test_load_oct_mnist(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_oct_mnist, 109309, 784, 4, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_oct_mnist, 109309, 784, 4, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (109309, 28, 28) assert dataset.image_format == "HW" # Train data set - dataset = _helper_test_data_loader(load_oct_mnist, 97477, 784, 4, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_oct_mnist, 97477, 784, 4, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (97477, 28, 28) assert dataset.image_format == "HW" # Validation data set - dataset = _helper_test_data_loader(load_oct_mnist, 10832, 784, 4, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_oct_mnist, 10832, 784, 4, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (10832, 28, 28) assert dataset.image_format == "HW" # Test data set - dataset = _helper_test_data_loader(load_oct_mnist, 1000, 784, 4, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_oct_mnist, 1000, 784, 4, 
dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1000, 28, 28) assert dataset.image_format == "HW" @pytest.mark.data -def test_load_pneumonia_mnist(): +def test_load_pneumonia_mnist(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_pneumonia_mnist, 5856, 784, 2, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_pneumonia_mnist, 5856, 784, 2, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (5856, 28, 28) assert dataset.image_format == "HW" # Train data set - dataset = _helper_test_data_loader(load_pneumonia_mnist, 4708, 784, 2, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_pneumonia_mnist, 4708, 784, 2, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (4708, 28, 28) assert dataset.image_format == "HW" # Validation data set - dataset = _helper_test_data_loader(load_pneumonia_mnist, 524, 784, 2, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_pneumonia_mnist, 524, 784, 2, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (524, 28, 28) assert dataset.image_format == "HW" # Test data set - dataset = _helper_test_data_loader(load_pneumonia_mnist, 624, 784, 2, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_pneumonia_mnist, 624, 784, 2, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (624, 28, 28) assert dataset.image_format == "HW" @pytest.mark.data -def test_load_retina_mnist(): +def test_load_retina_mnist(my_tmp_dir): # Full data set - dataset = 
_helper_test_data_loader(load_retina_mnist, 1600, 2352, 5, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_retina_mnist, 1600, 2352, 5, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1600, 3, 28, 28) assert dataset.image_format == "CHW" # Train data set - dataset = _helper_test_data_loader(load_retina_mnist, 1080, 2352, 5, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_retina_mnist, 1080, 2352, 5, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1080, 3, 28, 28) assert dataset.image_format == "CHW" # Validation data set - dataset = _helper_test_data_loader(load_retina_mnist, 120, 2352, 5, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_retina_mnist, 120, 2352, 5, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (120, 3, 28, 28) assert dataset.image_format == "CHW" # Test data set - dataset = _helper_test_data_loader(load_retina_mnist, 400, 2352, 5, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_retina_mnist, 400, 2352, 5, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (400, 3, 28, 28) assert dataset.image_format == "CHW" @pytest.mark.data -def test_load_breast_mnist(): +def test_load_breast_mnist(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_breast_mnist, 780, 784, 2, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_breast_mnist, 780, 784, 2, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == 
(780, 28, 28) assert dataset.image_format == "HW" # Train data set - dataset = _helper_test_data_loader(load_breast_mnist, 546, 784, 2, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_breast_mnist, 546, 784, 2, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (546, 28, 28) assert dataset.image_format == "HW" # Validation data set - dataset = _helper_test_data_loader(load_breast_mnist, 78, 784, 2, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_breast_mnist, 78, 784, 2, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (78, 28, 28) assert dataset.image_format == "HW" # Test data set - dataset = _helper_test_data_loader(load_breast_mnist, 156, 784, 2, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_breast_mnist, 156, 784, 2, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (156, 28, 28) assert dataset.image_format == "HW" @pytest.mark.data -def test_load_blood_mnist(): +def test_load_blood_mnist(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_blood_mnist, 17092, 2352, 8, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_blood_mnist, 17092, 2352, 8, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (17092, 3, 28, 28) assert dataset.image_format == "CHW" # Train data set - dataset = _helper_test_data_loader(load_blood_mnist, 11959, 2352, 8, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_blood_mnist, 11959, 2352, 8, dataloader_params={"subset": "train", 
"downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (11959, 3, 28, 28) assert dataset.image_format == "CHW" # Validation data set - dataset = _helper_test_data_loader(load_blood_mnist, 1712, 2352, 8, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_blood_mnist, 1712, 2352, 8, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1712, 3, 28, 28) assert dataset.image_format == "CHW" # Test data set - dataset = _helper_test_data_loader(load_blood_mnist, 3421, 2352, 8, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_blood_mnist, 3421, 2352, 8, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (3421, 3, 28, 28) assert dataset.image_format == "CHW" @pytest.mark.data -def test_load_tissue_mnist(): +def test_load_tissue_mnist(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_tissue_mnist, 236386, 784, 8, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_tissue_mnist, 236386, 784, 8, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (236386, 28, 28) assert dataset.image_format == "HW" # Train data set - dataset = _helper_test_data_loader(load_tissue_mnist, 165466, 784, 8, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_tissue_mnist, 165466, 784, 8, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (165466, 28, 28) assert dataset.image_format == "HW" # Validation data set - dataset = _helper_test_data_loader(load_tissue_mnist, 23640, 784, 8, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset 
= _helper_test_data_loader(load_tissue_mnist, 23640, 784, 8, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (23640, 28, 28) assert dataset.image_format == "HW" # Test data set - dataset = _helper_test_data_loader(load_tissue_mnist, 47280, 784, 8, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_tissue_mnist, 47280, 784, 8, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (47280, 28, 28) assert dataset.image_format == "HW" @pytest.mark.data -def test_load_organ_a_mnist(): +def test_load_organ_a_mnist(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_organ_a_mnist, 58850, 784, 11, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_a_mnist, 58850, 784, 11, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (58850, 28, 28) assert dataset.image_format == "HW" # Train data set - dataset = _helper_test_data_loader(load_organ_a_mnist, 34581, 784, 11, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_a_mnist, 34581, 784, 11, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (34581, 28, 28) assert dataset.image_format == "HW" # Validation data set - dataset = _helper_test_data_loader(load_organ_a_mnist, 6491, 784, 11, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_a_mnist, 6491, 784, 11, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (6491, 28, 28) assert dataset.image_format == "HW" # Test data set - dataset = _helper_test_data_loader(load_organ_a_mnist, 17778, 
784, 11, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_a_mnist, 17778, 784, 11, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (17778, 28, 28) assert dataset.image_format == "HW" @pytest.mark.data -def test_load_organ_c_mnist(): +def test_load_organ_c_mnist(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_organ_c_mnist, 23660, 784, 11, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_c_mnist, 23660, 784, 11, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (23660, 28, 28) assert dataset.image_format == "HW" # Train data set - dataset = _helper_test_data_loader(load_organ_c_mnist, 13000, 784, 11, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_c_mnist, 13000, 784, 11, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (13000, 28, 28) assert dataset.image_format == "HW" # Validation data set - dataset = _helper_test_data_loader(load_organ_c_mnist, 2392, 784, 11, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_c_mnist, 2392, 784, 11, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (2392, 28, 28) assert dataset.image_format == "HW" # Test data set - dataset = _helper_test_data_loader(load_organ_c_mnist, 8268, 784, 11, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_c_mnist, 8268, 784, 11, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (8268, 28, 28) assert 
dataset.image_format == "HW" @pytest.mark.data -def test_load_organ_s_mnist(): +def test_load_organ_s_mnist(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_organ_s_mnist, 25221, 784, 11, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_s_mnist, 25221, 784, 11, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (25221, 28, 28) assert dataset.image_format == "HW" # Train data set - dataset = _helper_test_data_loader(load_organ_s_mnist, 13940, 784, 11, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_s_mnist, 13940, 784, 11, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (13940, 28, 28) assert dataset.image_format == "HW" # Validation data set - dataset = _helper_test_data_loader(load_organ_s_mnist, 2452, 784, 11, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_s_mnist, 2452, 784, 11, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (2452, 28, 28) assert dataset.image_format == "HW" # Test data set - dataset = _helper_test_data_loader(load_organ_s_mnist, 8829, 784, 11, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_s_mnist, 8829, 784, 11, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (8829, 28, 28) assert dataset.image_format == "HW" @pytest.mark.data -def test_load_organ_mnist_3d(): +def test_load_organ_mnist_3d(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_organ_mnist_3d, 1743, 21952, 11, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = 
_helper_test_data_loader(load_organ_mnist_3d, 1743, 21952, 11, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1743, 28, 28, 28) assert dataset.image_format == "HWD" # Train data set - dataset = _helper_test_data_loader(load_organ_mnist_3d, 972, 21952, 11, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_mnist_3d, 972, 21952, 11, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (972, 28, 28, 28) assert dataset.image_format == "HWD" # Validation data set - dataset = _helper_test_data_loader(load_organ_mnist_3d, 161, 21952, 11, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_mnist_3d, 161, 21952, 11, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (161, 28, 28, 28) assert dataset.image_format == "HWD" # Test data set - dataset = _helper_test_data_loader(load_organ_mnist_3d, 610, 21952, 11, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_organ_mnist_3d, 610, 21952, 11, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (610, 28, 28, 28) assert dataset.image_format == "HWD" @pytest.mark.data -def test_load_nodule_mnist_3d(): +def test_load_nodule_mnist_3d(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_nodule_mnist_3d, 1633, 21952, 2, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_nodule_mnist_3d, 1633, 21952, 2, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1633, 28, 28, 28) assert dataset.image_format == "HWD" # Train data set - dataset = 
_helper_test_data_loader(load_nodule_mnist_3d, 1158, 21952, 2, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_nodule_mnist_3d, 1158, 21952, 2, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1158, 28, 28, 28) assert dataset.image_format == "HWD" # Validation data set - dataset = _helper_test_data_loader(load_nodule_mnist_3d, 165, 21952, 2, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_nodule_mnist_3d, 165, 21952, 2, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (165, 28, 28, 28) assert dataset.image_format == "HWD" # Test data set - dataset = _helper_test_data_loader(load_nodule_mnist_3d, 310, 21952, 2, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_nodule_mnist_3d, 310, 21952, 2, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (310, 28, 28, 28) assert dataset.image_format == "HWD" @pytest.mark.data -def test_load_adrenal_mnist_3d(): +def test_load_adrenal_mnist_3d(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_adrenal_mnist_3d, 1584, 21952, 2, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_adrenal_mnist_3d, 1584, 21952, 2, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1584, 28, 28, 28) assert dataset.image_format == "HWD" # Train data set - dataset = _helper_test_data_loader(load_adrenal_mnist_3d, 1188, 21952, 2, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_adrenal_mnist_3d, 1188, 21952, 2, dataloader_params={"subset": "train", 
"downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1188, 28, 28, 28) assert dataset.image_format == "HWD" # Validation data set - dataset = _helper_test_data_loader(load_adrenal_mnist_3d, 98, 21952, 2, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_adrenal_mnist_3d, 98, 21952, 2, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (98, 28, 28, 28) assert dataset.image_format == "HWD" # Test data set - dataset = _helper_test_data_loader(load_adrenal_mnist_3d, 298, 21952, 2, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_adrenal_mnist_3d, 298, 21952, 2, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (298, 28, 28, 28) assert dataset.image_format == "HWD" @pytest.mark.data -def test_load_fracture_mnist_3d(): +def test_load_fracture_mnist_3d(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_fracture_mnist_3d, 1370, 21952, 3, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_fracture_mnist_3d, 1370, 21952, 3, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1370, 28, 28, 28) assert dataset.image_format == "HWD" # Train data set - dataset = _helper_test_data_loader(load_fracture_mnist_3d, 1027, 21952, 3, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_fracture_mnist_3d, 1027, 21952, 3, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1027, 28, 28, 28) assert dataset.image_format == "HWD" # Validation data set - dataset = _helper_test_data_loader(load_fracture_mnist_3d, 103, 21952, 3, 
dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_fracture_mnist_3d, 103, 21952, 3, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (103, 28, 28, 28) assert dataset.image_format == "HWD" # Test data set - dataset = _helper_test_data_loader(load_fracture_mnist_3d, 240, 21952, 3, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_fracture_mnist_3d, 240, 21952, 3, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (240, 28, 28, 28) assert dataset.image_format == "HWD" @pytest.mark.data -def test_load_vessel_mnist_3d(): +def test_load_vessel_mnist_3d(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_vessel_mnist_3d, 1909, 21952, 2, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_vessel_mnist_3d, 1909, 21952, 2, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1909, 28, 28, 28) assert dataset.image_format == "HWD" # Train data set - dataset = _helper_test_data_loader(load_vessel_mnist_3d, 1335, 21952, 2, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_vessel_mnist_3d, 1335, 21952, 2, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1335, 28, 28, 28) assert dataset.image_format == "HWD" # Validation data set - dataset = _helper_test_data_loader(load_vessel_mnist_3d, 192, 21952, 2, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_vessel_mnist_3d, 192, 21952, 2, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == 
(192, 28, 28, 28) assert dataset.image_format == "HWD" # Test data set - dataset = _helper_test_data_loader(load_vessel_mnist_3d, 382, 21952, 2, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_vessel_mnist_3d, 382, 21952, 2, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (382, 28, 28, 28) assert dataset.image_format == "HWD" @pytest.mark.data -def test_load_synapse_mnist_3d(): +def test_load_synapse_mnist_3d(my_tmp_dir): # Full data set - dataset = _helper_test_data_loader(load_synapse_mnist_3d, 1759, 21952, 2, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_synapse_mnist_3d, 1759, 21952, 2, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1759, 28, 28, 28) assert dataset.image_format == "HWD" # Train data set - dataset = _helper_test_data_loader(load_synapse_mnist_3d, 1230, 21952, 2, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_synapse_mnist_3d, 1230, 21952, 2, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1230, 28, 28, 28) assert dataset.image_format == "HWD" # Validation data set - dataset = _helper_test_data_loader(load_synapse_mnist_3d, 177, 21952, 2, dataloader_params={"subset": "val", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = _helper_test_data_loader(load_synapse_mnist_3d, 177, 21952, 2, dataloader_params={"subset": "val", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (177, 28, 28, 28) assert dataset.image_format == "HWD" # Test data set - dataset = _helper_test_data_loader(load_synapse_mnist_3d, 352, 21952, 2, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + dataset = 
_helper_test_data_loader(load_synapse_mnist_3d, 352, 21952, 2, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) # Non-flatten assert dataset.images.shape == (352, 28, 28, 28) assert dataset.image_format == "HWD" diff --git a/clustpy/data/tests/test_real_timeseries_data.py b/clustpy/data/tests/test_real_timeseries_data.py index ae77b42..549c362 100644 --- a/clustpy/data/tests/test_real_timeseries_data.py +++ b/clustpy/data/tests/test_real_timeseries_data.py @@ -1,119 +1,114 @@ from clustpy.data.tests._helpers_for_tests import _helper_test_data_loader from clustpy.data import load_motestrain, load_proximal_phalanx_outline, load_diatom_size_reduction, load_symbols, \ load_olive_oil, load_plane, load_sony_aibo_robot_surface, load_two_patterns, load_lsst -from pathlib import Path -import os -import shutil import pytest - -TEST_DOWNLOAD_PATH = str(Path.home() / "Downloads/clustpy_testfiles_timeseries") +import shutil @pytest.fixture(autouse=True, scope='function') -def run_around_tests(): +def my_tmp_dir(tmp_path): # Code that will run before the tests - if not os.path.isdir(TEST_DOWNLOAD_PATH): - os.makedirs(TEST_DOWNLOAD_PATH) + tmp_dir = str(tmp_path) # Test functions will be run at this point - yield + yield tmp_dir # Code that will run after the tests - shutil.rmtree(TEST_DOWNLOAD_PATH) + shutil.rmtree(tmp_dir) @pytest.mark.data @pytest.mark.timeseriesdata -def test_load_motestrain(): +def test_load_motestrain(my_tmp_dir): # Full data set - _helper_test_data_loader(load_motestrain, 1272, 84, 2, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_motestrain, 1272, 84, 2, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Train data set - _helper_test_data_loader(load_motestrain, 20, 84, 2, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_motestrain, 20, 84, 2, dataloader_params={"subset": "train", 
"downloads_path":my_tmp_dir}) # Test data set - _helper_test_data_loader(load_motestrain, 1252, 84, 2, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_motestrain, 1252, 84, 2, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) @pytest.mark.data @pytest.mark.timeseriesdata -def test_load_proximal_phalanx_outline(): +def test_load_proximal_phalanx_outline(my_tmp_dir): # Full data set - _helper_test_data_loader(load_proximal_phalanx_outline, 876, 80, 2, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_proximal_phalanx_outline, 876, 80, 2, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Train data set - _helper_test_data_loader(load_proximal_phalanx_outline, 600, 80, 2, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_proximal_phalanx_outline, 600, 80, 2, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Test data set - _helper_test_data_loader(load_proximal_phalanx_outline, 276, 80, 2, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_proximal_phalanx_outline, 276, 80, 2, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) @pytest.mark.data @pytest.mark.timeseriesdata -def test_load_diatom_size_reduction(): +def test_load_diatom_size_reduction(my_tmp_dir): # Full data set - _helper_test_data_loader(load_diatom_size_reduction, 322, 345, 4, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_diatom_size_reduction, 322, 345, 4, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Train data set - _helper_test_data_loader(load_diatom_size_reduction, 16, 345, 4, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_diatom_size_reduction, 16, 345, 4, 
dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Test data set - _helper_test_data_loader(load_diatom_size_reduction, 306, 345, 4, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_diatom_size_reduction, 306, 345, 4, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) @pytest.mark.data @pytest.mark.timeseriesdata -def test_load_symbols(): +def test_load_symbols(my_tmp_dir): # Full data set - _helper_test_data_loader(load_symbols, 1020, 398, 6, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_symbols, 1020, 398, 6, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Train data set - _helper_test_data_loader(load_symbols, 25, 398, 6, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_symbols, 25, 398, 6, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Test data set - _helper_test_data_loader(load_symbols, 995, 398, 6, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_symbols, 995, 398, 6, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) @pytest.mark.data @pytest.mark.timeseriesdata -def test_load_olive_oil(): +def test_load_olive_oil(my_tmp_dir): # Full data set - _helper_test_data_loader(load_olive_oil, 60, 570, 4, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_olive_oil, 60, 570, 4, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Train data set - _helper_test_data_loader(load_olive_oil, 30, 570, 4, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_olive_oil, 30, 570, 4, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Test data set - _helper_test_data_loader(load_olive_oil, 30, 570, 4, 
dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_olive_oil, 30, 570, 4, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) @pytest.mark.data @pytest.mark.timeseriesdata -def test_load_plane(): +def test_load_plane(my_tmp_dir): # Full data set - _helper_test_data_loader(load_plane, 210, 144, 7, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_plane, 210, 144, 7, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Train data set - _helper_test_data_loader(load_plane, 105, 144, 7, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_plane, 105, 144, 7, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Test data set - _helper_test_data_loader(load_plane, 105, 144, 7, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_plane, 105, 144, 7, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) @pytest.mark.data @pytest.mark.timeseriesdata -def test_load_sony_aibo_robot_surface(): +def test_load_sony_aibo_robot_surface(my_tmp_dir): # Full data set - _helper_test_data_loader(load_sony_aibo_robot_surface, 621, 70, 2, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_sony_aibo_robot_surface, 621, 70, 2, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Train data set - _helper_test_data_loader(load_sony_aibo_robot_surface, 20, 70, 2, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_sony_aibo_robot_surface, 20, 70, 2, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Test data set - _helper_test_data_loader(load_sony_aibo_robot_surface, 601, 70, 2, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + 
_helper_test_data_loader(load_sony_aibo_robot_surface, 601, 70, 2, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) @pytest.mark.data @pytest.mark.timeseriesdata -def test_load_two_patterns(): +def test_load_two_patterns(my_tmp_dir): # Full data set - _helper_test_data_loader(load_two_patterns, 5000, 128, 4, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_two_patterns, 5000, 128, 4, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Train data set - _helper_test_data_loader(load_two_patterns, 1000, 128, 4, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_two_patterns, 1000, 128, 4, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Test data set - _helper_test_data_loader(load_two_patterns, 4000, 128, 4, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_two_patterns, 4000, 128, 4, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) @pytest.mark.data @pytest.mark.timeseriesdata -def test_load_lsst(): +def test_load_lsst(my_tmp_dir): # Full data set - _helper_test_data_loader(load_lsst, 4925, 216, 14, dataloader_params={"subset": "all", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_lsst, 4925, 216, 14, dataloader_params={"subset": "all", "downloads_path":my_tmp_dir}) # Train data set - _helper_test_data_loader(load_lsst, 2459, 216, 14, dataloader_params={"subset": "train", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_lsst, 2459, 216, 14, dataloader_params={"subset": "train", "downloads_path":my_tmp_dir}) # Test data set - _helper_test_data_loader(load_lsst, 2466, 216, 14, dataloader_params={"subset": "test", "downloads_path":TEST_DOWNLOAD_PATH}) + _helper_test_data_loader(load_lsst, 2466, 216, 14, dataloader_params={"subset": "test", "downloads_path":my_tmp_dir}) diff --git 
a/clustpy/data/tests/test_real_torchvision_data.py b/clustpy/data/tests/test_real_torchvision_data.py index b987fa7..b854d20 100644 --- a/clustpy/data/tests/test_real_torchvision_data.py +++ b/clustpy/data/tests/test_real_torchvision_data.py @@ -2,23 +2,18 @@ from clustpy.data import load_usps, load_mnist, load_fmnist, load_kmnist, load_cifar10, load_svhn, load_stl10, \ load_gtsrb, load_cifar100 import torchvision.datasets -from pathlib import Path -import os -import shutil import pytest - -TEST_DOWNLOAD_PATH = str(Path.home() / "Downloads/clustpy_testfiles_torchvision") +import shutil @pytest.fixture(autouse=True, scope='function') -def run_around_tests(): +def my_tmp_dir(tmp_path): # Code that will run before the tests - if not os.path.isdir(TEST_DOWNLOAD_PATH): - os.makedirs(TEST_DOWNLOAD_PATH) + tmp_dir = str(tmp_path) # Test functions will be run at this point - yield + yield tmp_dir # Code that will run after the tests - shutil.rmtree(TEST_DOWNLOAD_PATH) + shutil.rmtree(tmp_dir) # Check if loading methods still exist (could be renamed/moved) @@ -37,22 +32,22 @@ def test_torchvision_data_methods(): # Do not skip USPS as it is the smallest dataset and can check the torchvision data loading mechanism @pytest.mark.data -def test_load_usps(): +def test_load_usps(my_tmp_dir): # Full data set dataset = _helper_test_data_loader(load_usps, 9298, 256, 10, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (9298, 16, 16) assert dataset.image_format == "HW" # Train data set dataset = _helper_test_data_loader(load_usps, 7291, 256, 10, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (7291, 16, 16) assert dataset.image_format == "HW" # Test data set dataset = 
_helper_test_data_loader(load_usps, 2007, 256, 10, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (2007, 16, 16) assert dataset.image_format == "HW" @@ -60,22 +55,22 @@ def test_load_usps(): @pytest.mark.largedata @pytest.mark.data -def test_load_mnist(): +def test_load_mnist(my_tmp_dir): # Full data set dataset = _helper_test_data_loader(load_mnist, 70000, 784, 10, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (70000, 28, 28) assert dataset.image_format == "HW" # Train data set dataset = _helper_test_data_loader(load_mnist, 60000, 784, 10, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (60000, 28, 28) assert dataset.image_format == "HW" # Test data set dataset = _helper_test_data_loader(load_mnist, 10000, 784, 10, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (10000, 28, 28) assert dataset.image_format == "HW" @@ -83,22 +78,22 @@ def test_load_mnist(): @pytest.mark.largedata @pytest.mark.data -def test_load_kmnist(): +def test_load_kmnist(my_tmp_dir): # Full data set dataset = _helper_test_data_loader(load_kmnist, 70000, 784, 10, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (70000, 28, 28) assert dataset.image_format == "HW" # Train data set dataset = _helper_test_data_loader(load_kmnist, 60000, 784, 10, - dataloader_params={"subset": "train", 
"downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (60000, 28, 28) assert dataset.image_format == "HW" # Test data set dataset = _helper_test_data_loader(load_kmnist, 10000, 784, 10, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (10000, 28, 28) assert dataset.image_format == "HW" @@ -106,22 +101,22 @@ def test_load_kmnist(): @pytest.mark.largedata @pytest.mark.data -def test_load_fmnist(): +def test_load_fmnist(my_tmp_dir): # Full data set dataset = _helper_test_data_loader(load_fmnist, 70000, 784, 10, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (70000, 28, 28) assert dataset.image_format == "HW" # Train data set dataset = _helper_test_data_loader(load_fmnist, 60000, 784, 10, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (60000, 28, 28) assert dataset.image_format == "HW" # Test data set dataset = _helper_test_data_loader(load_fmnist, 10000, 784, 10, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (10000, 28, 28) assert dataset.image_format == "HW" @@ -129,22 +124,22 @@ def test_load_fmnist(): # Do not skip cifar10 as it is the smallest 3-channel dataset and can check channel normalization @pytest.mark.data -def test_load_cifar10(): +def test_load_cifar10(my_tmp_dir): # Full data set dataset = _helper_test_data_loader(load_cifar10, 60000, 3072, 10, - dataloader_params={"subset": "all", 
"downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (60000, 3, 32, 32) assert dataset.image_format == "CHW" # Train data set dataset = _helper_test_data_loader(load_cifar10, 50000, 3072, 10, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (50000, 3, 32, 32) assert dataset.image_format == "CHW" # Test data set dataset = _helper_test_data_loader(load_cifar10, 10000, 3072, 10, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (10000, 3, 32, 32) assert dataset.image_format == "CHW" @@ -152,22 +147,22 @@ def test_load_cifar10(): @pytest.mark.largedata @pytest.mark.data -def test_load_cifar100(): +def test_load_cifar100(my_tmp_dir): # Full data set dataset = _helper_test_data_loader(load_cifar100, 60000, 3072, 100, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (60000, 3, 32, 32) assert dataset.image_format == "CHW" # Train data set dataset = _helper_test_data_loader(load_cifar100, 50000, 3072, 100, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (50000, 3, 32, 32) assert dataset.image_format == "CHW" # Test data set dataset = _helper_test_data_loader(load_cifar100, 10000, 3072, 20, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH, + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir, "use_superclasses": True}) # Non-flatten assert dataset.images.shape == 
(10000, 3, 32, 32) @@ -176,22 +171,22 @@ def test_load_cifar100(): @pytest.mark.largedata @pytest.mark.data -def test_load_svhn(): +def test_load_svhn(my_tmp_dir): # Full data set dataset = _helper_test_data_loader(load_svhn, 99289, 3072, 10, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (99289, 3, 32, 32) assert dataset.image_format == "CHW" # Train data set dataset = _helper_test_data_loader(load_svhn, 73257, 3072, 10, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (73257, 3, 32, 32) assert dataset.image_format == "CHW" # Test data set dataset = _helper_test_data_loader(load_svhn, 26032, 3072, 10, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (26032, 3, 32, 32) assert dataset.image_format == "CHW" @@ -199,22 +194,22 @@ def test_load_svhn(): @pytest.mark.largedata @pytest.mark.data -def test_load_stl10(): +def test_load_stl10(my_tmp_dir): # Full data set dataset = _helper_test_data_loader(load_stl10, 13000, 27648, 10, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (13000, 3, 96, 96) assert dataset.image_format == "CHW" # Train data set dataset = _helper_test_data_loader(load_stl10, 5000, 27648, 10, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (5000, 3, 96, 96) assert dataset.image_format == "CHW" # Test data set dataset = 
_helper_test_data_loader(load_stl10, 8000, 27648, 10, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (8000, 3, 96, 96) assert dataset.image_format == "CHW" @@ -222,22 +217,22 @@ def test_load_stl10(): @pytest.mark.data # Do not skip GTSRB as the loading mechanism is different to the other torchvision dataloaders -def test_load_gtsrb(): +def test_load_gtsrb(my_tmp_dir): # Full data set dataset = _helper_test_data_loader(load_gtsrb, 39270, 3072, 43, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (39270, 3, 32, 32) assert dataset.image_format == "CHW" # Train data set dataset = _helper_test_data_loader(load_gtsrb, 26640, 3072, 43, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (26640, 3, 32, 32) assert dataset.image_format == "CHW" # Test data set (with image size 30x30) dataset = _helper_test_data_loader(load_gtsrb, 12630, 2700, 43, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH, + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir, "image_size": (30, 30)}) # Non-flatten assert dataset.images.shape == (12630, 3, 30, 30) diff --git a/clustpy/data/tests/test_real_uci_data.py b/clustpy/data/tests/test_real_uci_data.py index 103ee1a..f6318da 100644 --- a/clustpy/data/tests/test_real_uci_data.py +++ b/clustpy/data/tests/test_real_uci_data.py @@ -4,258 +4,253 @@ load_user_knowledge, load_breast_tissue, load_forest_types, load_dermatology, load_multiple_features, \ load_statlog_australian_credit_approval, load_breast_cancer_wisconsin_original, load_optdigits, load_semeion, \ load_cmu_faces, 
load_gene_expression_cancer_rna_seq, load_sport_articles, load_wholesale_customers, load_reuters21578 -from pathlib import Path -import os -import shutil import pytest - -TEST_DOWNLOAD_PATH = str(Path.home() / "Downloads/clustpy_testfiles_uci") +import shutil @pytest.fixture(autouse=True, scope='function') -def run_around_tests(): +def my_tmp_dir(tmp_path): # Code that will run before the tests - if not os.path.isdir(TEST_DOWNLOAD_PATH): - os.makedirs(TEST_DOWNLOAD_PATH) + tmp_dir = str(tmp_path) # Test functions will be run at this point - yield + yield tmp_dir # Code that will run after the tests - shutil.rmtree(TEST_DOWNLOAD_PATH) + shutil.rmtree(tmp_dir) @pytest.mark.data -def test_load_banknotes(): - _helper_test_data_loader(load_banknotes, 1372, 4, 2, dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) +def test_load_banknotes(my_tmp_dir): + _helper_test_data_loader(load_banknotes, 1372, 4, 2, dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_spambase(): - _helper_test_data_loader(load_spambase, 4601, 57, 2, dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) +def test_load_spambase(my_tmp_dir): + _helper_test_data_loader(load_spambase, 4601, 57, 2, dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_seeds(): - _helper_test_data_loader(load_seeds, 210, 7, 3, dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) +def test_load_seeds(my_tmp_dir): + _helper_test_data_loader(load_seeds, 210, 7, 3, dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_skin(): - _helper_test_data_loader(load_skin, 245057, 3, 2, dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) +def test_load_skin(my_tmp_dir): + _helper_test_data_loader(load_skin, 245057, 3, 2, dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_soybean_small(): - _helper_test_data_loader(load_soybean_small, 47, 35, 4, dataloader_params={"downloads_path": 
TEST_DOWNLOAD_PATH}) +def test_load_soybean_small(my_tmp_dir): + _helper_test_data_loader(load_soybean_small, 47, 35, 4, dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_soybean_large(): +def test_load_soybean_large(my_tmp_dir): # Full data set _helper_test_data_loader(load_soybean_large, 562, 35, 15, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Train data set _helper_test_data_loader(load_soybean_large, 266, 35, 15, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Test data set _helper_test_data_loader(load_soybean_large, 296, 35, 15, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_pendigits(): +def test_load_pendigits(my_tmp_dir): # Full data set _helper_test_data_loader(load_pendigits, 10992, 16, 10, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Train data set _helper_test_data_loader(load_pendigits, 7494, 16, 10, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Test data set _helper_test_data_loader(load_pendigits, 3498, 16, 10, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_ecoli(): - _helper_test_data_loader(load_ecoli, 336, 7, 8, dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) +def test_load_ecoli(my_tmp_dir): + _helper_test_data_loader(load_ecoli, 336, 7, 8, dataloader_params={"downloads_path": my_tmp_dir}) # Check if ignoring small clusters works 
_helper_test_data_loader(load_ecoli, 327, 7, 5, - dataloader_params={"ignore_small_clusters": True, "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"ignore_small_clusters": True, "downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_hrtu2(): - _helper_test_data_loader(load_htru2, 17898, 8, 2, dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) +def test_load_hrtu2(my_tmp_dir): + _helper_test_data_loader(load_htru2, 17898, 8, 2, dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_letterrecognition(): +def test_load_letterrecognition(my_tmp_dir): _helper_test_data_loader(load_letterrecognition, 20000, 16, 26, - dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_har(): +def test_load_har(my_tmp_dir): # Full data set _helper_test_data_loader(load_har, 10299, 561, 6, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Train data set _helper_test_data_loader(load_har, 7352, 561, 6, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Test data set _helper_test_data_loader(load_har, 2947, 561, 6, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_statlog_shuttle(): +def test_load_statlog_shuttle(my_tmp_dir): # 7z probably not installed! 
-> data and labels can be None - dataset = load_statlog_shuttle(downloads_path=TEST_DOWNLOAD_PATH) + dataset = load_statlog_shuttle(downloads_path=my_tmp_dir) if dataset is not None: # Full data set _helper_test_data_loader(load_statlog_shuttle, 58000, 9, 7, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Train data set _helper_test_data_loader(load_statlog_shuttle, 43500, 9, 7, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Test data set _helper_test_data_loader(load_statlog_shuttle, 14500, 9, 7, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_mice_protein(): - _helper_test_data_loader(load_mice_protein, 1077, 68, 8, dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) +def test_load_mice_protein(my_tmp_dir): + _helper_test_data_loader(load_mice_protein, 1077, 68, 8, dataloader_params={"downloads_path": my_tmp_dir}) # Check if additional labels work _helper_test_data_loader(load_mice_protein, 1077, 68, [8, 72, 2, 2, 2], - dataloader_params={"return_additional_labels": True, "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"return_additional_labels": True, "downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_user_knowledge(): +def test_load_user_knowledge(my_tmp_dir): # Full data set _helper_test_data_loader(load_user_knowledge, 403, 5, 4, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Train data set _helper_test_data_loader(load_user_knowledge, 258, 5, 4, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # 
Test data set _helper_test_data_loader(load_user_knowledge, 145, 5, 4, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_breast_tissue(): - _helper_test_data_loader(load_breast_tissue, 106, 9, 6, dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) +def test_load_breast_tissue(my_tmp_dir): + _helper_test_data_loader(load_breast_tissue, 106, 9, 6, dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_forest_types(): +def test_load_forest_types(my_tmp_dir): # Full data set _helper_test_data_loader(load_forest_types, 523, 27, 4, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Train data set _helper_test_data_loader(load_forest_types, 198, 27, 4, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Test data set _helper_test_data_loader(load_forest_types, 325, 27, 4, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_dermatology(): - _helper_test_data_loader(load_dermatology, 358, 34, 6, dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) +def test_load_dermatology(my_tmp_dir): + _helper_test_data_loader(load_dermatology, 358, 34, 6, dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_multiple_features(): +def test_load_multiple_features(my_tmp_dir): _helper_test_data_loader(load_multiple_features, 2000, 649, 10, - dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_statlog_australian_credit_approval(): +def 
test_load_statlog_australian_credit_approval(my_tmp_dir): _helper_test_data_loader(load_statlog_australian_credit_approval, 690, 14, 2, - dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_breast_cancer_wisconsin_original(): +def test_load_breast_cancer_wisconsin_original(my_tmp_dir): _helper_test_data_loader(load_breast_cancer_wisconsin_original, 683, 9, 2, - dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_optdigits(): +def test_load_optdigits(my_tmp_dir): # Full data set dataset = _helper_test_data_loader(load_optdigits, 5620, 64, 10, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (5620, 8, 8) assert dataset.image_format == "HW" # Train data set dataset = _helper_test_data_loader(load_optdigits, 3823, 64, 10, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (3823, 8, 8) assert dataset.image_format == "HW" # Test data set dataset = _helper_test_data_loader(load_optdigits, 1797, 64, 10, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1797, 8, 8) assert dataset.image_format == "HW" @pytest.mark.data -def test_load_semeion(): +def test_load_semeion(my_tmp_dir): dataset = _helper_test_data_loader(load_semeion, 1593, 256, 10, - dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (1593, 16, 16) assert dataset.image_format == "HW" @pytest.mark.data -def 
test_load_cmu_faces(): +def test_load_cmu_faces(my_tmp_dir): dataset = _helper_test_data_loader(load_cmu_faces, 624, 960, [20, 4, 4, 2], - dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (624, 30, 32) assert dataset.image_format == "HW" @pytest.mark.data -def test_load_gene_expression_cancer_rna_seq(): +def test_load_gene_expression_cancer_rna_seq(my_tmp_dir): _helper_test_data_loader(load_gene_expression_cancer_rna_seq, 801, 20531, 5, - dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_sport_articles(): +def test_load_sport_articles(my_tmp_dir): _helper_test_data_loader(load_sport_articles, 1000, 55, 2, - dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_wholesale_customers(): +def test_load_wholesale_customers(my_tmp_dir): _helper_test_data_loader(load_wholesale_customers, 440, 6, [2, 3], - dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"downloads_path": my_tmp_dir}) @pytest.mark.data -def test_load_reuters21578(): +def test_load_reuters21578(my_tmp_dir): # Full data set _helper_test_data_loader(load_reuters21578, 8367, 2000, 5, - dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"downloads_path": my_tmp_dir}) # Lewis train data _helper_test_data_loader(load_reuters21578, 5791, 2000, 5, - dataloader_params={"subset": "train", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir}) # Lewis test data _helper_test_data_loader(load_reuters21578, 2300, 2000, 5, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir}) # cgi train data _helper_test_data_loader(load_reuters21578, 
8091, 2000, 5, - dataloader_params={"subset": "train-cgi", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "train-cgi", "downloads_path": my_tmp_dir}) # cgi test data _helper_test_data_loader(load_reuters21578, 276, 2000, 5, - dataloader_params={"subset": "test-cgi", "downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"subset": "test-cgi", "downloads_path": my_tmp_dir}) diff --git a/clustpy/data/tests/test_real_video_data.py b/clustpy/data/tests/test_real_video_data.py index b5343d5..6df4464 100644 --- a/clustpy/data/tests/test_real_video_data.py +++ b/clustpy/data/tests/test_real_video_data.py @@ -2,23 +2,18 @@ from clustpy.data.tests._helpers_for_tests import _helper_test_data_loader from clustpy.data import load_video_weizmann, load_video_keck_gesture from clustpy.data.real_video_data import _downsample_frames -from pathlib import Path -import os -import shutil import pytest - -TEST_DOWNLOAD_PATH = str(Path.home() / "Downloads/clustpy_testfiles_video") +import shutil @pytest.fixture(autouse=True, scope='function') -def run_around_tests(): +def my_tmp_dir(tmp_path): # Code that will run before the tests - if not os.path.isdir(TEST_DOWNLOAD_PATH): - os.makedirs(TEST_DOWNLOAD_PATH) + tmp_dir = str(tmp_path) # Test functions will be run at this point - yield + yield tmp_dir # Code that will run after the tests - shutil.rmtree(TEST_DOWNLOAD_PATH) + shutil.rmtree(tmp_dir) def test_downsample_frames(): @@ -41,37 +36,41 @@ def test_downsample_frames(): @pytest.mark.data -def test_load_video_weizmann(): - dataset = _helper_test_data_loader(load_video_weizmann, None, 77760, [10, 9], - dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) # N not always 5687 +def test_load_video_weizmann(my_tmp_dir): + dataset = _helper_test_data_loader(load_video_weizmann, None, 77760, [2, 2], + dataloader_params={"use_actions": ["walk", "run"], "use_persons": ["daria", "denis"], + "downloads_path": my_tmp_dir}) # N not always the same (5687) # 
Non-flatten assert dataset.images.shape[1:] == (3, 144, 180) assert dataset.image_format == "CHW" + data_full_size = dataset.data.shape[0] # Change image size and downsample dataset = _helper_test_data_loader(load_video_weizmann, None, 30000, [10, 9], dataloader_params={"image_size": (100, 100), "frame_sampling_ratio": 0.5, - "downloads_path": TEST_DOWNLOAD_PATH}) # N not always 5687 + "downloads_path": my_tmp_dir}) # N not always the same (5687) # Non-flatten assert dataset.images.shape[1:] == (3, 100, 100) assert dataset.image_format == "CHW" # Check downsampling - data = dataset.data - assert data.shape[0] / 5687 < 0.55 and data.shape[0] / 5687 > 0.49 + data_subsampled = dataset.data + label_subsampled = dataset.target + data_subset_size = data_subsampled[(label_subsampled[:, 0] < 2) & (label_subsampled[:, 1] < 2)].shape[0] + assert data_subset_size / data_full_size < 0.55 and data_subset_size / data_full_size > 0.49 @pytest.mark.largedata @pytest.mark.data -def test_load_video_keck_gesture(): +def test_load_video_keck_gesture(my_tmp_dir): dataset = _helper_test_data_loader(load_video_keck_gesture, None, 120000, [15, 4], dataloader_params={"subset": "all", - "downloads_path": TEST_DOWNLOAD_PATH}) # N not always 25457 + "downloads_path": my_tmp_dir}) # N not always the same (25457) # Non-flatten assert dataset.images.shape[1:] == (3, 200, 200) assert dataset.image_format == "CHW" # Test data dataset = _helper_test_data_loader(load_video_keck_gesture, None, 120000, [15, 3], dataloader_params={"subset": "train", - "downloads_path": TEST_DOWNLOAD_PATH}) # N not always 11911 + "downloads_path": my_tmp_dir}) # N not always the same (11911) # Non-flatten assert dataset.images.shape[1:] == (3, 200, 200) assert dataset.image_format == "CHW" @@ -79,7 +78,7 @@ def test_load_video_keck_gesture(): dataset = _helper_test_data_loader(load_video_keck_gesture, None, 30000, [15, 4], dataloader_params={"image_size": (100, 100), "frame_sampling_ratio": 0.5, "subset": "test", - 
"downloads_path": TEST_DOWNLOAD_PATH}) # N not always 13546 + "downloads_path": my_tmp_dir}) # N not always the same (13546) # Non-flatten assert dataset.images.shape[1:] == (3, 100, 100) assert dataset.image_format == "CHW" diff --git a/clustpy/data/tests/test_real_world_data.py b/clustpy/data/tests/test_real_world_data.py index 1b9c8ec..c1dba3c 100644 --- a/clustpy/data/tests/test_real_world_data.py +++ b/clustpy/data/tests/test_real_world_data.py @@ -1,23 +1,18 @@ from clustpy.data.tests._helpers_for_tests import _helper_test_data_loader from clustpy.data import load_iris, load_wine, load_breast_cancer, load_olivetti_faces, load_newsgroups, load_rcv1, \ load_imagenet_dog, load_imagenet10, load_coil20, load_coil100, load_webkb -from pathlib import Path -import os -import shutil import pytest - -TEST_DOWNLOAD_PATH = str(Path.home() / "Downloads/clustpy_testfiles_realworld") +import shutil @pytest.fixture(autouse=True, scope='function') -def run_around_tests(): +def my_tmp_dir(tmp_path): # Code that will run before the tests - if not os.path.isdir(TEST_DOWNLOAD_PATH): - os.makedirs(TEST_DOWNLOAD_PATH) + tmp_dir = str(tmp_path) # Test functions will be run at this point - yield + yield tmp_dir # Code that will run after the tests - shutil.rmtree(TEST_DOWNLOAD_PATH) + shutil.rmtree(tmp_dir) @pytest.mark.data @@ -66,31 +61,31 @@ def test_load_rcv1(): @pytest.mark.data @pytest.mark.largedata -def test_load_imagenet_dog(): +def test_load_imagenet_dog(my_tmp_dir): # Full data set dataset = _helper_test_data_loader(load_imagenet_dog, 20580, 150528, 120, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH, + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir, "breeds": None}) # Non-flatten assert dataset.images.shape == (20580, 3, 224, 224) assert dataset.image_format == "CHW" # Train data set dataset = _helper_test_data_loader(load_imagenet_dog, 12000, 150528, 120, - dataloader_params={"subset": "train", "downloads_path": 
TEST_DOWNLOAD_PATH, + dataloader_params={"subset": "train", "downloads_path": my_tmp_dir, "breeds": None}) # Non-flatten assert dataset.images.shape == (12000, 3, 224, 224) assert dataset.image_format == "CHW" # Test data set dataset = _helper_test_data_loader(load_imagenet_dog, 8580, 150528, 120, - dataloader_params={"subset": "test", "downloads_path": TEST_DOWNLOAD_PATH, + dataloader_params={"subset": "test", "downloads_path": my_tmp_dir, "breeds": None}) # Non-flatten assert dataset.images.shape == (8580, 3, 224, 224) assert dataset.image_format == "CHW" # Test default breeds and different image size dataset = _helper_test_data_loader(load_imagenet_dog, 2574, 3072, 15, - dataloader_params={"subset": "all", "downloads_path": TEST_DOWNLOAD_PATH, + dataloader_params={"subset": "all", "downloads_path": my_tmp_dir, "image_size": (32, 32)}) # Non-flatten assert dataset.images.shape == (2574, 3, 32, 32) @@ -99,16 +94,16 @@ def test_load_imagenet_dog(): @pytest.mark.data @pytest.mark.largedata -def test_load_imagenet10(): +def test_load_imagenet10(my_tmp_dir): # Full data set dataset = _helper_test_data_loader(load_imagenet10, 13000, 150528, 10, - dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (13000, 3, 224, 224) assert dataset.image_format == "CHW" # Test different image size dataset = _helper_test_data_loader(load_imagenet10, 13000, 27648, 10, - dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH, + dataloader_params={"downloads_path": my_tmp_dir, "use_224_size": False}) # Non-flatten assert dataset.images.shape == (13000, 3, 96, 96) @@ -116,23 +111,23 @@ def test_load_imagenet10(): @pytest.mark.data -def test_load_coil20(): +def test_load_coil20(my_tmp_dir): dataset = _helper_test_data_loader(load_coil20, 1440, 16384, 20, - dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) + dataloader_params={"downloads_path": my_tmp_dir}) # Non-flatten assert 
dataset.images.shape == (1440, 128, 128) assert dataset.image_format == "HW" @pytest.mark.data -def test_load_coil100(): - dataset = _helper_test_data_loader(load_coil100, 7200, 49152, 100, dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) +def test_load_coil100(my_tmp_dir): + dataset = _helper_test_data_loader(load_coil100, 7200, 49152, 100, dataloader_params={"downloads_path": my_tmp_dir}) # Non-flatten assert dataset.images.shape == (7200, 3, 128, 128) assert dataset.image_format == "CHW" @pytest.mark.data -def test_load_webkb(): - _helper_test_data_loader(load_webkb, 1041, 323, [4, 4], dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH}) - _helper_test_data_loader(load_webkb, 8282, 761, [7, 5], dataloader_params={"downloads_path": TEST_DOWNLOAD_PATH, "use_categories": None, "use_universities": None}) +def test_load_webkb(my_tmp_dir): + _helper_test_data_loader(load_webkb, 1041, 323, [4, 4], dataloader_params={"downloads_path": my_tmp_dir}) + _helper_test_data_loader(load_webkb, 8282, 761, [7, 5], dataloader_params={"downloads_path": my_tmp_dir, "use_categories": None, "use_universities": None}) diff --git a/clustpy/metrics/__init__.py b/clustpy/metrics/__init__.py index 18be708..b2a5cc2 100644 --- a/clustpy/metrics/__init__.py +++ b/clustpy/metrics/__init__.py @@ -8,7 +8,7 @@ multiple_labelings_pc_jaccard_score, multiple_labelings_pc_precision_score, multiple_labelings_pc_rand_score, \ multiple_labelings_pc_recall_score from .confusion_matrix import ConfusionMatrix -from .hierarchical_metrics import dendrogram_purity, leaf_purity +from .hierarchical_metrics import dendrogram_purity, leaf_purity, node_purity __all__ = ['variation_of_information', 'unsupervised_clustering_accuracy', @@ -33,4 +33,5 @@ 'dendrogram_purity', 'leaf_purity', 'purity', - 'cvnn_score'] + 'cvnn_score', + 'node_purity'] diff --git a/clustpy/metrics/_metrics_utils.py b/clustpy/metrics/_metrics_utils.py index b69e49b..b24137b 100644 --- a/clustpy/metrics/_metrics_utils.py +++ 
b/clustpy/metrics/_metrics_utils.py @@ -1,9 +1,11 @@ import numpy as np +from sklearn.metrics.cluster._supervised import check_clusterings +from sklearn.utils import check_X_y -def _check_number_of_points(labels_true: np.ndarray, labels_pred: np.ndarray) -> bool: +def _check_labels_arrays(labels_true: np.ndarray, labels_pred: np.ndarray, allow_2d_labels: bool = False) -> (np.ndarray, np.ndarray): """ - Check if the length of the ground truth labels and the prediction labels match. + Check that the ground truth labels and the prediction labels are compatible. If they do not match throw an exception. Parameters @@ -12,14 +14,63 @@ def _check_number_of_points(labels_true: np.ndarray, labels_pred: np.ndarray) -> The ground truth labels of the data set labels_pred : np.ndarray The labels as predicted by a clustering algorithm + allow_2d_labels: bool + Specifies whether 2d labels (multiple label sets) are allowed (default: False) Returns ------- - boolean : bool - True if execution was successful + tuple : (np.ndarray, np.ndarray) + The ground truth labels, + The predicted labels """ - if labels_pred.shape[0] != labels_true.shape[0]: - raise Exception( - "Number of objects of the prediction and ground truth are not equal.\nNumber of prediction objects: " + str( - labels_pred.shape[0]) + "\nNumber of ground truth objects: " + str(labels_true.shape[0])) - return True \ No newline at end of file + labels_true = np.asarray(labels_true).astype(int) + labels_pred = np.asarray(labels_pred).astype(int) + + if labels_true.ndim == 1 and labels_pred.ndim == 1: + labels_true, labels_pred = check_clusterings(labels_true, labels_pred) + elif allow_2d_labels: + true_ref = labels_true[:, 0].copy() if labels_true.ndim > 1 else labels_true.copy() + pred_ref = labels_pred[:, 0].copy() if labels_pred.ndim > 1 else labels_pred.copy() + if labels_true.ndim > 1: + labels_true = labels_true.copy() + for i in range(labels_true.shape[1]): + # Align each column of 'true' against the reference of 
'pred' + labels_true[:, i], _ = check_clusterings(labels_true[:, i], pred_ref) + else: + labels_true, _ = check_clusterings(labels_true, pred_ref) + if labels_pred.ndim > 1: + labels_pred = labels_pred.copy() + for i in range(labels_pred.shape[1]): + # Align each column of 'pred' against the (now potentially updated) 'true' reference + _, labels_pred[:, i] = check_clusterings(true_ref, labels_pred[:, i]) + else: + _, labels_pred = check_clusterings(true_ref, labels_pred) + else: + raise ValueError(f"Your labels are not 1d arrays. Shape of labels_true: {labels_true.shape}, shape of labels_pred: {labels_pred.shape}") + return labels_true, labels_pred + + +def _check_length_data_and_labels(X: np.ndarray, labels: np.ndarray) -> (np.ndarray, np.ndarray): + """ + Check that the data and the prediction labels are compatible. + If they do not match throw an exception. + + Parameters + ---------- + X : np.ndarray + The data set + labels : np.ndarray + The labels as predicted by a clustering algorithm + + Returns + ------- + tuple : (np.ndarray, np.ndarray) + The data set, + The predicted labels + """ + X, labels = check_X_y(X, labels) + labels = labels.astype(int) + n_pred_clusters = len(np.unique(labels)) + if n_pred_clusters == 1 or n_pred_clusters == X.shape[0]: + raise ValueError("The number of different labels must be within [2, n_samples -1]") + return X, labels diff --git a/clustpy/metrics/confusion_matrix.py b/clustpy/metrics/confusion_matrix.py index 1c34179..20c78f5 100644 --- a/clustpy/metrics/confusion_matrix.py +++ b/clustpy/metrics/confusion_matrix.py @@ -1,10 +1,10 @@ import numpy as np import matplotlib.pyplot as plt from scipy.optimize import linear_sum_assignment -from clustpy.metrics._metrics_utils import _check_number_of_points +from clustpy.metrics._metrics_utils import _check_labels_arrays -def _rearrange(confusion_matrix: np.ndarray) -> np.ndarray: +def _rearrange(confusion_matrix: np.ndarray) -> (np.ndarray, np.ndarray): """ Rearrange the confusion 
matrix in such a way that the sum of the diagonal is maximized. Thereby, the best matching combination of labels will be shown. @@ -20,27 +20,30 @@ def _rearrange(confusion_matrix: np.ndarray) -> np.ndarray: Returns ------- rearranged_confusion_matrix : np.ndarray - The rearranged confusion matrix. - If number of ground truth labels is larger than the number of predicted labels, the resulting confusion matrix will be quadradic with multiple 0 columns. + The rearranged confusion matrix + (If number of ground truth labels is larger than the number of predicted labels, the resulting confusion matrix will be quadradic with multiple 0 columns), + The indices regarding the rearrangement """ # Change order using the Hungarian Method max_number_labels = max(confusion_matrix.shape) rearranged_confusion_matrix = np.zeros((max_number_labels, max_number_labels), dtype=confusion_matrix.dtype) # Linear sum assignment tries to minimize the diagonal sum -> use negative confusion_matrix - rearranged_confusion_matrix[:confusion_matrix.shape[0], :confusion_matrix.shape[1]] = -confusion_matrix - indices = linear_sum_assignment(rearranged_confusion_matrix) - # Revert values back to positive range, change order of the columns - rearranged_confusion_matrix = -rearranged_confusion_matrix[:, indices[1]] + rearranged_confusion_matrix[:confusion_matrix.shape[0], :confusion_matrix.shape[1]] = confusion_matrix + indices = linear_sum_assignment(-rearranged_confusion_matrix) + # Change order of the columns + rearranged_order = indices[1] + rearranged_confusion_matrix = rearranged_confusion_matrix[:, rearranged_order] rearranged_confusion_matrix = rearranged_confusion_matrix[:confusion_matrix.shape[0], :] # If there are more columns than rows sort remaining columns by highest value if confusion_matrix.shape[1] > confusion_matrix.shape[0]: missing_columns = np.arange(confusion_matrix.shape[0], confusion_matrix.shape[1]) missing_columns_order = np.argsort(np.max(rearranged_confusion_matrix[:, 
missing_columns], axis=0))[::-1] rearranged_confusion_matrix[:, missing_columns] = rearranged_confusion_matrix[:, missing_columns[missing_columns_order]] - return rearranged_confusion_matrix + rearranged_order[missing_columns] = rearranged_order[missing_columns[missing_columns_order]] + return rearranged_confusion_matrix, rearranged_order -def _plot_confusion_matrix(confusion_matrix: np.ndarray, show_text: bool, figsize: tuple, cmap: str, textcolor: str, +def _plot_confusion_matrix(confusion_matrix: np.ndarray, show_text: bool, row_names : list, column_names : list, figsize: tuple, cmap: str, textcolor: str, vmin: float, vmax: float) -> None: """ Plot the confusion matrix. @@ -51,6 +54,10 @@ def _plot_confusion_matrix(confusion_matrix: np.ndarray, show_text: bool, figsiz The confusion matrix to plot show_text : bool Show the value in each cell as text + row_names : list + List of containing the names of the rows + column_names : list + List of containing the names of the columns figsize : tuple Tuple indicating the height and width of the plot cmap : str @@ -66,9 +73,17 @@ def _plot_confusion_matrix(confusion_matrix: np.ndarray, show_text: bool, figsiz If None, it will be set as the maximum value within the confusion matrix. Used to choose the color from the colormap """ + if len(row_names) != confusion_matrix.shape[0]: + raise ValueError("Length of the row names list must match the number of rows (ground turth clusters) in the confusion matrix. Length is {0} and number of rows is {1}".format(len(row_names), confusion_matrix.shape[0])) + if len(column_names) != confusion_matrix.shape[1]: + raise ValueError("Length of the column names list must match the number of columns (predicted clusters) in the confusion matrix. 
Length is {0} and number of columns is {1}".format(len(column_names), confusion_matrix.shape[1])) fig, ax = plt.subplots(figsize=figsize) # Plot confusion matrix using colors ax.imshow(confusion_matrix, cmap=cmap, vmin=vmin, vmax=vmax) + ax.set_xticks(np.arange(confusion_matrix.shape[1])) + ax.set_xticklabels(column_names) + ax.set_yticks(np.arange(confusion_matrix.shape[0])) + ax.set_yticklabels(row_names) # Optional: Add text to the color cells if show_text: for i in range(confusion_matrix.shape[0]): @@ -90,8 +105,9 @@ class ConfusionMatrix(): The ground truth labels of the data set labels_pred : np.ndarray The labels as predicted by a clustering algorithm - shape : tuple - Shape of the resulting confusion matrix (default: None) + shape : tuple | str | None + The desired shape of the confusion matrix. + Can be "square" to encforce a squared confusion matrix (default: None) Attributes ---------- @@ -99,22 +115,27 @@ class ConfusionMatrix(): The confusion matrix """ - def __init__(self, labels_true: np.ndarray, labels_pred: np.ndarray, shape: tuple=None): - _check_number_of_points(labels_true, labels_pred) - if np.any(labels_true < 0): - labels_true = labels_true.copy() - labels_true -= labels_true.min() - if np.any(labels_pred < 0): - labels_pred = labels_pred.copy() - labels_pred -= labels_pred.min() - labels_true = labels_true.astype(int) - labels_pred = labels_pred.astype(int) + def __init__(self, labels_true: np.ndarray, labels_pred: np.ndarray, shape: tuple | str | None=None): + labels_true, labels_pred = _check_labels_arrays(labels_true, labels_pred) + true_clusters, true_clusters_idx = np.unique(labels_true, return_inverse=True) + pred_clusters, pred_clusters_idx = np.unique(labels_pred, return_inverse=True) + self.true_clusters = true_clusters + self.pred_clusters = pred_clusters if shape is None: - conf_matrix = np.zeros((labels_true.max() + 1, labels_pred.max() + 1), dtype=int) + shape = (len(true_clusters), len(pred_clusters)) else: - assert len(shape) 
== 2 and shape[0] > labels_true.max() and shape[1] > labels_pred.max(), f"Shape must contain two values such that shape[0] > labels_true.max() and shape[1] > labels_true.max(). Your values: shape = {shape}, labels_true.max() = {labels_true.max()}, labels_pred.max() = {labels_pred.max()}" - conf_matrix = np.zeros(shape, dtype=int) - np.add.at(conf_matrix, (labels_true, labels_pred), 1) + if shape == "square": + max_labels = max(len(true_clusters), len(pred_clusters)) + shape = (max_labels, max_labels) + else: + assert len(shape) == 2 and shape[0] >= len(true_clusters) and shape[1] >= len(pred_clusters), f"Shape must be 'square' or a tuple containing two values such that shape[0] >= len(np.unique(labels_true)) and shape[1] >= len(np.unique(labels_pred)). Your values: shape = {shape}, len(np.unique(labels_true)) = {len(np.unique(labels_true))}, len(np.unique(labels_pred)) = {len(np.unique(labels_pred))}" + # Fill unique label information (self.true_clusters and self.pred_clusters) with -2 placeholders + if shape[0] > len(true_clusters): + self.true_clusters = np.append(self.true_clusters, [-2] * (shape[0] - len(true_clusters))) + if shape[1] > len(pred_clusters): + self.pred_clusters = np.append(self.pred_clusters, [-2] * (shape[1] - len(pred_clusters))) + conf_matrix = np.zeros(shape, dtype=int) + np.add.at(conf_matrix, (true_clusters_idx, pred_clusters_idx), 1) self.confusion_matrix = conf_matrix def __str__(self): @@ -148,12 +169,14 @@ def rearrange(self, inplace: bool = True) -> np.ndarray: The rearranged confusion matrix If number of ground truth labels is larer than the number of predicted labels, the resulting confusion matrix will be quadradic with multiple 0 columns. 
""" - rearranged_confusion_matrix = _rearrange(self.confusion_matrix) + rearranged_confusion_matrix, rearranged_order = _rearrange(self.confusion_matrix) if inplace: self.confusion_matrix = rearranged_confusion_matrix + self.pred_clusters = self.pred_clusters[rearranged_order[:len(self.pred_clusters)]] return rearranged_confusion_matrix - def plot(self, show_text: bool = True, figsize: tuple = (10, 10), cmap: str = "YlGn", textcolor: str = "black", + def plot(self, show_text: bool = True, ground_truth_names: list | None = None, + figsize: tuple = (10, 10), cmap: str = "YlGn", textcolor: str = "black", vmin: int = 0, vmax: int = None) -> None: """ Plot the confusion matrix. @@ -162,6 +185,8 @@ def plot(self, show_text: bool = True, figsize: tuple = (10, 10), cmap: str = "Y ---------- show_text : bool Show the value in each cell as text (default: True) + ground_truth_names : list | None + List of containing the names of the ground truth clusters figsize : tuple Tuple indicating the height and width of the plot (default: (10, 10)) cmap : str @@ -177,4 +202,6 @@ def plot(self, show_text: bool = True, figsize: tuple = (10, 10), cmap: str = "Y If None, it will be set as the maximum value within the confusion matrix. 
Used to choose the color from the colormap (default: None) """ - _plot_confusion_matrix(self.confusion_matrix, show_text, figsize, cmap, textcolor, vmin, vmax) + if ground_truth_names is None: + ground_truth_names = self.true_clusters + _plot_confusion_matrix(self.confusion_matrix, show_text, ground_truth_names, self.pred_clusters, figsize, cmap, textcolor, vmin, vmax) diff --git a/clustpy/metrics/external_clustering_metrics.py b/clustpy/metrics/external_clustering_metrics.py index 90855cb..5744e4b 100644 --- a/clustpy/metrics/external_clustering_metrics.py +++ b/clustpy/metrics/external_clustering_metrics.py @@ -3,7 +3,7 @@ from clustpy.metrics.confusion_matrix import ConfusionMatrix from scipy.special import comb from sklearn.metrics import normalized_mutual_info_score as nmi -from clustpy.metrics._metrics_utils import _check_number_of_points +from clustpy.metrics._metrics_utils import _check_labels_arrays def variation_of_information(labels_true: np.ndarray, labels_pred: np.ndarray) -> float: @@ -29,21 +29,17 @@ def variation_of_information(labels_true: np.ndarray, labels_pred: np.ndarray) - Meilă, Marina. "Comparing clusterings by the variation of information." Learning theory and kernel machines. Springer, Berlin, Heidelberg, 2003. 173-187. 
""" - _check_number_of_points(labels_true, labels_pred) + confusion_matrix = ConfusionMatrix(labels_true, labels_pred).confusion_matrix n = len(labels_true) - cluster_ids_true = np.unique(labels_true) - cluster_ids_pred = np.unique(labels_pred) - result = 0.0 - for id_true in cluster_ids_true: - points_in_cluster_gt = np.argwhere(labels_true == id_true)[:, 0] - p = len(points_in_cluster_gt) / n - for id_pred in cluster_ids_pred: - points_in_cluster_pred = np.argwhere(labels_pred == id_pred)[:, 0] - q = len(points_in_cluster_pred) / n - r = len([point for point in points_in_cluster_gt if point in points_in_cluster_pred]) / n - if r != 0: - result += r * (np.log(r / p) + np.log(r / q)) - vi = -1 * result + p = confusion_matrix.sum(1).reshape((-1, 1)) / n + q = confusion_matrix.sum(0).reshape((1, -1)) / n + r = confusion_matrix / n + # Consider zero entries + mask = (r == 0) + r[mask] = 1 + result = r * (np.log(r / p) + np.log(r / q)) + result[mask] = 0 + vi = -result.sum() return vi @@ -72,13 +68,9 @@ def unsupervised_clustering_accuracy(labels_true: np.ndarray, labels_pred: np.nd Yang, Yi, et al. "Image clustering using local discriminant models and global integration." IEEE Transactions on Image Processing 19.10 (2010): 2761-2773. """ - _check_number_of_points(labels_true, labels_pred) - max_label = int(max(labels_pred.max(), labels_true.max()) + 1) - match_matrix = np.zeros((max_label, max_label), dtype=np.int64) - for i in range(labels_true.shape[0]): - match_matrix[int(labels_true[i]), int(labels_pred[i])] -= 1 - indices = linear_sum_assignment(match_matrix) - acc = -np.sum(match_matrix[indices]) / labels_pred.size + confusion_matrix = ConfusionMatrix(labels_true, labels_pred, "square").confusion_matrix + indices = linear_sum_assignment(-confusion_matrix) + acc = np.sum(confusion_matrix[indices]) / len(labels_true) return acc @@ -110,17 +102,16 @@ def information_theoretic_external_cluster_validity_measure(labels_true: np.ndar Byron E. Dom. 2002. 
"An information-theoretic external cluster-validity measure." In Proceedings of the Eighteenth conference on Uncertainty in artificial intelligence (UAI'02). """ - _check_number_of_points(labels_true, labels_pred) # Build confusion matrix - cm = ConfusionMatrix(labels_true, labels_pred) + confusion_matrix = ConfusionMatrix(labels_true, labels_pred).confusion_matrix n_points = labels_true.shape[0] - n_classes = cm.confusion_matrix.shape[0] + n_classes = confusion_matrix.shape[0] # Get number of objects per predicted label - hks = np.sum(cm.confusion_matrix, axis=0) + hks = np.sum(confusion_matrix, axis=0) # Calculate Q_0 - cm_tmp = cm.confusion_matrix.copy() # Needed if some cells are 0 so log can be calculated + cm_tmp = confusion_matrix.copy() # Needed if some cells are 0 so log can be calculated cm_tmp[cm_tmp == 0] = 1 # will later be multiplied by 0, so this does not change the final result - empirical_conditional_entropy = cm.confusion_matrix / n_points * np.log(cm_tmp / hks) + empirical_conditional_entropy = confusion_matrix / n_points * np.log(cm_tmp / hks) empirical_conditional_entropy = - np.sum( empirical_conditional_entropy) # [~np.isnan(empirical_conditional_entropy)]) sum_binom_coefficient = np.sum([np.log(comb(hk + n_classes - 1, n_classes - 1)) for hk in hks]) @@ -128,7 +119,7 @@ def information_theoretic_external_cluster_validity_measure(labels_true: np.ndar if scale: # --- Scale Q_0 to (0, 1] --- # Get number of objects per ground truth label - hcs = np.sum(cm.confusion_matrix, axis=1) + hcs = np.sum(confusion_matrix, axis=1) # Calculate Q_2 min_Q_0 = np.sum([np.log(comb(hc + n_classes - 1, n_classes - 1)) for hc in hcs]) / n_points entropy_H_C = -np.sum([hc / n_points * np.log(hc / n_points) for hc in hcs]) @@ -164,7 +155,7 @@ def fair_normalized_mutual_information(labels_true: np.ndarray, labels_pred: np. Amelio, Alessia, and Clara Pizzuti. "Is normalized mutual information a fair measure for comparing community detection methods?." 
Proceedings of the 2015 IEEE/ACM international conference on advances in social networks analysis and mining 2015. 2015. """ - _check_number_of_points(labels_true, labels_pred) + labels_true, labels_pred = _check_labels_arrays(labels_true, labels_pred) # Get the normalized mutual information my_nmi = nmi(labels_true, labels_pred) # Get number of clusters @@ -199,7 +190,6 @@ def purity(labels_true: np.ndarray, labels_pred: np.ndarray) -> float: ------- Manning, Christopher D. An introduction to information retrieval. 2009. """ - _check_number_of_points(labels_true, labels_pred) conf_matrix = ConfusionMatrix(labels_true, labels_pred).confusion_matrix best_matches = np.max(conf_matrix, axis=0) purity = np.sum(best_matches) / labels_true.shape[0] diff --git a/clustpy/metrics/hierarchical_metrics.py b/clustpy/metrics/hierarchical_metrics.py index a8ec145..cdc3353 100644 --- a/clustpy/metrics/hierarchical_metrics.py +++ b/clustpy/metrics/hierarchical_metrics.py @@ -1,16 +1,45 @@ -from clustpy.hierarchical._cluster_tree import BinaryClusterTree +from clustpy.hierarchical._cluster_tree import BinaryClusterTree, _ClusterTreeNode from clustpy.metrics.confusion_matrix import ConfusionMatrix -from clustpy.metrics.external_clustering_metrics import purity -from clustpy.metrics._metrics_utils import _check_number_of_points +from clustpy.metrics._metrics_utils import _check_labels_arrays import numpy as np +def node_purity(node: _ClusterTreeNode, labels_true: np.ndarray, labels_pred: np.ndarray) -> float: + """ + Calculate the purity of this node within a Cluster Tree. + A leaf with no assigned points receives a purity score of 0. 
+ + Parameters + ---------- + node: _ClusterTreeNode + The node of a clustering tree + labels_true : np.ndarray + The ground truth labels of the data set + labels_pred : np.ndarray + The labels as predicted by a clustering algorithm + + Returns + ------- + node_purity : float + The node purity + """ + labels_true, labels_pred = _check_labels_arrays(labels_true, labels_pred) + samples_in_leaf = np.isin(labels_pred, node.labels) + if np.any(samples_in_leaf): + sizes_gt_matches = np.unique(labels_true[samples_in_leaf], return_counts=True)[1] + node_purity = sizes_gt_matches.max() / samples_in_leaf.sum() + else: + node_purity = 0. + return node_purity + + def leaf_purity( tree: BinaryClusterTree, labels_true: np.ndarray, labels_pred: np.ndarray ) -> float: """ Calculates the leaf purity of the tree. - Uses labels fromm leafs in the tree to calculate the purity (see clustpy.metrics.purity). + The leaf purity is equal to a weighted average of the maximum class purity across all leaves. + Uses labels from leafs in the tree to identify the most frequent ground truth class and weights the score by the size of the leaf. If each label contains a single label, this is equal to the standard purity metric. Parameters @@ -32,12 +61,14 @@ def leaf_purity( Mautz, Dominik, Claudia Plant, and Christian Böhm. "Deepect: The deep embedded cluster tree." Data Science and Engineering 5 (2020): 419-432. 
""" - _check_number_of_points(labels_true, labels_pred) + cm = ConfusionMatrix(labels_true, labels_pred) leaf_nodes, _ = tree.get_leaf_and_split_nodes() - labels_pred_adj = -np.ones(labels_pred.shape[0]) - for i, leaf_node in enumerate(leaf_nodes): - labels_pred_adj[np.isin(labels_pred, leaf_node.labels)] = i - leaf_purity = purity(labels_true, labels_pred_adj) + leaf_purity = 0 + for leaf_node in leaf_nodes: + relevant_columns = np.isin(cm.pred_clusters, leaf_node.labels) + column_sum = cm.confusion_matrix[:, relevant_columns].sum(1) + leaf_purity += column_sum.max() + leaf_purity = leaf_purity / len(labels_true) return leaf_purity @@ -74,7 +105,7 @@ def dendrogram_purity( """ if labels_pred is None: labels_pred = np.arange(labels_true.shape[0]) - _check_number_of_points(labels_true, labels_pred) + labels_true, labels_pred = _check_labels_arrays(labels_true, labels_pred) if type(dendrogram) is BinaryClusterTree: # Transform ClusterTree to sklearn dendrogram dendrogram = dendrogram.export_sklearn_dendrogram() diff --git a/clustpy/metrics/internal_clustering_metrics.py b/clustpy/metrics/internal_clustering_metrics.py index 6d46610..d2de64e 100644 --- a/clustpy/metrics/internal_clustering_metrics.py +++ b/clustpy/metrics/internal_clustering_metrics.py @@ -1,6 +1,7 @@ from sklearn.neighbors import NearestNeighbors from scipy.spatial.distance import pdist import numpy as np +from clustpy.metrics._metrics_utils import _check_length_data_and_labels def cvnn_score(X: np.ndarray, labels: np.ndarray | int | tuple, n_neighbors: int = 5, metric: str = "euclidean") -> float | np.ndarray: @@ -55,29 +56,26 @@ def _internal_cvnn_score(X: np.ndarray, labels: np.ndarray, nrbs_indices: np.nda tuple : (float, float) The cluster spearation and cluster compactness value """ + X, labels = _check_length_data_and_labels(X, labels) assert isinstance(labels, np.ndarray), "labels must be of type np.nddary. 
Your input has type {0}".format(type(labels)) unique_clusters = np.unique(labels) # Calculate neighbor weights n_neighbors = nrbs_indices.shape[1] - n_neighbors_not_in_cluster = np.zeros(X.shape[0]) - for k in range(n_neighbors): - n_neighbors_not_in_cluster += (labels != labels[nrbs_indices[:, k]]) - n_neighbors_not_in_cluster /= n_neighbors + n_neighbors_not_in_cluster = (labels.reshape((-1, 1)) != labels[nrbs_indices]).mean(1) cluster_separation_scores = np.zeros(unique_clusters.shape[0]) cluster_compactness_scores = np.zeros(unique_clusters.shape[0]) # Do per-cluster calculations - for c in unique_clusters: + for i, c in enumerate(unique_clusters): in_cluster = (labels == c) # Calculate separation (mean of neighbor weights in cluster) - cluster_separation_scores[c] = n_neighbors_not_in_cluster[in_cluster].mean() + cluster_separation_scores[i] = n_neighbors_not_in_cluster[in_cluster].mean() # Calculate compartness (mean of pair-wise distances in cluster) X_in_cluster = X[in_cluster] if X_in_cluster.shape[0] > 1: cluster_distances = pdist(X_in_cluster, metric=metric) - in_cluster_pairs = (X_in_cluster.shape[0] * (X_in_cluster.shape[0] - 1)) / 2 - cluster_compactness_scores[c] = cluster_distances.sum() / in_cluster_pairs + cluster_compactness_scores[i] = cluster_distances.mean() else: - cluster_compactness_scores[c] = 0 + cluster_compactness_scores[i] = 0 # Calculate final CVNN cluster_separation_final = cluster_separation_scores.max() cluster_compactness_final = cluster_compactness_scores.sum() @@ -97,7 +95,12 @@ def _internal_cvnn_score(X: np.ndarray, labels: np.ndarray, nrbs_indices: np.nda cluster_separations[i] = cluster_separation_l cluster_compactnesses[i] = cluster_compactness_l # Normalize scores - cvnn = cluster_separations / cluster_separations.max() + cluster_compactnesses / cluster_compactnesses.max() + max_cluster_separations = cluster_separations.max() + max_cluster_compactnesses = cluster_compactnesses.max() + if max_cluster_separations != 0 and 
max_cluster_compactnesses != 0: + cvnn = cluster_separations / max_cluster_separations + cluster_compactnesses / max_cluster_compactnesses + else: + cvnn = 0 elif isinstance(labels, np.ndarray): # Do not normalize scores cluster_separation, cluster_compactness = _internal_cvnn_score(X, labels, nrbs_indices, metric) diff --git a/clustpy/metrics/multipe_labelings_scoring.py b/clustpy/metrics/multipe_labelings_scoring.py index 0409c08..21b8d18 100644 --- a/clustpy/metrics/multipe_labelings_scoring.py +++ b/clustpy/metrics/multipe_labelings_scoring.py @@ -1,5 +1,5 @@ import numpy as np -from clustpy.metrics._metrics_utils import _check_number_of_points +from clustpy.metrics._metrics_utils import _check_labels_arrays from clustpy.metrics.pair_counting_scores import PairCountingScores, _f1_score, _recall_score, _precision_score, \ _rand_score, _jaccard_score from sklearn.metrics import normalized_mutual_info_score as nmi @@ -251,7 +251,7 @@ def _get_multiple_labelings_pair_counting_categories(labels_true: np.ndarray, la The number of false negatives, The number of true negatives """ - _check_number_of_points(labels_true, labels_pred) + labels_true, labels_pred = _check_labels_arrays(labels_true, labels_pred, allow_2d_labels=True) if labels_true.ndim == 1: labels_true = labels_true.reshape((-1, 1)) if labels_pred.ndim == 1: @@ -393,7 +393,7 @@ class MultipleLabelingsConfusionMatrix(ConfusionMatrix): def __init__(self, labels_true: np.ndarray, labels_pred: np.ndarray, metric: Callable = nmi, remove_noise_spaces: bool = True, metric_params: dict = {}): - _check_number_of_points(labels_true, labels_pred) + labels_true, labels_pred = _check_labels_arrays(labels_true, labels_pred, allow_2d_labels=True) assert type(metric_params) is dict, "metric_params must be a dict" assert callable(metric), "metric must be a method" # Reshape labels if we have only a single set of labels @@ -413,8 +413,11 @@ def __init__(self, labels_true: np.ndarray, labels_pred: np.ndarray, metric: Cal for 
j in range(labels_pred.shape[1]): confusion_matrix[i, j] = metric(labels_true[:, i], labels_pred[:, j], **metric_params) self.confusion_matrix = confusion_matrix + self.true_clusters = np.arange(labels_true.shape[1]) + self.pred_clusters = np.arange(labels_pred.shape[1]) - def plot(self, show_text: bool = True, figsize: tuple = (10, 10), cmap: str = "YlGn", textcolor: str = "black", + def plot(self, show_text: bool = True, ground_truth_names: list | None = None, + figsize: tuple = (10, 10), cmap: str = "YlGn", textcolor: str = "black", vmin: float = 0.0, vmax: float = 1.0) -> None: """ Plot the Multiple Labelings Confusion Matrix. @@ -424,6 +427,8 @@ def plot(self, show_text: bool = True, figsize: tuple = (10, 10), cmap: str = "Y ---------- show_text : bool Show the value in each cell as text (default: True) + ground_truth_names : list | None + List of containing the names of the ground truth cluster sets figsize : tuple Tuple indicating the height and width of the plot (default: (10, 10)) cmap : str @@ -439,7 +444,9 @@ def plot(self, show_text: bool = True, figsize: tuple = (10, 10), cmap: str = "Y If None, it will be set as the maximum value within the confusion matrix. 
Used to choose the color from the colormap (default: 1.0) """ - _plot_confusion_matrix(self.confusion_matrix, show_text, figsize, cmap, textcolor, vmin=vmin, vmax=vmax) + if ground_truth_names is None: + ground_truth_names = self.true_clusters + _plot_confusion_matrix(self.confusion_matrix, show_text, ground_truth_names, self.pred_clusters, figsize, cmap, textcolor, vmin=vmin, vmax=vmax) def aggregate(self, aggregation_strategy: str = "max") -> float: """ @@ -488,9 +495,9 @@ def aggregate(self, aggregation_strategy: str = "max") -> float: if aggregation_strategy == "permut-max": # Linear sum assignment tries to minimize the diagonal sum -> use negative confusion_matrix rearranged_confusion_matrix[:self.confusion_matrix.shape[0], - :self.confusion_matrix.shape[1]] = -self.confusion_matrix - indices = linear_sum_assignment(rearranged_confusion_matrix) - rearranged_confusion_matrix = -rearranged_confusion_matrix[:, indices[1]] + :self.confusion_matrix.shape[1]] = self.confusion_matrix + indices = linear_sum_assignment(-rearranged_confusion_matrix) + rearranged_confusion_matrix = rearranged_confusion_matrix[:, indices[1]] else: rearranged_confusion_matrix[:self.confusion_matrix.shape[0], :self.confusion_matrix.shape[1]] = self.confusion_matrix @@ -536,9 +543,9 @@ def is_multi_labelings_n_clusters_correct(labels_true: np.ndarray, labels_pred: Parameters ---------- labels_true : np.ndarray - The true set of labelings. Shape must match (n_samples, n_subspaces) + The true set of labelings. Shape must match (n_samples, n_labelings) labels_pred : np.ndarray - The predicted set of labelings. Shape must match (n_samples, n_subspaces) + The predicted set of labelings. 
Shape must match (n_samples, n_labelings) check_subset : bool Boolean defines if it is sufficient if a subset of n_clusters_pred is equal to n_clusters_true (default: True) remove_noise_spaces : bool @@ -549,7 +556,7 @@ def is_multi_labelings_n_clusters_correct(labels_true: np.ndarray, labels_pred: is_equal : bool Boolean indicating if the number of clusters of labels_true and labels_pred matches """ - _check_number_of_points(labels_true, labels_pred) + labels_true, labels_pred = _check_labels_arrays(labels_true, labels_pred, allow_2d_labels=True) if labels_true.ndim == 1: labels_true = labels_true.reshape((-1, 1)) if labels_pred.ndim == 1: diff --git a/clustpy/metrics/pair_counting_scores.py b/clustpy/metrics/pair_counting_scores.py index ddae0b2..e3dbcff 100644 --- a/clustpy/metrics/pair_counting_scores.py +++ b/clustpy/metrics/pair_counting_scores.py @@ -1,4 +1,4 @@ -from clustpy.metrics._metrics_utils import _check_number_of_points +from clustpy.metrics._metrics_utils import _check_labels_arrays import numpy as np """ @@ -333,9 +333,7 @@ def _get_pair_counting_categories(labels_true: np.ndarray, labels_pred: np.ndarr The number of false negatives, The number of true negatives """ - _check_number_of_points(labels_true, labels_pred) - if labels_true.ndim != 1 or labels_pred.ndim != 1: - raise Exception("labels_true and labels_pred labels should just contain a single column.") + labels_true, labels_pred = _check_labels_arrays(labels_true, labels_pred) n_tp = 0 n_fp = 0 n_fn = 0 @@ -384,7 +382,6 @@ class PairCountingScores(): """ def __init__(self, labels_true: np.ndarray, labels_pred: np.ndarray): - _check_number_of_points(labels_true, labels_pred) n_tp, n_fp, n_fn, n_tn = _get_pair_counting_categories(labels_true, labels_pred) self.n_tp = n_tp self.n_fp = n_fp diff --git a/clustpy/metrics/tests/test_confusion_matrix.py b/clustpy/metrics/tests/test_confusion_matrix.py index a7d34d1..5f5c37a 100644 --- a/clustpy/metrics/tests/test_confusion_matrix.py +++ 
b/clustpy/metrics/tests/test_confusion_matrix.py @@ -1,7 +1,8 @@ import numpy as np from clustpy.metrics import ConfusionMatrix -from clustpy.metrics.confusion_matrix import _rearrange +from clustpy.metrics.confusion_matrix import _rearrange, _plot_confusion_matrix from unittest.mock import patch +import pytest def test_rearrange(): @@ -10,31 +11,34 @@ def test_rearrange(): [0, 0, 0, 50], [30, 10, 5, 5], [5, 5, 35, 5]]) - rearranged_confusion_matrix = _rearrange(confusion_matrix) + rearranged_confusion_matrix, rearrange_order = _rearrange(confusion_matrix) assert np.array_equal(rearranged_confusion_matrix, np.array([[45, 2, 0, 3], [0, 50, 0, 0], [10, 5, 30, 5], [5, 5, 5, 35]])) + assert np.array_equal(rearrange_order, np.array([1, 3, 0, 2])) # More prediction labels than ground truth confusion_matrix = np.array([[0, 10, 45, 3, 2, 25], [0, 10, 0, 0, 50, 25], [30, 10, 10, 5, 5, 25], [5, 10, 5, 35, 5, 25]]) - rearranged_confusion_matrix = _rearrange(confusion_matrix) + rearranged_confusion_matrix, rearrange_order = _rearrange(confusion_matrix) assert np.array_equal(rearranged_confusion_matrix, np.array([[45, 2, 0, 3, 25, 10], [0, 50, 0, 0, 25, 10], [10, 5, 30, 5, 25, 10], [5, 5, 5, 35, 25, 10]])) + assert np.array_equal(rearrange_order, np.array([2, 4, 0, 3, 5, 1])) # More ground truth labels than prediction confusion_matrix = np.array([[0, 3, 2], [0, 0, 50], [30, 5, 5], [5, 35, 5]]) - rearranged_confusion_matrix = _rearrange(confusion_matrix) + rearranged_confusion_matrix, rearrange_order = _rearrange(confusion_matrix) assert np.array_equal(rearranged_confusion_matrix, np.array([[0, 2, 0, 3], [0, 50, 0, 0], [0, 5, 30, 5], [0, 5, 5, 35]])) + assert np.array_equal(rearrange_order, np.array([3, 2, 0, 1])) """ @@ -60,25 +64,50 @@ def test_confusion_matrix_object(): [0, 1, 0, 1]]) assert np.array_equal(cm.confusion_matrix, expected_cm) # Third test - labels_true = np.array([0, 1, 2, -3, 0, 1, 2, -3]) + labels_true = np.array([0, 1, 2, -1, 0, 1, 2, -1]) labels_pred = 
np.array([0, 0, -1, -1, 2, 2, -1, 3]) cm = ConfusionMatrix(labels_true, labels_pred) - expected_cm = np.array([[1, 0, 0, 0, 1], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 1, 0, 1, 0], - [0, 1, 0, 1, 0], - [2, 0, 0, 0, 0]]) + expected_cm = np.array([[1, 0, 0, 1], + [0, 1, 1, 0], + [0, 1, 1, 0], + [2, 0, 0, 0]]) assert np.array_equal(cm.confusion_matrix, expected_cm) + +def test_confusion_matrix_object_with_shape(): + # First test + labels_true = np.array([0, 0, 0, 0, 1, 1, 1, 1]) + labels_pred = np.array([0, 0, 1, 1, 2, 2, 3, 3]) + cm = ConfusionMatrix(labels_true, labels_pred, "square") + expected_cm = np.array([[2, 2, 0, 0], + [0, 0, 2, 2], + [0, 0, 0, 0], + [0, 0, 0, 0]]) + assert np.array_equal(cm.confusion_matrix, expected_cm) + assert np.array_equal(cm.true_clusters, np.array([0,1,-2,-2])) + assert np.array_equal(cm.pred_clusters, np.array([0,1,2,3])) + # Second test + cm = ConfusionMatrix(labels_pred, labels_true, (5, 6)) + expected_cm = np.array([[2, 0, 0, 0, 0, 0], + [2, 0, 0, 0, 0, 0], + [0, 2, 0, 0, 0, 0], + [0, 2, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]]) + assert np.array_equal(cm.confusion_matrix, expected_cm) + assert np.array_equal(cm.true_clusters, np.array([0,1,2,3,-2])) + assert np.array_equal(cm.pred_clusters, np.array([0,1,-2,-2,-2,-2])) + + def test_confusion_matrix_rearrange(): labels_true = np.array([0, 1, 2, 3, 0, 1, 2, 3]) - labels_pred = np.array([0, 0, 1, 1, 2, 2, 3, 3]) + labels_pred = np.array([-1, -1, 1, 1, 2, 2, 3, 3]) cm = ConfusionMatrix(labels_true, labels_pred) cm_copy = cm.confusion_matrix.copy() rearranged_cm = cm.rearrange(inplace=False) assert np.array_equal(cm.confusion_matrix, cm_copy) + assert np.array_equal(cm.true_clusters, np.array([0, 1, 2, 3])) + assert np.array_equal(cm.pred_clusters, np.array([-1, 1, 2, 3])) expected_rearranged_cm = np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 1, 1], @@ -86,6 +115,20 @@ def test_confusion_matrix_rearrange(): assert np.array_equal(rearranged_cm, expected_rearranged_cm) rearranged_cm = 
cm.rearrange(inplace=True) assert np.array_equal(cm.confusion_matrix, rearranged_cm) + assert np.array_equal(cm.pred_clusters, np.array([-1, 2, 1, 3])) + + +@patch("matplotlib.pyplot.show") # Used to test plots (show will not be called) +def test_plot_confusion_matrix(mock_fig): + cm = np.array([[1, 0, 1, 0], + [1, 0, 1, 0], + [0, 1, 0, 1], + [0, 1, 0, 1]]) + with pytest.raises(ValueError): + _plot_confusion_matrix(cm, True, ["One", "Two"], ["One", "Two", "Three", "Four"], (5,5), "YlGn", "red", "0", "100") + with pytest.raises(ValueError): + _plot_confusion_matrix(cm, True, ["One", "Two", "Three", "Four"], ["One", "Two"], (5,5), "YlGn", "red", "0", "100") + assert None == _plot_confusion_matrix(cm, True, ["One", "Two", "Three", "Four"], ["One", "Two", "Three", "Four"], (5,5), "YlGn", "red", "0", "100") @patch("matplotlib.pyplot.show") # Used to test plots (show will not be called) @@ -93,4 +136,4 @@ def test_confusion_matrix_plot(mock_fig): labels_true = np.array([0, 0, 0, 0, 1, 1, 1, 1]) labels_pred = np.array([0, 0, 1, 1, 2, 2, 3, 3]) cm = ConfusionMatrix(labels_true, labels_pred) - assert None == cm.plot() + assert None == cm.plot() \ No newline at end of file diff --git a/clustpy/metrics/tests/test_external_clustering_metrics.py b/clustpy/metrics/tests/test_external_clustering_metrics.py index 7327672..0729b78 100644 --- a/clustpy/metrics/tests/test_external_clustering_metrics.py +++ b/clustpy/metrics/tests/test_external_clustering_metrics.py @@ -8,14 +8,16 @@ def test_unsupervised_clustering_accuracy(): l1 = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]) l2 = np.array([1, 1, 2, 2, 3, 3, 4, 4, 0, 0]) assert unsupervised_clustering_accuracy(l1, l2) == 1.0 - l2 = np.array([0, 0, 1, 1, 1, 2, 3, 3, 4, 4]) + l2 = np.array([-1, -1, 1, 1, 1, 2, 3, 3, 4, 4]) assert unsupervised_clustering_accuracy(l1, l2) == 0.9 l2 = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) assert unsupervised_clustering_accuracy(l1, l2) == 0.5 l2 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) assert 
unsupervised_clustering_accuracy(l1, l2) == 0.2 - l2 = np.array([0, 0, 0, 0, 2, 2, 3, 3, 4, 1]) + l2 = np.array([0, 0, 0, 0, 2, 2, 3, 3, 4, -1]) assert unsupervised_clustering_accuracy(l1, l2) == 0.7 + l2 = np.array([4, 4, 4, 1, 2, 2, 3, 3, 0, -1]) + assert unsupervised_clustering_accuracy(l1, l2) == 0.8 def test_variation_of_information(): @@ -23,7 +25,7 @@ def test_variation_of_information(): l2 = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]) assert variation_of_information(l1, l2) == 0.0 l1 = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - l2 = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0]) + l2 = np.array([1, 1, 1, 1, 1, -1, -1, -1, -1, -1]) assert variation_of_information(l1, l2) == 0.0 l1 = np.array([1, 1, 1, 1, 0, 0, 0, 0]) l2 = np.array([0, 0, 1, 1, 1, 1, 1, 1]) @@ -38,10 +40,13 @@ def test_information_theoretic_external_cluster_validity_measure(): scaled_result_1 = information_theoretic_external_cluster_validity_measure(l1, l2, True) assert scaled_result_1 == 1.0 # Medium cluster result - l2 = np.array([0, 0, 1, 1, 1, 2, 3, 3, 3, 4]) + l2 = np.array([-1, -1, 1, 1, 1, 2, 3, 3, 3, 4]) non_scaled_result_2 = information_theoretic_external_cluster_validity_measure(l1, l2, False) scaled_result_2 = information_theoretic_external_cluster_validity_measure(l1, l2) assert scaled_result_2 >= 0 and scaled_result_2 <= 1 + l2 = np.array([0, 0, 1, 1, 1, 2, 3, 3, 3, 4]) + assert non_scaled_result_2 == information_theoretic_external_cluster_validity_measure(l1, l2, False) + assert scaled_result_2 == information_theoretic_external_cluster_validity_measure(l1, l2) # Poor cluster result l2 = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 0]) non_scaled_result_3 = information_theoretic_external_cluster_validity_measure(l1, l2, False) @@ -56,10 +61,12 @@ def test_fair_normalized_mutual_information(): l2 = np.array([1, 1, 2, 2, 3, 3, 4, 4, 0, 0]) fnmi1 = fair_normalized_mutual_information(l1, l2) assert fnmi1 == 1.0 - l2 = np.array([0, 0, 1, 1, 1, 2, 3, 3, 4, 4]) + l2 = np.array([-1, -1, 1, 1, 1, 2, 3, 3, 
4, 4]) fnmi2 = fair_normalized_mutual_information(l1, l2) assert fnmi2 < fnmi1 assert fnmi2 == nmi(l1, l2) + l2 = np.array([0, 0, 1, 1, 1, 2, 3, 3, 4, 4]) + assert fnmi2 == fair_normalized_mutual_information(l1, l2) l2 = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) fnmi3 = fair_normalized_mutual_information(l1, l2) assert fnmi3 < fnmi2 @@ -74,7 +81,7 @@ def test_purity(): l1 = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]) l2 = np.array([1, 1, 2, 2, 3, 3, 4, 4, 0, 0]) assert purity(l1, l2) == 1.0 - l2 = np.array([0, 0, 1, 1, 1, 2, 3, 3, 4, 4]) + l2 = np.array([-1, -1, 1, 1, 1, 2, 3, 3, 4, 4]) assert purity(l1, l2) == 0.9 l2 = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) assert purity(l1, l2) == 1.0 diff --git a/clustpy/metrics/tests/test_hierarchical_metrics.py b/clustpy/metrics/tests/test_hierarchical_metrics.py index 9eed73e..02bf0b7 100644 --- a/clustpy/metrics/tests/test_hierarchical_metrics.py +++ b/clustpy/metrics/tests/test_hierarchical_metrics.py @@ -1,9 +1,31 @@ -from clustpy.metrics import dendrogram_purity, leaf_purity +from clustpy.metrics import dendrogram_purity, leaf_purity, node_purity from clustpy.metrics.hierarchical_metrics import _get_parent_matrix from clustpy.hierarchical._cluster_tree import BinaryClusterTree import numpy as np +def test_node_purity(): + bct = BinaryClusterTree() + node_023, node_145 = bct.split_cluster(0) + node_03, node_2 = bct.split_cluster(0) + node_0, node_3 = bct.split_cluster(0) + node_15, node_4 = bct.split_cluster(1) + node_1, node_5 = bct.split_cluster(1) + l1 = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]) + l2 = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 0, 0, 0]) + assert node_purity(bct.root_node_, l1, l2) == 1/5 + assert node_purity(node_023, l1, l2) == 1/3 + assert node_purity(node_145, l1, l2) == 1/2 + assert node_purity(node_03, l1, l2) == 1/2 + assert node_purity(node_2, l1, l2) == 1. + assert node_purity(node_0, l1, l2) == 1. + assert node_purity(node_3, l1, l2) == 1. 
+ assert node_purity(node_15, l1, l2) == 1. + assert node_purity(node_4, l1, l2) == 1. + assert node_purity(node_1, l1, l2) == 1. + assert node_purity(node_5, l1, l2) == 0. + + def test_leaf_purity(): bct = BinaryClusterTree() bct.split_cluster(0) diff --git a/clustpy/metrics/tests/test_metrics_utils.py b/clustpy/metrics/tests/test_metrics_utils.py index c73376e..63640ef 100644 --- a/clustpy/metrics/tests/test_metrics_utils.py +++ b/clustpy/metrics/tests/test_metrics_utils.py @@ -1,10 +1,34 @@ -from clustpy.metrics.external_clustering_metrics import _check_number_of_points +from clustpy.metrics._metrics_utils import _check_labels_arrays, _check_length_data_and_labels import pytest import numpy as np -def test_check_number_of_points(): - l1 = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]) +def test_check_labels_arrays(): + l1 = np.array([0., 0., 1., 1., 2., 2., 3., 3., 4., 4.]) + assert l1.dtype == float l2 = np.array([0, 0, 1, 1, 1, 2, 3, 3, 4, 4]) - assert _check_number_of_points(l1, l2) == True - with pytest.raises(Exception): - _check_number_of_points(l1, l2[1:]) \ No newline at end of file + l1, l2 =_check_labels_arrays(l1, l2) + assert l1.dtype == int and l2.dtype == int + with pytest.raises(ValueError): + _check_labels_arrays(l1, l2[1:]) + l3 = np.c_[l1, l2] + with pytest.raises(ValueError): + _check_labels_arrays(l1, l3) + l1, l3 =_check_labels_arrays(l1, l3, allow_2d_labels = True) + assert l1.shape == (10, ) and l3.shape == (10, 2) + l3, l1 =_check_labels_arrays(l3, l1, allow_2d_labels = True) + assert l1.shape == (10, ) and l3.shape == (10, 2) + l3, l4 =_check_labels_arrays(l3, l3, allow_2d_labels = True) + assert l3.shape == (10, 2) and l4.shape == (10, 2) + + +def test_check_length_data_and_labels(): + l1 = np.array([0., 0., 1., 1., 2., 2., 3., 3., 4., 4.]) + assert l1.dtype == float + X = np.array([[0., 2.], [1., 2.], [2., 3.], [3., 4.], [4., 5.], [5., 6.], [6., 7.], [7., 8.], [8., 9.], [9., 10.]]) + print(X.shape) + X, l1 =_check_length_data_and_labels(X, 
l1) + assert X.dtype == float and l1.dtype == int + with pytest.raises(ValueError): + _check_length_data_and_labels(X, l1[1:]) + with pytest.raises(ValueError): + _check_length_data_and_labels(X, np.array([0] * 10)) diff --git a/clustpy/metrics/tests/test_multiple_labelings_scoring.py b/clustpy/metrics/tests/test_multiple_labelings_scoring.py index 79eb378..8f18f38 100644 --- a/clustpy/metrics/tests/test_multiple_labelings_scoring.py +++ b/clustpy/metrics/tests/test_multiple_labelings_scoring.py @@ -82,9 +82,9 @@ def test_is_multi_labelings_n_clusters_correct(): labels_true = np.array([[0, 0, 0, 0, 1], [0, 0, -1, 1, 2], [0, 0, 0, 0, 0]]).T - labels_pred = np.array([[[0, 0, -1, 0, 1], + labels_pred = np.array([[0, 0, -1, 0, 1], [0, 0, 0, 1, 2], - [0, 0, 1, 2, 3]]]).T + [0, 0, 1, 2, 3]]).T assert is_multi_labelings_n_clusters_correct(labels_true, labels_pred, check_subset=True, remove_noise_spaces=True) == True assert is_multi_labelings_n_clusters_correct(labels_true, labels_pred, check_subset=True, diff --git a/clustpy/utils/checks.py b/clustpy/utils/checks.py index 76ce289..ef681d8 100644 --- a/clustpy/utils/checks.py +++ b/clustpy/utils/checks.py @@ -1,4 +1,4 @@ -from sklearn.utils.estimator_checks import check_estimator +from sklearn.utils.estimator_checks import estimator_checks_generator from sklearn.base import BaseEstimator import numpy as np from sklearn.utils import check_X_y, check_array, check_random_state @@ -16,7 +16,7 @@ def check_clustpy_estimator(estimator_obj: BaseEstimator, checks_to_ignore: tupl checks_to_ignore : tuple | list List containing the names of checks to ignore (default: ("check_complex_data")) """ - all_checks = check_estimator(estimator_obj, True) + all_checks = estimator_checks_generator(estimator_obj) for estimator, check in all_checks: check_name = check.func.__name__ if not check_name in checks_to_ignore: diff --git a/clustpy/utils/dip.c b/clustpy/utils/dip.c index b11f6f3..32420ee 100644 --- a/clustpy/utils/dip.c +++ 
b/clustpy/utils/dip.c @@ -45,6 +45,7 @@ Compile Windows: cc -fPIC -shared -std=c99 -o dip.dll dip.c Compile Linux: cc -fPIC -shared -o dip.so dip.c */ +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #include #include @@ -306,13 +307,13 @@ static PyObject *method_c_diptest(PyObject *self, PyObject *args) { return NULL; } // Convert PyObjects to C arrays - c_x = (double*)py_x->data; - c_low_high = (int*)py_low_high->data; - c_modaltriangle = (int*)py_modaltriangle->data; - c_gcm = (int*)py_gcm->data; - c_lcm = (int*)py_lcm->data; - c_mn = (int*)py_mn->data; - c_mj = (int*)py_mj->data; + c_x = (double*)PyArray_DATA(py_x); + c_low_high = (int*)PyArray_DATA(py_low_high); + c_modaltriangle = (int*)PyArray_DATA(py_modaltriangle); + c_gcm = (int*)PyArray_DATA(py_gcm); + c_lcm = (int*)PyArray_DATA(py_lcm); + c_mn = (int*)PyArray_DATA(py_mn); + c_mj = (int*)PyArray_DATA(py_mj); // Execute C diptest method double dip_value = fast_diptest(c_x, c_low_high, c_modaltriangle, c_gcm, c_lcm, c_mn, c_mj, n, debug); // Return dip value @@ -334,5 +335,8 @@ static struct PyModuleDef diptestModule = { PyMODINIT_FUNC PyInit_dipModule(void) { import_array(); + if (PyErr_Occurred()) { + return NULL; + } return PyModule_Create(&diptestModule); }; diff --git a/clustpy/utils/diptest.py b/clustpy/utils/diptest.py index 2d191d2..b993e62 100644 --- a/clustpy/utils/diptest.py +++ b/clustpy/utils/diptest.py @@ -107,15 +107,18 @@ def _dip_c_impl(X: np.ndarray, debug: bool) -> (float, tuple, tuple, np.ndarray, The minorant values, The majorant values """ + # Ensure X is float64 and contiguous + X_input = np.ascontiguousarray(X, dtype=np.float64) + n = X_input.shape[0] # Create reference numpy arrays - modal_interval = np.zeros(2, dtype=np.int32) - modal_triangle = -np.ones(3, dtype=np.int32) - gcm = np.zeros(X.shape, dtype=np.int32) - lcm = np.zeros(X.shape, dtype=np.int32) - mj = np.zeros(X.shape, dtype=np.int32) - mn = np.zeros(X.shape, dtype=np.int32) + modal_interval = np.zeros(2, 
dtype=np.int32, order='C') + modal_triangle = -np.ones(3, dtype=np.int32, order='C') + gcm = np.zeros(n, dtype=np.int32, order='C') + lcm = np.zeros(n, dtype=np.int32, order='C') + mj = np.zeros(n, dtype=np.int32, order='C') + mn = np.zeros(n, dtype=np.int32, order='C') # Execute C function - dip_value = c_diptest(X.astype(np.float64), modal_interval, modal_triangle, gcm, lcm, mn, mj, X.shape[0], + dip_value = c_diptest(X_input, modal_interval, modal_triangle, gcm, lcm, mn, mj, n, 1 if debug else 0) return dip_value, (modal_interval[0], modal_interval[1]), ( modal_triangle[0], modal_triangle[1], modal_triangle[2]), gcm, lcm, mn, mj diff --git a/clustpy/utils/evaluation.py b/clustpy/utils/evaluation.py index 6996048..d95d205 100644 --- a/clustpy/utils/evaluation.py +++ b/clustpy/utils/evaluation.py @@ -219,7 +219,7 @@ def evaluate_dataset(X: np.ndarray, evaluation_algorithms: list, evaluation_metr 1]) == 1, "Some names of your metrics do not seem to be unique! Note that metrics must not be named 'runtime' or 'n_clusters'" header = pd.MultiIndex.from_product([algo_names, metric_names], names=["algorithm", "metric"]) value_placeholder = np.zeros((n_repetitions, len(algo_names) * len(metric_names))) - df = pd.DataFrame(value_placeholder, columns=header, index=range(n_repetitions)) + df = pd.DataFrame(value_placeholder, columns=header, index=[str(rep) for rep in range(n_repetitions)]) for eval_algo in evaluation_algorithms: automatically_set_n_clusters = False try: @@ -336,11 +336,11 @@ def evaluate_dataset(X: np.ndarray, evaluation_algorithms: list, evaluation_metr if X_test is not None and labels_predicted_test is not None: result_test = eval_metric.method(X_test, labels_true_test, labels_predicted_test, algo_obj, **eval_metric.params) - df.at[rep, (eval_algo.name, eval_metric.name)] = result + df.at[str(rep), (eval_algo.name, eval_metric.name)] = result if not quiet: print("-- {0}: {1}".format(eval_metric.name, result)) if X_test is not None and labels_predicted_test 
is not None: - df.at[rep, (eval_algo.name, eval_metric.name + "_TEST")] = result_test + df.at[str(rep), (eval_algo.name, eval_metric.name + "_TEST")] = result_test if not quiet: print("-- {0} (TEST): {1}".format(eval_metric.name, result_test)) except Exception as e: @@ -348,28 +348,29 @@ def evaluate_dataset(X: np.ndarray, evaluation_algorithms: list, evaluation_metr print("Metric {0} raised an exception and will be skipped".format(eval_metric.name)) print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(e).__name__, e) if add_runtime: - df.at[rep, (eval_algo.name, "runtime")] = runtime + df.at[str(rep), (eval_algo.name, "runtime")] = runtime if not quiet: print("-- runtime: {0}".format(runtime)) if add_n_clusters: n_clusters = _get_n_clusters_from_algo(algo_obj) - df.at[rep, (eval_algo.name, "n_clusters")] = n_clusters + df.at[str(rep), (eval_algo.name, "n_clusters")] = n_clusters if not quiet: print("-- n_clusters: {0}".format(n_clusters)) if eval_algo.deterministic: for element in range(1, n_repetitions): - if add_runtime: - df.at[element, (eval_algo.name, "runtime")] = df.at[ - 0, (eval_algo.name, "runtime")] - if add_n_clusters: - df.at[element, (eval_algo.name, "n_clusters")] = df.at[ - 0, (eval_algo.name, "n_clusters")] + for eval_metric in evaluation_metrics: - df.at[element, (eval_algo.name, eval_metric.name)] = df.at[ - 0, (eval_algo.name, eval_metric.name)] + df.at[str(element), (eval_algo.name, eval_metric.name)] = df.at[ + "0", (eval_algo.name, eval_metric.name)] if X_test is not None: - df.at[element, (eval_algo.name, eval_metric.name + "_TEST")] = df.at[ - 0, (eval_algo.name, eval_metric.name + "_TEST")] + df.at[str(element), (eval_algo.name, eval_metric.name + "_TEST")] = df.at[ + "0", (eval_algo.name, eval_metric.name + "_TEST")] + if add_runtime: + df.at[str(element), (eval_algo.name, "runtime")] = df.at[ + "0", (eval_algo.name, "runtime")] + if add_n_clusters: + df.at[str(element), (eval_algo.name, "n_clusters")] = df.at[ + "0", 
(eval_algo.name, "n_clusters")] break except Exception as e: if not quiet: @@ -606,8 +607,8 @@ def _get_data_and_labels_from_evaluation_dataset(data_input: np.ndarray, data_lo return X, labels_true, X_test, labels_true_test -def evaluation_df_to_latex_table(df: pd.DataFrame | str, relevant_row : str | int = "mean", output_path: str = None, pm_row: str | int | None = "std", - bracket_row: str | int | None = None, best_in_bold: bool = True, second_best_underlined: bool = True, +def evaluation_df_to_latex_table(df: pd.DataFrame | str, relevant_row : str = "mean", output_path: str = None, pm_row: str | None = "std", + bracket_row: str | None = None, best_in_bold: bool = True, second_best_underlined: bool = True, third_best_dashed_underlined: bool = False, color_by_value: str = None, higher_is_better: list = None, multiplier: int | float | list | None = 100, decimal_places: int = 1, color_min_max: tuple = (5, 70)) -> str: """ @@ -621,13 +622,13 @@ def evaluation_df_to_latex_table(df: pd.DataFrame | str, relevant_row : str | in ---------- df : pd.DataFrame | str The pandas dataframe. Can also be a string that contains the path to the saved dataframe - relevant_row : str | int + relevant_row : str The name of the row in the df that is used to create the latex table (default: "mean") output_path : str The path were the resulting latex table text file will be stored (default: None) - pm_row : str | int + pm_row : str The name of the row in the df that should be added to the latex table after the value from relevant_row separated by plus-minus (default: "std") - bracket_row : str | int + bracket_row : str The name of the row in the df that should be added to the latex table in brackets after the value from relevant_row and, if stated, the value from pm_row (default: None) best_in_bold : bool Print best value for each combination of dataset and metric in bold. 
diff --git a/clustpy/utils/tests/test_evaluation.py b/clustpy/utils/tests/test_evaluation.py index 4a968a8..0ec76c9 100644 --- a/clustpy/utils/tests/test_evaluation.py +++ b/clustpy/utils/tests/test_evaluation.py @@ -139,12 +139,12 @@ def test_evaluate_dataset_with_neural_networks_as_iteration_parameters(): labels_true=L, n_repetitions=n_repetitions, add_runtime=False, add_n_clusters=False, save_path=None, random_state=1) # Check if scores are equal - assert abs(df.at[0, ("DEC1", "nmi")] - df.at[0, ("DEC2", "nmi")]) < 1e-8 # is equal - assert abs(df.at[0, ("DEC1", "silhouette")] - df.at[0, ("DEC2", "silhouette")]) < 1e-8 # is equal - assert abs(df.at[1, ("DEC1", "nmi")] - df.at[1, ("DEC2", "nmi")]) < 1e-8 # is equal - assert abs(df.at[1, ("DEC1", "silhouette")] - df.at[1, ("DEC2", "silhouette")]) < 1e-8 # is equal - assert abs(df.at[0, ("DEC1", "nmi")] - df.at[1, ("DEC1", "nmi")]) > 1e-2 # is not equal - assert abs(df.at[0, ("DEC1", "silhouette")] - df.at[1, ("DEC1", "silhouette")]) > 1e-2 # is not equal + assert abs(df.at["0", ("DEC1", "nmi")] - df.at["0", ("DEC2", "nmi")]) < 1e-8 # is equal + assert abs(df.at["0", ("DEC1", "silhouette")] - df.at["0", ("DEC2", "silhouette")]) < 1e-8 # is equal + assert abs(df.at["1", ("DEC1", "nmi")] - df.at["1", ("DEC2", "nmi")]) < 1e-8 # is equal + assert abs(df.at["1", ("DEC1", "silhouette")] - df.at["1", ("DEC2", "silhouette")]) < 1e-8 # is equal + assert abs(df.at["0", ("DEC1", "nmi")] - df.at["1", ("DEC1", "nmi")]) > 1e-2 # is not equal + assert abs(df.at["0", ("DEC1", "silhouette")] - df.at["1", ("DEC1", "silhouette")]) > 1e-2 # is not equal @pytest.fixture @@ -292,14 +292,14 @@ def test_evaluation_df_to_latex_table_single_dataset(): df = evaluate_dataset(X=X, evaluation_algorithms=algorithms, evaluation_metrics=metrics, labels_true=L, n_repetitions=n_repetitions, add_runtime=False, add_n_clusters=False, save_path="df.csv", random_state=1, aggregation_functions=[np.max, np.std]) - output_str1 = 
evaluation_df_to_latex_table(df, 1, "latex1.txt", None, None, False, False, False, None, None, None, 0) + output_str1 = evaluation_df_to_latex_table(df, "1", "latex1.txt", None, None, False, False, False, None, None, None, 0) output_str1 = output_str1.split("\n") assert os.path.isfile("latex1.txt") read_file1 = open("latex1.txt", "r").readlines() assert len(output_str1) == len(read_file1) assert all([output_str1[i] + "\n" == read_file1[i] for i in range(len(output_str1) - 1)] + [output_str1[-1] == read_file1[-1]]) # Test with input file - output_str2 = evaluation_df_to_latex_table("df.csv", 1, "latex2.txt", "std", "max", True, True, True, "red", [True, True, False], + output_str2 = evaluation_df_to_latex_table("df.csv", "1", "latex2.txt", "std", "max", True, True, True, "red", [True, True, False], 100, 2) output_str2 = output_str2.split("\n") assert os.path.isfile("latex2.txt") diff --git a/codecov.yml b/codecov.yml index e004668..eb95c13 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,2 +1,22 @@ ignore: - - ".*/tests" # ignore folders and all its contents \ No newline at end of file + - "**/tests/**/*" + - "setup.py" + - "**/__init__.py" + +coverage: + status: + project: + default: + # 'auto' compares coverage against the base branch. + # It ensures coverage doesn't drop. 
+ target: auto + threshold: 1% # Allows a 1% drop before turning the check red + patch: + default: + target: 85% # New code in the PR should have at least 85% coverage + base: auto + +comment: + layout: "reach, diff, flags, files" + behavior: default + require_changes: true # Only comment if coverage actually changes \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 39a675d..c47f6ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,2 +1,51 @@ [build-system] -requires = ["setuptools", "wheel", "Cython>=0.29", "numpy >= 1.15"] \ No newline at end of file +requires = ["setuptools", "wheel", "Cython>=3.0", "numpy>=2.0.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "clustpy" +dynamic = ["version"] +description = "A Python library for advanced clustering algorithms" +readme = "README.md" +authors = [{name = "Collin Leiber", email = "leiber@dbs.ifi.lmu.de"}] +license = {text = "BSD-3-Clause License"} +requires-python = ">=3.10" +dependencies = [ + "numpy", + "scipy", + "scikit-learn>=1.6", + "matplotlib", + "torch", + "pandas", + "tqdm", + "torchvision" +] + +[project.optional-dependencies] +full = ["Pillow", "nltk", "xlrd", "requests", "opencv-python-headless<4.13"] + +[project.urls] +Homepage = "https://clustpy.readthedocs.io/en/latest/" + +[tool.setuptools] +package-data = {"clustpy" = ["data/datasets/*.data"]} + +[tool.setuptools.packages.find] +where = ["."] +include = ["clustpy*"] +exclude = ["*tests*", "docs*"] + +[tool.setuptools.dynamic] +version = {attr = "clustpy.__version__"} + +[tool.pytest.ini_options] +# Merged from your pytest.ini +markers = [ + "data: marks tests concerning data loaders", + "largedata: marks tests concerning large data loaders (e.g.
image data sets from torchvision)", + "timeseriesdata: marks tests concerning dataloader from www.timeseriesclassification.com" +] + +[tool.coverage.run] +source = ["clustpy"] +omit = ["*/tests/*"] \ No newline at end of file diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index aa37366..0000000 --- a/pytest.ini +++ /dev/null @@ -1,5 +0,0 @@ -[pytest] -markers = - data: marks tests concerning data loaders - largedata: marks tests concerning large data loaders (e.g. image data sets from torchvision) - timeseriesdata: marks tests concerning dataloader from www.timeseriesclassification.com diff --git a/setup.py b/setup.py index c04f376..1d7b16f 100644 --- a/setup.py +++ b/setup.py @@ -1,41 +1,11 @@ -from setuptools import setup, find_packages, Extension -import clustpy +from setuptools import setup, Extension import numpy as np -def _load_readme(): - with open("README.md", "r") as file: - readme = file.read() - return readme - - dip_extension = Extension('clustpy.utils.dipModule', include_dirs=[np.get_include()], sources=['clustpy/utils/dip.c']) -setup( - name='clustpy', - version=clustpy.__version__, - packages=find_packages(exclude=["*tests"]), - package_data={'clustpy': ['data/datasets/*.data']}, - url='https://clustpy.readthedocs.io/en/latest/', - license='BSD-3-Clause License', - author='Collin Leiber', - author_email='leiber@dbs.ifi.lmu.de', - description='A Python library for advanced clustering algorithms', - long_description=_load_readme(), - long_description_content_type="text/markdown", - python_requires='>=3.10', - install_requires=['numpy', - 'scipy', - 'scikit-learn', - 'matplotlib', - 'torch', - 'pandas', - 'tqdm', - 'torchvision'], - extras_require={ - 'full': ['Pillow', 'nltk', 'xlrd', 'opencv-python', 'requests'] - }, - ext_modules=[dip_extension] -) + +if __name__ == "__main__": + setup(ext_modules=[dip_extension])