Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ orbs:
# Orb commands and jobs help you with common scripting around a language/tool
# so you don't have to copy and paste it everywhere.
# See the orb documentation here: https://circleci.com/developer/orbs/orb/circleci/python
python: circleci/python@1.5.0
python: circleci/python@2.1.1

# Define a job to be invoked later in a workflow.
# See: https://circleci.com/docs/2.0/configuration-reference/#jobs
Expand All @@ -33,6 +33,7 @@ jobs:
name: build clustpy
command: |
python -m pip install --upgrade pip
pip install build Cython numpy
pip install pytest
pip install -e .
- run:
Expand Down
37 changes: 0 additions & 37 deletions .github/workflows/lint.yml

This file was deleted.

11 changes: 6 additions & 5 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,27 +21,28 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v3
uses: actions/setup-python@v5
with:
python-version: '3.12'

- name: Display Python version
run: python -c "import sys; print(sys.version)"

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install build

- name: Build package
run: python -m build --sdist

- name: Publish package to Test PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
password: ${{ secrets.TEST_PYPI_API_TOKEN }}
repository_url: https://test.pypi.org/legacy/

- name: Publish package to PyPI
if: startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@release/v1
Expand Down
54 changes: 42 additions & 12 deletions .github/workflows/test-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,37 +13,67 @@ permissions:
contents: read

jobs:
build:
lint:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4

- name: Set up Python 3.12
uses: actions/setup-python@v5
with:
python-version: '3.12'
cache: 'pip' # Speeds up flake8 installation

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
build:
needs: lint # This job only starts if 'lint' passes
runs-on: ubuntu-latest
strategy:
# You can use PyPy versions in python-version.
# For example, pypy-2.7 and pypy-3.8
fail-fast: false # don't break 3.12 if 3.10 fails
matrix:
python-version: ["3.12", "3.10"]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

# You can test your matrix by printing the current Python version
- name: Display Python version
run: python -c "import sys; print(sys.version)"
cache: 'pip' # Automatically caches your dependencies

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pytest pytest-cov
pip install -e .[full]
- name: Test with pytest
- name: Test with pytest (with codecov)
if: ${{ matrix.python-version == '3.10' }}
run: |
pytest -m "not largedata" --cov --cov-report=xml
- name: Test with pytest (without codecov)
if: ${{ matrix.python-version != '3.10' }}
run: |
pytest -m "not largedata" --cov
pytest -m "not largedata"
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v4.0.1
if: ${{ matrix.python-version == '3.10' }}
uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: true # Helpful to know if upload failed
22 changes: 13 additions & 9 deletions clustpy/data/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,17 @@
except:
print(
"[WARNING] Could not import nltk in clustpy.data.real_world_data to use the SnowballStemmer. Please install nltk by 'pip install nltk' if necessary")
try:
from PIL import Image
except:
print(
"[WARNING] Could not import PIL in clustpy.data.real_world_data. Please install PIL by 'pip install Pillow' if necessary")
import numpy as np
import urllib.request
import os
from pathlib import Path
import ssl
from PIL import Image
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.datasets import fetch_file


DEFAULT_DOWNLOAD_PATH = str(Path.home() / "Downloads/clustpy_datafiles")
Expand Down Expand Up @@ -63,11 +66,11 @@ def _download_file(file_url: str, filename_local: str) -> None:
filename_local : str
local name of the file after it has been downloaded
"""
local_path = Path(filename_local)
local_dir = local_path.parent
local_filename = local_path.name
print("Downloading data set from {0} to {1}".format(file_url, filename_local))
default_ssl = ssl._create_default_https_context
ssl._create_default_https_context = ssl._create_unverified_context
urllib.request.urlretrieve(file_url, filename_local)
ssl._create_default_https_context = default_ssl
fetch_file(file_url, folder=local_dir, local_filename=local_filename)


def _download_file_from_google_drive(file_id: str, filename_local: str, chunk_size: int = 32768) -> None:
Expand Down Expand Up @@ -187,7 +190,7 @@ def _load_image_data(image: str, image_size: tuple, color_image: bool) -> np.nda
image_data : np.ndarray
The numpy array containing the image data
"""
if type(image) is str:
if isinstance(image, str):
pil_image = Image.open(image)
else:
pil_image = Image.fromarray(np.uint8(image))
Expand All @@ -196,7 +199,8 @@ def _load_image_data(image: str, image_size: tuple, color_image: bool) -> np.nda
# Convert to coherent size
if image_size is not None:
pil_image = pil_image.resize(image_size)
image_data = np.asarray(pil_image)
image_data = np.array(pil_image).copy()
pil_image.close()
assert image_size is None or image_data.shape == (
image_size[0], image_size[1], 3), "Size of image is not correct. Should be {0} but is {1}".format(image_size,
image_data.shape)
Expand Down
8 changes: 2 additions & 6 deletions clustpy/data/real_torchvision_data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import torchvision
import torch
import numpy as np
import ssl
from clustpy.data._utils import _get_download_dir, _load_image_data, flatten_images
from sklearn.datasets._base import Bunch

Expand Down Expand Up @@ -45,7 +44,7 @@ def _get_data_and_labels(dataset: torchvision.datasets.VisionDataset, image_size
labels.append(label)
image_data = _load_image_data(path, image_size, True)
data_list.append(image_data)
# Convert data form list to numpy array
# Convert data from list to numpy array
data = np.array(data_list)
labels = np.array(labels)
if type(data) is np.ndarray:
Expand Down Expand Up @@ -89,8 +88,6 @@ def _load_torch_image_data(data_source: torchvision.datasets.VisionDataset, subs
assert subset in ["all", "train",
"test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
# Get data from source
default_ssl = ssl._create_default_https_context
ssl._create_default_https_context = ssl._create_unverified_context
if subset == "all" or subset == "train":
# Load training data
if uses_train_param:
Expand All @@ -117,7 +114,6 @@ def _load_torch_image_data(data_source: torchvision.datasets.VisionDataset, subs
# Convert data to float and labels to int
data = data.float()
labels = labels.int()
ssl._create_default_https_context = default_ssl
# Check data dimensions
if data.dim() < 3 or data.dim() > 5:
raise Exception(
Expand All @@ -137,7 +133,7 @@ def _load_torch_image_data(data_source: torchvision.datasets.VisionDataset, subs
# Some dataset (e.g., SVHN) do not have the class information included
if hasattr(dataset, "classes"):
return Bunch(dataset_name=dataset.__class__.__name__, data=data_flatten, target=labels_numpy,
images=data_image, image_format=image_format, classes=dataset.classes)
images=data_image, image_format=image_format, classes=dataset.classes.copy())
else:
return Bunch(dataset_name=dataset.__class__.__name__, data=data_flatten, target=labels_numpy,
images=data_image, image_format=image_format)
Expand Down
12 changes: 3 additions & 9 deletions clustpy/data/real_uci_data.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
try:
from PIL import Image
except:
print(
"[WARNING] Could not import PIL in clustpy.data.real_world_data. Please install PIL by 'pip install Pillow' if necessary")
from clustpy.data._utils import _download_file, _get_download_dir, _decompress_z_file, _load_data_file, flatten_images, _transform_text_data
from clustpy.data._utils import _download_file, _get_download_dir, _decompress_z_file, _load_data_file, flatten_images, _transform_text_data, _load_image_data
import os
import numpy as np
import zipfile
Expand Down Expand Up @@ -1167,8 +1162,7 @@ def load_cmu_faces(return_X_y: bool = False, downloads_path: str = None) -> Bunc
if not image.endswith("_4.pgm"):
continue
# get image data
image_data = Image.open(path_images + "/" + image)
image_array = np.array(image_data)
image_array = _load_image_data(path_images + "/" + image, None, False)
# Get labels
name_parts = image.split("_")
user_id = np.argwhere(names == name_parts[0])[0][0]
Expand All @@ -1188,7 +1182,7 @@ def load_cmu_faces(return_X_y: bool = False, downloads_path: str = None) -> Bunc
return data_flatten, labels
else:
return Bunch(dataset_name="CMUFace", data=data_flatten, target=labels, images=data_image, image_format="HW",
classes=[names, positions, expressions, eyes])
classes=(names, positions, expressions, eyes))


def load_gene_expression_cancer_rna_seq(return_X_y: bool = False, downloads_path: str = None):
Expand Down
Loading
Loading