diff --git a/pyproject.toml b/pyproject.toml
index 9f92fb9c2..714fe6c3b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ name = "deepforest"
 version = "2.0.1dev0"
 description = "Platform for individual detection from airborne remote sensing including trees, birds, and livestock. Supports multiple detection models, adding models for species classification, and easy fine tuning to particular ecosystems."
 readme = "README.md"
-requires-python = ">=3.10,<3.13"
+requires-python = ">=3.10,<3.14"
 license = {text = "MIT"}
 keywords = ["deep-learning", "forest", "ecology", "computer-vision"]
 classifiers = [
diff --git a/src/deepforest/datasets/prediction.py b/src/deepforest/datasets/prediction.py
index bb5b2d3b5..0f866cc7d 100644
--- a/src/deepforest/datasets/prediction.py
+++ b/src/deepforest/datasets/prediction.py
@@ -1,4 +1,5 @@
 import os
+import warnings
 
 import numpy as np
 import pandas as pd
@@ -14,6 +15,80 @@
 from deepforest.utilities import format_geometry, read_file
 
 
+def _load_image_array(
+    image_path: str | None = None, image: np.ndarray | Image.Image | None = None
+) -> np.ndarray:
+    """Return image as an array, else load image_path as RGB; ValueError if neither."""
+    if image is not None:
+        return image if isinstance(image, np.ndarray) else np.asarray(image)
+    if image_path is None:
+        raise ValueError("Either image_path or image must be provided")
+    # Context manager releases the file handle as soon as pixels are decoded.
+    with Image.open(image_path) as img:
+        return np.asarray(img.convert("RGB"))
+
+
+def _ensure_rgb_chw(image: np.ndarray) -> np.ndarray:
+    """Coerce a 3-channel image to contiguous channels-first (C, H, W) layout.
+
+    Values and dtype are left untouched. Raises ValueError for grayscale
+    (2D) input, non-3D input, or arrays without a 3-sized first/last axis.
+    """
+    n_dims = image.ndim
+    if n_dims == 2:
+        raise ValueError("Grayscale images are not supported (expected 3-channel RGB)")
+    if n_dims != 3:
+        raise ValueError(f"Expected 3D image array, got shape {image.shape}")
+
+    first_axis, last_axis = image.shape[0], image.shape[-1]
+    if first_axis != 3 and last_axis != 3:
+        raise ValueError(f"Expected image with 3 channels, got shape {image.shape}")
+
+    # A leading axis of size 3 wins: the array is then treated as already CHW.
+    channels_first = image if first_axis == 3 else np.moveaxis(image, -1, 0)
+    return np.ascontiguousarray(channels_first)
+
+
+def _ensure_rgb_chw_float32(image: np.ndarray) -> np.ndarray:
+    """Convert an RGB image to contiguous CHW float32 scaled to [0, 1].
+
+    uint8 input is always divided by 255. Float input is cast to float32 and
+    divided by 255 only when its maximum over the whole image exceeds 1.0;
+    negative values or values above 255 raise ValueError, as does any other
+    dtype. The caller's array is never scaled in place.
+    """
+    pixels = _ensure_rgb_chw(image)
+    dtype = pixels.dtype
+
+    if dtype == np.uint8:
+        # astype allocates a new array, so the in-place divide is safe.
+        pixels = pixels.astype(np.float32)
+        pixels /= 255.0
+        return np.ascontiguousarray(pixels)
+
+    if not np.issubdtype(dtype, np.floating):
+        # Integers other than uint8 are ambiguous; be explicit.
+        raise ValueError(f"Unsupported image dtype {dtype}. Expected uint8 or float.")
+
+    if dtype != np.float32:
+        pixels = pixels.astype(np.float32)
+
+    # Decide the value range once from the full image.
+    upper = float(pixels.max())
+    lower = float(pixels.min())
+    if lower < 0:
+        raise ValueError(f"Expected float image in [0, 1] or [0, 255], got min {lower}")
+    if upper > 255.0:
+        raise ValueError(f"Expected float image in [0, 1] or [0, 255], got max {upper}")
+    if upper > 1.0:
+        # Values look like 0-255 floats; copy first if still aliasing the input.
+        if np.shares_memory(pixels, image):
+            pixels = pixels.copy()
+        pixels /= 255.0
+
+    return np.ascontiguousarray(pixels)
+
+
 # Base prediction class
 class PredictionDataset(Dataset):
     """Base class for prediction datasets. Defines the common interface and
@@ -48,32 +123,9 @@ def __init__(
     def load_and_preprocess_image(
         self, image_path: str = None, image: np.ndarray | Image.Image = None
     ):
-        if image is None:
-            if image_path is None:
-                raise ValueError("Either image_path or image must be provided")
-            image = np.array(Image.open(image_path).convert("RGB"))
-        else:
-            image = np.array(image)
-        # If dtype is not float32, convert to float32
-        if image.dtype != "float32":
-            image = image.astype("float32")
-
-        # If image is not normalized, normalize to [0, 1]
-        if image.max() > 1 or image.min() < 0:
-            image = image / 255.0
-
-        # If image is not in CHW format, convert to CHW
-        if image.shape[0] != 3:
-            if image.shape[-1] != 3:
-                raise ValueError(
-                    f"Expected 3 channel image, got image shape {image.shape}"
-                )
-            else:
-                image = np.rollaxis(image, 2, 0)
-
-        image = torch.from_numpy(image)
-
-        return image
+        # Load from path or array, then coerce to CHW float32 in [0, 1].
+        pixels = _load_image_array(image_path=image_path, image=image)
+        return torch.from_numpy(_ensure_rgb_chw_float32(pixels))
 
     def prepare_items(self):
         """Prepare the items for the dataset.
@@ -169,9 +221,13 @@ def __init__(self, path=None, image=None, patch_size=400, patch_overlap=0):
         )
 
     def prepare_items(self):
-        self.image = self.load_and_preprocess_image(self.path, image=self.image)
+        # Scale the whole image once so every crop gets the same treatment: a
+        # dark crop of a [0, 255] float image must not be mistaken for [0, 1].
+        raw = _load_image_array(image_path=self.path, image=self.image)
+        chw = _ensure_rgb_chw_float32(raw)
+        self.image = torch.from_numpy(chw)
         self.windows = preprocess.compute_windows(
-            self.image, self.patch_size, self.patch_overlap
+            chw, self.patch_size, self.patch_overlap
         )
 
     def __len__(self):
@@ -182,8 +238,9 @@ def window_list(self):
 
     def get_crop(self, idx):
         crop = self.image[self.windows[idx].indices()]
-
-        return crop
+        # crop may be a view into self.image; clone so in-place edits made
+        # downstream cannot corrupt pixels shared with overlapping windows.
+        return crop.clone()
 
     def get_image_basename(self, idx):
         if self.path is not None:
@@ -433,33 +490,47 @@ class TiledRaster(PredictionDataset):
     def __init__(self, path, patch_size, patch_overlap):
         if path is None:
             raise ValueError("path is required for a memory raster dataset")
+        self._src = None  # rasterio dataset handle; opened in prepare_items()
         super().__init__(path=path, patch_size=patch_size, patch_overlap=patch_overlap)
 
     def prepare_items(self):
-        # Get raster shape without keeping file open
-        with rio.open(self.path) as src:
-            width = src.shape[0]
-            height = src.shape[1]
-
-            # Check is tiled
-            if not src.is_tiled:
-                raise ValueError(
-                    "Out-of-memory dataset is selected, but raster is not tiled, "
-                    "leading to entire raster being read into memory and defeating "
-                    "the purpose of an out-of-memory dataset. "
-                    "\nPlease run: "
-                    "\ngdal_translate -of GTiff -co TILED=YES <input> <output> "
-                    "to create a tiled raster"
-                )
+        # Open once and reuse for every window read; caller enforces workers=0.
+        src = rio.open(self.path)
+        self._src = src
+        height, width = src.height, src.width
+
+        # A non-tiled raster still supports windowed reads (often strip-based),
+        # but speed then depends on the driver/strip layout, so warn, not raise.
+        if not src.is_tiled:
+            warnings.warn(
+                "dataloader_strategy='window' is selected, but raster is not tiled. "
+                "Windowed reads may be slower depending on file layout. If needed, "
+                "create a tiled GeoTIFF with: "
+                "gdal_translate -of GTiff -co TILED=YES <input> <output>",
+                stacklevel=2,
+            )
 
         # Generate sliding windows
-        self.windows = slidingwindow.generateForSize(
+        candidate_windows = slidingwindow.generateForSize(
             height,
             width,
             dimOrder=slidingwindow.DimOrder.ChannelHeightWidth,
             maxWindowSize=self.patch_size,
             overlapPercent=self.patch_overlap,
         )
+        # Drop zero-size windows and windows extending past the raster bounds:
+        # rasterio reads those as (C,0,W) or (C,H,0), which breaks RetinaNet.
+        self.windows = [
+            win
+            for win in candidate_windows
+            if 0 < win.w <= width - win.x and 0 < win.h <= height - win.y
+        ]
+        dropped = len(candidate_windows) - len(self.windows)
+        if dropped > 0:
+            warnings.warn(
+                f"Filtered {dropped} window(s) extending past raster bounds or zero-size.",
+                stacklevel=2,
+            )
 
     def __len__(self):
         return len(self.windows)
@@ -469,12 +540,19 @@ def window_list(self):
 
     def get_crop(self, idx):
         window = self.windows[idx]
-        with rio.open(self.path) as src:
-            window_data = src.read(window=Window(window.x, window.y, window.w, window.h))
+        assert self._src is not None, "Raster dataset is not open"
+        window_data = self._src.read(
+            window=Window(window.x, window.y, window.w, window.h)
+        )
 
-        # Rasterio already returns (C, H, W), just normalize and convert
-        window_data = window_data.astype("float32") / 255.0
-        window_data = torch.from_numpy(window_data).float()
+        if window_data.shape[1] == 0 or window_data.shape[2] == 0:
+            raise ValueError(
+                f"Window {idx} returned zero-size array (shape={window_data.shape}). "
+                "RetinaNet cannot process images with zero height or width."
+            )
+        window_data = window_data.astype(np.float32)
+        window_data /= 255.0
+        window_data = torch.from_numpy(window_data)
         if window_data.shape[0] != 3:
             raise ValueError(
                 f"Expected 3 channel image, got {window_data.shape[0]} channels"
@@ -487,3 +565,29 @@ def get_image_basename(self, idx):
 
     def get_crop_bounds(self, idx):
         return self.window_list()[idx]
+
+    def close(self) -> None:
+        """Close the raster handle if one is open (idempotent, never re-closes)."""
+        src, self._src = getattr(self, "_src", None), None  # detach before closing
+        if src is not None:
+            src.close()
+
+    def __getstate__(self) -> dict:
+        """Return a picklable copy of the instance state without the open
+        rasterio handle, which cannot be serialized."""
+        # __setstate__ reopens the raster in the receiving process.
+        picklable = {**self.__dict__, "_src": None}
+        return picklable
+
+    def __setstate__(self, state: dict) -> None:
+        """Restore pickled state and reopen the raster dropped by __getstate__."""
+        vars(self).update(state)
+        if self._src is None and self.path is not None:
+            self._src = rio.open(self.path)
+
+    def __del__(self):
+        """Finalizer: best-effort release of the raster handle via close()."""
+        try:
+            self.close()
+        except Exception:  # deliberately swallowed: cleanup must not raise here
+            pass
diff --git a/src/deepforest/main.py b/src/deepforest/main.py
index af331eb78..d942b0251 100644
--- a/src/deepforest/main.py
+++ b/src/deepforest/main.py
@@ -241,6 +241,26 @@ def create_trainer(self, logger=None, callbacks=None, **kwargs):
 
         self.trainer = pl.Trainer(**trainer_args)
 
+        # Helpful warning: CUDA visible but trainer not using it.
+        # This commonly happens if accelerator/devices were overridden to CPU, or
+        # if the trainer wasn't recreated after changing config.
+        try:
+            accel_name = type(self.trainer.accelerator).__name__.lower()
+        except Exception:
+            accel_name = ""
+
+        requested_accel = str(trainer_args.get("accelerator", "")).lower()
+        if torch.cuda.is_available() and requested_accel in {"auto", "gpu", "cuda"}:
+            if "cuda" not in accel_name and "gpu" not in accel_name:
+                warnings.warn(
+                    "CUDA appears to be available, but the Lightning trainer is not "
+                    f"using a GPU accelerator (accelerator={trainer_args.get('accelerator')}, "
+                    f"devices={trainer_args.get('devices')}). "
+                    "To force GPU inference, call create_trainer(accelerator='gpu', devices=1) "
+                    "or set config.accelerator='gpu' and config.devices=1, then recreate the trainer.",
+                    stacklevel=2,
+                )
+
     def on_fit_start(self):
         if self.config.train.csv_file is None:
             raise AttributeError(
@@ -586,19 +606,24 @@ def predict_tile(
                         patch_size=patch_size,
                     )
 
-                dataloader = self.predict_dataloader(ds)
-                batched_results = self.trainer.predict(self, dataloader)
-
-                # Flatten list from batched prediction
-                # Track global window index across batches
-                global_window_idx = 0
-                for _idx, batch in enumerate(batched_results):
-                    for _window_idx, window_result in enumerate(batch):
-                        formatted_result = ds.postprocess(
-                            window_result, global_window_idx
-                        )
-                        image_results.append(formatted_result)
-                        global_window_idx += 1
+                try:
+                    dataloader = self.predict_dataloader(ds)
+                    batched_results = self.trainer.predict(self, dataloader)
+
+                    # Flatten list from batched prediction
+                    # Track global window index across batches
+                    global_window_idx = 0
+                    for _idx, batch in enumerate(batched_results):
+                        for _window_idx, window_result in enumerate(batch):
+                            formatted_result = ds.postprocess(
+                                window_result, global_window_idx
+                            )
+                            image_results.append(formatted_result)
+                            global_window_idx += 1
+                finally:
+                    # Ensure raster datasets are closed even if predict/postprocess raises
+                    if hasattr(ds, "close"):
+                        ds.close()
 
             if not image_results:
                 results = pd.DataFrame()
@@ -895,8 +920,7 @@ def predict_step(self, batch, batch_idx):
 
         self.model.eval()
         with torch.no_grad():
-            preds = self.model.forward(images)
-        return preds
+            return self.model.forward(images)
 
     def predict_batch(self, images, preprocess_fn=None):
         """Predict a batch of images with the deepforest model.
diff --git a/tests/hpc_multi_gpu_train.py b/tests/hpc_multi_gpu_train.py
new file mode 100644
index 000000000..9ea1307b0
--- /dev/null
+++ b/tests/hpc_multi_gpu_train.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+"""HPC-only multi-GPU training smoke test (DDP).
+
+Run with:
+  torchrun --nproc_per_node=2 tests/hpc_multi_gpu_train.py
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+import torch
+
+from deepforest import get_data
+from deepforest.main import deepforest
+
+
+def _require_hpc() -> None:
+    """Exit unless running outside CI with HIPERGATOR or SLURM_JOB_ID set."""
+    if any(os.environ.get(flag) for flag in ("GITHUB_ACTIONS", "CI")):
+        raise SystemExit("CI environment detected; skip HPC-only test.")
+    if os.environ.get("HIPERGATOR") or os.environ.get("SLURM_JOB_ID"):
+        return
+    hint = "Set HIPERGATOR=1 or run under SLURM."
+    raise SystemExit(f"This script is intended for HPC use only. {hint}")
+
+
+def _require_ddp() -> None:
+    """Exit unless LOCAL_RANK or RANK is present in the environment."""
+    if not {"LOCAL_RANK", "RANK"}.isdisjoint(os.environ):
+        return
+    launch = "  torchrun --nproc_per_node=2 tests/hpc_multi_gpu_train.py"
+    raise SystemExit(f"DDP environment not detected. Run with:\n{launch}")
+
+
+def main() -> int:
+    """Run a 2-GPU DDP fit followed by validate on deepforest's example.csv."""
+    _require_hpc()
+    _require_ddp()
+    if torch.cuda.device_count() < 2:
+        raise SystemExit("Need at least 2 GPUs for this test.")
+
+    model = deepforest()
+    cfg = model.config
+    cfg.workers = 0
+    cfg.batch_size = 1
+    cfg.num_classes = 1
+    cfg.label_dict = {"Tree": 0}
+    annotations = get_data("example.csv")
+    data_dir = os.path.dirname(annotations)
+    for split in (cfg.train, cfg.validation):
+        split.csv_file = annotations
+        split.root_dir = data_dir
+    model.create_model(initialize_model=True)
+
+    # Keep this fast but avoid fast_dev_run's zero-length warning in DDP.
+    model.create_trainer(
+        accelerator="gpu",
+        devices=2,
+        strategy="ddp",
+        fast_dev_run=False,
+        max_epochs=1,
+        limit_train_batches=2,
+        limit_val_batches=2,
+        log_every_n_steps=1,
+    )
+    model.trainer.fit(model)
+    # Multi-GPU evaluation pass (uses same example.csv)
+    model.trainer.validate(model)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/test_datasets_prediction.py b/tests/test_datasets_prediction.py
index f98ab3f72..642b64f83 100644
--- a/tests/test_datasets_prediction.py
+++ b/tests/test_datasets_prediction.py
@@ -47,6 +47,34 @@ def test_valid_array():
     test_data = np.random.randint(0, 256, (300,300,3)).astype(np.uint8)
     SingleImage(image=test_data)
 
+
+def test_single_image_float32_0_255_consistent_normalization():
+    """Float32 [0, 255] input is scaled by one decision made on the whole image.
+
+    A crop taken only from a dark area (all pixels <= 1.0) would look like
+    [0, 1] data to a per-crop heuristic; the full-image max must decide instead.
+    """
+    # Left half dark (0.5), right half bright (128), so the full-image max > 1.
+    height, width = 200, 400
+    hwc = np.zeros((height, width, 3), dtype=np.float32)
+    half = width // 2
+    hwc[:, :half, :] = 0.5
+    hwc[:, half:, :] = 128.0
+    # SingleImage is given channels-first (C, H, W) input here.
+    chw = np.moveaxis(hwc, -1, 0)
+
+    ds = SingleImage(image=chw, patch_size=100, patch_overlap=0)
+    n_crops = len(ds)
+    assert n_crops >= 2
+
+    # First crop lies in the dark region: 0.5 / 255 is roughly 0.002.
+    first = ds[0]
+    assert first.shape == (3, 100, 100)
+    assert first.max().item() < 0.01, "Dark crop should be /255, not left as [0,1]"
+
+    # Last crop lies in the bright region: 128 / 255 is roughly 0.5.
+    assert ds[n_crops - 1].max().item() > 0.4
+
 def test_MultiImage():
     ds = MultiImage(paths=[get_data("OSBS_029.png"), get_data("OSBS_029.png")],
                     patch_size=300,
diff --git a/tests/test_gpu_inference_uses_cuda.py b/tests/test_gpu_inference_uses_cuda.py
new file mode 100644
index 000000000..5cbfc03fa
--- /dev/null
+++ b/tests/test_gpu_inference_uses_cuda.py
@@ -0,0 +1,39 @@
+import os
+
+import pytest
+import torch
+
+from deepforest import get_data
+from deepforest.main import deepforest
+
+
+@pytest.mark.skipif(
+    not os.environ.get("HIPERGATOR"),
+    reason="Only run on HIPERGATOR (requires GPU + model downloads).",
+)
+@pytest.mark.skipif(
+    not torch.cuda.is_available(),
+    reason="CUDA not available in this test environment.",
+)
+def test_predict_tile_uses_cuda_when_requested():
+    """Regression test: a GPU-configured model must predict with a GPU trainer.
+
+    Guards against predict_tile silently falling back to CPU on GPU nodes.
+    """
+    model = deepforest(config_args={"accelerator": "gpu", "devices": 1, "workers": 0})
+    model.load_model(model_name="weecology/deepforest-tree", revision="main")
+    model.create_trainer(accelerator="gpu", devices=1)
+    predictions = model.predict_tile(
+        path=get_data("OSBS_029.png"),
+        patch_size=400,
+        patch_overlap=0.0,
+        iou_threshold=0.15,
+        dataloader_strategy="single",
+    )
+    assert predictions is not None and not predictions.empty
+
+    # The accelerator class name shows which backend the trainer selected.
+    trainer = model.trainer
+    assert trainer is not None
+    accelerator_cls = type(trainer.accelerator).__name__.lower()
+    assert "cuda" in accelerator_cls or "gpu" in accelerator_cls