weecology · musaqlain · Jan 25, 2026 · Jan 25, 2026 · Jan 28, 2026 · Jan 28, 2026
diff --git a/src/deepforest/datasets/training.py b/src/deepforest/datasets/training.py
@@ -1,10 +1,10 @@
 """Dataset model for object detection tasks."""
 
-import math
 import os
 
 import kornia.augmentation as K
 import numpy as np
+import pandas as pd
 import shapely
 import torch
 from PIL import Image
@@ -103,50 +103,65 @@ def _validate_labels(self):
             )
 
     def _validate_coordinates(self):
-        """Validate that all bounding box coordinates occur within the image.
-
-        Raises:
-            ValueError: If any bounding box coordinate occurs outside the image
-        """
+        """Validate that all bounding box coordinates occur within the image.
+
+        Image sizes are read once per unique image. When the annotations lack
+        ``xmin``/``ymin``/``xmax``/``ymax`` columns, they are derived from the
+        bounds of each ``geometry`` and kept on ``self.annotations``.
+
+        Rows whose four coordinates are all zero encode empty frames and are
+        always accepted.
+
+        Raises:
+            ValueError: If an image cannot be opened, or if any bounding box
+                has a negative coordinate or extends past its image's width
+                or height.
+        """
         errors = []
-        for _idx, row in self.annotations.iterrows():
-            img_path = os.path.join(self.root_dir, row["image_path"])
+
+        unique_images = self.annotations["image_path"].unique()
+        image_dims = {}
+
+        for img_path_rel in unique_images:
+            full_path = os.path.join(self.root_dir, img_path_rel)
             try:
-                with Image.open(img_path) as img:
-                    width, height = img.size
+                with Image.open(full_path) as img:
+                    image_dims[img_path_rel] = img.size
             except Exception as e:
-                errors.append(f"Failed to open image {img_path}: {e}")
-                continue
-
-            # Extract bounding box
-            geom = row["geometry"]
-            xmin, ymin, xmax, ymax = geom.bounds
-
-            # All coordinates equal to zero is how we code empty frames.
-            # Therefore these are valid coordinates even though they would
-            # fail other checks.
-            if xmin == 0 and ymin == 0 and xmax == 0 and ymax == 0:
-                continue
-
-            # Check if box is valid
-            oob_issues = []
-            if not geom.equals(shapely.envelope(geom)):
-                oob_issues.append(f"geom ({geom}) is not a valid bounding box")
-            if xmin < 0:
-                oob_issues.append(f"xmin ({xmin}) < 0")
-            if xmax > width:
-                oob_issues.append(f"xmax ({xmax}) > image width ({width})")
-            if ymin < 0:
-                oob_issues.append(f"ymin ({ymin}) < 0")
-            if ymax > height:
-                oob_issues.append(f"ymax ({ymax}) > image height ({height})")
-            if math.isclose(geom.area, 1):
-                oob_issues.append("area of bounding box is a single pixel")
-
-            if oob_issues:
-                errors.append(
-                    f"Box, ({xmin}, {ymin}, {xmax}, {ymax}) exceeds image dimensions, ({width}, {height}). Issues: {', '.join(oob_issues)}."
-                )
+                errors.append(f"Failed to open image {full_path}: {e}")
+
+        # The box checks need every image's size, so stop early on open failures.
+        if errors:
+            raise ValueError("\n".join(errors))
+
+        cols = ["xmin", "ymin", "xmax", "ymax"]
+        if not set(cols).issubset(self.annotations.columns):
+            bounds = self.annotations["geometry"].apply(lambda x: x.bounds).tolist()
+            bounds_df = pd.DataFrame(
+                bounds, columns=cols, index=self.annotations.index
+            )
+            self.annotations = pd.concat([self.annotations, bounds_df], axis=1)
+
+        coords = self.annotations[cols]
+
+        # Keep per-row image sizes in local Series instead of scratch columns on
+        # self.annotations, so nothing has to be cleaned up if a check raises.
+        image_paths = self.annotations["image_path"]
+        img_width = image_paths.map(lambda x: image_dims[x][0])
+        img_height = image_paths.map(lambda x: image_dims[x][1])
+
+        # All coordinates equal to zero is how we code empty frames.
+        # These are valid coordinates even though they would fail other checks.
+        empty_mask = (coords == 0).all(axis=1)
+        neg_mask = (coords < 0).any(axis=1)
+        oob_mask = (coords["xmax"] > img_width) | (coords["ymax"] > img_height)
+        bad_mask = (neg_mask | oob_mask) & (~empty_mask)
+
+        if bad_mask.any():
+            report = self.annotations.loc[bad_mask, ["image_path", *cols]]
+            errors.append(
+                f"Found {len(report)} invalid bounding boxes (negative or out-of-bounds):\n{report.to_string()}"
+            )
 
         if errors:
             raise ValueError("\n".join(errors))

diff --git a/tests/test_datasets_training.py b/tests/test_datasets_training.py
@@ -256,7 +256,7 @@ def test_BoxDataset_validate_coordinates(tmp_path, raster_path):
         )
         df.to_csv(csv_path, index=False)
 
-        with pytest.raises(ValueError, match="exceeds image dimensions"):
+        with pytest.raises(ValueError, match="invalid bounding boxes"):
             BoxDataset(csv_file=csv_path, root_dir=root_dir)
 
 
@@ -280,10 +280,13 @@ def test_BoxDataset_validate_non_rectangular_polygon(tmp_path, raster_path):
 
     root_dir = os.path.dirname(raster_path)
 
-    # Should raise an error because the geometry is not a valid bounding box
-    with pytest.raises(ValueError, match="is not a valid bounding box"):
-        BoxDataset(csv_file=csv_path, root_dir=root_dir)
+    # Should automatically convert non-rectangular geometry to bounding box
+    ds = BoxDataset(csv_file=csv_path, root_dir=root_dir)
 
+    assert ds.annotations.iloc[0]["xmin"] == 10
+    assert ds.annotations.iloc[0]["ymin"] == 10
+    assert ds.annotations.iloc[0]["xmax"] == 50
+    assert ds.annotations.iloc[0]["ymax"] == 40
 
 def test_BoxDataset_with_projected_shapefile(tmp_path, raster_path):
     """Test that BoxDataset can load a shapefile with projected coordinates and converts to pixel coordinates"""
@@ -344,3 +347,49 @@ def test_BoxDataset_with_projected_shapefile(tmp_path, raster_path):
     )
     assert torch.all(boxes[:, 2] > boxes[:, 0]), "xmax should be greater than xmin"
     assert torch.all(boxes[:, 3] > boxes[:, 1]), "ymax should be greater than ymin"
+
+
+def test_validate_coordinates_negative(tmp_path):
+    """Ensure validation rejects a box with a negative coordinate."""
+    # A blank 100x100 image gives the validator known dimensions to check against.
+    img_path = os.path.join(tmp_path, "test_neg.jpg")
+    Image.new("RGB", (100, 100), color="white").save(img_path)
+
+    csv_file = os.path.join(tmp_path, "neg.csv")
+    df = pd.DataFrame(
+        {
+            "image_path": ["test_neg.jpg"],
+            "xmin": [-5],
+            "ymin": [10],
+            "xmax": [50],
+            "ymax": [50],
+            "label": ["Tree"],
+        }
+    )
+    df.to_csv(csv_file, index=False)
+
+    with pytest.raises(ValueError, match="invalid bounding boxes"):
+        BoxDataset(csv_file=csv_file, root_dir=str(tmp_path))
+
+
+def test_validate_coordinates_out_of_bounds(tmp_path):
+    """Ensure validation rejects a box that extends past the image width."""
+    # A blank 100x100 image gives the validator known dimensions to check against.
+    img_path = os.path.join(tmp_path, "test_oob.jpg")
+    Image.new("RGB", (100, 100), color="white").save(img_path)
+
+    csv_file = os.path.join(tmp_path, "oob.csv")
+    df = pd.DataFrame(
+        {
+            "image_path": ["test_oob.jpg"],
+            "xmin": [10],
+            "ymin": [10],
+            "xmax": [150],
+            "ymax": [50],
+            "label": ["Tree"],
+        }
+    )
+    df.to_csv(csv_file, index=False)
+
+    with pytest.raises(ValueError, match="invalid bounding boxes"):
+        BoxDataset(csv_file=csv_file, root_dir=str(tmp_path))