From bcf5edd6b8e96201b78785f0f3f8331deeca6d60 Mon Sep 17 00:00:00 2001 From: Muhammad Saqlain <2mesaqlain@gmail.com> Date: Sun, 25 Jan 2026 06:27:58 +0500 Subject: [PATCH 1/8] Optimize validation speed using vectorization (Fixes #1244) --- src/deepforest/datasets/training.py | 84 ++++++++++++++++------------- tests/test_datasets_training.py | 34 ++++++++++++ 2 files changed, 81 insertions(+), 37 deletions(-) diff --git a/src/deepforest/datasets/training.py b/src/deepforest/datasets/training.py index b3bdf085e..6c0499136 100644 --- a/src/deepforest/datasets/training.py +++ b/src/deepforest/datasets/training.py @@ -103,50 +103,60 @@ def _validate_labels(self): ) def _validate_coordinates(self): - """Validate that all bounding box coordinates occur within the image. + """ + Validate that all bounding box coordinates occur within the image. + Vectorized implementation for performance. Raises: ValueError: If any bounding box coordinate occurs outside the image """ errors = [] - for _idx, row in self.annotations.iterrows(): - img_path = os.path.join(self.root_dir, row["image_path"]) + + unique_images = self.annotations["image_path"].unique() + image_dims = {} + + for img_path_rel in unique_images: + full_path = os.path.join(self.root_dir, img_path_rel) try: - with Image.open(img_path) as img: - width, height = img.size + with Image.open(full_path) as img: + image_dims[img_path_rel] = img.size # (width, height) except Exception as e: - errors.append(f"Failed to open image {img_path}: {e}") - continue - - # Extract bounding box - geom = row["geometry"] - xmin, ymin, xmax, ymax = geom.bounds - - # All coordinates equal to zero is how we code empty frames. - # Therefore these are valid coordinates even though they would - # fail other checks. - if xmin == 0 and ymin == 0 and xmax == 0 and ymax == 0: - continue - - # Check if box is valid - oob_issues = [] - if not geom.equals(shapely.envelope(geom)): - oob_issues.append(f"geom ({geom}) is not a valid bounding box") - if xmin < 0: - oob_issues.append(f"xmin ({xmin}) < 0") - if xmax > width: - oob_issues.append(f"xmax ({xmax}) > image width ({width})") - if ymin < 0: - oob_issues.append(f"ymin ({ymin}) < 0") - if ymax > height: - oob_issues.append(f"ymax ({ymax}) > image height ({height})") - if math.isclose(geom.area, 1): - oob_issues.append("area of bounding box is a single pixel") - - if oob_issues: - errors.append( - f"Box, ({xmin}, {ymin}, {xmax}, {ymax}) exceeds image dimensions, ({width}, {height}). Issues: {', '.join(oob_issues)}." - ) + errors.append(f"Failed to open image {full_path}: {e}") + + if errors: + raise ValueError("\n".join(errors)) + + self.annotations["_img_width"] = self.annotations["image_path"].map(lambda x: image_dims.get(x, (0,0))[0]) + self.annotations["_img_height"] = self.annotations["image_path"].map(lambda x: image_dims.get(x, (0,0))[1]) + + if not {"xmin", "ymin", "xmax", "ymax"}.issubset(self.annotations.columns): + bounds = self.annotations["geometry"].apply(lambda x: x.bounds).tolist() + bounds_df = pd.DataFrame(bounds, columns=["xmin", "ymin", "xmax", "ymax"], index=self.annotations.index) + working_df = pd.concat([self.annotations, bounds_df], axis=1) + else: + working_df = self.annotations + + cols = ["xmin", "ymin", "xmax", "ymax"] + empty_mask = (working_df[cols] == 0).all(axis=1) + + neg_mask = (working_df[cols] < 0).any(axis=1) + invalid_neg = neg_mask & (~empty_mask) + + if invalid_neg.any(): + bad_count = invalid_neg.sum() + errors.append(f"Found {bad_count} annotations with negative coordinates.") + + oob_mask = (working_df["xmax"] > working_df["_img_width"]) | \ + (working_df["ymax"] > working_df["_img_height"]) + invalid_oob = oob_mask & (~empty_mask) + + if invalid_oob.any(): + bad_rows = working_df[invalid_oob] + bad_count = len(bad_rows) + example_str = bad_rows[['image_path', 'xmin', 'ymin', 'xmax', 'ymax', '_img_width', '_img_height']].head().to_string() + errors.append(f"Found {bad_count} boxes exceeding image dimensions. Examples:\n{example_str}") + + self.annotations.drop(columns=["_img_width", "_img_height"], inplace=True) if errors: raise ValueError("\n".join(errors)) diff --git a/tests/test_datasets_training.py b/tests/test_datasets_training.py index da541ad08..4367f82a2 100644 --- a/tests/test_datasets_training.py +++ b/tests/test_datasets_training.py @@ -344,3 +344,37 @@ def test_BoxDataset_with_projected_shapefile(tmp_path, raster_path): ) assert torch.all(boxes[:, 2] > boxes[:, 0]), "xmax should be greater than xmin" assert torch.all(boxes[:, 3] > boxes[:, 1]), "ymax should be greater than ymin" + +def test_validate_coordinates_negative(tmpdir): + """ + Ensure vectorized validation catches negative coordinates + """ + img_path = os.path.join(tmpdir, "test_neg.jpg") + Image.new('RGB', (100, 100), color='white').save(img_path) + + df = pd.DataFrame({ + 'image_path': ["test_neg.jpg"], + 'xmin': [-5], 'ymin': [10], + 'xmax': [50], 'ymax': [50], + 'label': ["Tree"] + }) + + with pytest.raises(ValueError, match="negative coordinates"): + training.BoxDataset(annotation_dict=df, root_dir=str(tmpdir)) + +def test_validate_coordinates_out_of_bounds(tmpdir): + """ + Ensure vectorized validation catches OOB coordinates + """ + img_path = os.path.join(tmpdir, "test_oob.jpg") + Image.new('RGB', (100, 100), color='white').save(img_path) + + df = pd.DataFrame({ + 'image_path': ["test_oob.jpg"], + 'xmin': [10], 'ymin': [10], + 'xmax': [150], 'ymax': [50], + 'label': ["Tree"] + }) + + with pytest.raises(ValueError, match="exceeding image dimensions"): + training.BoxDataset(annotation_dict=df, root_dir=str(tmpdir)) From a932f5129963143dce806dac31e38995dfa551dd Mon Sep 17 00:00:00 2001 From: Muhammad Saqlain <2mesaqlain@gmail.com> Date: Sun, 25 Jan 2026 07:11:32 +0500 Subject: [PATCH 2/8] resolved formatting issues --- src/deepforest/datasets/training.py | 46 ++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/src/deepforest/datasets/training.py b/src/deepforest/datasets/training.py index 6c0499136..2549c40a0 100644 --- a/src/deepforest/datasets/training.py +++ b/src/deepforest/datasets/training.py @@ -1,10 +1,10 @@ """Dataset model for object detection tasks.""" -import math import os import kornia.augmentation as K import numpy as np +import pandas as pd import shapely import torch from PIL import Image @@ -103,8 +103,7 @@ def _validate_labels(self): ) def _validate_coordinates(self): - """ - Validate that all bounding box coordinates occur within the image. + """Validate that all bounding box coordinates occur within the image. Vectorized implementation for performance. Raises: @@ -119,19 +118,27 @@ def _validate_coordinates(self): full_path = os.path.join(self.root_dir, img_path_rel) try: with Image.open(full_path) as img: - image_dims[img_path_rel] = img.size # (width, height) + image_dims[img_path_rel] = img.size except Exception as e: errors.append(f"Failed to open image {full_path}: {e}") if errors: raise ValueError("\n".join(errors)) - self.annotations["_img_width"] = self.annotations["image_path"].map(lambda x: image_dims.get(x, (0,0))[0]) - self.annotations["_img_height"] = self.annotations["image_path"].map(lambda x: image_dims.get(x, (0,0))[1]) + self.annotations["_img_width"] = self.annotations["image_path"].map( + lambda x: image_dims.get(x, (0, 0))[0] + ) + self.annotations["_img_height"] = self.annotations["image_path"].map( + lambda x: image_dims.get(x, (0, 0))[1] + ) if not {"xmin", "ymin", "xmax", "ymax"}.issubset(self.annotations.columns): bounds = self.annotations["geometry"].apply(lambda x: x.bounds).tolist() - bounds_df = pd.DataFrame(bounds, columns=["xmin", "ymin", "xmax", "ymax"], index=self.annotations.index) + bounds_df = pd.DataFrame( + bounds, + columns=["xmin", "ymin", "xmax", "ymax"], + index=self.annotations.index, + ) working_df = pd.concat([self.annotations, bounds_df], axis=1) else: working_df = self.annotations @@ -146,15 +153,32 @@ def _validate_coordinates(self): bad_count = invalid_neg.sum() errors.append(f"Found {bad_count} annotations with negative coordinates.") - oob_mask = (working_df["xmax"] > working_df["_img_width"]) | \ - (working_df["ymax"] > working_df["_img_height"]) + oob_mask = (working_df["xmax"] > working_df["_img_width"]) | ( + working_df["ymax"] > working_df["_img_height"] + ) invalid_oob = oob_mask & (~empty_mask) if invalid_oob.any(): bad_rows = working_df[invalid_oob] bad_count = len(bad_rows) - example_str = bad_rows[['image_path', 'xmin', 'ymin', 'xmax', 'ymax', '_img_width', '_img_height']].head().to_string() - errors.append(f"Found {bad_count} boxes exceeding image dimensions. Examples:\n{example_str}") + example_str = ( + bad_rows[ + [ + "image_path", + "xmin", + "ymin", + "xmax", + "ymax", + "_img_width", + "_img_height", + ] + ] + .head() + .to_string() + ) + errors.append( + f"Found {bad_count} boxes exceeding image dimensions. Examples:\n{example_str}" + ) self.annotations.drop(columns=["_img_width", "_img_height"], inplace=True) From 4e3d871df0d05eb5c3ec5e26c074f9bb55e89b3a Mon Sep 17 00:00:00 2001 From: Muhammad Saqlain <2mesaqlain@gmail.com> Date: Thu, 29 Jan 2026 00:45:11 +0500 Subject: [PATCH 3/8] test fail resolved. --- src/deepforest/datasets/training.py | 10 +++++----- tests/test_datasets_training.py | 19 +++++++++++++------ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/deepforest/datasets/training.py b/src/deepforest/datasets/training.py index 2549c40a0..6360b12c5 100644 --- a/src/deepforest/datasets/training.py +++ b/src/deepforest/datasets/training.py @@ -103,11 +103,9 @@ def _validate_labels(self): ) def _validate_coordinates(self): - """Validate that all bounding box coordinates occur within the image. + """ + Validate that all bounding box coordinates occur within the image. Vectorized implementation for performance. - - Raises: - ValueError: If any bounding box coordinate occurs outside the image """ errors = [] @@ -139,11 +137,13 @@ def _validate_coordinates(self): columns=["xmin", "ymin", "xmax", "ymax"], index=self.annotations.index, ) - working_df = pd.concat([self.annotations, bounds_df], axis=1) + self.annotations = pd.concat([self.annotations, bounds_df], axis=1) + working_df = self.annotations else: working_df = self.annotations cols = ["xmin", "ymin", "xmax", "ymax"] + empty_mask = (working_df[cols] == 0).all(axis=1) neg_mask = (working_df[cols] < 0).any(axis=1) diff --git a/tests/test_datasets_training.py b/tests/test_datasets_training.py index 4367f82a2..487f893a9 100644 --- a/tests/test_datasets_training.py +++ b/tests/test_datasets_training.py @@ -256,7 +256,7 @@ def test_BoxDataset_validate_coordinates(tmp_path, raster_path): ) df.to_csv(csv_path, index=False) - with pytest.raises(ValueError, match="exceeds image dimensions"): + with pytest.raises(ValueError, match="negative coordinates|exceeding image dimensions"): BoxDataset(csv_file=csv_path, root_dir=root_dir) @@ -280,10 +280,13 @@ def test_BoxDataset_validate_non_rectangular_polygon(tmp_path, raster_path): root_dir = os.path.dirname(raster_path) - # Should raise an error because the geometry is not a valid bounding box - with pytest.raises(ValueError, match="is not a valid bounding box"): - BoxDataset(csv_file=csv_path, root_dir=root_dir) + # Should automatically convert non-rectangular geometry to bounding box + ds = BoxDataset(csv_file=csv_path, root_dir=root_dir) + assert ds.annotations.iloc[0]["xmin"] == 10 + assert ds.annotations.iloc[0]["ymin"] == 10 + assert ds.annotations.iloc[0]["xmax"] == 50 + assert ds.annotations.iloc[0]["ymax"] == 40 def test_BoxDataset_with_projected_shapefile(tmp_path, raster_path): """Test that BoxDataset can load a shapefile with projected coordinates and converts to pixel coordinates""" @@ -352,15 +355,17 @@ def test_validate_coordinates_negative(tmpdir): img_path = os.path.join(tmpdir, "test_neg.jpg") Image.new('RGB', (100, 100), color='white').save(img_path) + csv_file = os.path.join(tmpdir, "neg.csv") df = pd.DataFrame({ 'image_path': ["test_neg.jpg"], 'xmin': [-5], 'ymin': [10], 'xmax': [50], 'ymax': [50], 'label': ["Tree"] }) + df.to_csv(csv_file, index=False) with pytest.raises(ValueError, match="negative coordinates"): - training.BoxDataset(annotation_dict=df, root_dir=str(tmpdir)) + BoxDataset(csv_file=csv_file, root_dir=str(tmpdir)) def test_validate_coordinates_out_of_bounds(tmpdir): """ @@ -369,12 +374,14 @@ def test_validate_coordinates_out_of_bounds(tmpdir): img_path = os.path.join(tmpdir, "test_oob.jpg") Image.new('RGB', (100, 100), color='white').save(img_path) + csv_file = os.path.join(tmpdir, "oob.csv") df = pd.DataFrame({ 'image_path': ["test_oob.jpg"], 'xmin': [10], 'ymin': [10], 'xmax': [150], 'ymax': [50], 'label': ["Tree"] }) + df.to_csv(csv_file, index=False) with pytest.raises(ValueError, match="exceeding image dimensions"): - training.BoxDataset(annotation_dict=df, root_dir=str(tmpdir)) + BoxDataset(csv_file=csv_file, root_dir=str(tmpdir)) From 02b22250001b89b4b1070ec59d14f42629dd6bb8 Mon Sep 17 00:00:00 2001 From: Muhammad Saqlain <2mesaqlain@gmail.com> Date: Thu, 29 Jan 2026 00:48:37 +0500 Subject: [PATCH 4/8] correctly formatted --- src/deepforest/datasets/training.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/deepforest/datasets/training.py b/src/deepforest/datasets/training.py index 6360b12c5..02a53dd7a 100644 --- a/src/deepforest/datasets/training.py +++ b/src/deepforest/datasets/training.py @@ -103,8 +103,8 @@ def _validate_labels(self): ) def _validate_coordinates(self): - """ - Validate that all bounding box coordinates occur within the image. + """Validate that all bounding box coordinates occur within the image. + Vectorized implementation for performance. """ errors = [] From fc98691a170bf2ba484f692720822c8e9d2ba45b Mon Sep 17 00:00:00 2001 From: Muhammad Saqlain <2mesaqlain@gmail.com> Date: Thu, 29 Jan 2026 01:11:55 +0500 Subject: [PATCH 5/8] improved comments --- src/deepforest/datasets/training.py | 7 +++---- tests/test_datasets_training.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/deepforest/datasets/training.py b/src/deepforest/datasets/training.py index 02a53dd7a..b210e8841 100644 --- a/src/deepforest/datasets/training.py +++ b/src/deepforest/datasets/training.py @@ -103,10 +103,7 @@ def _validate_labels(self): ) def _validate_coordinates(self): - """Validate that all bounding box coordinates occur within the image. - - Vectorized implementation for performance. - """ + """Validate that all bounding box coordinates occur within the image""" errors = [] unique_images = self.annotations["image_path"].unique() @@ -144,6 +141,8 @@ def _validate_coordinates(self): cols = ["xmin", "ymin", "xmax", "ymax"] + # All coordinates equal to zero is how we code empty frames. + # These are valid coordinates even though they would fail other checks. empty_mask = (working_df[cols] == 0).all(axis=1) neg_mask = (working_df[cols] < 0).any(axis=1) diff --git a/tests/test_datasets_training.py b/tests/test_datasets_training.py index 487f893a9..7e7a9ee85 100644 --- a/tests/test_datasets_training.py +++ b/tests/test_datasets_training.py @@ -350,7 +350,7 @@ def test_BoxDataset_with_projected_shapefile(tmp_path, raster_path): def test_validate_coordinates_negative(tmpdir): """ - Ensure vectorized validation catches negative coordinates + Ensure validation catches negative coordinates """ img_path = os.path.join(tmpdir, "test_neg.jpg") Image.new('RGB', (100, 100), color='white').save(img_path) @@ -369,7 +369,7 @@ def test_validate_coordinates_negative(tmpdir): def test_validate_coordinates_out_of_bounds(tmpdir): """ - Ensure vectorized validation catches OOB coordinates + Ensure validation catches OOB coordinates """ img_path = os.path.join(tmpdir, "test_oob.jpg") Image.new('RGB', (100, 100), color='white').save(img_path) From 281fb2927c029728c51f15b8b607e9084cb75c8d Mon Sep 17 00:00:00 2001 From: Muhammad Saqlain <2mesaqlain@gmail.com> Date: Thu, 29 Jan 2026 01:15:48 +0500 Subject: [PATCH 6/8] formatting issues resolved --- src/deepforest/datasets/training.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/deepforest/datasets/training.py b/src/deepforest/datasets/training.py index b210e8841..0bcf6e1af 100644 --- a/src/deepforest/datasets/training.py +++ b/src/deepforest/datasets/training.py @@ -103,7 +103,8 @@ def _validate_labels(self): ) def _validate_coordinates(self): - """Validate that all bounding box coordinates occur within the image""" + """Validate that all bounding box coordinates occur within the + image.""" errors = [] unique_images = self.annotations["image_path"].unique() From 9abb8f67c15e2ba025a719a156c0566173a26bd5 Mon Sep 17 00:00:00 2001 From: Muhammad Saqlain <2mesaqlain@gmail.com> Date: Thu, 29 Jan 2026 01:45:57 +0500 Subject: [PATCH 7/8] combine the logic for -ve and OoB errors and then report them to user --- src/deepforest/datasets/training.py | 38 ++++++++--------------------- tests/test_datasets_training.py | 6 ++--- 2 files changed, 13 insertions(+), 31 deletions(-) diff --git a/src/deepforest/datasets/training.py b/src/deepforest/datasets/training.py index 0bcf6e1af..7aef823b3 100644 --- a/src/deepforest/datasets/training.py +++ b/src/deepforest/datasets/training.py @@ -103,8 +103,7 @@ def _validate_labels(self): ) def _validate_coordinates(self): - """Validate that all bounding box coordinates occur within the - image.""" + """Validate that all bounding box coordinates occur within the image.""" errors = [] unique_images = self.annotations["image_path"].unique() @@ -147,37 +146,20 @@ def _validate_coordinates(self): empty_mask = (working_df[cols] == 0).all(axis=1) neg_mask = (working_df[cols] < 0).any(axis=1) - invalid_neg = neg_mask & (~empty_mask) - - if invalid_neg.any(): - bad_count = invalid_neg.sum() - errors.append(f"Found {bad_count} annotations with negative coordinates.") oob_mask = (working_df["xmax"] > working_df["_img_width"]) | ( working_df["ymax"] > working_df["_img_height"] ) - invalid_oob = oob_mask & (~empty_mask) - - if invalid_oob.any(): - bad_rows = working_df[invalid_oob] - bad_count = len(bad_rows) - example_str = ( - bad_rows[ - [ - "image_path", - "xmin", - "ymin", - "xmax", - "ymax", - "_img_width", - "_img_height", - ] - ] - .head() - .to_string() - ) + + bad_mask = (neg_mask | oob_mask) & (~empty_mask) + + if bad_mask.any(): + bad_rows = working_df[bad_mask] + + report = bad_rows[["image_path", "xmin", "ymin", "xmax", "ymax"]] + errors.append( - f"Found {bad_count} boxes exceeding image dimensions. Examples:\n{example_str}" + f"Found {len(bad_rows)} invalid bounding boxes (negative or out-of-bounds):\n{report.to_string()}" ) self.annotations.drop(columns=["_img_width", "_img_height"], inplace=True) diff --git a/tests/test_datasets_training.py b/tests/test_datasets_training.py index 7e7a9ee85..ffb77c538 100644 --- a/tests/test_datasets_training.py +++ b/tests/test_datasets_training.py @@ -256,7 +256,7 @@ def test_BoxDataset_validate_coordinates(tmp_path, raster_path): ) df.to_csv(csv_path, index=False) - with pytest.raises(ValueError, match="negative coordinates|exceeding image dimensions"): + with pytest.raises(ValueError, match="invalid bounding boxes"): BoxDataset(csv_file=csv_path, root_dir=root_dir) @@ -364,7 +364,7 @@ def test_validate_coordinates_negative(tmpdir): }) df.to_csv(csv_file, index=False) - with pytest.raises(ValueError, match="negative coordinates"): + with pytest.raises(ValueError, match="invalid bounding boxes"): BoxDataset(csv_file=csv_file, root_dir=str(tmpdir)) def test_validate_coordinates_out_of_bounds(tmpdir): @@ -383,5 +383,5 @@ def test_validate_coordinates_out_of_bounds(tmpdir): }) df.to_csv(csv_file, index=False) - with pytest.raises(ValueError, match="exceeding image dimensions"): + with pytest.raises(ValueError, match="invalid bounding boxes"): BoxDataset(csv_file=csv_file, root_dir=str(tmpdir)) From 7f018f557ef6b9db7e2ce19070addff2ff6aa7cf Mon Sep 17 00:00:00 2001 From: Muhammad Saqlain <2mesaqlain@gmail.com> Date: Thu, 29 Jan 2026 01:47:29 +0500 Subject: [PATCH 8/8] formatted correctly --- src/deepforest/datasets/training.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/deepforest/datasets/training.py b/src/deepforest/datasets/training.py index 7aef823b3..fa2684b22 100644 --- a/src/deepforest/datasets/training.py +++ b/src/deepforest/datasets/training.py @@ -103,7 +103,8 @@ def _validate_labels(self): ) def _validate_coordinates(self): - """Validate that all bounding box coordinates occur within the image.""" + """Validate that all bounding box coordinates occur within the + image.""" errors = [] unique_images = self.annotations["image_path"].unique()