Add validation checks for bounding boxes

ethanwhite · reddykkeerthi · ethanwhite · commit c8a48c2df42a · 2025-11-18T20:59:17.000-05:00
Checks whether bounding boxes extend outside of image boundaries and clearly communicates this to the user when they do. This is a version of #1015 that accounts for changes in the codebase over the last year. Closes #1014. Co-authored-by: Keerthi Reddy <reddykkeerthi@users.noreply.github.com>
diff --git a/src/deepforest/datasets/training.py b/src/deepforest/datasets/training.py
@@ -71,6 +71,7 @@ def __init__(
         self.preload_images = preload_images
 
         self._validate_labels()
+        self._validate_coordinates()
 
         # Pin data to memory if desired
         if self.preload_images:
@@ -94,6 +95,49 @@ def _validate_labels(self):
                 f"Please ensure all labels in the annotations exist as keys in label_dict."
             )
 
+    def _validate_coordinates(self):
+        """Validate that all bounding box coordinates occur within the image.
+
+        Raises:
+            ValueError: If an image cannot be opened, a geometry's bounds cannot be read, or any bounding box coordinate occurs outside the image
+        """
+        errors = []
+        for idx, row in self.annotations.iterrows():
+            img_path = os.path.join(self.root_dir, row["image_path"])
+            try:
+                with Image.open(img_path) as img:
+                    width, height = img.size
+            except Exception as e:
+                errors.append(f"Failed to open image {img_path}: {e}")
+                continue
+
+            # Extract bounding box
+            try:
+                geom = row["geometry"]
+                xmin, ymin, xmax, ymax = geom.bounds
+            except Exception as e:
+                errors.append(f"Invalid box format at index {idx}: {e}")
+                continue
+
+            # Check if box is valid
+            oob_issues = []
+            if xmin < 0:
+                oob_issues.append(f"xmin ({xmin}) < 0")
+            if xmax > width:
+                oob_issues.append(f"xmax ({xmax}) > image width ({width})")
+            if ymin < 0:
+                oob_issues.append(f"ymin ({ymin}) < 0")
+            if ymax > height:
+                oob_issues.append(f"ymax ({ymax}) > image height ({height})")
+
+            if oob_issues:
+                errors.append(
+                    f"Box, ({xmin}, {ymin}, {xmax}, {ymax}) exceeds image dimensions, ({width}, {height}). Issues: {', '.join(oob_issues)}."
+                )
+
+        if errors:
+            raise ValueError("\n".join(errors))
+
     def __len__(self):
         return len(self.image_names)
 
diff --git a/tests/test_datasets_training.py b/tests/test_datasets_training.py
@@ -6,6 +6,7 @@
 import pandas as pd
 import pytest
 import torch
+from PIL import Image
 
 from deepforest import get_data, main, utilities
 from deepforest.datasets.training import BoxDataset
@@ -25,10 +26,13 @@ def multi_class():
 
 @pytest.fixture()
 def raster_path():
-    return get_data(path='OSBS_029.tif')
+    return get_data(path="OSBS_029.tif")
 
 
-@pytest.mark.parametrize("csv_file,label_dict", [(single_class(), {"Tree": 0}), (multi_class(), {"Alive": 0, "Dead": 1})])
+@pytest.mark.parametrize(
+    "csv_file,label_dict",
+    [(single_class(), {"Tree": 0}), (multi_class(), {"Alive": 0, "Dead": 1})],
+)
 def test_BoxDataset(csv_file, label_dict):
     root_dir = os.path.dirname(get_data("OSBS_029.png"))
     ds = BoxDataset(csv_file=csv_file, root_dir=root_dir, label_dict=label_dict)
@@ -48,7 +52,7 @@ def test_BoxDataset(csv_file, label_dict):
 
 
 def test_single_class_with_empty(tmpdir):
-    """Add fake empty annotations to test parsing """
+    """Add fake empty annotations to test parsing"""
     csv_file1 = get_data("example.csv")
     csv_file2 = get_data("OSBS_029.csv")
 
@@ -64,9 +68,9 @@ def test_single_class_with_empty(tmpdir):
     df.to_csv(f"{tmpdir}_test_empty.csv")
 
     root_dir = os.path.dirname(get_data("OSBS_029.png"))
-    ds = BoxDataset(csv_file=f"{tmpdir}_test_empty.csv",
-                             root_dir=root_dir,
-                             label_dict={"Tree": 0})
+    ds = BoxDataset(
+        csv_file=f"{tmpdir}_test_empty.csv", root_dir=root_dir, label_dict={"Tree": 0}
+    )
     assert len(ds) == 2
     # First image has annotations
     assert not torch.sum(ds[0][1]["boxes"]) == 0
@@ -78,9 +82,11 @@ def test_single_class_with_empty(tmpdir):
 def test_BoxDataset_transform(augment):
     csv_file = get_data("example.csv")
     root_dir = os.path.dirname(csv_file)
-    ds = BoxDataset(csv_file=csv_file,
-                             root_dir=root_dir,
-                             augmentations=["HorizontalFlip"] if augment else None)
+    ds = BoxDataset(
+        csv_file=csv_file,
+        root_dir=root_dir,
+        augmentations=["HorizontalFlip"] if augment else None,
+    )
 
     for i in range(len(ds)):
         # Between 0 and 1
@@ -100,8 +106,7 @@ def test_collate():
     """Due to data augmentations the dataset class may yield empty bounding box annotations"""
     csv_file = get_data("example.csv")
     root_dir = os.path.dirname(csv_file)
-    ds = BoxDataset(csv_file=csv_file,
-                             root_dir=root_dir)
+    ds = BoxDataset(csv_file=csv_file, root_dir=root_dir)
 
     for i in range(len(ds)):
         # Between 0 and 1
@@ -114,8 +119,7 @@ def test_empty_collate():
     """Due to data augmentations the dataset class may yield empty bounding box annotations"""
     csv_file = get_data("example.csv")
     root_dir = os.path.dirname(csv_file)
-    ds = BoxDataset(csv_file=csv_file,
-                             root_dir=root_dir)
+    ds = BoxDataset(csv_file=csv_file, root_dir=root_dir)
 
     for i in range(len(ds)):
         # Between 0 and 1
@@ -145,8 +149,7 @@ def test_multi_image_warning():
     df.to_csv(csv_file)
 
     root_dir = os.path.dirname(csv_file1)
-    ds = BoxDataset(csv_file=csv_file,
-                             root_dir=root_dir)
+    ds = BoxDataset(csv_file=csv_file, root_dir=root_dir)
 
     for i in range(len(ds)):
         # Between 0 and 1
@@ -162,7 +165,9 @@ def test_label_validation__training_csv():
     m.config.train.root_dir = os.path.dirname(get_data("example.csv"))
     m.create_trainer()
 
-    with pytest.raises(ValueError, match="Labels \\['Tree'\\] are missing from label_dict"):
+    with pytest.raises(
+        ValueError, match="Labels \\['Tree'\\] are missing from label_dict"
+    ):
         m.trainer.fit(m)
 
 
@@ -171,11 +176,15 @@ def test_csv_label_validation__validation_csv(m):
     m = main.deepforest(config_args={"num_classes": 1, "label_dict": {"Tree": 0}})
     m.config.train.csv_file = get_data("example.csv")  # contains 'Tree' label
     m.config.train.root_dir = os.path.dirname(get_data("example.csv"))
-    m.config.validation.csv_file = get_data("testfile_multi.csv")  # contains 'Dead', 'Alive' labels
+    m.config.validation.csv_file = get_data(
+        "testfile_multi.csv"
+    )  # contains 'Dead', 'Alive' labels
     m.config.validation.root_dir = os.path.dirname(get_data("testfile_multi.csv"))
     m.create_trainer()
 
-    with pytest.raises(ValueError, match="Labels \\['Dead', 'Alive'\\] are missing from label_dict"):
+    with pytest.raises(
+        ValueError, match="Labels \\['Dead', 'Alive'\\] are missing from label_dict"
+    ):
         m.trainer.fit(m)
 
 
@@ -191,17 +200,73 @@ def test_BoxDataset_validate_labels():
     # Should not raise an error
 
     # Invalid case: CSV labels are not in label_dict
-    with pytest.raises(ValueError, match="Labels \\['Tree'\\] are missing from label_dict"):
+    with pytest.raises(
+        ValueError, match="Labels \\['Tree'\\] are missing from label_dict"
+    ):
         BoxDataset(csv_file=csv_file, root_dir=root_dir, label_dict={"Bird": 0})
 
 
+def test_validate_BoxDataset_missing_image(tmpdir, raster_path):
+    csv_path = os.path.join(tmpdir, "test.csv")
+    df = pd.DataFrame(
+        {
+            "image_path": ["missing.tif"],
+            "xmin": 0,
+            "ymin": 0,
+            "xmax": 10,
+            "ymax": 10,
+            "label": ["Tree"],
+        }
+    )
+    df.to_csv(csv_path, index=False)
+    root_dir = os.path.dirname(raster_path)
+    with pytest.raises(ValueError, match="Failed to open image"):
+        _ = BoxDataset(csv_file=csv_path, root_dir=root_dir)
+
+
+def test_BoxDataset_validate_coordinates(tmpdir, raster_path):
+    # Valid case: uses example.csv with all valid boxes
+    csv_path = get_data("example.csv")
+    root_dir = os.path.dirname(csv_path)
+    _ = BoxDataset(csv_file=csv_path, root_dir=root_dir)
+
+    # Test various invalid box coordinates
+    with Image.open(raster_path) as image:
+        width, height = image.size
+
+    invalid_boxes = [
+        (width - 5, 0, width + 10, 10),  # xmax exceeds width
+        (0, height - 5, 10, height + 10),  # ymax exceeds height
+        (-5, 0, 10, 10),  # negative xmin
+        (0, -5, 10, 10),  # negative ymin
+    ]
+
+    for box in invalid_boxes:
+        csv_path = os.path.join(tmpdir, "test.csv")
+        df = pd.DataFrame(
+            {
+                "image_path": ["OSBS_029.tif"],
+                "xmin": [box[0]],
+                "ymin": [box[1]],
+                "xmax": [box[2]],
+                "ymax": [box[3]],
+                "label": ["Tree"],
+            }
+        )
+        df.to_csv(csv_path, index=False)
+
+        with pytest.raises(ValueError, match="exceeds image dimensions"):
+            BoxDataset(csv_file=csv_path, root_dir=root_dir)
+
+
 def test_BoxDataset_with_projected_shapefile(tmpdir, raster_path):
     """Test that BoxDataset can load a shapefile with projected coordinates and converts to pixel coordinates"""
     import geopandas as gpd
 
     # Get the raster to extract CRS and bounds
     import rasterio
     from shapely import geometry
+
     with rasterio.open(raster_path) as src:
         raster_crs = src.crs
         bounds = src.bounds
@@ -216,12 +281,19 @@ def test_BoxDataset_with_projected_shapefile(tmpdir, raster_path):
 
     sample_geometry = [
         geometry.box(sample_x, sample_y, sample_x + box_size, sample_y + box_size),
-        geometry.box(sample_x + box_size * 2, sample_y + box_size * 2, sample_x + box_size * 3, sample_y + box_size * 3)
+        geometry.box(
+            sample_x + box_size * 2,
+            sample_y + box_size * 2,
+            sample_x + box_size * 3,
+            sample_y + box_size * 3,
+        ),
     ]
     labels = ["Tree", "Tree"]
     image_path = os.path.basename(raster_path)
 
-    df = pd.DataFrame({"geometry": sample_geometry, "label": labels, "image_path": image_path})
+    df = pd.DataFrame(
+        {"geometry": sample_geometry, "label": labels, "image_path": image_path}
+    )
     gdf = gpd.GeoDataFrame(df, geometry="geometry", crs=raster_crs)
 
     # Save as shapefile
@@ -241,6 +313,8 @@ def test_BoxDataset_with_projected_shapefile(tmpdir, raster_path):
     # Verify boxes are in pixel coordinates (should be positive and reasonable)
     # After geo_to_image_coordinates conversion, values should be in pixel space
     boxes = targets["boxes"]
-    assert torch.all(boxes >= 0), "Boxes should have non-negative coordinates in pixel space"
+    assert torch.all(boxes >= 0), (
+        "Boxes should have non-negative coordinates in pixel space"
+    )
     assert torch.all(boxes[:, 2] > boxes[:, 0]), "xmax should be greater than xmin"
     assert torch.all(boxes[:, 3] > boxes[:, 1]), "ymax should be greater than ymin"