Transformers/DETR integration (#1078)

jveitchmichaelis · web-flow · commit 6bf5ecb6fdc0 · 2025-07-04T02:18:38.000-04:00
Add Transformers-based Deformable DETR and AutoModelForObjectDetection
integration for object detection tasks
diff --git a/docs/user_guide/13_annotation.md b/docs/user_guide/13_annotation.md
@@ -22,7 +22,7 @@ An incomplete list of annotation tools DeepForest users have reported success wi
 - AWS Ground Truth
 - LabelBox
 - Roboflow
-- and many more  
+- and many more
 
 We intentionally do not create our own annotation tools, but rather focus on supporting community-created tools. Look for exports in `.xml`, `.json`, or `.csv` formats, which are all common in the above tools.
 
@@ -142,9 +142,7 @@ for path in files:
     if boxes is None:
         continue
 
-    image = np.rollaxis(image, 0, 3)
-    fig = plot_predictions(df=boxes, image=image)
-    plt.imshow(fig)
+    plot_results(results=boxes, image=image)
 
     basename = os.path.splitext(os.path.basename(path))[0]
     shp = boxes_to_shapefile(boxes, root_dir=PATH_TO_DIR, projected=False)
@@ -183,4 +181,4 @@ Avoid collecting all annotations before model testing. Start with a small number
 
 # Please Make Your Annotations Open-Source!
 
-DeepForest's models are not perfect. Please consider sharing your annotations with the community to make the models stronger. You can post your annotations on **Zenodo** or open an [issue](https://github.com/weecology/DeepForest/issues) to share your data with the maintainers.
+DeepForest's models are not perfect. Please consider sharing your annotations with the community to make the models stronger. You can post your annotations on **Zenodo** or open an [issue](https://github.com/weecology/DeepForest/issues) to share your data with the maintainers.
diff --git a/pyproject.toml b/pyproject.toml
@@ -71,6 +71,8 @@ dependencies = [
     "torchvision>=0.13",
     "tqdm",
     "xmltodict",
+    "transformers>=4.46.3",
+    "timm>=1.0.15",
 ]
 
 [project.optional-dependencies]
diff --git a/src/deepforest/conf/config.yaml b/src/deepforest/conf/config.yaml
@@ -11,6 +11,7 @@ batch_size: 1
 architecture: 'retinanet'
 num_classes: 1
 nms_thresh: 0.05
+score_thresh: 0.1
 
 model:
     name: 'weecology/deepforest-tree'
@@ -24,11 +25,6 @@ annotations_xml:
 rgb_dir:
 path_to_rgb:
 
-# Architecture specific params
-retinanet:
-    # Non-max suppression of overlapping predictions
-    score_thresh: 0.1
-
 train:
     csv_file:
     root_dir:
diff --git a/src/deepforest/main.py b/src/deepforest/main.py
@@ -63,7 +63,8 @@ def __init__(
         # If not provided, load default config via hydra.
         if config is None:
             config = utilities.load_config(overrides=config_args)
-        elif 'config_file' in config:
+        # Hub overrides
+        elif 'config_file' in config or 'config_args' in config:
             config = utilities.load_config(overrides=config['config_args'])
         elif config_args is not None:
             warnings.warn(
@@ -118,7 +119,7 @@ def __init__(
 
         self.save_hyperparameters()
 
-    def load_model(self, model_name="weecology/deepforest-tree", revision='main'):
+    def load_model(self, model_name=None, revision=None):
         """Loads a model that has already been pretrained for a specific task,
         like tree crown detection.
 
@@ -136,16 +137,22 @@ def load_model(self, model_name="weecology/deepforest-tree", revision='main'):
         Returns:
             None
         """
+
+        if model_name is None:
+            model_name = self.config.model.name
+
+        if revision is None:
+            revision = self.config.model.revision
+
         # Load the model using from_pretrained
-        self.create_model()
         loaded_model = self.from_pretrained(model_name, revision=revision)
         self.label_dict = loaded_model.label_dict
         self.model = loaded_model.model
         self.numeric_to_label_dict = loaded_model.numeric_to_label_dict
 
         # Set bird-specific settings if loading the bird model
         if model_name == "weecology/deepforest-bird":
-            self.config.retinanet.score_thresh = 0.3
+            self.config.score_thresh = 0.3
             self.label_dict = {"Bird": 0}
             self.numeric_to_label_dict = {v: k for k, v in self.label_dict.items()}
 
diff --git a/src/deepforest/model.py b/src/deepforest/model.py
@@ -21,17 +21,14 @@ class Model():
     statement below.
 
     Args:
-        num_classes (int): number of classes in the model
-        nms_thresh (float): non-max suppression threshold for intersection-over-union [0,1]
-        score_thresh (float): minimum prediction score to keep during prediction  [0,1]
-    Returns:
-        model: a pytorch nn module
+        config (DictConfig): DeepForest config settings object
     """
 
     def __init__(self, config):
 
         # Check for required properties and formats
         self.config = config
+        self.nms_thresh = None  # Required for some models but not all
 
     def create_model(self):
         """This function converts a deepforest config file into a model.
diff --git a/src/deepforest/models/DeformableDetr.py b/src/deepforest/models/DeformableDetr.py
@@ -0,0 +1,107 @@
+import warnings
+from transformers import DeformableDetrForObjectDetection, DeformableDetrImageProcessor, logging
+from deepforest.model import Model
+from torch import nn
+
+# Suppress huge amounts of unnecessary warnings from transformers.
+logging.set_verbosity_error()
+
+
+class DeformableDetrWrapper(nn.Module):
+    """This class wraps a transformers DeformableDetrForObjectDetection model
+    so that input pre- and post-processing happens transparently."""
+
+    def __init__(self, config, name, revision):
+        """Initialize a DeformableDetrForObjectDetection model.
+
+        We assume that the provided name applies to both model and
+        processor. By default this function creates a model with MS-COCO
+        initialized weights, but can be overridden if needed.
+        """
+        super().__init__()
+        self.config = config
+
+        # This suppresses a bunch of messages which are specific to DETR,
+        # but do not impact model function.
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", category=UserWarning)
+
+            self.net = DeformableDetrForObjectDetection.from_pretrained(
+                name,
+                revision=revision,
+                num_labels=self.config.num_classes,
+                ignore_mismatched_sizes=True)
+            self.processor = DeformableDetrImageProcessor.from_pretrained(
+                name, revision=revision)
+
+    def _prepare_targets(self, targets):
+
+        if not isinstance(targets, list):
+            targets = [targets]
+
+        coco_targets = []
+
+        for target in targets:
+            coco_targets.append({
+                "image_id":
+                    0,
+                "annotations": [{
+                    "id": i,
+                    "image_id": i,
+                    "category_id": label,
+                    "bbox": box.tolist(),
+                    "area": (box[3] - box[1]) * (box[2] - box[0]),
+                    "iscrowd": 0,
+                } for i, (label, box) in enumerate(zip(target["labels"], target["boxes"]))
+                               ]
+            })
+
+        return coco_targets
+
+    def forward(self, images, targets=None, prepare_targets=True):
+        """DeformableDetrForObjectDetection forward pass. If targets are provided
+        the function returns a loss dictionary, otherwise it returns processed
+        predictions. For details, see the transformers documentation for
+        "post_process_object_detection".
+
+        Returns:
+            predictions: list of dictionaries with "scores", "boxes" and "labels", or
+                          a loss dict for training.
+        """
+
+        if targets and prepare_targets:
+            targets = self._prepare_targets(targets)
+
+        encoded_inputs = self.processor.preprocess(images=images,
+                                                   annotations=targets,
+                                                   return_tensors="pt",
+                                                   do_rescale=False)
+
+        preds = self.net(**encoded_inputs)
+
+        if targets is None:
+            return self.processor.post_process_object_detection(
+                preds,
+                threshold=self.config.score_thresh,
+                target_sizes=[i.shape[-2:] for i in images]
+                if isinstance(images, list) else [images.shape[-2:]])
+        else:
+            return preds.loss_dict
+
+
+class Model(Model):
+
+    def __init__(self, config, **kwargs):
+        """
+        Args:
+        """
+        super().__init__(config)
+
+    def create_model(self, name="SenseTime/deformable-detr", revision="main"):
+        """Create a Deformable DETR model from pretrained weights.
+
+        The number of classes is set via config and will override the
+        downloaded checkpoint. The default weights will load a model
+        trained on MS-COCO that should fine-tune well on other tasks.
+        """
+        return DeformableDetrWrapper(self.config, name, revision)
diff --git a/src/deepforest/models/retinanet.py b/src/deepforest/models/retinanet.py
@@ -41,11 +41,8 @@ def create_anchor_generator(self,
         return anchor_generator
 
     def create_model(self):
-        """Create a retinanet model
-        Args:
-            num_classes (int): number of classes in the model
-            nms_thresh (float): non-max suppression threshold for intersection-over-union [0,1]
-            score_thresh (float): minimum prediction score to keep during prediction  [0,1]
+        """Create a retinanet model.
+
         Returns:
             model: a pytorch nn module
         """
@@ -54,7 +51,7 @@ def create_model(self):
 
         model = RetinaNet(backbone=backbone, num_classes=self.config.num_classes)
         model.nms_thresh = self.config.nms_thresh
-        model.score_thresh = self.config.retinanet.score_thresh
+        model.score_thresh = self.config.score_thresh
 
         # Optionally allow anchor generator parameters to be created here
         # https://pytorch.org/vision/stable/_modules/torchvision/models/detection/retinanet.html
diff --git a/tests/deepforest_config_test.yml b/tests/deepforest_config_test.yml
@@ -11,16 +11,12 @@ batch_size: 1
 architecture: 'retinanet'
 num_classes: 1
 nms_thresh: 0.05
-
-# Architecture specific params
-retinanet:
-    # Non-max suppression of overlapping predictions
-    score_thresh: 0.1
+score_thresh: 0.1
 
 train:
     csv_file:
     root_dir:
-    
+
     # Optimizer initial learning rate
     lr: 0.001
     scheduler:
@@ -50,10 +46,10 @@ train:
     fast_dev_run: False
     # pin images to GPU memory for fast training. This depends on GPU size and number of images.
     preload_images: False
-    
+
 validation:
     # callback args
-    csv_file: 
+    csv_file:
     root_dir:
     # Intersection over union evaluation
     iou_threshold: 0.4
diff --git a/tests/test_FasterRCNN.py b/tests/test_FasterRCNN.py
@@ -26,7 +26,7 @@ def _make_empty_sample():
     return images, targets
 
 
-def test_retinanet(config):
+def test_faster_rcnn(config):
     r = FasterRCNN.Model(config)
     assert r
 
@@ -48,10 +48,10 @@ def test_check_model(config):
 @pytest.mark.parametrize("num_classes", [1, 2, 10])
 def test_create_model(config, num_classes):
     config.num_classes = num_classes
-    retinanet_model = FasterRCNN.Model(config).create_model()
-    retinanet_model.eval()
+    faster_rcnn_model = FasterRCNN.Model(config).create_model()
+    faster_rcnn_model.eval()
     x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
-    predictions = retinanet_model(x)
+    predictions = faster_rcnn_model(x)
 
 
 def test_forward_empty(config):
diff --git a/tests/test_detr.py b/tests/test_detr.py
diff --git a/tests/test_main.py b/tests/test_main.py
diff --git a/tests/test_retinanet.py b/tests/test_retinanet.py

Original file line number	Diff line number	Diff line change
`@@ -71,6 +71,8 @@ dependencies = [`
`71`	`71`	`"torchvision>=0.13",`
`72`	`72`	`"tqdm",`
`73`	`73`	`"xmltodict",`
	`74`	`+ "transformers>=4.46.3",`
	`75`	`+ "timm>=1.0.15",`
`74`	`76`	`]`
`75`	`77`
`76`	`78`	`[project.optional-dependencies]`