diff --git a/keras_hub/api/layers/__init__.py b/keras_hub/api/layers/__init__.py
index aacee7818e..a2a0b90734 100644
--- a/keras_hub/api/layers/__init__.py
+++ b/keras_hub/api/layers/__init__.py
@@ -147,6 +147,9 @@
 from keras_hub.src.models.vit.vit_image_converter import (
     ViTImageConverter as ViTImageConverter,
 )
+from keras_hub.src.models.vit_det.vit_det_image_converter import (
+    ViTDetImageConverter as ViTDetImageConverter,
+)
 from keras_hub.src.models.whisper.whisper_audio_converter import (
     WhisperAudioConverter as WhisperAudioConverter,
 )
diff --git a/keras_hub/src/models/vit_det/vit_det_backbone.py b/keras_hub/src/models/vit_det/vit_det_backbone.py
index e595d7749a..fec55ad335 100644
--- a/keras_hub/src/models/vit_det/vit_det_backbone.py
+++ b/keras_hub/src/models/vit_det/vit_det_backbone.py
@@ -1,5 +1,4 @@
 import keras
-from keras import ops
 
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.models.backbone import Backbone
@@ -105,10 +104,6 @@ def __init__(
         )
         img_size = img_input.shape[-3]
         x = img_input
-        # VITDet scales inputs based on the standard ImageNet mean/stddev.
-        x = (x - ops.array([0.485, 0.456, 0.406], dtype=x.dtype)) / (
-            ops.array([0.229, 0.224, 0.225], dtype=x.dtype)
-        )
         x = ViTDetPatchingAndEmbedding(
             kernel_size=(patch_size, patch_size),
             strides=(patch_size, patch_size),
diff --git a/keras_hub/src/models/vit_det/vit_det_image_converter.py b/keras_hub/src/models/vit_det/vit_det_image_converter.py
new file mode 100644
index 0000000000..5958c02ff6
--- /dev/null
+++ b/keras_hub/src/models/vit_det/vit_det_image_converter.py
@@ -0,0 +1,40 @@
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.layers.preprocessing.image_converter import ImageConverter
+from keras_hub.src.models.vit_det.vit_det_backbone import ViTDetBackbone
+
+
+@keras_hub_export("keras_hub.layers.ViTDetImageConverter")
+class ViTDetImageConverter(ImageConverter):
+    """Image converter for ViTDet models.
+
+    This layer applies ImageNet normalization (mean=[0.485, 0.456, 0.406],
+    std=[0.229, 0.224, 0.225]) to input images for ViTDet models.
+
+    Args:
+        image_size: int or tuple of (height, width). The output size of the
+            image. Defaults to `(1024, 1024)`.
+
+    Example:
+    ```python
+    converter = keras_hub.layers.ViTDetImageConverter(image_size=(1024, 1024))
+    converter(np.random.rand(1, 512, 512, 3))  # Resizes and normalizes
+    ```
+    """
+
+    backbone_cls = ViTDetBackbone
+
+    def __init__(
+        self,
+        image_size=(1024, 1024),
+        **kwargs,
+    ):
+        mean = [0.485, 0.456, 0.406]
+        std = [0.229, 0.224, 0.225]
+        variance = [x**2 for x in std]
+        super().__init__(
+            image_size=image_size,
+            scale=1.0 / 255.0,  # Scale to [0, 1]
+            mean=mean,
+            variance=variance,
+            **kwargs,
+        )
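
Reviewer note: the ImageNet normalization removed from `ViTDetBackbone` above is now expressed through the converter's `scale`, `mean`, and `variance` arguments. A minimal sketch of the intended equivalence, assuming `ImageConverter` rescales pixel values first and then subtracts the mean and divides by the standard deviation; the NumPy names below are illustrative and not part of this PR:

```python
import numpy as np

# Illustrative RGB batch with values in [0, 255]; not part of the PR.
images = np.random.uniform(0, 255, size=(1, 1024, 1024, 3)).astype("float32")

mean = np.array([0.485, 0.456, 0.406], dtype="float32")
std = np.array([0.229, 0.224, 0.225], dtype="float32")

# What ViTDetImageConverter is configured to compute in this diff:
# scale to [0, 1], then normalize with the ImageNet mean/std
# (note that variance = std ** 2 is what gets passed to super().__init__).
expected = (images / 255.0 - mean) / std
```

With this change the backbone no longer normalizes inline via `keras.ops`, so it expects inputs that have already passed through the converter (or equivalent preprocessing).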