From 1faafe0a676254fd65240aa32b646440e5373b98 Mon Sep 17 00:00:00 2001
From: Hongyu Chiu <20734616+james77777778@users.noreply.github.com>
Date: Sun, 19 Oct 2025 14:33:58 +0800
Subject: [PATCH 1/5] Add DINOV3 with help from Gemini CLI.

---
 keras_hub/api/layers/__init__.py              |   3 +
 keras_hub/api/models/__init__.py              |   3 +
 keras_hub/src/models/dinov3/__init__.py       |   5 +
 .../src/models/dinov3/dinov3_backbone.py      | 200 +++++
 .../models/dinov3/dinov3_image_converter.py   |   8 +
 keras_hub/src/models/dinov3/dinov3_layers.py  | 722 ++++++++++++++++++
 keras_hub/src/models/dinov3/dinov3_presets.py |   4 +
 .../src/utils/transformers/convert_dinov3.py  | 100 +++
 .../src/utils/transformers/preset_loader.py   |   3 +
 .../convert_dinov3_checkpoints.py             | 136 ++++
 10 files changed, 1184 insertions(+)
 create mode 100644 keras_hub/src/models/dinov3/__init__.py
 create mode 100644 keras_hub/src/models/dinov3/dinov3_backbone.py
 create mode 100644 keras_hub/src/models/dinov3/dinov3_image_converter.py
 create mode 100644 keras_hub/src/models/dinov3/dinov3_layers.py
 create mode 100644 keras_hub/src/models/dinov3/dinov3_presets.py
 create mode 100644 keras_hub/src/utils/transformers/convert_dinov3.py
 create mode 100644 tools/checkpoint_conversion/convert_dinov3_checkpoints.py

diff --git a/keras_hub/api/layers/__init__.py b/keras_hub/api/layers/__init__.py
index aacee7818e..58afa65934 100644
--- a/keras_hub/api/layers/__init__.py
+++ b/keras_hub/api/layers/__init__.py
@@ -93,6 +93,9 @@
 from keras_hub.src.models.dinov2.dinov2_image_converter import (
     DINOV2ImageConverter as DINOV2ImageConverter,
 )
+from keras_hub.src.models.dinov3.dinov3_image_converter import (
+    DINOV3ImageConverter as DINOV3ImageConverter,
+)
 from keras_hub.src.models.efficientnet.efficientnet_image_converter import (
     EfficientNetImageConverter as EfficientNetImageConverter,
 )
diff --git a/keras_hub/api/models/__init__.py b/keras_hub/api/models/__init__.py
index b90dde2cc7..72eebbf64a 100644
--- a/keras_hub/api/models/__init__.py
+++ b/keras_hub/api/models/__init__.py
@@ -184,6 +184,9 @@
 from keras_hub.src.models.dinov2.dinov2_backbone import (
     DINOV2Backbone as DINOV2Backbone,
 )
+from keras_hub.src.models.dinov3.dinov3_backbone import (
+    DINOV3Backbone as DINOV3Backbone,
+)
 from keras_hub.src.models.distil_bert.distil_bert_backbone import (
     DistilBertBackbone as DistilBertBackbone,
 )
diff --git a/keras_hub/src/models/dinov3/__init__.py b/keras_hub/src/models/dinov3/__init__.py
new file mode 100644
index 0000000000..1752b3c838
--- /dev/null
+++ b/keras_hub/src/models/dinov3/__init__.py
@@ -0,0 +1,5 @@
+from keras_hub.src.models.dinov3.dinov3_backbone import DINOV3Backbone
+from keras_hub.src.models.dinov3.dinov3_presets import backbone_presets
+from keras_hub.src.utils.preset_utils import register_presets
+
+register_presets(backbone_presets, DINOV3Backbone)
diff --git a/keras_hub/src/models/dinov3/dinov3_backbone.py b/keras_hub/src/models/dinov3/dinov3_backbone.py
new file mode 100644
index 0000000000..385247d719
--- /dev/null
+++ b/keras_hub/src/models/dinov3/dinov3_backbone.py
@@ -0,0 +1,200 @@
+from keras import layers
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.dinov3.dinov3_layers import DINOV3Embedding
+from keras_hub.src.models.dinov3.dinov3_layers import DINOV3Encoder
+from keras_hub.src.models.dinov3.dinov3_layers import (
+    DINOV3RopePositionEmbedding,
+)
+from keras_hub.src.models.feature_pyramid_backbone import FeaturePyramidBackbone
+from keras_hub.src.utils.keras_utils import standardize_data_format
+
+
+@keras_hub_export("keras_hub.models.DINOV3Backbone")
+class DINOV3Backbone(FeaturePyramidBackbone):
+    """DINOV3 core network with hyperparameters.
+
+    Args:
+        patch_size: int. The size of each square patch in the input image.
+        num_layers: int. The number of transformer layers.
+        hidden_dim: int. The size of the transformer hidden state at the end
+            of each transformer layer.
+        num_heads: int. The number of attention heads for each transformer.
+        intermediate_dim: int. The output dimension of the first Dense layer in
+            a two-layer feedforward network for each transformer.
+        layer_scale_init_value: float. The initial value for the layer scale in
+            the transformer layers. Defaults to `1.0`.
+        num_register_tokens: int. The number of register tokens to use in the
+            embedding layer. Defaults to `4`.
+        use_mask_token: bool. Whether to use a mask token in the embedding
+            layer. Defaults to `True`.
+        use_gated_mlp: bool. Whether to use Gated MLP layers. Defaults to
+            `False`.
+        attention_dropout: float. The dropout rate for the attention
+            probabilities. Defaults to `0.0`.
+        drop_path_rate: float. The drop path rate to use. Defaults to `0.0`.
+        image_shape: tuple. The input shape without the batch size. Defaults to
+            `(518, 518, 3)`.
+        rope_theta: float. The base period of the rotary position embeddings.
+        apply_layernorm: bool. Whether to apply layer normalization to the
+            outputs of each stage in the feature pyramid. Defaults to `False`.
+        query_bias: bool. Whether to use a bias for the query projection.
+        key_bias: bool. Whether to use a bias for the key projection.
+        value_bias: bool. Whether to use a bias for the value projection.
+        proj_bias: bool. Whether to use a bias for the output projection.
+        data_format: `None` or str. If specified, either `"channels_last"` or
+            `"channels_first"`. The ordering of the dimensions in the
+            inputs. `"channels_last"` corresponds to inputs with shape
+            `(batch_size, height, width, channels)`
+            while `"channels_first"` corresponds to inputs with shape
+            `(batch_size, channels, height, width)`. It defaults to the
+            `image_data_format` value found in your Keras config file at
+            `~/.keras/keras.json`. If you never set it, then it will be
+            `"channels_last"`.
+        dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
+            for the model's computations and weights. Note that some
+            computations, such as softmax and layer normalization, will always
+            be done in float32 precision regardless of dtype.
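+
+    Example:
+    ```python
+    # Illustrative usage with a small, made-up configuration; no
+    # pretrained DINOV3 presets are registered in this PR yet.
+    import numpy as np
+
+    import keras_hub
+
+    images = np.ones((1, 224, 224, 3), dtype="float32")
+    backbone = keras_hub.models.DINOV3Backbone(
+        patch_size=16,
+        num_layers=2,
+        hidden_dim=32,
+        num_heads=2,
+        intermediate_dim=128,
+        image_shape=(224, 224, 3),
+    )
+    # 1 class token + 4 register tokens + (224 // 16) ** 2 patch tokens.
+    outputs = backbone({"images": images})
+    print(outputs.shape)  # (1, 201, 32)
+    ```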
+ """ + + def __init__( + self, + patch_size, + num_layers, + hidden_dim, + num_heads, + intermediate_dim, + layer_scale_init_value=1.0, + num_register_tokens=4, + use_mask_token=True, + use_gated_mlp=False, + attention_dropout=0.0, + drop_path_rate=0.0, + image_shape=(518, 518, 3), + rope_theta=10000.0, + apply_layernorm=False, + query_bias=True, + key_bias=True, + value_bias=True, + proj_bias=True, + data_format=None, + dtype=None, + name=None, + **kwargs, + ): + data_format = standardize_data_format(data_format) + + prefix = str(name) + "_" if name is not None else "" + + # === Layers === + self.embeddings = DINOV3Embedding( + hidden_dim=hidden_dim, + patch_size=patch_size, + num_register_tokens=num_register_tokens, + use_mask_token=use_mask_token, + data_format=data_format, + dtype=dtype, + name=f"{prefix}embeddings", + ) + self.rope_embedding = DINOV3RopePositionEmbedding( + hidden_dim=hidden_dim, + num_heads=num_heads, + rope_theta=rope_theta, + patch_size=patch_size, + dtype=dtype, + name=f"{prefix}rope_embedding", + ) + self.encoder = DINOV3Encoder( + num_layers=num_layers, + hidden_dim=hidden_dim, + num_heads=num_heads, + intermediate_dim=intermediate_dim, + layer_scale_init_value=layer_scale_init_value, + use_gated_mlp=use_gated_mlp, + attention_dropout=attention_dropout, + drop_path_rate=drop_path_rate, + query_bias=query_bias, + key_bias=key_bias, + value_bias=value_bias, + proj_bias=proj_bias, + dtype=dtype, + name=f"{prefix}encoder", + ) + self.layernorm = layers.LayerNormalization( + epsilon=1e-6, dtype=dtype, name=f"{prefix}layernorm" + ) + + # === Functional Model === + pyramid_outputs = {} + image_input = layers.Input(shape=image_shape, name="images") + x = self.embeddings(image_input) + pyramid_outputs["stem"] = x + + position_embeddings = self.rope_embedding(image_input) + num_prefix_tokens = 1 + num_register_tokens + + x, encoder_pyramid_outputs = self.encoder( + x, + position_embeddings=position_embeddings, + num_prefix_tokens=num_prefix_tokens, + ) + pyramid_outputs.update(encoder_pyramid_outputs) + x = self.layernorm(x) + if apply_layernorm: + for key in pyramid_outputs: + pyramid_outputs[key] = self.layernorm(pyramid_outputs[key]) + outputs = x + super().__init__( + inputs={"images": image_input}, + outputs=outputs, + dtype=dtype, + name=name, + **kwargs, + ) + + # === Config === + self.patch_size = int(patch_size) + self.num_layers = int(num_layers) + self.hidden_dim = int(hidden_dim) + self.num_heads = int(num_heads) + self.intermediate_dim = int(intermediate_dim) + self.layer_scale_init_value = float(layer_scale_init_value) + self.num_register_tokens = int(num_register_tokens) + self.use_mask_token = bool(use_mask_token) + self.use_gated_mlp = bool(use_gated_mlp) + self.attention_dropout = float(attention_dropout) + self.drop_path_rate = float(drop_path_rate) + self.image_shape = image_shape + self.rope_theta = rope_theta + self.apply_layernorm = apply_layernorm + self.query_bias = query_bias + self.key_bias = key_bias + self.value_bias = value_bias + self.proj_bias = proj_bias + self.pyramid_outputs = pyramid_outputs + + def get_config(self): + config = super().get_config() + config.update( + { + "patch_size": self.patch_size, + "num_layers": self.num_layers, + "hidden_dim": self.hidden_dim, + "num_heads": self.num_heads, + "intermediate_dim": self.intermediate_dim, + "layer_scale_init_value": self.layer_scale_init_value, + "num_register_tokens": self.num_register_tokens, + "use_mask_token": self.use_mask_token, + "use_gated_mlp": self.use_gated_mlp, + 
"attention_dropout": self.attention_dropout, + "drop_path_rate": self.drop_path_rate, + "image_shape": self.image_shape, + "rope_theta": self.rope_theta, + "apply_layernorm": self.apply_layernorm, + "query_bias": self.query_bias, + "key_bias": self.key_bias, + "value_bias": self.value_bias, + "proj_bias": self.proj_bias, + } + ) + return config diff --git a/keras_hub/src/models/dinov3/dinov3_image_converter.py b/keras_hub/src/models/dinov3/dinov3_image_converter.py new file mode 100644 index 0000000000..54b08eacf3 --- /dev/null +++ b/keras_hub/src/models/dinov3/dinov3_image_converter.py @@ -0,0 +1,8 @@ +from keras_hub.src.api_export import keras_hub_export +from keras_hub.src.layers.preprocessing.image_converter import ImageConverter +from keras_hub.src.models.dinov3.dinov3_backbone import DINOV3Backbone + + +@keras_hub_export("keras_hub.layers.DINOV3ImageConverter") +class DINOV3ImageConverter(ImageConverter): + backbone_cls = DINOV3Backbone diff --git a/keras_hub/src/models/dinov3/dinov3_layers.py b/keras_hub/src/models/dinov3/dinov3_layers.py new file mode 100644 index 0000000000..1eb80141ee --- /dev/null +++ b/keras_hub/src/models/dinov3/dinov3_layers.py @@ -0,0 +1,722 @@ +import math + +from keras import initializers +from keras import layers +from keras import ops +from keras import random + +from keras_hub.src.utils.keras_utils import standardize_data_format + + +class DINOV3PatchEmbedding(layers.Layer): + """A layer that converts images into patches. + + Args: + hidden_dim: int. The number of units in the hidden layers. + patch_size: int. The size of one side of each patch. + data_format: `None` or str. If specified, either `"channels_last"` or + `"channels_first"`. The ordering of the dimensions in the + inputs. `"channels_last"` corresponds to inputs with shape + `(batch_size, height, width, channels)` + while `"channels_first"` corresponds to inputs with shape + `(batch_size, channels, height, width)`. It defaults to the + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json`. If you never set it, then it will be + `"channels_last"`. + **kwargs: other keyword arguments passed to `keras.layers.Layer`, + including `name`, `dtype` etc. 
+    """
+
+    def __init__(self, hidden_dim, patch_size, data_format=None, **kwargs):
+        super().__init__(**kwargs)
+        self.hidden_dim = int(hidden_dim)
+        self.patch_size = int(patch_size)
+        self.data_format = standardize_data_format(data_format)
+
+        self.projection = layers.Conv2D(
+            hidden_dim,
+            kernel_size=patch_size,
+            strides=patch_size,
+            data_format=data_format,
+            kernel_initializer=initializers.TruncatedNormal(stddev=0.02),
+            dtype=self.dtype_policy,
+            name="projection",
+        )
+
+    def build(self, input_shape):
+        self.projection.build(input_shape)
+
+    def call(self, inputs, training=None):
+        batch_size = ops.shape(inputs)[0]
+        embeddings = self.projection(inputs, training=training)
+        if self.data_format == "channels_last":
+            embeddings = ops.reshape(
+                embeddings, (batch_size, -1, self.hidden_dim)
+            )
+        else:
+            embeddings = ops.reshape(
+                embeddings, (batch_size, self.hidden_dim, -1)
+            )
+            embeddings = ops.transpose(embeddings, (0, 2, 1))
+        return embeddings
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "hidden_dim": self.hidden_dim,
+                "patch_size": self.patch_size,
+            }
+        )
+        return config
+
+    def compute_output_shape(self, input_shape):
+        output_shape = [input_shape[0], None, self.hidden_dim]
+        if self.data_format == "channels_last":
+            if input_shape[1] is not None and input_shape[2] is not None:
+                patch_num = input_shape[1] // self.patch_size
+                output_shape[1] = patch_num**2
+        else:
+            if input_shape[2] is not None and input_shape[3] is not None:
+                patch_num = input_shape[2] // self.patch_size
+                output_shape[1] = patch_num**2
+        return output_shape
+
+
+class DINOV3Embedding(layers.Layer):
+    """A layer that converts images into a sequence of token embeddings.
+
+    This layer adds all the necessary tokens to the embeddings, including
+    the class token, register tokens and the mask token if specified.
+
+    Args:
+        hidden_dim: int. The number of units in the hidden layers.
+        patch_size: int. The size of one side of each patch.
+        num_register_tokens: int. The number of register tokens to add to the
+            embeddings. Defaults to `0`.
+        use_mask_token: bool. Whether to use a mask token. Defaults to `True`.
+        initializer_range: float. The standard deviation of the truncated
+            normal initializer. Defaults to `0.02`.
+        data_format: `None` or str. If specified, either `"channels_last"` or
+            `"channels_first"`. The ordering of the dimensions in the
+            inputs. `"channels_last"` corresponds to inputs with shape
+            `(batch_size, height, width, channels)`
+            while `"channels_first"` corresponds to inputs with shape
+            `(batch_size, channels, height, width)`. It defaults to the
+            `image_data_format` value found in your Keras config file at
+            `~/.keras/keras.json`. If you never set it, then it will be
+            `"channels_last"`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
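+
+    Example:
+    ```python
+    # Token layout sketch with made-up sizes. The output sequence is
+    # `[class token, register tokens, patch tokens]`, so its length is
+    # 1 + num_register_tokens + (64 // 16) ** 2 = 21.
+    embedding = DINOV3Embedding(
+        hidden_dim=32, patch_size=16, num_register_tokens=4
+    )
+    embeddings = embedding(ops.ones((2, 64, 64, 3)))  # (2, 21, 32)
+    ```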
+ """ + + def __init__( + self, + hidden_dim, + patch_size, + num_register_tokens=0, + use_mask_token=True, + initializer_range=0.02, + data_format=None, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_dim = int(hidden_dim) + self.patch_size = int(patch_size) + self.num_register_tokens = int(num_register_tokens) + self.use_mask_token = bool(use_mask_token) + self.initializer_range = float(initializer_range) + self.data_format = standardize_data_format(data_format) + + self.patch_embeddings = DINOV3PatchEmbedding( + hidden_dim, + patch_size, + data_format=data_format, + dtype=self.dtype_policy, + name="patch_embeddings", + ) + + def build(self, input_shape): + self.cls_token = self.add_weight( + shape=(1, 1, self.hidden_dim), + initializer=initializers.TruncatedNormal( + stddev=self.initializer_range + ), + trainable=True, + name="cls_token", + ) + if self.use_mask_token: + self.mask_token = self.add_weight( + shape=(1, 1, self.hidden_dim), + initializer="zeros", + trainable=True, + name="mask_token", + ) + if self.num_register_tokens > 0: + self.register_tokens = self.add_weight( + shape=(1, self.num_register_tokens, self.hidden_dim), + initializer=initializers.TruncatedNormal( + stddev=self.initializer_range + ), + trainable=True, + name="register_tokens", + ) + self.patch_embeddings.build(input_shape) + + def call(self, inputs, masks=None, training=None): + batch_size = ops.shape(inputs)[0] + embeddings = self.patch_embeddings(inputs, training=training) + + if masks is not None and self.use_mask_token: + mask_token = ops.cast(self.mask_token, embeddings.dtype) + embeddings = ops.where( + ops.expand_dims(masks, axis=-1), + mask_token, + embeddings, + ) + + cls_tokens = ops.tile(self.cls_token, (batch_size, 1, 1)) + embeddings = ops.concatenate((cls_tokens, embeddings), axis=1) + + if self.num_register_tokens > 0: + register_tokens = ops.tile(self.register_tokens, (batch_size, 1, 1)) + embeddings = ops.concatenate( + ( + embeddings[:, :1, ...], + register_tokens, + embeddings[:, 1:, ...], + ), + axis=1, + ) + + return embeddings + + def get_config(self): + config = super().get_config() + config.update( + { + "hidden_dim": self.hidden_dim, + "patch_size": self.patch_size, + "num_register_tokens": self.num_register_tokens, + "use_mask_token": self.use_mask_token, + "initializer_range": self.initializer_range, + } + ) + return config + + def compute_output_shape(self, input_shape): + output_shape = [input_shape[0], None, self.hidden_dim] + if self.data_format == "channels_last": + if input_shape[1] is not None and input_shape[2] is not None: + patch_num = input_shape[1] // self.patch_size + output_shape[1] = 1 + self.num_register_tokens + patch_num**2 + else: + if input_shape[2] is not None and input_shape[3] is not None: + patch_num = input_shape[2] // self.patch_size + output_shape[1] = 1 + self.num_register_tokens + patch_num**2 + return output_shape + + +def _get_patches_center_coordinates( + num_patches_h, num_patches_w, dtype="float32" +): + coords_h = ops.arange(0.5, num_patches_h, dtype=dtype) + coords_w = ops.arange(0.5, num_patches_w, dtype=dtype) + + coords_h = coords_h / num_patches_h + coords_w = coords_w / num_patches_w + + coords_h = ops.expand_dims(coords_h, axis=1) + coords_w = ops.expand_dims(coords_w, axis=0) + + coords_h = ops.repeat(coords_h, num_patches_w, axis=1) + coords_w = ops.repeat(coords_w, num_patches_h, axis=0) + + coords = ops.stack([coords_h, coords_w], axis=-1) + coords = ops.reshape(coords, (-1, 2)) + coords = 2.0 * coords - 1.0 + return coords + 
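+# For example, a 2x2 patch grid gives per-axis centers
+# arange(0.5, 2) / 2 = [0.25, 0.75], which `2.0 * coords - 1.0` maps to
+# [-0.5, 0.5], normalizing patch centers to the [-1, 1] range.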
+ +class DINOV3RopePositionEmbedding(layers.Layer): + def __init__(self, hidden_dim, num_heads, rope_theta, patch_size, **kwargs): + super().__init__(**kwargs) + self.hidden_dim = hidden_dim + self.num_heads = num_heads + self.rope_theta = rope_theta + self.patch_size = patch_size + self.head_dim = hidden_dim // num_heads + inv_freq = 1.0 / ( + rope_theta ** (ops.arange(0, 1, 4 / self.head_dim, dtype="float32")) + ) + self.inv_freq = inv_freq + + def call(self, pixel_values): + shape = ops.shape(pixel_values) + height, width = shape[1], shape[2] + num_patches_h = height // self.patch_size + num_patches_w = width // self.patch_size + + patch_coords = _get_patches_center_coordinates( + num_patches_h, num_patches_w, dtype="float32" + ) + + angles = ( + 2 + * math.pi + * ops.expand_dims(patch_coords, axis=-1) + * ops.expand_dims(ops.expand_dims(self.inv_freq, axis=0), axis=0) + ) + angles = ops.reshape(angles, (ops.shape(angles)[0], -1)) + angles = ops.tile(angles, (1, 2)) + + cos = ops.cos(angles) + sin = ops.sin(angles) + + return ops.cast(cos, pixel_values.dtype), ops.cast( + sin, pixel_values.dtype + ) + + def get_config(self): + config = super().get_config() + config.update( + { + "hidden_dim": self.hidden_dim, + "num_heads": self.num_heads, + "rope_theta": self.rope_theta, + "patch_size": self.patch_size, + } + ) + return config + + +def _rotate_half(x): + x1 = x[..., : ops.shape(x)[-1] // 2] + x2 = x[..., ops.shape(x)[-1] // 2 :] + return ops.concatenate([-x2, x1], axis=-1) + + +def _apply_rotary_pos_emb(q, k, cos, sin, num_prefix_tokens): + q_prefix_tokens = q[:, :, :num_prefix_tokens, :] + q_patches = q[:, :, num_prefix_tokens:, :] + k_prefix_tokens = k[:, :, :num_prefix_tokens, :] + k_patches = k[:, :, num_prefix_tokens:, :] + + cos = ops.expand_dims(ops.expand_dims(cos, axis=0), axis=0) + sin = ops.expand_dims(ops.expand_dims(sin, axis=0), axis=0) + + q_patches = (q_patches * cos) + (_rotate_half(q_patches) * sin) + k_patches = (k_patches * cos) + (_rotate_half(k_patches) * sin) + + q = ops.concatenate([q_prefix_tokens, q_patches], axis=-2) + k = ops.concatenate([k_prefix_tokens, k_patches], axis=-2) + + return q, k + + +class DINOV3Attention(layers.Layer): + def __init__( + self, + hidden_dim, + num_heads, + dropout_rate=0.0, + query_bias=True, + key_bias=True, + value_bias=True, + proj_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_dim = hidden_dim + self.num_heads = num_heads + self.dropout_rate = dropout_rate + self.head_dim = hidden_dim // num_heads + self.scale = self.head_dim**-0.5 + + self.q_proj = layers.Dense( + hidden_dim, use_bias=query_bias, name="q_proj" + ) + self.k_proj = layers.Dense(hidden_dim, use_bias=key_bias, name="k_proj") + self.v_proj = layers.Dense( + hidden_dim, use_bias=value_bias, name="v_proj" + ) + self.o_proj = layers.Dense( + hidden_dim, use_bias=proj_bias, name="o_proj" + ) + self.dropout = layers.Dropout(dropout_rate) + + def call( + self, + hidden_states, + attention_mask=None, + position_embeddings=None, + num_prefix_tokens=0, + ): + batch_size, seq_len, _ = ops.shape(hidden_states) + + q = self.q_proj(hidden_states) + k = self.k_proj(hidden_states) + v = self.v_proj(hidden_states) + + q = ops.reshape(q, (batch_size, seq_len, self.num_heads, self.head_dim)) + k = ops.reshape(k, (batch_size, seq_len, self.num_heads, self.head_dim)) + v = ops.reshape(v, (batch_size, seq_len, self.num_heads, self.head_dim)) + + q = ops.transpose(q, (0, 2, 1, 3)) + k = ops.transpose(k, (0, 2, 1, 3)) + v = ops.transpose(v, (0, 2, 1, 3)) + + if 
position_embeddings is not None: + cos, sin = position_embeddings + q, k = _apply_rotary_pos_emb(q, k, cos, sin, num_prefix_tokens) + + attn_weights = ( + ops.matmul(q, ops.transpose(k, (0, 1, 3, 2))) * self.scale + ) + + if attention_mask is not None: + attn_weights += attention_mask + + attn_weights = ops.softmax(attn_weights, axis=-1) + attn_weights = self.dropout(attn_weights) + + attn_output = ops.matmul(attn_weights, v) + attn_output = ops.transpose(attn_output, (0, 2, 1, 3)) + attn_output = ops.reshape(attn_output, (batch_size, seq_len, -1)) + attn_output = self.o_proj(attn_output) + + return attn_output + + def get_config(self): + config = super().get_config() + config.update( + { + "hidden_dim": self.hidden_dim, + "num_heads": self.num_heads, + "dropout_rate": self.dropout_rate, + "query_bias": self.q_proj.use_bias, + "key_bias": self.k_proj.use_bias, + "value_bias": self.v_proj.use_bias, + "proj_bias": self.o_proj.use_bias, + } + ) + return config + + +class DINOV3LayerScale(layers.Layer): + def __init__(self, hidden_dim, init_values=1.0, **kwargs): + super().__init__(**kwargs) + self.hidden_dim = int(hidden_dim) + self.init_values = float(init_values) + + def build(self, input_shape): + self.lambda1 = self.add_weight( + shape=(self.hidden_dim,), + initializer=initializers.Constant(self.init_values), + trainable=True, + name="lambda1", + ) + + def call(self, inputs, training=None): + return ops.multiply(inputs, self.lambda1) + + def get_config(self): + config = super().get_config() + config.update( + {"hidden_dim": self.hidden_dim, "init_values": self.init_values} + ) + return config + + +class DINOV3DropPath(layers.Layer): + def __init__(self, rate=0.0, **kwargs): + super().__init__(**kwargs) + self.rate = float(rate) + + def build(self, input_shape): + self.noise_shape = (input_shape[0],) + (1,) * (len(input_shape) - 1) + + def call(self, inputs, training=None): + if not training or self.rate == 0.0: + return inputs + + keep_prob = 1.0 - self.rate + random_tensor = keep_prob + random.uniform( + self.noise_shape, dtype=inputs.dtype + ) + random_tensor = ops.floor(random_tensor) + return ops.multiply(ops.divide(inputs, keep_prob), random_tensor) + + def get_config(self): + config = super().get_config() + config.update({"rate": self.rate}) + return config + + +class DINOV3MLP(layers.Layer): + def __init__( + self, + hidden_dim, + intermediate_dim, + activation="gelu", + use_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_dim = hidden_dim + self.intermediate_dim = intermediate_dim + self.activation = activation + self.use_bias = use_bias + self.up_proj = layers.Dense( + intermediate_dim, use_bias=use_bias, name="up_proj" + ) + self.down_proj = layers.Dense( + hidden_dim, use_bias=use_bias, name="down_proj" + ) + self.act_fn = layers.Activation(activation) + + def call(self, x): + return self.down_proj(self.act_fn(self.up_proj(x))) + + def get_config(self): + config = super().get_config() + config.update( + { + "hidden_dim": self.hidden_dim, + "intermediate_dim": self.intermediate_dim, + "activation": self.activation, + "use_bias": self.use_bias, + } + ) + return config + + +class DINOV3GatedMLP(layers.Layer): + def __init__( + self, + hidden_dim, + intermediate_dim, + activation="gelu", + use_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_dim = hidden_dim + self.intermediate_dim = intermediate_dim + self.activation = activation + self.use_bias = use_bias + self.gate_proj = layers.Dense( + intermediate_dim, use_bias=use_bias, 
name="gate_proj" + ) + self.up_proj = layers.Dense( + intermediate_dim, use_bias=use_bias, name="up_proj" + ) + self.down_proj = layers.Dense( + hidden_dim, use_bias=use_bias, name="down_proj" + ) + self.act_fn = layers.Activation(activation) + + def call(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + def get_config(self): + config = super().get_config() + config.update( + { + "hidden_dim": self.hidden_dim, + "intermediate_dim": self.intermediate_dim, + "activation": self.activation, + "use_bias": self.use_bias, + } + ) + return config + + +class DINOV3Layer(layers.Layer): + def __init__( + self, + hidden_dim, + num_heads, + intermediate_dim, + layer_scale_init_value=1.0, + use_gated_mlp=False, + attention_dropout=0.0, + drop_path_rate=0.0, + layer_norm_eps=1e-6, + query_bias=True, + key_bias=True, + value_bias=True, + proj_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_dim = hidden_dim + self.num_heads = num_heads + self.intermediate_dim = intermediate_dim + self.layer_scale_init_value = layer_scale_init_value + self.use_gated_mlp = use_gated_mlp + self.attention_dropout = attention_dropout + self.drop_path_rate = drop_path_rate + self.layer_norm_eps = layer_norm_eps + + self.norm1 = layers.LayerNormalization( + epsilon=layer_norm_eps, name="norm1" + ) + self.attention = DINOV3Attention( + hidden_dim=hidden_dim, + num_heads=num_heads, + dropout_rate=attention_dropout, + query_bias=query_bias, + key_bias=key_bias, + value_bias=value_bias, + proj_bias=proj_bias, + name="attention", + ) + self.layer_scale1 = DINOV3LayerScale( + hidden_dim, + init_values=layer_scale_init_value, + name="layer_scale1", + ) + self.drop_path = ( + DINOV3DropPath(drop_path_rate) + if drop_path_rate > 0.0 + else layers.Identity() + ) + self.norm2 = layers.LayerNormalization( + epsilon=layer_norm_eps, name="norm2" + ) + if use_gated_mlp: + self.mlp = DINOV3GatedMLP(hidden_dim, intermediate_dim, name="mlp") + else: + self.mlp = DINOV3MLP(hidden_dim, intermediate_dim, name="mlp") + self.layer_scale2 = DINOV3LayerScale( + hidden_dim, init_values=layer_scale_init_value, name="layer_scale2" + ) + + def call( + self, + hidden_states, + attention_mask=None, + position_embeddings=None, + num_prefix_tokens=0, + ): + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states = self.attention( + hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + num_prefix_tokens=num_prefix_tokens, + ) + hidden_states = self.layer_scale1(hidden_states) + hidden_states = self.drop_path(hidden_states) + residual + + residual = hidden_states + hidden_states = self.norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.layer_scale2(hidden_states) + hidden_states = self.drop_path(hidden_states) + residual + + return hidden_states + + def get_config(self): + config = super().get_config() + config.update( + { + "hidden_dim": self.hidden_dim, + "num_heads": self.num_heads, + "intermediate_dim": self.intermediate_dim, + "layer_scale_init_value": self.layer_scale_init_value, + "use_gated_mlp": self.use_gated_mlp, + "attention_dropout": self.attention_dropout, + "drop_path_rate": self.drop_path_rate, + "layer_norm_eps": self.layer_norm_eps, + } + ) + return config + + +class DINOV3Encoder(layers.Layer): + def __init__( + self, + num_layers, + hidden_dim, + num_heads, + intermediate_dim, + layer_scale_init_value=1.0, + use_gated_mlp=False, + attention_dropout=0.0, + drop_path_rate=0.0, + 
layer_norm_eps=1e-6, + query_bias=True, + key_bias=True, + value_bias=True, + proj_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + self.num_layers = num_layers + self.hidden_dim = hidden_dim + self.num_heads = num_heads + self.intermediate_dim = intermediate_dim + self.layer_scale_init_value = layer_scale_init_value + self.use_gated_mlp = use_gated_mlp + self.attention_dropout = attention_dropout + self.drop_path_rate = drop_path_rate + self.layer_norm_eps = layer_norm_eps + + dpr = [x for x in ops.linspace(0.0, drop_path_rate, num_layers)] + self.layers = [ + DINOV3Layer( + hidden_dim=hidden_dim, + num_heads=num_heads, + intermediate_dim=intermediate_dim, + layer_scale_init_value=layer_scale_init_value, + use_gated_mlp=use_gated_mlp, + attention_dropout=attention_dropout, + drop_path_rate=dpr[i], + layer_norm_eps=layer_norm_eps, + query_bias=query_bias, + key_bias=key_bias, + value_bias=value_bias, + proj_bias=proj_bias, + name=f"layers.{i}", + ) + for i in range(num_layers) + ] + + def call( + self, + hidden_states, + attention_mask=None, + position_embeddings=None, + num_prefix_tokens=0, + ): + pyramid_outputs = {} + for i, layer_module in enumerate(self.layers): + hidden_states = layer_module( + hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + num_prefix_tokens=num_prefix_tokens, + ) + pyramid_outputs[f"stage{i + 1}"] = hidden_states + + return hidden_states, pyramid_outputs + + def get_config(self): + config = super().get_config() + config.update( + { + "num_layers": self.num_layers, + "hidden_dim": self.hidden_dim, + "num_heads": self.num_heads, + "intermediate_dim": self.intermediate_dim, + "layer_scale_init_value": self.layer_scale_init_value, + "use_gated_mlp": self.use_gated_mlp, + "attention_dropout": self.attention_dropout, + "drop_path_rate": self.drop_path_rate, + "layer_norm_eps": self.layer_norm_eps, + } + ) + return config diff --git a/keras_hub/src/models/dinov3/dinov3_presets.py b/keras_hub/src/models/dinov3/dinov3_presets.py new file mode 100644 index 0000000000..077663f11b --- /dev/null +++ b/keras_hub/src/models/dinov3/dinov3_presets.py @@ -0,0 +1,4 @@ +"""DINOV3 model preset configurations.""" + +# Metadata for loading pretrained model weights. 
+backbone_presets = {} diff --git a/keras_hub/src/utils/transformers/convert_dinov3.py b/keras_hub/src/utils/transformers/convert_dinov3.py new file mode 100644 index 0000000000..97d9bd0b09 --- /dev/null +++ b/keras_hub/src/utils/transformers/convert_dinov3.py @@ -0,0 +1,100 @@ +import numpy as np + +from keras_hub.src.models.dinov3.dinov3_backbone import DINOV3Backbone + +backbone_cls = DINOV3Backbone + + +def convert_backbone_config(transformers_config): + image_size = transformers_config["image_size"] + return { + "patch_size": transformers_config["patch_size"], + "num_layers": transformers_config["num_hidden_layers"], + "hidden_dim": transformers_config["hidden_size"], + "num_heads": transformers_config["num_attention_heads"], + "intermediate_dim": transformers_config["intermediate_size"], + "layer_scale_init_value": transformers_config["layerscale_value"], + "num_register_tokens": transformers_config.get( + "num_register_tokens", 0 + ), + "use_mask_token": True, + "use_gated_mlp": transformers_config["use_gated_mlp"], + "attention_dropout": transformers_config["attention_dropout"], + "drop_path_rate": transformers_config["drop_path_rate"], + "image_shape": (image_size, image_size, 3), + "rope_theta": transformers_config["rope_theta"], + "apply_layernorm": False, + "query_bias": transformers_config["query_bias"], + "key_bias": transformers_config["key_bias"], + "value_bias": transformers_config["value_bias"], + "proj_bias": transformers_config["proj_bias"], + } + + +def convert_weights(backbone, loader, transformers_config): + if not isinstance(backbone, DINOV3Backbone): + raise ValueError( + "The provided backbone must be an instance of DINOV3Backbone. " + f"Received: {type(backbone)}" + ) + + def port_ln(keras_variable, weight_key): + loader.port_weight(keras_variable.gamma, f"{weight_key}.weight") + loader.port_weight(keras_variable.beta, f"{weight_key}.bias") + + def port_dense(keras_variable, weight_key): + loader.port_weight( + keras_variable.kernel, + f"{weight_key}.weight", + hook_fn=lambda x, _: x.T, + ) + if keras_variable.bias is not None: + loader.port_weight(keras_variable.bias, f"{weight_key}.bias") + + # Embedding. + loader.port_weight( + keras_variable=backbone.embeddings.cls_token, + hf_weight_key="embeddings.cls_token", + ) + if backbone.num_register_tokens > 0: + loader.port_weight( + keras_variable=backbone.embeddings.register_tokens, + hf_weight_key="embeddings.register_tokens", + ) + loader.port_weight( + keras_variable=backbone.embeddings.patch_embeddings.projection.kernel, + hf_weight_key="embeddings.patch_embeddings.weight", + hook_fn=lambda x, _: np.transpose(x, (2, 3, 1, 0)), + ) + loader.port_weight( + keras_variable=backbone.embeddings.patch_embeddings.projection.bias, + hf_weight_key="embeddings.patch_embeddings.bias", + ) + + # Encoder. 
+ for i, layer in enumerate(backbone.encoder.layers): + prefix = f"layer.{i}" + port_ln(layer.norm1, f"{prefix}.norm1") + port_dense(layer.attention.q_proj, f"{prefix}.attention.q_proj") + port_dense(layer.attention.k_proj, f"{prefix}.attention.k_proj") + port_dense(layer.attention.v_proj, f"{prefix}.attention.v_proj") + port_dense(layer.attention.o_proj, f"{prefix}.attention.o_proj") + + loader.port_weight( + keras_variable=layer.layer_scale1.lambda1, + hf_weight_key=f"{prefix}.layer_scale1.lambda1", + ) + port_ln(layer.norm2, f"{prefix}.norm2") + if backbone.use_gated_mlp: + port_dense(layer.mlp.gate_proj, f"{prefix}.mlp.gate_proj") + port_dense(layer.mlp.up_proj, f"{prefix}.mlp.up_proj") + port_dense(layer.mlp.down_proj, f"{prefix}.mlp.down_proj") + else: + port_dense(layer.mlp.up_proj, f"{prefix}.mlp.up_proj") + port_dense(layer.mlp.down_proj, f"{prefix}.mlp.down_proj") + loader.port_weight( + keras_variable=layer.layer_scale2.lambda1, + hf_weight_key=f"{prefix}.layer_scale2.lambda1", + ) + + port_ln(backbone.layernorm, "norm") diff --git a/keras_hub/src/utils/transformers/preset_loader.py b/keras_hub/src/utils/transformers/preset_loader.py index 73f6a27717..f98007b438 100644 --- a/keras_hub/src/utils/transformers/preset_loader.py +++ b/keras_hub/src/utils/transformers/preset_loader.py @@ -8,6 +8,7 @@ from keras_hub.src.utils.transformers import convert_bert from keras_hub.src.utils.transformers import convert_deit from keras_hub.src.utils.transformers import convert_dinov2 +from keras_hub.src.utils.transformers import convert_dinov3 from keras_hub.src.utils.transformers import convert_distilbert from keras_hub.src.utils.transformers import convert_esm from keras_hub.src.utils.transformers import convert_gemma @@ -42,6 +43,8 @@ def __init__(self, preset, config): self.converter = convert_distilbert elif model_type in ("dinov2", "dinov2_with_registers"): self.converter = convert_dinov2 + elif model_type == "dinov3_vit": + self.converter = convert_dinov3 elif model_type == "esm": self.converter = convert_esm elif model_type in ("gemma", "gemma2"): diff --git a/tools/checkpoint_conversion/convert_dinov3_checkpoints.py b/tools/checkpoint_conversion/convert_dinov3_checkpoints.py new file mode 100644 index 0000000000..03893b6885 --- /dev/null +++ b/tools/checkpoint_conversion/convert_dinov3_checkpoints.py @@ -0,0 +1,136 @@ +import keras +import numpy as np +import torch +from absl import app +from absl import flags +from PIL import Image +from transformers import AutoImageProcessor +from transformers import AutoModel + +import keras_hub + +PRESET_MAP = { + "dinov3_vit_small_lvd1689m": "facebook/dinov3-vits16-pretrain-lvd1689m", +} + +FLAGS = flags.FLAGS +flags.DEFINE_string( + "preset", + None, + f"Must be one of {','.join(PRESET_MAP.keys())}", + required=True, +) +flags.DEFINE_string( + "upload_uri", + None, + 'Could be "kaggle://keras/{variant}/keras/{preset}"', + required=False, +) + + +def convert_image_converter(image_size, hf_image_processor): + config = hf_image_processor.to_dict() + image_size = (image_size, image_size) + std = config["image_std"] + mean = config["image_mean"] + return keras_hub.layers.DINOV3ImageConverter( + image_size=image_size, + scale=[1.0 / 255.0 / s for s in std], + offset=[-m / s for m, s in zip(mean, std)], + interpolation="bilinear", + antialias=True, + ) + + +def validate_output( + keras_hub_model, + keras_hub_image_converter, + hf_model, + hf_image_processor, +): + file = keras.utils.get_file( + 
origin=("http://images.cocodataset.org/val2017/000000039769.jpg") + ) + image = Image.open(file) + + # Preprocess with hf. + hf_inputs = hf_image_processor(images=image, return_tensors="pt") + hf_preprocessed = hf_inputs["pixel_values"].detach().cpu().numpy() + print("🔶 HF preprocessed shape:", hf_preprocessed.shape) + + # Preprocess with keras. + images = np.expand_dims(np.array(image).astype("float32"), axis=0) + images = keras_hub_image_converter(images) + keras_preprocessed = keras.ops.convert_to_numpy(images) + + # Call with hf. Use the keras preprocessed image so we can keep modeling + # and preprocessing comparisons independent. + hf_inputs["pixel_values"] = torch.from_numpy( + keras.ops.convert_to_numpy( + keras.ops.transpose(keras_preprocessed, (0, 3, 1, 2)) + ) + ) + hf_outputs = hf_model(**hf_inputs) + hf_outputs = hf_outputs[0].detach().cpu().numpy() + + # Call with keras. + keras_outputs = keras_hub_model.predict({"images": images}, verbose=0) + keras_outputs = keras.ops.convert_to_numpy(keras_outputs) + + print("🔶 Keras output:", keras_outputs[0, 0, :10]) + print("🔶 HF output:", hf_outputs[0, 0, :10]) + modeling_diff = np.mean(np.abs(keras_outputs - hf_outputs)) + print("🔶 Modeling difference:", modeling_diff) + preprocessing_diff = np.mean( + np.abs(keras_preprocessed - np.transpose(hf_preprocessed, (0, 2, 3, 1))) + ) + print("🔶 Preprocessing difference:", preprocessing_diff) + + +def main(_): + # === Get the preset name === + if FLAGS.preset not in PRESET_MAP.keys(): + raise ValueError( + f"Invalid preset {FLAGS.preset}. Must be one " + f"of {','.join(PRESET_MAP.keys())}" + ) + preset = FLAGS.preset + hf_preset = PRESET_MAP[preset] + + # Load the HF model. + hf_model = AutoModel.from_pretrained(hf_preset) + hf_model.eval() + image_size = int(hf_model.config.image_size) + hf_image_processor = AutoImageProcessor.from_pretrained(hf_preset) + + # Load the KerasHub model. + keras_hub_backbone = keras_hub.models.DINOV3Backbone.from_preset( + f"hf://{hf_preset}" + ) + keras_hub_backbone.summary() + keras_hub_image_converter = convert_image_converter( + image_size, hf_image_processor + ) + print("✅ KerasHub model loaded.") + print("✅ Weights converted.") + + validate_output( + keras_hub_backbone, + keras_hub_image_converter, + hf_model, + hf_image_processor, + ) + print("✅ Output validated.") + + keras_hub_backbone.save_to_preset(f"./{preset}") + keras_hub_image_converter.save_to_preset(f"./{preset}") + print(f"🏁 Preset saved to ./{preset}.") + + upload_uri = FLAGS.upload_uri + if upload_uri: + keras_hub.upload_preset(uri=upload_uri, preset=f"./{preset}") + print(f"🏁 Preset uploaded to {upload_uri}") + + +if __name__ == "__main__": + app.run(main) From 809abbddc1a11a04cf6146c3a560e3ff8d31217b Mon Sep 17 00:00:00 2001 From: Hongyu Chiu <20734616+james77777778@users.noreply.github.com> Date: Sun, 19 Oct 2025 21:08:18 +0800 Subject: [PATCH 2/5] Add tests and docstrings. 
--- .../src/models/dinov3/dinov3_backbone_test.py | 101 +++++++++ keras_hub/src/models/dinov3/dinov3_layers.py | 209 ++++++++++++++++-- 2 files changed, 289 insertions(+), 21 deletions(-) create mode 100644 keras_hub/src/models/dinov3/dinov3_backbone_test.py diff --git a/keras_hub/src/models/dinov3/dinov3_backbone_test.py b/keras_hub/src/models/dinov3/dinov3_backbone_test.py new file mode 100644 index 0000000000..1320752c61 --- /dev/null +++ b/keras_hub/src/models/dinov3/dinov3_backbone_test.py @@ -0,0 +1,101 @@ +import os + +import keras +import pytest +from keras import ops + +from keras_hub.src.models.dinov3.dinov3_backbone import DINOV3Backbone +from keras_hub.src.tests.test_case import TestCase + + +class DINOV3BackboneTest(TestCase): + def setUp(self): + self.init_kwargs = { + "patch_size": 14, + "num_layers": 2, + "hidden_dim": 16, + "num_heads": 2, + "intermediate_dim": 16 * 4, + "layer_scale_init_value": 1.0, + "num_register_tokens": 4, + "use_gated_mlp": False, + "image_shape": (70, 70, 3), + "name": "dinov3_backbone", + } + self.input_data = { + "images": ops.ones((2, 70, 70, 3)), + } + + def test_backbone_basics(self): + patch_size = self.init_kwargs["patch_size"] + image_size = self.init_kwargs["image_shape"][0] + hidden_dim = self.init_kwargs["hidden_dim"] + num_register_tokens = self.init_kwargs["num_register_tokens"] + sequence_length = ( + (image_size // patch_size) ** 2 + 1 + num_register_tokens + ) + self.run_vision_backbone_test( + cls=DINOV3Backbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape=(2, sequence_length, hidden_dim), + expected_pyramid_output_keys=["stem", "stage1", "stage2"], + expected_pyramid_image_sizes=[(sequence_length, hidden_dim)] * 3, + run_data_format_check=False, + ) + + @pytest.mark.large + def test_saved_model(self): + self.run_model_saving_test( + cls=DINOV3Backbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) + + @pytest.mark.large + def test_position_embedding_interpolation(self): + model = DINOV3Backbone(**self.init_kwargs) + model_output = model(self.input_data) + + # Test not using interpolation in `save` and `load_model`. + path = os.path.join(self.get_temp_dir(), "model.keras") + model.save(path) + restored_model = keras.models.load_model(path) + restored_output = restored_model(self.input_data) + self.assertAllClose(model_output, restored_output, atol=1e-5, rtol=1e-5) + + # Test using interpolation in `save_to_preset` and `from_preset` if + # image_shape is different. + path = os.path.join(self.get_temp_dir(), "model") + model.save_to_preset(path) + restored_model = DINOV3Backbone.from_preset( + path, + image_shape=(128, 128, 3), # From 70 to 128. 
+ ) + input_data = { + "images": ops.ones((2, 128, 128, 3)), + } + restored_output = restored_model(input_data) + self.assertNotEqual(model_output.shape, restored_output.shape) + + @pytest.mark.kaggle_key_required + @pytest.mark.extra_large + def test_smallest_preset(self): + self.skipTest("Presets are not uploaded yet.") + self.run_preset_test( + cls=DINOV3Backbone, + preset="dinov3_vit_small_lvd1689m", + input_data=self.input_data, + expected_output_shape=(2, 1374, 768), + ) + + @pytest.mark.kaggle_key_required + @pytest.mark.extra_large + def test_all_presets(self): + self.skipTest("Presets are not uploaded yet.") + for preset in DINOV3Backbone.presets: + self.run_preset_test( + cls=DINOV3Backbone, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_hub/src/models/dinov3/dinov3_layers.py b/keras_hub/src/models/dinov3/dinov3_layers.py index 1eb80141ee..fa30616fad 100644 --- a/keras_hub/src/models/dinov3/dinov3_layers.py +++ b/keras_hub/src/models/dinov3/dinov3_layers.py @@ -220,6 +220,7 @@ def compute_output_shape(self, input_shape): def _get_patches_center_coordinates( num_patches_h, num_patches_w, dtype="float32" ): + """A helper function to get the center coordinates of the patches.""" coords_h = ops.arange(0.5, num_patches_h, dtype=dtype) coords_w = ops.arange(0.5, num_patches_w, dtype=dtype) @@ -239,6 +240,17 @@ def _get_patches_center_coordinates( class DINOV3RopePositionEmbedding(layers.Layer): + """A layer that implements Rotary Position Embedding. + + Args: + hidden_dim: int. The number of units in the hidden layers. + num_heads: int. Number of attention heads. + rope_theta: float. The base period of the rotary position embeddings. + patch_size: int. The size of one side of each patch. + **kwargs: other keyword arguments passed to `keras.layers.Layer`, + including `name`, `dtype` etc. + """ + def __init__(self, hidden_dim, num_heads, rope_theta, patch_size, **kwargs): super().__init__(**kwargs) self.hidden_dim = hidden_dim @@ -247,7 +259,8 @@ def __init__(self, hidden_dim, num_heads, rope_theta, patch_size, **kwargs): self.patch_size = patch_size self.head_dim = hidden_dim // num_heads inv_freq = 1.0 / ( - rope_theta ** (ops.arange(0, 1, 4 / self.head_dim, dtype="float32")) + rope_theta + ** (ops.arange(0, 1, 4 / self.head_dim, dtype=self.dtype)) ) self.inv_freq = inv_freq @@ -258,7 +271,7 @@ def call(self, pixel_values): num_patches_w = width // self.patch_size patch_coords = _get_patches_center_coordinates( - num_patches_h, num_patches_w, dtype="float32" + num_patches_h, num_patches_w, dtype=self.dtype ) angles = ( @@ -291,12 +304,14 @@ def get_config(self): def _rotate_half(x): + """A helper function to rotate half of the features.""" x1 = x[..., : ops.shape(x)[-1] // 2] x2 = x[..., ops.shape(x)[-1] // 2 :] return ops.concatenate([-x2, x1], axis=-1) def _apply_rotary_pos_emb(q, k, cos, sin, num_prefix_tokens): + """A helper function to apply rotary position embedding to query and key.""" q_prefix_tokens = q[:, :, :num_prefix_tokens, :] q_patches = q[:, :, num_prefix_tokens:, :] k_prefix_tokens = k[:, :, :num_prefix_tokens, :] @@ -315,6 +330,20 @@ def _apply_rotary_pos_emb(q, k, cos, sin, num_prefix_tokens): class DINOV3Attention(layers.Layer): + """A multi-head attention layer with dropout. + + Args: + hidden_dim: int. The number of units in the hidden layers. + num_heads: int. Number of attention heads. + dropout_rate: float. The dropout rate to use. Defaults to `0.0`. + query_bias: bool. Whether to use a bias for the query projection. + key_bias: bool. 
Whether to use a bias for the key projection.
+        value_bias: bool. Whether to use a bias for the value projection.
+        proj_bias: bool. Whether to use a bias for the output projection.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
+    """
+
     def __init__(
         self,
         hidden_dim,
@@ -334,16 +363,32 @@ def __init__(
         self.scale = self.head_dim**-0.5
 
         self.q_proj = layers.Dense(
-            hidden_dim, use_bias=query_bias, name="q_proj"
+            hidden_dim,
+            use_bias=query_bias,
+            dtype=self.dtype_policy,
+            name="q_proj",
+        )
+        self.k_proj = layers.Dense(
+            hidden_dim,
+            use_bias=key_bias,
+            dtype=self.dtype_policy,
+            name="k_proj",
         )
-        self.k_proj = layers.Dense(hidden_dim, use_bias=key_bias, name="k_proj")
         self.v_proj = layers.Dense(
-            hidden_dim, use_bias=value_bias, name="v_proj"
+            hidden_dim,
+            use_bias=value_bias,
+            dtype=self.dtype_policy,
+            name="v_proj",
         )
         self.o_proj = layers.Dense(
-            hidden_dim, use_bias=proj_bias, name="o_proj"
+            hidden_dim,
+            use_bias=proj_bias,
+            dtype=self.dtype_policy,
+            name="o_proj",
+        )
+        self.dropout = layers.Dropout(
+            dropout_rate, dtype=self.dtype_policy, name="dropout"
         )
-        self.dropout = layers.Dropout(dropout_rate)
 
     def call(
         self,
@@ -404,6 +449,15 @@ def get_config(self):
 
 
 class DINOV3LayerScale(layers.Layer):
+    """A layer scale.
+
+    Args:
+        hidden_dim: int. The number of units in the hidden layers.
+        init_values: float. The initial value for the scale. Defaults to `1.0`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
+    """
+
     def __init__(self, hidden_dim, init_values=1.0, **kwargs):
         super().__init__(**kwargs)
         self.hidden_dim = int(hidden_dim)
@@ -429,6 +483,14 @@ def get_config(self):
 
 
 class DINOV3DropPath(layers.Layer):
+    """A drop path layer.
+
+    Args:
+        rate: float. The drop path rate to use. Defaults to `0.0`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
+    """
+
     def __init__(self, rate=0.0, **kwargs):
         super().__init__(**kwargs)
         self.rate = float(rate)
@@ -454,6 +516,18 @@ def get_config(self):
 
 
 class DINOV3MLP(layers.Layer):
+    """A DINOV3 MLP block.
+
+    Args:
+        hidden_dim: int. The number of units in the output layer.
+        intermediate_dim: int. The output dimension of the first Dense layer.
+        activation: str or callable. Activation to use in the intermediate
+            layer. Defaults to `"gelu"`.
+        use_bias: bool. Whether to use a bias for the dense layers.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
+    """
+
     def __init__(
         self,
         hidden_dim,
@@ -468,12 +542,21 @@ def __init__(
         self.activation = activation
         self.use_bias = use_bias
         self.up_proj = layers.Dense(
-            intermediate_dim, use_bias=use_bias, name="up_proj"
+            intermediate_dim,
+            use_bias=use_bias,
+            dtype=self.dtype_policy,
+            name="up_proj",
         )
         self.down_proj = layers.Dense(
-            hidden_dim, use_bias=use_bias, name="down_proj"
+            hidden_dim,
+            use_bias=use_bias,
+            dtype=self.dtype_policy,
+            name="down_proj",
+        )
+        self.act_fn = layers.Activation(
+            activation,
+            dtype=self.dtype_policy,
         )
-        self.act_fn = layers.Activation(activation)
 
     def call(self, x):
         return self.down_proj(self.act_fn(self.up_proj(x)))
@@ -492,6 +575,18 @@ def get_config(self):
 
 
 class DINOV3GatedMLP(layers.Layer):
+    """A DINOV3 Gated MLP block.
+
+    Args:
+        hidden_dim: int. The number of units in the output layer.
+        intermediate_dim: int. The output dimension of the first Dense layer.
+        activation: str or callable. 
Activation to use in the intermediate + layer. Defaults to `"gelu"`. + use_bias: bool. Whether to use a bias for the dense layers. + **kwargs: other keyword arguments passed to `keras.layers.Layer`, + including `name`, `dtype` etc. + """ + def __init__( self, hidden_dim, @@ -506,15 +601,24 @@ def __init__( self.activation = activation self.use_bias = use_bias self.gate_proj = layers.Dense( - intermediate_dim, use_bias=use_bias, name="gate_proj" + intermediate_dim, + use_bias=use_bias, + dtype=self.dtype_policy, + name="gate_proj", ) self.up_proj = layers.Dense( - intermediate_dim, use_bias=use_bias, name="up_proj" + intermediate_dim, + use_bias=use_bias, + dtype=self.dtype_policy, + name="up_proj", ) self.down_proj = layers.Dense( - hidden_dim, use_bias=use_bias, name="down_proj" + hidden_dim, + use_bias=use_bias, + dtype=self.dtype_policy, + name="down_proj", ) - self.act_fn = layers.Activation(activation) + self.act_fn = layers.Activation(activation, dtype=self.dtype_policy) def call(self, x): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) @@ -533,6 +637,29 @@ def get_config(self): class DINOV3Layer(layers.Layer): + """A DINOV3 encoder layer. + + Args: + hidden_dim: int. The number of units in the hidden layers. + num_heads: int. Number of attention heads. + intermediate_dim: int. The output dimension of the first Dense layer in + a two-layer feedforward network for each transformer. + layer_scale_init_value: float. The initial value for the scale. + Defaults to `1.0`. + use_gated_mlp: bool. Whether to use Gated MLP layers. Defaults to + `False`. + attention_dropout: float. The dropout rate for the attention + probabilities. Defaults to `0.0`. + drop_path_rate: float. The drop path rate to use. Defaults to `0.0`. + layer_norm_eps: float. The epsilon for layer normalization. + query_bias: bool. Whether to use a bias for the query projection. + key_bias: bool. Whether to use a bias for the key projection. + value_bias: bool. Whether to use a bias for the value projection. + proj_bias: bool. Whether to use a bias for the output projection. + **kwargs: other keyword arguments passed to `keras.layers.Layer`, + including `name`, `dtype` etc. 
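+
+    Example:
+    ```python
+    # Minimal forward-pass sketch with made-up sizes. Without
+    # `position_embeddings`, the rotary embedding step is skipped.
+    layer = DINOV3Layer(hidden_dim=16, num_heads=2, intermediate_dim=64)
+    outputs = layer(ops.ones((2, 21, 16)))  # (2, 21, 16)
+    ```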
+ """ + def __init__( self, hidden_dim, @@ -560,7 +687,7 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.norm1 = layers.LayerNormalization( - epsilon=layer_norm_eps, name="norm1" + epsilon=layer_norm_eps, dtype=self.dtype_policy, name="norm1" ) self.attention = DINOV3Attention( hidden_dim=hidden_dim, @@ -570,27 +697,42 @@ def __init__( key_bias=key_bias, value_bias=value_bias, proj_bias=proj_bias, + dtype=self.dtype_policy, name="attention", ) self.layer_scale1 = DINOV3LayerScale( hidden_dim, init_values=layer_scale_init_value, + dtype=self.dtype_policy, name="layer_scale1", ) self.drop_path = ( - DINOV3DropPath(drop_path_rate) + DINOV3DropPath(drop_path_rate, dtype=self.dtype_policy) if drop_path_rate > 0.0 - else layers.Identity() + else layers.Identity(dtype=self.dtype_policy) ) self.norm2 = layers.LayerNormalization( - epsilon=layer_norm_eps, name="norm2" + epsilon=layer_norm_eps, dtype=self.dtype_policy, name="norm2" ) if use_gated_mlp: - self.mlp = DINOV3GatedMLP(hidden_dim, intermediate_dim, name="mlp") + self.mlp = DINOV3GatedMLP( + hidden_dim, + intermediate_dim, + dtype=self.dtype_policy, + name="mlp", + ) else: - self.mlp = DINOV3MLP(hidden_dim, intermediate_dim, name="mlp") + self.mlp = DINOV3MLP( + hidden_dim, + intermediate_dim, + dtype=self.dtype_policy, + name="mlp", + ) self.layer_scale2 = DINOV3LayerScale( - hidden_dim, init_values=layer_scale_init_value, name="layer_scale2" + hidden_dim, + init_values=layer_scale_init_value, + dtype=self.dtype_policy, + name="layer_scale2", ) def call( @@ -637,6 +779,30 @@ def get_config(self): class DINOV3Encoder(layers.Layer): + """A DINOV3 encoder. + + Args: + num_layers: int. The number of transformer layers. + hidden_dim: int. The number of units in the hidden layers. + num_heads: int. Number of attention heads. + intermediate_dim: int. The output dimension of the first Dense layer in + a two-layer feedforward network for each transformer. + layer_scale_init_value: float. The initial value for the scale. + Defaults to `1.0`. + use_gated_mlp: bool. Whether to use Gated MLP layers. Defaults to + `False`. + attention_dropout: float. The dropout rate for the attention + probabilities. Defaults to `0.0`. + drop_path_rate: float. The drop path rate to use. Defaults to `0.0`. + layer_norm_eps: float. The epsilon for layer normalization. + query_bias: bool. Whether to use a bias for the query projection. + key_bias: bool. Whether to use a bias for the key projection. + value_bias: bool. Whether to use a bias for the value projection. + proj_bias: bool. Whether to use a bias for the output projection. + **kwargs: other keyword arguments passed to `keras.layers.Layer`, + including `name`, `dtype` etc. + """ + def __init__( self, num_layers, @@ -680,6 +846,7 @@ def __init__( key_bias=key_bias, value_bias=value_bias, proj_bias=proj_bias, + dtype=self.dtype_policy, name=f"layers.{i}", ) for i in range(num_layers) From fadc407550054074dc07a263e38cf7b0bafda1b6 Mon Sep 17 00:00:00 2001 From: Hongyu Chiu <20734616+james77777778@users.noreply.github.com> Date: Sun, 19 Oct 2025 22:11:36 +0800 Subject: [PATCH 3/5] Update DINOV3 impls. 
---
 keras_hub/src/models/dinov2/dinov2_layers.py  |   4 +-
 .../src/models/dinov3/dinov3_backbone.py      |  67 ++-
 .../src/models/dinov3/dinov3_backbone_test.py |   8 +-
 keras_hub/src/models/dinov3/dinov3_layers.py  | 505 +++++++++++-------
 .../src/utils/transformers/convert_dinov3.py  |  28 +-
 .../convert_dinov3_checkpoints.py             |  43 +-
 6 files changed, 423 insertions(+), 232 deletions(-)

diff --git a/keras_hub/src/models/dinov2/dinov2_layers.py b/keras_hub/src/models/dinov2/dinov2_layers.py
index 1124b57a50..ce040ae266 100644
--- a/keras_hub/src/models/dinov2/dinov2_layers.py
+++ b/keras_hub/src/models/dinov2/dinov2_layers.py
@@ -502,7 +502,9 @@ def call(self, inputs, training=None):
 
     def get_config(self):
         config = super().get_config()
-        config.update({"hidden_dim": self.hidden_dim})
+        config.update(
+            {"hidden_dim": self.hidden_dim, "init_values": self.init_values}
+        )
         return config
 
     def compute_output_shape(self, input_shape):
diff --git a/keras_hub/src/models/dinov3/dinov3_backbone.py b/keras_hub/src/models/dinov3/dinov3_backbone.py
index 385247d719..448051a340 100644
--- a/keras_hub/src/models/dinov3/dinov3_backbone.py
+++ b/keras_hub/src/models/dinov3/dinov3_backbone.py
@@ -28,20 +28,29 @@ class DINOV3Backbone(FeaturePyramidBackbone):
         embedding layer. Defaults to `4`.
         use_mask_token: bool. Whether to use a mask token in the embedding
             layer. Defaults to `True`.
+        hidden_activation: str or callable. Activation to use in the MLP.
+            Defaults to `"gelu"`.
         use_gated_mlp: bool. Whether to use Gated MLP layers. Defaults to
             `False`.
+        use_query_bias: bool. Whether to use a bias for the query projection.
+            Defaults to `True`.
+        use_key_bias: bool. Whether to use a bias for the key projection.
+            Defaults to `True`.
+        use_value_bias: bool. Whether to use a bias for the value projection.
+            Defaults to `True`.
+        use_proj_bias: bool. Whether to use a bias for the output projection.
+            Defaults to `True`.
+        use_mlp_bias: bool. Whether to use a bias for the dense layers in MLP.
+            Defaults to `True`.
         attention_dropout: float. The dropout rate for the attention
             probabilities. Defaults to `0.0`.
         drop_path_rate: float. The drop path rate to use. Defaults to `0.0`.
         image_shape: tuple. The input shape without the batch size. Defaults to
             `(518, 518, 3)`.
         rope_theta: float. The base period of the rotary position embeddings.
+            Defaults to `100.0`.
         apply_layernorm: bool. Whether to apply layer normalization to the
             outputs of each stage in the feature pyramid. Defaults to `False`.
-        query_bias: bool. Whether to use a bias for the query projection.
-        key_bias: bool. Whether to use a bias for the key projection.
-        value_bias: bool. Whether to use a bias for the value projection.
-        proj_bias: bool. Whether to use a bias for the output projection.
         data_format: `None` or str. If specified, either `"channels_last"` or
             `"channels_first"`. The ordering of the dimensions in the
             inputs. 
`"channels_last"` corresponds to inputs with shape @@ -67,16 +76,19 @@ def __init__( layer_scale_init_value=1.0, num_register_tokens=4, use_mask_token=True, + hidden_activation="gelu", use_gated_mlp=False, + use_query_bias=True, + use_key_bias=True, + use_value_bias=True, + use_proj_bias=True, + use_mlp_bias=True, attention_dropout=0.0, drop_path_rate=0.0, + layer_norm_eps=1e-5, image_shape=(518, 518, 3), - rope_theta=10000.0, + rope_theta=100.0, apply_layernorm=False, - query_bias=True, - key_bias=True, - value_bias=True, - proj_bias=True, data_format=None, dtype=None, name=None, @@ -110,18 +122,21 @@ def __init__( num_heads=num_heads, intermediate_dim=intermediate_dim, layer_scale_init_value=layer_scale_init_value, + hidden_activation=hidden_activation, use_gated_mlp=use_gated_mlp, + use_query_bias=use_query_bias, + use_key_bias=use_key_bias, + use_value_bias=use_value_bias, + use_proj_bias=use_proj_bias, + use_mlp_bias=use_mlp_bias, attention_dropout=attention_dropout, drop_path_rate=drop_path_rate, - query_bias=query_bias, - key_bias=key_bias, - value_bias=value_bias, - proj_bias=proj_bias, + layer_norm_eps=layer_norm_eps, dtype=dtype, name=f"{prefix}encoder", ) self.layernorm = layers.LayerNormalization( - epsilon=1e-6, dtype=dtype, name=f"{prefix}layernorm" + epsilon=layer_norm_eps, dtype=dtype, name=f"{prefix}layernorm" ) # === Functional Model === @@ -161,16 +176,19 @@ def __init__( self.layer_scale_init_value = float(layer_scale_init_value) self.num_register_tokens = int(num_register_tokens) self.use_mask_token = bool(use_mask_token) + self.hidden_activation = hidden_activation self.use_gated_mlp = bool(use_gated_mlp) + self.use_query_bias = bool(use_query_bias) + self.use_key_bias = bool(use_key_bias) + self.use_value_bias = bool(use_value_bias) + self.use_proj_bias = bool(use_proj_bias) + self.use_mlp_bias = bool(use_mlp_bias) self.attention_dropout = float(attention_dropout) self.drop_path_rate = float(drop_path_rate) + self.layer_norm_eps = float(layer_norm_eps) self.image_shape = image_shape self.rope_theta = rope_theta self.apply_layernorm = apply_layernorm - self.query_bias = query_bias - self.key_bias = key_bias - self.value_bias = value_bias - self.proj_bias = proj_bias self.pyramid_outputs = pyramid_outputs def get_config(self): @@ -182,19 +200,22 @@ def get_config(self): "hidden_dim": self.hidden_dim, "num_heads": self.num_heads, "intermediate_dim": self.intermediate_dim, - "layer_scale_init_value": self.layer_scale_init_value, "num_register_tokens": self.num_register_tokens, "use_mask_token": self.use_mask_token, + "layer_scale_init_value": self.layer_scale_init_value, + "hidden_activation": self.hidden_activation, "use_gated_mlp": self.use_gated_mlp, + "use_query_bias": self.use_query_bias, + "use_key_bias": self.use_key_bias, + "use_value_bias": self.use_value_bias, + "use_proj_bias": self.use_proj_bias, + "use_mlp_bias": self.use_mlp_bias, "attention_dropout": self.attention_dropout, "drop_path_rate": self.drop_path_rate, + "layer_norm_eps": self.layer_norm_eps, "image_shape": self.image_shape, "rope_theta": self.rope_theta, "apply_layernorm": self.apply_layernorm, - "query_bias": self.query_bias, - "key_bias": self.key_bias, - "value_bias": self.value_bias, - "proj_bias": self.proj_bias, } ) return config diff --git a/keras_hub/src/models/dinov3/dinov3_backbone_test.py b/keras_hub/src/models/dinov3/dinov3_backbone_test.py index 1320752c61..e032ab1c4e 100644 --- a/keras_hub/src/models/dinov3/dinov3_backbone_test.py +++ 
b/keras_hub/src/models/dinov3/dinov3_backbone_test.py @@ -11,7 +11,7 @@ class DINOV3BackboneTest(TestCase): def setUp(self): self.init_kwargs = { - "patch_size": 14, + "patch_size": 16, "num_layers": 2, "hidden_dim": 16, "num_heads": 2, @@ -19,11 +19,11 @@ def setUp(self): "layer_scale_init_value": 1.0, "num_register_tokens": 4, "use_gated_mlp": False, - "image_shape": (70, 70, 3), + "image_shape": (64, 64, 3), "name": "dinov3_backbone", } self.input_data = { - "images": ops.ones((2, 70, 70, 3)), + "images": ops.ones((2, 64, 64, 3)), } def test_backbone_basics(self): @@ -70,7 +70,7 @@ def test_position_embedding_interpolation(self): model.save_to_preset(path) restored_model = DINOV3Backbone.from_preset( path, - image_shape=(128, 128, 3), # From 70 to 128. + image_shape=(128, 128, 3), # From 64 to 128. ) input_data = { "images": ops.ones((2, 128, 128, 3)), diff --git a/keras_hub/src/models/dinov3/dinov3_layers.py b/keras_hub/src/models/dinov3/dinov3_layers.py index fa30616fad..d27c3485dc 100644 --- a/keras_hub/src/models/dinov3/dinov3_layers.py +++ b/keras_hub/src/models/dinov3/dinov3_layers.py @@ -188,7 +188,6 @@ def call(self, inputs, masks=None, training=None): ), axis=1, ) - return embeddings def get_config(self): @@ -217,28 +216,6 @@ def compute_output_shape(self, input_shape): return output_shape -def _get_patches_center_coordinates( - num_patches_h, num_patches_w, dtype="float32" -): - """A helper function to get the center coordinates of the patches.""" - coords_h = ops.arange(0.5, num_patches_h, dtype=dtype) - coords_w = ops.arange(0.5, num_patches_w, dtype=dtype) - - coords_h = coords_h / num_patches_h - coords_w = coords_w / num_patches_w - - coords_h = ops.expand_dims(coords_h, axis=1) - coords_w = ops.expand_dims(coords_w, axis=0) - - coords_h = ops.repeat(coords_h, num_patches_w, axis=1) - coords_w = ops.repeat(coords_w, num_patches_h, axis=0) - - coords = ops.stack([coords_h, coords_w], axis=-1) - coords = ops.reshape(coords, (-1, 2)) - coords = 2.0 * coords - 1.0 - return coords - - class DINOV3RopePositionEmbedding(layers.Layer): """A layer that implements Rotary Position Embedding. @@ -247,33 +224,72 @@ class DINOV3RopePositionEmbedding(layers.Layer): num_heads: int. Number of attention heads. rope_theta: float. The base period of the rotary position embeddings. patch_size: int. The size of one side of each patch. + data_format: `None` or str. If specified, either `"channels_last"` or + `"channels_first"`. The ordering of the dimensions in the + inputs. `"channels_last"` corresponds to inputs with shape + `(batch_size, height, width, channels)` + while `"channels_first"` corresponds to inputs with shape + `(batch_size, channels, height, width)`. It defaults to the + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json`. If you never set it, then it will be + `"channels_last"`. **kwargs: other keyword arguments passed to `keras.layers.Layer`, including `name`, `dtype` etc. 
""" - def __init__(self, hidden_dim, num_heads, rope_theta, patch_size, **kwargs): + def __init__( + self, + hidden_dim, + num_heads, + rope_theta, + patch_size, + data_format=None, + **kwargs, + ): super().__init__(**kwargs) - self.hidden_dim = hidden_dim - self.num_heads = num_heads - self.rope_theta = rope_theta - self.patch_size = patch_size + self.hidden_dim = int(hidden_dim) + self.num_heads = int(num_heads) + self.rope_theta = float(rope_theta) + self.patch_size = int(patch_size) + self.data_format = standardize_data_format(data_format) self.head_dim = hidden_dim // num_heads - inv_freq = 1.0 / ( - rope_theta - ** (ops.arange(0, 1, 4 / self.head_dim, dtype=self.dtype)) + self.inv_freq = 1.0 / ( + rope_theta ** (ops.arange(0, 1, 4 / self.head_dim, dtype="float32")) ) - self.inv_freq = inv_freq - def call(self, pixel_values): - shape = ops.shape(pixel_values) - height, width = shape[1], shape[2] + def _get_patches_center_coordinates( + self, num_patches_h, num_patches_w, dtype="float32" + ): + """A helper function to get the center coordinates of the patches.""" + coords_h = ops.arange(0.5, num_patches_h, dtype=dtype) + coords_w = ops.arange(0.5, num_patches_w, dtype=dtype) + + coords_h = coords_h / num_patches_h + coords_w = coords_w / num_patches_w + + coords_h = ops.expand_dims(coords_h, axis=1) + coords_w = ops.expand_dims(coords_w, axis=0) + + coords_h = ops.repeat(coords_h, num_patches_w, axis=1) + coords_w = ops.repeat(coords_w, num_patches_h, axis=0) + + coords = ops.stack([coords_h, coords_w], axis=-1) + coords = ops.reshape(coords, (-1, 2)) + coords = 2.0 * coords - 1.0 + return coords + + def call(self, inputs): + shape = ops.shape(inputs) + if self.data_format == "channels_last": + height, width = shape[1], shape[2] + else: + height, width = shape[2], shape[3] num_patches_h = height // self.patch_size num_patches_w = width // self.patch_size - patch_coords = _get_patches_center_coordinates( - num_patches_h, num_patches_w, dtype=self.dtype + patch_coords = self._get_patches_center_coordinates( + num_patches_h, num_patches_w, dtype="float32" ) - angles = ( 2 * math.pi @@ -283,12 +299,9 @@ def call(self, pixel_values): angles = ops.reshape(angles, (ops.shape(angles)[0], -1)) angles = ops.tile(angles, (1, 2)) - cos = ops.cos(angles) - sin = ops.sin(angles) - - return ops.cast(cos, pixel_values.dtype), ops.cast( - sin, pixel_values.dtype - ) + cos = ops.cast(ops.cos(angles), inputs.dtype) + sin = ops.cast(ops.sin(angles), inputs.dtype) + return cos, sin def get_config(self): config = super().get_config() @@ -302,31 +315,17 @@ def get_config(self): ) return config - -def _rotate_half(x): - """A helper function to rotate half of the features.""" - x1 = x[..., : ops.shape(x)[-1] // 2] - x2 = x[..., ops.shape(x)[-1] // 2 :] - return ops.concatenate([-x2, x1], axis=-1) - - -def _apply_rotary_pos_emb(q, k, cos, sin, num_prefix_tokens): - """A helper function to apply rotary position embedding to query and key.""" - q_prefix_tokens = q[:, :, :num_prefix_tokens, :] - q_patches = q[:, :, num_prefix_tokens:, :] - k_prefix_tokens = k[:, :, :num_prefix_tokens, :] - k_patches = k[:, :, num_prefix_tokens:, :] - - cos = ops.expand_dims(ops.expand_dims(cos, axis=0), axis=0) - sin = ops.expand_dims(ops.expand_dims(sin, axis=0), axis=0) - - q_patches = (q_patches * cos) + (_rotate_half(q_patches) * sin) - k_patches = (k_patches * cos) + (_rotate_half(k_patches) * sin) - - q = ops.concatenate([q_prefix_tokens, q_patches], axis=-2) - k = ops.concatenate([k_prefix_tokens, k_patches], axis=-2) - - 
return q, k + def compute_output_shape(self, input_shape): + output_shape = input_shape + if self.data_format == "channels_last": + height, width = input_shape[1], input_shape[2] + else: + height, width = input_shape[2], input_shape[3] + num_patches_h = height // self.patch_size + num_patches_w = width // self.patch_size + seq_len = num_patches_h * num_patches_w + output_shape = (seq_len, self.head_dim) + return output_shape, output_shape class DINOV3Attention(layers.Layer): @@ -336,10 +335,10 @@ class DINOV3Attention(layers.Layer): hidden_dim: int. The number of units in the hidden layers. num_heads: int. Number of attention heads. dropout_rate: float. The dropout rate to use. Defaults to `0.0`. - query_bias: bool. Whether to use a bias for the query projection. - key_bias: bool. Whether to use a bias for the key projection. - value_bias: bool. Whether to use a bias for the value projection. - proj_bias: bool. Whether to use a bias for the output projection. + use_query_bias: bool. Whether to use a bias for the query projection. + use_key_bias: bool. Whether to use a bias for the key projection. + use_value_bias: bool. Whether to use a bias for the value projection. + use_proj_bias: bool. Whether to use a bias for the output projection. **kwargs: other keyword arguments passed to `keras.layers.Layer`, including `name`, `dtype` etc. """ @@ -349,40 +348,44 @@ def __init__( hidden_dim, num_heads, dropout_rate=0.0, - query_bias=True, - key_bias=True, - value_bias=True, - proj_bias=True, + use_query_bias=True, + use_key_bias=True, + use_value_bias=True, + use_proj_bias=True, **kwargs, ): super().__init__(**kwargs) - self.hidden_dim = hidden_dim - self.num_heads = num_heads - self.dropout_rate = dropout_rate + self.hidden_dim = int(hidden_dim) + self.num_heads = int(num_heads) + self.dropout_rate = float(dropout_rate) + self.use_query_bias = bool(use_query_bias) + self.use_key_bias = bool(use_key_bias) + self.use_value_bias = bool(use_value_bias) + self.use_proj_bias = bool(use_proj_bias) self.head_dim = hidden_dim // num_heads self.scale = self.head_dim**-0.5 - self.q_proj = layers.Dense( + self.query_dense = layers.Dense( hidden_dim, - use_bias=query_bias, + use_bias=use_query_bias, dtype=self.dtype_policy, name="q_proj", ) - self.k_proj = layers.Dense( + self.key_dense = layers.Dense( hidden_dim, - use_bias=key_bias, + use_bias=use_key_bias, dtype=self.dtype_policy, name="k_proj", ) - self.v_proj = layers.Dense( + self.value_dense = layers.Dense( hidden_dim, - use_bias=value_bias, + use_bias=use_value_bias, dtype=self.dtype_policy, name="v_proj", ) - self.o_proj = layers.Dense( + self.output_dense = layers.Dense( hidden_dim, - use_bias=proj_bias, + use_bias=use_proj_bias, dtype=self.dtype_policy, name="o_proj", ) @@ -390,47 +393,63 @@ def __init__( dropout_rate, dtype=self.dtype_policy, name="dropout" ) + def build(self, input_shape): + self.query_dense.build(input_shape) + self.key_dense.build(input_shape) + self.value_dense.build(input_shape) + self.output_dense.build(input_shape) + + def _apply_rotary(self, q, k, cos, sin, num_prefix_tokens): + """Apply rotary position embedding to query and key.""" + + def _rotate_half(x): + """A helper function to rotate half of the features.""" + x1 = x[..., : ops.shape(x)[-1] // 2] + x2 = x[..., ops.shape(x)[-1] // 2 :] + return ops.concatenate([-x2, x1], axis=-1) + + q_prefix_tokens = q[:, :num_prefix_tokens, :, :] + q_patches = q[:, num_prefix_tokens:, :, :] + k_prefix_tokens = k[:, :num_prefix_tokens, :, :] + k_patches = k[:, num_prefix_tokens:, :, 
:]
+        cos = ops.expand_dims(ops.expand_dims(cos, axis=0), axis=2)
+        sin = ops.expand_dims(ops.expand_dims(sin, axis=0), axis=2)
+
+        q_patches = (q_patches * cos) + (_rotate_half(q_patches) * sin)
+        k_patches = (k_patches * cos) + (_rotate_half(k_patches) * sin)
+        q = ops.concatenate([q_prefix_tokens, q_patches], axis=-3)
+        k = ops.concatenate([k_prefix_tokens, k_patches], axis=-3)
+        return q, k
+
     def call(
         self,
-        hidden_states,
+        inputs,
         attention_mask=None,
         position_embeddings=None,
         num_prefix_tokens=0,
+        training=None,
     ):
-        batch_size, seq_len, _ = ops.shape(hidden_states)
-
-        q = self.q_proj(hidden_states)
-        k = self.k_proj(hidden_states)
-        v = self.v_proj(hidden_states)
-
+        batch_size, seq_len, _ = ops.shape(inputs)
+        q = self.query_dense(inputs, training=training)
+        k = self.key_dense(inputs, training=training)
+        v = self.value_dense(inputs, training=training)
         q = ops.reshape(q, (batch_size, seq_len, self.num_heads, self.head_dim))
         k = ops.reshape(k, (batch_size, seq_len, self.num_heads, self.head_dim))
         v = ops.reshape(v, (batch_size, seq_len, self.num_heads, self.head_dim))
-
-        q = ops.transpose(q, (0, 2, 1, 3))
-        k = ops.transpose(k, (0, 2, 1, 3))
-        v = ops.transpose(v, (0, 2, 1, 3))
-
         if position_embeddings is not None:
             cos, sin = position_embeddings
-            q, k = _apply_rotary_pos_emb(q, k, cos, sin, num_prefix_tokens)
+            q, k = self._apply_rotary(q, k, cos, sin, num_prefix_tokens)
 
-        attn_weights = (
-            ops.matmul(q, ops.transpose(k, (0, 1, 3, 2))) * self.scale
+        attn_output = ops.nn.dot_product_attention(
+            q,
+            k,
+            v,
+            mask=attention_mask,
+            scale=self.scale,
+            is_causal=False,
         )
-
-        if attention_mask is not None:
-            attn_weights += attention_mask
-
-        attn_weights = ops.softmax(attn_weights, axis=-1)
-        attn_weights = self.dropout(attn_weights)
-
-        attn_output = ops.matmul(attn_weights, v)
-        attn_output = ops.transpose(attn_output, (0, 2, 1, 3))
         attn_output = ops.reshape(attn_output, (batch_size, seq_len, -1))
-        attn_output = self.o_proj(attn_output)
-
-        return attn_output
+        return self.output_dense(attn_output, training=training)
 
     def get_config(self):
         config = super().get_config()
@@ -439,14 +458,17 @@ def get_config(self):
                 "hidden_dim": self.hidden_dim,
                 "num_heads": self.num_heads,
                 "dropout_rate": self.dropout_rate,
-                "query_bias": self.q_proj.use_bias,
-                "key_bias": self.k_proj.use_bias,
-                "value_bias": self.v_proj.use_bias,
-                "proj_bias": self.o_proj.use_bias,
+                "use_query_bias": self.use_query_bias,
+                "use_key_bias": self.use_key_bias,
+                "use_value_bias": self.use_value_bias,
+                "use_proj_bias": self.use_proj_bias,
             }
         )
         return config
 
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
 
 class DINOV3LayerScale(layers.Layer):
     """A layer scale.
@@ -503,10 +525,9 @@ def call(self, inputs, training=None):
             return inputs
 
         keep_prob = 1.0 - self.rate
-        random_tensor = keep_prob + random.uniform(
-            self.noise_shape, dtype=inputs.dtype
-        )
-        random_tensor = ops.floor(random_tensor)
+        random_tensor = random.uniform(self.noise_shape, dtype=inputs.dtype)
+        random_tensor = ops.add(random_tensor, keep_prob)
+        random_tensor = ops.floor(random_tensor)
         return ops.multiply(ops.divide(inputs, keep_prob), random_tensor)
 
     def get_config(self):
@@ -514,6 +534,9 @@ def get_config(self):
         config.update({"rate": self.rate})
         return config
 
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
 
 class DINOV3MLP(layers.Layer):
     """A DINOV3 MLP block.
@@ -537,12 +560,14 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self.hidden_dim = hidden_dim - self.intermediate_dim = intermediate_dim + self.hidden_dim = int(hidden_dim) + self.intermediate_dim = int(intermediate_dim) self.activation = activation - self.use_bias = use_bias + self.use_bias = bool(use_bias) + self.up_proj = layers.Dense( intermediate_dim, + activation=activation, use_bias=use_bias, dtype=self.dtype_policy, name="up_proj", @@ -553,13 +578,15 @@ def __init__( dtype=self.dtype_policy, name="down_proj", ) - self.act_fn = layers.Activation( - activation, - dtype=self.dtype_policy, - ) - def call(self, x): - return self.down_proj(self.act_fn(self.up_proj(x))) + def build(self, input_shape): + self.up_proj.build(input_shape) + input_shape = self.up_proj.compute_output_shape(input_shape) + self.down_proj.build(input_shape) + + def call(self, inputs, training=None): + x = self.up_proj(inputs, training=training) + return self.down_proj(x, training=training) def get_config(self): config = super().get_config() @@ -573,6 +600,11 @@ def get_config(self): ) return config + def compute_output_shape(self, input_shape): + output_shape = list(input_shape) + output_shape[-1] = self.hidden_dim + return output_shape + class DINOV3GatedMLP(layers.Layer): """A DINOV3 Gated MLP block. @@ -596,12 +628,14 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self.hidden_dim = hidden_dim - self.intermediate_dim = intermediate_dim + self.hidden_dim = int(hidden_dim) + self.intermediate_dim = int(intermediate_dim) self.activation = activation - self.use_bias = use_bias + self.use_bias = bool(use_bias) + self.gate_proj = layers.Dense( intermediate_dim, + activation=activation, use_bias=use_bias, dtype=self.dtype_policy, name="gate_proj", @@ -618,10 +652,19 @@ def __init__( dtype=self.dtype_policy, name="down_proj", ) - self.act_fn = layers.Activation(activation, dtype=self.dtype_policy) - def call(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + def build(self, input_shape): + self.gate_proj.build(input_shape) + self.up_proj.build(input_shape) + input_shape = self.up_proj.compute_output_shape(input_shape) + self.down_proj.build(input_shape) + + def call(self, inputs, training=None): + x = ops.multiply( + self.gate_proj(inputs, training=training), + self.up_proj(inputs, training=training), + ) + return self.down_proj(x, training=training) def get_config(self): config = super().get_config() @@ -635,6 +678,11 @@ def get_config(self): ) return config + def compute_output_shape(self, input_shape): + output_shape = list(input_shape) + output_shape[-1] = self.hidden_dim + return output_shape + class DINOV3Layer(layers.Layer): """A DINOV3 encoder layer. @@ -646,16 +694,19 @@ class DINOV3Layer(layers.Layer): a two-layer feedforward network for each transformer. layer_scale_init_value: float. The initial value for the scale. Defaults to `1.0`. + hidden_activation: str or callable. Activation to use in the MLP. + Defaults to `"gelu"`. use_gated_mlp: bool. Whether to use Gated MLP layers. Defaults to `False`. + use_query_bias: bool. Whether to use a bias for the query projection. + use_key_bias: bool. Whether to use a bias for the key projection. + use_value_bias: bool. Whether to use a bias for the value projection. + use_proj_bias: bool. Whether to use a bias for the output projection. + use_mlp_bias: bool. Whether to use a bias for the MLP layers. attention_dropout: float. The dropout rate for the attention probabilities. Defaults to `0.0`. 
drop_path_rate: float. The drop path rate to use. Defaults to `0.0`. layer_norm_eps: float. The epsilon for layer normalization. - query_bias: bool. Whether to use a bias for the query projection. - key_bias: bool. Whether to use a bias for the key projection. - value_bias: bool. Whether to use a bias for the value projection. - proj_bias: bool. Whether to use a bias for the output projection. **kwargs: other keyword arguments passed to `keras.layers.Layer`, including `name`, `dtype` etc. """ @@ -666,25 +717,33 @@ def __init__( num_heads, intermediate_dim, layer_scale_init_value=1.0, + hidden_activation="gelu", use_gated_mlp=False, + use_query_bias=True, + use_key_bias=True, + use_value_bias=True, + use_proj_bias=True, + use_mlp_bias=True, attention_dropout=0.0, drop_path_rate=0.0, layer_norm_eps=1e-6, - query_bias=True, - key_bias=True, - value_bias=True, - proj_bias=True, **kwargs, ): super().__init__(**kwargs) - self.hidden_dim = hidden_dim - self.num_heads = num_heads - self.intermediate_dim = intermediate_dim - self.layer_scale_init_value = layer_scale_init_value - self.use_gated_mlp = use_gated_mlp - self.attention_dropout = attention_dropout - self.drop_path_rate = drop_path_rate - self.layer_norm_eps = layer_norm_eps + self.hidden_dim = int(hidden_dim) + self.num_heads = int(num_heads) + self.intermediate_dim = int(intermediate_dim) + self.layer_scale_init_value = float(layer_scale_init_value) + self.hidden_activation = hidden_activation + self.use_gated_mlp = bool(use_gated_mlp) + self.use_query_bias = bool(use_query_bias) + self.use_key_bias = bool(use_key_bias) + self.use_value_bias = bool(use_value_bias) + self.use_proj_bias = bool(use_proj_bias) + self.use_mlp_bias = bool(use_mlp_bias) + self.attention_dropout = float(attention_dropout) + self.drop_path_rate = float(drop_path_rate) + self.layer_norm_eps = float(layer_norm_eps) self.norm1 = layers.LayerNormalization( epsilon=layer_norm_eps, dtype=self.dtype_policy, name="norm1" @@ -693,10 +752,10 @@ def __init__( hidden_dim=hidden_dim, num_heads=num_heads, dropout_rate=attention_dropout, - query_bias=query_bias, - key_bias=key_bias, - value_bias=value_bias, - proj_bias=proj_bias, + use_query_bias=use_query_bias, + use_key_bias=use_key_bias, + use_value_bias=use_value_bias, + use_proj_bias=use_proj_bias, dtype=self.dtype_policy, name="attention", ) @@ -718,6 +777,8 @@ def __init__( self.mlp = DINOV3GatedMLP( hidden_dim, intermediate_dim, + activation=hidden_activation, + use_bias=use_mlp_bias, dtype=self.dtype_policy, name="mlp", ) @@ -725,6 +786,8 @@ def __init__( self.mlp = DINOV3MLP( hidden_dim, intermediate_dim, + activation=hidden_activation, + use_bias=use_mlp_bias, dtype=self.dtype_policy, name="mlp", ) @@ -735,15 +798,26 @@ def __init__( name="layer_scale2", ) + def build(self, input_shape): + self.norm1.build(input_shape) + self.attention.build(input_shape) + input_shape = self.attention.compute_output_shape(input_shape) + self.layer_scale1.build(input_shape) + self.drop_path.build(input_shape) + self.norm2.build(input_shape) + self.mlp.build(input_shape) + input_shape = self.mlp.compute_output_shape(input_shape) + self.layer_scale2.build(input_shape) + def call( self, - hidden_states, + inputs, attention_mask=None, position_embeddings=None, num_prefix_tokens=0, ): - residual = hidden_states - hidden_states = self.norm1(hidden_states) + residual = inputs + hidden_states = self.norm1(inputs) hidden_states = self.attention( hidden_states, attention_mask=attention_mask, @@ -769,7 +843,13 @@ def get_config(self): 
"num_heads": self.num_heads, "intermediate_dim": self.intermediate_dim, "layer_scale_init_value": self.layer_scale_init_value, + "hidden_activation": self.hidden_activation, "use_gated_mlp": self.use_gated_mlp, + "use_query_bias": self.use_query_bias, + "use_key_bias": self.use_key_bias, + "use_value_bias": self.use_value_bias, + "use_proj_bias": self.use_proj_bias, + "use_mlp_bias": self.use_mlp_bias, "attention_dropout": self.attention_dropout, "drop_path_rate": self.drop_path_rate, "layer_norm_eps": self.layer_norm_eps, @@ -777,6 +857,9 @@ def get_config(self): ) return config + def compute_output_shape(self, input_shape): + return input_shape + class DINOV3Encoder(layers.Layer): """A DINOV3 encoder. @@ -789,16 +872,25 @@ class DINOV3Encoder(layers.Layer): a two-layer feedforward network for each transformer. layer_scale_init_value: float. The initial value for the scale. Defaults to `1.0`. + hidden_activation: str or callable. Activation to use in the MLP. + Defaults to `"gelu"`. use_gated_mlp: bool. Whether to use Gated MLP layers. Defaults to `False`. + use_query_bias: bool. Whether to use a bias for the query projection. + Defaults to `True`. + use_key_bias: bool. Whether to use a bias for the key projection. + Defaults to `True`. + use_value_bias: bool. Whether to use a bias for the value projection. + Defaults to `True`. + use_proj_bias: bool. Whether to use a bias for the output projection. + Defaults to `True`. + use_mlp_bias: bool. Whether to use a bias for the dense layers in MLP. + Defaults to `True`. attention_dropout: float. The dropout rate for the attention probabilities. Defaults to `0.0`. drop_path_rate: float. The drop path rate to use. Defaults to `0.0`. - layer_norm_eps: float. The epsilon for layer normalization. - query_bias: bool. Whether to use a bias for the query projection. - key_bias: bool. Whether to use a bias for the key projection. - value_bias: bool. Whether to use a bias for the value projection. - proj_bias: bool. Whether to use a bias for the output projection. + layer_norm_eps: float. The epsilon for layer normalization. Defaults to + `1e-5`. **kwargs: other keyword arguments passed to `keras.layers.Layer`, including `name`, `dtype` etc. 
""" @@ -810,26 +902,34 @@ def __init__( num_heads, intermediate_dim, layer_scale_init_value=1.0, + hidden_activation="gelu", use_gated_mlp=False, + use_query_bias=True, + use_key_bias=True, + use_value_bias=True, + use_proj_bias=True, + use_mlp_bias=True, attention_dropout=0.0, drop_path_rate=0.0, - layer_norm_eps=1e-6, - query_bias=True, - key_bias=True, - value_bias=True, - proj_bias=True, + layer_norm_eps=1e-5, **kwargs, ): super().__init__(**kwargs) - self.num_layers = num_layers - self.hidden_dim = hidden_dim - self.num_heads = num_heads - self.intermediate_dim = intermediate_dim - self.layer_scale_init_value = layer_scale_init_value - self.use_gated_mlp = use_gated_mlp - self.attention_dropout = attention_dropout - self.drop_path_rate = drop_path_rate - self.layer_norm_eps = layer_norm_eps + self.num_layers = int(num_layers) + self.hidden_dim = int(hidden_dim) + self.num_heads = int(num_heads) + self.intermediate_dim = int(intermediate_dim) + self.layer_scale_init_value = float(layer_scale_init_value) + self.hidden_activation = hidden_activation + self.use_gated_mlp = bool(use_gated_mlp) + self.use_query_bias = bool(use_query_bias) + self.use_key_bias = bool(use_key_bias) + self.use_value_bias = bool(use_value_bias) + self.use_proj_bias = bool(use_proj_bias) + self.use_mlp_bias = bool(use_mlp_bias) + self.attention_dropout = float(attention_dropout) + self.drop_path_rate = float(drop_path_rate) + self.layer_norm_eps = float(layer_norm_eps) dpr = [x for x in ops.linspace(0.0, drop_path_rate, num_layers)] self.layers = [ @@ -838,38 +938,47 @@ def __init__( num_heads=num_heads, intermediate_dim=intermediate_dim, layer_scale_init_value=layer_scale_init_value, + hidden_activation=hidden_activation, use_gated_mlp=use_gated_mlp, + use_query_bias=use_query_bias, + use_key_bias=use_key_bias, + use_value_bias=use_value_bias, + use_proj_bias=use_proj_bias, + use_mlp_bias=use_mlp_bias, attention_dropout=attention_dropout, drop_path_rate=dpr[i], layer_norm_eps=layer_norm_eps, - query_bias=query_bias, - key_bias=key_bias, - value_bias=value_bias, - proj_bias=proj_bias, dtype=self.dtype_policy, - name=f"layers.{i}", + name=f"layers_{i}", ) for i in range(num_layers) ] + def build(self, input_shape): + for layer in self.layers: + layer.build(input_shape) + input_shape = layer.compute_output_shape(input_shape) + def call( self, - hidden_states, + inputs, attention_mask=None, position_embeddings=None, num_prefix_tokens=0, + training=None, ): pyramid_outputs = {} - for i, layer_module in enumerate(self.layers): - hidden_states = layer_module( - hidden_states, + x = inputs + for layer_index, layer in enumerate(self.layers, start=1): + x = layer( + x, attention_mask=attention_mask, position_embeddings=position_embeddings, num_prefix_tokens=num_prefix_tokens, + training=training, ) - pyramid_outputs[f"stage{i + 1}"] = hidden_states - - return hidden_states, pyramid_outputs + pyramid_outputs[f"stage{str(layer_index)}"] = x + return x, pyramid_outputs def get_config(self): config = super().get_config() @@ -880,10 +989,22 @@ def get_config(self): "num_heads": self.num_heads, "intermediate_dim": self.intermediate_dim, "layer_scale_init_value": self.layer_scale_init_value, + "hidden_activation": self.hidden_activation, "use_gated_mlp": self.use_gated_mlp, + "use_query_bias": self.use_query_bias, + "use_key_bias": self.use_key_bias, + "use_value_bias": self.use_value_bias, + "use_proj_bias": self.use_proj_bias, + "use_mlp_bias": self.use_mlp_bias, "attention_dropout": self.attention_dropout, "drop_path_rate": 
self.drop_path_rate, "layer_norm_eps": self.layer_norm_eps, } ) return config + + def compute_output_shape(self, input_shape): + pyramid_outputs = {} + for layer_index in range(1, len(self.layers) + 1): + pyramid_outputs[f"stage{str(layer_index)}"] = input_shape + return input_shape, pyramid_outputs diff --git a/keras_hub/src/utils/transformers/convert_dinov3.py b/keras_hub/src/utils/transformers/convert_dinov3.py index 97d9bd0b09..7d51eeb118 100644 --- a/keras_hub/src/utils/transformers/convert_dinov3.py +++ b/keras_hub/src/utils/transformers/convert_dinov3.py @@ -14,20 +14,21 @@ def convert_backbone_config(transformers_config): "num_heads": transformers_config["num_attention_heads"], "intermediate_dim": transformers_config["intermediate_size"], "layer_scale_init_value": transformers_config["layerscale_value"], - "num_register_tokens": transformers_config.get( - "num_register_tokens", 0 - ), + "num_register_tokens": transformers_config["num_register_tokens"], "use_mask_token": True, + "hidden_activation": transformers_config["hidden_act"], "use_gated_mlp": transformers_config["use_gated_mlp"], + "use_query_bias": transformers_config["query_bias"], + "use_key_bias": transformers_config["key_bias"], + "use_value_bias": transformers_config["value_bias"], + "use_proj_bias": transformers_config["proj_bias"], + "use_mlp_bias": transformers_config["mlp_bias"], "attention_dropout": transformers_config["attention_dropout"], "drop_path_rate": transformers_config["drop_path_rate"], + "layer_norm_eps": transformers_config["layer_norm_eps"], "image_shape": (image_size, image_size, 3), "rope_theta": transformers_config["rope_theta"], "apply_layernorm": False, - "query_bias": transformers_config["query_bias"], - "key_bias": transformers_config["key_bias"], - "value_bias": transformers_config["value_bias"], - "proj_bias": transformers_config["proj_bias"], } @@ -56,6 +57,11 @@ def port_dense(keras_variable, weight_key): keras_variable=backbone.embeddings.cls_token, hf_weight_key="embeddings.cls_token", ) + if backbone.use_mask_token: + loader.port_weight( + keras_variable=backbone.embeddings.mask_token, + hf_weight_key="embeddings.mask_token", + ) if backbone.num_register_tokens > 0: loader.port_weight( keras_variable=backbone.embeddings.register_tokens, @@ -75,10 +81,10 @@ def port_dense(keras_variable, weight_key): for i, layer in enumerate(backbone.encoder.layers): prefix = f"layer.{i}" port_ln(layer.norm1, f"{prefix}.norm1") - port_dense(layer.attention.q_proj, f"{prefix}.attention.q_proj") - port_dense(layer.attention.k_proj, f"{prefix}.attention.k_proj") - port_dense(layer.attention.v_proj, f"{prefix}.attention.v_proj") - port_dense(layer.attention.o_proj, f"{prefix}.attention.o_proj") + port_dense(layer.attention.query_dense, f"{prefix}.attention.q_proj") + port_dense(layer.attention.key_dense, f"{prefix}.attention.k_proj") + port_dense(layer.attention.value_dense, f"{prefix}.attention.v_proj") + port_dense(layer.attention.output_dense, f"{prefix}.attention.o_proj") loader.port_weight( keras_variable=layer.layer_scale1.lambda1, diff --git a/tools/checkpoint_conversion/convert_dinov3_checkpoints.py b/tools/checkpoint_conversion/convert_dinov3_checkpoints.py index 03893b6885..36d06e3fb4 100644 --- a/tools/checkpoint_conversion/convert_dinov3_checkpoints.py +++ b/tools/checkpoint_conversion/convert_dinov3_checkpoints.py @@ -1,3 +1,28 @@ +"""Convert DINOV3 checkpoints. 
+
+export KAGGLE_USERNAME=xxx KAGGLE_KEY=xxx
+
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_small_lvd1689m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_small_lvd1689m
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_small_plus_lvd1689m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_small_plus_lvd1689m
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_base_lvd1689m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_base_lvd1689m
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_large_lvd1689m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_large_lvd1689m
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_huge_plus_lvd1689m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_huge_plus_lvd1689m
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_7b_lvd1689m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_7b_lvd1689m
+
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_large_sat493m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_large_sat493m
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_7b_sat493m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_7b_sat493m
+"""
+
 import keras
 import numpy as np
 import torch
@@ -10,7 +35,20 @@
 import keras_hub
 
 PRESET_MAP = {
+    # ViT lvd1689m variants.
     "dinov3_vit_small_lvd1689m": "facebook/dinov3-vits16-pretrain-lvd1689m",
+    "dinov3_vit_small_plus_lvd1689m": (
+        "facebook/dinov3-vits16plus-pretrain-lvd1689m"
+    ),
+    "dinov3_vit_base_lvd1689m": "facebook/dinov3-vitb16-pretrain-lvd1689m",
+    "dinov3_vit_large_lvd1689m": "facebook/dinov3-vitl16-pretrain-lvd1689m",
+    "dinov3_vit_huge_plus_lvd1689m": (
+        "facebook/dinov3-vith16plus-pretrain-lvd1689m"
+    ),
+    "dinov3_vit_7b_lvd1689m": "facebook/dinov3-vit7b16-pretrain-lvd1689m",
+    # ViT sat493m variants.
+    "dinov3_vit_large_sat493m": "facebook/dinov3-vitl16-pretrain-sat493m",
+    "dinov3_vit_7b_sat493m": "facebook/dinov3-vit7b16-pretrain-sat493m",
 }
 
 FLAGS = flags.FLAGS
@@ -37,6 +75,7 @@ def convert_image_converter(image_size, hf_image_processor):
         image_size=image_size,
         scale=[1.0 / 255.0 / s for s in std],
         offset=[-m / s for m, s in zip(mean, std)],
+        crop_to_aspect_ratio=False,
         interpolation="bilinear",
         antialias=True,
     )
@@ -56,13 +95,15 @@ def validate_output(
     # Preprocess with hf.
     hf_inputs = hf_image_processor(images=image, return_tensors="pt")
     hf_preprocessed = hf_inputs["pixel_values"].detach().cpu().numpy()
-    print("🔶 HF preprocessed shape:", hf_preprocessed.shape)
 
     # Preprocess with keras.
     images = np.expand_dims(np.array(image).astype("float32"), axis=0)
    images = keras_hub_image_converter(images)
     keras_preprocessed = keras.ops.convert_to_numpy(images)
 
+    print("🔶 Keras preprocessor output:", keras_preprocessed[0, 0, :10, 0])
+    print("🔶 HF preprocessor output:", hf_preprocessed[0, 0, 0, :10])
+
     # Call with hf. Use the keras preprocessed image so we can keep modeling
     # and preprocessing comparisons independent.
hf_inputs["pixel_values"] = torch.from_numpy( From b3156b1d6cbf895e6c1fa8e98cfa66e8cdabeb73 Mon Sep 17 00:00:00 2001 From: Hongyu Chiu <20734616+james77777778@users.noreply.github.com> Date: Mon, 27 Oct 2025 21:01:41 +0800 Subject: [PATCH 4/5] Resolves Gemini comments. --- .../src/models/dinov3/dinov3_backbone.py | 46 ++++++++++++++++++- .../src/models/dinov3/dinov3_backbone_test.py | 4 +- keras_hub/src/models/dinov3/dinov3_layers.py | 19 ++++---- .../utils/transformers/convert_dinov3_test.py | 34 ++++++++++++++ 4 files changed, 91 insertions(+), 12 deletions(-) create mode 100644 keras_hub/src/utils/transformers/convert_dinov3_test.py diff --git a/keras_hub/src/models/dinov3/dinov3_backbone.py b/keras_hub/src/models/dinov3/dinov3_backbone.py index 448051a340..5f52d9a509 100644 --- a/keras_hub/src/models/dinov3/dinov3_backbone.py +++ b/keras_hub/src/models/dinov3/dinov3_backbone.py @@ -64,6 +64,48 @@ class DINOV3Backbone(FeaturePyramidBackbone): for the models computations and weights. Note that some computations, such as softmax and layer normalization will always be done a float32 precision regardless of dtype. + + Example: + ```python + # Pretrained DINOV3 model. + input_data = { + "images": np.ones(shape=(1, 518, 518, 3), dtype="float32"), + } + model = keras_hub.models.DINOV3Backbone.from_preset( + "dinov3_vit_small_lvd1689m" + ) + model(input_data) + + # Pretrained DINOV3 model with custom image shape. + input_data = { + "images": np.ones(shape=(1, 224, 224, 3), dtype="float32"), + } + model = keras_hub.models.DINOV3Backbone.from_preset( + "dinov3_vit_small_lvd1689m", image_shape=(224, 224, 3) + ) + model(input_data) + + # Randomly initialized DINOV3 model with custom config. + model = keras_hub.models.DINOV3Backbone( + patch_size=14, + num_layers=2, + hidden_dim=32, + num_heads=2, + intermediate_dim=128, + image_shape=(224, 224, 3), + ) + model(input_data) + + # Accessing feature pyramid outputs. + backbone = keras_hub.models.DINOV3Backbone.from_preset( + "dinov3_vit_small_lvd1689m", image_shape=(224, 224, 3) + ) + model = keras.Model( + inputs=backbone.inputs, + outputs=backbone.pyramid_outputs, + ) + features = model(input_data) + ``` """ def __init__( @@ -141,7 +183,7 @@ def __init__( # === Functional Model === pyramid_outputs = {} - image_input = layers.Input(shape=image_shape, name="images") + image_input = layers.Input(shape=image_shape, name="pixel_values") x = self.embeddings(image_input) pyramid_outputs["stem"] = x @@ -160,7 +202,7 @@ def __init__( pyramid_outputs[key] = self.layernorm(pyramid_outputs[key]) outputs = x super().__init__( - inputs={"images": image_input}, + inputs={"pixel_values": image_input}, outputs=outputs, dtype=dtype, name=name, diff --git a/keras_hub/src/models/dinov3/dinov3_backbone_test.py b/keras_hub/src/models/dinov3/dinov3_backbone_test.py index e032ab1c4e..b8fdd9a0c6 100644 --- a/keras_hub/src/models/dinov3/dinov3_backbone_test.py +++ b/keras_hub/src/models/dinov3/dinov3_backbone_test.py @@ -23,7 +23,7 @@ def setUp(self): "name": "dinov3_backbone", } self.input_data = { - "images": ops.ones((2, 64, 64, 3)), + "pixel_values": ops.ones((2, 64, 64, 3)), } def test_backbone_basics(self): @@ -73,7 +73,7 @@ def test_position_embedding_interpolation(self): image_shape=(128, 128, 3), # From 64 to 128. 
) input_data = { - "images": ops.ones((2, 128, 128, 3)), + "pixel_values": ops.ones((2, 128, 128, 3)), } restored_output = restored_model(input_data) self.assertNotEqual(model_output.shape, restored_output.shape) diff --git a/keras_hub/src/models/dinov3/dinov3_layers.py b/keras_hub/src/models/dinov3/dinov3_layers.py index d27c3485dc..edc46006a8 100644 --- a/keras_hub/src/models/dinov3/dinov3_layers.py +++ b/keras_hub/src/models/dinov3/dinov3_layers.py @@ -449,6 +449,7 @@ def call( is_causal=False, ) attn_output = ops.reshape(attn_output, (batch_size, seq_len, -1)) + attn_output = self.dropout(attn_output, training=training) return self.output_dense(attn_output, training=training) def get_config(self): @@ -815,6 +816,7 @@ def call( attention_mask=None, position_embeddings=None, num_prefix_tokens=0, + training=None, ): residual = inputs hidden_states = self.norm1(inputs) @@ -823,17 +825,18 @@ def call( attention_mask=attention_mask, position_embeddings=position_embeddings, num_prefix_tokens=num_prefix_tokens, + training=training, + ) + hidden_states = self.layer_scale1(hidden_states, training=training) + hidden_states = ( + self.drop_path(hidden_states, training=training) + residual ) - hidden_states = self.layer_scale1(hidden_states) - hidden_states = self.drop_path(hidden_states) + residual residual = hidden_states - hidden_states = self.norm2(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = self.layer_scale2(hidden_states) - hidden_states = self.drop_path(hidden_states) + residual - - return hidden_states + hidden_states = self.norm2(hidden_states, training=training) + hidden_states = self.mlp(hidden_states, training=training) + hidden_states = self.layer_scale2(hidden_states, training=training) + return self.drop_path(hidden_states, training=training) + residual def get_config(self): config = super().get_config() diff --git a/keras_hub/src/utils/transformers/convert_dinov3_test.py b/keras_hub/src/utils/transformers/convert_dinov3_test.py new file mode 100644 index 0000000000..d3aa16d4db --- /dev/null +++ b/keras_hub/src/utils/transformers/convert_dinov3_test.py @@ -0,0 +1,34 @@ +import numpy as np +import pytest + +from keras_hub.src.models.dinov3.dinov3_backbone import DINOV3Backbone +from keras_hub.src.tests.test_case import TestCase + + +class TestTask(TestCase): + @pytest.mark.large + def test_convert_tiny_preset(self): + model = DINOV3Backbone.from_preset( + "hf://facebook/dinov3-vits16-pretrain-lvd1689m", + image_shape=(224, 224, 3), + ) + dummy_input = { + "pixel_values": np.ones((1, 224, 224, 3), dtype="float32") + } + output = model.predict(dummy_input) + self.assertAllClose( + output[0, 0, :10], + [ + -0.2769, + 0.5487, + 0.2501, + -1.2269, + 0.5886, + 0.0762, + 0.6251, + 0.1874, + -0.4259, + -0.4362, + ], + atol=1e-2, + ) From 3ad0e9fdd9b1765f101d33056e20289acbeb2402 Mon Sep 17 00:00:00 2001 From: Hongyu Chiu <20734616+james77777778@users.noreply.github.com> Date: Tue, 28 Oct 2025 20:49:33 +0800 Subject: [PATCH 5/5] Skip the HF conversion test. 
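
The DINOV3 weights on the Hub are gated, so the conversion test cannot run in
CI without credentials. As a sketch of a possible follow-up (illustrative
only; the `HF_TOKEN` variable and marker name are assumptions, not part of
this change), the unconditional skip could later be replaced by a token-gated
marker:

```python
import os

import pytest

# Skip only when no Hugging Face token is available in the environment.
requires_hf_token = pytest.mark.skipif(
    "HF_TOKEN" not in os.environ,
    reason="Requires a Hugging Face token for gated DINOV3 weights.",
)
```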
--- keras_hub/src/utils/transformers/convert_dinov3_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/keras_hub/src/utils/transformers/convert_dinov3_test.py b/keras_hub/src/utils/transformers/convert_dinov3_test.py index d3aa16d4db..81e3775dc8 100644 --- a/keras_hub/src/utils/transformers/convert_dinov3_test.py +++ b/keras_hub/src/utils/transformers/convert_dinov3_test.py @@ -8,6 +8,7 @@ class TestTask(TestCase): @pytest.mark.large def test_convert_tiny_preset(self): + pytest.skip(reason="TODO: enable after HF token is available in CI") model = DINOV3Backbone.from_preset( "hf://facebook/dinov3-vits16-pretrain-lvd1689m", image_shape=(224, 224, 3),