From 1faafe0a676254fd65240aa32b646440e5373b98 Mon Sep 17 00:00:00 2001
From: Hongyu Chiu <20734616+james77777778@users.noreply.github.com>
Date: Sun, 19 Oct 2025 14:33:58 +0800
Subject: [PATCH 1/5] Add DINOV3 with help from Gemini CLI.

---
 keras_hub/api/layers/__init__.py              |   3 +
 keras_hub/api/models/__init__.py              |   3 +
 keras_hub/src/models/dinov3/__init__.py       |   5 +
 .../src/models/dinov3/dinov3_backbone.py      | 200 +++++
 .../models/dinov3/dinov3_image_converter.py   |   8 +
 keras_hub/src/models/dinov3/dinov3_layers.py  | 722 ++++++++++++++++++
 keras_hub/src/models/dinov3/dinov3_presets.py |   4 +
 .../src/utils/transformers/convert_dinov3.py  | 100 +++
 .../src/utils/transformers/preset_loader.py   |   3 +
 .../convert_dinov3_checkpoints.py             | 136 ++++
 10 files changed, 1184 insertions(+)
 create mode 100644 keras_hub/src/models/dinov3/__init__.py
 create mode 100644 keras_hub/src/models/dinov3/dinov3_backbone.py
 create mode 100644 keras_hub/src/models/dinov3/dinov3_image_converter.py
 create mode 100644 keras_hub/src/models/dinov3/dinov3_layers.py
 create mode 100644 keras_hub/src/models/dinov3/dinov3_presets.py
 create mode 100644 keras_hub/src/utils/transformers/convert_dinov3.py
 create mode 100644 tools/checkpoint_conversion/convert_dinov3_checkpoints.py

diff --git a/keras_hub/api/layers/__init__.py b/keras_hub/api/layers/__init__.py
index aacee7818e..58afa65934 100644
--- a/keras_hub/api/layers/__init__.py
+++ b/keras_hub/api/layers/__init__.py
@@ -93,6 +93,9 @@
 from keras_hub.src.models.dinov2.dinov2_image_converter import (
     DINOV2ImageConverter as DINOV2ImageConverter,
 )
+from keras_hub.src.models.dinov3.dinov3_image_converter import (
+    DINOV3ImageConverter as DINOV3ImageConverter,
+)
 from keras_hub.src.models.efficientnet.efficientnet_image_converter import (
     EfficientNetImageConverter as EfficientNetImageConverter,
 )
diff --git a/keras_hub/api/models/__init__.py b/keras_hub/api/models/__init__.py
index b90dde2cc7..72eebbf64a 100644
--- a/keras_hub/api/models/__init__.py
+++ b/keras_hub/api/models/__init__.py
@@ -184,6 +184,9 @@
 from keras_hub.src.models.dinov2.dinov2_backbone import (
     DINOV2Backbone as DINOV2Backbone,
 )
+from keras_hub.src.models.dinov3.dinov3_backbone import (
+    DINOV3Backbone as DINOV3Backbone,
+)
 from keras_hub.src.models.distil_bert.distil_bert_backbone import (
     DistilBertBackbone as DistilBertBackbone,
 )
diff --git a/keras_hub/src/models/dinov3/__init__.py b/keras_hub/src/models/dinov3/__init__.py
new file mode 100644
index 0000000000..1752b3c838
--- /dev/null
+++ b/keras_hub/src/models/dinov3/__init__.py
@@ -0,0 +1,5 @@
+from keras_hub.src.models.dinov3.dinov3_backbone import DINOV3Backbone
+from keras_hub.src.models.dinov3.dinov3_presets import backbone_presets
+from keras_hub.src.utils.preset_utils import register_presets
+
+register_presets(backbone_presets, DINOV3Backbone)
diff --git a/keras_hub/src/models/dinov3/dinov3_backbone.py b/keras_hub/src/models/dinov3/dinov3_backbone.py
new file mode 100644
index 0000000000..385247d719
--- /dev/null
+++ b/keras_hub/src/models/dinov3/dinov3_backbone.py
@@ -0,0 +1,200 @@
+from keras import layers
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.dinov3.dinov3_layers import DINOV3Embedding
+from keras_hub.src.models.dinov3.dinov3_layers import DINOV3Encoder
+from keras_hub.src.models.dinov3.dinov3_layers import (
+    DINOV3RopePositionEmbedding,
+)
+from keras_hub.src.models.feature_pyramid_backbone import FeaturePyramidBackbone
+from keras_hub.src.utils.keras_utils import standardize_data_format
+
+
+@keras_hub_export("keras_hub.models.DINOV3Backbone")
+class DINOV3Backbone(FeaturePyramidBackbone):
+    """DINOV3 core network with hyperparameters.
+
+    Args:
+        patch_size: int. The size of each square patch in the input image.
+        num_layers: int. The number of transformer layers.
+        hidden_dim: int. The size of the transformer hidden state at the end
+            of each transformer layer.
+        num_heads: int. The number of attention heads for each transformer.
+        intermediate_dim: int. The output dimension of the first Dense layer in
+            a two-layer feedforward network for each transformer.
+        layer_scale_init_value: float. The initial value for the layer scale in
+            the transformer layers. Defaults to `1.0`.
+        num_register_tokens: int. The number of register tokens to use in the
+            embedding layer. Defaults to `4`.
+        use_mask_token: bool. Whether to use a mask token in the embedding
+            layer. Defaults to `True`.
+        use_gated_mlp: bool. Whether to use Gated MLP layers. Defaults to
+            `False`.
+        attention_dropout: float. The dropout rate for the attention
+            probabilities. Defaults to `0.0`.
+        drop_path_rate: float. The drop path rate to use. Defaults to `0.0`.
+        image_shape: tuple. The input shape without the batch size. Defaults to
+            `(518, 518, 3)`.
+        rope_theta: float. The base period of the rotary position embeddings.
+        apply_layernorm: bool. Whether to apply layer normalization to the
+            outputs of each stage in the feature pyramid. Defaults to `False`.
+        query_bias: bool. Whether to use a bias for the query projection.
+        key_bias: bool. Whether to use a bias for the key projection.
+        value_bias: bool. Whether to use a bias for the value projection.
+        proj_bias: bool. Whether to use a bias for the output projection.
+        data_format: `None` or str. If specified, either `"channels_last"` or
+            `"channels_first"`. The ordering of the dimensions in the
+            inputs. `"channels_last"` corresponds to inputs with shape
+            `(batch_size, height, width, channels)`
+            while `"channels_first"` corresponds to inputs with shape
+            `(batch_size, channels, height, width)`. It defaults to the
+            `image_data_format` value found in your Keras config file at
+            `~/.keras/keras.json`. If you never set it, then it will be
+            `"channels_last"`.
+        dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
+            for the model's computations and weights. Note that some
+            computations, such as softmax and layer normalization, will always
+            be done in float32 precision regardless of dtype.
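+
+    Example:
+    ```python
+    # Illustrative usage with a small, made-up configuration; no
+    # pretrained DINOV3 presets are registered in this PR yet.
+    import numpy as np
+
+    import keras_hub
+
+    images = np.ones((1, 224, 224, 3), dtype="float32")
+    backbone = keras_hub.models.DINOV3Backbone(
+        patch_size=16,
+        num_layers=2,
+        hidden_dim=32,
+        num_heads=2,
+        intermediate_dim=128,
+        image_shape=(224, 224, 3),
+    )
+    # 1 class token + 4 register tokens + (224 // 16) ** 2 patch tokens.
+    outputs = backbone({"images": images})
+    print(outputs.shape)  # (1, 201, 32)
+    ```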
+ """ + + def __init__( + self, + patch_size, + num_layers, + hidden_dim, + num_heads, + intermediate_dim, + layer_scale_init_value=1.0, + num_register_tokens=4, + use_mask_token=True, + use_gated_mlp=False, + attention_dropout=0.0, + drop_path_rate=0.0, + image_shape=(518, 518, 3), + rope_theta=10000.0, + apply_layernorm=False, + query_bias=True, + key_bias=True, + value_bias=True, + proj_bias=True, + data_format=None, + dtype=None, + name=None, + **kwargs, + ): + data_format = standardize_data_format(data_format) + + prefix = str(name) + "_" if name is not None else "" + + # === Layers === + self.embeddings = DINOV3Embedding( + hidden_dim=hidden_dim, + patch_size=patch_size, + num_register_tokens=num_register_tokens, + use_mask_token=use_mask_token, + data_format=data_format, + dtype=dtype, + name=f"{prefix}embeddings", + ) + self.rope_embedding = DINOV3RopePositionEmbedding( + hidden_dim=hidden_dim, + num_heads=num_heads, + rope_theta=rope_theta, + patch_size=patch_size, + dtype=dtype, + name=f"{prefix}rope_embedding", + ) + self.encoder = DINOV3Encoder( + num_layers=num_layers, + hidden_dim=hidden_dim, + num_heads=num_heads, + intermediate_dim=intermediate_dim, + layer_scale_init_value=layer_scale_init_value, + use_gated_mlp=use_gated_mlp, + attention_dropout=attention_dropout, + drop_path_rate=drop_path_rate, + query_bias=query_bias, + key_bias=key_bias, + value_bias=value_bias, + proj_bias=proj_bias, + dtype=dtype, + name=f"{prefix}encoder", + ) + self.layernorm = layers.LayerNormalization( + epsilon=1e-6, dtype=dtype, name=f"{prefix}layernorm" + ) + + # === Functional Model === + pyramid_outputs = {} + image_input = layers.Input(shape=image_shape, name="images") + x = self.embeddings(image_input) + pyramid_outputs["stem"] = x + + position_embeddings = self.rope_embedding(image_input) + num_prefix_tokens = 1 + num_register_tokens + + x, encoder_pyramid_outputs = self.encoder( + x, + position_embeddings=position_embeddings, + num_prefix_tokens=num_prefix_tokens, + ) + pyramid_outputs.update(encoder_pyramid_outputs) + x = self.layernorm(x) + if apply_layernorm: + for key in pyramid_outputs: + pyramid_outputs[key] = self.layernorm(pyramid_outputs[key]) + outputs = x + super().__init__( + inputs={"images": image_input}, + outputs=outputs, + dtype=dtype, + name=name, + **kwargs, + ) + + # === Config === + self.patch_size = int(patch_size) + self.num_layers = int(num_layers) + self.hidden_dim = int(hidden_dim) + self.num_heads = int(num_heads) + self.intermediate_dim = int(intermediate_dim) + self.layer_scale_init_value = float(layer_scale_init_value) + self.num_register_tokens = int(num_register_tokens) + self.use_mask_token = bool(use_mask_token) + self.use_gated_mlp = bool(use_gated_mlp) + self.attention_dropout = float(attention_dropout) + self.drop_path_rate = float(drop_path_rate) + self.image_shape = image_shape + self.rope_theta = rope_theta + self.apply_layernorm = apply_layernorm + self.query_bias = query_bias + self.key_bias = key_bias + self.value_bias = value_bias + self.proj_bias = proj_bias + self.pyramid_outputs = pyramid_outputs + + def get_config(self): + config = super().get_config() + config.update( + { + "patch_size": self.patch_size, + "num_layers": self.num_layers, + "hidden_dim": self.hidden_dim, + "num_heads": self.num_heads, + "intermediate_dim": self.intermediate_dim, + "layer_scale_init_value": self.layer_scale_init_value, + "num_register_tokens": self.num_register_tokens, + "use_mask_token": self.use_mask_token, + "use_gated_mlp": self.use_gated_mlp, + 
"attention_dropout": self.attention_dropout, + "drop_path_rate": self.drop_path_rate, + "image_shape": self.image_shape, + "rope_theta": self.rope_theta, + "apply_layernorm": self.apply_layernorm, + "query_bias": self.query_bias, + "key_bias": self.key_bias, + "value_bias": self.value_bias, + "proj_bias": self.proj_bias, + } + ) + return config diff --git a/keras_hub/src/models/dinov3/dinov3_image_converter.py b/keras_hub/src/models/dinov3/dinov3_image_converter.py new file mode 100644 index 0000000000..54b08eacf3 --- /dev/null +++ b/keras_hub/src/models/dinov3/dinov3_image_converter.py @@ -0,0 +1,8 @@ +from keras_hub.src.api_export import keras_hub_export +from keras_hub.src.layers.preprocessing.image_converter import ImageConverter +from keras_hub.src.models.dinov3.dinov3_backbone import DINOV3Backbone + + +@keras_hub_export("keras_hub.layers.DINOV3ImageConverter") +class DINOV3ImageConverter(ImageConverter): + backbone_cls = DINOV3Backbone diff --git a/keras_hub/src/models/dinov3/dinov3_layers.py b/keras_hub/src/models/dinov3/dinov3_layers.py new file mode 100644 index 0000000000..1eb80141ee --- /dev/null +++ b/keras_hub/src/models/dinov3/dinov3_layers.py @@ -0,0 +1,722 @@ +import math + +from keras import initializers +from keras import layers +from keras import ops +from keras import random + +from keras_hub.src.utils.keras_utils import standardize_data_format + + +class DINOV3PatchEmbedding(layers.Layer): + """A layer that converts images into patches. + + Args: + hidden_dim: int. The number of units in the hidden layers. + patch_size: int. The size of one side of each patch. + data_format: `None` or str. If specified, either `"channels_last"` or + `"channels_first"`. The ordering of the dimensions in the + inputs. `"channels_last"` corresponds to inputs with shape + `(batch_size, height, width, channels)` + while `"channels_first"` corresponds to inputs with shape + `(batch_size, channels, height, width)`. It defaults to the + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json`. If you never set it, then it will be + `"channels_last"`. + **kwargs: other keyword arguments passed to `keras.layers.Layer`, + including `name`, `dtype` etc. 
+    """
+
+    def __init__(self, hidden_dim, patch_size, data_format=None, **kwargs):
+        super().__init__(**kwargs)
+        self.hidden_dim = int(hidden_dim)
+        self.patch_size = int(patch_size)
+        self.data_format = standardize_data_format(data_format)
+
+        self.projection = layers.Conv2D(
+            hidden_dim,
+            kernel_size=patch_size,
+            strides=patch_size,
+            data_format=data_format,
+            kernel_initializer=initializers.TruncatedNormal(stddev=0.02),
+            dtype=self.dtype_policy,
+            name="projection",
+        )
+
+    def build(self, input_shape):
+        self.projection.build(input_shape)
+
+    def call(self, inputs, training=None):
+        batch_size = ops.shape(inputs)[0]
+        embeddings = self.projection(inputs, training=training)
+        if self.data_format == "channels_last":
+            embeddings = ops.reshape(
+                embeddings, (batch_size, -1, self.hidden_dim)
+            )
+        else:
+            embeddings = ops.reshape(
+                embeddings, (batch_size, self.hidden_dim, -1)
+            )
+            embeddings = ops.transpose(embeddings, (0, 2, 1))
+        return embeddings
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "hidden_dim": self.hidden_dim,
+                "patch_size": self.patch_size,
+            }
+        )
+        return config
+
+    def compute_output_shape(self, input_shape):
+        output_shape = [input_shape[0], None, self.hidden_dim]
+        if self.data_format == "channels_last":
+            if input_shape[1] is not None and input_shape[2] is not None:
+                patch_num = input_shape[1] // self.patch_size
+                output_shape[1] = patch_num**2
+        else:
+            if input_shape[2] is not None and input_shape[3] is not None:
+                patch_num = input_shape[2] // self.patch_size
+                output_shape[1] = patch_num**2
+        return output_shape
+
+
+class DINOV3Embedding(layers.Layer):
+    """A layer that converts images into a sequence of token embeddings.
+
+    This layer adds all the necessary tokens to the embeddings, including
+    the class token, register tokens and the mask token if specified.
+
+    Args:
+        hidden_dim: int. The number of units in the hidden layers.
+        patch_size: int. The size of one side of each patch.
+        num_register_tokens: int. The number of register tokens to add to the
+            embeddings. Defaults to `0`.
+        use_mask_token: bool. Whether to use a mask token. Defaults to `True`.
+        initializer_range: float. The standard deviation of the truncated
+            normal initializer. Defaults to `0.02`.
+        data_format: `None` or str. If specified, either `"channels_last"` or
+            `"channels_first"`. The ordering of the dimensions in the
+            inputs. `"channels_last"` corresponds to inputs with shape
+            `(batch_size, height, width, channels)`
+            while `"channels_first"` corresponds to inputs with shape
+            `(batch_size, channels, height, width)`. It defaults to the
+            `image_data_format` value found in your Keras config file at
+            `~/.keras/keras.json`. If you never set it, then it will be
+            `"channels_last"`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
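+
+    Example:
+    ```python
+    # Token layout sketch with made-up sizes. The output sequence is
+    # `[class token, register tokens, patch tokens]`, so its length is
+    # 1 + num_register_tokens + (64 // 16) ** 2 = 21.
+    embedding = DINOV3Embedding(
+        hidden_dim=32, patch_size=16, num_register_tokens=4
+    )
+    embeddings = embedding(ops.ones((2, 64, 64, 3)))  # (2, 21, 32)
+    ```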
+ """ + + def __init__( + self, + hidden_dim, + patch_size, + num_register_tokens=0, + use_mask_token=True, + initializer_range=0.02, + data_format=None, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_dim = int(hidden_dim) + self.patch_size = int(patch_size) + self.num_register_tokens = int(num_register_tokens) + self.use_mask_token = bool(use_mask_token) + self.initializer_range = float(initializer_range) + self.data_format = standardize_data_format(data_format) + + self.patch_embeddings = DINOV3PatchEmbedding( + hidden_dim, + patch_size, + data_format=data_format, + dtype=self.dtype_policy, + name="patch_embeddings", + ) + + def build(self, input_shape): + self.cls_token = self.add_weight( + shape=(1, 1, self.hidden_dim), + initializer=initializers.TruncatedNormal( + stddev=self.initializer_range + ), + trainable=True, + name="cls_token", + ) + if self.use_mask_token: + self.mask_token = self.add_weight( + shape=(1, 1, self.hidden_dim), + initializer="zeros", + trainable=True, + name="mask_token", + ) + if self.num_register_tokens > 0: + self.register_tokens = self.add_weight( + shape=(1, self.num_register_tokens, self.hidden_dim), + initializer=initializers.TruncatedNormal( + stddev=self.initializer_range + ), + trainable=True, + name="register_tokens", + ) + self.patch_embeddings.build(input_shape) + + def call(self, inputs, masks=None, training=None): + batch_size = ops.shape(inputs)[0] + embeddings = self.patch_embeddings(inputs, training=training) + + if masks is not None and self.use_mask_token: + mask_token = ops.cast(self.mask_token, embeddings.dtype) + embeddings = ops.where( + ops.expand_dims(masks, axis=-1), + mask_token, + embeddings, + ) + + cls_tokens = ops.tile(self.cls_token, (batch_size, 1, 1)) + embeddings = ops.concatenate((cls_tokens, embeddings), axis=1) + + if self.num_register_tokens > 0: + register_tokens = ops.tile(self.register_tokens, (batch_size, 1, 1)) + embeddings = ops.concatenate( + ( + embeddings[:, :1, ...], + register_tokens, + embeddings[:, 1:, ...], + ), + axis=1, + ) + + return embeddings + + def get_config(self): + config = super().get_config() + config.update( + { + "hidden_dim": self.hidden_dim, + "patch_size": self.patch_size, + "num_register_tokens": self.num_register_tokens, + "use_mask_token": self.use_mask_token, + "initializer_range": self.initializer_range, + } + ) + return config + + def compute_output_shape(self, input_shape): + output_shape = [input_shape[0], None, self.hidden_dim] + if self.data_format == "channels_last": + if input_shape[1] is not None and input_shape[2] is not None: + patch_num = input_shape[1] // self.patch_size + output_shape[1] = 1 + self.num_register_tokens + patch_num**2 + else: + if input_shape[2] is not None and input_shape[3] is not None: + patch_num = input_shape[2] // self.patch_size + output_shape[1] = 1 + self.num_register_tokens + patch_num**2 + return output_shape + + +def _get_patches_center_coordinates( + num_patches_h, num_patches_w, dtype="float32" +): + coords_h = ops.arange(0.5, num_patches_h, dtype=dtype) + coords_w = ops.arange(0.5, num_patches_w, dtype=dtype) + + coords_h = coords_h / num_patches_h + coords_w = coords_w / num_patches_w + + coords_h = ops.expand_dims(coords_h, axis=1) + coords_w = ops.expand_dims(coords_w, axis=0) + + coords_h = ops.repeat(coords_h, num_patches_w, axis=1) + coords_w = ops.repeat(coords_w, num_patches_h, axis=0) + + coords = ops.stack([coords_h, coords_w], axis=-1) + coords = ops.reshape(coords, (-1, 2)) + coords = 2.0 * coords - 1.0 + return coords + 
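+# For example, a 2x2 patch grid gives per-axis centers
+# arange(0.5, 2) / 2 = [0.25, 0.75], which `2.0 * coords - 1.0` maps to
+# [-0.5, 0.5], normalizing patch centers to the [-1, 1] range.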
+ +class DINOV3RopePositionEmbedding(layers.Layer): + def __init__(self, hidden_dim, num_heads, rope_theta, patch_size, **kwargs): + super().__init__(**kwargs) + self.hidden_dim = hidden_dim + self.num_heads = num_heads + self.rope_theta = rope_theta + self.patch_size = patch_size + self.head_dim = hidden_dim // num_heads + inv_freq = 1.0 / ( + rope_theta ** (ops.arange(0, 1, 4 / self.head_dim, dtype="float32")) + ) + self.inv_freq = inv_freq + + def call(self, pixel_values): + shape = ops.shape(pixel_values) + height, width = shape[1], shape[2] + num_patches_h = height // self.patch_size + num_patches_w = width // self.patch_size + + patch_coords = _get_patches_center_coordinates( + num_patches_h, num_patches_w, dtype="float32" + ) + + angles = ( + 2 + * math.pi + * ops.expand_dims(patch_coords, axis=-1) + * ops.expand_dims(ops.expand_dims(self.inv_freq, axis=0), axis=0) + ) + angles = ops.reshape(angles, (ops.shape(angles)[0], -1)) + angles = ops.tile(angles, (1, 2)) + + cos = ops.cos(angles) + sin = ops.sin(angles) + + return ops.cast(cos, pixel_values.dtype), ops.cast( + sin, pixel_values.dtype + ) + + def get_config(self): + config = super().get_config() + config.update( + { + "hidden_dim": self.hidden_dim, + "num_heads": self.num_heads, + "rope_theta": self.rope_theta, + "patch_size": self.patch_size, + } + ) + return config + + +def _rotate_half(x): + x1 = x[..., : ops.shape(x)[-1] // 2] + x2 = x[..., ops.shape(x)[-1] // 2 :] + return ops.concatenate([-x2, x1], axis=-1) + + +def _apply_rotary_pos_emb(q, k, cos, sin, num_prefix_tokens): + q_prefix_tokens = q[:, :, :num_prefix_tokens, :] + q_patches = q[:, :, num_prefix_tokens:, :] + k_prefix_tokens = k[:, :, :num_prefix_tokens, :] + k_patches = k[:, :, num_prefix_tokens:, :] + + cos = ops.expand_dims(ops.expand_dims(cos, axis=0), axis=0) + sin = ops.expand_dims(ops.expand_dims(sin, axis=0), axis=0) + + q_patches = (q_patches * cos) + (_rotate_half(q_patches) * sin) + k_patches = (k_patches * cos) + (_rotate_half(k_patches) * sin) + + q = ops.concatenate([q_prefix_tokens, q_patches], axis=-2) + k = ops.concatenate([k_prefix_tokens, k_patches], axis=-2) + + return q, k + + +class DINOV3Attention(layers.Layer): + def __init__( + self, + hidden_dim, + num_heads, + dropout_rate=0.0, + query_bias=True, + key_bias=True, + value_bias=True, + proj_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_dim = hidden_dim + self.num_heads = num_heads + self.dropout_rate = dropout_rate + self.head_dim = hidden_dim // num_heads + self.scale = self.head_dim**-0.5 + + self.q_proj = layers.Dense( + hidden_dim, use_bias=query_bias, name="q_proj" + ) + self.k_proj = layers.Dense(hidden_dim, use_bias=key_bias, name="k_proj") + self.v_proj = layers.Dense( + hidden_dim, use_bias=value_bias, name="v_proj" + ) + self.o_proj = layers.Dense( + hidden_dim, use_bias=proj_bias, name="o_proj" + ) + self.dropout = layers.Dropout(dropout_rate) + + def call( + self, + hidden_states, + attention_mask=None, + position_embeddings=None, + num_prefix_tokens=0, + ): + batch_size, seq_len, _ = ops.shape(hidden_states) + + q = self.q_proj(hidden_states) + k = self.k_proj(hidden_states) + v = self.v_proj(hidden_states) + + q = ops.reshape(q, (batch_size, seq_len, self.num_heads, self.head_dim)) + k = ops.reshape(k, (batch_size, seq_len, self.num_heads, self.head_dim)) + v = ops.reshape(v, (batch_size, seq_len, self.num_heads, self.head_dim)) + + q = ops.transpose(q, (0, 2, 1, 3)) + k = ops.transpose(k, (0, 2, 1, 3)) + v = ops.transpose(v, (0, 2, 1, 3)) + + if 
position_embeddings is not None: + cos, sin = position_embeddings + q, k = _apply_rotary_pos_emb(q, k, cos, sin, num_prefix_tokens) + + attn_weights = ( + ops.matmul(q, ops.transpose(k, (0, 1, 3, 2))) * self.scale + ) + + if attention_mask is not None: + attn_weights += attention_mask + + attn_weights = ops.softmax(attn_weights, axis=-1) + attn_weights = self.dropout(attn_weights) + + attn_output = ops.matmul(attn_weights, v) + attn_output = ops.transpose(attn_output, (0, 2, 1, 3)) + attn_output = ops.reshape(attn_output, (batch_size, seq_len, -1)) + attn_output = self.o_proj(attn_output) + + return attn_output + + def get_config(self): + config = super().get_config() + config.update( + { + "hidden_dim": self.hidden_dim, + "num_heads": self.num_heads, + "dropout_rate": self.dropout_rate, + "query_bias": self.q_proj.use_bias, + "key_bias": self.k_proj.use_bias, + "value_bias": self.v_proj.use_bias, + "proj_bias": self.o_proj.use_bias, + } + ) + return config + + +class DINOV3LayerScale(layers.Layer): + def __init__(self, hidden_dim, init_values=1.0, **kwargs): + super().__init__(**kwargs) + self.hidden_dim = int(hidden_dim) + self.init_values = float(init_values) + + def build(self, input_shape): + self.lambda1 = self.add_weight( + shape=(self.hidden_dim,), + initializer=initializers.Constant(self.init_values), + trainable=True, + name="lambda1", + ) + + def call(self, inputs, training=None): + return ops.multiply(inputs, self.lambda1) + + def get_config(self): + config = super().get_config() + config.update( + {"hidden_dim": self.hidden_dim, "init_values": self.init_values} + ) + return config + + +class DINOV3DropPath(layers.Layer): + def __init__(self, rate=0.0, **kwargs): + super().__init__(**kwargs) + self.rate = float(rate) + + def build(self, input_shape): + self.noise_shape = (input_shape[0],) + (1,) * (len(input_shape) - 1) + + def call(self, inputs, training=None): + if not training or self.rate == 0.0: + return inputs + + keep_prob = 1.0 - self.rate + random_tensor = keep_prob + random.uniform( + self.noise_shape, dtype=inputs.dtype + ) + random_tensor = ops.floor(random_tensor) + return ops.multiply(ops.divide(inputs, keep_prob), random_tensor) + + def get_config(self): + config = super().get_config() + config.update({"rate": self.rate}) + return config + + +class DINOV3MLP(layers.Layer): + def __init__( + self, + hidden_dim, + intermediate_dim, + activation="gelu", + use_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_dim = hidden_dim + self.intermediate_dim = intermediate_dim + self.activation = activation + self.use_bias = use_bias + self.up_proj = layers.Dense( + intermediate_dim, use_bias=use_bias, name="up_proj" + ) + self.down_proj = layers.Dense( + hidden_dim, use_bias=use_bias, name="down_proj" + ) + self.act_fn = layers.Activation(activation) + + def call(self, x): + return self.down_proj(self.act_fn(self.up_proj(x))) + + def get_config(self): + config = super().get_config() + config.update( + { + "hidden_dim": self.hidden_dim, + "intermediate_dim": self.intermediate_dim, + "activation": self.activation, + "use_bias": self.use_bias, + } + ) + return config + + +class DINOV3GatedMLP(layers.Layer): + def __init__( + self, + hidden_dim, + intermediate_dim, + activation="gelu", + use_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_dim = hidden_dim + self.intermediate_dim = intermediate_dim + self.activation = activation + self.use_bias = use_bias + self.gate_proj = layers.Dense( + intermediate_dim, use_bias=use_bias, 
name="gate_proj" + ) + self.up_proj = layers.Dense( + intermediate_dim, use_bias=use_bias, name="up_proj" + ) + self.down_proj = layers.Dense( + hidden_dim, use_bias=use_bias, name="down_proj" + ) + self.act_fn = layers.Activation(activation) + + def call(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + def get_config(self): + config = super().get_config() + config.update( + { + "hidden_dim": self.hidden_dim, + "intermediate_dim": self.intermediate_dim, + "activation": self.activation, + "use_bias": self.use_bias, + } + ) + return config + + +class DINOV3Layer(layers.Layer): + def __init__( + self, + hidden_dim, + num_heads, + intermediate_dim, + layer_scale_init_value=1.0, + use_gated_mlp=False, + attention_dropout=0.0, + drop_path_rate=0.0, + layer_norm_eps=1e-6, + query_bias=True, + key_bias=True, + value_bias=True, + proj_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_dim = hidden_dim + self.num_heads = num_heads + self.intermediate_dim = intermediate_dim + self.layer_scale_init_value = layer_scale_init_value + self.use_gated_mlp = use_gated_mlp + self.attention_dropout = attention_dropout + self.drop_path_rate = drop_path_rate + self.layer_norm_eps = layer_norm_eps + + self.norm1 = layers.LayerNormalization( + epsilon=layer_norm_eps, name="norm1" + ) + self.attention = DINOV3Attention( + hidden_dim=hidden_dim, + num_heads=num_heads, + dropout_rate=attention_dropout, + query_bias=query_bias, + key_bias=key_bias, + value_bias=value_bias, + proj_bias=proj_bias, + name="attention", + ) + self.layer_scale1 = DINOV3LayerScale( + hidden_dim, + init_values=layer_scale_init_value, + name="layer_scale1", + ) + self.drop_path = ( + DINOV3DropPath(drop_path_rate) + if drop_path_rate > 0.0 + else layers.Identity() + ) + self.norm2 = layers.LayerNormalization( + epsilon=layer_norm_eps, name="norm2" + ) + if use_gated_mlp: + self.mlp = DINOV3GatedMLP(hidden_dim, intermediate_dim, name="mlp") + else: + self.mlp = DINOV3MLP(hidden_dim, intermediate_dim, name="mlp") + self.layer_scale2 = DINOV3LayerScale( + hidden_dim, init_values=layer_scale_init_value, name="layer_scale2" + ) + + def call( + self, + hidden_states, + attention_mask=None, + position_embeddings=None, + num_prefix_tokens=0, + ): + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states = self.attention( + hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + num_prefix_tokens=num_prefix_tokens, + ) + hidden_states = self.layer_scale1(hidden_states) + hidden_states = self.drop_path(hidden_states) + residual + + residual = hidden_states + hidden_states = self.norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.layer_scale2(hidden_states) + hidden_states = self.drop_path(hidden_states) + residual + + return hidden_states + + def get_config(self): + config = super().get_config() + config.update( + { + "hidden_dim": self.hidden_dim, + "num_heads": self.num_heads, + "intermediate_dim": self.intermediate_dim, + "layer_scale_init_value": self.layer_scale_init_value, + "use_gated_mlp": self.use_gated_mlp, + "attention_dropout": self.attention_dropout, + "drop_path_rate": self.drop_path_rate, + "layer_norm_eps": self.layer_norm_eps, + } + ) + return config + + +class DINOV3Encoder(layers.Layer): + def __init__( + self, + num_layers, + hidden_dim, + num_heads, + intermediate_dim, + layer_scale_init_value=1.0, + use_gated_mlp=False, + attention_dropout=0.0, + drop_path_rate=0.0, + 
layer_norm_eps=1e-6, + query_bias=True, + key_bias=True, + value_bias=True, + proj_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + self.num_layers = num_layers + self.hidden_dim = hidden_dim + self.num_heads = num_heads + self.intermediate_dim = intermediate_dim + self.layer_scale_init_value = layer_scale_init_value + self.use_gated_mlp = use_gated_mlp + self.attention_dropout = attention_dropout + self.drop_path_rate = drop_path_rate + self.layer_norm_eps = layer_norm_eps + + dpr = [x for x in ops.linspace(0.0, drop_path_rate, num_layers)] + self.layers = [ + DINOV3Layer( + hidden_dim=hidden_dim, + num_heads=num_heads, + intermediate_dim=intermediate_dim, + layer_scale_init_value=layer_scale_init_value, + use_gated_mlp=use_gated_mlp, + attention_dropout=attention_dropout, + drop_path_rate=dpr[i], + layer_norm_eps=layer_norm_eps, + query_bias=query_bias, + key_bias=key_bias, + value_bias=value_bias, + proj_bias=proj_bias, + name=f"layers.{i}", + ) + for i in range(num_layers) + ] + + def call( + self, + hidden_states, + attention_mask=None, + position_embeddings=None, + num_prefix_tokens=0, + ): + pyramid_outputs = {} + for i, layer_module in enumerate(self.layers): + hidden_states = layer_module( + hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + num_prefix_tokens=num_prefix_tokens, + ) + pyramid_outputs[f"stage{i + 1}"] = hidden_states + + return hidden_states, pyramid_outputs + + def get_config(self): + config = super().get_config() + config.update( + { + "num_layers": self.num_layers, + "hidden_dim": self.hidden_dim, + "num_heads": self.num_heads, + "intermediate_dim": self.intermediate_dim, + "layer_scale_init_value": self.layer_scale_init_value, + "use_gated_mlp": self.use_gated_mlp, + "attention_dropout": self.attention_dropout, + "drop_path_rate": self.drop_path_rate, + "layer_norm_eps": self.layer_norm_eps, + } + ) + return config diff --git a/keras_hub/src/models/dinov3/dinov3_presets.py b/keras_hub/src/models/dinov3/dinov3_presets.py new file mode 100644 index 0000000000..077663f11b --- /dev/null +++ b/keras_hub/src/models/dinov3/dinov3_presets.py @@ -0,0 +1,4 @@ +"""DINOV3 model preset configurations.""" + +# Metadata for loading pretrained model weights. 
+backbone_presets = {} diff --git a/keras_hub/src/utils/transformers/convert_dinov3.py b/keras_hub/src/utils/transformers/convert_dinov3.py new file mode 100644 index 0000000000..97d9bd0b09 --- /dev/null +++ b/keras_hub/src/utils/transformers/convert_dinov3.py @@ -0,0 +1,100 @@ +import numpy as np + +from keras_hub.src.models.dinov3.dinov3_backbone import DINOV3Backbone + +backbone_cls = DINOV3Backbone + + +def convert_backbone_config(transformers_config): + image_size = transformers_config["image_size"] + return { + "patch_size": transformers_config["patch_size"], + "num_layers": transformers_config["num_hidden_layers"], + "hidden_dim": transformers_config["hidden_size"], + "num_heads": transformers_config["num_attention_heads"], + "intermediate_dim": transformers_config["intermediate_size"], + "layer_scale_init_value": transformers_config["layerscale_value"], + "num_register_tokens": transformers_config.get( + "num_register_tokens", 0 + ), + "use_mask_token": True, + "use_gated_mlp": transformers_config["use_gated_mlp"], + "attention_dropout": transformers_config["attention_dropout"], + "drop_path_rate": transformers_config["drop_path_rate"], + "image_shape": (image_size, image_size, 3), + "rope_theta": transformers_config["rope_theta"], + "apply_layernorm": False, + "query_bias": transformers_config["query_bias"], + "key_bias": transformers_config["key_bias"], + "value_bias": transformers_config["value_bias"], + "proj_bias": transformers_config["proj_bias"], + } + + +def convert_weights(backbone, loader, transformers_config): + if not isinstance(backbone, DINOV3Backbone): + raise ValueError( + "The provided backbone must be an instance of DINOV3Backbone. " + f"Received: {type(backbone)}" + ) + + def port_ln(keras_variable, weight_key): + loader.port_weight(keras_variable.gamma, f"{weight_key}.weight") + loader.port_weight(keras_variable.beta, f"{weight_key}.bias") + + def port_dense(keras_variable, weight_key): + loader.port_weight( + keras_variable.kernel, + f"{weight_key}.weight", + hook_fn=lambda x, _: x.T, + ) + if keras_variable.bias is not None: + loader.port_weight(keras_variable.bias, f"{weight_key}.bias") + + # Embedding. + loader.port_weight( + keras_variable=backbone.embeddings.cls_token, + hf_weight_key="embeddings.cls_token", + ) + if backbone.num_register_tokens > 0: + loader.port_weight( + keras_variable=backbone.embeddings.register_tokens, + hf_weight_key="embeddings.register_tokens", + ) + loader.port_weight( + keras_variable=backbone.embeddings.patch_embeddings.projection.kernel, + hf_weight_key="embeddings.patch_embeddings.weight", + hook_fn=lambda x, _: np.transpose(x, (2, 3, 1, 0)), + ) + loader.port_weight( + keras_variable=backbone.embeddings.patch_embeddings.projection.bias, + hf_weight_key="embeddings.patch_embeddings.bias", + ) + + # Encoder. 
+ for i, layer in enumerate(backbone.encoder.layers): + prefix = f"layer.{i}" + port_ln(layer.norm1, f"{prefix}.norm1") + port_dense(layer.attention.q_proj, f"{prefix}.attention.q_proj") + port_dense(layer.attention.k_proj, f"{prefix}.attention.k_proj") + port_dense(layer.attention.v_proj, f"{prefix}.attention.v_proj") + port_dense(layer.attention.o_proj, f"{prefix}.attention.o_proj") + + loader.port_weight( + keras_variable=layer.layer_scale1.lambda1, + hf_weight_key=f"{prefix}.layer_scale1.lambda1", + ) + port_ln(layer.norm2, f"{prefix}.norm2") + if backbone.use_gated_mlp: + port_dense(layer.mlp.gate_proj, f"{prefix}.mlp.gate_proj") + port_dense(layer.mlp.up_proj, f"{prefix}.mlp.up_proj") + port_dense(layer.mlp.down_proj, f"{prefix}.mlp.down_proj") + else: + port_dense(layer.mlp.up_proj, f"{prefix}.mlp.up_proj") + port_dense(layer.mlp.down_proj, f"{prefix}.mlp.down_proj") + loader.port_weight( + keras_variable=layer.layer_scale2.lambda1, + hf_weight_key=f"{prefix}.layer_scale2.lambda1", + ) + + port_ln(backbone.layernorm, "norm") diff --git a/keras_hub/src/utils/transformers/preset_loader.py b/keras_hub/src/utils/transformers/preset_loader.py index 73f6a27717..f98007b438 100644 --- a/keras_hub/src/utils/transformers/preset_loader.py +++ b/keras_hub/src/utils/transformers/preset_loader.py @@ -8,6 +8,7 @@ from keras_hub.src.utils.transformers import convert_bert from keras_hub.src.utils.transformers import convert_deit from keras_hub.src.utils.transformers import convert_dinov2 +from keras_hub.src.utils.transformers import convert_dinov3 from keras_hub.src.utils.transformers import convert_distilbert from keras_hub.src.utils.transformers import convert_esm from keras_hub.src.utils.transformers import convert_gemma @@ -42,6 +43,8 @@ def __init__(self, preset, config): self.converter = convert_distilbert elif model_type in ("dinov2", "dinov2_with_registers"): self.converter = convert_dinov2 + elif model_type == "dinov3_vit": + self.converter = convert_dinov3 elif model_type == "esm": self.converter = convert_esm elif model_type in ("gemma", "gemma2"): diff --git a/tools/checkpoint_conversion/convert_dinov3_checkpoints.py b/tools/checkpoint_conversion/convert_dinov3_checkpoints.py new file mode 100644 index 0000000000..03893b6885 --- /dev/null +++ b/tools/checkpoint_conversion/convert_dinov3_checkpoints.py @@ -0,0 +1,136 @@ +import keras +import numpy as np +import torch +from absl import app +from absl import flags +from PIL import Image +from transformers import AutoImageProcessor +from transformers import AutoModel + +import keras_hub + +PRESET_MAP = { + "dinov3_vit_small_lvd1689m": "facebook/dinov3-vits16-pretrain-lvd1689m", +} + +FLAGS = flags.FLAGS +flags.DEFINE_string( + "preset", + None, + f"Must be one of {','.join(PRESET_MAP.keys())}", + required=True, +) +flags.DEFINE_string( + "upload_uri", + None, + 'Could be "kaggle://keras/{variant}/keras/{preset}"', + required=False, +) + + +def convert_image_converter(image_size, hf_image_processor): + config = hf_image_processor.to_dict() + image_size = (image_size, image_size) + std = config["image_std"] + mean = config["image_mean"] + return keras_hub.layers.DINOV3ImageConverter( + image_size=image_size, + scale=[1.0 / 255.0 / s for s in std], + offset=[-m / s for m, s in zip(mean, std)], + interpolation="bilinear", + antialias=True, + ) + + +def validate_output( + keras_hub_model, + keras_hub_image_converter, + hf_model, + hf_image_processor, +): + file = keras.utils.get_file( + 
origin=("http://images.cocodataset.org/val2017/000000039769.jpg") + ) + image = Image.open(file) + + # Preprocess with hf. + hf_inputs = hf_image_processor(images=image, return_tensors="pt") + hf_preprocessed = hf_inputs["pixel_values"].detach().cpu().numpy() + print("🔶 HF preprocessed shape:", hf_preprocessed.shape) + + # Preprocess with keras. + images = np.expand_dims(np.array(image).astype("float32"), axis=0) + images = keras_hub_image_converter(images) + keras_preprocessed = keras.ops.convert_to_numpy(images) + + # Call with hf. Use the keras preprocessed image so we can keep modeling + # and preprocessing comparisons independent. + hf_inputs["pixel_values"] = torch.from_numpy( + keras.ops.convert_to_numpy( + keras.ops.transpose(keras_preprocessed, (0, 3, 1, 2)) + ) + ) + hf_outputs = hf_model(**hf_inputs) + hf_outputs = hf_outputs[0].detach().cpu().numpy() + + # Call with keras. + keras_outputs = keras_hub_model.predict({"images": images}, verbose=0) + keras_outputs = keras.ops.convert_to_numpy(keras_outputs) + + print("🔶 Keras output:", keras_outputs[0, 0, :10]) + print("🔶 HF output:", hf_outputs[0, 0, :10]) + modeling_diff = np.mean(np.abs(keras_outputs - hf_outputs)) + print("🔶 Modeling difference:", modeling_diff) + preprocessing_diff = np.mean( + np.abs(keras_preprocessed - np.transpose(hf_preprocessed, (0, 2, 3, 1))) + ) + print("🔶 Preprocessing difference:", preprocessing_diff) + + +def main(_): + # === Get the preset name === + if FLAGS.preset not in PRESET_MAP.keys(): + raise ValueError( + f"Invalid preset {FLAGS.preset}. Must be one " + f"of {','.join(PRESET_MAP.keys())}" + ) + preset = FLAGS.preset + hf_preset = PRESET_MAP[preset] + + # Load the HF model. + hf_model = AutoModel.from_pretrained(hf_preset) + hf_model.eval() + image_size = int(hf_model.config.image_size) + hf_image_processor = AutoImageProcessor.from_pretrained(hf_preset) + + # Load the KerasHub model. + keras_hub_backbone = keras_hub.models.DINOV3Backbone.from_preset( + f"hf://{hf_preset}" + ) + keras_hub_backbone.summary() + keras_hub_image_converter = convert_image_converter( + image_size, hf_image_processor + ) + print("✅ KerasHub model loaded.") + print("✅ Weights converted.") + + validate_output( + keras_hub_backbone, + keras_hub_image_converter, + hf_model, + hf_image_processor, + ) + print("✅ Output validated.") + + keras_hub_backbone.save_to_preset(f"./{preset}") + keras_hub_image_converter.save_to_preset(f"./{preset}") + print(f"🏁 Preset saved to ./{preset}.") + + upload_uri = FLAGS.upload_uri + if upload_uri: + keras_hub.upload_preset(uri=upload_uri, preset=f"./{preset}") + print(f"🏁 Preset uploaded to {upload_uri}") + + +if __name__ == "__main__": + app.run(main) From 809abbddc1a11a04cf6146c3a560e3ff8d31217b Mon Sep 17 00:00:00 2001 From: Hongyu Chiu <20734616+james77777778@users.noreply.github.com> Date: Sun, 19 Oct 2025 21:08:18 +0800 Subject: [PATCH 2/5] Add tests and docstrings. 
--- .../src/models/dinov3/dinov3_backbone_test.py | 101 +++++++++ keras_hub/src/models/dinov3/dinov3_layers.py | 209 ++++++++++++++++-- 2 files changed, 289 insertions(+), 21 deletions(-) create mode 100644 keras_hub/src/models/dinov3/dinov3_backbone_test.py diff --git a/keras_hub/src/models/dinov3/dinov3_backbone_test.py b/keras_hub/src/models/dinov3/dinov3_backbone_test.py new file mode 100644 index 0000000000..1320752c61 --- /dev/null +++ b/keras_hub/src/models/dinov3/dinov3_backbone_test.py @@ -0,0 +1,101 @@ +import os + +import keras +import pytest +from keras import ops + +from keras_hub.src.models.dinov3.dinov3_backbone import DINOV3Backbone +from keras_hub.src.tests.test_case import TestCase + + +class DINOV3BackboneTest(TestCase): + def setUp(self): + self.init_kwargs = { + "patch_size": 14, + "num_layers": 2, + "hidden_dim": 16, + "num_heads": 2, + "intermediate_dim": 16 * 4, + "layer_scale_init_value": 1.0, + "num_register_tokens": 4, + "use_gated_mlp": False, + "image_shape": (70, 70, 3), + "name": "dinov3_backbone", + } + self.input_data = { + "images": ops.ones((2, 70, 70, 3)), + } + + def test_backbone_basics(self): + patch_size = self.init_kwargs["patch_size"] + image_size = self.init_kwargs["image_shape"][0] + hidden_dim = self.init_kwargs["hidden_dim"] + num_register_tokens = self.init_kwargs["num_register_tokens"] + sequence_length = ( + (image_size // patch_size) ** 2 + 1 + num_register_tokens + ) + self.run_vision_backbone_test( + cls=DINOV3Backbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape=(2, sequence_length, hidden_dim), + expected_pyramid_output_keys=["stem", "stage1", "stage2"], + expected_pyramid_image_sizes=[(sequence_length, hidden_dim)] * 3, + run_data_format_check=False, + ) + + @pytest.mark.large + def test_saved_model(self): + self.run_model_saving_test( + cls=DINOV3Backbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) + + @pytest.mark.large + def test_position_embedding_interpolation(self): + model = DINOV3Backbone(**self.init_kwargs) + model_output = model(self.input_data) + + # Test not using interpolation in `save` and `load_model`. + path = os.path.join(self.get_temp_dir(), "model.keras") + model.save(path) + restored_model = keras.models.load_model(path) + restored_output = restored_model(self.input_data) + self.assertAllClose(model_output, restored_output, atol=1e-5, rtol=1e-5) + + # Test using interpolation in `save_to_preset` and `from_preset` if + # image_shape is different. + path = os.path.join(self.get_temp_dir(), "model") + model.save_to_preset(path) + restored_model = DINOV3Backbone.from_preset( + path, + image_shape=(128, 128, 3), # From 70 to 128. 
+ ) + input_data = { + "images": ops.ones((2, 128, 128, 3)), + } + restored_output = restored_model(input_data) + self.assertNotEqual(model_output.shape, restored_output.shape) + + @pytest.mark.kaggle_key_required + @pytest.mark.extra_large + def test_smallest_preset(self): + self.skipTest("Presets are not uploaded yet.") + self.run_preset_test( + cls=DINOV3Backbone, + preset="dinov3_vit_small_lvd1689m", + input_data=self.input_data, + expected_output_shape=(2, 1374, 768), + ) + + @pytest.mark.kaggle_key_required + @pytest.mark.extra_large + def test_all_presets(self): + self.skipTest("Presets are not uploaded yet.") + for preset in DINOV3Backbone.presets: + self.run_preset_test( + cls=DINOV3Backbone, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_hub/src/models/dinov3/dinov3_layers.py b/keras_hub/src/models/dinov3/dinov3_layers.py index 1eb80141ee..fa30616fad 100644 --- a/keras_hub/src/models/dinov3/dinov3_layers.py +++ b/keras_hub/src/models/dinov3/dinov3_layers.py @@ -220,6 +220,7 @@ def compute_output_shape(self, input_shape): def _get_patches_center_coordinates( num_patches_h, num_patches_w, dtype="float32" ): + """A helper function to get the center coordinates of the patches.""" coords_h = ops.arange(0.5, num_patches_h, dtype=dtype) coords_w = ops.arange(0.5, num_patches_w, dtype=dtype) @@ -239,6 +240,17 @@ def _get_patches_center_coordinates( class DINOV3RopePositionEmbedding(layers.Layer): + """A layer that implements Rotary Position Embedding. + + Args: + hidden_dim: int. The number of units in the hidden layers. + num_heads: int. Number of attention heads. + rope_theta: float. The base period of the rotary position embeddings. + patch_size: int. The size of one side of each patch. + **kwargs: other keyword arguments passed to `keras.layers.Layer`, + including `name`, `dtype` etc. + """ + def __init__(self, hidden_dim, num_heads, rope_theta, patch_size, **kwargs): super().__init__(**kwargs) self.hidden_dim = hidden_dim @@ -247,7 +259,8 @@ def __init__(self, hidden_dim, num_heads, rope_theta, patch_size, **kwargs): self.patch_size = patch_size self.head_dim = hidden_dim // num_heads inv_freq = 1.0 / ( - rope_theta ** (ops.arange(0, 1, 4 / self.head_dim, dtype="float32")) + rope_theta + ** (ops.arange(0, 1, 4 / self.head_dim, dtype=self.dtype)) ) self.inv_freq = inv_freq @@ -258,7 +271,7 @@ def call(self, pixel_values): num_patches_w = width // self.patch_size patch_coords = _get_patches_center_coordinates( - num_patches_h, num_patches_w, dtype="float32" + num_patches_h, num_patches_w, dtype=self.dtype ) angles = ( @@ -291,12 +304,14 @@ def get_config(self): def _rotate_half(x): + """A helper function to rotate half of the features.""" x1 = x[..., : ops.shape(x)[-1] // 2] x2 = x[..., ops.shape(x)[-1] // 2 :] return ops.concatenate([-x2, x1], axis=-1) def _apply_rotary_pos_emb(q, k, cos, sin, num_prefix_tokens): + """A helper function to apply rotary position embedding to query and key.""" q_prefix_tokens = q[:, :, :num_prefix_tokens, :] q_patches = q[:, :, num_prefix_tokens:, :] k_prefix_tokens = k[:, :, :num_prefix_tokens, :] @@ -315,6 +330,20 @@ def _apply_rotary_pos_emb(q, k, cos, sin, num_prefix_tokens): class DINOV3Attention(layers.Layer): + """A multi-head attention layer with dropout. + + Args: + hidden_dim: int. The number of units in the hidden layers. + num_heads: int. Number of attention heads. + dropout_rate: float. The dropout rate to use. Defaults to `0.0`. + query_bias: bool. Whether to use a bias for the query projection. + key_bias: bool. 
Whether to use a bias for the key projection.
+        value_bias: bool. Whether to use a bias for the value projection.
+        proj_bias: bool. Whether to use a bias for the output projection.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
+    """
+
     def __init__(
         self,
         hidden_dim,
@@ -334,16 +363,32 @@ def __init__(
         self.scale = self.head_dim**-0.5
 
         self.q_proj = layers.Dense(
-            hidden_dim, use_bias=query_bias, name="q_proj"
+            hidden_dim,
+            use_bias=query_bias,
+            dtype=self.dtype_policy,
+            name="q_proj",
+        )
+        self.k_proj = layers.Dense(
+            hidden_dim,
+            use_bias=key_bias,
+            dtype=self.dtype_policy,
+            name="k_proj",
         )
-        self.k_proj = layers.Dense(hidden_dim, use_bias=key_bias, name="k_proj")
         self.v_proj = layers.Dense(
-            hidden_dim, use_bias=value_bias, name="v_proj"
+            hidden_dim,
+            use_bias=value_bias,
+            dtype=self.dtype_policy,
+            name="v_proj",
         )
         self.o_proj = layers.Dense(
-            hidden_dim, use_bias=proj_bias, name="o_proj"
+            hidden_dim,
+            use_bias=proj_bias,
+            dtype=self.dtype_policy,
+            name="o_proj",
+        )
+        self.dropout = layers.Dropout(
+            dropout_rate, dtype=self.dtype_policy, name="dropout"
         )
-        self.dropout = layers.Dropout(dropout_rate)
 
     def call(
         self,
@@ -404,6 +449,15 @@ def get_config(self):
 
 
 class DINOV3LayerScale(layers.Layer):
+    """A layer scale.
+
+    Args:
+        hidden_dim: int. The number of units in the hidden layers.
+        init_values: float. The initial value for the scale. Defaults to `1.0`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
+    """
+
     def __init__(self, hidden_dim, init_values=1.0, **kwargs):
         super().__init__(**kwargs)
         self.hidden_dim = int(hidden_dim)
@@ -429,6 +483,14 @@ def get_config(self):
 
 
 class DINOV3DropPath(layers.Layer):
+    """A drop path layer.
+
+    Args:
+        rate: float. The drop path rate to use. Defaults to `0.0`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
+    """
+
     def __init__(self, rate=0.0, **kwargs):
         super().__init__(**kwargs)
         self.rate = float(rate)
@@ -454,6 +516,18 @@ def get_config(self):
 
 
 class DINOV3MLP(layers.Layer):
+    """A DINOV3 MLP block.
+
+    Args:
+        hidden_dim: int. The number of units in the output layer.
+        intermediate_dim: int. The output dimension of the first Dense layer.
+        activation: str or callable. Activation to use in the intermediate
+            layer. Defaults to `"gelu"`.
+        use_bias: bool. Whether to use a bias for the dense layers.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `dtype` etc.
+    """
+
     def __init__(
         self,
         hidden_dim,
@@ -468,12 +542,21 @@ def __init__(
         self.activation = activation
         self.use_bias = use_bias
         self.up_proj = layers.Dense(
-            intermediate_dim, use_bias=use_bias, name="up_proj"
+            intermediate_dim,
+            use_bias=use_bias,
+            dtype=self.dtype_policy,
+            name="up_proj",
         )
         self.down_proj = layers.Dense(
-            hidden_dim, use_bias=use_bias, name="down_proj"
+            hidden_dim,
+            use_bias=use_bias,
+            dtype=self.dtype_policy,
+            name="down_proj",
+        )
+        self.act_fn = layers.Activation(
+            activation,
+            dtype=self.dtype_policy,
         )
-        self.act_fn = layers.Activation(activation)
 
     def call(self, x):
         return self.down_proj(self.act_fn(self.up_proj(x)))
@@ -492,6 +575,18 @@ def get_config(self):
 
 
 class DINOV3GatedMLP(layers.Layer):
+    """A DINOV3 Gated MLP block.
+
+    Args:
+        hidden_dim: int. The number of units in the output layer.
+        intermediate_dim: int. The output dimension of the first Dense layer.
+        activation: str or callable. 
Activation to use in the intermediate + layer. Defaults to `"gelu"`. + use_bias: bool. Whether to use a bias for the dense layers. + **kwargs: other keyword arguments passed to `keras.layers.Layer`, + including `name`, `dtype` etc. + """ + def __init__( self, hidden_dim, @@ -506,15 +601,24 @@ def __init__( self.activation = activation self.use_bias = use_bias self.gate_proj = layers.Dense( - intermediate_dim, use_bias=use_bias, name="gate_proj" + intermediate_dim, + use_bias=use_bias, + dtype=self.dtype_policy, + name="gate_proj", ) self.up_proj = layers.Dense( - intermediate_dim, use_bias=use_bias, name="up_proj" + intermediate_dim, + use_bias=use_bias, + dtype=self.dtype_policy, + name="up_proj", ) self.down_proj = layers.Dense( - hidden_dim, use_bias=use_bias, name="down_proj" + hidden_dim, + use_bias=use_bias, + dtype=self.dtype_policy, + name="down_proj", ) - self.act_fn = layers.Activation(activation) + self.act_fn = layers.Activation(activation, dtype=self.dtype_policy) def call(self, x): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) @@ -533,6 +637,29 @@ def get_config(self): class DINOV3Layer(layers.Layer): + """A DINOV3 encoder layer. + + Args: + hidden_dim: int. The number of units in the hidden layers. + num_heads: int. Number of attention heads. + intermediate_dim: int. The output dimension of the first Dense layer in + a two-layer feedforward network for each transformer. + layer_scale_init_value: float. The initial value for the scale. + Defaults to `1.0`. + use_gated_mlp: bool. Whether to use Gated MLP layers. Defaults to + `False`. + attention_dropout: float. The dropout rate for the attention + probabilities. Defaults to `0.0`. + drop_path_rate: float. The drop path rate to use. Defaults to `0.0`. + layer_norm_eps: float. The epsilon for layer normalization. + query_bias: bool. Whether to use a bias for the query projection. + key_bias: bool. Whether to use a bias for the key projection. + value_bias: bool. Whether to use a bias for the value projection. + proj_bias: bool. Whether to use a bias for the output projection. + **kwargs: other keyword arguments passed to `keras.layers.Layer`, + including `name`, `dtype` etc. 
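+
+    Example:
+    ```python
+    # Minimal forward-pass sketch with made-up sizes. Without
+    # `position_embeddings`, the rotary embedding step is skipped.
+    layer = DINOV3Layer(hidden_dim=16, num_heads=2, intermediate_dim=64)
+    outputs = layer(ops.ones((2, 21, 16)))  # (2, 21, 16)
+    ```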
+ """ + def __init__( self, hidden_dim, @@ -560,7 +687,7 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.norm1 = layers.LayerNormalization( - epsilon=layer_norm_eps, name="norm1" + epsilon=layer_norm_eps, dtype=self.dtype_policy, name="norm1" ) self.attention = DINOV3Attention( hidden_dim=hidden_dim, @@ -570,27 +697,42 @@ def __init__( key_bias=key_bias, value_bias=value_bias, proj_bias=proj_bias, + dtype=self.dtype_policy, name="attention", ) self.layer_scale1 = DINOV3LayerScale( hidden_dim, init_values=layer_scale_init_value, + dtype=self.dtype_policy, name="layer_scale1", ) self.drop_path = ( - DINOV3DropPath(drop_path_rate) + DINOV3DropPath(drop_path_rate, dtype=self.dtype_policy) if drop_path_rate > 0.0 - else layers.Identity() + else layers.Identity(dtype=self.dtype_policy) ) self.norm2 = layers.LayerNormalization( - epsilon=layer_norm_eps, name="norm2" + epsilon=layer_norm_eps, dtype=self.dtype_policy, name="norm2" ) if use_gated_mlp: - self.mlp = DINOV3GatedMLP(hidden_dim, intermediate_dim, name="mlp") + self.mlp = DINOV3GatedMLP( + hidden_dim, + intermediate_dim, + dtype=self.dtype_policy, + name="mlp", + ) else: - self.mlp = DINOV3MLP(hidden_dim, intermediate_dim, name="mlp") + self.mlp = DINOV3MLP( + hidden_dim, + intermediate_dim, + dtype=self.dtype_policy, + name="mlp", + ) self.layer_scale2 = DINOV3LayerScale( - hidden_dim, init_values=layer_scale_init_value, name="layer_scale2" + hidden_dim, + init_values=layer_scale_init_value, + dtype=self.dtype_policy, + name="layer_scale2", ) def call( @@ -637,6 +779,30 @@ def get_config(self): class DINOV3Encoder(layers.Layer): + """A DINOV3 encoder. + + Args: + num_layers: int. The number of transformer layers. + hidden_dim: int. The number of units in the hidden layers. + num_heads: int. Number of attention heads. + intermediate_dim: int. The output dimension of the first Dense layer in + a two-layer feedforward network for each transformer. + layer_scale_init_value: float. The initial value for the scale. + Defaults to `1.0`. + use_gated_mlp: bool. Whether to use Gated MLP layers. Defaults to + `False`. + attention_dropout: float. The dropout rate for the attention + probabilities. Defaults to `0.0`. + drop_path_rate: float. The drop path rate to use. Defaults to `0.0`. + layer_norm_eps: float. The epsilon for layer normalization. + query_bias: bool. Whether to use a bias for the query projection. + key_bias: bool. Whether to use a bias for the key projection. + value_bias: bool. Whether to use a bias for the value projection. + proj_bias: bool. Whether to use a bias for the output projection. + **kwargs: other keyword arguments passed to `keras.layers.Layer`, + including `name`, `dtype` etc. + """ + def __init__( self, num_layers, @@ -680,6 +846,7 @@ def __init__( key_bias=key_bias, value_bias=value_bias, proj_bias=proj_bias, + dtype=self.dtype_policy, name=f"layers.{i}", ) for i in range(num_layers) From fadc407550054074dc07a263e38cf7b0bafda1b6 Mon Sep 17 00:00:00 2001 From: Hongyu Chiu <20734616+james77777778@users.noreply.github.com> Date: Sun, 19 Oct 2025 22:11:36 +0800 Subject: [PATCH 3/5] Update DINOV3 impls. 
---
 keras_hub/src/models/dinov2/dinov2_layers.py  |   4 +-
 .../src/models/dinov3/dinov3_backbone.py      |  67 ++-
 .../src/models/dinov3/dinov3_backbone_test.py |   8 +-
 keras_hub/src/models/dinov3/dinov3_layers.py  | 505 +++++++++++-------
 .../src/utils/transformers/convert_dinov3.py  |  28 +-
 .../convert_dinov3_checkpoints.py             |  43 +-
 6 files changed, 423 insertions(+), 232 deletions(-)

diff --git a/keras_hub/src/models/dinov2/dinov2_layers.py b/keras_hub/src/models/dinov2/dinov2_layers.py
index 1124b57a50..ce040ae266 100644
--- a/keras_hub/src/models/dinov2/dinov2_layers.py
+++ b/keras_hub/src/models/dinov2/dinov2_layers.py
@@ -502,7 +502,9 @@ def call(self, inputs, training=None):
 
     def get_config(self):
         config = super().get_config()
-        config.update({"hidden_dim": self.hidden_dim})
+        config.update(
+            {"hidden_dim": self.hidden_dim, "init_values": self.init_values}
+        )
         return config
 
     def compute_output_shape(self, input_shape):
diff --git a/keras_hub/src/models/dinov3/dinov3_backbone.py b/keras_hub/src/models/dinov3/dinov3_backbone.py
index 385247d719..448051a340 100644
--- a/keras_hub/src/models/dinov3/dinov3_backbone.py
+++ b/keras_hub/src/models/dinov3/dinov3_backbone.py
@@ -28,20 +28,29 @@ class DINOV3Backbone(FeaturePyramidBackbone):
         embedding layer. Defaults to `4`.
         use_mask_token: bool. Whether to use a mask token in the embedding
             layer. Defaults to `True`.
+        hidden_activation: str or callable. Activation to use in the MLP.
+            Defaults to `"gelu"`.
         use_gated_mlp: bool. Whether to use Gated MLP layers. Defaults to
             `False`.
+        use_query_bias: bool. Whether to use a bias for the query projection.
+            Defaults to `True`.
+        use_key_bias: bool. Whether to use a bias for the key projection.
+            Defaults to `True`.
+        use_value_bias: bool. Whether to use a bias for the value projection.
+            Defaults to `True`.
+        use_proj_bias: bool. Whether to use a bias for the output projection.
+            Defaults to `True`.
+        use_mlp_bias: bool. Whether to use a bias for the dense layers in MLP.
+            Defaults to `True`.
         attention_dropout: float. The dropout rate for the attention
             probabilities. Defaults to `0.0`.
         drop_path_rate: float. The drop path rate to use. Defaults to `0.0`.
         image_shape: tuple. The input shape without the batch size. Defaults to
             `(518, 518, 3)`.
         rope_theta: float. The base period of the rotary position embeddings.
+            Defaults to `100.0`.
         apply_layernorm: bool. Whether to apply layer normalization to the
             outputs of each stage in the feature pyramid. Defaults to `False`.
-        query_bias: bool. Whether to use a bias for the query projection.
-        key_bias: bool. Whether to use a bias for the key projection.
-        value_bias: bool. Whether to use a bias for the value projection.
-        proj_bias: bool. Whether to use a bias for the output projection.
         data_format: `None` or str. If specified, either `"channels_last"` or
             `"channels_first"`. The ordering of the dimensions in the
             inputs. 
`"channels_last"` corresponds to inputs with shape @@ -67,16 +76,19 @@ def __init__( layer_scale_init_value=1.0, num_register_tokens=4, use_mask_token=True, + hidden_activation="gelu", use_gated_mlp=False, + use_query_bias=True, + use_key_bias=True, + use_value_bias=True, + use_proj_bias=True, + use_mlp_bias=True, attention_dropout=0.0, drop_path_rate=0.0, + layer_norm_eps=1e-5, image_shape=(518, 518, 3), - rope_theta=10000.0, + rope_theta=100.0, apply_layernorm=False, - query_bias=True, - key_bias=True, - value_bias=True, - proj_bias=True, data_format=None, dtype=None, name=None, @@ -110,18 +122,21 @@ def __init__( num_heads=num_heads, intermediate_dim=intermediate_dim, layer_scale_init_value=layer_scale_init_value, + hidden_activation=hidden_activation, use_gated_mlp=use_gated_mlp, + use_query_bias=use_query_bias, + use_key_bias=use_key_bias, + use_value_bias=use_value_bias, + use_proj_bias=use_proj_bias, + use_mlp_bias=use_mlp_bias, attention_dropout=attention_dropout, drop_path_rate=drop_path_rate, - query_bias=query_bias, - key_bias=key_bias, - value_bias=value_bias, - proj_bias=proj_bias, + layer_norm_eps=layer_norm_eps, dtype=dtype, name=f"{prefix}encoder", ) self.layernorm = layers.LayerNormalization( - epsilon=1e-6, dtype=dtype, name=f"{prefix}layernorm" + epsilon=layer_norm_eps, dtype=dtype, name=f"{prefix}layernorm" ) # === Functional Model === @@ -161,16 +176,19 @@ def __init__( self.layer_scale_init_value = float(layer_scale_init_value) self.num_register_tokens = int(num_register_tokens) self.use_mask_token = bool(use_mask_token) + self.hidden_activation = hidden_activation self.use_gated_mlp = bool(use_gated_mlp) + self.use_query_bias = bool(use_query_bias) + self.use_key_bias = bool(use_key_bias) + self.use_value_bias = bool(use_value_bias) + self.use_proj_bias = bool(use_proj_bias) + self.use_mlp_bias = bool(use_mlp_bias) self.attention_dropout = float(attention_dropout) self.drop_path_rate = float(drop_path_rate) + self.layer_norm_eps = float(layer_norm_eps) self.image_shape = image_shape self.rope_theta = rope_theta self.apply_layernorm = apply_layernorm - self.query_bias = query_bias - self.key_bias = key_bias - self.value_bias = value_bias - self.proj_bias = proj_bias self.pyramid_outputs = pyramid_outputs def get_config(self): @@ -182,19 +200,22 @@ def get_config(self): "hidden_dim": self.hidden_dim, "num_heads": self.num_heads, "intermediate_dim": self.intermediate_dim, - "layer_scale_init_value": self.layer_scale_init_value, "num_register_tokens": self.num_register_tokens, "use_mask_token": self.use_mask_token, + "layer_scale_init_value": self.layer_scale_init_value, + "hidden_activation": self.hidden_activation, "use_gated_mlp": self.use_gated_mlp, + "use_query_bias": self.use_query_bias, + "use_key_bias": self.use_key_bias, + "use_value_bias": self.use_value_bias, + "use_proj_bias": self.use_proj_bias, + "use_mlp_bias": self.use_mlp_bias, "attention_dropout": self.attention_dropout, "drop_path_rate": self.drop_path_rate, + "layer_norm_eps": self.layer_norm_eps, "image_shape": self.image_shape, "rope_theta": self.rope_theta, "apply_layernorm": self.apply_layernorm, - "query_bias": self.query_bias, - "key_bias": self.key_bias, - "value_bias": self.value_bias, - "proj_bias": self.proj_bias, } ) return config diff --git a/keras_hub/src/models/dinov3/dinov3_backbone_test.py b/keras_hub/src/models/dinov3/dinov3_backbone_test.py index 1320752c61..e032ab1c4e 100644 --- a/keras_hub/src/models/dinov3/dinov3_backbone_test.py +++ 
b/keras_hub/src/models/dinov3/dinov3_backbone_test.py @@ -11,7 +11,7 @@ class DINOV3BackboneTest(TestCase): def setUp(self): self.init_kwargs = { - "patch_size": 14, + "patch_size": 16, "num_layers": 2, "hidden_dim": 16, "num_heads": 2, @@ -19,11 +19,11 @@ def setUp(self): "layer_scale_init_value": 1.0, "num_register_tokens": 4, "use_gated_mlp": False, - "image_shape": (70, 70, 3), + "image_shape": (64, 64, 3), "name": "dinov3_backbone", } self.input_data = { - "images": ops.ones((2, 70, 70, 3)), + "images": ops.ones((2, 64, 64, 3)), } def test_backbone_basics(self): @@ -70,7 +70,7 @@ def test_position_embedding_interpolation(self): model.save_to_preset(path) restored_model = DINOV3Backbone.from_preset( path, - image_shape=(128, 128, 3), # From 70 to 128. + image_shape=(128, 128, 3), # From 64 to 128. ) input_data = { "images": ops.ones((2, 128, 128, 3)), diff --git a/keras_hub/src/models/dinov3/dinov3_layers.py b/keras_hub/src/models/dinov3/dinov3_layers.py index fa30616fad..d27c3485dc 100644 --- a/keras_hub/src/models/dinov3/dinov3_layers.py +++ b/keras_hub/src/models/dinov3/dinov3_layers.py @@ -188,7 +188,6 @@ def call(self, inputs, masks=None, training=None): ), axis=1, ) - return embeddings def get_config(self): @@ -217,28 +216,6 @@ def compute_output_shape(self, input_shape): return output_shape -def _get_patches_center_coordinates( - num_patches_h, num_patches_w, dtype="float32" -): - """A helper function to get the center coordinates of the patches.""" - coords_h = ops.arange(0.5, num_patches_h, dtype=dtype) - coords_w = ops.arange(0.5, num_patches_w, dtype=dtype) - - coords_h = coords_h / num_patches_h - coords_w = coords_w / num_patches_w - - coords_h = ops.expand_dims(coords_h, axis=1) - coords_w = ops.expand_dims(coords_w, axis=0) - - coords_h = ops.repeat(coords_h, num_patches_w, axis=1) - coords_w = ops.repeat(coords_w, num_patches_h, axis=0) - - coords = ops.stack([coords_h, coords_w], axis=-1) - coords = ops.reshape(coords, (-1, 2)) - coords = 2.0 * coords - 1.0 - return coords - - class DINOV3RopePositionEmbedding(layers.Layer): """A layer that implements Rotary Position Embedding. @@ -247,33 +224,72 @@ class DINOV3RopePositionEmbedding(layers.Layer): num_heads: int. Number of attention heads. rope_theta: float. The base period of the rotary position embeddings. patch_size: int. The size of one side of each patch. + data_format: `None` or str. If specified, either `"channels_last"` or + `"channels_first"`. The ordering of the dimensions in the + inputs. `"channels_last"` corresponds to inputs with shape + `(batch_size, height, width, channels)` + while `"channels_first"` corresponds to inputs with shape + `(batch_size, channels, height, width)`. It defaults to the + `image_data_format` value found in your Keras config file at + `~/.keras/keras.json`. If you never set it, then it will be + `"channels_last"`. **kwargs: other keyword arguments passed to `keras.layers.Layer`, including `name`, `dtype` etc. 
""" - def __init__(self, hidden_dim, num_heads, rope_theta, patch_size, **kwargs): + def __init__( + self, + hidden_dim, + num_heads, + rope_theta, + patch_size, + data_format=None, + **kwargs, + ): super().__init__(**kwargs) - self.hidden_dim = hidden_dim - self.num_heads = num_heads - self.rope_theta = rope_theta - self.patch_size = patch_size + self.hidden_dim = int(hidden_dim) + self.num_heads = int(num_heads) + self.rope_theta = float(rope_theta) + self.patch_size = int(patch_size) + self.data_format = standardize_data_format(data_format) self.head_dim = hidden_dim // num_heads - inv_freq = 1.0 / ( - rope_theta - ** (ops.arange(0, 1, 4 / self.head_dim, dtype=self.dtype)) + self.inv_freq = 1.0 / ( + rope_theta ** (ops.arange(0, 1, 4 / self.head_dim, dtype="float32")) ) - self.inv_freq = inv_freq - def call(self, pixel_values): - shape = ops.shape(pixel_values) - height, width = shape[1], shape[2] + def _get_patches_center_coordinates( + self, num_patches_h, num_patches_w, dtype="float32" + ): + """A helper function to get the center coordinates of the patches.""" + coords_h = ops.arange(0.5, num_patches_h, dtype=dtype) + coords_w = ops.arange(0.5, num_patches_w, dtype=dtype) + + coords_h = coords_h / num_patches_h + coords_w = coords_w / num_patches_w + + coords_h = ops.expand_dims(coords_h, axis=1) + coords_w = ops.expand_dims(coords_w, axis=0) + + coords_h = ops.repeat(coords_h, num_patches_w, axis=1) + coords_w = ops.repeat(coords_w, num_patches_h, axis=0) + + coords = ops.stack([coords_h, coords_w], axis=-1) + coords = ops.reshape(coords, (-1, 2)) + coords = 2.0 * coords - 1.0 + return coords + + def call(self, inputs): + shape = ops.shape(inputs) + if self.data_format == "channels_last": + height, width = shape[1], shape[2] + else: + height, width = shape[2], shape[3] num_patches_h = height // self.patch_size num_patches_w = width // self.patch_size - patch_coords = _get_patches_center_coordinates( - num_patches_h, num_patches_w, dtype=self.dtype + patch_coords = self._get_patches_center_coordinates( + num_patches_h, num_patches_w, dtype="float32" ) - angles = ( 2 * math.pi @@ -283,12 +299,9 @@ def call(self, pixel_values): angles = ops.reshape(angles, (ops.shape(angles)[0], -1)) angles = ops.tile(angles, (1, 2)) - cos = ops.cos(angles) - sin = ops.sin(angles) - - return ops.cast(cos, pixel_values.dtype), ops.cast( - sin, pixel_values.dtype - ) + cos = ops.cast(ops.cos(angles), inputs.dtype) + sin = ops.cast(ops.sin(angles), inputs.dtype) + return cos, sin def get_config(self): config = super().get_config() @@ -302,31 +315,17 @@ def get_config(self): ) return config - -def _rotate_half(x): - """A helper function to rotate half of the features.""" - x1 = x[..., : ops.shape(x)[-1] // 2] - x2 = x[..., ops.shape(x)[-1] // 2 :] - return ops.concatenate([-x2, x1], axis=-1) - - -def _apply_rotary_pos_emb(q, k, cos, sin, num_prefix_tokens): - """A helper function to apply rotary position embedding to query and key.""" - q_prefix_tokens = q[:, :, :num_prefix_tokens, :] - q_patches = q[:, :, num_prefix_tokens:, :] - k_prefix_tokens = k[:, :, :num_prefix_tokens, :] - k_patches = k[:, :, num_prefix_tokens:, :] - - cos = ops.expand_dims(ops.expand_dims(cos, axis=0), axis=0) - sin = ops.expand_dims(ops.expand_dims(sin, axis=0), axis=0) - - q_patches = (q_patches * cos) + (_rotate_half(q_patches) * sin) - k_patches = (k_patches * cos) + (_rotate_half(k_patches) * sin) - - q = ops.concatenate([q_prefix_tokens, q_patches], axis=-2) - k = ops.concatenate([k_prefix_tokens, k_patches], axis=-2) - - 
return q, k + def compute_output_shape(self, input_shape): + output_shape = input_shape + if self.data_format == "channels_last": + height, width = input_shape[1], input_shape[2] + else: + height, width = input_shape[2], input_shape[3] + num_patches_h = height // self.patch_size + num_patches_w = width // self.patch_size + seq_len = num_patches_h * num_patches_w + output_shape = (seq_len, self.head_dim) + return output_shape, output_shape class DINOV3Attention(layers.Layer): @@ -336,10 +335,10 @@ class DINOV3Attention(layers.Layer): hidden_dim: int. The number of units in the hidden layers. num_heads: int. Number of attention heads. dropout_rate: float. The dropout rate to use. Defaults to `0.0`. - query_bias: bool. Whether to use a bias for the query projection. - key_bias: bool. Whether to use a bias for the key projection. - value_bias: bool. Whether to use a bias for the value projection. - proj_bias: bool. Whether to use a bias for the output projection. + use_query_bias: bool. Whether to use a bias for the query projection. + use_key_bias: bool. Whether to use a bias for the key projection. + use_value_bias: bool. Whether to use a bias for the value projection. + use_proj_bias: bool. Whether to use a bias for the output projection. **kwargs: other keyword arguments passed to `keras.layers.Layer`, including `name`, `dtype` etc. """ @@ -349,40 +348,44 @@ def __init__( hidden_dim, num_heads, dropout_rate=0.0, - query_bias=True, - key_bias=True, - value_bias=True, - proj_bias=True, + use_query_bias=True, + use_key_bias=True, + use_value_bias=True, + use_proj_bias=True, **kwargs, ): super().__init__(**kwargs) - self.hidden_dim = hidden_dim - self.num_heads = num_heads - self.dropout_rate = dropout_rate + self.hidden_dim = int(hidden_dim) + self.num_heads = int(num_heads) + self.dropout_rate = float(dropout_rate) + self.use_query_bias = bool(use_query_bias) + self.use_key_bias = bool(use_key_bias) + self.use_value_bias = bool(use_value_bias) + self.use_proj_bias = bool(use_proj_bias) self.head_dim = hidden_dim // num_heads self.scale = self.head_dim**-0.5 - self.q_proj = layers.Dense( + self.query_dense = layers.Dense( hidden_dim, - use_bias=query_bias, + use_bias=use_query_bias, dtype=self.dtype_policy, name="q_proj", ) - self.k_proj = layers.Dense( + self.key_dense = layers.Dense( hidden_dim, - use_bias=key_bias, + use_bias=use_key_bias, dtype=self.dtype_policy, name="k_proj", ) - self.v_proj = layers.Dense( + self.value_dense = layers.Dense( hidden_dim, - use_bias=value_bias, + use_bias=use_value_bias, dtype=self.dtype_policy, name="v_proj", ) - self.o_proj = layers.Dense( + self.output_dense = layers.Dense( hidden_dim, - use_bias=proj_bias, + use_bias=use_proj_bias, dtype=self.dtype_policy, name="o_proj", ) @@ -390,47 +393,63 @@ def __init__( dropout_rate, dtype=self.dtype_policy, name="dropout" ) + def build(self, input_shape): + self.query_dense.build(input_shape) + self.key_dense.build(input_shape) + self.value_dense.build(input_shape) + self.output_dense.build(input_shape) + + def _apply_rotary(self, q, k, cos, sin, num_prefix_tokens): + """Apply rotary position embedding to query and key.""" + + def _rotate_half(x): + """A helper function to rotate half of the features.""" + x1 = x[..., : ops.shape(x)[-1] // 2] + x2 = x[..., ops.shape(x)[-1] // 2 :] + return ops.concatenate([-x2, x1], axis=-1) + + q_prefix_tokens = q[:, :num_prefix_tokens, :, :] + q_patches = q[:, num_prefix_tokens:, :, :] + k_prefix_tokens = k[:, :num_prefix_tokens, :, :] + k_patches = k[:, num_prefix_tokens:, :, 
:]
+        cos = ops.expand_dims(ops.expand_dims(cos, axis=0), axis=2)
+        sin = ops.expand_dims(ops.expand_dims(sin, axis=0), axis=2)
+
+        q_patches = (q_patches * cos) + (_rotate_half(q_patches) * sin)
+        k_patches = (k_patches * cos) + (_rotate_half(k_patches) * sin)
+        q = ops.concatenate([q_prefix_tokens, q_patches], axis=-3)
+        k = ops.concatenate([k_prefix_tokens, k_patches], axis=-3)
+        return q, k
+
     def call(
         self,
-        hidden_states,
+        inputs,
         attention_mask=None,
         position_embeddings=None,
         num_prefix_tokens=0,
+        training=None,
     ):
-        batch_size, seq_len, _ = ops.shape(hidden_states)
-
-        q = self.q_proj(hidden_states)
-        k = self.k_proj(hidden_states)
-        v = self.v_proj(hidden_states)
-
+        batch_size, seq_len, _ = ops.shape(inputs)
+        q = self.query_dense(inputs, training=training)
+        k = self.key_dense(inputs, training=training)
+        v = self.value_dense(inputs, training=training)
         q = ops.reshape(q, (batch_size, seq_len, self.num_heads, self.head_dim))
         k = ops.reshape(k, (batch_size, seq_len, self.num_heads, self.head_dim))
         v = ops.reshape(v, (batch_size, seq_len, self.num_heads, self.head_dim))
-
-        q = ops.transpose(q, (0, 2, 1, 3))
-        k = ops.transpose(k, (0, 2, 1, 3))
-        v = ops.transpose(v, (0, 2, 1, 3))
-
         if position_embeddings is not None:
             cos, sin = position_embeddings
-            q, k = _apply_rotary_pos_emb(q, k, cos, sin, num_prefix_tokens)
+            q, k = self._apply_rotary(q, k, cos, sin, num_prefix_tokens)
 
-        attn_weights = (
-            ops.matmul(q, ops.transpose(k, (0, 1, 3, 2))) * self.scale
+        attn_output = ops.nn.dot_product_attention(
+            q,
+            k,
+            v,
+            mask=attention_mask,
+            scale=self.scale,
+            is_causal=False,
         )
-
-        if attention_mask is not None:
-            attn_weights += attention_mask
-
-        attn_weights = ops.softmax(attn_weights, axis=-1)
-        attn_weights = self.dropout(attn_weights)
-
-        attn_output = ops.matmul(attn_weights, v)
-        attn_output = ops.transpose(attn_output, (0, 2, 1, 3))
         attn_output = ops.reshape(attn_output, (batch_size, seq_len, -1))
-        attn_output = self.o_proj(attn_output)
-
-        return attn_output
+        return self.output_dense(attn_output, training=training)
 
     def get_config(self):
         config = super().get_config()
@@ -439,14 +458,17 @@ def get_config(self):
                 "hidden_dim": self.hidden_dim,
                 "num_heads": self.num_heads,
                 "dropout_rate": self.dropout_rate,
-                "query_bias": self.q_proj.use_bias,
-                "key_bias": self.k_proj.use_bias,
-                "value_bias": self.v_proj.use_bias,
-                "proj_bias": self.o_proj.use_bias,
+                "use_query_bias": self.use_query_bias,
+                "use_key_bias": self.use_key_bias,
+                "use_value_bias": self.use_value_bias,
+                "use_proj_bias": self.use_proj_bias,
             }
         )
         return config
 
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
 
 class DINOV3LayerScale(layers.Layer):
     """A layer scale.
@@ -503,10 +525,9 @@ def call(self, inputs, training=None):
             return inputs
 
         keep_prob = 1.0 - self.rate
-        random_tensor = keep_prob + random.uniform(
-            self.noise_shape, dtype=inputs.dtype
-        )
-        random_tensor = ops.floor(random_tensor)
+        random_tensor = random.uniform(self.noise_shape, dtype=inputs.dtype)
+        random_tensor = ops.add(random_tensor, keep_prob)
+        random_tensor = ops.floor(random_tensor)
         return ops.multiply(ops.divide(inputs, keep_prob), random_tensor)
 
     def get_config(self):
@@ -514,6 +534,9 @@ def get_config(self):
         config.update({"rate": self.rate})
         return config
 
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
 
 class DINOV3MLP(layers.Layer):
     """A DINOV3 MLP block.
@@ -537,12 +560,14 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self.hidden_dim = hidden_dim - self.intermediate_dim = intermediate_dim + self.hidden_dim = int(hidden_dim) + self.intermediate_dim = int(intermediate_dim) self.activation = activation - self.use_bias = use_bias + self.use_bias = bool(use_bias) + self.up_proj = layers.Dense( intermediate_dim, + activation=activation, use_bias=use_bias, dtype=self.dtype_policy, name="up_proj", @@ -553,13 +578,15 @@ def __init__( dtype=self.dtype_policy, name="down_proj", ) - self.act_fn = layers.Activation( - activation, - dtype=self.dtype_policy, - ) - def call(self, x): - return self.down_proj(self.act_fn(self.up_proj(x))) + def build(self, input_shape): + self.up_proj.build(input_shape) + input_shape = self.up_proj.compute_output_shape(input_shape) + self.down_proj.build(input_shape) + + def call(self, inputs, training=None): + x = self.up_proj(inputs, training=training) + return self.down_proj(x, training=training) def get_config(self): config = super().get_config() @@ -573,6 +600,11 @@ def get_config(self): ) return config + def compute_output_shape(self, input_shape): + output_shape = list(input_shape) + output_shape[-1] = self.hidden_dim + return output_shape + class DINOV3GatedMLP(layers.Layer): """A DINOV3 Gated MLP block. @@ -596,12 +628,14 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self.hidden_dim = hidden_dim - self.intermediate_dim = intermediate_dim + self.hidden_dim = int(hidden_dim) + self.intermediate_dim = int(intermediate_dim) self.activation = activation - self.use_bias = use_bias + self.use_bias = bool(use_bias) + self.gate_proj = layers.Dense( intermediate_dim, + activation=activation, use_bias=use_bias, dtype=self.dtype_policy, name="gate_proj", @@ -618,10 +652,19 @@ def __init__( dtype=self.dtype_policy, name="down_proj", ) - self.act_fn = layers.Activation(activation, dtype=self.dtype_policy) - def call(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + def build(self, input_shape): + self.gate_proj.build(input_shape) + self.up_proj.build(input_shape) + input_shape = self.up_proj.compute_output_shape(input_shape) + self.down_proj.build(input_shape) + + def call(self, inputs, training=None): + x = ops.multiply( + self.gate_proj(inputs, training=training), + self.up_proj(inputs, training=training), + ) + return self.down_proj(x, training=training) def get_config(self): config = super().get_config() @@ -635,6 +678,11 @@ def get_config(self): ) return config + def compute_output_shape(self, input_shape): + output_shape = list(input_shape) + output_shape[-1] = self.hidden_dim + return output_shape + class DINOV3Layer(layers.Layer): """A DINOV3 encoder layer. @@ -646,16 +694,19 @@ class DINOV3Layer(layers.Layer): a two-layer feedforward network for each transformer. layer_scale_init_value: float. The initial value for the scale. Defaults to `1.0`. + hidden_activation: str or callable. Activation to use in the MLP. + Defaults to `"gelu"`. use_gated_mlp: bool. Whether to use Gated MLP layers. Defaults to `False`. + use_query_bias: bool. Whether to use a bias for the query projection. + use_key_bias: bool. Whether to use a bias for the key projection. + use_value_bias: bool. Whether to use a bias for the value projection. + use_proj_bias: bool. Whether to use a bias for the output projection. + use_mlp_bias: bool. Whether to use a bias for the MLP layers. attention_dropout: float. The dropout rate for the attention probabilities. Defaults to `0.0`. 
drop_path_rate: float. The drop path rate to use. Defaults to `0.0`. layer_norm_eps: float. The epsilon for layer normalization. - query_bias: bool. Whether to use a bias for the query projection. - key_bias: bool. Whether to use a bias for the key projection. - value_bias: bool. Whether to use a bias for the value projection. - proj_bias: bool. Whether to use a bias for the output projection. **kwargs: other keyword arguments passed to `keras.layers.Layer`, including `name`, `dtype` etc. """ @@ -666,25 +717,33 @@ def __init__( num_heads, intermediate_dim, layer_scale_init_value=1.0, + hidden_activation="gelu", use_gated_mlp=False, + use_query_bias=True, + use_key_bias=True, + use_value_bias=True, + use_proj_bias=True, + use_mlp_bias=True, attention_dropout=0.0, drop_path_rate=0.0, layer_norm_eps=1e-6, - query_bias=True, - key_bias=True, - value_bias=True, - proj_bias=True, **kwargs, ): super().__init__(**kwargs) - self.hidden_dim = hidden_dim - self.num_heads = num_heads - self.intermediate_dim = intermediate_dim - self.layer_scale_init_value = layer_scale_init_value - self.use_gated_mlp = use_gated_mlp - self.attention_dropout = attention_dropout - self.drop_path_rate = drop_path_rate - self.layer_norm_eps = layer_norm_eps + self.hidden_dim = int(hidden_dim) + self.num_heads = int(num_heads) + self.intermediate_dim = int(intermediate_dim) + self.layer_scale_init_value = float(layer_scale_init_value) + self.hidden_activation = hidden_activation + self.use_gated_mlp = bool(use_gated_mlp) + self.use_query_bias = bool(use_query_bias) + self.use_key_bias = bool(use_key_bias) + self.use_value_bias = bool(use_value_bias) + self.use_proj_bias = bool(use_proj_bias) + self.use_mlp_bias = bool(use_mlp_bias) + self.attention_dropout = float(attention_dropout) + self.drop_path_rate = float(drop_path_rate) + self.layer_norm_eps = float(layer_norm_eps) self.norm1 = layers.LayerNormalization( epsilon=layer_norm_eps, dtype=self.dtype_policy, name="norm1" @@ -693,10 +752,10 @@ def __init__( hidden_dim=hidden_dim, num_heads=num_heads, dropout_rate=attention_dropout, - query_bias=query_bias, - key_bias=key_bias, - value_bias=value_bias, - proj_bias=proj_bias, + use_query_bias=use_query_bias, + use_key_bias=use_key_bias, + use_value_bias=use_value_bias, + use_proj_bias=use_proj_bias, dtype=self.dtype_policy, name="attention", ) @@ -718,6 +777,8 @@ def __init__( self.mlp = DINOV3GatedMLP( hidden_dim, intermediate_dim, + activation=hidden_activation, + use_bias=use_mlp_bias, dtype=self.dtype_policy, name="mlp", ) @@ -725,6 +786,8 @@ def __init__( self.mlp = DINOV3MLP( hidden_dim, intermediate_dim, + activation=hidden_activation, + use_bias=use_mlp_bias, dtype=self.dtype_policy, name="mlp", ) @@ -735,15 +798,26 @@ def __init__( name="layer_scale2", ) + def build(self, input_shape): + self.norm1.build(input_shape) + self.attention.build(input_shape) + input_shape = self.attention.compute_output_shape(input_shape) + self.layer_scale1.build(input_shape) + self.drop_path.build(input_shape) + self.norm2.build(input_shape) + self.mlp.build(input_shape) + input_shape = self.mlp.compute_output_shape(input_shape) + self.layer_scale2.build(input_shape) + def call( self, - hidden_states, + inputs, attention_mask=None, position_embeddings=None, num_prefix_tokens=0, ): - residual = hidden_states - hidden_states = self.norm1(hidden_states) + residual = inputs + hidden_states = self.norm1(inputs) hidden_states = self.attention( hidden_states, attention_mask=attention_mask, @@ -769,7 +843,13 @@ def get_config(self): 
"num_heads": self.num_heads, "intermediate_dim": self.intermediate_dim, "layer_scale_init_value": self.layer_scale_init_value, + "hidden_activation": self.hidden_activation, "use_gated_mlp": self.use_gated_mlp, + "use_query_bias": self.use_query_bias, + "use_key_bias": self.use_key_bias, + "use_value_bias": self.use_value_bias, + "use_proj_bias": self.use_proj_bias, + "use_mlp_bias": self.use_mlp_bias, "attention_dropout": self.attention_dropout, "drop_path_rate": self.drop_path_rate, "layer_norm_eps": self.layer_norm_eps, @@ -777,6 +857,9 @@ def get_config(self): ) return config + def compute_output_shape(self, input_shape): + return input_shape + class DINOV3Encoder(layers.Layer): """A DINOV3 encoder. @@ -789,16 +872,25 @@ class DINOV3Encoder(layers.Layer): a two-layer feedforward network for each transformer. layer_scale_init_value: float. The initial value for the scale. Defaults to `1.0`. + hidden_activation: str or callable. Activation to use in the MLP. + Defaults to `"gelu"`. use_gated_mlp: bool. Whether to use Gated MLP layers. Defaults to `False`. + use_query_bias: bool. Whether to use a bias for the query projection. + Defaults to `True`. + use_key_bias: bool. Whether to use a bias for the key projection. + Defaults to `True`. + use_value_bias: bool. Whether to use a bias for the value projection. + Defaults to `True`. + use_proj_bias: bool. Whether to use a bias for the output projection. + Defaults to `True`. + use_mlp_bias: bool. Whether to use a bias for the dense layers in MLP. + Defaults to `True`. attention_dropout: float. The dropout rate for the attention probabilities. Defaults to `0.0`. drop_path_rate: float. The drop path rate to use. Defaults to `0.0`. - layer_norm_eps: float. The epsilon for layer normalization. - query_bias: bool. Whether to use a bias for the query projection. - key_bias: bool. Whether to use a bias for the key projection. - value_bias: bool. Whether to use a bias for the value projection. - proj_bias: bool. Whether to use a bias for the output projection. + layer_norm_eps: float. The epsilon for layer normalization. Defaults to + `1e-5`. **kwargs: other keyword arguments passed to `keras.layers.Layer`, including `name`, `dtype` etc. 
""" @@ -810,26 +902,34 @@ def __init__( num_heads, intermediate_dim, layer_scale_init_value=1.0, + hidden_activation="gelu", use_gated_mlp=False, + use_query_bias=True, + use_key_bias=True, + use_value_bias=True, + use_proj_bias=True, + use_mlp_bias=True, attention_dropout=0.0, drop_path_rate=0.0, - layer_norm_eps=1e-6, - query_bias=True, - key_bias=True, - value_bias=True, - proj_bias=True, + layer_norm_eps=1e-5, **kwargs, ): super().__init__(**kwargs) - self.num_layers = num_layers - self.hidden_dim = hidden_dim - self.num_heads = num_heads - self.intermediate_dim = intermediate_dim - self.layer_scale_init_value = layer_scale_init_value - self.use_gated_mlp = use_gated_mlp - self.attention_dropout = attention_dropout - self.drop_path_rate = drop_path_rate - self.layer_norm_eps = layer_norm_eps + self.num_layers = int(num_layers) + self.hidden_dim = int(hidden_dim) + self.num_heads = int(num_heads) + self.intermediate_dim = int(intermediate_dim) + self.layer_scale_init_value = float(layer_scale_init_value) + self.hidden_activation = hidden_activation + self.use_gated_mlp = bool(use_gated_mlp) + self.use_query_bias = bool(use_query_bias) + self.use_key_bias = bool(use_key_bias) + self.use_value_bias = bool(use_value_bias) + self.use_proj_bias = bool(use_proj_bias) + self.use_mlp_bias = bool(use_mlp_bias) + self.attention_dropout = float(attention_dropout) + self.drop_path_rate = float(drop_path_rate) + self.layer_norm_eps = float(layer_norm_eps) dpr = [x for x in ops.linspace(0.0, drop_path_rate, num_layers)] self.layers = [ @@ -838,38 +938,47 @@ def __init__( num_heads=num_heads, intermediate_dim=intermediate_dim, layer_scale_init_value=layer_scale_init_value, + hidden_activation=hidden_activation, use_gated_mlp=use_gated_mlp, + use_query_bias=use_query_bias, + use_key_bias=use_key_bias, + use_value_bias=use_value_bias, + use_proj_bias=use_proj_bias, + use_mlp_bias=use_mlp_bias, attention_dropout=attention_dropout, drop_path_rate=dpr[i], layer_norm_eps=layer_norm_eps, - query_bias=query_bias, - key_bias=key_bias, - value_bias=value_bias, - proj_bias=proj_bias, dtype=self.dtype_policy, - name=f"layers.{i}", + name=f"layers_{i}", ) for i in range(num_layers) ] + def build(self, input_shape): + for layer in self.layers: + layer.build(input_shape) + input_shape = layer.compute_output_shape(input_shape) + def call( self, - hidden_states, + inputs, attention_mask=None, position_embeddings=None, num_prefix_tokens=0, + training=None, ): pyramid_outputs = {} - for i, layer_module in enumerate(self.layers): - hidden_states = layer_module( - hidden_states, + x = inputs + for layer_index, layer in enumerate(self.layers, start=1): + x = layer( + x, attention_mask=attention_mask, position_embeddings=position_embeddings, num_prefix_tokens=num_prefix_tokens, + training=training, ) - pyramid_outputs[f"stage{i + 1}"] = hidden_states - - return hidden_states, pyramid_outputs + pyramid_outputs[f"stage{str(layer_index)}"] = x + return x, pyramid_outputs def get_config(self): config = super().get_config() @@ -880,10 +989,22 @@ def get_config(self): "num_heads": self.num_heads, "intermediate_dim": self.intermediate_dim, "layer_scale_init_value": self.layer_scale_init_value, + "hidden_activation": self.hidden_activation, "use_gated_mlp": self.use_gated_mlp, + "use_query_bias": self.use_query_bias, + "use_key_bias": self.use_key_bias, + "use_value_bias": self.use_value_bias, + "use_proj_bias": self.use_proj_bias, + "use_mlp_bias": self.use_mlp_bias, "attention_dropout": self.attention_dropout, "drop_path_rate": 
self.drop_path_rate, "layer_norm_eps": self.layer_norm_eps, } ) return config + + def compute_output_shape(self, input_shape): + pyramid_outputs = {} + for layer_index in range(1, len(self.layers) + 1): + pyramid_outputs[f"stage{str(layer_index)}"] = input_shape + return input_shape, pyramid_outputs diff --git a/keras_hub/src/utils/transformers/convert_dinov3.py b/keras_hub/src/utils/transformers/convert_dinov3.py index 97d9bd0b09..7d51eeb118 100644 --- a/keras_hub/src/utils/transformers/convert_dinov3.py +++ b/keras_hub/src/utils/transformers/convert_dinov3.py @@ -14,20 +14,21 @@ def convert_backbone_config(transformers_config): "num_heads": transformers_config["num_attention_heads"], "intermediate_dim": transformers_config["intermediate_size"], "layer_scale_init_value": transformers_config["layerscale_value"], - "num_register_tokens": transformers_config.get( - "num_register_tokens", 0 - ), + "num_register_tokens": transformers_config["num_register_tokens"], "use_mask_token": True, + "hidden_activation": transformers_config["hidden_act"], "use_gated_mlp": transformers_config["use_gated_mlp"], + "use_query_bias": transformers_config["query_bias"], + "use_key_bias": transformers_config["key_bias"], + "use_value_bias": transformers_config["value_bias"], + "use_proj_bias": transformers_config["proj_bias"], + "use_mlp_bias": transformers_config["mlp_bias"], "attention_dropout": transformers_config["attention_dropout"], "drop_path_rate": transformers_config["drop_path_rate"], + "layer_norm_eps": transformers_config["layer_norm_eps"], "image_shape": (image_size, image_size, 3), "rope_theta": transformers_config["rope_theta"], "apply_layernorm": False, - "query_bias": transformers_config["query_bias"], - "key_bias": transformers_config["key_bias"], - "value_bias": transformers_config["value_bias"], - "proj_bias": transformers_config["proj_bias"], } @@ -56,6 +57,11 @@ def port_dense(keras_variable, weight_key): keras_variable=backbone.embeddings.cls_token, hf_weight_key="embeddings.cls_token", ) + if backbone.use_mask_token: + loader.port_weight( + keras_variable=backbone.embeddings.mask_token, + hf_weight_key="embeddings.mask_token", + ) if backbone.num_register_tokens > 0: loader.port_weight( keras_variable=backbone.embeddings.register_tokens, @@ -75,10 +81,10 @@ def port_dense(keras_variable, weight_key): for i, layer in enumerate(backbone.encoder.layers): prefix = f"layer.{i}" port_ln(layer.norm1, f"{prefix}.norm1") - port_dense(layer.attention.q_proj, f"{prefix}.attention.q_proj") - port_dense(layer.attention.k_proj, f"{prefix}.attention.k_proj") - port_dense(layer.attention.v_proj, f"{prefix}.attention.v_proj") - port_dense(layer.attention.o_proj, f"{prefix}.attention.o_proj") + port_dense(layer.attention.query_dense, f"{prefix}.attention.q_proj") + port_dense(layer.attention.key_dense, f"{prefix}.attention.k_proj") + port_dense(layer.attention.value_dense, f"{prefix}.attention.v_proj") + port_dense(layer.attention.output_dense, f"{prefix}.attention.o_proj") loader.port_weight( keras_variable=layer.layer_scale1.lambda1, diff --git a/tools/checkpoint_conversion/convert_dinov3_checkpoints.py b/tools/checkpoint_conversion/convert_dinov3_checkpoints.py index 03893b6885..36d06e3fb4 100644 --- a/tools/checkpoint_conversion/convert_dinov3_checkpoints.py +++ b/tools/checkpoint_conversion/convert_dinov3_checkpoints.py @@ -1,3 +1,28 @@ +"""Convert DINOV3 checkpoints. 
+
+export KAGGLE_USERNAME=xxx KAGGLE_KEY=xxx
+
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_small_lvd1689m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_small_lvd1689m
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_small_plus_lvd1689m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_small_plus_lvd1689m
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_base_lvd1689m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_base_lvd1689m
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_large_lvd1689m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_large_lvd1689m
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_huge_plus_lvd1689m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_huge_plus_lvd1689m
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_7b_lvd1689m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_7b_lvd1689m
+
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_large_sat493m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_large_sat493m
+python tools/checkpoint_conversion/convert_dinov3_checkpoints.py \
+    --preset dinov3_vit_7b_sat493m --upload_uri kaggle://kerashub/dinov3/keras/dinov3_vit_7b_sat493m
+"""
+
 import keras
 import numpy as np
 import torch
@@ -10,7 +35,20 @@
 import keras_hub
 
 PRESET_MAP = {
+    # ViT lvd1689m variants.
     "dinov3_vit_small_lvd1689m": "facebook/dinov3-vits16-pretrain-lvd1689m",
+    "dinov3_vit_small_plus_lvd1689m": (
+        "facebook/dinov3-vits16plus-pretrain-lvd1689m"
+    ),
+    "dinov3_vit_base_lvd1689m": "facebook/dinov3-vitb16-pretrain-lvd1689m",
+    "dinov3_vit_large_lvd1689m": "facebook/dinov3-vitl16-pretrain-lvd1689m",
+    "dinov3_vit_huge_plus_lvd1689m": (
+        "facebook/dinov3-vith16plus-pretrain-lvd1689m"
+    ),
+    "dinov3_vit_7b_lvd1689m": "facebook/dinov3-vit7b16-pretrain-lvd1689m",
+    # ViT sat493m variants.
+    "dinov3_vit_large_sat493m": "facebook/dinov3-vitl16-pretrain-sat493m",
+    "dinov3_vit_7b_sat493m": "facebook/dinov3-vit7b16-pretrain-sat493m",
 }
 
 FLAGS = flags.FLAGS
@@ -37,6 +75,7 @@ def convert_image_converter(image_size, hf_image_processor):
         image_size=image_size,
         scale=[1.0 / 255.0 / s for s in std],
         offset=[-m / s for m, s in zip(mean, std)],
+        crop_to_aspect_ratio=False,
         interpolation="bilinear",
         antialias=True,
     )
@@ -56,13 +95,15 @@ def validate_output(
     # Preprocess with hf.
     hf_inputs = hf_image_processor(images=image, return_tensors="pt")
     hf_preprocessed = hf_inputs["pixel_values"].detach().cpu().numpy()
-    print("🔶 HF preprocessed shape:", hf_preprocessed.shape)
 
     # Preprocess with keras.
     images = np.expand_dims(np.array(image).astype("float32"), axis=0)
    images = keras_hub_image_converter(images)
     keras_preprocessed = keras.ops.convert_to_numpy(images)
 
+    print("🔶 Keras preprocessor output:", keras_preprocessed[0, 0, :10, 0])
+    print("🔶 HF preprocessor output:", hf_preprocessed[0, 0, 0, :10])
+
     # Call with hf. Use the keras preprocessed image so we can keep modeling
     # and preprocessing comparisons independent.
hf_inputs["pixel_values"] = torch.from_numpy( From b3156b1d6cbf895e6c1fa8e98cfa66e8cdabeb73 Mon Sep 17 00:00:00 2001 From: Hongyu Chiu <20734616+james77777778@users.noreply.github.com> Date: Mon, 27 Oct 2025 21:01:41 +0800 Subject: [PATCH 4/5] Resolves Gemini comments. --- .../src/models/dinov3/dinov3_backbone.py | 46 ++++++++++++++++++- .../src/models/dinov3/dinov3_backbone_test.py | 4 +- keras_hub/src/models/dinov3/dinov3_layers.py | 19 ++++---- .../utils/transformers/convert_dinov3_test.py | 34 ++++++++++++++ 4 files changed, 91 insertions(+), 12 deletions(-) create mode 100644 keras_hub/src/utils/transformers/convert_dinov3_test.py diff --git a/keras_hub/src/models/dinov3/dinov3_backbone.py b/keras_hub/src/models/dinov3/dinov3_backbone.py index 448051a340..5f52d9a509 100644 --- a/keras_hub/src/models/dinov3/dinov3_backbone.py +++ b/keras_hub/src/models/dinov3/dinov3_backbone.py @@ -64,6 +64,48 @@ class DINOV3Backbone(FeaturePyramidBackbone): for the models computations and weights. Note that some computations, such as softmax and layer normalization will always be done a float32 precision regardless of dtype. + + Example: + ```python + # Pretrained DINOV3 model. + input_data = { + "images": np.ones(shape=(1, 518, 518, 3), dtype="float32"), + } + model = keras_hub.models.DINOV3Backbone.from_preset( + "dinov3_vit_small_lvd1689m" + ) + model(input_data) + + # Pretrained DINOV3 model with custom image shape. + input_data = { + "images": np.ones(shape=(1, 224, 224, 3), dtype="float32"), + } + model = keras_hub.models.DINOV3Backbone.from_preset( + "dinov3_vit_small_lvd1689m", image_shape=(224, 224, 3) + ) + model(input_data) + + # Randomly initialized DINOV3 model with custom config. + model = keras_hub.models.DINOV3Backbone( + patch_size=14, + num_layers=2, + hidden_dim=32, + num_heads=2, + intermediate_dim=128, + image_shape=(224, 224, 3), + ) + model(input_data) + + # Accessing feature pyramid outputs. + backbone = keras_hub.models.DINOV3Backbone.from_preset( + "dinov3_vit_small_lvd1689m", image_shape=(224, 224, 3) + ) + model = keras.Model( + inputs=backbone.inputs, + outputs=backbone.pyramid_outputs, + ) + features = model(input_data) + ``` """ def __init__( @@ -141,7 +183,7 @@ def __init__( # === Functional Model === pyramid_outputs = {} - image_input = layers.Input(shape=image_shape, name="images") + image_input = layers.Input(shape=image_shape, name="pixel_values") x = self.embeddings(image_input) pyramid_outputs["stem"] = x @@ -160,7 +202,7 @@ def __init__( pyramid_outputs[key] = self.layernorm(pyramid_outputs[key]) outputs = x super().__init__( - inputs={"images": image_input}, + inputs={"pixel_values": image_input}, outputs=outputs, dtype=dtype, name=name, diff --git a/keras_hub/src/models/dinov3/dinov3_backbone_test.py b/keras_hub/src/models/dinov3/dinov3_backbone_test.py index e032ab1c4e..b8fdd9a0c6 100644 --- a/keras_hub/src/models/dinov3/dinov3_backbone_test.py +++ b/keras_hub/src/models/dinov3/dinov3_backbone_test.py @@ -23,7 +23,7 @@ def setUp(self): "name": "dinov3_backbone", } self.input_data = { - "images": ops.ones((2, 64, 64, 3)), + "pixel_values": ops.ones((2, 64, 64, 3)), } def test_backbone_basics(self): @@ -73,7 +73,7 @@ def test_position_embedding_interpolation(self): image_shape=(128, 128, 3), # From 64 to 128. 
) input_data = { - "images": ops.ones((2, 128, 128, 3)), + "pixel_values": ops.ones((2, 128, 128, 3)), } restored_output = restored_model(input_data) self.assertNotEqual(model_output.shape, restored_output.shape) diff --git a/keras_hub/src/models/dinov3/dinov3_layers.py b/keras_hub/src/models/dinov3/dinov3_layers.py index d27c3485dc..edc46006a8 100644 --- a/keras_hub/src/models/dinov3/dinov3_layers.py +++ b/keras_hub/src/models/dinov3/dinov3_layers.py @@ -449,6 +449,7 @@ def call( is_causal=False, ) attn_output = ops.reshape(attn_output, (batch_size, seq_len, -1)) + attn_output = self.dropout(attn_output, training=training) return self.output_dense(attn_output, training=training) def get_config(self): @@ -815,6 +816,7 @@ def call( attention_mask=None, position_embeddings=None, num_prefix_tokens=0, + training=None, ): residual = inputs hidden_states = self.norm1(inputs) @@ -823,17 +825,18 @@ def call( attention_mask=attention_mask, position_embeddings=position_embeddings, num_prefix_tokens=num_prefix_tokens, + training=training, + ) + hidden_states = self.layer_scale1(hidden_states, training=training) + hidden_states = ( + self.drop_path(hidden_states, training=training) + residual ) - hidden_states = self.layer_scale1(hidden_states) - hidden_states = self.drop_path(hidden_states) + residual residual = hidden_states - hidden_states = self.norm2(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = self.layer_scale2(hidden_states) - hidden_states = self.drop_path(hidden_states) + residual - - return hidden_states + hidden_states = self.norm2(hidden_states, training=training) + hidden_states = self.mlp(hidden_states, training=training) + hidden_states = self.layer_scale2(hidden_states, training=training) + return self.drop_path(hidden_states, training=training) + residual def get_config(self): config = super().get_config() diff --git a/keras_hub/src/utils/transformers/convert_dinov3_test.py b/keras_hub/src/utils/transformers/convert_dinov3_test.py new file mode 100644 index 0000000000..d3aa16d4db --- /dev/null +++ b/keras_hub/src/utils/transformers/convert_dinov3_test.py @@ -0,0 +1,34 @@ +import numpy as np +import pytest + +from keras_hub.src.models.dinov3.dinov3_backbone import DINOV3Backbone +from keras_hub.src.tests.test_case import TestCase + + +class TestTask(TestCase): + @pytest.mark.large + def test_convert_tiny_preset(self): + model = DINOV3Backbone.from_preset( + "hf://facebook/dinov3-vits16-pretrain-lvd1689m", + image_shape=(224, 224, 3), + ) + dummy_input = { + "pixel_values": np.ones((1, 224, 224, 3), dtype="float32") + } + output = model.predict(dummy_input) + self.assertAllClose( + output[0, 0, :10], + [ + -0.2769, + 0.5487, + 0.2501, + -1.2269, + 0.5886, + 0.0762, + 0.6251, + 0.1874, + -0.4259, + -0.4362, + ], + atol=1e-2, + ) From 3ad0e9fdd9b1765f101d33056e20289acbeb2402 Mon Sep 17 00:00:00 2001 From: Hongyu Chiu <20734616+james77777778@users.noreply.github.com> Date: Tue, 28 Oct 2025 20:49:33 +0800 Subject: [PATCH 5/5] Skip the HF conversion test. 
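
The DINOV3 weights on the Hub are gated, so the conversion test cannot run in
CI without credentials. As a sketch of a possible follow-up (illustrative
only; the `HF_TOKEN` variable and marker name are assumptions, not part of
this change), the unconditional skip could later be replaced by a token-gated
marker:

```python
import os

import pytest

# Skip only when no Hugging Face token is available in the environment.
requires_hf_token = pytest.mark.skipif(
    "HF_TOKEN" not in os.environ,
    reason="Requires a Hugging Face token for gated DINOV3 weights.",
)
```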
--- keras_hub/src/utils/transformers/convert_dinov3_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/keras_hub/src/utils/transformers/convert_dinov3_test.py b/keras_hub/src/utils/transformers/convert_dinov3_test.py index d3aa16d4db..81e3775dc8 100644 --- a/keras_hub/src/utils/transformers/convert_dinov3_test.py +++ b/keras_hub/src/utils/transformers/convert_dinov3_test.py @@ -8,6 +8,7 @@ class TestTask(TestCase): @pytest.mark.large def test_convert_tiny_preset(self): + pytest.skip(reason="TODO: enable after HF token is available in CI") model = DINOV3Backbone.from_preset( "hf://facebook/dinov3-vits16-pretrain-lvd1689m", image_shape=(224, 224, 3),