142 changes: 121 additions & 21 deletions penzai/models/transformer/variants/gemma.py
@@ -14,13 +14,14 @@

"""The Gemma architecture transformer variant.

Supports both the Gemma 1 and Gemma 2 architectures. Based on the Flax
reference implementation at https://github.com/google-deepmind/gemma.
Supports the Gemma 1, Gemma 2, and Gemma 3 architectures. Based on the
Flax reference implementation at https://github.com/google-deepmind/gemma.

See the Gemma technical reports for more information:

* Gemma 1: https://arxiv.org/abs/2403.08295
* Gemma 2: https://arxiv.org/abs/2408.00118
* Gemma 3: https://arxiv.org/abs/2503.19786
"""

from __future__ import annotations
@@ -105,23 +106,124 @@
final_logit_softcap=30.0,
attn_logits_soft_cap=50.0,
),
"gemma3_1b": dict(
num_decoder_blocks=26,
vocab_size=262_144,
num_kv_heads=1,
query_head_multiplier=4,
embedding_dim=1152,
projection_dim=256,
mlp_hidden_dim=6 * 1152,
attention_type=(
llamalike_common.AttentionTypeSlidingWindowCausal(512),
llamalike_common.AttentionTypeSlidingWindowCausal(512),
llamalike_common.AttentionTypeSlidingWindowCausal(512),
llamalike_common.AttentionTypeSlidingWindowCausal(512),
llamalike_common.AttentionTypeSlidingWindowCausal(512),
llamalike_common.AttentionTypeGlobalCausal(),
),
use_qk_norm=True,
use_post_attn_norm=True,
use_post_ffw_norm=True,
local_rope_wavelength=10_000,
global_rope_wavelength=1_000_000,
),
"gemma3_4b": dict(
num_decoder_blocks=34,
vocab_size=262_144,
num_kv_heads=4,
query_head_multiplier=2,
embedding_dim=2560,
projection_dim=256,
mlp_hidden_dim=2560 * 8 // 2,
attention_type=(
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeGlobalCausal(),
),
use_qk_norm=True,
use_post_attn_norm=True,
use_post_ffw_norm=True,
local_scale_factor=1.0,
global_scale_factor=8.0,
local_rope_wavelength=10_000,
global_rope_wavelength=1_000_000,
),
"gemma3_12b": dict(
num_decoder_blocks=48,
vocab_size=262_144,
num_kv_heads=8,
query_head_multiplier=2,
embedding_dim=30 * 128,
projection_dim=256,
mlp_hidden_dim=8 * 30 * 128 // 2,
attention_type=(
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeGlobalCausal(),
),
use_qk_norm=True,
use_post_attn_norm=True,
use_post_ffw_norm=True,
local_scale_factor=1.0,
global_scale_factor=8.0,
local_rope_wavelength=10_000,
global_rope_wavelength=1_000_000,
),
"gemma3_27b": dict(
num_decoder_blocks=62,
vocab_size=262_144,
num_kv_heads=16,
query_head_multiplier=2,
embedding_dim=5376,
projection_dim=128,
mlp_hidden_dim=5376 * 8 // 2,
# query scaling factor: 1/sqrt(embedding_dim / num_query_heads)
query_scaling_factor=(5376 // 32) ** -0.5,
attention_type=(
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeSlidingWindowCausal(1024),
llamalike_common.AttentionTypeGlobalCausal(),
),
use_qk_norm=True,
use_post_attn_norm=True,
use_post_ffw_norm=True,
local_scale_factor=1.0,
global_scale_factor=8.0,
local_rope_wavelength=10_000,
global_rope_wavelength=1_000_000,
),
}
_NEEDS_GATING_TRANSPOSE = {
"gemma_2b": False,
"gemma_7b": False,
"gemma2_2b": False,
"gemma2_9b": True,
"gemma2_27b": True,
"gemma3_1b": True,
"gemma3_4b": True,
"gemma3_12b": True,
"gemma3_27b": True,
}


def gemma_from_pretrained_checkpoint(
ckpt_params: dict[str, Any],
preset_name: Literal[
"gemma_2b", "gemma_7b", "gemma2_2b", "gemma2_9b", "gemma2_27b",
"gemma3_1b", "gemma3_4b", "gemma3_12b", "gemma3_27b",
],
upcast_activations_to_float32: bool = False,
use_layer_stack: bool = False,
preset_name: Literal[
"gemma_2b", "gemma_7b", "gemma2_2b", "gemma2_9b", "gemma2_27b", "auto"
] = "auto",
) -> model_parts.TransformerLM:
"""Builds a Gemma model from a pretrained checkpoint.
Collaborator

It is too bad that this is a breaking change in the function signature, since this means existing code will no longer work. Is there some way to do this in a backwards compatible way?

I think it's OK if "auto" does not allow loading gemma 3 models, but it would be nice if it was still possible for us to load gemma 1 and gemma 2 in "auto" mode. Maybe there are differences in the parameter names that we can use, like _query_norm?

Ideal solution would be something like:

  • keep preset name where it is with "auto" as the default argument
  • check if this is gemma 3 by looking at something about the params
  • if it is gemma 3, raise a ValueError and say that you need to specify preset_name
  • if it is gemma 1 or 2, emit a warning saying you should specify preset name, but then infer it like it is being inferred now

(Probably long term it makes sense to just require the preset to be specified directly, but I'd prefer not to make breaking changes too often if possible.)

Contributor Author

Thank you for your suggestion. I have now written code so that "auto" can load Gemma 3 models by checking whether the checkpoint has QK norm parameters.
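A minimal sketch of what such auto-detection could look like (hypothetical helper, not the exact code in this PR; it assumes the stripped checkpoint keys used elsewhere in this file, e.g. "layer_0/attn/_query_norm" being present only in Gemma 3 checkpoints):

  def _infer_preset_name(params: dict[str, Any]) -> str:
    # Count decoder layers by probing the per-layer MLP weights.
    num_layers = 0
    while f"layer_{num_layers}/mlp/linear" in params:
      num_layers += 1
    # Gemma 3 checkpoints carry per-layer query/key norm scales; Gemma 1/2 do
    # not, which disambiguates presets that share a layer count.
    has_qk_norm = "layer_0/attn/_query_norm" in params
    for name, kwargs in _GEMMA_PRESETS.items():
      if kwargs["num_decoder_blocks"] != num_layers:
        continue
      if bool(kwargs.get("use_qk_norm", False)) == has_qk_norm:
        return name
    raise ValueError(
        f"Could not determine preset for model with {num_layers} layers;"
        " please pass preset_name explicitly."
    )

Under the reviewer's variant, the Gemma 3 branch would instead raise a ValueError asking for an explicit preset_name, while Gemma 1/2 checkpoints would only emit a warning before being inferred as above.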


@@ -139,32 +241,17 @@ def gemma_from_pretrained_checkpoint(

Args:
ckpt_params: Nested dictionary of weights from the Gemma checkpoint.
preset_name: The name of the Gemma preset to use.
upcast_activations_to_float32: Whether to cast activations to float32 when
the model runs. This allows analyzing activations at higher precision
without consuming additional memory for parameters.
use_layer_stack: Whether to use a layer stack for the decoder blocks.
preset_name: Preset name, used to determine model config. If "auto", uses
the number of layers in the checkpoint to determine the configuration.

Returns:
A Transformer model containing the loaded parameters.
"""
params = {k.removeprefix("transformer/"): v for k, v in ckpt_params.items()}

if preset_name == "auto":
num_layers = 0
while f"layer_{num_layers}/mlp/linear" in params:
num_layers += 1
preset_by_num_layers = {
kwargs["num_decoder_blocks"]: preset_name
for preset_name, kwargs in _GEMMA_PRESETS.items()
}
if num_layers not in preset_by_num_layers:
raise ValueError(
f"Could not determine preset for model with {num_layers} layers."
)
preset_name = preset_by_num_layers[num_layers]

preset_kwargs = _GEMMA_PRESETS[preset_name]
preset_needs_gating_transpose = _NEEDS_GATING_TRANSPOSE[preset_name]

@@ -207,6 +294,19 @@ def gemma_from_pretrained_checkpoint(
1 + params[f"layer_{i}/pre_attention_norm"]["scale"]
).tag("embedding")
)
# Add qk norm if needed
if config.use_qk_norm:
cur_block_params["attention/_query_norm/scale.weights"] = (
pz.nx.NamedArray.wrap(
1 + params[f"layer_{i}/attn/_query_norm"]["scale"]
).tag("projection")
)
cur_block_params["attention/_key_norm/scale.weights"] = (
pz.nx.NamedArray.wrap(
1 + params[f"layer_{i}/attn/_key_norm"]["scale"]
).tag("projection")
)

if config.use_post_attn_norm:
cur_block_params["post_attention_norm/scale.weights"] = (
pz.nx.NamedArray.wrap(
143 changes: 104 additions & 39 deletions penzai/models/transformer/variants/llamalike_common.py
@@ -34,7 +34,7 @@
import dataclasses
import functools
from typing import Any, Literal

from absl import logging
import jax
import jax.numpy as jnp
from penzai import pz
@@ -102,6 +102,12 @@ class LlamalikeTransformerConfig:
parameter_dtype: Floating dtype to use for all parameters.
activation_dtype: Floating dtype to use for activations and KV cache tables.
use_layer_stack: Whether to stack the blocks together using a LayerStack.
# NOTE: Gemma3 specific parameters
use_qk_norm: Whether to use QK normalization.
local_scale_factor: Scale factor for the local RoPE layers.
global_scale_factor: Scale factor for the global RoPE layers.
local_rope_wavelength: Wavelength for the local RoPE layers.
global_rope_wavelength: Wavelength for the global RoPE layers.
"""
Collaborator

@danieldjohnson Jun 8, 2025

Minor, but can we make it so that rope_wavelength can be None, and build_llamalike_attention checks to make sure either rope_wavelength is set OR both local_rope_wavelength and global_rope_wavelength are set, but not both?

Contributor Author

Because LlamalikeTransformerConfig is used to pass the parameters into build_llamalike_attention, we first construct the config object from the Gemma 3 preset dictionary, and at that point the config may already need both local_rope_wavelength and global_rope_wavelength set. I really appreciate the idea of making it simpler.

Collaborator

Sorry, I don't think I understand what you mean. Are you saying there's some constraint on what works here?

Actually, though, I think the simplest thing to do would be to say that rope_wavelength always means the global RoPE wavelength, and just add local_rope_wavelength: float | None = None. Then, for local RoPE, if config.local_rope_wavelength is not None we use config.local_rope_wavelength and otherwise we use config.rope_wavelength. For global RoPE, we always use config.rope_wavelength.

We could annotate it as

rope_wavelength: Wavelength for global RoPE layers (and for local RoPE layers if local_rope_wavelength is not set).
...
local_rope_wavelength: Wavelength for the local RoPE layers. If None, local RoPE layers will use the same wavelength as global RoPE layers (config.rope_wavelength)


num_kv_heads: int
@@ -126,6 +132,12 @@ class LlamalikeTransformerConfig:
parameter_dtype: jax.typing.DTypeLike = jnp.float32
activation_dtype: jax.typing.DTypeLike = jnp.float32
use_layer_stack: bool = False
# NOTE: Gemma3 specific parameters
use_qk_norm: bool = False
local_scale_factor: float | None = None
global_scale_factor: float | None = None
local_rope_wavelength: float | None = None
global_rope_wavelength: float | None = None


def build_llamalike_feedforward(
@@ -261,10 +273,30 @@ def build_llamalike_attention(
sliding_window_size=attention_type.window_size,
masked_out_value=masked_out_value,
)
# Decide which wavelength to use for local RoPE.
if config.local_rope_wavelength is not None:
wavelength = config.local_rope_wavelength
else:
wavelength = config.rope_wavelength
# Decide which scale factor to use for local RoPE.
if config.local_scale_factor is not None:
scale_factor = config.local_scale_factor
else:
scale_factor = 1.0
elif isinstance(attention_type, AttentionTypeGlobalCausal):
attn_masker = pz.nn.ApplyCausalAttentionMask(
masked_out_value=masked_out_value,
)
# Decide which wavelength to use for global RoPE.
if config.global_rope_wavelength is not None:
wavelength = config.global_rope_wavelength
else:
wavelength = config.rope_wavelength
# Decide which scale factor to use for global RoPE.
if config.global_scale_factor is not None:
scale_factor = config.global_scale_factor
else:
scale_factor = 1.0
else:
raise ValueError(f"Unsupported attention type {attention_type}")

@@ -290,42 +322,74 @@
pz.nn.Softmax("kv_seq"),
])

# Build the input_to_query sublayers, adding QK norm when configured.
input_to_query_sublayers = [
pz.nn.Linear.from_config(
name=f"{name}/query",
init_base_rng=init_base_rng,
input_axes={"embedding": embedding_dim},
output_axes={
**common_head_axes,
**query_only_head_axes,
"projection": projection_dim,
},
dtype=config.parameter_dtype,
),
]
if config.use_qk_norm:
input_to_query_sublayers.append(
pz.nn.RMSLayerNorm.from_config(
name=f"{name}/_query_norm",
Collaborator

Let's remove the leading underscore? I'm not sure why the original parameters have an underscore here, but it seems nicer if the Penzai version doesn't have one. The parameter names are already not exactly the same as the Flax version. (Same comment for _key_norm)

Contributor Author

I have fixed it.

init_base_rng=init_base_rng,
across_axes={"projection": config.projection_dim},
dtype=config.parameter_dtype,
epsilon=config.rms_norm_eps,
),
)
input_to_query_sublayers.extend([
pz.nn.ApplyRoPE(
positions_input_name="token_positions",
embedding_axis="projection",
max_wavelength=wavelength,
scale_factor=scale_factor,
),
pz.nn.ConstantRescale(
by=jnp.array(query_scaling_factor, dtype=config.activation_dtype)
),
])

# Build the input_to_key sublayers, adding QK norm when configured.
input_to_key_sublayers = [
pz.nn.Linear.from_config(
name=f"{name}/key",
init_base_rng=init_base_rng,
input_axes={"embedding": embedding_dim},
output_axes={**common_head_axes, "projection": projection_dim},
dtype=config.parameter_dtype,
),
]
if config.use_qk_norm:
input_to_key_sublayers.append(
pz.nn.RMSLayerNorm.from_config(
name=f"{name}/_key_norm",
init_base_rng=init_base_rng,
across_axes={"projection": config.projection_dim},
dtype=config.parameter_dtype,
epsilon=config.rms_norm_eps,
),
)
input_to_key_sublayers.append(
pz.nn.ApplyRoPE(
positions_input_name="token_positions",
embedding_axis="projection",
max_wavelength=wavelength,
scale_factor=scale_factor,
),
)

return pz.nn.Attention(
input_to_query=pz.nn.Sequential([
pz.nn.Linear.from_config(
name=f"{name}/query",
init_base_rng=init_base_rng,
input_axes={"embedding": embedding_dim},
output_axes={
**common_head_axes,
**query_only_head_axes,
"projection": projection_dim,
},
dtype=config.parameter_dtype,
),
pz.nn.ApplyRoPE(
positions_input_name="token_positions",
embedding_axis="projection",
max_wavelength=config.rope_wavelength,
),
pz.nn.ConstantRescale(
by=jnp.array(query_scaling_factor, dtype=config.activation_dtype)
),
]),
input_to_key=pz.nn.Sequential([
pz.nn.Linear.from_config(
name=f"{name}/key",
init_base_rng=init_base_rng,
input_axes={"embedding": embedding_dim},
output_axes={**common_head_axes, "projection": projection_dim},
dtype=config.parameter_dtype,
),
pz.nn.ApplyRoPE(
positions_input_name="token_positions",
embedding_axis="projection",
max_wavelength=config.rope_wavelength,
),
]),
input_to_query=pz.nn.Sequential(input_to_query_sublayers),
input_to_key=pz.nn.Sequential(input_to_key_sublayers),
input_to_value=pz.nn.Sequential([
pz.nn.Linear.from_config(
name=f"{name}/value",
@@ -483,9 +547,10 @@ def build_llamalike_transformer(
else:
if not isinstance(config.attention_type, AttentionType):
if config.num_decoder_blocks % len(config.attention_type) != 0:
raise ValueError(
"Per-layer attention types must have a length that divides the"
" number of blocks."
logging.warning(
"Please ensure that you are using Gemma3 models."
"For other models, per-layer attention types must have a length "
"that divides the number of blocks."
)
Comment on lines 549 to 543
Collaborator

Hm, this seems less safe and also pretty confusing for users. I don't think we should bypass this check.

Instead, can you do the adjustment in the _GEMMA_PRESETS constant? So, e.g., for "gemma3_1b", the "attention_type" field should be a tuple of length 26. You can do something like ((...,) * 5 + (...,)) to avoid typing it all out.

(Motivation here is that we don't want someone to accidentally mess up their config and end up with a different pattern of attention layers than they expected. It's pretty obvious what should happen when attention types divides number of blocks, but allowing e.g. off-by-one errors seems like it could be a footgun.)

Contributor Author

Thank you for your suggestions. I have restored the original check. Instead, following the gemma package, I added a make_attention_layers_types function in gemma.py and simplified the attention_type argument.
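
A minimal sketch of what such a helper might look like (hypothetical; loosely modeled on the gemma package, so the signature and the version added to gemma.py may differ):

  def make_attention_layers_types(
      pattern: tuple[llamalike_common.AttentionType, ...],
      num_layers: int,
  ) -> tuple[llamalike_common.AttentionType, ...]:
    """Repeats `pattern` across `num_layers` layers, truncating the tail."""
    repeats, remainder = divmod(num_layers, len(pattern))
    return pattern * repeats + pattern[:remainder]

A preset such as gemma3_1b could then pass attention_type=make_attention_layers_types(pattern, num_layers=26), where pattern is the five-sliding-window-plus-one-global tuple shown in the presets above; the remainder handling covers layer counts like 26 that are not multiples of six.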

for block_index in range(config.num_decoder_blocks):
sublayers.append(