Avarok-Cybersecurity · tbraun96 · May 11, 2026 · May 13, 2026 · May 15, 2026 · May 15, 2026
@@ -25,7 +25,7 @@ jobs:
       actions: write
     steps:
       - name: "CLA Assistant"
-        if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target'
+        if: (contains(github.event.comment.body, 'recheck') || contains(github.event.comment.body, 'I have read the CLA Document and I hereby sign the CLA')) || github.event_name == 'pull_request_target'
         # Alpha Release
         uses: contributor-assistant/github-action@v2.6.1
         env:
@@ -35,8 +35,8 @@ jobs:
         with:
           path-to-signatures: 'signatures/version1/cla.json'
           path-to-document: 'https://github.com/${{ github.repository }}/blob/main/CLA.md'
-          branch: 'main'
-          allowlist: dependabot[bot],renovate[bot]
+          branch: 'cla-signatures'
+          allowlist: dependabot[bot],renovate[bot],google-labs-jules[bot],claude[bot],google-labs-jules,claude
           lock-pullrequest-aftermerge: false
           create-file-commit-message: 'chore: setup CLA signatures file'
           signed-commit-message: 'chore: $username CLA signature added'

@@ -94,5 +94,4 @@ Each step calls into `spark-runtime::GpuBackend` via the layer's cached `KernelH
 - `kernels/gb10/<model>/<quant>/ssm_preprocess.cu`, `gdr.cu`, `causal_conv1d.cu`
 - `crates/spark-model/src/layers/qwen3_ssm.rs`, `nemotron_mamba2.rs`
 - `crates/spark-runtime/src/prefix_cache.rs` (Marconi SSM snapshot)
-- `docs/history/SSM_CATASTROPHIC_FORGETTING_TODO.md`
 - README "Atlas Spark" section — the SSM/GDN story in narrative form
@@ -74,6 +74,7 @@ struct Target {
     behavior_disable_tool_steering: bool,
     behavior_tool_call_parser: String,
     behavior_enable_loop_watchdog: bool,
+    behavior_skip_template_tools: bool,
     /// Which `(model_type, hidden_size)` pairs this kernel target supports.
     /// Parsed from `[[model_types]]` in MODEL.toml.
     model_type_matches: Vec<ModelTypeMatch>,
@@ -362,6 +363,7 @@ fn resolve_targets(workspace_root: &std::path::Path) -> Vec<Target> {
                 b_disable_tool_steering,
                 b_tool_call_parser,
                 b_enable_loop_watchdog,
+                b_skip_template_tools,
             ) = parse_behavior(&model_dir);
             let model_type_matches = parse_model_types(&model_dir);
             let dflash = parse_dflash(&model_dir);
@@ -392,6 +394,7 @@ fn resolve_targets(workspace_root: &std::path::Path) -> Vec<Target> {
                 behavior_disable_tool_steering: b_disable_tool_steering,
                 behavior_tool_call_parser: b_tool_call_parser,
                 behavior_enable_loop_watchdog: b_enable_loop_watchdog,
+                behavior_skip_template_tools: b_skip_template_tools,
                 model_type_matches,
                 dflash,
             });

@@ -168,6 +168,7 @@ pub(super) fn generate_target_ptx_rs(
              \x20               disable_tool_steering: {},\n\
              \x20               tool_call_parser: \"{}\",\n\
              \x20               enable_loop_watchdog: {},\n\
+             \x20               skip_template_tools: {},\n\
              \x20           }},\n\
              \x20           model_type_matches: vec![{}],\n\
              \x20           dflash: {},\n\
@@ -186,6 +187,7 @@ pub(super) fn generate_target_ptx_rs(
             target.behavior_disable_tool_steering,
             target.behavior_tool_call_parser,
             target.behavior_enable_loop_watchdog,
+            target.behavior_skip_template_tools,
             target.model_type_matches.iter().map(|m| {
                 let hs = match m.hidden_size {
                     Some(v) => format!("Some({v})"),

@@ -120,10 +120,10 @@ pub(super) fn parse_sampling_presets(
 }
 
 /// Parse [behavior] from MODEL.toml. Returns
-/// (thinking_in_tools, max_thinking_budget, thinking_default, fp8_kv_calibration_tokens, default_kv_dtype, default_num_drafts, disable_tool_steering, tool_call_parser, enable_loop_watchdog).
+/// (thinking_in_tools, max_thinking_budget, thinking_default, fp8_kv_calibration_tokens, default_kv_dtype, default_num_drafts, disable_tool_steering, tool_call_parser, enable_loop_watchdog, skip_template_tools).
 pub(super) fn parse_behavior(
     model_dir: &std::path::Path,
-) -> (bool, u32, bool, usize, String, u32, bool, String, bool) {
+) -> (bool, u32, bool, usize, String, u32, bool, String, bool, bool) {
     let model_toml_path = model_dir.join("MODEL.toml");
     if !model_toml_path.exists() {
         return (
@@ -136,6 +136,7 @@ pub(super) fn parse_behavior(
             false,
             String::new(),
             false,
+            false,
         );
     }
     let content = std::fs::read_to_string(&model_toml_path).unwrap_or_default();
@@ -152,6 +153,7 @@ pub(super) fn parse_behavior(
                 false,
                 String::new(),
                 false,
+                false,
             );
         }
     };
@@ -197,6 +199,10 @@ pub(super) fn parse_behavior(
         .and_then(|v| v.get("enable_loop_watchdog"))
         .and_then(|v| v.as_bool())
         .unwrap_or(false);
+    let skip_template_tools = b
+        .and_then(|v| v.get("skip_template_tools"))
+        .and_then(|v| v.as_bool())
+        .unwrap_or(false);
     (
         thinking_in_tools,
         max_thinking_budget,
@@ -207,6 +213,7 @@ pub(super) fn parse_behavior(
         disable_tool_steering,
         tool_call_parser,
         enable_loop_watchdog,
+        skip_template_tools,
     )
 }
 

@@ -163,6 +163,19 @@ pub struct ModelBehavior {
     /// JSON arrays of similar objects, multiplication tables). Enable only
     /// when the model has been observed to need it.
     pub enable_loop_watchdog: bool,
+    /// When true, do not pass tool definitions to the Jinja chat template
+    /// (`jinja_tools` stays `None`). Use this for models where the tool-call
+    /// parser already injects a complete system prompt with tool schemas and
+    /// format instructions, and the template's own XML tool rendering would
+    /// produce contradictory instructions.
+    ///
+    /// Example: Nemotron-Super-120B uses `bare_json` grammar (parser emits
+    /// JSON-schema + bare-JSON instructions) while the `nemotron_h.jinja`
+    /// template would additionally render XML `<function>` blocks and tell
+    /// the model to output `<tool_call>` XML — the opposite format. Setting
+    /// `skip_template_tools = true` suppresses the template rendering and
+    /// leaves the parser's instructions as the sole tool-format signal.
+    pub skip_template_tools: bool,
 }
 
 impl Default for ModelBehavior {
@@ -177,6 +190,7 @@ impl Default for ModelBehavior {
             disable_tool_steering: false,
             tool_call_parser: "",
             enable_loop_watchdog: false,
+            skip_template_tools: false,
         }
     }
 }

@@ -166,6 +166,40 @@ pub fn mla_batched_gemv(
         .launch(stream)
 }
 
+/// Launch the batched V-extraction kernel for N-token MLA prefill.
+/// For each (token, head): output[token, head, :] = W_UV[head] @ input[token, head, 0..k]
+/// where input has input_head_stride elements per head (only first k are used).
+/// `n_tokens` only sizes grid.z below; it is not forwarded as a kernel argument.
+/// Grid: (ceil(n_out/8), num_heads, n_tokens)  Block: (256, 1, 1)
+#[allow(clippy::too_many_arguments)]
+pub fn mla_v_extract_batched(
+    gpu: &dyn GpuBackend,
+    kernel: KernelHandle,
+    input: DevicePtr,
+    weight: DevicePtr,
+    output: DevicePtr,
+    n_out: u32,
+    k: u32,
+    num_heads: u32,
+    input_head_stride: u32,
+    output_head_stride: u32,
+    n_tokens: u32,
+    stream: u64,
+) -> Result<()> {
+    KernelLaunch::new(gpu, kernel)
+        .grid([div_ceil(n_out, 8), num_heads, n_tokens]) // x: n_out in groups of 8, y: head, z: token
+        .block([256, 1, 1])
+        .arg_ptr(input) // pointer args first: input, weight, output
+        .arg_ptr(weight)
+        .arg_ptr(output)
+        .arg_u32(n_out) // scalar args follow in parameter order (n_tokens excluded)
+        .arg_u32(k)
+        .arg_u32(num_heads)
+        .arg_u32(input_head_stride)
+        .arg_u32(output_head_stride)
+        .launch(stream)
+}
+
 /// MLA Q_rope scatter: copy rope portion from q_full to strided q_absorbed_buf. 1 kernel replaces 32 D2D copies.
 #[allow(clippy::too_many_arguments)]
 pub fn mla_q_rope_scatter(

@@ -284,6 +284,51 @@ pub fn mla_prefill_attention_320(
         .launch(stream)
 }
 
+/// Paged MLA prefill attention — absorbed form, HDIM=320, multi-chunk (seq_len_start > 0).
+/// Thin launch wrapper: every argument after `kernel` (except `stream`) is forwarded in declaration order.
+/// Q [q_len, nq, 320] attends to KV cache (paged) over kv_len tokens with causal masking.
+/// Q at local position i (global position q_offset + i) attends to KV 0..q_offset+i.
+/// `inv_sqrt_d` is supplied by the caller; this wrapper does not derive it from `head_dim`.
+/// Grid: (num_q_heads, ceil(q_len/16), 1)  Block: (256, 1, 1)
+#[allow(clippy::too_many_arguments)]
+pub fn mla_prefill_paged_320(
+    gpu: &dyn GpuBackend,
+    kernel: KernelHandle,
+    q: DevicePtr,
+    k_cache: DevicePtr,
+    v_cache: DevicePtr,
+    output: DevicePtr,
+    block_table: DevicePtr,
+    q_len: u32,
+    kv_len: u32,
+    q_offset: u32,
+    num_q_heads: u32,
+    num_kv_heads: u32,
+    head_dim: u32,
+    cache_block_size: u32,
+    inv_sqrt_d: f32,
+    stream: u64,
+) -> Result<()> {
+    let br = 16u32; // q positions covered per grid.y block (MLA_BR in the kernel)
+    KernelLaunch::new(gpu, kernel)
+        .grid([num_q_heads, div_ceil(q_len, br), 1]) // x: query head, y: block of `br` query positions
+        .block([256, 1, 1])
+        .arg_ptr(q) // pointer args: q, k_cache, v_cache, output, block_table
+        .arg_ptr(k_cache)
+        .arg_ptr(v_cache)
+        .arg_ptr(output)
+        .arg_ptr(block_table)
+        .arg_u32(q_len) // u32 scalars in parameter order
+        .arg_u32(kv_len)
+        .arg_u32(q_offset)
+        .arg_u32(num_q_heads)
+        .arg_u32(num_kv_heads)
+        .arg_u32(head_dim)
+        .arg_u32(cache_block_size)
+        .arg_f32(inv_sqrt_d) // the only f32 argument, passed last
+        .launch(stream)
+}
+
 pub fn paged_decode_attn_bf16(
     gpu: &dyn GpuBackend,
     kernel: KernelHandle,

@@ -372,7 +372,9 @@ impl Qwen3AttentionLayer {
 
         // Step 8: Paged decode attention
         let attn_out = ctx.buffers.attn_output();
-        let inv_sqrt_d = self.effective_attn_scale(hd);
+        // Absorbed MLA decode operates in (kv_lora+rope)-dim space; 1/sqrt(hd=128)
+        // would over-sharpen softmax vs the correct 1/sqrt(kv_lora+rope=320).
+        let inv_sqrt_d = 1.0f32 / ((kv_lora + mla_rope) as f32).sqrt();
         prof!("paged_attn", {
             ops::paged_decode_attn_bf16(
                 ctx.gpu,

@@ -307,6 +307,16 @@ impl Qwen3AttentionLayer {
                 "mla_fused_prefill",
                 "mla_fused_prefill",
             ),
+            mla_prefill_paged_k: super::super::try_kernel(
+                gpu,
+                "mla_prefill_paged",
+                "mla_prefill_paged_320",
+            ),
+            mla_v_extract_batched_k: super::super::try_kernel(
+                gpu,
+                "mla_absorbed",
+                "mla_v_extract_batched",
+            ),
             gemm_splitk_partial_k: super::super::try_kernel(
                 gpu,
                 "gemm_splitk",
@@ -391,6 +401,16 @@ impl Qwen3AttentionLayer {
                 "inferspark_prefill_paged_512",
             ),
             prefill_attn_64_k: gpu.kernel("inferspark_prefill", "inferspark_prefill_64")?,
+            prefill_attn_128_k: super::super::try_kernel(
+                gpu,
+                "inferspark_prefill_128",
+                "inferspark_prefill_hd128",
+            ),
+            prefill_attn_64_128_k: super::super::try_kernel(
+                gpu,
+                "inferspark_prefill_128",
+                "inferspark_prefill_64_hd128",
+            ),
             prefill_attn_paged_k: gpu.kernel("prefill_paged", "inferspark_prefill_paged")?,
             prefill_attn_paged_fp8_k: gpu
                 .kernel("prefill_paged_fp8", "inferspark_prefill_paged_fp8")?,

@@ -89,16 +89,13 @@ impl Qwen3AttentionLayer {
         if self.mla.is_some() {
             let args = super::cache_skip_mla::CacheSkipMlaArgs {
                 normed,
-                num_tokens,
                 n,
                 h,
                 nq,
-                nkv,
                 hd,
-                kv_dim,
                 eps,
-                bf16,
                 stream,
+                kv_write_start,
             };
             return self.prefill_attention_cache_skip_mla(kv_cache, ctx, &args);
         }
@@ -143,22 +140,6 @@ impl Qwen3AttentionLayer {
                 q_proj_dim as u32,
                 stream,
             )?;
-        } else if self.mla.is_some() {
-            // DIAGNOSTIC: check V BEFORE Q copy
-            if self.attn_layer_idx == 0 && ctx.config.model_type == "mistral" {
-                ctx.gpu.synchronize(stream)?;
-                let v_chk = k_contiguous.offset(num_tokens * kv_dim * bf16);
-                crate::layers::qwen3_attention::trait_impl::diag_norm(
-                    ctx.gpu,
-                    v_chk,
-                    (nkv * hd) as usize,
-                    stream,
-                    "L0 V BEFORE Q_copy",
-                );
-            }
-            ctx.gpu
-                .copy_d2d_async(qg_out, q_contiguous, num_tokens * q_dim * bf16, stream)
-                .map_err(|e| anyhow::anyhow!("MLA Q copy failed: {e}"))?;
         } else {
             ctx.gpu
                 .copy_d2d_async(qg_out, q_contiguous, num_tokens * q_dim * bf16, stream)