Avarok-Cybersecurity · lesserevil · May 29, 2026
@@ -26,6 +26,7 @@ mod loop_detect;
 mod msg_entry;
 pub(super) mod repair_json;
 mod sampling_setup;
+mod sampling_tool_mode;
 mod template;
 mod thinking;
 
@@ -240,6 +241,7 @@ pub(crate) async fn chat_completions_inner(
         max_tokens,
         stop_tokens,
         tool_choice_required,
+        suppress_tool_call,
         grammar_spec,
         timeout_at,
         top_logprobs,

@@ -16,6 +16,7 @@ use crate::tool_parser;
 use super::super::compact::openai_error_response;
 use super::super::inference_impl::tokenize_stop_sequences;
 use super::super::inference_types::GrammarSpec;
+use super::sampling_tool_mode::resolve_tool_mode;
 
 pub(super) struct SamplingSetup {
     pub(super) temperature: f32,
@@ -34,6 +35,7 @@ pub(super) struct SamplingSetup {
     pub(super) max_tokens: usize,
     pub(super) stop_tokens: Vec<u32>,
     pub(super) tool_choice_required: bool,
+    pub(super) suppress_tool_call: bool,
     pub(super) grammar_spec: Option<GrammarSpec>,
     pub(super) timeout_at: Option<std::time::Instant>,
     pub(super) top_logprobs: Option<u8>,
@@ -69,6 +71,13 @@ pub(super) fn build_sampling(
     let dry_base = preset.dry_base;
     let dry_allowed_length = preset.dry_allowed_length;
     let lz_penalty = preset.lz_penalty;
+    let tool_mode = resolve_tool_mode(
+        tools_active,
+        req.tool_choice.as_ref(),
+        state.tool_call_parser.as_ref().map(|p| p.name()),
+        suppress_tool_call,
+    );
+    let suppress_tool_call = tool_mode.suppressed_for_turn;
 
     // OpenAI-style penalty range validation.
     if !(-2.0..=2.0).contains(&presence_penalty) {
@@ -91,14 +100,15 @@ pub(super) fn build_sampling(
             .collect()
     });
 
-    // Exponential `<tool_call>` bias decay.
+    // Exponential `<tool_call>` bias decay. Positive bias is only valid when a
+    // tool call is required; auto mode must let the model choose free text.
     if tools_active
         && !suppress_tool_call
         && let Some(tc_id) = state.tool_call_start_token_id
     {
         let bias = match tool_call_repeat_count {
-            0 | 1 => 3.0,
-            2 => 0.0,
+            0 | 1 if tool_mode.required => 3.0,
+            0..=2 => 0.0,
             3 => -5.0,
             _ => -10.0,
         };
@@ -133,28 +143,16 @@ pub(super) fn build_sampling(
     }
 
     // Tool-choice + parser-driven required mode.
-    let parser_is_minimax_xml = state
-        .tool_call_parser
-        .as_ref()
-        .is_some_and(|p| p.name() == "minimax_xml");
-    let parser_is_bare_json = state
-        .tool_call_parser
-        .as_ref()
-        .is_some_and(|p| p.name() == "bare_json");
-    let tool_choice_required = tools_active
-        && (req.tool_choice.as_ref().is_some_and(|tc| {
-            matches!(tc, tool_parser::ToolChoice::Mode(m) if m == "required")
-                || matches!(tc, tool_parser::ToolChoice::Specific { .. })
-        }) || parser_is_minimax_xml
-            || parser_is_bare_json);
+    let tool_choice_required = tool_mode.required;
 
     // response_format + tools coexistence.
     //
     // OpenAI's API allows both fields in the same request; agentic pipelines
     // routinely set both (the model emits a tool call on turn N, then a
     // schema-shaped final answer on turn N+1). XGrammar's structural-tag
     // grammar enforces *one* shape per request, so we pick which one wins:
-    //   * `tool_choice="none"` → tools won't be called, enforce response_format
+    //   * `tool_choice="none"` or loop suppression → tools won't be called,
+    //     enforce response_format
     //   * any other tool_choice → enforce tool-call grammar; the schema text
     //     is conventionally embedded in the user/system message by the
     //     caller, and capable models (Qwen3.6, etc.) follow it without
@@ -167,7 +165,8 @@ pub(super) fn build_sampling(
         .tool_choice
         .as_ref()
         .is_some_and(|tc| matches!(tc, tool_parser::ToolChoice::Mode(m) if m == "none"));
-    let response_format_only = has_response_format && (!tools_active || tool_choice_none);
+    let response_format_only =
+        has_response_format && (!tools_active || tool_choice_none || suppress_tool_call);
 
     // Grammar spec (XGrammar structural-tag enforcement).
     let use_triggers = !tool_choice_required;
@@ -187,6 +186,9 @@ pub(super) fn build_sampling(
         // still parsed from the output — just not grammar-enforced.
         tracing::info!("MODEL.toml [behavior].disable_tool_grammar=true — tool-call grammar OFF");
         None
+    } else if tools_active && suppress_tool_call {
+        tracing::info!("Tool loop suppression active — tool-call grammar OFF for one turn");
+        None
     } else if tools_active {
         if has_response_format {
             tracing::info!(
@@ -237,6 +239,7 @@ pub(super) fn build_sampling(
         max_tokens,
         stop_tokens,
         tool_choice_required,
+        suppress_tool_call,
         grammar_spec,
         timeout_at,
         top_logprobs,

@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+
+use crate::tool_parser;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(super) struct ToolMode {
+    pub(super) required: bool,
+    pub(super) suppressed_for_turn: bool,
+}
+
+pub(super) fn resolve_tool_mode(
+    tools_active: bool,
+    tool_choice: Option<&tool_parser::ToolChoice>,
+    parser_name: Option<&str>,
+    requested_suppression: bool,
+) -> ToolMode {
+    if !tools_active {
+        return ToolMode {
+            required: false,
+            suppressed_for_turn: false,
+        };
+    }
+
+    let explicit_required = tool_choice.is_some_and(|tc| {
+        matches!(tc, tool_parser::ToolChoice::Mode(m) if m == "required")
+            || matches!(tc, tool_parser::ToolChoice::Specific { .. })
+    });
+    // Bare JSON has no envelope token that can serve as a safe auto trigger;
+    // MiniMax XML does, so it can follow normal `tool_choice=auto` semantics.
+    let parser_required = matches!(parser_name, Some("bare_json"));
+    let suppressed_for_turn = requested_suppression && !explicit_required;
+
+    ToolMode {
+        required: !suppressed_for_turn && (explicit_required || parser_required),
+        suppressed_for_turn,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{ToolMode, resolve_tool_mode};
+    use crate::tool_parser::{ToolChoice, ToolChoiceFunction};
+
+    #[test]
+    fn loop_suppression_disables_minimax_auto_for_one_turn() {
+        let mode = resolve_tool_mode(true, None, Some("minimax_xml"), true);
+
+        assert_eq!(
+            mode,
+            ToolMode {
+                required: false,
+                suppressed_for_turn: true,
+            }
+        );
+    }
+
+    #[test]
+    fn minimax_parser_uses_auto_without_loop_suppression() {
+        let mode = resolve_tool_mode(true, None, Some("minimax_xml"), false);
+
+        assert_eq!(
+            mode,
+            ToolMode {
+                required: false,
+                suppressed_for_turn: false,
+            }
+        );
+    }
+
+    #[test]
+    fn bare_json_parser_still_requires_without_loop_suppression() {
+        let mode = resolve_tool_mode(true, None, Some("bare_json"), false);
+
+        assert_eq!(
+            mode,
+            ToolMode {
+                required: true,
+                suppressed_for_turn: false,
+            }
+        );
+    }
+
+    #[test]
+    fn explicit_required_ignores_loop_suppression() {
+        let choice = ToolChoice::Mode("required".to_string());
+        let mode = resolve_tool_mode(true, Some(&choice), Some("minimax_xml"), true);
+
+        assert_eq!(
+            mode,
+            ToolMode {
+                required: true,
+                suppressed_for_turn: false,
+            }
+        );
+    }
+
+    #[test]
+    fn specific_function_ignores_loop_suppression() {
+        let choice = ToolChoice::Specific {
+            function: ToolChoiceFunction {
+                name: "session_search".to_string(),
+            },
+        };
+        let mode = resolve_tool_mode(true, Some(&choice), Some("minimax_xml"), true);
+
+        assert_eq!(
+            mode,
+            ToolMode {
+                required: true,
+                suppressed_for_turn: false,
+            }
+        );
+    }
+
+    #[test]
+    fn inactive_tools_ignore_requested_suppression() {
+        let mode = resolve_tool_mode(false, None, Some("minimax_xml"), true);
+
+        assert_eq!(
+            mode,
+            ToolMode {
+                required: false,
+                suppressed_for_turn: false,
+            }
+        );
+    }
+}
@@ -127,13 +127,10 @@ pub fn process_seq_logits(
     }
 
     // Suppress <tool_call> during thinking (prevents KV cache contamination
-    // from think-leak bug) AND when tool call loop detected (≥4 identical
-    // calls — see api.rs:548). For the loop case, use a STRONG NEGATIVE
-    // BIAS (−12.0) instead of `-inf` so the model can still escape if its
-    // evidence for a tool call is overwhelming (e.g. user explicitly says
-    // "actually run the tests"). For thinking, hard-mask remains: tool
-    // calls inside <think> are unparsable per the (canonical) qwen3_coder
-    // dialect, so they must be physically blocked.
+    // from think-leak bug) AND when tool-call loop recovery disables tools
+    // for a single free-text turn. Explicit `tool_choice="required"` /
+    // specific-function requests are filtered out before this flag reaches
+    // the scheduler, so the loop recovery path can hard-mask the opener.
     if a.inside_thinking {
         if let Some(tc_start) = tool_call_start_token {
             let idx = tc_start as usize;
@@ -146,7 +143,7 @@ pub fn process_seq_logits(
     {
         let idx = tc_start as usize;
         if idx < f32_logits.len() {
-            f32_logits[idx] -= 12.0;
+            f32_logits[idx] = f32::NEG_INFINITY;
         }
     }