diff --git a/crates/spark-server/src/api/chat/mod.rs b/crates/spark-server/src/api/chat/mod.rs index 1e2e38d3..0b10f113 100644 --- a/crates/spark-server/src/api/chat/mod.rs +++ b/crates/spark-server/src/api/chat/mod.rs @@ -26,6 +26,7 @@ mod loop_detect; mod msg_entry; pub(super) mod repair_json; mod sampling_setup; +mod sampling_tool_mode; mod template; mod thinking; @@ -240,6 +241,7 @@ pub(crate) async fn chat_completions_inner( max_tokens, stop_tokens, tool_choice_required, + suppress_tool_call, grammar_spec, timeout_at, top_logprobs, diff --git a/crates/spark-server/src/api/chat/sampling_setup.rs b/crates/spark-server/src/api/chat/sampling_setup.rs index 1082fc1f..e97e9de4 100644 --- a/crates/spark-server/src/api/chat/sampling_setup.rs +++ b/crates/spark-server/src/api/chat/sampling_setup.rs @@ -16,6 +16,7 @@ use crate::tool_parser; use super::super::compact::openai_error_response; use super::super::inference_impl::tokenize_stop_sequences; use super::super::inference_types::GrammarSpec; +use super::sampling_tool_mode::resolve_tool_mode; pub(super) struct SamplingSetup { pub(super) temperature: f32, @@ -34,6 +35,7 @@ pub(super) struct SamplingSetup { pub(super) max_tokens: usize, pub(super) stop_tokens: Vec, pub(super) tool_choice_required: bool, + pub(super) suppress_tool_call: bool, pub(super) grammar_spec: Option, pub(super) timeout_at: Option, pub(super) top_logprobs: Option, @@ -69,6 +71,13 @@ pub(super) fn build_sampling( let dry_base = preset.dry_base; let dry_allowed_length = preset.dry_allowed_length; let lz_penalty = preset.lz_penalty; + let tool_mode = resolve_tool_mode( + tools_active, + req.tool_choice.as_ref(), + state.tool_call_parser.as_ref().map(|p| p.name()), + suppress_tool_call, + ); + let suppress_tool_call = tool_mode.suppressed_for_turn; // OpenAI-style penalty range validation. if !(-2.0..=2.0).contains(&presence_penalty) { @@ -91,14 +100,15 @@ pub(super) fn build_sampling( .collect() }); - // Exponential `` bias decay. + // Exponential `` bias decay. Positive bias is only valid when a + // tool call is required; auto mode must let the model choose free text. if tools_active && !suppress_tool_call && let Some(tc_id) = state.tool_call_start_token_id { let bias = match tool_call_repeat_count { - 0 | 1 => 3.0, - 2 => 0.0, + 0 | 1 if tool_mode.required => 3.0, + 0..=2 => 0.0, 3 => -5.0, _ => -10.0, }; @@ -133,20 +143,7 @@ pub(super) fn build_sampling( } // Tool-choice + parser-driven required mode. - let parser_is_minimax_xml = state - .tool_call_parser - .as_ref() - .is_some_and(|p| p.name() == "minimax_xml"); - let parser_is_bare_json = state - .tool_call_parser - .as_ref() - .is_some_and(|p| p.name() == "bare_json"); - let tool_choice_required = tools_active - && (req.tool_choice.as_ref().is_some_and(|tc| { - matches!(tc, tool_parser::ToolChoice::Mode(m) if m == "required") - || matches!(tc, tool_parser::ToolChoice::Specific { .. }) - }) || parser_is_minimax_xml - || parser_is_bare_json); + let tool_choice_required = tool_mode.required; // response_format + tools coexistence. // @@ -154,7 +151,8 @@ pub(super) fn build_sampling( // routinely set both (the model emits a tool call on turn N, then a // schema-shaped final answer on turn N+1). XGrammar's structural-tag // grammar enforces *one* shape per request, so we pick which one wins: - // * `tool_choice="none"` → tools won't be called, enforce response_format + // * `tool_choice="none"` or loop suppression → tools won't be called, + // enforce response_format // * any other tool_choice → enforce tool-call grammar; the schema text // is conventionally embedded in the user/system message by the // caller, and capable models (Qwen3.6, etc.) follow it without @@ -167,7 +165,8 @@ pub(super) fn build_sampling( .tool_choice .as_ref() .is_some_and(|tc| matches!(tc, tool_parser::ToolChoice::Mode(m) if m == "none")); - let response_format_only = has_response_format && (!tools_active || tool_choice_none); + let response_format_only = + has_response_format && (!tools_active || tool_choice_none || suppress_tool_call); // Grammar spec (XGrammar structural-tag enforcement). let use_triggers = !tool_choice_required; @@ -187,6 +186,9 @@ pub(super) fn build_sampling( // still parsed from the output — just not grammar-enforced. tracing::info!("MODEL.toml [behavior].disable_tool_grammar=true — tool-call grammar OFF"); None + } else if tools_active && suppress_tool_call { + tracing::info!("Tool loop suppression active — tool-call grammar OFF for one turn"); + None } else if tools_active { if has_response_format { tracing::info!( @@ -237,6 +239,7 @@ pub(super) fn build_sampling( max_tokens, stop_tokens, tool_choice_required, + suppress_tool_call, grammar_spec, timeout_at, top_logprobs, diff --git a/crates/spark-server/src/api/chat/sampling_tool_mode.rs b/crates/spark-server/src/api/chat/sampling_tool_mode.rs new file mode 100644 index 00000000..c07f6f09 --- /dev/null +++ b/crates/spark-server/src/api/chat/sampling_tool_mode.rs @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +use crate::tool_parser; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(super) struct ToolMode { + pub(super) required: bool, + pub(super) suppressed_for_turn: bool, +} + +pub(super) fn resolve_tool_mode( + tools_active: bool, + tool_choice: Option<&tool_parser::ToolChoice>, + parser_name: Option<&str>, + requested_suppression: bool, +) -> ToolMode { + if !tools_active { + return ToolMode { + required: false, + suppressed_for_turn: false, + }; + } + + let explicit_required = tool_choice.is_some_and(|tc| { + matches!(tc, tool_parser::ToolChoice::Mode(m) if m == "required") + || matches!(tc, tool_parser::ToolChoice::Specific { .. }) + }); + // Bare JSON has no envelope token that can serve as a safe auto trigger; + // MiniMax XML does, so it can follow normal `tool_choice=auto` semantics. + let parser_required = matches!(parser_name, Some("bare_json")); + let suppressed_for_turn = requested_suppression && !explicit_required; + + ToolMode { + required: !suppressed_for_turn && (explicit_required || parser_required), + suppressed_for_turn, + } +} + +#[cfg(test)] +mod tests { + use super::{ToolMode, resolve_tool_mode}; + use crate::tool_parser::{ToolChoice, ToolChoiceFunction}; + + #[test] + fn loop_suppression_disables_minimax_auto_for_one_turn() { + let mode = resolve_tool_mode(true, None, Some("minimax_xml"), true); + + assert_eq!( + mode, + ToolMode { + required: false, + suppressed_for_turn: true, + } + ); + } + + #[test] + fn minimax_parser_uses_auto_without_loop_suppression() { + let mode = resolve_tool_mode(true, None, Some("minimax_xml"), false); + + assert_eq!( + mode, + ToolMode { + required: false, + suppressed_for_turn: false, + } + ); + } + + #[test] + fn bare_json_parser_still_requires_without_loop_suppression() { + let mode = resolve_tool_mode(true, None, Some("bare_json"), false); + + assert_eq!( + mode, + ToolMode { + required: true, + suppressed_for_turn: false, + } + ); + } + + #[test] + fn explicit_required_ignores_loop_suppression() { + let choice = ToolChoice::Mode("required".to_string()); + let mode = resolve_tool_mode(true, Some(&choice), Some("minimax_xml"), true); + + assert_eq!( + mode, + ToolMode { + required: true, + suppressed_for_turn: false, + } + ); + } + + #[test] + fn specific_function_ignores_loop_suppression() { + let choice = ToolChoice::Specific { + function: ToolChoiceFunction { + name: "session_search".to_string(), + }, + }; + let mode = resolve_tool_mode(true, Some(&choice), Some("minimax_xml"), true); + + assert_eq!( + mode, + ToolMode { + required: true, + suppressed_for_turn: false, + } + ); + } + + #[test] + fn inactive_tools_ignore_requested_suppression() { + let mode = resolve_tool_mode(false, None, Some("minimax_xml"), true); + + assert_eq!( + mode, + ToolMode { + required: false, + suppressed_for_turn: false, + } + ); + } +} diff --git a/crates/spark-server/src/scheduler/decode_logits_seq.rs b/crates/spark-server/src/scheduler/decode_logits_seq.rs index ef356f02..e9e96849 100644 --- a/crates/spark-server/src/scheduler/decode_logits_seq.rs +++ b/crates/spark-server/src/scheduler/decode_logits_seq.rs @@ -127,13 +127,10 @@ pub fn process_seq_logits( } // Suppress during thinking (prevents KV cache contamination - // from think-leak bug) AND when tool call loop detected (≥4 identical - // calls — see api.rs:548). For the loop case, use a STRONG NEGATIVE - // BIAS (−12.0) instead of `-inf` so the model can still escape if its - // evidence for a tool call is overwhelming (e.g. user explicitly says - // "actually run the tests"). For thinking, hard-mask remains: tool - // calls inside are unparsable per the (canonical) qwen3_coder - // dialect, so they must be physically blocked. + // from think-leak bug) AND when tool-call loop recovery disables tools + // for a single free-text turn. Explicit `tool_choice="required"` / + // specific-function requests are filtered out before this flag reaches + // the scheduler, so the loop recovery path can hard-mask the opener. if a.inside_thinking { if let Some(tc_start) = tool_call_start_token { let idx = tc_start as usize; @@ -146,7 +143,7 @@ pub fn process_seq_logits( { let idx = tc_start as usize; if idx < f32_logits.len() { - f32_logits[idx] -= 12.0; + f32_logits[idx] = f32::NEG_INFINITY; } }