Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions crates/spark-server/src/api/chat/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ mod loop_detect;
mod msg_entry;
pub(super) mod repair_json;
mod sampling_setup;
mod sampling_tool_mode;
mod template;
mod thinking;

Expand Down Expand Up @@ -240,6 +241,7 @@ pub(crate) async fn chat_completions_inner(
max_tokens,
stop_tokens,
tool_choice_required,
suppress_tool_call,
grammar_spec,
timeout_at,
top_logprobs,
Expand Down
41 changes: 22 additions & 19 deletions crates/spark-server/src/api/chat/sampling_setup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use crate::tool_parser;
use super::super::compact::openai_error_response;
use super::super::inference_impl::tokenize_stop_sequences;
use super::super::inference_types::GrammarSpec;
use super::sampling_tool_mode::resolve_tool_mode;

pub(super) struct SamplingSetup {
pub(super) temperature: f32,
Expand All @@ -34,6 +35,7 @@ pub(super) struct SamplingSetup {
pub(super) max_tokens: usize,
pub(super) stop_tokens: Vec<u32>,
pub(super) tool_choice_required: bool,
pub(super) suppress_tool_call: bool,
pub(super) grammar_spec: Option<GrammarSpec>,
pub(super) timeout_at: Option<std::time::Instant>,
pub(super) top_logprobs: Option<u8>,
Expand Down Expand Up @@ -69,6 +71,13 @@ pub(super) fn build_sampling(
let dry_base = preset.dry_base;
let dry_allowed_length = preset.dry_allowed_length;
let lz_penalty = preset.lz_penalty;
let tool_mode = resolve_tool_mode(
tools_active,
req.tool_choice.as_ref(),
state.tool_call_parser.as_ref().map(|p| p.name()),
suppress_tool_call,
);
let suppress_tool_call = tool_mode.suppressed_for_turn;

// OpenAI-style penalty range validation.
if !(-2.0..=2.0).contains(&presence_penalty) {
Expand All @@ -91,14 +100,15 @@ pub(super) fn build_sampling(
.collect()
});

// Exponential `<tool_call>` bias decay.
// Exponential `<tool_call>` bias decay. Positive bias is only valid when a
// tool call is required; auto mode must let the model choose free text.
if tools_active
&& !suppress_tool_call
&& let Some(tc_id) = state.tool_call_start_token_id
{
let bias = match tool_call_repeat_count {
0 | 1 => 3.0,
2 => 0.0,
0 | 1 if tool_mode.required => 3.0,
0..=2 => 0.0,
3 => -5.0,
_ => -10.0,
};
Expand Down Expand Up @@ -133,28 +143,16 @@ pub(super) fn build_sampling(
}

// Tool-choice + parser-driven required mode.
let parser_is_minimax_xml = state
.tool_call_parser
.as_ref()
.is_some_and(|p| p.name() == "minimax_xml");
let parser_is_bare_json = state
.tool_call_parser
.as_ref()
.is_some_and(|p| p.name() == "bare_json");
let tool_choice_required = tools_active
&& (req.tool_choice.as_ref().is_some_and(|tc| {
matches!(tc, tool_parser::ToolChoice::Mode(m) if m == "required")
|| matches!(tc, tool_parser::ToolChoice::Specific { .. })
}) || parser_is_minimax_xml
|| parser_is_bare_json);
let tool_choice_required = tool_mode.required;

// response_format + tools coexistence.
//
// OpenAI's API allows both fields in the same request; agentic pipelines
// routinely set both (the model emits a tool call on turn N, then a
// schema-shaped final answer on turn N+1). XGrammar's structural-tag
// grammar enforces *one* shape per request, so we pick which one wins:
// * `tool_choice="none"` → tools won't be called, enforce response_format
// * `tool_choice="none"` or loop suppression → tools won't be called,
// enforce response_format
// * any other tool_choice → enforce tool-call grammar; the schema text
// is conventionally embedded in the user/system message by the
// caller, and capable models (Qwen3.6, etc.) follow it without
Expand All @@ -167,7 +165,8 @@ pub(super) fn build_sampling(
.tool_choice
.as_ref()
.is_some_and(|tc| matches!(tc, tool_parser::ToolChoice::Mode(m) if m == "none"));
let response_format_only = has_response_format && (!tools_active || tool_choice_none);
let response_format_only =
has_response_format && (!tools_active || tool_choice_none || suppress_tool_call);

// Grammar spec (XGrammar structural-tag enforcement).
let use_triggers = !tool_choice_required;
Expand All @@ -187,6 +186,9 @@ pub(super) fn build_sampling(
// still parsed from the output — just not grammar-enforced.
tracing::info!("MODEL.toml [behavior].disable_tool_grammar=true — tool-call grammar OFF");
None
} else if tools_active && suppress_tool_call {
tracing::info!("Tool loop suppression active — tool-call grammar OFF for one turn");
None
} else if tools_active {
if has_response_format {
tracing::info!(
Expand Down Expand Up @@ -237,6 +239,7 @@ pub(super) fn build_sampling(
max_tokens,
stop_tokens,
tool_choice_required,
suppress_tool_call,
grammar_spec,
timeout_at,
top_logprobs,
Expand Down
127 changes: 127 additions & 0 deletions crates/spark-server/src/api/chat/sampling_tool_mode.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// SPDX-License-Identifier: AGPL-3.0-only

use crate::tool_parser;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(super) struct ToolMode {
pub(super) required: bool,
pub(super) suppressed_for_turn: bool,
}

pub(super) fn resolve_tool_mode(
tools_active: bool,
tool_choice: Option<&tool_parser::ToolChoice>,
parser_name: Option<&str>,
requested_suppression: bool,
) -> ToolMode {
if !tools_active {
return ToolMode {
required: false,
suppressed_for_turn: false,
};
}

let explicit_required = tool_choice.is_some_and(|tc| {
matches!(tc, tool_parser::ToolChoice::Mode(m) if m == "required")
|| matches!(tc, tool_parser::ToolChoice::Specific { .. })
});
// Bare JSON has no envelope token that can serve as a safe auto trigger;
// MiniMax XML does, so it can follow normal `tool_choice=auto` semantics.
let parser_required = matches!(parser_name, Some("bare_json"));
let suppressed_for_turn = requested_suppression && !explicit_required;

ToolMode {
required: !suppressed_for_turn && (explicit_required || parser_required),
suppressed_for_turn,
}
}

#[cfg(test)]
mod tests {
use super::{ToolMode, resolve_tool_mode};
use crate::tool_parser::{ToolChoice, ToolChoiceFunction};

#[test]
fn loop_suppression_disables_minimax_auto_for_one_turn() {
let mode = resolve_tool_mode(true, None, Some("minimax_xml"), true);

assert_eq!(
mode,
ToolMode {
required: false,
suppressed_for_turn: true,
}
);
}

#[test]
fn minimax_parser_uses_auto_without_loop_suppression() {
let mode = resolve_tool_mode(true, None, Some("minimax_xml"), false);

assert_eq!(
mode,
ToolMode {
required: false,
suppressed_for_turn: false,
}
);
}

#[test]
fn bare_json_parser_still_requires_without_loop_suppression() {
let mode = resolve_tool_mode(true, None, Some("bare_json"), false);

assert_eq!(
mode,
ToolMode {
required: true,
suppressed_for_turn: false,
}
);
}

#[test]
fn explicit_required_ignores_loop_suppression() {
let choice = ToolChoice::Mode("required".to_string());
let mode = resolve_tool_mode(true, Some(&choice), Some("minimax_xml"), true);

assert_eq!(
mode,
ToolMode {
required: true,
suppressed_for_turn: false,
}
);
}

#[test]
fn specific_function_ignores_loop_suppression() {
let choice = ToolChoice::Specific {
function: ToolChoiceFunction {
name: "session_search".to_string(),
},
};
let mode = resolve_tool_mode(true, Some(&choice), Some("minimax_xml"), true);

assert_eq!(
mode,
ToolMode {
required: true,
suppressed_for_turn: false,
}
);
}

#[test]
fn inactive_tools_ignore_requested_suppression() {
let mode = resolve_tool_mode(false, None, Some("minimax_xml"), true);

assert_eq!(
mode,
ToolMode {
required: false,
suppressed_for_turn: false,
}
);
}
}
13 changes: 5 additions & 8 deletions crates/spark-server/src/scheduler/decode_logits_seq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -127,13 +127,10 @@ pub fn process_seq_logits(
}

// Suppress <tool_call> during thinking (prevents KV cache contamination
// from think-leak bug) AND when tool call loop detected (≥4 identical
// calls — see api.rs:548). For the loop case, use a STRONG NEGATIVE
// BIAS (−12.0) instead of `-inf` so the model can still escape if its
// evidence for a tool call is overwhelming (e.g. user explicitly says
// "actually run the tests"). For thinking, hard-mask remains: tool
// calls inside <think> are unparsable per the (canonical) qwen3_coder
// dialect, so they must be physically blocked.
// from think-leak bug) AND when tool-call loop recovery disables tools
// for a single free-text turn. Explicit `tool_choice="required"` /
// specific-function requests are filtered out before this flag reaches
// the scheduler, so the loop recovery path can hard-mask the opener.
if a.inside_thinking {
if let Some(tc_start) = tool_call_start_token {
let idx = tc_start as usize;
Expand All @@ -146,7 +143,7 @@ pub fn process_seq_logits(
{
let idx = tc_start as usize;
if idx < f32_logits.len() {
f32_logits[idx] -= 12.0;
f32_logits[idx] = f32::NEG_INFINITY;
}
}

Expand Down
Loading