diff --git a/crates/zeroclaw-providers/src/multimodal.rs b/crates/zeroclaw-providers/src/multimodal.rs index b75f08e362c..1a83750a0b9 100644 --- a/crates/zeroclaw-providers/src/multimodal.rs +++ b/crates/zeroclaw-providers/src/multimodal.rs @@ -278,6 +278,27 @@ pub fn count_user_image_markers(messages: &[ChatMessage]) -> usize { .sum() } +/// Count image markers in the **most recent** genuine user message (the newest +/// inbound user turn), ignoring tool-result carriers and any earlier user +/// messages still present in history. +/// +/// This is the turn-scoped counterpart to [`count_user_image_markers`]. It lets +/// the vision router distinguish "the user *just* sent an image we cannot see" +/// (surface a capability error so the attachment is not silently ignored) from +/// "an earlier user image is merely carried over in history" (degrade to +/// text-only). Erroring on a carried-over marker would poison every later turn, +/// since the marker lives in the long-lived session history permanently. That +/// was the reported bug: a single image to a non-vision provider made every +/// subsequent text turn fail. +pub fn count_latest_user_image_markers(messages: &[ChatMessage]) -> usize { + messages + .iter() + .rev() + .find(|message| message.role == "user" && !is_prompt_tool_result_message(message)) + .map(|message| parse_image_markers(&message.content).1.len()) + .unwrap_or(0) +} + /// Replace media markers (`[IMAGE:...]`, `[PHOTO:...]`, `[DOCUMENT:...]`, /// `[FILE:...]`, `[VIDEO:...]`, `[VOICE:...]`, `[AUDIO:...]`) with /// `[media attachment]`. Match is case-insensitive to align with the channel @@ -1786,6 +1807,44 @@ mod tests { assert_eq!(count_image_markers(&messages), 1); } + #[test] + fn count_latest_user_image_markers_scopes_to_newest_user_message() { + // No user messages at all -> zero. + assert_eq!(count_latest_user_image_markers(&[]), 0); + + // The newest user message carries the image -> counted (the user just + // sent it; the vision router surfaces a capability error). + let just_sent = vec![ + ChatMessage::user("hi".to_string()), + ChatMessage { + role: "assistant".to_string(), + content: "hello".to_string(), + }, + ChatMessage::user("look at this [IMAGE:/tmp/a.png]".to_string()), + ]; + assert_eq!(count_latest_user_image_markers(&just_sent), 1); + + // An earlier user message carried an image, but the newest user message + // is plain text -> zero. This is the poison-prevention case: the carried + // over marker must NOT keep re-triggering the capability error. + let carried_over = vec![ + ChatMessage::user("look at this [IMAGE:/tmp/a.png]".to_string()), + ChatMessage::user("what is WAL?".to_string()), + ]; + assert_eq!(count_latest_user_image_markers(&carried_over), 0); + // The history-wide count still sees the carried-over marker, which is + // why the router must distinguish the two. + assert_eq!(count_user_image_markers(&carried_over), 1); + + // A trailing tool-result carrier does not mask the real latest user + // message (its markers are not user-sent and must not be counted here). + let trailing_tool_result = vec![ + ChatMessage::user("inspect [IMAGE:/tmp/a.png]".to_string()), + ChatMessage::tool("[IMAGE:/tmp/tool.png]\nGenerated".to_string()), + ]; + assert_eq!(count_latest_user_image_markers(&trailing_tool_result), 1); + } + #[tokio::test] async fn prepare_messages_trims_excess_images_from_older_messages() { // 3 messages, each with 1 image — max is 2. diff --git a/crates/zeroclaw-runtime/src/agent/loop_.rs b/crates/zeroclaw-runtime/src/agent/loop_.rs index 9d286efbc4e..45b43c76677 100644 --- a/crates/zeroclaw-runtime/src/agent/loop_.rs +++ b/crates/zeroclaw-runtime/src/agent/loop_.rs @@ -4876,6 +4876,101 @@ mod tests { assert!(!requests[0][0].content.contains(&oversized_payload)); } + /// Regression: a non-vision provider must not be permanently poisoned by an + /// image marker left in history from an EARLIER turn. The capability error + /// is scoped to the image the user *just* sent (see + /// `run_tool_call_loop_returns_structured_error_for_non_vision_provider`); + /// a carried-over marker degrades to text-only so the next plain-text turn + /// succeeds instead of re-failing forever. Reproduces the reported bug where + /// one image to a non-vision provider made every subsequent text turn fail + /// (the RPC/streaming path persists the user message into the long-lived + /// session history before the loop runs, so a failed image turn leaves its + /// marker behind). + #[tokio::test] + async fn run_tool_call_loop_degrades_carried_over_image_on_non_vision_provider() { + let model_provider = RecordingModelProvider::new(); + let recorded_requests = Arc::clone(&model_provider.requests); + + // An earlier image turn left its marker in history; the latest user + // message is plain text. + let mut history = vec![ + ChatMessage::user( + "please inspect [IMAGE:data:image/png;base64,iVBORw0KGgo=]".to_string(), + ), + ChatMessage::user("what is WAL?".to_string()), + ]; + let tools_registry: Vec> = Vec::new(); + let observer = NoopObserver; + + let result = run_tool_call_loop(ToolLoop { + exec: ResolvedAgentExecution { + model_access: ResolvedModelAccess { + model_provider: &model_provider, + provider_name: "mock-provider", + model: "mock-model", + temperature: Some(0.0), + }, + tools_registry: &tools_registry, + observer: &observer, + silent: true, + approval: None, + multimodal_config: &zeroclaw_config::schema::MultimodalConfig::default(), + max_tool_iterations: 3, + hooks: None, + excluded_tools: &[], + dedup_exempt_tools: &[], + activated_tools: None, + model_switch_callback: None, + pacing: &zeroclaw_config::schema::PacingConfig::default(), + strict_tool_parsing: false, + parallel_tools: false, + max_tool_result_chars: 0, + context_token_budget: 0, + receipt_generator: None, + knobs: &LoopKnobs::default(), + }, + history: &mut history, + channel_name: "cli", + channel_reply_target: None, + cancellation_token: None, + on_delta: None, + shared_budget: None, + channel: None, + collected_receipts: None, + event_tx: None, + steering: None, + new_messages_out: None, + image_cache: None, + // Phase 1: stamp Internal/Trusted. Real per-transport + // stamping is PR C (RFC #6971 §4). + ingress: IngressContext::internal(), + }) + .await + .expect("a carried-over image must not fail a plain-text turn"); + + assert_eq!(result, "done"); + + // The provider was actually called (no hard capability error) and the + // carried-over marker was stripped before reaching the text-only model. + let requests = recorded_requests + .lock() + .expect("recorded requests lock should be valid"); + assert_eq!(requests.len(), 1, "exactly one provider call expected"); + let sent_blob = requests[0] + .iter() + .map(|m| m.content.as_str()) + .collect::>() + .join("\n"); + assert!( + !sent_blob.contains("[IMAGE:"), + "carried-over image marker must be stripped, got: {sent_blob}" + ); + assert!( + sent_blob.contains("[media attachment]"), + "stripped marker should become the text placeholder, got: {sent_blob}" + ); + } + #[tokio::test] async fn run_tool_call_loop_accepts_valid_multimodal_request_flow() { let calls = Arc::new(AtomicUsize::new(0)); diff --git a/crates/zeroclaw-runtime/src/agent/turn/vision_route.rs b/crates/zeroclaw-runtime/src/agent/turn/vision_route.rs index fd7fe74e281..c1d3454f97e 100644 --- a/crates/zeroclaw-runtime/src/agent/turn/vision_route.rs +++ b/crates/zeroclaw-runtime/src/agent/turn/vision_route.rs @@ -16,20 +16,25 @@ pub(crate) fn resolve_vision_provider( provider_name: &str, ) -> Result<(Option>, bool)> { let image_marker_count = multimodal::count_image_markers(history); - // Image markers that came from the user (inbound attachments), as - // opposed to tool results. A missing vision capability is handled - // differently for the two: a user image must surface an error (we - // cannot silently ignore what the user sent), while a tool-result - // image may degrade to text-only. - let user_image_marker_count = multimodal::count_user_image_markers(history); + // Image markers in the most recent user message (the image the user *just* + // sent this turn), as opposed to markers carried over from earlier history + // or arriving via tool results. A missing vision capability is handled + // differently: an image the user just sent must surface an error (we cannot + // silently ignore it), while a carried-over or tool-result image degrades to + // text-only. Scoping to the latest user message (rather than the whole + // history) is what stops a single failed image turn from poisoning every + // subsequent text turn: the marker lives in the long-lived session history + // permanently, so a history-wide check would re-fail forever. + let latest_user_image_marker_count = multimodal::count_latest_user_image_markers(history); // ── Vision model_provider routing ────────────────────────── // When the default model_provider lacks vision support but a dedicated // vision_model_provider is configured, create it on demand and use it // for this iteration. When no vision route exists at all, either - // surface a capability error (the user sent an image) or degrade - // gracefully (the markers came only from tool results) — see the - // no-vision-route branch below and `degrade_strip_images`. + // surface a capability error (the user just sent an image) or degrade + // gracefully (the markers are carried over from earlier history or came + // only from tool results); see the no-vision-route branch below and + // `degrade_strip_images`. let mut degrade_strip_images = false; let vision_model_provider_box: Option> = if image_marker_count > 0 && !model_provider.supports_vision() @@ -65,8 +70,8 @@ pub(crate) fn resolve_vision_provider( .into()); } Some(vp_instance) - } else if user_image_marker_count > 0 { - // The user sent an image we cannot see. Surface a capability + } else if latest_user_image_marker_count > 0 { + // The user *just* sent an image we cannot see. Surface a capability // error so the attachment is not silently ignored — channels // render this back to the user (e.g. "⚠️ Error … does not // support vision"). Configuring a `vision_model_provider` @@ -75,21 +80,24 @@ pub(crate) fn resolve_vision_provider( model_provider: provider_name.to_string(), capability: "vision".to_string(), message: format!( - "received {image_marker_count} image marker(s), but this model_provider does not support vision input" + "received {latest_user_image_marker_count} image marker(s), but this model_provider does not support vision input" ), } .into()); } else { - // Markers came only from tool results (e.g. `image_info`, - // `screenshot`, `image_gen`). Previously this aborted the - // entire turn with a capability error, which turned an - // otherwise successful tool call (e.g. `image_info`, which - // always returns useful metadata text alongside its `[IMAGE:]` - // marker) into a hard failure. Instead, degrade: strip the - // image markers from the messages sent to the text-only + // The only image markers left are carried over from earlier + // history (e.g. a prior failed image turn whose user message + // persisted, or a switch from a vision model to a non-vision one) + // or arrived via tool results (`image_info`, `screenshot`, + // `image_gen`). Erroring here would poison every later turn: the + // marker lives in the long-lived session history permanently, so a + // history-wide capability error would re-fire on plain text turns + // forever. Tool-result markers were already degraded for the same + // "don't fail an otherwise useful turn" reason. Instead, degrade: + // strip the markers from the messages sent to the text-only // provider while preserving the surrounding text, so the // conversation continues and the model still receives any - // accompanying metadata/caption. + // accompanying caption/metadata. ::zeroclaw_log::record!( WARN, ::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note) @@ -99,7 +107,7 @@ pub(crate) fn resolve_vision_provider( "model_provider": provider_name, "image_marker_count": image_marker_count, })), - "no vision route for tool-result image marker(s); degrading to text-only (markers stripped)" + "no vision route for carried-over/tool-result image marker(s); degrading to text-only (markers stripped)" ); degrade_strip_images = true; None