Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions crates/zeroclaw-providers/src/multimodal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,27 @@ pub fn count_user_image_markers(messages: &[ChatMessage]) -> usize {
.sum()
}

/// Count image markers in the **most recent** genuine user message (the newest
/// inbound user turn), ignoring tool-result carriers and any earlier user
/// messages still present in history.
///
/// This is the turn-scoped counterpart to [`count_user_image_markers`]. It lets
/// the vision router distinguish "the user *just* sent an image we cannot see"
/// (surface a capability error so the attachment is not silently ignored) from
/// "an earlier user image is merely carried over in history" (degrade to
/// text-only). Erroring on a carried-over marker would poison every later turn,
/// since the marker lives in the long-lived session history permanently. That
/// was the reported bug: a single image to a non-vision provider made every
/// subsequent text turn fail.
pub fn count_latest_user_image_markers(messages: &[ChatMessage]) -> usize {
messages
.iter()
.rev()
.find(|message| message.role == "user" && !is_prompt_tool_result_message(message))
.map(|message| parse_image_markers(&message.content).1.len())
.unwrap_or(0)
}

/// Replace media markers (`[IMAGE:...]`, `[PHOTO:...]`, `[DOCUMENT:...]`,
/// `[FILE:...]`, `[VIDEO:...]`, `[VOICE:...]`, `[AUDIO:...]`) with
/// `[media attachment]`. Match is case-insensitive to align with the channel
Expand Down Expand Up @@ -1786,6 +1807,44 @@ mod tests {
assert_eq!(count_image_markers(&messages), 1);
}

#[test]
fn count_latest_user_image_markers_scopes_to_newest_user_message() {
// No user messages at all -> zero.
assert_eq!(count_latest_user_image_markers(&[]), 0);

// The newest user message carries the image -> counted (the user just
// sent it; the vision router surfaces a capability error).
let just_sent = vec![
ChatMessage::user("hi".to_string()),
ChatMessage {
role: "assistant".to_string(),
content: "hello".to_string(),
},
ChatMessage::user("look at this [IMAGE:/tmp/a.png]".to_string()),
];
assert_eq!(count_latest_user_image_markers(&just_sent), 1);

// An earlier user message carried an image, but the newest user message
// is plain text -> zero. This is the poison-prevention case: the carried
// over marker must NOT keep re-triggering the capability error.
let carried_over = vec![
ChatMessage::user("look at this [IMAGE:/tmp/a.png]".to_string()),
ChatMessage::user("what is WAL?".to_string()),
];
assert_eq!(count_latest_user_image_markers(&carried_over), 0);
// The history-wide count still sees the carried-over marker, which is
// why the router must distinguish the two.
assert_eq!(count_user_image_markers(&carried_over), 1);

// A trailing tool-result carrier does not mask the real latest user
// message (its markers are not user-sent and must not be counted here).
let trailing_tool_result = vec![
ChatMessage::user("inspect [IMAGE:/tmp/a.png]".to_string()),
ChatMessage::tool("[IMAGE:/tmp/tool.png]\nGenerated".to_string()),
];
assert_eq!(count_latest_user_image_markers(&trailing_tool_result), 1);
}

#[tokio::test]
async fn prepare_messages_trims_excess_images_from_older_messages() {
// 3 messages, each with 1 image — max is 2.
Expand Down
95 changes: 95 additions & 0 deletions crates/zeroclaw-runtime/src/agent/loop_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4876,6 +4876,101 @@ mod tests {
assert!(!requests[0][0].content.contains(&oversized_payload));
}

/// Regression: a non-vision provider must not be permanently poisoned by an
/// image marker left in history from an EARLIER turn. The capability error
/// is scoped to the image the user *just* sent (see
/// `run_tool_call_loop_returns_structured_error_for_non_vision_provider`);
/// a carried-over marker degrades to text-only so the next plain-text turn
/// succeeds instead of re-failing forever. Reproduces the reported bug where
/// one image to a non-vision provider made every subsequent text turn fail
/// (the RPC/streaming path persists the user message into the long-lived
/// session history before the loop runs, so a failed image turn leaves its
/// marker behind).
#[tokio::test]
async fn run_tool_call_loop_degrades_carried_over_image_on_non_vision_provider() {
let model_provider = RecordingModelProvider::new();
let recorded_requests = Arc::clone(&model_provider.requests);

// An earlier image turn left its marker in history; the latest user
// message is plain text.
let mut history = vec![
ChatMessage::user(
"please inspect [IMAGE:data:image/png;base64,iVBORw0KGgo=]".to_string(),
),
ChatMessage::user("what is WAL?".to_string()),
];
let tools_registry: Vec<Box<dyn Tool>> = Vec::new();
let observer = NoopObserver;

let result = run_tool_call_loop(ToolLoop {
exec: ResolvedAgentExecution {
model_access: ResolvedModelAccess {
model_provider: &model_provider,
provider_name: "mock-provider",
model: "mock-model",
temperature: Some(0.0),
},
tools_registry: &tools_registry,
observer: &observer,
silent: true,
approval: None,
multimodal_config: &zeroclaw_config::schema::MultimodalConfig::default(),
max_tool_iterations: 3,
hooks: None,
excluded_tools: &[],
dedup_exempt_tools: &[],
activated_tools: None,
model_switch_callback: None,
pacing: &zeroclaw_config::schema::PacingConfig::default(),
strict_tool_parsing: false,
parallel_tools: false,
max_tool_result_chars: 0,
context_token_budget: 0,
receipt_generator: None,
knobs: &LoopKnobs::default(),
},
history: &mut history,
channel_name: "cli",
channel_reply_target: None,
cancellation_token: None,
on_delta: None,
shared_budget: None,
channel: None,
collected_receipts: None,
event_tx: None,
steering: None,
new_messages_out: None,
image_cache: None,
// Phase 1: stamp Internal/Trusted. Real per-transport
// stamping is PR C (RFC #6971 §4).
ingress: IngressContext::internal(),
})
.await
.expect("a carried-over image must not fail a plain-text turn");

assert_eq!(result, "done");

// The provider was actually called (no hard capability error) and the
// carried-over marker was stripped before reaching the text-only model.
let requests = recorded_requests
.lock()
.expect("recorded requests lock should be valid");
assert_eq!(requests.len(), 1, "exactly one provider call expected");
let sent_blob = requests[0]
.iter()
.map(|m| m.content.as_str())
.collect::<Vec<_>>()
.join("\n");
assert!(
!sent_blob.contains("[IMAGE:"),
"carried-over image marker must be stripped, got: {sent_blob}"
);
assert!(
sent_blob.contains("[media attachment]"),
"stripped marker should become the text placeholder, got: {sent_blob}"
);
}

#[tokio::test]
async fn run_tool_call_loop_accepts_valid_multimodal_request_flow() {
let calls = Arc::new(AtomicUsize::new(0));
Expand Down
50 changes: 29 additions & 21 deletions crates/zeroclaw-runtime/src/agent/turn/vision_route.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,25 @@ pub(crate) fn resolve_vision_provider(
provider_name: &str,
) -> Result<(Option<Box<dyn ModelProvider>>, bool)> {
let image_marker_count = multimodal::count_image_markers(history);
// Image markers that came from the user (inbound attachments), as
// opposed to tool results. A missing vision capability is handled
// differently for the two: a user image must surface an error (we
// cannot silently ignore what the user sent), while a tool-result
// image may degrade to text-only.
let user_image_marker_count = multimodal::count_user_image_markers(history);
// Image markers in the most recent user message (the image the user *just*
// sent this turn), as opposed to markers carried over from earlier history
// or arriving via tool results. A missing vision capability is handled
// differently: an image the user just sent must surface an error (we cannot
// silently ignore it), while a carried-over or tool-result image degrades to
// text-only. Scoping to the latest user message (rather than the whole
// history) is what stops a single failed image turn from poisoning every
// subsequent text turn: the marker lives in the long-lived session history
// permanently, so a history-wide check would re-fail forever.
let latest_user_image_marker_count = multimodal::count_latest_user_image_markers(history);

// ── Vision model_provider routing ──────────────────────────
// When the default model_provider lacks vision support but a dedicated
// vision_model_provider is configured, create it on demand and use it
// for this iteration. When no vision route exists at all, either
// surface a capability error (the user sent an image) or degrade
// gracefully (the markers came only from tool results) — see the
// no-vision-route branch below and `degrade_strip_images`.
// surface a capability error (the user just sent an image) or degrade
// gracefully (the markers are carried over from earlier history or came
// only from tool results); see the no-vision-route branch below and
// `degrade_strip_images`.
let mut degrade_strip_images = false;
let vision_model_provider_box: Option<Box<dyn ModelProvider>> = if image_marker_count > 0
&& !model_provider.supports_vision()
Expand Down Expand Up @@ -65,8 +70,8 @@ pub(crate) fn resolve_vision_provider(
.into());
}
Some(vp_instance)
} else if user_image_marker_count > 0 {
// The user sent an image we cannot see. Surface a capability
} else if latest_user_image_marker_count > 0 {
// The user *just* sent an image we cannot see. Surface a capability
// error so the attachment is not silently ignored — channels
// render this back to the user (e.g. "⚠️ Error … does not
// support vision"). Configuring a `vision_model_provider`
Expand All @@ -75,21 +80,24 @@ pub(crate) fn resolve_vision_provider(
model_provider: provider_name.to_string(),
capability: "vision".to_string(),
message: format!(
"received {image_marker_count} image marker(s), but this model_provider does not support vision input"
"received {latest_user_image_marker_count} image marker(s), but this model_provider does not support vision input"
),
}
.into());
} else {
// Markers came only from tool results (e.g. `image_info`,
// `screenshot`, `image_gen`). Previously this aborted the
// entire turn with a capability error, which turned an
// otherwise successful tool call (e.g. `image_info`, which
// always returns useful metadata text alongside its `[IMAGE:]`
// marker) into a hard failure. Instead, degrade: strip the
// image markers from the messages sent to the text-only
// The only image markers left are carried over from earlier
// history (e.g. a prior failed image turn whose user message
// persisted, or a switch from a vision model to a non-vision one)
// or arrived via tool results (`image_info`, `screenshot`,
// `image_gen`). Erroring here would poison every later turn: the
// marker lives in the long-lived session history permanently, so a
// history-wide capability error would re-fire on plain text turns
// forever. Tool-result markers were already degraded for the same
// "don't fail an otherwise useful turn" reason. Instead, degrade:
// strip the markers from the messages sent to the text-only
// provider while preserving the surrounding text, so the
// conversation continues and the model still receives any
// accompanying metadata/caption.
// accompanying caption/metadata.
::zeroclaw_log::record!(
WARN,
::zeroclaw_log::Event::new(module_path!(), ::zeroclaw_log::Action::Note)
Expand All @@ -99,7 +107,7 @@ pub(crate) fn resolve_vision_provider(
"model_provider": provider_name,
"image_marker_count": image_marker_count,
})),
"no vision route for tool-result image marker(s); degrading to text-only (markers stripped)"
"no vision route for carried-over/tool-result image marker(s); degrading to text-only (markers stripped)"
);
degrade_strip_images = true;
None
Expand Down