diff --git a/crates/openfang-channels/src/bridge.rs b/crates/openfang-channels/src/bridge.rs index a1aecae62..16271d6e7 100644 --- a/crates/openfang-channels/src/bridge.rs +++ b/crates/openfang-channels/src/bridge.rs @@ -858,6 +858,93 @@ async fn dispatch_message( return; } + // Multipart: flatten children into LLM content blocks. If any image + // succeeds, dispatch as multimodal; otherwise fall through to the text + // path (Multipart arm in the match below builds the combined descriptor). + if let ChannelContent::Multipart(parts) = &message.content { + let mut blocks: Vec = Vec::new(); + for part in parts { + debug_assert!( + !matches!(part, ChannelContent::Multipart(_)), + "nested Multipart in ChannelContent — adapters should produce flat lists" + ); + match part { + ChannelContent::Text(t) => blocks.push(ContentBlock::Text { + text: t.clone(), + provider_metadata: None, + }), + ChannelContent::Image { url, caption } => { + let mut img = download_image_to_blocks(url, caption.as_deref()).await; + blocks.append(&mut img); + } + ChannelContent::File { url, filename, .. } => { + blocks.push(ContentBlock::Text { + text: format!("[User sent a file ({filename}): {url}]"), + provider_metadata: None, + }); + } + ChannelContent::Voice { + url, + duration_seconds, + } => { + blocks.push(ContentBlock::Text { + text: format!("[User sent a voice message ({duration_seconds}s): {url}]"), + provider_metadata: None, + }); + } + ChannelContent::Location { lat, lon } => { + blocks.push(ContentBlock::Text { + text: format!("[User shared location: {lat}, {lon}]"), + provider_metadata: None, + }); + } + ChannelContent::FileData { filename, .. } => { + blocks.push(ContentBlock::Text { + text: format!("[User sent a local file: {filename}]"), + provider_metadata: None, + }); + } + // Commands aren't expected inside Multipart, but render as + // text rather than drop the message if one slips through. + ChannelContent::Command { name, args } => { + blocks.push(ContentBlock::Text { + text: format!("/{name} {}", args.join(" ")), + provider_metadata: None, + }); + } + // Defensive: debug_assert above catches this in dev; ignore + // gracefully in release. + ChannelContent::Multipart(_) => {} + } + } + + if blocks + .iter() + .any(|b| matches!(b, ContentBlock::Image { .. })) + { + let prefix_style = overrides + .as_ref() + .map(|o| o.prefix_agent_name) + .unwrap_or(PrefixStyle::Off); + dispatch_with_blocks( + blocks, + message, + handle, + router, + adapter, + adapter_arc, + ct_str, + thread_id, + output_format, + lifecycle_reactions, + prefix_style, + ) + .await; + return; + } + // No image blocks — fall through to text path below. + } + // For images: download, base64 encode, and send as multimodal content blocks if let ChannelContent::Image { ref url, @@ -909,6 +996,7 @@ async fn dispatch_message( ChannelContent::File { ref url, ref filename, + .. } => { format!("[User sent a file ({filename}): {url}]") } @@ -924,6 +1012,37 @@ async fn dispatch_message( ChannelContent::FileData { ref filename, .. } => { format!("[User sent a local file: {filename}]") } + ChannelContent::Multipart(parts) => parts + .iter() + .map(|p| match p { + ChannelContent::Text(t) => t.clone(), + ChannelContent::Image { url, caption } => match caption { + Some(c) => format!("[User sent a photo: {url}]\nCaption: {c}"), + None => format!("[User sent a photo: {url}]"), + }, + ChannelContent::File { url, filename, .. } => { + format!("[User sent a file ({filename}): {url}]") + } + ChannelContent::Voice { + url, + duration_seconds, + } => format!("[User sent a voice message ({duration_seconds}s): {url}]"), + ChannelContent::Location { lat, lon } => { + format!("[User shared location: {lat}, {lon}]") + } + ChannelContent::FileData { filename, .. } => { + format!("[User sent a local file: {filename}]") + } + ChannelContent::Command { name, args } => { + format!("/{name} {}", args.join(" ")) + } + // Nesting is rejected by adapters; emit empty so the join + // doesn't insert spurious separators. + ChannelContent::Multipart(_) => String::new(), + }) + .filter(|s| !s.is_empty()) + .collect::>() + .join("\n"), }; // Check if it's a slash command embedded in text (e.g. "/agents") @@ -1372,6 +1491,10 @@ fn media_type_from_url(url: &str) -> String { /// Download an image from a URL and build content blocks for multimodal LLM input. /// +/// Accepts both `http(s)://` URLs (fetched via reqwest) and `file://` URLs +/// (read from local disk — used by the channel inbox materialization path so +/// agents see a stable local path even after a Discord CDN URL has expired). +/// /// Returns a `Vec` containing an image block (base64-encoded) and /// optionally a text block for the caption. If the download fails, returns a /// text-only block describing the failure. @@ -1381,38 +1504,79 @@ async fn download_image_to_blocks(url: &str, caption: Option<&str>) -> Vec r, - Err(e) => { - warn!("Failed to download image from channel: {e}"); - return vec![ContentBlock::Text { - text: format!("[Image download failed: {e}]"), - provider_metadata: None, - }]; - } - }; + // Branch on URL scheme: file:// reads from local disk, everything else + // goes through HTTP. We unify both paths into (bytes, header_type) before + // the size/magic-byte logic below. + let (bytes, header_type): (Vec, Option) = + if let Some(path) = url.strip_prefix("file://") { + // file:// — local read. No content-type header to honor; magic-byte + // sniffing and URL extension fallback do all the work. We don't + // percent-decode: the inbox writer controls filenames and avoids + // characters that would need encoding. + match tokio::fs::read(path).await { + Ok(b) => (b, None), + Err(e) => { + warn!("Failed to read image from local path {path}: {e}"); + return vec![ContentBlock::Text { + text: format!("[Image read failed: {e}]"), + provider_metadata: None, + }]; + } + } + } else { + // Build the client with transparent decompression DISABLED. Discord's + // CDN edges occasionally advertise `content-encoding: gzip` (or br) + // on PNG/JPEG passthroughs while the body is the raw, uncompressed + // image bytes. With the default reqwest client (gzip/deflate/brotli + // features enabled at the workspace level), this causes the + // decompression layer to choke on the image header and reqwest + // returns "error decoding response body" only on `bytes().await`, + // not on `send()`. Forcing identity encoding sidesteps the whole + // class of CDN content-encoding-flapping bugs. We also set a UA + // (some CDNs 403 clients without one) and a 30s timeout aligned + // with the upstream 5 MB cap. + let client = reqwest::Client::builder() + .no_gzip() + .no_deflate() + .no_brotli() + .user_agent("openfang/0.1 (+https://openfang.ai)") + .timeout(std::time::Duration::from_secs(30)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()); + let resp = match client.get(url).send().await { + Ok(r) => r, + Err(e) => { + warn!("Failed to download image from channel: {e}"); + return vec![ContentBlock::Text { + text: format!("[Image download failed: {e}]"), + provider_metadata: None, + }]; + } + }; - // Detect media type from Content-Type header — but only trust it if it's - // actually an image/* type. Many APIs (Telegram, S3 pre-signed URLs) return - // `application/octet-stream` for all files, which breaks vision. - let header_type = resp - .headers() - .get("content-type") - .and_then(|v| v.to_str().ok()) - .map(|ct| ct.split(';').next().unwrap_or(ct).trim().to_string()) - .filter(|ct| ct.starts_with("image/")); - - let bytes = match resp.bytes().await { - Ok(b) => b, - Err(e) => { - warn!("Failed to read image bytes: {e}"); - return vec![ContentBlock::Text { - text: format!("[Image read failed: {e}]"), - provider_metadata: None, - }]; - } - }; + // Detect media type from Content-Type header — but only trust it if + // it's actually an image/* type. Many APIs (Telegram, S3 pre-signed + // URLs) return `application/octet-stream` for all files, which + // breaks vision. + let header_type = resp + .headers() + .get("content-type") + .and_then(|v| v.to_str().ok()) + .map(|ct| ct.split(';').next().unwrap_or(ct).trim().to_string()) + .filter(|ct| ct.starts_with("image/")); + + let bytes = match resp.bytes().await { + Ok(b) => b, + Err(e) => { + warn!("Failed to read image bytes: {e}"); + return vec![ContentBlock::Text { + text: format!("[Image read failed: {e}]"), + provider_metadata: None, + }]; + } + }; + (bytes.to_vec(), header_type) + }; // Three-tier media type detection: // 1. Trusted Content-Type header (only if image/*) diff --git a/crates/openfang-channels/src/discord.rs b/crates/openfang-channels/src/discord.rs index 7d43e53f4..472b5fcff 100644 --- a/crates/openfang-channels/src/discord.rs +++ b/crates/openfang-channels/src/discord.rs @@ -538,6 +538,77 @@ impl ChannelAdapter for DiscordAdapter { } } +/// Maximum byte size for an attachment to be classified as a vision-eligible +/// image. Anthropic's image content blocks are capped at 5 MB; oversize images +/// fall through to `File` so the bridge passes the URL as text instead of +/// attempting an inline image block. +const VISION_IMAGE_MAX_BYTES: u64 = 5 * 1024 * 1024; + +/// Best-effort MIME inference from a filename extension. Used as a fallback +/// when Discord's `content_type` field is missing or empty (we've observed +/// this on some bot-relayed attachments). +fn mime_from_extension(filename: &str) -> Option<&'static str> { + let ext = filename.rsplit('.').next()?.to_ascii_lowercase(); + match ext.as_str() { + "jpg" | "jpeg" => Some("image/jpeg"), + "png" => Some("image/png"), + "gif" => Some("image/gif"), + "webp" => Some("image/webp"), + "heic" => Some("image/heic"), + "heif" => Some("image/heif"), + "pdf" => Some("application/pdf"), + "txt" => Some("text/plain"), + "md" => Some("text/markdown"), + "json" => Some("application/json"), + "mp4" => Some("video/mp4"), + "mov" => Some("video/quicktime"), + "mp3" => Some("audio/mpeg"), + "wav" => Some("audio/wav"), + "ogg" => Some("audio/ogg"), + _ => None, + } +} + +/// Classify a single Discord attachment JSON object into a `ChannelContent` +/// block. Vision-eligible image MIME types (jpeg/png/gif/webp) under +/// `VISION_IMAGE_MAX_BYTES` become `Image`; everything else becomes `File` +/// (URL-pass-through; the bridge will surface it as a text descriptor in v1). +/// +/// MIME resolution chain: `attachments[].content_type` (if non-empty) → +/// extension lookup → `application/octet-stream`. +fn classify_discord_attachment(att: &serde_json::Value) -> ChannelContent { + let url = att["url"].as_str().unwrap_or("").to_string(); + let filename = att["filename"].as_str().unwrap_or("file").to_string(); + let size = att["size"].as_u64(); + + let resolved_mime: String = att["content_type"] + .as_str() + .filter(|s| !s.is_empty()) + .map(str::to_string) + .or_else(|| mime_from_extension(&filename).map(str::to_string)) + .unwrap_or_else(|| "application/octet-stream".to_string()); + + let is_vision_mime = matches!( + resolved_mime.as_str(), + "image/jpeg" | "image/png" | "image/gif" | "image/webp" + ); + // If size is unknown, optimistically allow the image — the bridge will + // surface a 4xx if Anthropic rejects it, which is better than silently + // demoting to a text URL. + let within_vision_limit = size.map(|s| s <= VISION_IMAGE_MAX_BYTES).unwrap_or(true); + + if is_vision_mime && within_vision_limit { + ChannelContent::Image { url, caption: None } + } else { + ChannelContent::File { + url, + filename, + mime: Some(resolved_mime), + size, + } + } +} + /// Parse a Discord MESSAGE_CREATE or MESSAGE_UPDATE payload into a `ChannelMessage`. async fn parse_discord_message( d: &serde_json::Value, @@ -546,6 +617,11 @@ async fn parse_discord_message( allowed_users: &[String], ignore_bots: bool, ) -> Option { + // Diagnostic: dump the raw Discord payload so we can ground attachment + // parsing in real JSON. Gated by RUST_LOG; silent at default `info` level. + // Enable with: RUST_LOG=openfang_channels::discord=debug + debug!(target: "openfang_channels::discord", payload = %d, "discord raw message payload"); + let author = d.get("author")?; let author_id = author["id"].as_str()?; @@ -577,10 +653,6 @@ async fn parse_discord_message( } let content_text = d["content"].as_str().unwrap_or(""); - if content_text.is_empty() { - return None; - } - let channel_id = d["channel_id"].as_str()?; let message_id = d["id"].as_str().unwrap_or("0"); let username = author["username"].as_str().unwrap_or("Unknown"); @@ -597,7 +669,8 @@ async fn parse_discord_message( .map(|dt| dt.with_timezone(&chrono::Utc)) .unwrap_or_else(chrono::Utc::now); - // Parse commands (messages starting with /) + // Parse commands (messages starting with /). Commands do not carry + // attachments in v1; attachment processing only runs in the non-command path. let content = if content_text.starts_with('/') { let parts: Vec<&str> = content_text.splitn(2, ' ').collect(); let cmd_name = &parts[0][1..]; @@ -611,7 +684,50 @@ async fn parse_discord_message( args, } } else { - ChannelContent::Text(content_text.to_string()) + let attachment_blocks: Vec = d["attachments"] + .as_array() + .map(|arr| arr.iter().map(classify_discord_attachment).collect()) + .unwrap_or_default(); + + match (content_text.is_empty(), attachment_blocks.len()) { + // No text, no attachments → nothing to ingest. + (true, 0) => return None, + // Text only. + (false, 0) => ChannelContent::Text(content_text.to_string()), + // Single attachment, no caption. + (true, 1) => attachment_blocks.into_iter().next().unwrap(), + // Single attachment + caption: emit Multipart with the caption as + // a sibling Text block. This keeps the caption visible to providers + // that flatten content to text only (e.g. claude-code/*, which + // currently drops Image blocks) — the user gets a coherent + // text-only response instead of a hallucination. Vision-capable + // providers see the same blocks and dispatch multimodally. + (false, 1) => { + let block = attachment_blocks.into_iter().next().unwrap(); + let normalized = match block { + // Drop any caption that classify_discord_attachment may have + // attached; the sibling Text block is now the caption. + ChannelContent::Image { url, caption: _ } => { + ChannelContent::Image { url, caption: None } + } + other => other, + }; + ChannelContent::Multipart(vec![ + ChannelContent::Text(content_text.to_string()), + normalized, + ]) + } + // Multiple attachments, no caption. + (true, _) => ChannelContent::Multipart(attachment_blocks), + // Multiple attachments + caption: text first, then attachments + // (matches Discord's visual ordering: text above attachments). + (false, _) => { + let mut blocks = Vec::with_capacity(attachment_blocks.len() + 1); + blocks.push(ChannelContent::Text(content_text.to_string())); + blocks.extend(attachment_blocks); + ChannelContent::Multipart(blocks) + } + } }; // Determine if this is a group message (guild_id present = server channel) @@ -1032,4 +1148,214 @@ mod tests { assert_eq!(adapter.name(), "discord"); assert_eq!(adapter.channel_type(), ChannelType::Discord); } + + // -- Multipart / attachment parsing tests (commit 4) ---------------------- + + fn att(filename: &str, content_type: Option<&str>, size: u64) -> serde_json::Value { + let mut obj = serde_json::json!({ + "url": format!("https://cdn.discordapp.com/attachments/1/2/{filename}"), + "filename": filename, + "size": size, + }); + if let Some(ct) = content_type { + obj["content_type"] = serde_json::Value::String(ct.to_string()); + } + obj + } + + fn payload_with(content: &str, attachments: Vec) -> serde_json::Value { + serde_json::json!({ + "id": "msg1", + "channel_id": "ch1", + "content": content, + "author": { + "id": "user456", + "username": "alice", + "discriminator": "0", + "bot": false + }, + "timestamp": "2024-01-01T00:00:00+00:00", + "attachments": attachments, + }) + } + + #[tokio::test] + async fn test_parse_image_only_no_caption() { + let bot_id = Arc::new(RwLock::new(Some("bot123".to_string()))); + let d = payload_with("", vec![att("photo.png", Some("image/png"), 100_000)]); + let msg = parse_discord_message(&d, &bot_id, &[], &[], true) + .await + .unwrap(); + match msg.content { + ChannelContent::Image { caption, url } => { + assert!(caption.is_none()); + assert!(url.contains("photo.png")); + } + other => panic!("expected Image, got {other:?}"), + } + } + + #[tokio::test] + async fn test_parse_image_with_caption() { + // Single image + caption is emitted as Multipart([Text, Image]) so the + // caption survives providers that flatten content blocks to text only + // (e.g. claude-code/*). The Image carries no caption of its own; the + // sibling Text block IS the caption. + let bot_id = Arc::new(RwLock::new(Some("bot123".to_string()))); + let d = payload_with( + "look at this", + vec![att("photo.jpg", Some("image/jpeg"), 50_000)], + ); + let msg = parse_discord_message(&d, &bot_id, &[], &[], true) + .await + .unwrap(); + match msg.content { + ChannelContent::Multipart(parts) => { + assert_eq!(parts.len(), 2); + assert!(matches!(&parts[0], ChannelContent::Text(t) if t == "look at this")); + match &parts[1] { + ChannelContent::Image { caption, url } => { + assert!( + caption.is_none(), + "image caption should be None; the sibling Text block is the caption" + ); + assert!(url.contains("photo.jpg")); + } + other => panic!("expected Image as second part, got {other:?}"), + } + } + other => panic!("expected Multipart, got {other:?}"), + } + } + + #[tokio::test] + async fn test_parse_multi_image_no_caption() { + let bot_id = Arc::new(RwLock::new(Some("bot123".to_string()))); + let d = payload_with( + "", + vec![ + att("a.png", Some("image/png"), 10_000), + att("b.png", Some("image/png"), 20_000), + ], + ); + let msg = parse_discord_message(&d, &bot_id, &[], &[], true) + .await + .unwrap(); + match msg.content { + ChannelContent::Multipart(parts) => { + assert_eq!(parts.len(), 2); + assert!(parts + .iter() + .all(|p| matches!(p, ChannelContent::Image { .. }))); + } + other => panic!("expected Multipart, got {other:?}"), + } + } + + #[tokio::test] + async fn test_parse_multi_image_with_caption() { + let bot_id = Arc::new(RwLock::new(Some("bot123".to_string()))); + let d = payload_with( + "two pics", + vec![ + att("a.png", Some("image/png"), 10_000), + att("b.png", Some("image/png"), 20_000), + ], + ); + let msg = parse_discord_message(&d, &bot_id, &[], &[], true) + .await + .unwrap(); + match msg.content { + ChannelContent::Multipart(parts) => { + assert_eq!(parts.len(), 3); + // Text first, then images. + assert!(matches!(&parts[0], ChannelContent::Text(t) if t == "two pics")); + assert!(matches!(&parts[1], ChannelContent::Image { .. })); + assert!(matches!(&parts[2], ChannelContent::Image { .. })); + } + other => panic!("expected Multipart, got {other:?}"), + } + } + + #[tokio::test] + async fn test_parse_heic_falls_to_file() { + let bot_id = Arc::new(RwLock::new(Some("bot123".to_string()))); + let d = payload_with("", vec![att("photo.heic", Some("image/heic"), 100_000)]); + let msg = parse_discord_message(&d, &bot_id, &[], &[], true) + .await + .unwrap(); + match msg.content { + ChannelContent::File { mime, filename, .. } => { + assert_eq!(filename, "photo.heic"); + assert_eq!(mime.as_deref(), Some("image/heic")); + } + other => panic!("expected File, got {other:?}"), + } + } + + #[tokio::test] + async fn test_parse_oversize_image_falls_to_file() { + let bot_id = Arc::new(RwLock::new(Some("bot123".to_string()))); + // 6 MB exceeds VISION_IMAGE_MAX_BYTES (5 MB). + let d = payload_with( + "", + vec![att("huge.png", Some("image/png"), 6 * 1024 * 1024)], + ); + let msg = parse_discord_message(&d, &bot_id, &[], &[], true) + .await + .unwrap(); + match msg.content { + ChannelContent::File { + filename, + mime, + size, + .. + } => { + assert_eq!(filename, "huge.png"); + assert_eq!(mime.as_deref(), Some("image/png")); + assert_eq!(size, Some(6 * 1024 * 1024)); + } + other => panic!("expected File, got {other:?}"), + } + } + + #[tokio::test] + async fn test_parse_file_with_caption_yields_multipart() { + let bot_id = Arc::new(RwLock::new(Some("bot123".to_string()))); + let d = payload_with( + "see attached", + vec![att("doc.pdf", Some("application/pdf"), 200_000)], + ); + let msg = parse_discord_message(&d, &bot_id, &[], &[], true) + .await + .unwrap(); + match msg.content { + ChannelContent::Multipart(parts) => { + assert_eq!(parts.len(), 2); + assert!(matches!(&parts[0], ChannelContent::Text(t) if t == "see attached")); + assert!(matches!(&parts[1], ChannelContent::File { .. })); + } + other => panic!("expected Multipart, got {other:?}"), + } + } + + #[tokio::test] + async fn test_parse_extension_fallback_when_content_type_missing() { + // Discord occasionally omits content_type on bot-relayed attachments; + // we should fall back to the filename extension. + let bot_id = Arc::new(RwLock::new(Some("bot123".to_string()))); + let d = payload_with("", vec![att("pic.png", None, 50_000)]); + let msg = parse_discord_message(&d, &bot_id, &[], &[], true) + .await + .unwrap(); + assert!(matches!(msg.content, ChannelContent::Image { .. })); + } + + #[tokio::test] + async fn test_parse_empty_message_with_no_attachments_returns_none() { + let bot_id = Arc::new(RwLock::new(Some("bot123".to_string()))); + let d = payload_with("", vec![]); + let msg = parse_discord_message(&d, &bot_id, &[], &[], true).await; + assert!(msg.is_none()); + } } diff --git a/crates/openfang-channels/src/telegram.rs b/crates/openfang-channels/src/telegram.rs index cb4a5b01b..7e5ddf849 100644 --- a/crates/openfang-channels/src/telegram.rs +++ b/crates/openfang-channels/src/telegram.rs @@ -498,7 +498,7 @@ impl TelegramAdapter { self.api_send_photo(chat_id, &url, caption.as_deref(), thread_id) .await?; } - ChannelContent::File { url, filename } => { + ChannelContent::File { url, filename, .. } => { self.api_send_document(chat_id, &url, &filename, thread_id) .await?; } @@ -521,6 +521,17 @@ impl TelegramAdapter { self.api_send_message(chat_id, text.trim(), thread_id) .await?; } + ChannelContent::Multipart(parts) => { + // Send each child as its own Telegram message. Nested + // Multipart is rejected by adapters; flatten defensively. + for part in parts { + if let ChannelContent::Multipart(_) = part { + debug_assert!(false, "nested Multipart in send_to_user"); + continue; + } + Box::pin(self.send_content(user, part, thread_id)).await?; + } + } } Ok(()) } @@ -934,7 +945,12 @@ async fn parse_telegram_update( .unwrap_or("document") .to_string(); match telegram_get_file_url(token, client, file_id, api_base_url).await { - Some(url) => ChannelContent::File { url, filename }, + Some(url) => ChannelContent::File { + url, + filename, + mime: None, + size: None, + }, None => ChannelContent::Text(format!("[Document received: {filename}]")), } } else if message.get("voice").is_some() { @@ -2051,10 +2067,7 @@ mod tests { body, ) } else { - ( - StatusCode::OK, - r#"{"ok":true,"result":true}"#.to_string(), - ) + (StatusCode::OK, r#"{"ok":true,"result":true}"#.to_string()) } } })); @@ -2131,7 +2144,10 @@ mod tests { // Two-chunk message; first POST fails. Nothing delivered → Err. let big = "a".repeat(5000); // > 4096 → split into two chunks let stub = StubServer::new(vec![ - (500, r#"{"ok":false,"error_code":500,"description":"server"}"#), + ( + 500, + r#"{"ok":false,"error_code":500,"description":"server"}"#, + ), (200, r#"{"ok":true,"result":{}}"#), ]); let base = spawn_stub_server(stub.clone()).await; @@ -2159,7 +2175,10 @@ mod tests { let big = "a".repeat(5000); let stub = StubServer::new(vec![ (200, r#"{"ok":true,"result":{}}"#), - (400, r#"{"ok":false,"error_code":400,"description":"some err"}"#), + ( + 400, + r#"{"ok":false,"error_code":400,"description":"some err"}"#, + ), ]); let base = spawn_stub_server(stub.clone()).await; let adapter = test_adapter(base); @@ -2170,7 +2189,11 @@ mod tests { result.is_ok(), "partial delivery must return Ok (best-effort), got {result:?}" ); - assert_eq!(stub.hit_count(), 2, "both chunks should have been attempted"); + assert_eq!( + stub.hit_count(), + 2, + "both chunks should have been attempted" + ); } // ----------------------------------------------------------------------- diff --git a/crates/openfang-channels/src/types.rs b/crates/openfang-channels/src/types.rs index 84247b5af..8d3904169 100644 --- a/crates/openfang-channels/src/types.rs +++ b/crates/openfang-channels/src/types.rs @@ -50,6 +50,16 @@ pub enum ChannelContent { File { url: String, filename: String, + /// Best-effort MIME type from the source platform (e.g. Discord's + /// `attachments[].content_type`). `None` if the platform did not + /// provide one; downstream consumers may sniff bytes or fall back + /// to extension-based detection. + #[serde(default, skip_serializing_if = "Option::is_none")] + mime: Option, + /// Size in bytes, when known. Useful for capacity gating before + /// the bridge attempts to materialize or transmit the file. + #[serde(default, skip_serializing_if = "Option::is_none")] + size: Option, }, /// Local file data (bytes read from disk). Used by the proactive `channel_send` /// tool when `file_path` is provided instead of `file_url`. @@ -70,6 +80,12 @@ pub enum ChannelContent { name: String, args: Vec, }, + /// A composite message carrying multiple content blocks (e.g. a Discord + /// message with several attachments, or an image with a separate file + /// sibling). Blocks are flat-mapped by the bridge into multiple LLM + /// content blocks. Implementations should not produce nested `Multipart` + /// values; consumers may `debug_assert!` against nesting. + Multipart(Vec), } /// A unified message from any channel. diff --git a/crates/openfang-channels/src/whatsapp.rs b/crates/openfang-channels/src/whatsapp.rs index 16f37b56d..8f656b559 100644 --- a/crates/openfang-channels/src/whatsapp.rs +++ b/crates/openfang-channels/src/whatsapp.rs @@ -271,7 +271,7 @@ impl ChannelAdapter for WhatsAppAdapter { return Err(format!("WhatsApp API error {status}: {body}").into()); } } - ChannelContent::File { url, filename } => { + ChannelContent::File { url, filename, .. } => { let body = serde_json::json!({ "messaging_product": "whatsapp", "to": user.platform_id, diff --git a/crates/openfang-kernel/src/kernel.rs b/crates/openfang-kernel/src/kernel.rs index c91d02a85..28a7d1500 100644 --- a/crates/openfang-kernel/src/kernel.rs +++ b/crates/openfang-kernel/src/kernel.rs @@ -7252,6 +7252,8 @@ impl KernelHandle for OpenFangKernel { "file" => openfang_channels::types::ChannelContent::File { url: media_url.to_string(), filename: filename.unwrap_or("file").to_string(), + mime: None, + size: None, }, _ => { return Err(format!( diff --git a/crates/openfang-runtime/src/drivers/claude_code.rs b/crates/openfang-runtime/src/drivers/claude_code.rs index 21e0fb6d1..65182ec6c 100644 --- a/crates/openfang-runtime/src/drivers/claude_code.rs +++ b/crates/openfang-runtime/src/drivers/claude_code.rs @@ -11,7 +11,7 @@ use crate::llm_driver::{CompletionRequest, CompletionResponse, LlmDriver, LlmError, StreamEvent}; use async_trait::async_trait; use dashmap::DashMap; -use openfang_types::message::{ContentBlock, Role, StopReason, TokenUsage}; +use openfang_types::message::{ContentBlock, MessageContent, Role, StopReason, TokenUsage}; use serde::Deserialize; use std::sync::Arc; use tokio::io::{AsyncBufReadExt, AsyncReadExt}; @@ -130,6 +130,14 @@ impl ClaudeCodeDriver { } /// Build a text prompt from the completion request messages. + /// + /// The Claude Code CLI is text-only (`-p `), so non-text content + /// blocks (images, etc.) cannot be sent natively. Rather than dropping + /// them silently — which causes the model to hallucinate about content + /// it can't see — we render each non-text block as a synthetic + /// `[attachment: ...]` marker. The model still can't *view* the + /// attachment, but it knows the attachment exists and can acknowledge + /// it coherently instead of confabulating. fn build_prompt(request: &CompletionRequest) -> String { let mut parts = Vec::new(); @@ -139,15 +147,52 @@ impl ClaudeCodeDriver { Role::Assistant => "Assistant", Role::System => "System", }; - let text = msg.content.text_content(); - if !text.is_empty() { - parts.push(format!("[{role_label}]\n{text}")); + let rendered = Self::render_content(&msg.content); + if !rendered.is_empty() { + parts.push(format!("[{role_label}]\n{rendered}")); } } parts.join("\n\n") } + /// Render message content for the text-only CLI prompt. + /// + /// Text blocks pass through verbatim. Image blocks are rendered as + /// `[attachment: image, ~N KB — not viewable on this + /// provider]` so the model receives a positive signal that an + /// attachment arrived. ToolUse/ToolResult/Thinking are omitted — + /// the CLI manages its own tool loop. + fn render_content(content: &MessageContent) -> String { + match content { + MessageContent::Text(s) => s.clone(), + MessageContent::Blocks(blocks) => blocks + .iter() + .filter_map(|b| match b { + ContentBlock::Text { text, .. } => { + if text.is_empty() { + None + } else { + Some(text.clone()) + } + } + ContentBlock::Image { media_type, data } => { + // base64 → ~3/4 the length in decoded bytes. + let approx_kb = (data.len().saturating_mul(3) / 4) / 1024; + Some(format!( + "[attachment: {media_type} image, ~{approx_kb} KB — not viewable on this provider]" + )) + } + ContentBlock::ToolUse { .. } + | ContentBlock::ToolResult { .. } + | ContentBlock::Thinking { .. } + | ContentBlock::Unknown => None, + }) + .collect::>() + .join("\n"), + } + } + /// Map a model ID like "claude-code/opus" to CLI --model flag value. fn model_flag(model: &str) -> Option { let stripped = model.strip_prefix("claude-code/").unwrap_or(model); @@ -726,6 +771,77 @@ mod tests { assert!(prompt.contains("Hello")); } + #[test] + fn test_build_prompt_renders_image_attachment_marker() { + use openfang_types::message::{ContentBlock, Message, MessageContent}; + + // ~12 KB of base64 — decoded ~9 KB. + let fake_b64 = "A".repeat(12 * 1024); + let request = CompletionRequest { + model: "claude-code/sonnet".to_string(), + messages: vec![Message { + role: Role::User, + content: MessageContent::Blocks(vec![ + ContentBlock::Text { + text: "what's in this?".to_string(), + provider_metadata: None, + }, + ContentBlock::Image { + media_type: "image/png".to_string(), + data: fake_b64, + }, + ]), + }], + tools: vec![], + max_tokens: 1024, + temperature: 0.7, + system: None, + thinking: None, + }; + + let prompt = ClaudeCodeDriver::build_prompt(&request); + assert!(prompt.contains("what's in this?"), "text preserved"); + assert!( + prompt.contains("[attachment: image/png image"), + "image rendered as synthetic attachment marker, got: {prompt}" + ); + assert!( + prompt.contains("not viewable on this provider"), + "marker explains the limitation, got: {prompt}" + ); + } + + #[test] + fn test_build_prompt_image_only_still_emits_marker() { + use openfang_types::message::{ContentBlock, Message, MessageContent}; + + let request = CompletionRequest { + model: "claude-code/sonnet".to_string(), + messages: vec![Message { + role: Role::User, + content: MessageContent::Blocks(vec![ContentBlock::Image { + media_type: "image/jpeg".to_string(), + data: "Zm9v".to_string(), + }]), + }], + tools: vec![], + max_tokens: 1024, + temperature: 0.7, + system: None, + thinking: None, + }; + + let prompt = ClaudeCodeDriver::build_prompt(&request); + assert!( + prompt.contains("[User]"), + "user role label emitted even with image-only content, got: {prompt}" + ); + assert!( + prompt.contains("[attachment: image/jpeg image"), + "bare image renders marker, got: {prompt}" + ); + } + #[test] fn test_model_flag_mapping() { assert_eq!(