From 5ad6e0257479425446d13356b052ff22ed922da0 Mon Sep 17 00:00:00 2001 From: Clifton King Date: Tue, 9 Jun 2026 15:56:22 -0500 Subject: [PATCH] fix: route PDFs per provider format and stop dropping media in generate/run Two related media-handling fixes: 1. PDF attachments were silently encoded as images on 3 of 4 providers. build_openai_compatible_message_content (OpenAI + Grok) emitted every attachment as an image_url part and build_anthropic_message_content emitted every attachment as an image block, regardless of MIME type. Now the builders branch on mime_type: - OpenAI: inline PDFs become the documented file content part ({"type":"file","file":{"filename","file_data"}}); URL-based PDFs return a clear error (chat completions does not accept file URLs). - Anthropic: PDFs become document blocks with base64 or url sources per the PDF-support docs. - Grok: xAI chat completions only documents text and image parts, so PDFs return a clear "not supported" error instead of a mislabeled image_url part. - Non-image, non-PDF MIME types error with the offending type named. - Gemini already passed MIME types through and is unchanged; image handling is byte-for-byte unchanged on all providers. 2. generate and tool run silently dropped attached media. Request::generate and Request::run ignored Request::media entirely. Added LLMClient::generate_with_media (default: error rather than drop) with implementations for all four providers via message-based generate internals, threaded media through ToolRunner::run_tool_loop and the three tool-loop drivers using the same content builders, and routed Request::generate/run through them. MockClient records the new GenerateWithMedia request kind and tool-loop media for assertions. Verified shapes against provider docs (Anthropic PDF support, OpenAI PDF files guide, xAI image-understanding guide). Unit tests cover the serialized part/block shapes per provider and mockito tests assert the request bodies for generate/run now carry media. 
Co-Authored-By: Claude Fable 5 --- README.md | 11 +- src/backend/anthropic.rs | 249 +++++++++++++++------------ src/backend/any_client.rs | 4 + src/backend/client.rs | 50 ++++++ src/backend/gemini.rs | 337 +++++++++++++++++++----------------- src/backend/grok.rs | 202 ++++++++++++---------- src/backend/media.rs | 347 +++++++++++++++++++++++++++++++++++--- src/backend/mock.rs | 17 +- src/backend/mod.rs | 4 +- src/backend/openai.rs | 231 ++++++++++++++----------- src/backend/request.rs | 37 +++- src/backend/tools.rs | 36 +++- tests/http_mock_tests.rs | 154 +++++++++++++++++ tests/mock_edge_tests.rs | 61 +++++++ 14 files changed, 1258 insertions(+), 482 deletions(-) diff --git a/README.md b/README.md index 6f08ce9..03dd757 100644 --- a/README.md +++ b/README.md @@ -293,7 +293,7 @@ impl SchemaType for SecurityId { } ``` -## Multimodal (Image Input) +## Multimodal (Image & PDF Input) Analyze images with structured extraction across all major providers by attaching media to a request with `with_media`: @@ -329,7 +329,14 @@ async fn main() -> Result<(), Box> { `MediaFile::new(uri, mime_type)` is also available for URL/URI-based media input. The lower-level `LLMClient::materialize_with_media(prompt, &media)` method does -the same thing in one call when you do not need the builder. +the same thing in one call when you do not need the builder. Attached media is +honored by `materialize`, `generate`, and tool `run` alike. + +PDFs are supported too: pass `"application/pdf"` as the MIME type and the +attachment is routed to each provider's documented document format (OpenAI +`file` part, Anthropic `document` block, Gemini `inlineData`/`fileData`). +Combinations a provider does not support — PDFs on Grok, or URL-based PDFs on +OpenAI chat completions — return a clear error instead of a broken request. 
Provider examples: - `cargo run --example openai_multimodal_example --features openai` diff --git a/src/backend/anthropic.rs b/src/backend/anthropic.rs index a252545..b6b356a 100644 --- a/src/backend/anthropic.rs +++ b/src/backend/anthropic.rs @@ -418,6 +418,127 @@ impl AnthropicClient { trace!(json = %raw_response, "Parsing structured output response"); parse_validate_and_create_output(raw_response, usage) } + + /// Internal implementation of raw text generation (no structured output). + /// + /// Accepts chat messages so that callers can attach media (images/PDFs) to + /// the user message; the same content-building path as `materialize_internal` + /// is used, so media is encoded per Anthropic's documented block format. + async fn generate_internal(&self, messages: &[ChatMessage]) -> Result { + info!("Generating raw text response with Anthropic"); + + // Build thinking config for Claude 4.x models + let is_thinking_model = self.config.model.as_str().contains("sonnet-4") + || self.config.model.as_str().contains("opus-4"); + let thinking_config = self.config.thinking_level.and_then(|level| { + if is_thinking_model && level.claude_thinking_enabled() { + Some(ClaudeThinkingConfig { + thinking_type: "enabled".to_string(), + budget_tokens: level.claude_budget_tokens(), + }) + } else { + None + } + }); + + // Claude requires temperature=1 when thinking is enabled + let effective_temp = if thinking_config.is_some() { + 1.0 + } else { + self.config.temperature + }; + + // Build API messages, including any attached media blocks + let api_messages: Vec = messages + .iter() + .map(|msg| { + Ok(AnthropicMessage { + role: msg.role.as_str().to_string(), + content: build_anthropic_message_content(msg)?, + }) + }) + .collect::>>()?; + + // Build the request (no output_format for raw text generation) + debug!("Building Anthropic API request for text generation"); + let request = CompletionRequest { + model: self.config.model.as_str().to_string(), + messages: api_messages, + 
temperature: effective_temp, + max_tokens: effective_max_tokens(self.config.max_tokens, thinking_config.as_ref()), + thinking: thinking_config, + output_format: None, // Raw text generation doesn't use structured outputs + }; + + // Send the request to Anthropic + debug!( + model = %self.config.model.as_str(), + max_tokens = request.max_tokens, + "Sending request to Anthropic API" + ); + let base_url = self + .config + .base_url + .as_deref() + .unwrap_or("https://api.anthropic.com/v1"); + let url = format!("{}/messages", base_url); + debug!(url = %url, "Using Anthropic API endpoint"); + let response = self + .client + .post(&url) + .header("x-api-key", &self.config.api_key) + .header("anthropic-version", "2023-06-01") + .header("Content-Type", "application/json") + .json(&request) + .send() + .await + .map_err(|e| handle_http_error(e, "Anthropic"))?; + + // Parse the response + let response = check_response_status(response, "Anthropic").await?; + + debug!("Successfully received response from Anthropic"); + let completion: CompletionResponse = response.json().await.map_err(|e| { + error!(error = %e, "Failed to parse JSON response from Anthropic"); + e + })?; + + // Extract usage info + let model_name = completion + .model + .clone() + .unwrap_or_else(|| self.config.model.as_str().to_string()); + let usage = completion + .usage + .as_ref() + .map(|u| TokenUsage::new(model_name, u.input_tokens, u.output_tokens)); + + // Extract the content + debug!("Extracting text content from response blocks"); + let content: String = completion + .content + .iter() + .filter(|block| block.block_type == "text") + .map(|block| block.text.clone()) + .collect::>() + .join(""); + + if content.is_empty() { + error!("No text content in Anthropic response"); + return Err(RStructorError::api_error( + "Anthropic", + ApiErrorKind::UnexpectedResponse { + details: "No text content in response".to_string(), + }, + )); + } + + debug!( + content_len = content.len(), + "Successfully extracted text 
content" + ); + Ok(GenerateResult::new(content, usage)) + } } // Generate builder methods using macro @@ -565,6 +686,7 @@ impl crate::backend::tools::ToolRunner for AnthropicClient { &self, system: Option<&str>, prompt: &str, + media: &[super::MediaFile], toolbox: &crate::backend::tools::Toolbox, max_iterations: usize, ) -> Result { @@ -584,6 +706,7 @@ impl crate::backend::tools::ToolRunner for AnthropicClient { .unwrap_or(DEFAULT_ANTHROPIC_MAX_TOKENS), system, prompt, + media, toolbox, max_iterations, ) @@ -686,6 +809,26 @@ impl LLMClient for AnthropicClient { Ok(result.text) } + #[instrument( + name = "anthropic_generate_with_media", + skip(self, prompt, media), + fields( + model = %self.config.model.as_str(), + prompt_len = prompt.len(), + media_len = media.len() + ) + )] + async fn generate_with_media( + &self, + prompt: &str, + media: &[super::MediaFile], + ) -> Result { + let result = self + .generate_internal(&[ChatMessage::user_with_media(prompt, media.to_vec())]) + .await?; + Ok(result.text) + } + #[instrument( name = "anthropic_generate_with_metadata", skip(self, prompt), @@ -695,111 +838,7 @@ impl LLMClient for AnthropicClient { ) )] async fn generate_with_metadata(&self, prompt: &str) -> Result { - info!("Generating raw text response with Anthropic"); - - // Build thinking config for Claude 4.x models - let is_thinking_model = self.config.model.as_str().contains("sonnet-4") - || self.config.model.as_str().contains("opus-4"); - let thinking_config = self.config.thinking_level.and_then(|level| { - if is_thinking_model && level.claude_thinking_enabled() { - Some(ClaudeThinkingConfig { - thinking_type: "enabled".to_string(), - budget_tokens: level.claude_budget_tokens(), - }) - } else { - None - } - }); - - // Claude requires temperature=1 when thinking is enabled - let effective_temp = if thinking_config.is_some() { - 1.0 - } else { - self.config.temperature - }; - - // Build the request (no output_format for raw text generation) - debug!("Building 
Anthropic API request for text generation"); - let request = CompletionRequest { - model: self.config.model.as_str().to_string(), - messages: vec![AnthropicMessage { - role: "user".to_string(), - content: AnthropicMessageContent::Text(prompt.to_string()), - }], - temperature: effective_temp, - max_tokens: effective_max_tokens(self.config.max_tokens, thinking_config.as_ref()), - thinking: thinking_config, - output_format: None, // Raw text generation doesn't use structured outputs - }; - - // Send the request to Anthropic - debug!( - model = %self.config.model.as_str(), - max_tokens = request.max_tokens, - "Sending request to Anthropic API" - ); - let base_url = self - .config - .base_url - .as_deref() - .unwrap_or("https://api.anthropic.com/v1"); - let url = format!("{}/messages", base_url); - debug!(url = %url, "Using Anthropic API endpoint"); - let response = self - .client - .post(&url) - .header("x-api-key", &self.config.api_key) - .header("anthropic-version", "2023-06-01") - .header("Content-Type", "application/json") - .json(&request) - .send() - .await - .map_err(|e| handle_http_error(e, "Anthropic"))?; - - // Parse the response - let response = check_response_status(response, "Anthropic").await?; - - debug!("Successfully received response from Anthropic"); - let completion: CompletionResponse = response.json().await.map_err(|e| { - error!(error = %e, "Failed to parse JSON response from Anthropic"); - e - })?; - - // Extract usage info - let model_name = completion - .model - .clone() - .unwrap_or_else(|| self.config.model.as_str().to_string()); - let usage = completion - .usage - .as_ref() - .map(|u| TokenUsage::new(model_name, u.input_tokens, u.output_tokens)); - - // Extract the content - debug!("Extracting text content from response blocks"); - let content: String = completion - .content - .iter() - .filter(|block| block.block_type == "text") - .map(|block| block.text.clone()) - .collect::>() - .join(""); - - if content.is_empty() { - error!("No text 
content in Anthropic response"); - return Err(RStructorError::api_error( - "Anthropic", - ApiErrorKind::UnexpectedResponse { - details: "No text content in response".to_string(), - }, - )); - } - - debug!( - content_len = content.len(), - "Successfully extracted text content" - ); - Ok(GenerateResult::new(content, usage)) + self.generate_internal(&[ChatMessage::user(prompt)]).await } #[cfg(feature = "streaming")] diff --git a/src/backend/any_client.rs b/src/backend/any_client.rs index 4ecf47a..e886637 100644 --- a/src/backend/any_client.rs +++ b/src/backend/any_client.rs @@ -209,6 +209,10 @@ impl LLMClient for AnyClient { dispatch!(self, c => c.generate(prompt).await) } + async fn generate_with_media(&self, prompt: &str, media: &[MediaFile]) -> Result { + dispatch!(self, c => c.generate_with_media(prompt, media).await) + } + async fn generate_with_metadata(&self, prompt: &str) -> Result { dispatch!(self, c => c.generate_with_metadata(prompt).await) } diff --git a/src/backend/client.rs b/src/backend/client.rs index 055a5c3..83c2b1a 100644 --- a/src/backend/client.rs +++ b/src/backend/client.rs @@ -16,6 +16,13 @@ use crate::model::Instructor; /// Created with [`MediaFile::from_bytes`]. This is useful for public images /// downloaded over HTTPS. /// +/// The `mime_type` decides how each provider encodes the attachment: `image/*` +/// is sent in the provider's image format, and `application/pdf` is routed to +/// the provider's document/file format (OpenAI `file` part for inline data, +/// Anthropic `document` block, Gemini `inlineData`/`fileData`). Combinations a +/// provider does not document — e.g. any PDF on Grok, or a URL-based PDF on +/// OpenAI — produce a clear error instead of a silently broken request. 
+/// /// # Examples /// /// ```no_run @@ -30,6 +37,10 @@ use crate::model::Instructor; /// // Inline data from bytes /// let image_bytes = std::fs::read("photo.png").unwrap(); /// let media = MediaFile::from_bytes(&image_bytes, "image/png"); +/// +/// // Inline PDF (OpenAI, Anthropic, and Gemini) +/// let pdf_bytes = std::fs::read("report.pdf").unwrap(); +/// let media = MediaFile::from_bytes(&pdf_bytes, "application/pdf"); /// ``` #[derive(Debug, Clone)] pub struct MediaFile { @@ -282,6 +293,45 @@ pub trait LLMClient { /// ``` async fn generate(&self, prompt: &str) -> Result; + /// Raw text completion with media attachments (if supported). + /// + /// Like [`generate`](Self::generate), but the prompt is sent together with + /// `media` (images, or PDFs where the provider supports them), encoded in the + /// provider's documented multimodal format. + /// + /// The default implementation forwards to [`generate`](Self::generate) when + /// no media is provided, and otherwise returns + /// [`RStructorError::Unsupported`](crate::RStructorError::Unsupported) so that + /// media is never silently dropped. Providers with media support override this + /// method. All four built-in clients (OpenAI, Anthropic, Grok, Gemini) support + /// media here; PDF support varies by provider (Grok, for example, accepts only + /// images and returns a clear error for PDFs). 
+ /// + /// # Example + /// + /// ```no_run + /// # use rstructor::{LLMClient, OpenAIClient, MediaFile}; + /// # async fn example() -> Result<(), Box<dyn std::error::Error>> { + /// let client = OpenAIClient::from_env()?; + /// let pdf_bytes = std::fs::read("report.pdf")?; + /// let media = [MediaFile::from_bytes(&pdf_bytes, "application/pdf")]; + /// let summary = client + /// .generate_with_media("Summarize this report", &media) + /// .await?; + /// println!("{summary}"); + /// # Ok(()) + /// # } + /// ``` + async fn generate_with_media(&self, prompt: &str, media: &[MediaFile]) -> Result { + if media.is_empty() { + self.generate(prompt).await + } else { + Err(crate::error::RStructorError::Unsupported( + "this client does not support media inputs".to_string(), + )) + } + } + /// Raw completion with metadata (token usage). /// /// Like [`generate`](Self::generate), but returns a [`GenerateResult`] diff --git a/src/backend/gemini.rs b/src/backend/gemini.rs index b562ce3..012a166 100644 --- a/src/backend/gemini.rs +++ b/src/backend/gemini.rs @@ -201,6 +201,50 @@ struct CandidatePart { text: Option, } +/// Convert provider-agnostic chat messages into Gemini `contents`, including any +/// attached media as `inlineData` (base64) or `fileData` (URI) parts. Gemini +/// passes the MIME type through, so images and PDFs share the same encoding. 
+fn chat_messages_to_contents(messages: &[ChatMessage]) -> Vec { + messages + .iter() + .map(|msg| { + // Gemini uses "user" and "model" (not "assistant") + let role = if msg.role.as_str() == "assistant" { + "model" + } else { + msg.role.as_str() + }; + let mut parts = Vec::new(); + if !msg.content.is_empty() { + parts.push(Part::Text { + text: msg.content.clone(), + }); + } + for media in &msg.media { + if let Some(ref base64_data) = media.data { + parts.push(Part::InlineData { + inline_data: InlineData { + mime_type: media.mime_type.clone(), + data: base64_data.clone(), + }, + }); + } else { + parts.push(Part::FileData { + file_data: FileData { + mime_type: media.mime_type.clone(), + file_uri: media.uri.clone(), + }, + }); + } + } + Content { + role: Some(role.to_string()), + parts, + } + }) + .collect() +} + impl GeminiClient { /// Create a new Gemini client with the provided API key. /// @@ -321,44 +365,7 @@ impl GeminiClient { // Build API contents from conversation history // With native response_schema, we don't need to include schema instructions in the prompt - let contents: Vec = messages - .iter() - .map(|msg| { - // Gemini uses "user" and "model" (not "assistant") - let role = if msg.role.as_str() == "assistant" { - "model" - } else { - msg.role.as_str() - }; - let mut parts = Vec::new(); - if !msg.content.is_empty() { - parts.push(Part::Text { - text: msg.content.clone(), - }); - } - for media in &msg.media { - if let Some(ref base64_data) = media.data { - parts.push(Part::InlineData { - inline_data: InlineData { - mime_type: media.mime_type.clone(), - data: base64_data.clone(), - }, - }); - } else { - parts.push(Part::FileData { - file_data: FileData { - mime_type: media.mime_type.clone(), - file_uri: media.uri.clone(), - }, - }); - } - } - Content { - role: Some(role.to_string()), - parts, - } - }) - .collect(); + let contents = chat_messages_to_contents(messages); // Build thinking config only for Gemini 3.x models let is_gemini3 = 
self.config.model.as_str().starts_with("gemini-3"); @@ -493,6 +500,123 @@ impl GeminiClient { None, )) } + + /// Internal implementation of raw text generation (no structured output). + /// + /// Accepts chat messages so that callers can attach media (images/PDFs) to + /// the user message; the same content-building path as `materialize_internal` + /// is used, so media is encoded as `inlineData`/`fileData` parts. + async fn generate_internal(&self, messages: &[ChatMessage]) -> Result { + info!("Generating raw text response with Gemini"); + + // Build thinking config only for Gemini 3.x models + let is_gemini3 = self.config.model.as_str().starts_with("gemini-3"); + let thinking_config = if is_gemini3 { + self.config.thinking_level.and_then(|level| { + level.gemini_level().map(|l| ThinkingConfig { + thinking_level: l.to_string(), + }) + }) + } else { + None + }; + + // Build the request, including any attached media parts + debug!("Building Gemini API request"); + let request = GenerateContentRequest { + contents: chat_messages_to_contents(messages), + generation_config: GenerationConfig { + temperature: self.config.temperature, + max_output_tokens: self.config.max_tokens, + response_mime_type: None, + response_schema: None, + thinking_config, + }, + }; + + // Send the request to Gemini API + let base_url = self + .config + .base_url + .as_deref() + .unwrap_or("https://generativelanguage.googleapis.com/v1beta"); + let url = format!( + "{}/models/{}:generateContent", + base_url, + self.config.model.as_str() + ); + debug!( + url = %url, + model = %self.config.model.as_str(), + "Sending request to Gemini API" + ); + let response = self + .client + .post(&url) + .query(&[("key", &self.config.api_key)]) + .header("Content-Type", "application/json") + .json(&request) + .send() + .await + .map_err(|e| handle_http_error(e, "Gemini"))?; + + // Parse the response + let response = check_response_status(response, "Gemini").await?; + + debug!("Successfully received response from 
Gemini API"); + let completion: GenerateContentResponse = response.json().await.map_err(|e| { + error!(error = %e, "Failed to parse JSON response from Gemini API"); + e + })?; + + if completion.candidates.is_empty() { + error!("Gemini API returned empty candidates array"); + return Err(RStructorError::api_error( + "Gemini", + ApiErrorKind::UnexpectedResponse { + details: "No completion candidates returned".to_string(), + }, + )); + } + + // Extract usage info + let model_name = completion + .model_version + .clone() + .unwrap_or_else(|| self.config.model.as_str().to_string()); + let usage = completion + .usage_metadata + .as_ref() + .map(|u| TokenUsage::new(model_name, u.prompt_token_count, u.candidates_token_count)); + + let candidate = &completion.candidates[0]; + trace!(finish_reason = %candidate.finish_reason, "Completion finish reason"); + + // Extract the text content + match candidate + .content + .parts + .first() + .and_then(|p| p.text.as_ref()) + { + Some(text) => { + debug!( + content_len = text.len(), + "Successfully extracted text content from response" + ); + Ok(GenerateResult::new(text.clone(), usage)) + } + None => { + error!("No text content in Gemini response"); + Err(RStructorError::api_error( + "Gemini", + ApiErrorKind::UnexpectedResponse { + details: "No text content in response".to_string(), + }, + )) + } + } + } } // Generate builder methods using macro @@ -632,6 +756,7 @@ impl crate::backend::tools::ToolRunner for GeminiClient { &self, system: Option<&str>, prompt: &str, + media: &[super::MediaFile], toolbox: &crate::backend::tools::Toolbox, max_iterations: usize, ) -> Result { @@ -649,6 +774,7 @@ impl crate::backend::tools::ToolRunner for GeminiClient { self.config.max_tokens, system, prompt, + media, toolbox, max_iterations, ) @@ -751,6 +877,26 @@ impl LLMClient for GeminiClient { Ok(result.text) } + #[instrument( + name = "gemini_generate_with_media", + skip(self, prompt, media), + fields( + model = %self.config.model.as_str(), + 
prompt_len = prompt.len(), + media_len = media.len() + ) + )] + async fn generate_with_media( + &self, + prompt: &str, + media: &[super::MediaFile], + ) -> Result { + let result = self + .generate_internal(&[ChatMessage::user_with_media(prompt, media.to_vec())]) + .await?; + Ok(result.text) + } + #[instrument( name = "gemini_generate_with_metadata", skip(self, prompt), @@ -760,120 +906,7 @@ impl LLMClient for GeminiClient { ) )] async fn generate_with_metadata(&self, prompt: &str) -> Result { - info!("Generating raw text response with Gemini"); - - // Build thinking config only for Gemini 3.x models - let is_gemini3 = self.config.model.as_str().starts_with("gemini-3"); - let thinking_config = if is_gemini3 { - self.config.thinking_level.and_then(|level| { - level.gemini_level().map(|l| ThinkingConfig { - thinking_level: l.to_string(), - }) - }) - } else { - None - }; - - // Build the request - debug!("Building Gemini API request"); - let request = GenerateContentRequest { - contents: vec![Content { - role: Some("user".to_string()), - parts: vec![Part::Text { - text: prompt.to_string(), - }], - }], - generation_config: GenerationConfig { - temperature: self.config.temperature, - max_output_tokens: self.config.max_tokens, - response_mime_type: None, - response_schema: None, - thinking_config, - }, - }; - - // Send the request to Gemini API - let base_url = self - .config - .base_url - .as_deref() - .unwrap_or("https://generativelanguage.googleapis.com/v1beta"); - let url = format!( - "{}/models/{}:generateContent", - base_url, - self.config.model.as_str() - ); - debug!( - url = %url, - model = %self.config.model.as_str(), - "Sending request to Gemini API" - ); - let response = self - .client - .post(&url) - .query(&[("key", &self.config.api_key)]) - .header("Content-Type", "application/json") - .json(&request) - .send() - .await - .map_err(|e| handle_http_error(e, "Gemini"))?; - - // Parse the response - let response = check_response_status(response, 
"Gemini").await?; - - debug!("Successfully received response from Gemini API"); - let completion: GenerateContentResponse = response.json().await.map_err(|e| { - error!(error = %e, "Failed to parse JSON response from Gemini API"); - e - })?; - - if completion.candidates.is_empty() { - error!("Gemini API returned empty candidates array"); - return Err(RStructorError::api_error( - "Gemini", - ApiErrorKind::UnexpectedResponse { - details: "No completion candidates returned".to_string(), - }, - )); - } - - // Extract usage info - let model_name = completion - .model_version - .clone() - .unwrap_or_else(|| self.config.model.as_str().to_string()); - let usage = completion - .usage_metadata - .as_ref() - .map(|u| TokenUsage::new(model_name, u.prompt_token_count, u.candidates_token_count)); - - let candidate = &completion.candidates[0]; - trace!(finish_reason = %candidate.finish_reason, "Completion finish reason"); - - // Extract the text content - match candidate - .content - .parts - .first() - .and_then(|p| p.text.as_ref()) - { - Some(text) => { - debug!( - content_len = text.len(), - "Successfully extracted text content from response" - ); - Ok(GenerateResult::new(text.clone(), usage)) - } - None => { - error!("No text content in Gemini response"); - Err(RStructorError::api_error( - "Gemini", - ApiErrorKind::UnexpectedResponse { - details: "No text content in response".to_string(), - }, - )) - } - } + self.generate_internal(&[ChatMessage::user(prompt)]).await } #[cfg(feature = "streaming")] diff --git a/src/backend/grok.rs b/src/backend/grok.rs index 17dd968..f2bc2ae 100644 --- a/src/backend/grok.rs +++ b/src/backend/grok.rs @@ -7,11 +7,12 @@ use crate::backend::model_macro::define_model_enum; use crate::backend::{ ChatMessage, GenerateResult, LLMClient, MaterializeInternalOutput, MaterializeResult, ModelInfo, OpenAICompatibleChatCompletionRequest, OpenAICompatibleChatCompletionResponse, - OpenAICompatibleChatMessage, OpenAICompatibleMessageContent, ResponseFormat, 
TokenUsage, - ValidationFailureContext, check_response_status, convert_openai_compatible_chat_messages, - generate_with_retry_with_history, handle_http_error, materialize_with_media_with_retry, - parse_validate_and_create_output, prepare_strict_schema, + ResponseFormat, TokenUsage, ValidationFailureContext, check_response_status, + convert_openai_compatible_chat_messages, generate_with_retry_with_history, handle_http_error, + materialize_with_media_with_retry, parse_validate_and_create_output, prepare_strict_schema, }; +#[cfg(feature = "streaming")] +use crate::backend::{OpenAICompatibleChatMessage, OpenAICompatibleMessageContent}; use crate::error::{ApiErrorKind, RStructorError, Result}; use crate::model::Instructor; @@ -294,6 +295,94 @@ impl GrokClient { )) } } + + /// Internal implementation of raw text generation (no structured output). + /// + /// Accepts chat messages so that callers can attach media (images) to the + /// user message; the same content-building path as `materialize_internal` is + /// used, so unsupported attachments (e.g. PDFs) fail with a clear error + /// instead of being silently mislabeled. 
+ async fn generate_internal(&self, messages: &[ChatMessage]) -> Result { + info!("Generating raw text response with Grok"); + + // Build the request without structured outputs + debug!("Building Grok API request for text generation"); + let request = OpenAICompatibleChatCompletionRequest { + model: self.config.model.as_str().to_string(), + messages: convert_openai_compatible_chat_messages(messages, "Grok")?, + response_format: None, + temperature: self.config.temperature, + max_tokens: self.config.max_tokens, + reasoning_effort: None, + }; + + // Send the request to Grok/xAI API + let base_url = self + .config + .base_url + .as_deref() + .unwrap_or("https://api.x.ai/v1"); + let url = format!("{}/chat/completions", base_url); + debug!(url = %url, "Sending request to Grok API"); + let response = self + .client + .post(&url) + .header("Authorization", format!("Bearer {}", self.config.api_key)) + .header("Content-Type", "application/json") + .json(&request) + .send() + .await + .map_err(|e| handle_http_error(e, "Grok"))?; + + // Parse the response + let response = check_response_status(response, "Grok").await?; + + debug!("Successfully received response from Grok API"); + let completion: OpenAICompatibleChatCompletionResponse = + response.json().await.map_err(|e| { + error!(error = %e, "Failed to parse JSON response from Grok API"); + e + })?; + + if completion.choices.is_empty() { + error!("Grok API returned empty choices array"); + return Err(RStructorError::api_error( + "Grok", + ApiErrorKind::UnexpectedResponse { + details: "No completion choices returned".to_string(), + }, + )); + } + + // Extract usage info + let model_name = completion + .model + .clone() + .unwrap_or_else(|| self.config.model.as_str().to_string()); + let usage = completion + .usage + .as_ref() + .map(|u| TokenUsage::new(model_name, u.prompt_tokens, u.completion_tokens)); + + let message = &completion.choices[0].message; + trace!(finish_reason = %completion.choices[0].finish_reason, "Completion 
finish reason"); + + if let Some(content) = &message.content { + debug!( + content_len = content.len(), + "Successfully extracted content from response" + ); + Ok(GenerateResult::new(content.clone(), usage)) + } else { + error!("No content in Grok API response"); + Err(RStructorError::api_error( + "Grok", + ApiErrorKind::UnexpectedResponse { + details: "No content in response".to_string(), + }, + )) + } + } } // Generate builder methods using macro @@ -382,6 +471,7 @@ impl crate::backend::tools::ToolRunner for GrokClient { &self, system: Option<&str>, prompt: &str, + media: &[super::MediaFile], toolbox: &crate::backend::tools::Toolbox, max_iterations: usize, ) -> Result { @@ -403,6 +493,7 @@ impl crate::backend::tools::ToolRunner for GrokClient { None, system, prompt, + media, toolbox, max_iterations, ) @@ -505,6 +596,26 @@ impl LLMClient for GrokClient { Ok(result.text) } + #[instrument( + name = "grok_generate_with_media", + skip(self, prompt, media), + fields( + model = %self.config.model.as_str(), + prompt_len = prompt.len(), + media_len = media.len() + ) + )] + async fn generate_with_media( + &self, + prompt: &str, + media: &[super::MediaFile], + ) -> Result { + let result = self + .generate_internal(&[ChatMessage::user_with_media(prompt, media.to_vec())]) + .await?; + Ok(result.text) + } + #[instrument( name = "grok_generate_with_metadata", skip(self, prompt), @@ -514,88 +625,7 @@ impl LLMClient for GrokClient { ) )] async fn generate_with_metadata(&self, prompt: &str) -> Result { - info!("Generating raw text response with Grok"); - - // Build the request without structured outputs - debug!("Building Grok API request for text generation"); - let request = OpenAICompatibleChatCompletionRequest { - model: self.config.model.as_str().to_string(), - messages: vec![OpenAICompatibleChatMessage { - role: "user".to_string(), - content: OpenAICompatibleMessageContent::Text(prompt.to_string()), - }], - response_format: None, - temperature: self.config.temperature, - 
max_tokens: self.config.max_tokens, - reasoning_effort: None, - }; - - // Send the request to Grok/xAI API - let base_url = self - .config - .base_url - .as_deref() - .unwrap_or("https://api.x.ai/v1"); - let url = format!("{}/chat/completions", base_url); - debug!(url = %url, "Sending request to Grok API"); - let response = self - .client - .post(&url) - .header("Authorization", format!("Bearer {}", self.config.api_key)) - .header("Content-Type", "application/json") - .json(&request) - .send() - .await - .map_err(|e| handle_http_error(e, "Grok"))?; - - // Parse the response - let response = check_response_status(response, "Grok").await?; - - debug!("Successfully received response from Grok API"); - let completion: OpenAICompatibleChatCompletionResponse = - response.json().await.map_err(|e| { - error!(error = %e, "Failed to parse JSON response from Grok API"); - e - })?; - - if completion.choices.is_empty() { - error!("Grok API returned empty choices array"); - return Err(RStructorError::api_error( - "Grok", - ApiErrorKind::UnexpectedResponse { - details: "No completion choices returned".to_string(), - }, - )); - } - - // Extract usage info - let model_name = completion - .model - .clone() - .unwrap_or_else(|| self.config.model.as_str().to_string()); - let usage = completion - .usage - .as_ref() - .map(|u| TokenUsage::new(model_name, u.prompt_tokens, u.completion_tokens)); - - let message = &completion.choices[0].message; - trace!(finish_reason = %completion.choices[0].finish_reason, "Completion finish reason"); - - if let Some(content) = &message.content { - debug!( - content_len = content.len(), - "Successfully extracted content from response" - ); - Ok(GenerateResult::new(content.clone(), usage)) - } else { - error!("No content in Grok API response"); - Err(RStructorError::api_error( - "Grok", - ApiErrorKind::UnexpectedResponse { - details: "No content in response".to_string(), - }, - )) - } + self.generate_internal(&[ChatMessage::user(prompt)]).await } 
#[cfg(feature = "streaming")] diff --git a/src/backend/media.rs b/src/backend/media.rs index dba0ad0..2440826 100644 --- a/src/backend/media.rs +++ b/src/backend/media.rs @@ -15,6 +15,7 @@ pub(crate) enum OpenAICompatibleMessageContent { pub(crate) enum OpenAICompatibleMessagePart { Text { text: String }, ImageUrl { image_url: OpenAICompatibleImageUrl }, + File { file: OpenAICompatibleFile }, } #[derive(Debug, Serialize)] @@ -24,6 +25,16 @@ pub(crate) struct OpenAICompatibleImageUrl { pub(crate) detail: Option, } +/// A file content part for OpenAI chat completions (PDF input). +/// +/// See : the part is +/// `{"type": "file", "file": {"filename": ..., "file_data": "data:application/pdf;base64,..."}}`. +#[derive(Debug, Serialize)] +pub(crate) struct OpenAICompatibleFile { + pub(crate) filename: String, + pub(crate) file_data: String, +} + #[derive(Debug, Serialize)] #[serde(untagged)] pub(crate) enum AnthropicMessageContent { @@ -34,13 +45,25 @@ pub(crate) enum AnthropicMessageContent { #[derive(Debug, Serialize)] #[serde(tag = "type", rename_all = "snake_case")] pub(crate) enum AnthropicContentBlock { - Text { text: String }, - Image { source: AnthropicImageSource }, + Text { + text: String, + }, + Image { + source: AnthropicMediaSource, + }, + /// A PDF document block, see + /// : + /// `{"type": "document", "source": {"type": "base64", "media_type": "application/pdf", "data": ...}}` + /// or `{"type": "document", "source": {"type": "url", "url": ...}}`. + Document { + source: AnthropicMediaSource, + }, } +/// Source of an Anthropic `image` or `document` block (both share this shape). 
#[derive(Debug, Serialize)] #[serde(tag = "type", rename_all = "snake_case")] -pub(crate) enum AnthropicImageSource { +pub(crate) enum AnthropicMediaSource { Base64 { media_type: String, data: String }, Url { url: String }, } @@ -61,18 +84,108 @@ pub(crate) fn build_openai_compatible_message_content( } for media in &msg.media { - let url = media_to_url(media, provider_name)?; - parts.push(OpenAICompatibleMessagePart::ImageUrl { - image_url: OpenAICompatibleImageUrl { - url, - detail: Some("auto".to_string()), - }, - }); + if media.mime_type.starts_with("image/") { + let url = media_to_url(media, provider_name)?; + parts.push(OpenAICompatibleMessagePart::ImageUrl { + image_url: OpenAICompatibleImageUrl { + url, + detail: Some("auto".to_string()), + }, + }); + } else if media.mime_type == "application/pdf" { + parts.push(openai_compatible_pdf_part(media, provider_name)?); + } else { + return Err(unsupported_media_type(media, provider_name)); + } } Ok(OpenAICompatibleMessageContent::Parts(parts)) } +/// Build the PDF content part for an OpenAI-compatible chat completions request, +/// or a clear error for providers/sources without a documented PDF pathway. +fn openai_compatible_pdf_part( + media: &crate::backend::client::MediaFile, + provider_name: &str, +) -> Result { + // xAI's chat completions API only documents text and image content parts + // (https://docs.x.ai/docs/guides/image-understanding); there is no file or + // document part, so sending a PDF would be silently mislabeled or rejected. + if provider_name == "Grok" { + return Err(RStructorError::api_error( + provider_name, + ApiErrorKind::BadRequest { + details: "PDF attachments are not supported for Grok: the xAI chat \ + completions API only accepts text and image content parts. 
\ + Extract the PDF's text or render its pages to images and \ + attach those instead" + .to_string(), + }, + )); + } + + if let Some(data) = media.data.as_ref() { + if data.is_empty() { + return Err(RStructorError::api_error( + provider_name, + ApiErrorKind::BadRequest { + details: "MediaFile inline data cannot be empty".to_string(), + }, + )); + } + // Chat completions accept PDFs as a `file` part with base64 `file_data` + // (https://platform.openai.com/docs/guides/pdf-files). + Ok(OpenAICompatibleMessagePart::File { + file: OpenAICompatibleFile { + filename: "document.pdf".to_string(), + file_data: format!("data:{};base64,{}", media.mime_type, data), + }, + }) + } else if !media.uri.is_empty() { + // Chat completions do not accept remote file URLs (only `file_data` or an + // uploaded `file_id`); see https://platform.openai.com/docs/guides/pdf-files. + Err(RStructorError::api_error( + provider_name, + ApiErrorKind::BadRequest { + details: format!( + "{provider_name} chat completions does not support URL-based PDF \ + attachments; download the file and attach the bytes inline with \ + MediaFile::from_bytes(bytes, \"application/pdf\") instead" + ), + }, + )) + } else { + Err(RStructorError::api_error( + provider_name, + ApiErrorKind::BadRequest { + details: "MediaFile must include either inline data or uri".to_string(), + }, + )) + } +} + +/// Error for MIME types with no documented attachment pathway on this provider. 
+fn unsupported_media_type( + media: &crate::backend::client::MediaFile, + provider_name: &str, +) -> RStructorError { + let supported = if provider_name == "Grok" { + "image/*" + } else { + "image/* and application/pdf" + }; + RStructorError::api_error( + provider_name, + ApiErrorKind::BadRequest { + details: format!( + "unsupported media type {:?} for {provider_name}: only {supported} \ + attachments are supported on this provider", + media.mime_type, + ), + }, + ) +} + pub(crate) fn build_anthropic_message_content( msg: &ChatMessage, ) -> Result { @@ -88,7 +201,10 @@ pub(crate) fn build_anthropic_message_content( } for media in &msg.media { - if let Some(data) = media.data.as_ref() { + let is_image = media.mime_type.starts_with("image/"); + let is_pdf = media.mime_type == "application/pdf"; + + let source = if let Some(data) = media.data.as_ref() { if data.is_empty() { return Err(RStructorError::api_error( "Anthropic", @@ -105,18 +221,14 @@ pub(crate) fn build_anthropic_message_content( }, )); } - blocks.push(AnthropicContentBlock::Image { - source: AnthropicImageSource::Base64 { - media_type: media.mime_type.clone(), - data: data.clone(), - }, - }); + AnthropicMediaSource::Base64 { + media_type: media.mime_type.clone(), + data: data.clone(), + } } else if !media.uri.is_empty() { - blocks.push(AnthropicContentBlock::Image { - source: AnthropicImageSource::Url { - url: media.uri.clone(), - }, - }); + AnthropicMediaSource::Url { + url: media.uri.clone(), + } } else { return Err(RStructorError::api_error( "Anthropic", @@ -124,6 +236,16 @@ pub(crate) fn build_anthropic_message_content( details: "MediaFile must include either inline data or uri".to_string(), }, )); + }; + + if is_image { + blocks.push(AnthropicContentBlock::Image { source }); + } else if is_pdf { + // PDFs go in a `document` block, never an `image` block; see + // https://docs.anthropic.com/en/docs/build-with-claude/pdf-support. 
+ blocks.push(AnthropicContentBlock::Document { source }); + } else { + return Err(unsupported_media_type(media, "Anthropic")); } } @@ -211,4 +333,185 @@ mod tests { assert_eq!(json[1]["source"]["media_type"], "image/png"); assert_eq!(json[1]["source"]["data"], "YWJj"); } + + #[test] + fn test_anthropic_content_with_url_image() { + let msg = ChatMessage::user_with_media( + "describe image", + vec![MediaFile::new("https://example.com/cat.png", "image/png")], + ); + let content = build_anthropic_message_content(&msg).expect("content should build"); + let json = serde_json::to_value(&content).expect("content should serialize"); + assert_eq!(json[1]["type"], "image"); + assert_eq!( + json[1]["source"], + serde_json::json!({"type": "url", "url": "https://example.com/cat.png"}) + ); + } + + // ---- PDF routing: OpenAI ---- + + #[test] + fn test_openai_inline_pdf_becomes_file_part() { + let msg = ChatMessage::user_with_media( + "summarize", + vec![MediaFile::from_bytes(b"%PDF", "application/pdf")], + ); + let content = + build_openai_compatible_message_content(&msg, "OpenAI").expect("content should build"); + let json = serde_json::to_value(&content).expect("content should serialize"); + assert_eq!(json[0]["type"], "text"); + assert_eq!( + json[1], + serde_json::json!({ + "type": "file", + "file": { + "filename": "document.pdf", + "file_data": "data:application/pdf;base64,JVBERg==", + } + }) + ); + } + + #[test] + fn test_openai_url_pdf_is_a_clear_error() { + let msg = ChatMessage::user_with_media( + "summarize", + vec![MediaFile::new( + "https://example.com/report.pdf", + "application/pdf", + )], + ); + let err = build_openai_compatible_message_content(&msg, "OpenAI") + .expect_err("URL-based PDFs are not supported by chat completions"); + let text = err.to_string(); + assert!( + text.contains("URL-based PDF") && text.contains("MediaFile::from_bytes"), + "error should explain the fix, got: {text}" + ); + } + + #[test] + fn test_openai_non_image_non_pdf_is_a_clear_error() 
{ + let msg = ChatMessage::user_with_media( + "transcribe", + vec![MediaFile::from_bytes(b"abc", "audio/mpeg")], + ); + let err = build_openai_compatible_message_content(&msg, "OpenAI") + .expect_err("audio attachments have no chat-completions pathway"); + let text = err.to_string(); + assert!( + text.contains("unsupported media type") && text.contains("audio/mpeg"), + "error should name the offending MIME type, got: {text}" + ); + } + + // ---- PDF routing: Grok ---- + + #[test] + fn test_grok_inline_pdf_is_a_clear_error_not_an_image_url() { + let msg = ChatMessage::user_with_media( + "summarize", + vec![MediaFile::from_bytes(b"%PDF", "application/pdf")], + ); + let err = build_openai_compatible_message_content(&msg, "Grok") + .expect_err("Grok has no documented PDF pathway"); + let text = err.to_string(); + assert!( + text.contains("PDF attachments are not supported for Grok"), + "error should say PDFs are unsupported on Grok, got: {text}" + ); + } + + #[test] + fn test_grok_url_pdf_is_a_clear_error() { + let msg = ChatMessage::user_with_media( + "summarize", + vec![MediaFile::new( + "https://example.com/report.pdf", + "application/pdf", + )], + ); + let err = build_openai_compatible_message_content(&msg, "Grok") + .expect_err("Grok has no documented PDF pathway"); + assert!( + err.to_string() + .contains("PDF attachments are not supported for Grok") + ); + } + + #[test] + fn test_grok_images_still_use_image_url() { + let msg = ChatMessage::user_with_media( + "describe image", + vec![MediaFile::from_bytes(b"abc", "image/jpeg")], + ); + let content = + build_openai_compatible_message_content(&msg, "Grok").expect("content should build"); + let json = serde_json::to_value(&content).expect("content should serialize"); + assert_eq!(json[1]["type"], "image_url"); + assert_eq!(json[1]["image_url"]["url"], "data:image/jpeg;base64,YWJj"); + } + + // ---- PDF routing: Anthropic ---- + + #[test] + fn test_anthropic_inline_pdf_becomes_document_block() { + let msg = 
ChatMessage::user_with_media( + "summarize", + vec![MediaFile::from_bytes(b"%PDF", "application/pdf")], + ); + let content = build_anthropic_message_content(&msg).expect("content should build"); + let json = serde_json::to_value(&content).expect("content should serialize"); + assert_eq!(json[0]["type"], "text"); + assert_eq!( + json[1], + serde_json::json!({ + "type": "document", + "source": { + "type": "base64", + "media_type": "application/pdf", + "data": "JVBERg==", + } + }) + ); + } + + #[test] + fn test_anthropic_url_pdf_becomes_url_document_block() { + let msg = ChatMessage::user_with_media( + "summarize", + vec![MediaFile::new( + "https://example.com/report.pdf", + "application/pdf", + )], + ); + let content = build_anthropic_message_content(&msg).expect("content should build"); + let json = serde_json::to_value(&content).expect("content should serialize"); + assert_eq!( + json[1], + serde_json::json!({ + "type": "document", + "source": { + "type": "url", + "url": "https://example.com/report.pdf", + } + }) + ); + } + + #[test] + fn test_anthropic_non_image_non_pdf_is_a_clear_error() { + let msg = ChatMessage::user_with_media( + "transcribe", + vec![MediaFile::from_bytes(b"abc", "audio/mpeg")], + ); + let err = build_anthropic_message_content(&msg) + .expect_err("audio attachments have no Messages API pathway"); + let text = err.to_string(); + assert!( + text.contains("unsupported media type") && text.contains("audio/mpeg"), + "error should name the offending MIME type, got: {text}" + ); + } } diff --git a/src/backend/mock.rs b/src/backend/mock.rs index c67b986..4f9573f 100644 --- a/src/backend/mock.rs +++ b/src/backend/mock.rs @@ -153,6 +153,8 @@ pub enum RequestKind { MaterializeWithMedia, /// [`LLMClient::generate`](crate::LLMClient::generate) Generate, + /// [`LLMClient::generate_with_media`](crate::LLMClient::generate_with_media) + GenerateWithMedia, /// [`LLMClient::generate_with_metadata`](crate::LLMClient::generate_with_metadata) GenerateWithMetadata, 
/// [`LLMClient::list_models`](crate::LLMClient::list_models) @@ -185,7 +187,8 @@ pub struct RecordedRequest { pub schema: Option, /// The schema name of the target type, when known. pub schema_name: Option, - /// Media attached to the call (for `materialize_with_media`). + /// Media attached to the call (for `materialize_with_media`, + /// `generate_with_media`, and the tool loop). pub media: Vec, /// Tool names offered to the call (for the tool loop; empty otherwise). #[cfg(feature = "tools")] @@ -627,6 +630,16 @@ impl LLMClient for MockClient { } } + async fn generate_with_media(&self, prompt: &str, media: &[MediaFile]) -> Result { + let mut view = MockRequestView::bare(RequestKind::GenerateWithMedia, prompt); + view.media = media; + self.record(&view); + match self.pick_response(&view) { + MockResponse::Text(s) => Ok(s), + MockResponse::Error(e) => Err(e), + } + } + async fn generate_with_metadata(&self, prompt: &str) -> Result { let view = MockRequestView::bare(RequestKind::GenerateWithMetadata, prompt); self.record(&view); @@ -755,11 +768,13 @@ impl crate::backend::tools::ToolRunner for MockClient { &self, _system: Option<&str>, prompt: &str, + media: &[MediaFile], toolbox: &crate::backend::tools::Toolbox, _max_iterations: usize, ) -> Result { let tool_names = toolbox.tool_names(); let mut view = MockRequestView::bare(RequestKind::RunToolLoop, prompt); + view.media = media; view.tool_names = &tool_names; self.record(&view); diff --git a/src/backend/mod.rs b/src/backend/mod.rs index 326aa71..22fd82c 100644 --- a/src/backend/mod.rs +++ b/src/backend/mod.rs @@ -78,10 +78,12 @@ pub(crate) use media::{ AnthropicMessageContent, OpenAICompatibleMessageContent, build_anthropic_message_content, build_openai_compatible_message_content, }; +#[cfg(feature = "streaming")] +pub(crate) use openai_compatible::OpenAICompatibleChatMessage; #[cfg(feature = "_client")] pub(crate) use openai_compatible::{ OpenAICompatibleChatCompletionRequest, 
OpenAICompatibleChatCompletionResponse, - OpenAICompatibleChatMessage, convert_openai_compatible_chat_messages, + convert_openai_compatible_chat_messages, }; #[cfg(feature = "_client")] pub(crate) use utils::{ diff --git a/src/backend/openai.rs b/src/backend/openai.rs index 44658bb..1913841 100644 --- a/src/backend/openai.rs +++ b/src/backend/openai.rs @@ -7,11 +7,12 @@ use crate::backend::model_macro::define_model_enum; use crate::backend::{ ChatMessage, GenerateResult, LLMClient, MaterializeInternalOutput, MaterializeResult, ModelInfo, OpenAICompatibleChatCompletionRequest, OpenAICompatibleChatCompletionResponse, - OpenAICompatibleChatMessage, OpenAICompatibleMessageContent, ResponseFormat, ThinkingLevel, - TokenUsage, ValidationFailureContext, check_response_status, + ResponseFormat, ThinkingLevel, TokenUsage, ValidationFailureContext, check_response_status, convert_openai_compatible_chat_messages, generate_with_retry_with_history, handle_http_error, materialize_with_media_with_retry, parse_validate_and_create_output, prepare_strict_schema, }; +#[cfg(feature = "streaming")] +use crate::backend::{OpenAICompatibleChatMessage, OpenAICompatibleMessageContent}; use crate::error::{ApiErrorKind, RStructorError, Result}; use crate::model::Instructor; @@ -449,6 +450,110 @@ impl OpenAIClient { )) } } + + /// Internal implementation of raw text generation (no structured output). + /// + /// Accepts chat messages so that callers can attach media (images/PDFs) to + /// the user message; the same content-building path as `materialize_internal` + /// is used, so media is encoded per OpenAI's documented multimodal format. 
+ async fn generate_internal(&self, messages: &[ChatMessage]) -> Result { + info!("Generating raw text response with OpenAI"); + + // Build reasoning_effort for GPT-5.x models + let is_gpt5 = self.config.model.as_str().starts_with("gpt-5"); + let reasoning_effort = if is_gpt5 { + self.config + .thinking_level + .and_then(|level| level.openai_reasoning_effort().map(|s| s.to_string())) + } else { + None + }; + + // GPT-5.x with reasoning requires temperature=1.0 + let effective_temp = if reasoning_effort.is_some() { + 1.0 + } else { + self.config.temperature + }; + + // Build the request for text generation (no structured output) + debug!("Building OpenAI API request for text generation"); + let request = OpenAICompatibleChatCompletionRequest { + model: self.config.model.as_str().to_string(), + messages: convert_openai_compatible_chat_messages(messages, "OpenAI")?, + response_format: None, + temperature: effective_temp, + max_tokens: self.config.max_tokens, + reasoning_effort, + }; + + // Send the request to OpenAI + let base_url = self + .config + .base_url + .as_deref() + .unwrap_or("https://api.openai.com/v1"); + let url = format!("{}/chat/completions", base_url); + debug!(url = %url, "Sending request to OpenAI API"); + let response = self + .client + .post(&url) + .header("Authorization", format!("Bearer {}", self.config.api_key)) + .header("Content-Type", "application/json") + .json(&request) + .send() + .await + .map_err(|e| handle_http_error(e, "OpenAI"))?; + + // Parse the response + let response = check_response_status(response, "OpenAI").await?; + + debug!("Successfully received response from OpenAI"); + let completion: OpenAICompatibleChatCompletionResponse = + response.json().await.map_err(|e| { + error!(error = %e, "Failed to parse JSON response from OpenAI"); + e + })?; + + if completion.choices.is_empty() { + error!("OpenAI returned empty choices array"); + return Err(RStructorError::api_error( + "OpenAI", + ApiErrorKind::UnexpectedResponse { + 
details: "No completion choices returned".to_string(), + }, + )); + } + + // Extract usage info + let model_name = completion + .model + .clone() + .unwrap_or_else(|| self.config.model.as_str().to_string()); + let usage = completion + .usage + .as_ref() + .map(|u| TokenUsage::new(model_name, u.prompt_tokens, u.completion_tokens)); + + let message = &completion.choices[0].message; + trace!(finish_reason = %completion.choices[0].finish_reason, "Completion finish reason"); + + if let Some(content) = &message.content { + debug!( + content_len = content.len(), + "Successfully extracted content from response" + ); + Ok(GenerateResult::new(content.clone(), usage)) + } else { + error!("No content in OpenAI response"); + Err(RStructorError::api_error( + "OpenAI", + ApiErrorKind::UnexpectedResponse { + details: "No content in response".to_string(), + }, + )) + } + } } #[cfg(feature = "streaming")] @@ -523,6 +628,7 @@ impl crate::backend::tools::ToolRunner for OpenAIClient { &self, system: Option<&str>, prompt: &str, + media: &[super::MediaFile], toolbox: &crate::backend::tools::Toolbox, max_iterations: usize, ) -> Result { @@ -554,6 +660,7 @@ impl crate::backend::tools::ToolRunner for OpenAIClient { None, system, prompt, + media, toolbox, max_iterations, ) @@ -656,6 +763,26 @@ impl LLMClient for OpenAIClient { Ok(result.text) } + #[instrument( + name = "openai_generate_with_media", + skip(self, prompt, media), + fields( + model = %self.config.model.as_str(), + prompt_len = prompt.len(), + media_len = media.len() + ) + )] + async fn generate_with_media( + &self, + prompt: &str, + media: &[super::MediaFile], + ) -> Result { + let result = self + .generate_internal(&[ChatMessage::user_with_media(prompt, media.to_vec())]) + .await?; + Ok(result.text) + } + #[instrument( name = "openai_generate_with_metadata", skip(self, prompt), @@ -665,105 +792,7 @@ impl LLMClient for OpenAIClient { ) )] async fn generate_with_metadata(&self, prompt: &str) -> Result { - info!("Generating raw 
text response with OpenAI"); - - // Build reasoning_effort for GPT-5.x models - let is_gpt5 = self.config.model.as_str().starts_with("gpt-5"); - let reasoning_effort = if is_gpt5 { - self.config - .thinking_level - .and_then(|level| level.openai_reasoning_effort().map(|s| s.to_string())) - } else { - None - }; - - // GPT-5.x with reasoning requires temperature=1.0 - let effective_temp = if reasoning_effort.is_some() { - 1.0 - } else { - self.config.temperature - }; - - // Build the request for text generation (no structured output) - debug!("Building OpenAI API request for text generation"); - let request = OpenAICompatibleChatCompletionRequest { - model: self.config.model.as_str().to_string(), - messages: vec![OpenAICompatibleChatMessage { - role: "user".to_string(), - content: OpenAICompatibleMessageContent::Text(prompt.to_string()), - }], - response_format: None, - temperature: effective_temp, - max_tokens: self.config.max_tokens, - reasoning_effort, - }; - - // Send the request to OpenAI - let base_url = self - .config - .base_url - .as_deref() - .unwrap_or("https://api.openai.com/v1"); - let url = format!("{}/chat/completions", base_url); - debug!(url = %url, "Sending request to OpenAI API"); - let response = self - .client - .post(&url) - .header("Authorization", format!("Bearer {}", self.config.api_key)) - .header("Content-Type", "application/json") - .json(&request) - .send() - .await - .map_err(|e| handle_http_error(e, "OpenAI"))?; - - // Parse the response - let response = check_response_status(response, "OpenAI").await?; - - debug!("Successfully received response from OpenAI"); - let completion: OpenAICompatibleChatCompletionResponse = - response.json().await.map_err(|e| { - error!(error = %e, "Failed to parse JSON response from OpenAI"); - e - })?; - - if completion.choices.is_empty() { - error!("OpenAI returned empty choices array"); - return Err(RStructorError::api_error( - "OpenAI", - ApiErrorKind::UnexpectedResponse { - details: "No completion 
choices returned".to_string(), - }, - )); - } - - // Extract usage info - let model_name = completion - .model - .clone() - .unwrap_or_else(|| self.config.model.as_str().to_string()); - let usage = completion - .usage - .as_ref() - .map(|u| TokenUsage::new(model_name, u.prompt_tokens, u.completion_tokens)); - - let message = &completion.choices[0].message; - trace!(finish_reason = %completion.choices[0].finish_reason, "Completion finish reason"); - - if let Some(content) = &message.content { - debug!( - content_len = content.len(), - "Successfully extracted content from response" - ); - Ok(GenerateResult::new(content.clone(), usage)) - } else { - error!("No content in OpenAI response"); - Err(RStructorError::api_error( - "OpenAI", - ApiErrorKind::UnexpectedResponse { - details: "No content in response".to_string(), - }, - )) - } + self.generate_internal(&[ChatMessage::user(prompt)]).await } #[cfg(feature = "streaming")] diff --git a/src/backend/request.rs b/src/backend/request.rs index 90cc674..65d6723 100644 --- a/src/backend/request.rs +++ b/src/backend/request.rs @@ -57,7 +57,8 @@ impl<'a, C: ?Sized> Request<'a, C> { self } - /// Attach media (images) to the request (used by `materialize`). + /// Attach media (images, or PDFs where the provider supports them) to the + /// request. Used by `materialize`, `generate`, and `run`. #[must_use] pub fn media(mut self, media: impl Into>) -> Self { self.media = media.into(); @@ -106,9 +107,14 @@ impl Request<'_, C> { } } - /// Generate raw text, applying any attached system context. + /// Generate raw text, applying any attached system context and media. 
pub async fn generate(self, prompt: &str) -> Result { - self.client.generate(&self.combined(prompt)).await + let prompt = self.combined(prompt); + if self.media.is_empty() { + self.client.generate(&prompt).await + } else { + self.client.generate_with_media(&prompt, &self.media).await + } } } @@ -170,16 +176,30 @@ impl<'a, C: LLMClient + Sync + ?Sized> Request<'a, C> { #[cfg(feature = "tools")] impl Request<'_, C> { /// Get a text answer, letting the model call attached tools (if any) in a loop - /// until it produces a final response. With no tools attached this is - /// equivalent to [`generate`](Self::generate). + /// until it produces a final response. Attached media is included in the + /// initial user turn. With no tools attached this is equivalent to + /// [`generate`](Self::generate). pub async fn run(self, prompt: &str) -> Result { match self.tools { Some(toolbox) => { self.client - .run_tool_loop(self.system.as_deref(), prompt, toolbox, self.max_iterations) + .run_tool_loop( + self.system.as_deref(), + prompt, + &self.media, + toolbox, + self.max_iterations, + ) .await } - None => self.client.generate(&self.combined(prompt)).await, + None => { + let prompt = self.combined(prompt); + if self.media.is_empty() { + self.client.generate(&prompt).await + } else { + self.client.generate_with_media(&prompt, &self.media).await + } + } } } } @@ -199,7 +219,8 @@ pub trait RequestExt: LLMClient { Request::new(self).system(system) } - /// Start a request with attached media (images). + /// Start a request with attached media (images, or PDFs where the provider + /// supports them). 
fn with_media<'a>(&'a self, media: &'a [MediaFile]) -> Request<'a, Self> { Request::new(self).media(media.to_vec()) } diff --git a/src/backend/tools.rs b/src/backend/tools.rs index afdfe74..e640096 100644 --- a/src/backend/tools.rs +++ b/src/backend/tools.rs @@ -264,6 +264,7 @@ pub(crate) async fn run_openai_compatible_tools( reasoning_effort: Option, system: Option<&str>, prompt: &str, + media: &[crate::backend::MediaFile], toolbox: &Toolbox, max_iterations: usize, ) -> Result { @@ -276,7 +277,12 @@ pub(crate) async fn run_openai_compatible_tools( if let Some(system) = system { messages.push(json!({ "role": "system", "content": system })); } - messages.push(json!({ "role": "user", "content": prompt })); + // Encode any attached media with the same content builder as materialize, so + // images/PDFs are carried (or rejected with a clear error) per provider rules. + let user_msg = crate::backend::ChatMessage::user_with_media(prompt, media.to_vec()); + let user_content = + crate::backend::build_openai_compatible_message_content(&user_msg, provider)?; + messages.push(json!({ "role": "user", "content": user_content })); for iteration in 0..max_iterations { let mut body = json!({ @@ -396,6 +402,7 @@ pub(crate) async fn run_anthropic_tools( max_tokens: u32, system: Option<&str>, prompt: &str, + media: &[crate::backend::MediaFile], toolbox: &Toolbox, max_iterations: usize, ) -> Result { @@ -405,7 +412,11 @@ pub(crate) async fn run_anthropic_tools( let tools_json = toolbox.anthropic_tools_json(); let url = format!("{base_url}/messages"); - let mut messages: Vec = vec![json!({ "role": "user", "content": prompt })]; + // Encode any attached media with the same content builder as materialize, so + // images/PDFs are carried (or rejected with a clear error) per provider rules. 
+ let user_msg = crate::backend::ChatMessage::user_with_media(prompt, media.to_vec()); + let user_content = crate::backend::build_anthropic_message_content(&user_msg)?; + let mut messages: Vec = vec![json!({ "role": "user", "content": user_content })]; for _ in 0..max_iterations { let mut body = json!({ @@ -495,6 +506,7 @@ pub(crate) async fn run_gemini_tools( max_tokens: Option, system: Option<&str>, prompt: &str, + media: &[crate::backend::MediaFile], toolbox: &Toolbox, max_iterations: usize, ) -> Result { @@ -504,7 +516,21 @@ pub(crate) async fn run_gemini_tools( let tools_json = toolbox.gemini_tools_json(); let url = format!("{base_url}/models/{model}:generateContent"); - let mut contents: Vec = vec![json!({ "role": "user", "parts": [{ "text": prompt }] })]; + // Attach any media to the initial user turn, mirroring the materialize path: + // inline base64 data becomes `inlineData`, URI references become `fileData`. + let mut user_parts: Vec = vec![json!({ "text": prompt })]; + for m in media { + if let Some(data) = m.data.as_ref() { + user_parts.push(json!({ + "inlineData": { "mimeType": m.mime_type, "data": data } + })); + } else { + user_parts.push(json!({ + "fileData": { "mimeType": m.mime_type, "fileUri": m.uri } + })); + } + } + let mut contents: Vec = vec![json!({ "role": "user", "parts": user_parts })]; for _ in 0..max_iterations { let mut generation_config = json!({ "temperature": temperature }); @@ -585,7 +611,8 @@ pub(crate) async fn run_gemini_tools( /// /// Implemented for each provider client and driven by the fluent /// [`Request`](crate::Request) builder (`client.with_tools(..).run(..)`); not -/// called directly. +/// called directly. `media` carries any attachments from +/// [`Request::media`](crate::Request::media), included in the initial user turn. 
#[doc(hidden)] #[async_trait] pub trait ToolRunner { @@ -593,6 +620,7 @@ pub trait ToolRunner { &self, system: Option<&str>, prompt: &str, + media: &[crate::backend::MediaFile], toolbox: &Toolbox, max_iterations: usize, ) -> Result; diff --git a/tests/http_mock_tests.rs b/tests/http_mock_tests.rs index 7050724..4dfc331 100644 --- a/tests/http_mock_tests.rs +++ b/tests/http_mock_tests.rs @@ -275,6 +275,115 @@ async fn generate_null_content_is_unexpected_response() { m.assert_async().await; } +// --------------------------------------------------------------------------- +// generate / run carry attached media in the request body (offline_mockito) +// --------------------------------------------------------------------------- + +/// `with_media(..).generate(..)` must include the attached image as an +/// `image_url` content part in the serialized request body — media used to be +/// silently dropped on the plain-text generation path. +#[tokio::test] +async fn generate_request_body_carries_attached_image() { + use rstructor::{MediaFile, RequestExt}; + + let mut server = mockito::Server::new_async().await; + let m = server + .mock("POST", "/chat/completions") + .match_body(mockito::Matcher::PartialJson(json!({ + "messages": [{ + "role": "user", + "content": [ + { "type": "text", "text": "describe" }, + { + "type": "image_url", + "image_url": { "url": "data:image/png;base64,YWJj", "detail": "auto" }, + }, + ], + }], + }))) + .with_status(200) + .with_body(chat_completion("a red square")) + .expect(1) + .create_async() + .await; + + let media = [MediaFile::from_bytes(b"abc", "image/png")]; + let text = client(&server) + .with_media(&media) + .generate("describe") + .await + .unwrap(); + assert_eq!(text, "a red square"); + m.assert_async().await; +} + +/// `generate_with_media` with an inline PDF must encode it as the documented +/// OpenAI `file` content part (`filename` + base64 `file_data`), not `image_url`. 
+#[tokio::test] +async fn generate_request_body_carries_attached_pdf_as_file_part() { + use rstructor::MediaFile; + + let mut server = mockito::Server::new_async().await; + let m = server + .mock("POST", "/chat/completions") + .match_body(mockito::Matcher::PartialJson(json!({ + "messages": [{ + "role": "user", + "content": [ + { "type": "text", "text": "summarize" }, + { + "type": "file", + "file": { + "filename": "document.pdf", + "file_data": "data:application/pdf;base64,JVBERg==", + }, + }, + ], + }], + }))) + .with_status(200) + .with_body(chat_completion("a summary")) + .expect(1) + .create_async() + .await; + + let media = [MediaFile::from_bytes(b"%PDF", "application/pdf")]; + let text = client(&server) + .generate_with_media("summarize", &media) + .await + .unwrap(); + assert_eq!(text, "a summary"); + m.assert_async().await; +} + +/// A URL-based PDF has no chat-completions pathway: `generate_with_media` must +/// fail with a clear error *before* any HTTP request is made. +#[tokio::test] +async fn generate_with_url_pdf_errors_without_sending_request() { + use rstructor::MediaFile; + + let mut server = mockito::Server::new_async().await; + let m = server + .mock("POST", "/chat/completions") + .expect(0) // the request must never reach the server + .create_async() + .await; + + let media = [MediaFile::new( + "https://example.com/report.pdf", + "application/pdf", + )]; + let err = client(&server) + .generate_with_media("summarize", &media) + .await + .unwrap_err(); + assert!( + err.to_string().contains("URL-based PDF"), + "expected a clear URL-PDF error, got: {err}" + ); + m.assert_async().await; +} + // --------------------------------------------------------------------------- // reasoning_effort + temperature override per model (offline_mockito) // --------------------------------------------------------------------------- @@ -788,3 +897,48 @@ async fn tool_loop_exhaustion_errors() { ); always_tool.assert_async().await; } + +/// 
`with_tools(..).media(..).run(..)` must include the attached media in the +/// initial user turn of the tool loop's request body — media used to be +/// silently dropped on the `run` path. +#[cfg(feature = "tools")] +#[tokio::test] +async fn tool_run_request_body_carries_attached_media() { + use rstructor::{MediaFile, RequestExt, Toolbox}; + use std::sync::Arc; + + let mut server = mockito::Server::new_async().await; + let m = server + .mock("POST", "/chat/completions") + .match_body(mockito::Matcher::PartialJson(json!({ + "messages": [{ + "role": "user", + "content": [ + { "type": "text", "text": "what is in the image?" }, + { + "type": "image_url", + "image_url": { "url": "data:image/png;base64,YWJj", "detail": "auto" }, + }, + ], + }], + }))) + .with_status(200) + .with_body(chat_completion("a red square")) + .expect(1) + .create_async() + .await; + + let invoked = Arc::new(std::sync::atomic::AtomicBool::new(false)); + let toolbox = Toolbox::new().with(recording_add_tool(invoked.clone())); + let media = [MediaFile::from_bytes(b"abc", "image/png")]; + + let answer = client(&server) + .with_tools(&toolbox) + .media(media.to_vec()) + .run("what is in the image?") + .await + .unwrap(); + + assert_eq!(answer, "a red square"); + m.assert_async().await; +} diff --git a/tests/mock_edge_tests.rs b/tests/mock_edge_tests.rs index 8dd0501..d27ca44 100644 --- a/tests/mock_edge_tests.rs +++ b/tests/mock_edge_tests.rs @@ -347,6 +347,25 @@ mod builder { assert_eq!(req.kind, RequestKind::Materialize); assert_eq!(req.prompt, "B\n\nhi"); } + + /// `with_media(..).generate(..)` routes through `generate_with_media`, + /// carrying the attached media instead of silently dropping it. 
+ #[tokio::test] + async fn with_media_generate_routes_to_generate_with_media() { + let client = MockClient::new().with_response("a caption"); + let media = [MediaFile::new("u", "image/png")]; + let out = client + .with_media(&media) + .generate("describe") + .await + .unwrap(); + assert_eq!(out, "a caption"); + let req = client.last_request().unwrap(); + assert_eq!(req.kind, RequestKind::GenerateWithMedia); + assert_eq!(req.prompt, "describe"); + assert_eq!(req.media.len(), 1); + assert_eq!(req.media[0].mime_type, "image/png"); + } } // --------------------------------------------------------------------------- @@ -373,4 +392,46 @@ mod tools { "no tools were attached, so no tool loop should have run" ); } + + /// `run` with NO tools but WITH media falls back to `generate_with_media`, + /// carrying the attached media instead of silently dropping it. + #[tokio::test] + async fn run_with_no_tools_and_media_falls_back_to_generate_with_media() { + let client = MockClient::new().with_response("answer"); + let media = [rstructor::MediaFile::new("u", "image/png")]; + let out = client.with_media(&media).run("hi").await.unwrap(); + assert_eq!(out, "answer"); + let req = client.last_request().unwrap(); + assert_eq!(req.kind, RequestKind::GenerateWithMedia); + assert_eq!(req.media.len(), 1); + } + + /// `run` WITH tools forwards attached media into the tool loop's request. 
+ #[tokio::test] + async fn run_with_tools_carries_media_into_tool_loop() { + use rstructor::{FnTool, Instructor, Toolbox}; + use serde::{Deserialize, Serialize}; + + #[derive(Instructor, Serialize, Deserialize)] + struct EchoArgs { + value: String, + } + + let toolbox = Toolbox::new().with(FnTool::new("echo", "Echo", |args: EchoArgs| { + std::future::ready(Ok(serde_json::json!(args.value))) + })); + let client = MockClient::new().with_response("done"); + let media = [rstructor::MediaFile::new("u", "image/png")]; + let out = client + .with_tools(&toolbox) + .media(media.to_vec()) + .run("hi") + .await + .unwrap(); + assert_eq!(out, "done"); + let req = client.last_request().unwrap(); + assert_eq!(req.kind, RequestKind::RunToolLoop); + assert_eq!(req.media.len(), 1, "media must reach the tool loop"); + assert_eq!(req.tool_names, vec!["echo"]); + } }