Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ impl SchemaType for SecurityId {
}
```

## Multimodal (Image & PDF Input)

Analyze images with structured extraction across all major providers by
attaching media to a request with `with_media`:
Expand Down Expand Up @@ -329,7 +329,14 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {

`MediaFile::new(uri, mime_type)` is also available for URL/URI-based media input.
The lower-level `LLMClient::materialize_with_media(prompt, &media)` method does
the same thing in one call when you do not need the builder. Attached media is
honored by `materialize`, `generate`, and tool `run` alike.

PDFs are supported too: pass `"application/pdf"` as the MIME type and the
attachment is routed to each provider's documented document format (OpenAI
`file` part, Anthropic `document` block, Gemini `inlineData`/`fileData`).
Combinations a provider does not support — PDFs on Grok, or URL-based PDFs on
OpenAI chat completions — return a clear error instead of a broken request.

Provider examples:
- `cargo run --example openai_multimodal_example --features openai`
Expand Down
249 changes: 144 additions & 105 deletions src/backend/anthropic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,127 @@ impl AnthropicClient {
trace!(json = %raw_response, "Parsing structured output response");
parse_validate_and_create_output(raw_response, usage)
}

/// Internal implementation of raw text generation (no structured output).
///
/// Accepts chat messages so that callers can attach media (images/PDFs) to
/// the user message; the same content-building path as `materialize_internal`
/// is used, so media is encoded per Anthropic's documented block format.
/// Shared worker for raw (non-structured) text generation.
///
/// Takes complete chat messages instead of a bare prompt so that callers can
/// attach media to a message; each message's content is produced by
/// `build_anthropic_message_content`.
///
/// # Errors
///
/// Fails when message content cannot be built, when the HTTP request or the
/// response status check fails, when the response body cannot be parsed, or
/// when the response contains no text.
async fn generate_internal(&self, messages: &[ChatMessage]) -> Result<GenerateResult> {
    info!("Generating raw text response with Anthropic");

    // Thinking is only configured for models whose id mentions sonnet-4 / opus-4.
    let model_id = self.config.model.as_str();
    let is_thinking_model = model_id.contains("sonnet-4") || model_id.contains("opus-4");
    let thinking_config = match self.config.thinking_level {
        Some(level) if is_thinking_model && level.claude_thinking_enabled() => {
            Some(ClaudeThinkingConfig {
                thinking_type: "enabled".to_string(),
                budget_tokens: level.claude_budget_tokens(),
            })
        }
        _ => None,
    };

    // With thinking enabled Claude requires temperature=1; otherwise use the
    // configured temperature.
    let effective_temp = match thinking_config {
        Some(_) => 1.0,
        None => self.config.temperature,
    };

    // Convert every chat message (including any attached media) to the API shape.
    let mut api_messages: Vec<AnthropicMessage> = Vec::with_capacity(messages.len());
    for msg in messages {
        api_messages.push(AnthropicMessage {
            role: msg.role.as_str().to_string(),
            content: build_anthropic_message_content(msg)?,
        });
    }

    debug!("Building Anthropic API request for text generation");
    // Computed before `thinking_config` is moved into the request below.
    let max_tokens = effective_max_tokens(self.config.max_tokens, thinking_config.as_ref());
    let request = CompletionRequest {
        model: self.config.model.as_str().to_string(),
        messages: api_messages,
        temperature: effective_temp,
        max_tokens,
        thinking: thinking_config,
        // Raw text generation does not use structured outputs.
        output_format: None,
    };

    debug!(
        model = %self.config.model.as_str(),
        max_tokens = request.max_tokens,
        "Sending request to Anthropic API"
    );
    // A configured base URL overrides the public Anthropic endpoint.
    let url = format!(
        "{}/messages",
        self.config
            .base_url
            .as_deref()
            .unwrap_or("https://api.anthropic.com/v1")
    );
    debug!(url = %url, "Using Anthropic API endpoint");
    let response = self
        .client
        .post(&url)
        .header("x-api-key", &self.config.api_key)
        .header("anthropic-version", "2023-06-01")
        .header("Content-Type", "application/json")
        .json(&request)
        .send()
        .await
        .map_err(|e| handle_http_error(e, "Anthropic"))?;

    let response = check_response_status(response, "Anthropic").await?;

    debug!("Successfully received response from Anthropic");
    let completion: CompletionResponse = match response.json().await {
        Ok(parsed) => parsed,
        Err(e) => {
            error!(error = %e, "Failed to parse JSON response from Anthropic");
            return Err(e.into());
        }
    };

    // Prefer the model name reported by the API; fall back to the configured one.
    let model_name = match completion.model.clone() {
        Some(reported) => reported,
        None => self.config.model.as_str().to_string(),
    };
    let usage = completion
        .usage
        .as_ref()
        .map(|u| TokenUsage::new(model_name, u.input_tokens, u.output_tokens));

    // Concatenate the text of every "text" block, in order, with no separator.
    debug!("Extracting text content from response blocks");
    let mut content = String::new();
    for block in completion
        .content
        .iter()
        .filter(|block| block.block_type == "text")
    {
        content.push_str(&block.text);
    }

    if content.is_empty() {
        error!("No text content in Anthropic response");
        return Err(RStructorError::api_error(
            "Anthropic",
            ApiErrorKind::UnexpectedResponse {
                details: "No text content in response".to_string(),
            },
        ));
    }

    debug!(
        content_len = content.len(),
        "Successfully extracted text content"
    );
    Ok(GenerateResult::new(content, usage))
}
}

// Generate builder methods using macro
Expand Down Expand Up @@ -565,6 +686,7 @@ impl crate::backend::tools::ToolRunner for AnthropicClient {
&self,
system: Option<&str>,
prompt: &str,
media: &[super::MediaFile],
toolbox: &crate::backend::tools::Toolbox,
max_iterations: usize,
) -> Result<String> {
Expand All @@ -584,6 +706,7 @@ impl crate::backend::tools::ToolRunner for AnthropicClient {
.unwrap_or(DEFAULT_ANTHROPIC_MAX_TOKENS),
system,
prompt,
media,
toolbox,
max_iterations,
)
Expand Down Expand Up @@ -686,6 +809,26 @@ impl LLMClient for AnthropicClient {
Ok(result.text)
}

/// Raw text generation for `prompt` with `media` attached to the user message.
///
/// Builds one user message carrying a copy of the media slice, hands it to
/// `generate_internal`, and returns only the generated text.
#[instrument(
    name = "anthropic_generate_with_media",
    skip(self, prompt, media),
    fields(
        model = %self.config.model.as_str(),
        prompt_len = prompt.len(),
        media_len = media.len()
    )
)]
async fn generate_with_media(
    &self,
    prompt: &str,
    media: &[super::MediaFile],
) -> Result<String> {
    let user_message = ChatMessage::user_with_media(prompt, media.to_vec());
    self.generate_internal(&[user_message])
        .await
        .map(|generated| generated.text)
}

#[instrument(
name = "anthropic_generate_with_metadata",
skip(self, prompt),
Expand All @@ -695,111 +838,7 @@ impl LLMClient for AnthropicClient {
)
)]
/// Raw text generation for `prompt`, returning the full `GenerateResult`.
///
/// The request/response handling lives in `generate_internal`; this method
/// only wraps the prompt with `ChatMessage::user` and delegates, so the
/// plain-prompt and media-carrying entry points share one implementation
/// instead of carrying a second copy of the request pipeline.
///
/// # Errors
///
/// Propagates any error returned by `generate_internal`.
async fn generate_with_metadata(&self, prompt: &str) -> Result<GenerateResult> {
    self.generate_internal(&[ChatMessage::user(prompt)]).await
}

#[cfg(feature = "streaming")]
Expand Down
4 changes: 4 additions & 0 deletions src/backend/any_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,10 @@ impl LLMClient for AnyClient {
dispatch!(self, c => c.generate(prompt).await)
}

// Forwards `prompt` and `media` unchanged to `generate_with_media` on the
// client that `dispatch!` binds as `c`, returning that call's result as-is.
// NOTE(review): `dispatch!` is defined outside this view; presumably it
// selects the active `AnyClient` variant — confirm against the macro.
async fn generate_with_media(&self, prompt: &str, media: &[MediaFile]) -> Result<String> {
dispatch!(self, c => c.generate_with_media(prompt, media).await)
}

// Forwards `prompt` unchanged to `generate_with_metadata` on the client that
// `dispatch!` binds as `c`, returning that call's `GenerateResult` as-is.
// NOTE(review): `dispatch!` is defined outside this view; presumably it
// selects the active `AnyClient` variant — confirm against the macro.
async fn generate_with_metadata(&self, prompt: &str) -> Result<GenerateResult> {
dispatch!(self, c => c.generate_with_metadata(prompt).await)
}
Expand Down
50 changes: 50 additions & 0 deletions src/backend/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ use crate::model::Instructor;
/// Created with [`MediaFile::from_bytes`]. This is useful for public images
/// downloaded over HTTPS.
///
/// The `mime_type` decides how each provider encodes the attachment: `image/*`
/// is sent in the provider's image format, and `application/pdf` is routed to
/// the provider's document/file format (OpenAI `file` part for inline data,
/// Anthropic `document` block, Gemini `inlineData`/`fileData`). Combinations a
/// provider does not document — e.g. any PDF on Grok, or a URL-based PDF on
/// OpenAI — produce a clear error instead of a silently broken request.
///
/// # Examples
///
/// ```no_run
Expand All @@ -30,6 +37,10 @@ use crate::model::Instructor;
/// // Inline data from bytes
/// let image_bytes = std::fs::read("photo.png").unwrap();
/// let media = MediaFile::from_bytes(&image_bytes, "image/png");
///
/// // Inline PDF (OpenAI, Anthropic, and Gemini)
/// let pdf_bytes = std::fs::read("report.pdf").unwrap();
/// let media = MediaFile::from_bytes(&pdf_bytes, "application/pdf");
/// ```
#[derive(Debug, Clone)]
pub struct MediaFile {
Expand Down Expand Up @@ -282,6 +293,45 @@ pub trait LLMClient {
/// ```
async fn generate(&self, prompt: &str) -> Result<String>;

/// Raw text completion with media attachments (if supported).
///
/// Like [`generate`](Self::generate), but the prompt is sent together with
/// `media` (images, or PDFs where the provider supports them), encoded in the
/// provider's documented multimodal format.
///
/// The default implementation forwards to [`generate`](Self::generate) when
/// no media is provided, and otherwise returns
/// [`RStructorError::Unsupported`](crate::RStructorError::Unsupported) so that
/// media is never silently dropped. Providers with media support override this
/// method. All four built-in clients (OpenAI, Anthropic, Grok, Gemini) support
/// media here; PDF support varies by provider (Grok, for example, accepts only
/// images and returns a clear error for PDFs).
///
/// # Example
///
/// ```no_run
/// # use rstructor::{LLMClient, OpenAIClient, MediaFile};
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let client = OpenAIClient::from_env()?;
/// let pdf_bytes = std::fs::read("report.pdf")?;
/// let media = [MediaFile::from_bytes(&pdf_bytes, "application/pdf")];
/// let summary = client
/// .generate_with_media("Summarize this report", &media)
/// .await?;
/// println!("{summary}");
/// # Ok(())
/// # }
/// ```
async fn generate_with_media(&self, prompt: &str, media: &[MediaFile]) -> Result<String> {
if media.is_empty() {
self.generate(prompt).await
} else {
Err(crate::error::RStructorError::Unsupported(
"this client does not support media inputs".to_string(),
))
}
}

/// Raw completion with metadata (token usage).
///
/// Like [`generate`](Self::generate), but returns a [`GenerateResult`]
Expand Down
Loading
Loading