Skip to content

Commit 50bedb1

Browse files
Merge pull request #180 from jamesrochabrun/feature/mcp-and-image-support
Add MCP server support and image input to Realtime API
2 parents b56cb26 + 8098d51 commit 50bedb1

File tree

4 files changed

+93
-11
lines changed

4 files changed

+93
-11
lines changed

Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,44 @@ open class OpenAIRealtimeSession {
211211
continuation?.yield(.inputAudioTranscriptionCompleted(transcript))
212212
}
213213

214+
// MCP (Model Context Protocol) message types
215+
case "mcp_list_tools.in_progress":
216+
logger.debug("MCP: Tool discovery in progress")
217+
continuation?.yield(.mcpListToolsInProgress)
218+
219+
case "mcp_list_tools.completed":
220+
logger.debug("MCP: Tool discovery completed")
221+
if let tools = json["tools"] as? [String: Any] {
222+
continuation?.yield(.mcpListToolsCompleted(tools))
223+
} else {
224+
continuation?.yield(.mcpListToolsCompleted([:]))
225+
}
226+
227+
case "mcp_list_tools.failed":
228+
logger.error("MCP: Tool discovery failed")
229+
logger.error("Full JSON payload: \(String(describing: json))")
230+
231+
let errorDetails = json["error"] as? [String: Any]
232+
let errorMessage = errorDetails?["message"] as? String
233+
let errorCode = errorDetails?["code"] as? String
234+
235+
// Also check for top-level error fields
236+
let topLevelMessage = json["message"] as? String
237+
let topLevelCode = json["code"] as? String
238+
let topLevelReason = json["reason"] as? String
239+
240+
let finalMessage = errorMessage ?? topLevelMessage ?? topLevelReason ?? "Unknown MCP error"
241+
let finalCode = errorCode ?? topLevelCode
242+
let fullError = finalCode != nil ? "[\(finalCode!)] \(finalMessage)" : finalMessage
243+
244+
logger.error("MCP Error: \(fullError)")
245+
logger.error("Error details: \(String(describing: errorDetails))")
246+
logger
247+
.error(
248+
"Top-level fields: message=\(String(describing: topLevelMessage)), code=\(String(describing: topLevelCode)), reason=\(String(describing: topLevelReason))")
249+
250+
continuation?.yield(.mcpListToolsFailed(fullError))
251+
214252
default:
215253
// Log unhandled message types for debugging
216254
logger.debug("Unhandled message type: \(messageType)")

Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeConversationItemCreate.swift

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,20 +30,40 @@ extension OpenAIRealtimeConversationItemCreate {
3030

3131
/// Convenience initializer for a plain-text conversation item.
/// - Parameters:
///   - role: The speaker role for this item (e.g. "user").
///   - text: The message text; wrapped as a single `.text` content part.
public init(role: String, text: String) {
  self.role = role
  self.content = [.text(text)]
}

/// Designated initializer accepting arbitrary content parts
/// (text and/or image), enabling multimodal conversation items.
/// - Parameters:
///   - role: The speaker role for this item.
///   - content: The ordered content parts composing the item.
public init(role: String, content: [Content]) {
  self.role = role
  self.content = content
}
3540
}
3641
}
3742

3843
// MARK: - OpenAIRealtimeConversationItemCreate.Item.Content
3944

4045
extension OpenAIRealtimeConversationItemCreate.Item {
  /// A single content part of a conversation item: either input text or an
  /// input image supplied as a base64 data URL
  /// ("data:image/{format};base64,{bytes}").
  ///
  /// Encodes to the Realtime API wire shape: `{"type": "input_text",
  /// "text": ...}` or `{"type": "input_image", "image_url": ...}`.
  ///
  /// `Sendable` is added for consistency with the other Realtime types in
  /// this package (e.g. `OpenAIRealtimeSessionConfiguration`); both cases
  /// carry only a `String`, so the conformance is trivially satisfied.
  public enum Content: Encodable, Sendable {
    case text(String)
    case image(String) // base64 data URL: "data:image/{format};base64,{bytes}"

    public func encode(to encoder: Encoder) throws {
      var container = encoder.container(keyedBy: CodingKeys.self)
      switch self {
      case .text(let text):
        try container.encode("input_text", forKey: .type)
        try container.encode(text, forKey: .text)

      case .image(let imageUrl):
        try container.encode("input_image", forKey: .type)
        // Serialized as "image_url" via CodingKeys to match the API schema.
        try container.encode(imageUrl, forKey: .imageUrl)
      }
    }

    private enum CodingKeys: String, CodingKey {
      case type
      case text
      case imageUrl = "image_url"
    }
  }
}

Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable {
2121
outputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil,
2222
speed: Float? = 1.0,
2323
temperature: Double? = nil,
24-
tools: [OpenAIRealtimeSessionConfiguration.Tool]? = nil,
24+
tools: [OpenAIRealtimeSessionConfiguration.RealtimeTool]? = nil,
2525
toolChoice: OpenAIRealtimeSessionConfiguration.ToolChoice? = nil,
2626
turnDetection: OpenAIRealtimeSessionConfiguration.TurnDetection? = nil,
2727
voice: String? = nil)
@@ -130,8 +130,8 @@ public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable {
130130
/// Sampling temperature for the model.
131131
public let temperature: Double?
132132

133-
/// Tools (functions) available to the model.
134-
public let tools: [Tool]?
133+
/// Tools (functions and MCP servers) available to the model.
134+
public let tools: [RealtimeTool]?
135135

136136
/// How the model chooses tools. Options are "auto", "none", "required", or specify a function.
137137
public let toolChoice: ToolChoice?
@@ -191,10 +191,10 @@ extension OpenAIRealtimeSessionConfiguration {
191191
}
192192
}
193193

194-
// MARK: OpenAIRealtimeSessionConfiguration.Tool
194+
// MARK: OpenAIRealtimeSessionConfiguration.FunctionTool
195195

196196
extension OpenAIRealtimeSessionConfiguration {
197-
public struct Tool: Encodable, Sendable {
197+
public struct FunctionTool: Encodable, Sendable {
198198
/// The description of the function
199199
public let description: String
200200

@@ -215,6 +215,25 @@ extension OpenAIRealtimeSessionConfiguration {
215215
}
216216
}
217217

218+
// MARK: OpenAIRealtimeSessionConfiguration.RealtimeTool
219+
220+
extension OpenAIRealtimeSessionConfiguration {
  /// A tool available to the Realtime model: either a locally defined
  /// function or an MCP (Model Context Protocol) server.
  ///
  /// Encoding is transparent — the wrapped tool encodes directly into the
  /// enclosing encoder with no extra nesting, so each array element in the
  /// session's `tools` field is the tool object itself.
  // NOTE(review): `Tool.MCPTool` references a `Tool` namespace declared
  // elsewhere; confirm it still exists after the `Tool` -> `FunctionTool`
  // rename in this change.
  public enum RealtimeTool: Encodable, Sendable {
    case function(FunctionTool)
    case mcp(Tool.MCPTool)

    public func encode(to encoder: Encoder) throws {
      switch self {
      case .function(let functionTool):
        try functionTool.encode(to: encoder)

      case .mcp(let serverTool):
        try serverTool.encode(to: encoder)
      }
    }
  }
}
236+
218237
// MARK: OpenAIRealtimeSessionConfiguration.TurnDetection
219238

220239
extension OpenAIRealtimeSessionConfiguration {

Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,9 @@ public enum OpenAIRealtimeMessage: Sendable {
2121
case inputAudioBufferTranscript(String) // "input_audio_buffer.transcript"
2222
case inputAudioTranscriptionDelta(String) // "conversation.item.input_audio_transcription.delta"
2323
case inputAudioTranscriptionCompleted(String) // "conversation.item.input_audio_transcription.completed"
24+
25+
// MCP (Model Context Protocol) messages
26+
case mcpListToolsInProgress // "mcp_list_tools.in_progress"
27+
case mcpListToolsCompleted([String: Any]) // "mcp_list_tools.completed" with tools data
28+
case mcpListToolsFailed(String?) // "mcp_list_tools.failed" with error details
2429
}

0 commit comments

Comments (0)