generated from deepgram/oss-repo-template
-
Notifications
You must be signed in to change notification settings - Fork 63
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
753be7e
commit f2f6e98
Showing
5 changed files
with
342 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
/**
 * Names of the events surfaced by the agent live client.
 *
 * Every member's value is identical to its name, so a server message's
 * `type` string can be compared directly against these members.
 */
export enum AgentEvents {
  /**
   * Built in socket events.
   */
  Open = "Open",
  Close = "Close",
  Error = "Error",

  /**
   * Message events: JSON messages of the form `{ type: string }`, emitted
   * under the name carried in their `type` field.
   */
  Welcome = "Welcome",
  ConversationText = "ConversationText",
  UserStartedSpeaking = "UserStartedSpeaking",
  AgentThinking = "AgentThinking",
  FunctionCalling = "FunctionCalling",
  AgentStartedSpeaking = "AgentStartedSpeaking",
  AgentAudioDone = "AgentAudioDone",
  InjectionRefused = "InjectionRefused",

  /**
   * Catch all for any other message event
   */
  Unhandled = "Unhandled",
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,190 @@ | ||
// TODO: We could probably use this elsewhere?
/**
 * Audio format combinations, discriminated by `encoding`.
 *
 * Each variant only exposes the fields that are valid for its encoding:
 * `linear16`, `mulaw` and `alaw` take a container and sample rate, `flac`
 * takes only a sample rate, `mp3` and `aac` take only a bitrate, and `opus`
 * takes an `ogg` container and a bitrate.
 */
type AudioFormat =
  | {
      encoding: "linear16";
      container: "wav" | "none";
      sampleRate: 8000 | 16000 | 24000 | 32000 | 48000;
    }
  | {
      encoding: "mulaw";
      container: "wav" | "none";
      sampleRate: 8000 | 16000;
    }
  | {
      encoding: "alaw";
      container: "wav" | "none";
      sampleRate: 8000 | 16000;
    }
  | {
      encoding: "mp3";
      bitrate: 32000 | 48000;
    }
  | {
      encoding: "opus";
      container: "ogg";
      /**
       * Must be between 4000 and 650000, inclusive.
       * (Not enforced by the type system; any number type-checks.)
       */
      bitrate: number;
    }
  | {
      encoding: "flac";
      sampleRate: 8000 | 16000 | 22050 | 32000 | 48000;
    }
  | {
      encoding: "aac";
      /**
       * Must be between 4000 and 192000 inclusive.
       * (Not enforced by the type system; any number type-checks.)
       */
      bitrate: number;
    };
|
||
/**
 * Identifiers accepted for the agent's `listen.model` setting, grouped by
 * model family (`nova-2*`, `nova*`, `enhanced*`, `base*`, `whisper*`).
 */
type ListenModel =
  | "nova-2"
  | "nova-2-meeting"
  | "nova-2-phonecall"
  | "nova-2-voicemail"
  | "nova-2-finance"
  | "nova-2-conversational"
  | "nova-2-video"
  | "nova-2-medical"
  | "nova-2-drivethru"
  | "nova-2-automotive"
  | "nova-2-atc"
  | "nova"
  | "nova-phonecall"
  | "enhanced"
  | "enhanced-meeting"
  | "enhanced-phonecall"
  | "enhanced-finance"
  | "base"
  | "base-meeting"
  | "base-phonecall"
  | "base-voicemail"
  | "base-finance"
  | "base-conversational"
  | "base-video"
  | "whisper-tiny"
  | "whisper"
  | "whisper-small"
  | "whisper-medium"
  | "whisper-large";
|
||
/**
 * Identifiers accepted for the agent's `speak.model` setting and for
 * `AgentLiveClient.updateSpeak`. All listed voices are `aura-*-en` models.
 */
type SpeakModel =
  | "aura-asteria-en"
  | "aura-luna-en"
  | "aura-stella-en"
  | "aura-athena-en"
  | "aura-hera-en"
  | "aura-orion-en"
  | "aura-arcas-en"
  | "aura-perseus-en"
  | "aura-angus-en"
  | "aura-orpheus-en"
  | "aura-helios-en"
  | "aura-zeus-en";
|
||
/**
 * Description of a function made available to the LLM through the
 * `think.functions` setting.
 */
interface ThinkModelFunction {
  /** Name of the function. */
  name: string;
  /** Description of the function. */
  description: string;
  /** URL associated with the function. */
  url: string;
  /**
   * Exactly one header entry, whose key must be `"authorization"`
   * (this is a one-element tuple, not an open-ended array).
   */
  headers: [
    {
      key: "authorization";
      value: string;
    }
  ];
  /** Only `"POST"` is permitted by this type. */
  method: "POST";
  /**
   * Parameter schema: a `type` string plus a map from parameter name to
   * that parameter's own `type` and `description`.
   */
  parameters: {
    type: string;
    properties: Record<
      string,
      {
        type: string;
        description: string;
      }
    >;
  };
}
|
||
/**
 * LLM configuration for the agent's `think` setting, discriminated by
 * `provider.type`. Every variant carries the prompt `instructions` and the
 * list of `functions` exposed to the model; the permitted `model` value
 * depends on the provider.
 */
type ThinkModel =
  | {
      provider: {
        type: "open_ai";
      };
      model: "gpt-4o-mini";
      instructions: string;
      functions: ThinkModelFunction[];
    }
  | {
      provider: {
        type: "anthropic";
      };
      model: "claude-3-haiku-20240307";
      instructions: string;
      functions: ThinkModelFunction[];
    }
  | {
      provider: {
        type: "groq";
      };
      // NOTE(review): the empty-string literal means no real model name can
      // be supplied for groq. This looks like a placeholder; confirm the
      // supported model identifiers before widening.
      model: "";
      instructions: string;
      functions: ThinkModelFunction[];
    }
  | {
      /**
       * Custom provider: the caller supplies the `url` and `key`, and
       * `model` is a free-form string.
       */
      provider: {
        type: "custom";
        url: string;
        key: string;
      };
      model: string;
      instructions: string;
      functions: ThinkModelFunction[];
    };
|
||
/**
 * Settings sent to the agent API by `AgentLiveClient.configure`.
 *
 * @see https://developers.deepgram.com/reference/voicebot-api-phase-preview#settingsconfiguration
 */
interface AgentLiveSchema extends Record<string, unknown> {
  audio: {
    /**
     * Format of the inbound (user) audio.
     */
    input?: {
      encoding: string;
      sampleRate: number;
    };
    /**
     * Format of the outbound (agent) audio.
     * @see https://developers.deepgram.com/docs/tts-media-output-settings#audio-format-combinations
     */
    output?: AudioFormat;
  };
  agent: {
    listen: {
      /**
       * @see https://developers.deepgram.com/docs/model
       */
      model: ListenModel;
    };
    speak: {
      /**
       * @see https://developers.deepgram.com/docs/tts-models
       */
      model: SpeakModel;
    };
    /**
     * @see https://developers.deepgram.com/reference/voicebot-api-phase-preview#supported-llm-providers-and-models
     */
    think: ThinkModel;
  };
  context: {
    /**
     * LLM message history (e.g. to restore existing conversation if websocket disconnects)
     *
     * Previously typed as `[]` (the empty tuple), which made it impossible
     * to pass any history at all. An empty array is still accepted.
     * NOTE(review): the role/content element shape is assumed from common
     * chat-history conventions; confirm against the API reference.
     */
    messages: { role: "user" | "assistant"; content: string }[];
    /**
     * Whether to replay the last message, if it is an assistant message.
     */
    replay: boolean;
  };
}

export type { AgentLiveSchema, SpeakModel };
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
import { AgentEvents } from "../lib/enums/AgentEvents.js"; | ||
import type { AgentLiveSchema, SpeakModel } from "../lib/types"; | ||
import type { DeepgramClientOptions } from "../lib/types"; | ||
import { AbstractLiveClient } from "./AbstractLiveClient"; | ||
|
||
/**
 * Live (WebSocket) client for the Agent API.
 *
 * Connects on construction, re-emits socket and server-message events under
 * the names in `AgentEvents`, and provides helpers that send the JSON
 * control messages (`SettingsConfiguration`, `UpdateInstructions`,
 * `UpdateSpeak`, `InjectAgentMessage`, `KeepAlive`).
 */
export class AgentLiveClient extends AbstractLiveClient {
  public namespace: string = "agent";

  /**
   * @param options - Client options, forwarded to the base live client.
   * @param endpoint - Endpoint template passed to `connect`.
   */
  constructor(options: DeepgramClientOptions, endpoint: string = ":version/agent") {
    super(options);
    /**
     * According to the docs, this is the correct base URL for the Agent API.
     * TODO: Make configurable for self-hosted customers.
     */
    this.baseUrl = "wss://agent.deepgram.com";

    /**
     * TODO: Not sure we should send the options here.
     * Think that needs to happen after Websocket is open.
     */
    this.connect({}, endpoint);
  }

  /**
   * Sets up the connection event handlers.
   * This method is responsible for handling the various events that can occur on the WebSocket connection, such as opening, closing, and receiving messages.
   * - When the connection is opened, it emits the `AgentEvents.Open` event.
   * - When the connection is closed, it emits the `AgentEvents.Close` event.
   * - When an error occurs on the connection, it emits the `AgentEvents.Error` event.
   * - When a message is received, it parses the message and emits the appropriate event based on the message type.
   *
   * Does nothing when no connection (`this.conn`) is present.
   */
  public setupConnection(): void {
    if (this.conn) {
      this.conn.onopen = () => {
        this.emit(AgentEvents.Open, this);
      };

      this.conn.onclose = (event: any) => {
        this.emit(AgentEvents.Close, event);
      };

      this.conn.onerror = (event: ErrorEvent) => {
        this.emit(AgentEvents.Error, event);
      };

      this.conn.onmessage = (event: MessageEvent) => {
        try {
          // NOTE(review): every frame is treated as JSON text. If the server
          // also sends binary frames (e.g. audio), they will fail to parse
          // here and surface as `AgentEvents.Error` — confirm and handle
          // separately if needed.
          const data: any = JSON.parse(event.data.toString());

          // Own-property check rather than `in`: `in` also matches inherited
          // keys such as "toString" or "constructor", which would let a
          // message with such a `type` be emitted as a bogus event name.
          // Enum member names equal their values, so a key match is a value
          // match.
          const isKnownEvent =
            typeof data.type === "string" &&
            Object.prototype.hasOwnProperty.call(AgentEvents, data.type);

          if (isKnownEvent) {
            this.emit(data.type, data);
          } else {
            this.emit(AgentEvents.Unhandled, data);
          }
        } catch (error) {
          this.emit(AgentEvents.Error, {
            event,
            message: "Unable to parse `data` as JSON.",
            error,
          });
        }
      };
    }
  }

  /**
   * To be called with your model configuration BEFORE sending
   * any audio data.
   *
   * The settings are sent as top-level fields of the
   * `SettingsConfiguration` message (alongside `type`), not nested under an
   * `options` key.
   *
   * @param options - The SettingsConfiguration object.
   * @param options.audio.input.encoding - The encoding for your inbound (user) audio.
   * @param options.audio.input.sampleRate - The sample rate for your inbound (user) audio.
   * @param options.audio.output.encoding - The encoding for your outbound (agent) audio.
   * @param options.audio.output.sampleRate - The sample rate for your outbound (agent) audio.
   * @param options.audio.output.bitrate - The bitrate for your outbound (agent) audio.
   * @param options.audio.output.container - The container for your outbound (agent) audio.
   * @param options.agent.listen.model - The STT model to use for processing user audio.
   * @param options.agent.speak.model - The TTS model to use for generating agent audio.
   * @param options.agent.think.provider.type - The LLM provider to use.
   * @param options.agent.think.model - The LLM model to use.
   * @param options.agent.think.instructions - The instructions to provide to the LLM.
   * @param options.agent.think.functions - The functions to provide to the LLM.
   * @param options.context.messages - The message history to provide to the LLM (useful if a websocket connection is lost.)
   * @param options.context.replay - Whether to replay the last message if it was an assistant message.
   */
  public configure(options: AgentLiveSchema): void {
    // `type` is written last so that a stray `type` key in `options`
    // (the schema allows arbitrary extra keys) cannot override it.
    this.send(JSON.stringify({ ...options, type: "SettingsConfiguration" }));
  }

  /**
   * Provide new instructions to the LLM.
   * @param instructions - The instructions to provide.
   */
  public updateInstructions(instructions: string): void {
    this.send(JSON.stringify({ type: "UpdateInstructions", instructions }));
  }

  /**
   * Change the speak model.
   * @param model - The new model to use.
   */
  public updateSpeak(model: SpeakModel): void {
    this.send(JSON.stringify({ type: "UpdateSpeak", model }));
  }

  /**
   * Immediately trigger an agent message. If this message
   * is sent while the user is speaking, or while the server is in the
   * middle of sending audio, then the request will be ignored and an InjectionRefused
   * event will be emitted.
   * @example "Hold on while I look that up for you."
   * @example "Are you still on the line?"
   * @param message - The message to speak.
   */
  public injectAgentMessage(message: string): void {
    this.send(JSON.stringify({ type: "InjectAgentMessage", message }));
  }

  /**
   * Send a keepalive to avoid closing the websocket while you
   * are not transmitting audio. This should be sent at least
   * every 8 seconds.
   */
  public keepAlive(): void {
    this.send(JSON.stringify({ type: "KeepAlive" }));
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters