feat: spec out the api docs

deepgram · Oct 19, 2024 · f2f6e98 · f2f6e98
1 parent 753be7e
commit f2f6e98
Show file tree

Hide file tree

Showing 5 changed files with 342 additions and 0 deletions.
diff --git a/src/lib/enums/AgentEvents.ts b/src/lib/enums/AgentEvents.ts
@@ -0,0 +1,24 @@
+/**
+ * Names of the events emitted by the agent live client.
+ *
+ * Every member's string value is identical to its key.
+ */
+export enum AgentEvents {
+  /**
+   * Built in socket events.
+   * Emitted from the WebSocket `onopen`, `onclose` and `onerror` handlers.
+   */
+  Open = "Open",
+  Close = "Close",
+  Error = "Error",
+  /**
+   * Message { type: string }
+   * Events named after the `type` field of a JSON message received over the
+   * socket; the parsed message is passed to the listener.
+   */
+  Welcome = "Welcome",
+  ConversationText = "ConversationText",
+  UserStartedSpeaking = "UserStartedSpeaking",
+  AgentThinking = "AgentThinking",
+  FunctionCalling = "FunctionCalling",
+  AgentStartedSpeaking = "AgentStartedSpeaking",
+  AgentAudioDone = "AgentAudioDone",
+  InjectionRefused = "InjectionRefused",
+
+  /**
+   * Catch all for any other message event
+   * (i.e. a message whose `type` does not match a member of this enum).
+   */
+  Unhandled = "Unhandled",
+}
diff --git a/src/lib/types/AgentLiveSchema.ts b/src/lib/types/AgentLiveSchema.ts
@@ -0,0 +1,190 @@
+// TODO: We could probably use this elsewhere?
+/**
+ * Audio format description, discriminated by `encoding`.
+ *
+ * Each variant lists the `container`, `sampleRate` and/or `bitrate` values that
+ * may accompany that encoding. Used as the type of `audio.output` in
+ * `AgentLiveSchema`.
+ *
+ * NOTE(review): `flac` lists 22050 where `linear16` lists 24000 - confirm both
+ * sets against the audio format documentation.
+ */
+type AudioFormat =
+  | {
+      encoding: "linear16";
+      container: "wav" | "none";
+      sampleRate: 8000 | 16000 | 24000 | 32000 | 48000;
+    }
+  | {
+      encoding: "mulaw";
+      container: "wav" | "none";
+      sampleRate: 8000 | 16000;
+    }
+  | {
+      encoding: "alaw";
+      container: "wav" | "none";
+      sampleRate: 8000 | 16000;
+    }
+  | {
+      encoding: "mp3";
+      bitrate: 32000 | 48000;
+    }
+  | {
+      encoding: "opus";
+      container: "ogg";
+      /**
+       * Must be between 4000 and 650000, inclusive.
+       * The range is not enforced by the type (plain `number`).
+       */
+      bitrate: number;
+    }
+  | {
+      encoding: "flac";
+      sampleRate: 8000 | 16000 | 22050 | 32000 | 48000;
+    }
+  | {
+      encoding: "aac";
+      /**
+       * Must be between 4000 and 192000 inclusive.
+       * The range is not enforced by the type (plain `number`).
+       */
+      bitrate: number;
+    };
+
+/**
+ * Model identifiers accepted for `agent.listen.model` in `AgentLiveSchema`.
+ */
+type ListenModel =
+  | "nova-2"
+  | "nova-2-meeting"
+  | "nova-2-phonecall"
+  | "nova-2-voicemail"
+  | "nova-2-finance"
+  | "nova-2-conversational"
+  | "nova-2-video"
+  | "nova-2-medical"
+  | "nova-2-drivethru"
+  | "nova-2-automotive"
+  | "nova-2-atc"
+  | "nova"
+  | "nova-phonecall"
+  | "enhanced"
+  | "enhanced-meeting"
+  | "enhanced-phonecall"
+  | "enhanced-finance"
+  | "base"
+  | "base-meeting"
+  | "base-phonecall"
+  | "base-voicemail"
+  | "base-finance"
+  | "base-conversational"
+  | "base-video"
+  | "whisper-tiny"
+  | "whisper"
+  | "whisper-small"
+  | "whisper-medium"
+  | "whisper-large";
+
+/**
+ * Model identifiers accepted for `agent.speak.model` in `AgentLiveSchema`.
+ * Exported so callers can type the model they switch to at runtime.
+ */
+type SpeakModel =
+  | "aura-asteria-en"
+  | "aura-luna-en"
+  | "aura-stella-en"
+  | "aura-athena-en"
+  | "aura-hera-en"
+  | "aura-orion-en"
+  | "aura-arcas-en"
+  | "aura-perseus-en"
+  | "aura-angus-en"
+  | "aura-orpheus-en"
+  | "aura-helios-en"
+  | "aura-zeus-en";
+
+/**
+ * Description of a function made available to the think (LLM) model,
+ * listed in the `functions` array of every `ThinkModel` variant.
+ */
+interface ThinkModelFunction {
+  name: string;
+  description: string;
+  url: string;
+  /**
+   * NOTE(review): typed as a one-element tuple, so exactly one header whose
+   * key is `"authorization"` is accepted - confirm whether an array of
+   * arbitrary headers was intended.
+   */
+  headers: [
+    {
+      key: "authorization";
+      value: string;
+    }
+  ];
+  /** Only `"POST"` is accepted by this type. */
+  method: "POST";
+  /**
+   * Parameter description: a `type` plus a map from property name to that
+   * property's `type` and `description`.
+   */
+  parameters: {
+    type: string;
+    properties: Record<
+      string,
+      {
+        type: string;
+        description: string;
+      }
+    >;
+  };
+}
+
+/**
+ * Configuration of the think (LLM) step, one variant per `provider.type`.
+ *
+ * Every variant carries `instructions` and `functions`; the `custom` variant
+ * additionally requires a `url` and `key` on the provider and accepts any
+ * `model` string.
+ */
+type ThinkModel =
+  | {
+      provider: {
+        type: "open_ai";
+      };
+      model: "gpt-4o-mini";
+      instructions: string;
+      functions: ThinkModelFunction[];
+    }
+  | {
+      provider: {
+        type: "anthropic";
+      };
+      model: "claude-3-haiku-20240307";
+      instructions: string;
+      functions: ThinkModelFunction[];
+    }
+  | {
+      provider: {
+        type: "groq";
+      };
+      // NOTE(review): the literal type "" only accepts the empty string - this
+      // looks like a placeholder; confirm the supported groq model names.
+      model: "";
+      instructions: string;
+      functions: ThinkModelFunction[];
+    }
+  | {
+      provider: {
+        type: "custom";
+        url: string;
+        key: string;
+      };
+      model: string;
+      instructions: string;
+      functions: ThinkModelFunction[];
+    };
+
+/**
+ * Settings sent to the agent API before any audio is transmitted
+ * (the argument of `AgentLiveClient.configure`).
+ *
+ * @see https://developers.deepgram.com/reference/voicebot-api-phase-preview#settingsconfiguration
+ */
+interface AgentLiveSchema extends Record<string, unknown> {
+  audio: {
+    /**
+     * Format of the inbound (user) audio.
+     */
+    input?: {
+      encoding: string;
+      sampleRate: number;
+    };
+    /**
+     * Format of the outbound (agent) audio.
+     * @see https://developers.deepgram.com/docs/tts-media-output-settings#audio-format-combinations
+     */
+    output?: AudioFormat;
+  };
+  agent: {
+    listen: {
+      /**
+       * @see https://developers.deepgram.com/docs/model
+       */
+      model: ListenModel;
+    };
+    speak: {
+      /**
+       * @see https://developers.deepgram.com/docs/tts-models
+       */
+      model: SpeakModel;
+    };
+    /**
+     * @see https://developers.deepgram.com/reference/voicebot-api-phase-preview#supported-llm-providers-and-models
+     */
+    think: ThinkModel;
+  };
+  context: {
+    /**
+     * LLM message history (e.g. to restore existing conversation if websocket disconnects)
+     *
+     * Typed as an array rather than the empty tuple `[]`, which could only
+     * ever hold zero messages. An empty array is still accepted.
+     * NOTE(review): the element shape is kept loose here - tighten it once the
+     * message format is confirmed against the API reference.
+     */
+    messages: Record<string, unknown>[];
+    /**
+     * Whether to replay the last message, if it is an assistant message.
+     */
+    replay: boolean;
+  };
+}
+
+export type { AgentLiveSchema, SpeakModel };
diff --git a/src/lib/types/index.ts b/src/lib/types/index.ts
@@ -1,3 +1,4 @@
+export * from "./AgentLiveSchema";
 export * from "./AnalyzeSchema";
 export * from "./AsyncAnalyzeResponse";
 export * from "./AsyncPrerecordedResponse";

diff --git a/src/packages/AgentLiveClient.ts b/src/packages/AgentLiveClient.ts
@@ -0,0 +1,126 @@
+import { AgentEvents } from "../lib/enums/AgentEvents.js";
+import type { AgentLiveSchema, SpeakModel } from "../lib/types";
+import type { DeepgramClientOptions } from "../lib/types";
+import { AbstractLiveClient } from "./AbstractLiveClient";
+
+/**
+ * Live (WebSocket) client for the agent API.
+ *
+ * Starts connecting on construction, re-emits socket lifecycle events and
+ * incoming JSON messages as `AgentEvents`, and provides helpers for the
+ * client-to-server messages (`SettingsConfiguration`, `UpdateInstructions`,
+ * `UpdateSpeak`, `InjectAgentMessage`, `KeepAlive`).
+ */
+export class AgentLiveClient extends AbstractLiveClient {
+  public namespace: string = "agent";
+
+  /**
+   * @param options - Client options, forwarded to `AbstractLiveClient`.
+   * @param endpoint - Endpoint passed to `connect`.
+   */
+  constructor(options: DeepgramClientOptions, endpoint: string = ":version/agent") {
+    super(options);
+    /**
+     * According to the docs, this is the correct base URL for the Agent API.
+     * TODO: Make configurable for self-hosted customers.
+     */
+    this.baseUrl = "wss://agent.deepgram.com";
+
+    /**
+     * TODO: Not sure we should send the options here.
+     * Think that needs to happen after Websocket is open.
+     */
+    this.connect({}, endpoint);
+  }
+
+  /**
+   * Sets up the connection event handlers.
+   * This method is responsible for handling the various events that can occur on the WebSocket connection, such as opening, closing, and receiving messages.
+   * - When the connection is opened, it emits the `AgentEvents.Open` event.
+   * - When the connection is closed, it emits the `AgentEvents.Close` event.
+   * - When an error occurs on the connection, it emits the `AgentEvents.Error` event.
+   * - When a message is received, it parses the message as JSON and emits the event named by
+   *   the message's `type` when that is a member of `AgentEvents`; any other payload is emitted
+   *   as `AgentEvents.Unhandled`. A payload that cannot be parsed is emitted as `AgentEvents.Error`.
+   *
+   * Does nothing when there is no connection (`this.conn` is falsy).
+   */
+  public setupConnection(): void {
+    if (this.conn) {
+      this.conn.onopen = () => {
+        this.emit(AgentEvents.Open, this);
+      };
+
+      this.conn.onclose = (event: any) => {
+        this.emit(AgentEvents.Close, event);
+      };
+
+      this.conn.onerror = (event: ErrorEvent) => {
+        this.emit(AgentEvents.Error, event);
+      };
+
+      this.conn.onmessage = (event: MessageEvent) => {
+        // NOTE(review): every payload is treated as JSON text; a binary frame
+        // would fail to parse and surface as an Error event - confirm whether
+        // the API sends binary frames that need their own event.
+        try {
+          const data: unknown = JSON.parse(event.data.toString());
+
+          // Valid JSON may still be `null` or a primitive, so only read `type`
+          // from real objects instead of throwing on the property access.
+          const type =
+            typeof data === "object" && data !== null
+              ? (data as { type?: unknown }).type
+              : undefined;
+
+          // Own-property check: `in` would also match inherited keys such as
+          // "toString" or "constructor" and emit them as if they were events.
+          if (typeof type === "string" && Object.prototype.hasOwnProperty.call(AgentEvents, type)) {
+            this.emit(type, data);
+          } else {
+            this.emit(AgentEvents.Unhandled, data);
+          }
+        } catch (error) {
+          this.emit(AgentEvents.Error, {
+            event,
+            message: "Unable to parse `data` as JSON.",
+            error,
+          });
+        }
+      };
+    }
+  }
+
+  /**
+   * To be called with your model configuration BEFORE sending
+   * any audio data.
+   *
+   * The settings are sent as the top-level fields of the
+   * `SettingsConfiguration` message (not nested under an `options` key).
+   * @param options - The SettingsConfiguration object.
+   * @param options.audio.input.encoding - The encoding for your inbound (user) audio.
+   * @param options.audio.input.sampleRate - The sample rate for your inbound (user) audio.
+   * @param options.audio.output.encoding - The encoding for your outbound (agent) audio.
+   * @param options.audio.output.sampleRate - The sample rate for your outbound (agent) audio.
+   * @param options.audio.output.bitrate - The bitrate for your outbound (agent) audio.
+   * @param options.audio.output.container - The container for your outbound (agent) audio.
+   * @param options.agent.listen.model - The STT model to use for processing user audio.
+   * @param options.agent.speak.model - The TTS model to use for generating agent audio.
+   * @param options.agent.think.provider.type - The LLM provider to use.
+   * @param options.agent.think.model - The LLM model to use.
+   * @param options.agent.think.instructions - The instructions to provide to the LLM.
+   * @param options.agent.think.functions - The functions to provide to the LLM.
+   * @param options.context.messages - The message history to provide to the LLM (useful if a websocket connection is lost.)
+   * @param options.context.replay - Whether to replay the last message if it was an assistant message.
+   */
+  public configure(options: AgentLiveSchema): void {
+    // `type` is written last so a stray `type` key in `options` cannot
+    // replace the message type.
+    this.send(JSON.stringify({ ...options, type: "SettingsConfiguration" }));
+  }
+
+  /**
+   * Provide new instructions to the LLM.
+   * @param instructions - The instructions to provide.
+   */
+  public updateInstructions(instructions: string): void {
+    this.send(JSON.stringify({ type: "UpdateInstructions", instructions }));
+  }
+
+  /**
+   * Change the speak model.
+   * @param model - The new model to use.
+   */
+  public updateSpeak(model: SpeakModel): void {
+    this.send(JSON.stringify({ type: "UpdateSpeak", model }));
+  }
+
+  /**
+   * Immediately trigger an agent message. If this message
+   * is sent while the user is speaking, or while the server is in the
+   * middle of sending audio, then the request will be ignored and an InjectionRefused
+   * event will be emitted.
+   * @example "Hold on while I look that up for you."
+   * @example "Are you still on the line?"
+   * @param message - The message to speak.
+   */
+  public injectAgentMessage(message: string): void {
+    this.send(JSON.stringify({ type: "InjectAgentMessage", message }));
+  }
+
+  /**
+   * Send a keepalive to avoid closing the websocket while you
+   * are not transmitting audio. This should be sent at least
+   * every 8 seconds.
+   */
+  public keepAlive(): void {
+    this.send(JSON.stringify({ type: "KeepAlive" }));
+  }
+}
diff --git a/src/packages/index.ts b/src/packages/index.ts
@@ -1,6 +1,7 @@
 export * from "./AbstractClient";
 export * from "./AbstractLiveClient";
 export * from "./AbstractRestClient";
+export * from "./AgentLiveClient";
 export * from "./ListenClient";
 export * from "./ListenLiveClient";
 export * from "./ListenRestClient";