Add Support for Audio File Transcription #745

Open · wants to merge 8 commits into main
2 changes: 1 addition & 1 deletion src/components/OptionsButton.tsx
@@ -218,7 +218,7 @@ function OptionsButton({
ref={fileInputRef}
hidden
onChange={handleFileChange}
accept="image/*,text/*,.pdf,application/pdf,*.docx,application/vnd.openxmlformats-officedocument.wordprocessingml.document,.json,application/json,application/markdown"
accept="image/*,text/*,.pdf,application/pdf,*.docx,application/vnd.openxmlformats-officedocument.wordprocessingml.document,.json,application/json,application/markdown, audio/*"
Collaborator:

Do we support all audio types? Let's narrow this so we only accept the ones we can process.

Author:

Okay I have done that now @humphd

/>
<MenuItem icon={<BsPaperclip />} onClick={handleAttachFiles}>
Attach Files...
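On the reviewer's request above to narrow the accepted audio types: if the backend is OpenAI's transcription endpoint, the documented input formats are mp3, mp4, mpeg, mpga, m4a, wav, and webm. An illustrative narrowed list follows; the exact set should match whatever backend ChatCraft actually transcribes with:

```typescript
// Illustrative only: restrict the file picker to audio formats the
// OpenAI transcription endpoint documents as supported, instead of
// the catch-all "audio/*".
const AUDIO_ACCEPT = [
  "audio/mpeg", // .mp3, .mpga
  "audio/mp4",  // .m4a
  "audio/wav",
  "audio/webm",
].join(",");

// Usage sketch: accept={`image/*,text/*,.pdf,${AUDIO_ACCEPT}`}
```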
45 changes: 34 additions & 11 deletions src/hooks/use-file-import.tsx
@@ -2,7 +2,12 @@ import { useCallback } from "react";
import { useAlert } from "./use-alert";
import { ChatCraftChat } from "../lib/ChatCraftChat";
import { ChatCraftHumanMessage } from "../lib/ChatCraftMessage";
import { type JinaAiReaderResponse, pdfToMarkdown } from "../lib/ai";
import {
audioToText,
type JinaAiReaderResponse,
type OpenAISpeechToTextResponse,
pdfToMarkdown,
} from "../lib/ai";
import { compressImageToBase64, formatAsCodeBlock } from "../lib/utils";
import { getSettings } from "../lib/settings";

@@ -118,22 +123,31 @@ function formatTextContent(filename: string, type: string, content: string): str
}

// Makes sure that the contents are non-empty
function assertContents(contents: string | JinaAiReaderResponse) {
function assertContents(contents: string | JinaAiReaderResponse | OpenAISpeechToTextResponse) {
let content: string | undefined;

if (typeof contents === "string") {
if (!contents.trim().length) {
throw new Error("Empty contents", { cause: { code: "EmptyFile" } });
}
} else {
if (!contents.data.content.trim().length) {
throw new Error("Empty contents", { cause: { code: "EmptyFile" } });
}
content = contents;
} else if ("data" in contents && "content" in contents.data) {
Owner:

const content = contents?.data?.content

then use if (content) will take care of this and length check here and below

Author:

@tarasglek I have changed the file now according to what you suggested

Collaborator:

Where did you do it, the code here is still showing the old way?

Author:

Maybe I didn't understand the comment properly @humphd. What do you guys want me to do here?

content = contents.data.content;
} else if ("text" in contents) {
content = contents.text;
}

if (!content?.trim().length) {
throw new Error("Empty contents", { cause: { code: "EmptyFile" } });
}

if (!content) {
throw new Error("Unknown content type", { cause: { code: "InvalidContentType" } });
}
}

async function processFile(
file: File,
settings: ReturnType<typeof getSettings>
): Promise<string | JinaAiReaderResponse> {
): Promise<string | JinaAiReaderResponse | OpenAISpeechToTextResponse> {
console.log(file.type);
if (file.type.startsWith("image/")) {
return await compressImageToBase64(file, {
compressionFactor: settings.compressionFactor,
@@ -142,6 +156,12 @@ async function processFile(
});
}

if (file.type.startsWith("audio/")) {
const contents = await audioToText(file);
assertContents(contents);
return contents;
}

if (file.type === "application/pdf") {
const contents = await pdfToMarkdown(file);
assertContents(contents);
@@ -180,13 +200,16 @@ export function useFileImport({ chat, onImageImport }: UseFileImportOptions) {
const settings = getSettings();

const importFile = useCallback(
(file: File, contents: string | JinaAiReaderResponse) => {
async (file: File, contents: string | JinaAiReaderResponse | OpenAISpeechToTextResponse) => {
if (file.type.startsWith("image/")) {
const base64 = contents as string;
onImageImport(base64);
} else if (file.type === "application/pdf") {
const document = (contents as JinaAiReaderResponse).data;
chat.addMessage(new ChatCraftHumanMessage({ text: `${document.content}\n` }));
} else if (file.type.startsWith("audio/")) {
const document = (contents as OpenAISpeechToTextResponse).text;
chat.addMessage(new ChatCraftHumanMessage({ text: `${document}\n` }));
} else if (
file.type === "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
) {
41 changes: 41 additions & 0 deletions src/lib/ai.ts
@@ -11,6 +11,8 @@ import {
import { ChatCraftModel } from "./ChatCraftModel";
import { getSettings } from "./settings";
import { usingOfficialOpenAI } from "./providers";
import { ModelService } from "./model-service";
import { SpeechRecognition } from "./speech-recognition";

export type ChatOptions = {
model?: ChatCraftModel;
@@ -464,6 +466,45 @@ export function isChatModel(model: string): boolean {
);
}

export type OpenAISpeechToTextResponse = {
text: string;
};

/**
* Convert an audio file to text
 */
export async function audioToText(file: File): Promise<OpenAISpeechToTextResponse> {
const settings = getSettings();
const currentProvider = settings.currentProvider;

if (!currentProvider.apiKey) {
throw new Error("Missing API Key");
}

const sttClient = await ModelService.getSpeechToTextClient();

if (!sttClient) {
throw new Error("No STT client available");
}

const sttModel = await ModelService.getSpeechToTextModel(currentProvider);

if (!sttModel) {
throw new Error(`No speech-to-text model found for provider ${currentProvider.name}`);
}

const recognition = new SpeechRecognition(sttModel, sttClient);

try {
const text = await recognition.transcribe(file);
return { text };
} catch (error) {
console.error("Error transcribing audio:", error);
throw error;
}
}

export type JinaAiReaderResponse = {
code: number;
status: number;
33 changes: 33 additions & 0 deletions src/lib/model-service.ts
@@ -0,0 +1,33 @@
import { ChatCraftProvider } from "./ChatCraftProvider";
import { getSettings } from "./settings";
import { isSpeechToTextModel } from "./ai";

export class ModelService {
Collaborator:

This is an interesting idea. We should add other methods later.

Author:

Agreed!

static async getSpeechToTextClient() {
const settings = getSettings();
const provider = settings.currentProvider;

if (!provider.apiKey) {
Collaborator:

Not all providers require an API key

return null;
}

return provider.createClient(provider.apiKey).openai;
}
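On the note above that not all providers require an API key: the guard could key off a per-provider capability flag instead of the key itself. A hypothetical sketch follows; `requiresApiKey` is an invented field, not part of the real ChatCraftProvider:

```typescript
// Hypothetical provider shape; the real ChatCraftProvider differs.
interface ProviderLike {
  name: string;
  apiKey?: string;
  requiresApiKey: boolean; // invented flag for this sketch
  createClient(apiKey?: string): { openai: unknown };
}

function getSpeechToTextClient(provider: ProviderLike): unknown {
  // Only bail out when the provider actually needs a key and has none.
  if (provider.requiresApiKey && !provider.apiKey) {
    return null;
  }
  return provider.createClient(provider.apiKey).openai;
}
```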

static async getSpeechToTextModel(provider: ChatCraftProvider): Promise<string | null> {
if (!provider.apiKey) {
return null;
}
const models: string[] = await provider.queryModels(provider.apiKey);
const sttModel = models.find((model) => isSpeechToTextModel(model));
return sttModel || null;
}

static async isSpeechToTextSupported(provider: ChatCraftProvider): Promise<boolean> {
if (!provider.apiKey) {
return false;
}
const models: string[] = await provider.queryModels(provider.apiKey);
return models.some((model) => isSpeechToTextModel(model));
}
}
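A usage sketch tying ModelService back to the accept list in OptionsButton.tsx: only advertise audio uploads when the current provider actually exposes a speech-to-text model. The provider interface and the `isSpeechToTextModel` predicate below are simplified stand-ins for the real versions:

```typescript
// Simplified stand-ins; the real versions live in src/lib/ai.ts and
// src/lib/ChatCraftProvider.ts.
interface SttProviderLike {
  apiKey?: string;
  queryModels(apiKey: string): Promise<string[]>;
}

function isSpeechToTextModel(model: string): boolean {
  return model.includes("whisper"); // stand-in predicate
}

// Build the file input's accept attribute based on provider support.
async function buildAccept(provider: SttProviderLike): Promise<string> {
  const base = "image/*,text/*,.pdf,application/pdf";
  if (!provider.apiKey) return base;
  const models = await provider.queryModels(provider.apiKey);
  return models.some(isSpeechToTextModel) ? `${base},audio/*` : base;
}
```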
2 changes: 1 addition & 1 deletion src/lib/speech-recognition.ts
@@ -132,7 +132,7 @@ export class SpeechRecognition {
}
}

async transcribe(audio: File) {
async transcribe(audio: File): Promise<string> {
const transcriptions = new OpenAI.Audio.Transcriptions(this._openai);
const transcription = await transcriptions.create({
file: audio,