Create model service to allow dynamic model fetching for audio to text functionality
AryanK1511 committed Dec 5, 2024
1 parent ddfdfbe commit 91921f8
Showing 3 changed files with 60 additions and 21 deletions.
1 change: 1 addition & 0 deletions src/hooks/use-file-import.tsx
@@ -145,6 +145,7 @@ async function processFile(
   file: File,
   settings: ReturnType<typeof getSettings>
 ): Promise<string | JinaAiReaderResponse | OpenAISpeechToTextResponse> {
+  console.log(file.type);
   if (file.type.startsWith("image/")) {
     return await compressImageToBase64(file, {
       compressionFactor: settings.compressionFactor,
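For context, a minimal sketch of how an audio branch in processFile could hand recordings off to the new audioToText() helper, mirroring the existing image/ branch above. The handleAudioFile wrapper and its placement are assumptions for illustration; only the console.log(file.type) line is part of this commit.

// Hypothetical helper (not in this diff): route audio uploads to audioToText(),
// the same way image/ files are routed to compressImageToBase64() above.
import { audioToText, type OpenAISpeechToTextResponse } from "../lib/ai";

async function handleAudioFile(file: File): Promise<OpenAISpeechToTextResponse> {
  if (!file.type.startsWith("audio/")) {
    throw new Error(`Expected an audio file, got ${file.type}`);
  }
  // Resolves to { text } once the current provider transcribes the file
  return audioToText(file);
}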
47 changes: 26 additions & 21 deletions src/lib/ai.ts
@@ -11,6 +11,8 @@ import {
 import { ChatCraftModel } from "./ChatCraftModel";
 import { getSettings } from "./settings";
 import { usingOfficialOpenAI } from "./providers";
+import { ModelService } from "./model-service";
+import { SpeechRecognition } from "./speech-recognition";

 export type ChatOptions = {
   model?: ChatCraftModel;
@@ -469,34 +471,37 @@ export type OpenAISpeechToTextResponse = {
 };

 /**
- * Convert an audio file to text using https://platform.openai.com/docs/api-reference/audio/createTranscription?lang=node
+ * Convert an audio file to text
  */

 export async function audioToText(file: File): Promise<OpenAISpeechToTextResponse> {
-  try {
-    const { currentProvider } = getSettings();
-    if (!currentProvider.apiKey) {
-      throw new Error("Missing OpenAI API Key");
-    }
+  const settings = getSettings();
+  const currentProvider = settings.currentProvider;

-    const { openai } = currentProvider.createClient(currentProvider.apiKey);
+  if (!currentProvider.apiKey) {
+    throw new Error("Missing API Key");
+  }

-    const response = await openai.audio.transcriptions.create({
-      file,
-      model: "whisper-1",
-    });
+  const sttClient = await ModelService.getSpeechToTextClient();

-    if (!response.text) {
-      throw new Error("Error: No transcription text returned by OpenAI.");
-    }
+  if (!sttClient) {
+    throw new Error("No STT client available");
+  }

-    const result: OpenAISpeechToTextResponse = {
-      text: response.text,
-    };
+  const sttModel = await ModelService.getSpeechToTextModel(currentProvider);

-    return result;
-  } catch (err) {
-    console.error("Error converting audio to text", err);
-    throw err;
+  if (!sttModel) {
+    throw new Error(`No speech-to-text model found for provider ${currentProvider.name}`);
   }
+
+  const recognition = new SpeechRecognition(sttModel, sttClient);
+
+  try {
+    const text = await recognition.transcribe(file);
+    return { text };
+  } catch (error) {
+    console.error("Error transcribing audio:", error);
+    throw error;
+  }
 }

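A rough usage sketch of the rewritten audioToText(); the transcribeRecording wrapper, file name, and MIME type below are assumptions, not part of this change.

// Hypothetical caller (assumed, not from this commit): wrap a recorded clip in a
// File and transcribe it; audioToText() now picks the STT client and model dynamically.
import { audioToText } from "./ai";

export async function transcribeRecording(blob: Blob): Promise<string> {
  const file = new File([blob], "recording.webm", { type: "audio/webm" });
  try {
    const { text } = await audioToText(file);
    return text;
  } catch (err) {
    // audioToText() throws if the provider has no API key, STT client, or STT model
    console.error("Transcription failed", err);
    return "";
  }
}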
33 changes: 33 additions & 0 deletions src/lib/model-service.ts
@@ -0,0 +1,33 @@
+import { ChatCraftProvider } from "./ChatCraftProvider";
+import { getSettings } from "./settings";
+import { isSpeechToTextModel } from "./ai";
+
+export class ModelService {
+  static async getSpeechToTextClient() {
+    const settings = getSettings();
+    const provider = settings.currentProvider;
+
+    if (!provider.apiKey) {
+      return null;
+    }
+
+    return provider.createClient(provider.apiKey).openai;
+  }
+
+  static async getSpeechToTextModel(provider: ChatCraftProvider): Promise<string | null> {
+    if (!provider.apiKey) {
+      return null;
+    }
+    const models: string[] = await provider.queryModels(provider.apiKey);
+    const sttModel = models.find((model) => isSpeechToTextModel(model));
+    return sttModel || null;
+  }
+
+  static async isSpeechToTextSupported(provider: ChatCraftProvider): Promise<boolean> {
+    if (!provider.apiKey) {
+      return false;
+    }
+    const models: string[] = await provider.queryModels(provider.apiKey);
+    return models.some((model) => isSpeechToTextModel(model));
+  }
+}
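One way a caller might combine the new ModelService checks with audioToText(), assuming the provider should be probed before offering transcription; the maybeTranscribe name is hypothetical and used only for illustration.

// Hypothetical consumer (not in this commit): gate transcription on provider support.
import { ModelService } from "./model-service";
import { getSettings } from "./settings";
import { audioToText } from "./ai";

export async function maybeTranscribe(file: File): Promise<string | null> {
  const { currentProvider } = getSettings();

  // isSpeechToTextSupported() returns false when the provider has no API key
  // or lists no speech-to-text model.
  if (!(await ModelService.isSpeechToTextSupported(currentProvider))) {
    return null;
  }

  const { text } = await audioToText(file);
  return text;
}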
