-
Notifications
You must be signed in to change notification settings - Fork 45
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Support for Audio File Transcription #745
base: main
Are you sure you want to change the base?
Changes from 7 commits
81120c0
8b51315
2d8d921
ee2b7ec
ddfdfbe
91921f8
8449a35
f93d190
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,12 @@ import { useCallback } from "react"; | |
import { useAlert } from "./use-alert"; | ||
import { ChatCraftChat } from "../lib/ChatCraftChat"; | ||
import { ChatCraftHumanMessage } from "../lib/ChatCraftMessage"; | ||
import { type JinaAiReaderResponse, pdfToMarkdown } from "../lib/ai"; | ||
import { | ||
audioToText, | ||
type JinaAiReaderResponse, | ||
type OpenAISpeechToTextResponse, | ||
pdfToMarkdown, | ||
} from "../lib/ai"; | ||
import { compressImageToBase64, formatAsCodeBlock } from "../lib/utils"; | ||
import { getSettings } from "../lib/settings"; | ||
|
||
|
@@ -118,22 +123,31 @@ function formatTextContent(filename: string, type: string, content: string): str | |
} | ||
|
||
// Makes sure that the contents are non-empty | ||
function assertContents(contents: string | JinaAiReaderResponse) { | ||
function assertContents(contents: string | JinaAiReaderResponse | OpenAISpeechToTextResponse) { | ||
let content: string | undefined; | ||
|
||
if (typeof contents === "string") { | ||
if (!contents.trim().length) { | ||
throw new Error("Empty contents", { cause: { code: "EmptyFile" } }); | ||
} | ||
} else { | ||
if (!contents.data.content.trim().length) { | ||
throw new Error("Empty contents", { cause: { code: "EmptyFile" } }); | ||
} | ||
content = contents; | ||
} else if ("data" in contents && "content" in contents.data) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. `const content = contents?.data?.content`, then using `if (content)` will take care of this and the length check here and below. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @tarasglek I have now changed the file according to what you suggested. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Where did you do it? The code here is still showing the old way. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe I didn't understand the comment properly, @humphd. What do you guys want me to do here? |
||
content = contents.data.content; | ||
} else if ("text" in contents) { | ||
content = contents.text; | ||
} | ||
|
||
if (!content?.trim().length) { | ||
throw new Error("Empty contents", { cause: { code: "EmptyFile" } }); | ||
} | ||
|
||
if (!content) { | ||
throw new Error("Unknown content type", { cause: { code: "InvalidContentType" } }); | ||
} | ||
} | ||
|
||
async function processFile( | ||
file: File, | ||
settings: ReturnType<typeof getSettings> | ||
): Promise<string | JinaAiReaderResponse> { | ||
): Promise<string | JinaAiReaderResponse | OpenAISpeechToTextResponse> { | ||
console.log(file.type); | ||
if (file.type.startsWith("image/")) { | ||
return await compressImageToBase64(file, { | ||
compressionFactor: settings.compressionFactor, | ||
|
@@ -142,6 +156,12 @@ async function processFile( | |
}); | ||
} | ||
|
||
if (file.type.startsWith("audio/")) { | ||
const contents = await audioToText(file); | ||
assertContents(contents); | ||
return contents; | ||
} | ||
|
||
if (file.type === "application/pdf") { | ||
const contents = await pdfToMarkdown(file); | ||
assertContents(contents); | ||
|
@@ -180,13 +200,16 @@ export function useFileImport({ chat, onImageImport }: UseFileImportOptions) { | |
const settings = getSettings(); | ||
|
||
const importFile = useCallback( | ||
(file: File, contents: string | JinaAiReaderResponse) => { | ||
async (file: File, contents: string | JinaAiReaderResponse | OpenAISpeechToTextResponse) => { | ||
if (file.type.startsWith("image/")) { | ||
const base64 = contents as string; | ||
onImageImport(base64); | ||
} else if (file.type === "application/pdf") { | ||
const document = (contents as JinaAiReaderResponse).data; | ||
chat.addMessage(new ChatCraftHumanMessage({ text: `${document.content}\n` })); | ||
} else if (file.type.startsWith("audio/")) { | ||
const document = (contents as OpenAISpeechToTextResponse).text; | ||
chat.addMessage(new ChatCraftHumanMessage({ text: `${document}\n` })); | ||
} else if ( | ||
file.type === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | ||
) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import { ChatCraftProvider } from "./ChatCraftProvider"; | ||
import { getSettings } from "./settings"; | ||
import { isSpeechToTextModel } from "./ai"; | ||
|
||
export class ModelService { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is an interesting idea. We should add other methods later. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agreed! |
||
static async getSpeechToTextClient() { | ||
const settings = getSettings(); | ||
const provider = settings.currentProvider; | ||
|
||
if (!provider.apiKey) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not all providers require an API key. |
||
return null; | ||
} | ||
|
||
return provider.createClient(provider.apiKey).openai; | ||
} | ||
|
||
static async getSpeechToTextModel(provider: ChatCraftProvider): Promise<string | null> { | ||
if (!provider.apiKey) { | ||
return null; | ||
} | ||
const models: string[] = await provider.queryModels(provider.apiKey); | ||
const sttModel = models.find((model) => isSpeechToTextModel(model)); | ||
return sttModel || null; | ||
} | ||
|
||
static async isSpeechToTextSupported(provider: ChatCraftProvider): Promise<boolean> { | ||
if (!provider.apiKey) { | ||
return false; | ||
} | ||
const models: string[] = await provider.queryModels(provider.apiKey); | ||
return models.some((model) => isSpeechToTextModel(model)); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we support all audio types? Let's narrow this so we only accept the ones we can process.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, I have done that now, @humphd.