Merge pull request #85 from intelligentnode/83-add-whisper-model

83 improve whisper model
intelligentnode · Jan 30, 2025 · 670a129
2 parents ddcd1e0 + 3ddf5c6
commit 670a129
Show file tree

Hide file tree

Showing 4 changed files with 23 additions and 25 deletions.
diff --git a/intelli/test/integration/test_keras_whisper.py b/intelli/test/integration/test_keras_whisper.py
@@ -22,12 +22,12 @@ def test_whisper_real_audio():
     )
 
     result = wrapper.transcript(
-                audio_data,
-                sample_rate=sample_rate,
-                language="<|en|>",
-                user_prompt="You are a medical expert responsible for transcribing notes from a doctor’s speech.",
-                condition_on_previous_text=True
-            )
+        audio_data,
+        sample_rate=sample_rate,
+        language="<|en|>",
+        user_prompt="You are a medical expert responsible for transcribing notes from a doctor’s speech.",
+        condition_on_previous_text=True,
+    )
     assert result is not None, "Transcription result is None."
     print("Transcription output:", result)
 

diff --git a/intelli/utils/whisper_helper.py b/intelli/utils/whisper_helper.py
@@ -6,6 +6,7 @@ def __init__(self, model_name="whisper_tiny_en", backbone=None):
         try:
             import numpy as np
             import tensorflow as tf
+            tf.config.optimizer.set_jit(True)
             import librosa
             import keras_hub as hub
         except ImportError as e:
@@ -101,7 +102,7 @@ def transcribe(
         audio_data,
         sample_rate=16000,
         language=None,
-        max_steps=100,
+        max_steps=80,
         min_chunk_sec=20,
         max_chunk_sec=30,
         silence_top_db=40,
@@ -148,7 +149,7 @@ def transcribe(
         running_prompt = user_prompt or ""
         results = []
 
-        for (start, end) in final_chunks:
+        for start, end in final_chunks:
             chunk_data = audio_data[start:end]
 
             text = self._transcribe_single_chunk(
@@ -172,12 +173,13 @@ def transcribe(
 
         return " ".join(results).strip()
 
+
     def _transcribe_single_chunk(
         self,
         chunk_audio_data,
         sample_rate=16000,
         language=None,
-        max_steps=100,
+        max_steps=80,
         user_prompt=None,
     ):
         """
@@ -212,9 +214,7 @@ def _transcribe_single_chunk(
 
         # final check - everything is an integer
         if any(not isinstance(x, int) for x in start_ids):
-            raise ValueError(
-                f"start_ids contains a non-integer. start_ids={start_ids}"
-            )
+            raise ValueError(f"start_ids contains a non-integer. start_ids={start_ids}")
 
         # convert to TF tensor
         decoder_ids = self.tf.constant([start_ids], dtype=self.tf.int32)
@@ -244,6 +244,6 @@ def _transcribe_single_chunk(
                 break
 
         # slice out generated tokens - ignore the "start_ids"
-        final_ids = decoder_ids[0, len(start_ids):]
+        final_ids = decoder_ids[0, len(start_ids) :]
         text = self.tokenizer.detokenize(final_ids)
         return text.replace("<|endoftext|>", "").strip()
diff --git a/intelli/wrappers/keras_wrapper.py b/intelli/wrappers/keras_wrapper.py
@@ -22,7 +22,7 @@ def _load_model(self):
                 "keras_nlp is not installed or model is not supported."
             ) from e
 
-        if "KAGGLE_USERNAME" in self.model_params:
+        if self.model_params and "KAGGLE_USERNAME" in self.model_params:
             os.environ["KAGGLE_USERNAME"] = self.model_params["KAGGLE_USERNAME"]
             os.environ["KAGGLE_KEY"] = self.model_params["KAGGLE_KEY"]
 
@@ -40,7 +40,6 @@ def _load_model(self):
             )
         elif "whisper" in self.model_name:
             try:
-                print("---> whisper")
                 backbone = self.nlp_manager.models.WhisperBackbone.from_preset(
                     self.model_name
                 )
@@ -129,14 +128,14 @@ def fine_tune(
         self.model.fit(dataset, epochs=epochs, batch_size=batch_size)
 
     def transcript(
-    self,
-    audio_data,
-    sample_rate=16000,
-    language=None,
-    user_prompt=None,
-    condition_on_previous_text=False,
-    max_steps=100,
-    max_chunk_sec=30,
+        self,
+        audio_data,
+        sample_rate=16000,
+        language=None,
+        user_prompt=None,
+        condition_on_previous_text=False,
+        max_steps=80,
+        max_chunk_sec=30,
     ):
         """
         Convert speech to text using the Whisper model.
@@ -155,4 +154,3 @@ def transcript(
             user_prompt=user_prompt,
             condition_on_previous_text=condition_on_previous_text,
         )
-
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="intelli",
-    version="0.5.0",
+    version="0.5.1",
     author="Intellinode",
     author_email="[email protected]",
     description="Create your chatbot or AI agent using Intellinode. We make any model smarter.",