Create a controller for each condition
tennmoku71 committed Nov 29, 2023
1 parent f49cb17 commit fb81c03
Showing 11 changed files with 361 additions and 99 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@
 *.py[cod]
 *$py.class
 *.wav
+llm/models/*

 # C extensions
 *.so
122 changes: 42 additions & 80 deletions README.md
@@ -1,116 +1,78 @@
# Overview

A simple spoken-dialogue system built on an LLM.

# Setup

Register your Google and OpenAI keys as environment variables.

```
export GOOGLE_APPLICATION_CREDENTIALS=...
```
```
export OPENAI_API_KEY=...
```

Install the pip modules.
```
wget https://github.com/GoogleCloudPlatform/python-docs-samples/raw/main/speech/microphone/transcribe_streaming_infinite.py -O stt/google_stt.py
pip install -r requirements.txt
```

Then install VOICEVOX. For the concrete installation steps, see the official page.

https://voicevox.hiroshiba.jp/

# voice_interaction_base

The slowest variant. It waits for the end of Google STT's final result and only then sends the request to ChatGPT, but the quality of the response content is good.

```
python voice_interaction_base.py
```

# voice_interaction

The wait for Google STT's final result is shortened: webrtcvad detects the end of speech, and the latest (interim) transcript is sent to ChatGPT at that moment (see voice_interaction.py below).

```
python voice_interaction.py
```

# voice_interaction_stream

Speech synthesis is performed in a pseudo-realtime manner.

```
python voice_interaction_stream.py
```
# voice_interaction_llama2

Uses llama2 instead of ChatGPT. The model data must be prepared in advance.

```
# bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
make -j 8 LLAMA_CUBLAS=1

# python (download the base model)
import huggingface_hub
huggingface_hub.snapshot_download(repo_id='elyza/ELYZA-japanese-Llama-2-7b-instruct', cache_dir="original_models")
exit()

# bash (convert to gguf and quantize)
python3 convert.py original_models/models--elyza--ELYZA-japanese-Llama-2-7b-instruct/snapshots/48fa08b3098a23d3671e09565499a4cfbaff1923 --outfile gguf-models/elyza.gguf
./quantize gguf-models/elyza.gguf gguf-models/elyza-q8.gguf q8_0
```
Place the generated `gguf-models/elyza-q8.gguf` in `llm/models`.
Next, install llama-cpp-python.

```
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir -vv
```

Setup is now complete. Run:

```
python voice_interaction_stream.py
```

Removed by this commit (the old README documented the modifications made to the downloaded Google STT sample, stt/google_stt.py):

modified listen_print_loop

before
```
def listen_print_loop(responses: object, stream: object) -> object:
    [omitted]
        if result.is_final:
            sys.stdout.write(GREEN)
            sys.stdout.write("\033[K")
            sys.stdout.write(str(corrected_time) + ": " + transcript + "\n")

            stream.is_final_end_time = stream.result_end_time
            stream.last_transcript_was_final = True

            # Exit recognition if any of the transcribed phrases could be
            # one of our keywords.
            if re.search(r"\b(exit|quit)\b", transcript, re.I):
                sys.stdout.write(YELLOW)
                sys.stdout.write("Exiting...\n")
                stream.closed = True
                break
        else:
            sys.stdout.write(RED)
            sys.stdout.write("\033[K")
            sys.stdout.write(str(corrected_time) + ": " + transcript + "\r")

            stream.last_transcript_was_final = False

    return transcript
```
after
```
def listen_print_loop(responses: object, stream: object, callback_interim: object, callback_final: object) -> object:
    [omitted]
        if result.is_final:
            sys.stdout.write(GREEN)
            sys.stdout.write("\033[K")
            sys.stdout.write(str(corrected_time) + ": " + transcript + "\n")

            if callback_final != None:
                callback_final(transcript)

            stream.is_final_end_time = stream.result_end_time
            stream.last_transcript_was_final = True

            # Exit recognition if any of the transcribed phrases could be
            # one of our keywords.
            if re.search(r"\b(exit|quit)\b", transcript, re.I):
                sys.stdout.write(YELLOW)
                sys.stdout.write("Exiting...\n")
                stream.closed = True
                break
        else:
            sys.stdout.write(RED)
            sys.stdout.write("\033[K")
            sys.stdout.write(str(corrected_time) + ": " + transcript + "\r")

            if callback_interim != None:
                callback_interim(transcript)

            stream.last_transcript_was_final = False

    return transcript
```

modified main

before
```
def main() -> None:
    [omitted]
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=SAMPLE_RATE,
        language_code="en-US",
        max_alternatives=1,
    )
    [omitted]
    listen_print_loop(responses, stream)
```
after
```
def main(callback_interim, callback_final) -> None:
    [omitted]
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=SAMPLE_RATE,
        language_code="ja-JP",
        max_alternatives=1,
    )
    [omitted]
    listen_print_loop(responses, stream, callback_interim, callback_final)
```
29 changes: 29 additions & 0 deletions llm/README.md
@@ -0,0 +1,29 @@
```
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
# build with CUDA support (run from the repository root)
make -j 8 LLAMA_CUBLAS=1
mkdir -p gguf_models
# convert a downloaded model to gguf (f16)
python3 convert.py original_models/models--cyberagent--open-calm-7b/snapshots/276a5fb67510554e11ef191a2da44c919acccdf5/ \
    --vocabtype spm \
    --outtype f16 \
    --outfile gguf_models/open-calm-7b-f16.gguf
# quantize an f16 gguf to q8_0
./quantize \
    gguf-models/cyberagent/llama2-7b-chat-japanese/ggml-model-f16.gguf \
    gguf-models/cyberagent/llama2-7b-chat-japanese/ggml-model-q8_0.gguf q8_0
# interactive chat run
./main -m gguf-models/cyberagent/llama2-7b-chat-japanese/ggml-model-q8_0.gguf \
    -n 512 -c 0 \
    --repeat_penalty 1.0 \
    --color -i -r "User:" \
    -f prompts/chat-with-sota-kun.txt
```

Test:
```
./main -m gguf-models/cyberagent/llama2-7b-chat-japanese/ggml-model-q8_0.gguf --temp 0.1 -p "[INST]こんにちは。[/INST]" -ngl 32 -b 512
```
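For an equivalent check from Python instead of the CLI, a minimal llama-cpp-python smoke test might look like this; the model path follows the main README's `llm/models` placement and is an assumption, not something this file specifies:

```
# Hypothetical smoke test via llama-cpp-python; adjust model_path to where
# your quantized gguf actually lives.
from llama_cpp import Llama

llm = Llama(model_path="llm/models/elyza-q8.gguf", n_gpu_layers=32)
out = llm.create_chat_completion(
    [{"role": "user", "content": "こんにちは。"}],
    temperature=0.1,
)
print(out["choices"][0]["message"]["content"])
```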
2 changes: 1 addition & 1 deletion llm/chatgpt.py
@@ -14,7 +14,7 @@ def get(self, user_utterance):
             messages=self.dialogue_history,
             stream = self.valid_stream
         )
-        return completion.choices[0].message.content
+        return completion

     def set_agent_utterance(self, agent_utterance):
         self.dialogue_history.append({"role": "assistant", "content": agent_utterance})
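get() now returns the completion object itself rather than the extracted text, which lets the caller handle both the streaming and non-streaming cases. A sketch of how a caller might consume each case, assuming openai 1.x response shapes (voice_interaction.py below does the non-streaming variant):

```
# Sketch: consuming ChatGPT.get() after this change (openai 1.x shapes assumed).
from llm import chatgpt

llm = chatgpt.ChatGPT(valid_stream=False)
completion = llm.get("こんにちは")
print(completion.choices[0].message.content)  # one complete message

llm_stream = chatgpt.ChatGPT(valid_stream=True)
for chunk in llm_stream.get("こんにちは"):
    delta = chunk.choices[0].delta.content  # incremental tokens, may be None
    if delta is not None:
        print(delta, end="", flush=True)
```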
17 changes: 17 additions & 0 deletions llm/llama2.py
@@ -0,0 +1,17 @@
from llama_cpp import Llama

class Llama2():

    def __init__(self, valid_stream=True) -> None:
        self.llama = Llama(model_path="llm/models/elyza-q8.gguf", n_gpu_layers=50)
        self.valid_stream = valid_stream

    def get(self, user_utterance):
        streamer = self.llama.create_chat_completion(
            [{"role": "user", "content": f"""[INST] <<SYS>>\nあなたはアシスタントです。\n<</SYS>>\n\n{user_utterance}[/INST]"""}],
            stream=self.valid_stream
        )
        return streamer

    def set_agent_utterance(self, agent_utterance):
        pass
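Since get() returns whatever create_chat_completion() produces, a streaming caller iterates over chunk dicts. A usage sketch, assuming llama-cpp-python's chat chunks carry text under choices[0]["delta"]:

```
# Sketch: consuming the streamer returned by Llama2.get() (stream=True assumed).
from llm.llama2 import Llama2

llm = Llama2(valid_stream=True)
for chunk in llm.get("こんにちは"):
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        print(delta["content"], end="", flush=True)
```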
6 changes: 3 additions & 3 deletions requirements.txt
@@ -1,7 +1,7 @@
-openai
+openai==1.3.6
 google-api-python-client==2.86.0
 google-cloud-speech==2.19.0
 webrtcvad==2.0.10
 PyAudio==0.2.13
-soundfile
-playsound
+soundfile==0.12.1
+playsound==1.3.0
12 changes: 11 additions & 1 deletion tts/voicevox.py
@@ -28,6 +28,16 @@ def run_synthesis(query_data, speaker = 1):
         print('fail connect...', url)
         time.sleep(0.1)

+def extract_wav_length(query_data):
+    length = 0
+    for accent_phrase in query_data["accent_phrases"]:
+        for mora in accent_phrase["moras"]:
+            if mora["consonant_length"] != None:
+                length += mora["consonant_length"]
+            if mora["vowel_length"] != None:
+                length += mora["vowel_length"]
+    return length
+
 def get_audio_file_from_text(text):
     query_data = get_audio_query(text)
-    return run_synthesis(query_data)
+    return run_synthesis(query_data), extract_wav_length(query_data)
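extract_wav_length() estimates the duration of the synthesized speech by summing every mora's consonant and vowel lengths from the VOICEVOX audio query, so a caller gets a duration estimate without decoding the wav (anything in the query outside the moras is not counted). A small usage sketch, not part of the commit:

```
# Usage sketch (not part of the commit): get the wav bytes together with the
# estimated duration, assuming the query's mora lengths are in seconds.
import time
from tts import voicevox

wav_data, length = voicevox.get_audio_file_from_text("こんにちは")
with open("tmp.wav", mode="bw") as f:
    f.write(wav_data)
print("estimated duration:", length)
# if playback runs on another thread, the estimate tells us when it should end
time.sleep(length)
```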
30 changes: 16 additions & 14 deletions voice_interaction.py
@@ -4,18 +4,23 @@
 from tts import voicevox
 import threading
 from playsound import playsound
+import time

 class Main():

     def __init__(self) -> None:
+        self.valid_stream = False
         vad = google_vad.GOOGLE_WEBRTC()
         vad_thread = threading.Thread(target=vad.vad_loop, args=(self.callback_vad, ))
         stt_thread = threading.Thread(target=google_stt.main, args=(self.callback_interim, self.callback_final,))
-        self.llm = chatgpt.ChatGPT(valid_stream=False)
+        self.llm = chatgpt.ChatGPT(valid_stream=self.valid_stream)

         self.latest_user_utterance = None
         self.finished_user_speeching = False

+        # for measurement
+        self.time_user_speeching_end = None
+
         stt_thread.start()
         vad_thread.start()
@@ -26,38 +31,35 @@ def wait(self):
             thread.join()

     def callback_interim(self, user_utterance):
-        print("interim", user_utterance)
         self.latest_user_utterance = user_utterance

     def callback_final(self, user_utterance):
-        print("final", user_utterance)
         self.latest_user_utterance = user_utterance

     def callback_vad(self, flag):
-        print("vad", flag)
         if flag == True:
             self.latest_user_utterance = None
         elif self.latest_user_utterance != None:
+            self.time_user_speeching_end = time.time()
             threading.Thread(target=self.main_process, args=(self.latest_user_utterance,)).start()

     def main_process(self, user_utterance):
         llm_result = self.llm.get(user_utterance)
-        print("llm end", llm_result)
-        if type(llm_result) == str:
-            print(llm_result)
-            wav_data = voicevox.get_audio_file_from_text(llm_result)
+        if self.valid_stream == False:
+            agent_utterance = llm_result.choices[0].message.content
+            wav_data, _ = voicevox.get_audio_file_from_text(agent_utterance)
             self.audio_play(wav_data)
         else:
             print(llm_result)

     def audio_play(self, wav_data):
-        with open("tmp.wav", mode='bx') as f:
+        start_time = time.time()
+        with open("tmp.wav", mode='bw') as f:
             f.write(wav_data)
+        if self.time_user_speeching_end != None:
+            print("応答までの時間", time.time() - self.time_user_speeching_end)
+            self.time_user_speeching_end = None
         playsound("tmp.wav")


 if __name__ == '__main__':
     ins = Main()
     ins.wait()
63 changes: 63 additions & 0 deletions voice_interaction_base.py
@@ -0,0 +1,63 @@
from stt import google_stt
# from vad import google_vad
from llm import chatgpt
from tts import voicevox
import threading
from playsound import playsound
import time

class Main():

    def __init__(self) -> None:
        stt_thread = threading.Thread(target=google_stt.main, args=(self.callback_interim, self.callback_final,))
        self.llm = chatgpt.ChatGPT(valid_stream=False)

        self.latest_user_utterance = None
        self.finished_user_speeching = False

        # for measurement
        # vad = google_vad.GOOGLE_WEBRTC()
        # vad_thread = threading.Thread(target=vad.vad_loop, args=(self.callback_vad, ))
        # vad_thread.start()
        # self.time_user_speeching_end = None

        stt_thread.start()

    def wait(self):
        thread_list = threading.enumerate()
        thread_list.remove(threading.main_thread())
        for thread in thread_list:
            thread.join()

    def callback_interim(self, user_utterance):
        self.latest_user_utterance = user_utterance

    def callback_final(self, user_utterance):
        self.latest_user_utterance = user_utterance
        threading.Thread(target=self.main_process, args=(self.latest_user_utterance,)).start()

    # def callback_vad(self, flag):
    #     if flag == False:
    #         print("vad")
    #         self.time_user_speeching_end = time.time()

    def main_process(self, user_utterance):
        llm_result = self.llm.get(user_utterance)
        wav_data, _ = voicevox.get_audio_file_from_text(llm_result.choices[0].message.content)
        self.audio_play(wav_data)

    def audio_play(self, wav_data):
        with open("tmp.wav", mode='bw') as f:
            f.write(wav_data)
        # if self.time_user_speeching_end != None:
        #     print("応答までの時間", time.time() - self.time_user_speeching_end)
        self.time_user_speeching_end = None
        playsound("tmp.wav")


if __name__ == '__main__':
    ins = Main()
    ins.wait()
