Create a controller for each condition
tennmoku71 committed Nov 29, 2023
1 parent f49cb17 commit fb81c03
Showing 11 changed files with 361 additions and 99 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@
 *.py[cod]
 *$py.class
 *.wav
+llm/models/*

 # C extensions
 *.so
122 changes: 42 additions & 80 deletions README.md
@@ -1,116 +1,78 @@
# Overview

A simple spoken-dialogue system built on an LLM.

# Setup

Register your Google and OpenAI keys as environment variables.

```
export GOOGLE_APPLICATION_CREDENTIALS=...
```
```
export OPENAI_API_KEY=...
```

Install the pip modules.
```
wget https://github.com/GoogleCloudPlatform/python-docs-samples/raw/main/speech/microphone/transcribe_streaming_infinite.py -O stt/google_stt.py
pip install -r requirements.txt
```

Then install VOICEVOX. For the concrete installation steps, see the official page.

https://voicevox.hiroshiba.jp/

# voice_interaction_base

The slowest variant. It waits for the end of Google STT's final result and only then sends the request to ChatGPT, but the quality of the response content is good.

```
python voice_interaction_base.py
```

# voice_interaction

The wait for Google STT's final result is shortened: webrtcvad detects the end of speech, and the latest (interim) transcript is sent to ChatGPT at that moment (see voice_interaction.py below).

```
python voice_interaction.py
```

# voice_interaction_stream

Speech synthesis is performed in a pseudo-realtime manner.

```
python voice_interaction_stream.py
```
# voice_interaction_llama2

Uses llama2 instead of ChatGPT. The model data must be prepared in advance.

```
# bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
make -j 8 LLAMA_CUBLAS=1

# python (download the base model)
import huggingface_hub
huggingface_hub.snapshot_download(repo_id='elyza/ELYZA-japanese-Llama-2-7b-instruct', cache_dir="original_models")
exit()

# bash (convert to gguf and quantize)
python3 convert.py original_models/models--elyza--ELYZA-japanese-Llama-2-7b-instruct/snapshots/48fa08b3098a23d3671e09565499a4cfbaff1923 --outfile gguf-models/elyza.gguf
./quantize gguf-models/elyza.gguf gguf-models/elyza-q8.gguf q8_0
```
Place the generated `gguf-models/elyza-q8.gguf` in `llm/models`.
Next, install llama-cpp-python.

```
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir -vv
```

Setup is now complete. Run:

```
python voice_interaction_stream.py
```

Removed by this commit (the old README documented the modifications made to the downloaded Google STT sample, stt/google_stt.py):

modified listen_print_loop

before
```
def listen_print_loop(responses: object, stream: object) -> object:
    [omitted]
        if result.is_final:
            sys.stdout.write(GREEN)
            sys.stdout.write("\033[K")
            sys.stdout.write(str(corrected_time) + ": " + transcript + "\n")

            stream.is_final_end_time = stream.result_end_time
            stream.last_transcript_was_final = True

            # Exit recognition if any of the transcribed phrases could be
            # one of our keywords.
            if re.search(r"\b(exit|quit)\b", transcript, re.I):
                sys.stdout.write(YELLOW)
                sys.stdout.write("Exiting...\n")
                stream.closed = True
                break
        else:
            sys.stdout.write(RED)
            sys.stdout.write("\033[K")
            sys.stdout.write(str(corrected_time) + ": " + transcript + "\r")

            stream.last_transcript_was_final = False

    return transcript
```
after
```
def listen_print_loop(responses: object, stream: object, callback_interim: object, callback_final: object) -> object:
    [omitted]
        if result.is_final:
            sys.stdout.write(GREEN)
            sys.stdout.write("\033[K")
            sys.stdout.write(str(corrected_time) + ": " + transcript + "\n")

            if callback_final != None:
                callback_final(transcript)

            stream.is_final_end_time = stream.result_end_time
            stream.last_transcript_was_final = True

            # Exit recognition if any of the transcribed phrases could be
            # one of our keywords.
            if re.search(r"\b(exit|quit)\b", transcript, re.I):
                sys.stdout.write(YELLOW)
                sys.stdout.write("Exiting...\n")
                stream.closed = True
                break
        else:
            sys.stdout.write(RED)
            sys.stdout.write("\033[K")
            sys.stdout.write(str(corrected_time) + ": " + transcript + "\r")

            if callback_interim != None:
                callback_interim(transcript)

            stream.last_transcript_was_final = False

    return transcript
```

modified main

before
```
def main() -> None:
    [omitted]
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=SAMPLE_RATE,
        language_code="en-US",
        max_alternatives=1,
    )
    [omitted]
    listen_print_loop(responses, stream)
```
after
```
def main(callback_interim, callback_final) -> None:
    [omitted]
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=SAMPLE_RATE,
        language_code="ja-JP",
        max_alternatives=1,
    )
    [omitted]
    listen_print_loop(responses, stream, callback_interim, callback_final)
```
29 changes: 29 additions & 0 deletions llm/README.md
@@ -0,0 +1,29 @@
```
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
# build with CUDA support (run from the repository root)
make -j 8 LLAMA_CUBLAS=1
mkdir -p gguf_models
# convert a downloaded model to gguf (f16)
python3 convert.py original_models/models--cyberagent--open-calm-7b/snapshots/276a5fb67510554e11ef191a2da44c919acccdf5/ \
    --vocabtype spm \
    --outtype f16 \
    --outfile gguf_models/open-calm-7b-f16.gguf
# quantize an f16 gguf to q8_0
./quantize \
    gguf-models/cyberagent/llama2-7b-chat-japanese/ggml-model-f16.gguf \
    gguf-models/cyberagent/llama2-7b-chat-japanese/ggml-model-q8_0.gguf q8_0
# interactive chat run
./main -m gguf-models/cyberagent/llama2-7b-chat-japanese/ggml-model-q8_0.gguf \
    -n 512 -c 0 \
    --repeat_penalty 1.0 \
    --color -i -r "User:" \
    -f prompts/chat-with-sota-kun.txt
```

Test:
```
./main -m gguf-models/cyberagent/llama2-7b-chat-japanese/ggml-model-q8_0.gguf --temp 0.1 -p "[INST]こんにちは。[/INST]" -ngl 32 -b 512
```
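For an equivalent check from Python instead of the CLI, a minimal llama-cpp-python smoke test might look like this; the model path follows the main README's `llm/models` placement and is an assumption, not something this file specifies:

```
# Hypothetical smoke test via llama-cpp-python; adjust model_path to where
# your quantized gguf actually lives.
from llama_cpp import Llama

llm = Llama(model_path="llm/models/elyza-q8.gguf", n_gpu_layers=32)
out = llm.create_chat_completion(
    [{"role": "user", "content": "こんにちは。"}],
    temperature=0.1,
)
print(out["choices"][0]["message"]["content"])
```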
2 changes: 1 addition & 1 deletion llm/chatgpt.py
@@ -14,7 +14,7 @@ def get(self, user_utterance):
             messages=self.dialogue_history,
             stream = self.valid_stream
         )
-        return completion.choices[0].message.content
+        return completion

     def set_agent_utterance(self, agent_utterance):
         self.dialogue_history.append({"role": "assistant", "content": agent_utterance})
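get() now returns the completion object itself rather than the extracted text, which lets the caller handle both the streaming and non-streaming cases. A sketch of how a caller might consume each case, assuming openai 1.x response shapes (voice_interaction.py below does the non-streaming variant):

```
# Sketch: consuming ChatGPT.get() after this change (openai 1.x shapes assumed).
from llm import chatgpt

llm = chatgpt.ChatGPT(valid_stream=False)
completion = llm.get("こんにちは")
print(completion.choices[0].message.content)  # one complete message

llm_stream = chatgpt.ChatGPT(valid_stream=True)
for chunk in llm_stream.get("こんにちは"):
    delta = chunk.choices[0].delta.content  # incremental tokens, may be None
    if delta is not None:
        print(delta, end="", flush=True)
```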
17 changes: 17 additions & 0 deletions llm/llama2.py
@@ -0,0 +1,17 @@
from llama_cpp import Llama

class Llama2():

    def __init__(self, valid_stream=True) -> None:
        self.llama = Llama(model_path="llm/models/elyza-q8.gguf", n_gpu_layers=50)
        self.valid_stream = valid_stream

    def get(self, user_utterance):
        streamer = self.llama.create_chat_completion(
            [{"role": "user", "content": f"""[INST] <<SYS>>\nあなたはアシスタントです。\n<</SYS>>\n\n{user_utterance}[/INST]"""}],
            stream=self.valid_stream
        )
        return streamer

    def set_agent_utterance(self, agent_utterance):
        pass
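Since get() returns whatever create_chat_completion() produces, a streaming caller iterates over chunk dicts. A usage sketch, assuming llama-cpp-python's chat chunks carry text under choices[0]["delta"]:

```
# Sketch: consuming the streamer returned by Llama2.get() (stream=True assumed).
from llm.llama2 import Llama2

llm = Llama2(valid_stream=True)
for chunk in llm.get("こんにちは"):
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        print(delta["content"], end="", flush=True)
```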
6 changes: 3 additions & 3 deletions requirements.txt
@@ -1,7 +1,7 @@
-openai
+openai==1.3.6
 google-api-python-client==2.86.0
 google-cloud-speech==2.19.0
 webrtcvad==2.0.10
 PyAudio==0.2.13
-soundfile
-playsound
+soundfile==0.12.1
+playsound==1.3.0
12 changes: 11 additions & 1 deletion tts/voicevox.py
@@ -28,6 +28,16 @@ def run_synthesis(query_data, speaker = 1):
         print('fail connect...', url)
         time.sleep(0.1)

+def extract_wav_length(query_data):
+    length = 0
+    for accent_phrase in query_data["accent_phrases"]:
+        for mora in accent_phrase["moras"]:
+            if mora["consonant_length"] != None:
+                length += mora["consonant_length"]
+            if mora["vowel_length"] != None:
+                length += mora["vowel_length"]
+    return length
+
 def get_audio_file_from_text(text):
     query_data = get_audio_query(text)
-    return run_synthesis(query_data)
+    return run_synthesis(query_data), extract_wav_length(query_data)
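extract_wav_length() estimates the duration of the synthesized speech by summing every mora's consonant and vowel lengths from the VOICEVOX audio query, so a caller gets a duration estimate without decoding the wav (anything in the query outside the moras is not counted). A small usage sketch, not part of the commit:

```
# Usage sketch (not part of the commit): get the wav bytes together with the
# estimated duration, assuming the query's mora lengths are in seconds.
import time
from tts import voicevox

wav_data, length = voicevox.get_audio_file_from_text("こんにちは")
with open("tmp.wav", mode="bw") as f:
    f.write(wav_data)
print("estimated duration:", length)
# if playback runs on another thread, the estimate tells us when it should end
time.sleep(length)
```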
30 changes: 16 additions & 14 deletions voice_interaction.py
@@ -4,18 +4,23 @@
 from tts import voicevox
 import threading
 from playsound import playsound
+import time

 class Main():

     def __init__(self) -> None:
+        self.valid_stream = False
         vad = google_vad.GOOGLE_WEBRTC()
         vad_thread = threading.Thread(target=vad.vad_loop, args=(self.callback_vad, ))
         stt_thread = threading.Thread(target=google_stt.main, args=(self.callback_interim, self.callback_final,))
-        self.llm = chatgpt.ChatGPT(valid_stream=False)
+        self.llm = chatgpt.ChatGPT(valid_stream=self.valid_stream)

         self.latest_user_utterance = None
         self.finished_user_speeching = False

+        # for measurement
+        self.time_user_speeching_end = None
+
         stt_thread.start()
         vad_thread.start()
@@ -26,38 +31,35 @@ def wait(self):
             thread.join()

     def callback_interim(self, user_utterance):
-        print("interim", user_utterance)
         self.latest_user_utterance = user_utterance

     def callback_final(self, user_utterance):
-        print("final", user_utterance)
         self.latest_user_utterance = user_utterance

     def callback_vad(self, flag):
-        print("vad", flag)
         if flag == True:
             self.latest_user_utterance = None
         elif self.latest_user_utterance != None:
+            self.time_user_speeching_end = time.time()
             threading.Thread(target=self.main_process, args=(self.latest_user_utterance,)).start()

     def main_process(self, user_utterance):
         llm_result = self.llm.get(user_utterance)
-        print("llm end", llm_result)
-        if type(llm_result) == str:
-            print(llm_result)
-            wav_data = voicevox.get_audio_file_from_text(llm_result)
+        if self.valid_stream == False:
+            agent_utterance = llm_result.choices[0].message.content
+            wav_data, _ = voicevox.get_audio_file_from_text(agent_utterance)
             self.audio_play(wav_data)
         else:
             print(llm_result)

     def audio_play(self, wav_data):
-        with open("tmp.wav", mode='bx') as f:
+        start_time = time.time()
+        with open("tmp.wav", mode='bw') as f:
             f.write(wav_data)
+        if self.time_user_speeching_end != None:
+            print("応答までの時間", time.time() - self.time_user_speeching_end)
+            self.time_user_speeching_end = None
         playsound("tmp.wav")


 if __name__ == '__main__':
     ins = Main()
     ins.wait()
63 changes: 63 additions & 0 deletions voice_interaction_base.py
@@ -0,0 +1,63 @@
from stt import google_stt
# from vad import google_vad
from llm import chatgpt
from tts import voicevox
import threading
from playsound import playsound
import time

class Main():

    def __init__(self) -> None:
        stt_thread = threading.Thread(target=google_stt.main, args=(self.callback_interim, self.callback_final,))
        self.llm = chatgpt.ChatGPT(valid_stream=False)

        self.latest_user_utterance = None
        self.finished_user_speeching = False

        # for measurement
        # vad = google_vad.GOOGLE_WEBRTC()
        # vad_thread = threading.Thread(target=vad.vad_loop, args=(self.callback_vad, ))
        # vad_thread.start()
        # self.time_user_speeching_end = None

        stt_thread.start()

    def wait(self):
        thread_list = threading.enumerate()
        thread_list.remove(threading.main_thread())
        for thread in thread_list:
            thread.join()

    def callback_interim(self, user_utterance):
        self.latest_user_utterance = user_utterance

    def callback_final(self, user_utterance):
        self.latest_user_utterance = user_utterance
        threading.Thread(target=self.main_process, args=(self.latest_user_utterance,)).start()

    # def callback_vad(self, flag):
    #     if flag == False:
    #         print("vad")
    #         self.time_user_speeching_end = time.time()

    def main_process(self, user_utterance):
        llm_result = self.llm.get(user_utterance)
        wav_data, _ = voicevox.get_audio_file_from_text(llm_result.choices[0].message.content)
        self.audio_play(wav_data)

    def audio_play(self, wav_data):
        with open("tmp.wav", mode='bw') as f:
            f.write(wav_data)
        # if self.time_user_speeching_end != None:
        #     print("応答までの時間", time.time() - self.time_user_speeching_end)
        self.time_user_speeching_end = None
        playsound("tmp.wav")


if __name__ == '__main__':
    ins = Main()
    ins.wait()
