-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f49cb17
commit fb81c03
Showing
11 changed files
with
361 additions
and
99 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
*.py[cod] | ||
*$py.class | ||
*.wav | ||
llm/models/* | ||
|
||
# C extensions | ||
*.so | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,116 +1,78 @@ | ||
# 概要 | ||
|
||
LLMを用いたシンプルな音声対話システムです。 | ||
|
||
# 環境設定 | ||
|
||
googleとopenaiのキーをそれぞれ環境変数に登録してください。 | ||
|
||
``` | ||
export GOOGLE_APPLICATION_CREDENTIALS=... | ||
``` | ||
``` | ||
export OPENAI_API_KEY=... | ||
``` | ||
|
||
|
||
pipモジュールをインストールします。 | ||
``` | ||
wget https://github.com/GoogleCloudPlatform/python-docs-samples/raw/main/speech/microphone/transcribe_streaming_infinite.py -O stt/google_stt.py | ||
pip install -r requirements.txt | ||
``` | ||
modified listen_print_loop | ||
|
||
before | ||
``` | ||
def listen_print_loop(responses: object, stream: object) -> object: | ||
[中略] | ||
その後、voicevoxをインストールしてください。具体的なインストール方法は公式ページを参照してください。 | ||
|
||
if result.is_final: | ||
sys.stdout.write(GREEN) | ||
sys.stdout.write("\033[K") | ||
sys.stdout.write(str(corrected_time) + ": " + transcript + "\n") | ||
https://voicevox.hiroshiba.jp/ | ||
|
||
stream.is_final_end_time = stream.result_end_time | ||
stream.last_transcript_was_final = True | ||
|
||
# Exit recognition if any of the transcribed phrases could be | ||
# one of our keywords. | ||
if re.search(r"\b(exit|quit)\b", transcript, re.I): | ||
sys.stdout.write(YELLOW) | ||
sys.stdout.write("Exiting...\n") | ||
stream.closed = True | ||
break | ||
else: | ||
sys.stdout.write(RED) | ||
sys.stdout.write("\033[K") | ||
sys.stdout.write(str(corrected_time) + ": " + transcript + "\r") | ||
# voice_interaction_base | ||
|
||
stream.last_transcript_was_final = False | ||
一番遅いモデルです。Google STTのfinalの終わりまで待ち、ChatGPTにリクエストを投げます。ただし応答内容の精度は良いです。 | ||
|
||
return transcript | ||
``` | ||
after | ||
python voice_interaction_base.py | ||
``` | ||
def listen_print_loop(responses: object, stream: object, callback_interim: object, callback_final: object) -> object: | ||
[中略] | ||
if result.is_final: | ||
sys.stdout.write(GREEN) | ||
sys.stdout.write("\033[K") | ||
sys.stdout.write(str(corrected_time) + ": " + transcript + "\n") | ||
|
||
if callback_final != None: | ||
callback_final(transcript) | ||
# voice_interaction | ||
|
||
stream.is_final_end_time = stream.result_end_time | ||
stream.last_transcript_was_final = True | ||
Google STTのfinalが出力されるまでの時間が短縮されています。 | ||
|
||
# Exit recognition if any of the transcribed phrases could be | ||
# one of our keywords. | ||
if re.search(r"\b(exit|quit)\b", transcript, re.I): | ||
sys.stdout.write(YELLOW) | ||
sys.stdout.write("Exiting...\n") | ||
stream.closed = True | ||
break | ||
else: | ||
sys.stdout.write(RED) | ||
sys.stdout.write("\033[K") | ||
sys.stdout.write(str(corrected_time) + ": " + transcript + "\r") | ||
``` | ||
python voice_interaction.py | ||
``` | ||
|
||
if callback_interim != None: | ||
callback_interim(transcript) | ||
# voice_interaction_stream | ||
|
||
stream.last_transcript_was_final = False | ||
リアルタイムもどきの方法で音声合成が行われます。 | ||
|
||
return transcript | ||
``` | ||
|
||
modified main | ||
python voice_interaction_stream.py | ||
``` | ||
def main() -> None: | ||
|
||
[中略] | ||
# voice_interaction_llama2 | ||
|
||
config = speech.RecognitionConfig( | ||
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, | ||
sample_rate_hertz=SAMPLE_RATE, | ||
language_code="en-US", | ||
max_alternatives=1, | ||
) | ||
ChatGPTではなくllama2を用います。事前にモデルデータを準備しておく必要があります。 | ||
|
||
[中略] | ||
listen_print_loop(responses, stream) | ||
``` | ||
# bash | ||
> git clone https://github.com/ggerganov/llama.cpp | ||
> cd llama.cpp | ||
> make -j 8 LLAMA_CUBLAS=1 | ||
> python | ||
import huggingface_hub | ||
huggingface_hub.snapshot_download(repo_id='elyza/ELYZA-japanese-Llama-2-7b-instruct', cache_dir="original_models") | ||
exit() | ||
> python3 convert.py original_models/models--elyza--ELYZA-japanese-Llama-2-7b-instruct/snapshots/48fa08b3098a23d3671e09565499a4cfbaff1923 --outfile gguf-models/elyza.gguf | ||
> ./quantize gguf-models/elyza.gguf gguf-models/elyza-q8.gguf q8_0 | ||
``` | ||
def main(callback_interim, callback_final) -> None: | ||
[中略] | ||
|
||
config = speech.RecognitionConfig( | ||
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, | ||
sample_rate_hertz=SAMPLE_RATE, | ||
language_code="ja-JP", | ||
max_alternatives=1, | ||
) | ||
生成された`gguf-models/elyza-q8.gguf`を`llm/models`に配置してください。 | ||
次に、llama-cpp-pythonをインストールします。 | ||
|
||
[中略] | ||
listen_print_loop(responses, stream, callback_interim, callback_final) | ||
``` | ||
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir -vv | ||
``` | ||
|
||
以上で準備は完了です。実行してください。 | ||
|
||
|
||
``` | ||
python voice_interaction_stream.py | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
``` | ||
git clone https://github.com/ggerganov/llama.cpp | ||
cd llama.cpp | ||
mkdir build | ||
cd build | ||
make -j 8 LLAMA_CUBLAS=1 | ||
mkdir -p gguf_models | ||
python3 convert.py original_models/models--cyberagent--open-calm-7b/snapshots/276a5fb67510554e11ef191a2da44c919acccdf5/ \ | ||
--vocabtype spm \ | ||
--outtype f16 \ | ||
--outfile gguf_models/open-calm-7b-f16.gguf | ||
./quantize \
    gguf_models/cyberagent/llama2-7b-chat-japanese/ggml-model-f16.gguf \
    gguf_models/cyberagent/llama2-7b-chat-japanese/ggml-model-q8_0.gguf q8_0
./main -m gguf_models/cyberagent/llama2-7b-chat-japanese/ggml-model-q8_0.gguf \
    -n 512 -c 0 \
    --repeat_penalty 1.0 \
    --color -i -r "User:" \
    -f prompts/chat-with-sota-kun.txt
``` | ||
test | ||
``` | ||
./main -m gguf_models/cyberagent/llama2-7b-chat-japanese/ggml-model-q8_0.gguf --temp 0.1 -p "[INST]こんにちは。[/INST]" -ngl 32 -b 512
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from llama_cpp import Llama | ||
|
||
class Llama2():
    """Thin wrapper around a local llama.cpp model (quantized ELYZA Llama-2 7B).

    Exposes the same minimal interface as the other LLM backends in this
    project: ``get`` to request a completion and ``set_agent_utterance``
    to record the agent's reply (a no-op here, as no history is kept).
    """

    def __init__(self, valid_stream=True) -> None:
        # Load the quantized GGUF model; offload up to 50 layers to the GPU.
        # NOTE(review): model path is hard-coded relative to the project root.
        self.llama = Llama(model_path="llm/models/elyza-q8.gguf", n_gpu_layers=50)
        self.valid_stream = valid_stream

    def get(self, user_utterance):
        """Query the model with the user's utterance.

        Returns either a streaming iterator or a completed response object,
        depending on ``valid_stream``.
        """
        prompt = f"""[INST] <<SYS>>\nあなたはアシスタントです。\n<</SYS>>\n\n{user_utterance}[/INST]"""
        messages = [{"role":"user", "content": prompt}]
        return self.llama.create_chat_completion(messages, stream=self.valid_stream)

    def set_agent_utterance(self, agent_utterance):
        # Stateless backend: dialogue history is intentionally not tracked.
        pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
openai | ||
openai==1.3.6 | ||
google-api-python-client==2.86.0 | ||
google-cloud-speech==2.19.0 | ||
webrtcvad==2.0.10 | ||
PyAudio==0.2.13 | ||
soundfile | ||
playsound | ||
soundfile==0.12.1 | ||
playsound==1.3.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
from stt import google_stt | ||
# from vad import google_vad | ||
from llm import chatgpt | ||
from tts import voicevox | ||
import threading | ||
from playsound import playsound | ||
import time | ||
|
||
class Main():
    """Voice-interaction pipeline: Google STT -> ChatGPT -> VOICEVOX playback.

    Speech recognition runs on a background thread and reports results
    through ``callback_interim`` / ``callback_final``; each final result
    triggers an LLM query and audio playback on its own thread.
    """

    def __init__(self) -> None:
        # Recognition runs on its own thread; results arrive via the two callbacks.
        recognizer_thread = threading.Thread(
            target=google_stt.main,
            args=(self.callback_interim, self.callback_final,),
        )
        # Non-streaming mode: wait for the complete LLM response before TTS.
        self.llm = chatgpt.ChatGPT(valid_stream=False)

        self.latest_user_utterance = None
        self.finished_user_speeching = False

        # Latency measurement hooks (currently disabled).
        # vad = google_vad.GOOGLE_WEBRTC()
        # vad_thread = threading.Thread(target=vad.vad_loop, args=(self.callback_vad, ))
        # vad_thread.start()
        # self.time_user_speeching_end = None

        recognizer_thread.start()

    def wait(self):
        """Block until every worker thread (everything but main) has finished."""
        for worker in threading.enumerate():
            if worker is threading.main_thread():
                continue
            worker.join()

    def callback_interim(self, user_utterance):
        # Interim (non-final) STT hypothesis: just remember the latest text.
        self.latest_user_utterance = user_utterance

    def callback_final(self, user_utterance):
        # Final STT result: hand off LLM + TTS work to a separate thread so
        # recognition keeps running while the response is produced.
        self.latest_user_utterance = user_utterance
        threading.Thread(target=self.main_process, args=(self.latest_user_utterance,)).start()

    # def callback_vad(self, flag):
    #     if flag == False:
    #         print("vad")
    #         self.time_user_speeching_end = time.time()

    def main_process(self, user_utterance):
        """Query the LLM with the recognized text, synthesize and play the reply."""
        llm_result = self.llm.get(user_utterance)
        reply_text = llm_result.choices[0].message.content
        wav_data, _ = voicevox.get_audio_file_from_text(reply_text)
        self.audio_play(wav_data)

    def audio_play(self, wav_data):
        """Write the synthesized WAV bytes to a temp file and play it back."""
        with open("tmp.wav", mode='bw') as f:
            f.write(wav_data)
        # if self.time_user_speeching_end != None:
        #     print("応答までの時間", time.time() - self.time_user_speeching_end)
        self.time_user_speeching_end = None
        playsound("tmp.wav")
||
|
||
if __name__ == '__main__':
    # Start the pipeline, then keep the process alive until all workers exit.
    app = Main()
    app.wait()
|
||
|
||
|
Oops, something went wrong.