From 1caa7739f8ae79207b01dfa2e905b7f7ec77d0c6 Mon Sep 17 00:00:00 2001
From: kashi-x
Date: Mon, 13 May 2024 04:17:52 +0900
Subject: [PATCH] add init

---
 README.md                                    |   1 -
 README_jp.md                                 |  12 +
 src/emo.py                                   |  50 ----
 src/emo_trial.py                             |  44 ----
 src/f.py                                     | 105 ---------
 src/fast.py                                  | 110 ---------
 src/fast2.py                                 | 120 ----------
 src/fast3.py                                 | 137 -----------
 src/fast4.py                                 | 133 -----------
 src/fast5.py                                 | 126 ----------
 src/fast6.py                                 | 113 ---------
 src/fast7.py                                 | 148 ------------
 src/fast8.py                                 | 125 ----------
 src/fast9.py                                 | 229 -------------------
 dev/b.py => src/fast_voice2word.py           |   0
 src/foo.py                                   |  38 ---
 dev/a.py => src/word2emotion_and_plotting.py |  23 +-
 17 files changed, 27 insertions(+), 1487 deletions(-)
 delete mode 100644 README.md
 create mode 100644 README_jp.md
 delete mode 100644 src/emo.py
 delete mode 100644 src/emo_trial.py
 delete mode 100644 src/f.py
 delete mode 100644 src/fast.py
 delete mode 100644 src/fast2.py
 delete mode 100644 src/fast3.py
 delete mode 100644 src/fast4.py
 delete mode 100644 src/fast5.py
 delete mode 100644 src/fast6.py
 delete mode 100644 src/fast7.py
 delete mode 100644 src/fast8.py
 delete mode 100644 src/fast9.py
 rename dev/b.py => src/fast_voice2word.py (100%)
 delete mode 100644 src/foo.py
 rename dev/a.py => src/word2emotion_and_plotting.py (88%)

diff --git a/README.md b/README.md
deleted file mode 100644
index 6f7a1df..0000000
--- a/README.md
+++ /dev/null
@@ -1 +0,0 @@
-# fast_word_emotion_analysis
\ No newline at end of file
diff --git a/README_jp.md b/README_jp.md
new file mode 100644
index 0000000..06af717
--- /dev/null
+++ b/README_jp.md
@@ -0,0 +1,12 @@
+# fast_word_emotion_analysis
+
+Takes audio input and, at each syllable break, computes the emotion of the speech heard so far.
+Via an Arduino, it sends a current to the neck of someone who is not listening (or cannot hear), making them perform an appropriate nod that matches the detected emotion.
+The Arduino code... exists only on the actual device.
+
+How to run
+
+Run the two scripts inside src. Both run their processing asynchronously.
+(Each passes its analysis results to the other through file I/O. The PC supplied for the demo was underpowered, so the two processes are synchronized via file I/O, which makes it possible to offload part of the processing to a phone at hand.)
+The first run takes a while because the Whisper and BERT models have to be downloaded.
+CUDA and the audio input also have to be managed, so there are many environment-dependent details that need to be adjusted.
diff --git a/src/emo.py b/src/emo.py
deleted file mode 100644
index ef21bfd..0000000
--- a/src/emo.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from transformers import pipeline, AutoModelForSequenceClassification, BertJapaneseTokenizer
-import numpy as np
-import matplotlib.pyplot as plt
-
-
-model = AutoModelForSequenceClassification.from_pretrained(
-    "patrickramos/bert-base-japanese-v2-wrime-fine-tune"
-)
-tokenizer = BertJapaneseTokenizer.from_pretrained(
-    "cl-tohoku/bert-base-japanese-whole-word-masking"
-)
-nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
-
-
-# pipelineでの感情分析結果
-results = nlp("私はとっても幸せ")
-
-# readerに関する結果のみをフィルタリング
-reader_results = [result for result in results if "reader_" in result["label"]]
-values = [result["score"] for result in reader_results]
-
-# 感情の日本語訳
-emotion_translation = {
-    "surprise": "驚き",
-    "sadness": "悲しみ",
-    "fear": "恐れ",
-    "disgust": "嫌悪",
-    "anger": "怒り",
-    "anticipation": "期待",
-    "joy": "喜び",
-    "trust": "信頼",
-}
-
-# ラベルを日本語に変換
-labels = [emotion_translation[result["label"].split("_")[1]] for result in reader_results]
-
-# N-gramの設定 (ここでは2-gram)
-N = 2
-ngram_values = [np.mean(values[i : i + N]) for i in range(len(values) - N + 1)]
-ngram_labels = [f"{labels[i]}-{labels[i+1]}" for i in range(len(labels) - N + 1)]
-
-# プロット
-plt.figure(figsize=(10, 7))
-plt.bar(range(len(ngram_values)), ngram_values, color="skyblue", align="center")
-plt.xticks(range(len(ngram_values)), ngram_labels, rotation=45)
-plt.xlabel("N-gramの感情ペア")
-plt.ylabel("平均スコア")
-plt.title("Readerの感情分析 (N-gram)") -plt.tight_layout() -plt.show() diff --git a/src/emo_trial.py b/src/emo_trial.py deleted file mode 100644 index 2a1703e..0000000 --- a/src/emo_trial.py +++ /dev/null @@ -1,44 +0,0 @@ -import matplotlib.pyplot as plt -import matplotlib.animation as animation -import numpy as np -from transformers import AutoTokenizer, AutoModelForSequenceClassification -import torch - -plt.rcParams["font.family"] = "Meiryo" - -# モデルとトークナイザの準備 -tokenizer = AutoTokenizer.from_pretrained( - "Mizuiro-sakura/luke-japanese-large-sentiment-analysis-wrime" -) -model = AutoModelForSequenceClassification.from_pretrained( - "Mizuiro-sakura/luke-japanese-large-sentiment-analysis-wrime" -) - -# 感情のリスト -emotions = ["喜び", "悲しみ", "期待", "驚き", "怒り", "恐れ", "嫌悪", "信頼"] - - -def get_emotion_probs(text): - token = tokenizer( - text, return_tensors="pt", truncation=True, max_length=512, padding="max_length" - ) - output = model(**token) - normalized_logits = (output.logits - torch.min(output.logits)) / ( - torch.max(output.logits) - torch.min(output.logits) - ) - probs = normalized_logits.squeeze().tolist() - probs.append(probs[0]) # 最初の確率を最後にも追加 - return probs - - -fig = plt.figure(figsize=(4, 4)) -ax = fig.add_subplot(111, projection="polar") -ax.set_ylim(0, 1) - -theta = np.linspace(0, 2 * np.pi, len(emotions) + 1, endpoint=True) # 最後に最初の値を追加 -(l,) = ax.plot([], []) - -texts = ["すごく楽しかった。"] -data = get_emotion_probs(texts[0]) - - diff --git a/src/f.py b/src/f.py deleted file mode 100644 index c0782e4..0000000 --- a/src/f.py +++ /dev/null @@ -1,105 +0,0 @@ - -import matplotlib.pyplot as plt -import matplotlib.animation as animation -import numpy as np -import sounddevice as sd -import threading -import torch -from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForSequenceClassification -import queue - -plt.rcParams["font.family"] = "Meiryo" - -# SETTINGS -BLOCKSIZE = 24678 // 5 -SILENCE_THRESHOLD = 700 -MIN_AUDIO_LENGTH = 8000 -SILENCE_RATIO = 300 - -# Initialize Whisper model and processor -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -model_name = "vumichien/whisper-small-ja" -processor = WhisperProcessor.from_pretrained(model_name) -model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device) -model = model.half() -forced_decoder_ids = processor.get_decoder_prompt_ids(language="ja", task="transcribe") - -# Initialize sentiment analysis model and tokenizer -tokenizer = AutoTokenizer.from_pretrained("Mizuiro-sakura/luke-japanese-large-sentiment-analysis-wrime") -sentiment_model = AutoModelForSequenceClassification.from_pretrained("Mizuiro-sakura/luke-japanese-large-sentiment-analysis-wrime").to(device) - -# Lists -emotions = ["喜び", "悲しみ", "期待", "驚き", "怒り", "恐れ", "嫌悪", "信頼"] - -audio_queue = queue.Queue() -global_ndarray = None - -running = True - -def get_emotion_probs(text): - token = tokenizer( - text, return_tensors="pt", truncation=True, max_length=512, padding="max_length" - ) - output = sentiment_model(**token) - normalized_logits = (output.logits - torch.min(output.logits)) / ( - torch.max(output.logits) - torch.min(output.logits) - ) - probs = normalized_logits.squeeze().tolist() - probs.append(probs[0]) # 最初の確率を最後にも追加 - return probs - -def audio_capture_thread(): - with sd.InputStream(samplerate=16000, channels=1, dtype="int16", blocksize=BLOCKSIZE) as stream: - while running: - indata, status = stream.read(BLOCKSIZE) - audio_queue.put((indata, status)) - -def update_plot(i): - global 
global_ndarray - - indata, _ = audio_queue.get_nowait() - - line.set_ydata(indata) - - indata_flattened = abs(indata.flatten()) - is_significant_audio = np.asarray(np.where(indata_flattened > SILENCE_THRESHOLD)).size >= SILENCE_RATIO - - if is_significant_audio: - if global_ndarray is not None: - global_ndarray = np.concatenate((global_ndarray, indata), dtype="int16") - else: - global_ndarray = indata - elif global_ndarray is not None: - if len(global_ndarray) < MIN_AUDIO_LENGTH: - return - indata_transformed = global_ndarray.flatten().astype(np.float32) / 32768.0 - global_ndarray = None - input_data = processor(indata_transformed, sampling_rate=16000, return_tensors="pt").input_features - input_data = input_data.half() - predicted_ids = model.generate(input_data.to(device), forced_decoder_ids=forced_decoder_ids) - transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) - data = get_emotion_probs(transcription[0]) - radar_line.set_ydata(data) - -if __name__ == "__main__": - fig, axs = plt.subplots(2) - - # Audio waveform plot - (line,) = axs[0].plot(np.random.randn(BLOCKSIZE)) - axs[0].set_ylim([-(2**15), 2**15 - 1]) - axs[0].set_xlim(0, BLOCKSIZE) - - # Sentiment radar chart - theta = np.linspace(0, 2 * np.pi, len(emotions) + 1, endpoint=True) - (radar_line,) = axs[1].plot(theta, [0] * (len(emotions) + 1)) - axs[1].set_ylim(0, 1) - - capture_thread = threading.Thread(target=audio_capture_thread) - capture_thread.start() - - ani = animation.FuncAnimation(fig, update_plot, interval=100, blit=False) - - plt.show() - - running = False - capture_thread.join() \ No newline at end of file diff --git a/src/fast.py b/src/fast.py deleted file mode 100644 index 82495cd..0000000 --- a/src/fast.py +++ /dev/null @@ -1,110 +0,0 @@ -import sounddevice as sd -import numpy as np - -import matplotlib.pyplot as plt -import whisper - -import asyncio -import queue -import sys - - -# SETTINGS -MODEL_TYPE = "base.en" -# the model used for transcription. https://github.com/openai/whisper#available-models-and-languages -LANGUAGE = "English" -# pre-set the language to avoid autodetection -BLOCKSIZE = 24678 // 5 -# this is the base chunk size the audio is split into in samples. blocksize / 16000 = chunk length in seconds. -SILENCE_THRESHOLD = 700 -# should be set to the lowest sample amplitude that the speech in the audio material has -SILENCE_RATIO = 2000 -# number of samples in one buffer that are allowed to be higher than threshold - - -global_ndarray = None -model = whisper.load_model(MODEL_TYPE) - - -async def inputstream_generator(): - """Generator that yields blocks of input data as NumPy arrays.""" - q_in = asyncio.Queue() - loop = asyncio.get_event_loop() - - def callback(indata, frame_count, time_info, status): - print("Received audio data.") # Log when audio data is received. - loop.call_soon_threadsafe(q_in.put_nowait, (indata.copy(), status)) - - stream = sd.InputStream( - samplerate=16000, channels=1, dtype="int16", blocksize=BLOCKSIZE, callback=callback - ) - with stream: - while True: - indata, status = await q_in.get() - print( - f"Yielding {len(indata)} frames of audio data." - ) # Log the amount of audio data being yielded. 
- yield indata, status - - -plt.ion() -fig, ax = plt.subplots() -(line,) = ax.plot(np.random.randn(BLOCKSIZE)) -ax.set_ylim([-(2**15), 2**15 - 1]) -ax.set_xlim(0, BLOCKSIZE) - - -async def process_audio_buffer(): - global global_ndarray - async for indata, status in inputstream_generator(): - indata_flattened = abs(indata.flatten()) - - line.set_ydata(indata) - plt.draw() - plt.pause(0.001) - - # Log the size of non-silent data. - non_silent_size = np.asarray(np.where(indata_flattened > SILENCE_THRESHOLD)).size - print(f"Non-silent data size: {non_silent_size}") - - if non_silent_size < SILENCE_RATIO: - print("Discarding buffer due to silence.") - continue - - if global_ndarray is not None: - global_ndarray = np.concatenate((global_ndarray, indata), dtype="int16") - else: - global_ndarray = indata - - avg_end_signal = np.average((indata_flattened[-100:-1])) - if avg_end_signal > SILENCE_THRESHOLD / 15: - print("Appending buffer as the end is not silent.") - continue - else: - local_ndarray = global_ndarray.copy() - global_ndarray = None - indata_transformed = local_ndarray.flatten().astype(np.float32) / 32768.0 - result = model.transcribe(indata_transformed, language=LANGUAGE) - print(f"Transcription Result: {result['text']}") # Log the transcription result. - - del local_ndarray - del indata_flattened - - -async def main(): - print("\nActivating wire ...\n") - audio_task = asyncio.create_task(process_audio_buffer()) - while True: - await asyncio.sleep(1) - audio_task.cancel() - try: - await audio_task - except asyncio.CancelledError: - print("\nwire was cancelled") - - -if __name__ == "__main__": - try: - asyncio.run(main()) - except KeyboardInterrupt: - sys.exit("\nInterrupted by user") diff --git a/src/fast2.py b/src/fast2.py deleted file mode 100644 index fb1deff..0000000 --- a/src/fast2.py +++ /dev/null @@ -1,120 +0,0 @@ -import asyncio -import numpy as np -import sounddevice as sd -import sys -import threading -import curses -import matplotlib.pyplot as plt -import sounddevice as sd -import numpy as np - -import whisper - -import asyncio -import queue -import sys - - -# SETTINGS -MODEL_TYPE = "base.en" -# the model used for transcription. https://github.com/openai/whisper#available-models-and-languages -LANGUAGE = "English" -# pre-set the language to avoid autodetection -BLOCKSIZE = 24678 -# this is the base chunk size the audio is split into in samples. blocksize / 16000 = chunk length in seconds. 
-SILENCE_THRESHOLD = 700 -# should be set to the lowest sample amplitude that the speech in the audio material has -SILENCE_RATIO = 2000 -# number of samples in one buffer that are allowed to be higher than threshold - - -global_ndarray = None -model = whisper.load_model(MODEL_TYPE) - -data_queue = asyncio.Queue() - - -# Curses UI function -def display_ui(data_queue): - stdscr = curses.initscr() - curses.noecho() - curses.cbreak() - stdscr.keypad(True) - stdscr.nodelay(1) # non-blocking input - try: - while True: - stdscr.clear() - try: - data = data_queue.get_nowait() - except: - data = None - - if data: - stdscr.addstr(0, 0, data) - stdscr.refresh() - curses.napms(100) # Wait for 100ms - except KeyboardInterrupt: - pass - finally: - curses.endwin() - - -# Real-time plotting function -def realtime_plot(): - plt.ion() - fig, ax = plt.subplots() - (line,) = ax.plot(BLOCKSIZE) - ax.set_ylim([-(2**15), 2**15 - 1]) - ax.set_xlim(0, BLOCKSIZE) - - while True: - try: - indata = plot_queue.get() - line.set_ydata(indata) - plt.draw() - plt.pause(0.001) - except KeyboardInterrupt: - break - - -# Your original inputstream generator here... - -# Modified process_audio_buffer function -plot_queue = asyncio.Queue() - - -async def process_audio_buffer(): - global global_ndarray - async for indata, status in inputstream_generator(): - indata_flattened = abs(indata.flatten()) - non_silent_data_size = np.asarray(np.where(indata_flattened > SILENCE_THRESHOLD)).size - - message = f"Non-silent data size: {non_silent_data_size} | " - - # Append indata to the plot_queue for real-time plotting - await plot_queue.put(indata) - - if non_silent_data_size < SILENCE_RATIO: - message += "Determined as silence. Skipping buffer." - else: - message += "Determined as non-silent." - # ... (Rest of your code) - - await data_queue.put(message) - - -def main(): - ui_thread = threading.Thread(target=display_ui, args=(data_queue,)) - plot_thread = threading.Thread(target=realtime_plot) - ui_thread.start() - plot_thread.start() - - try: - asyncio.run(process_audio_buffer()) - except KeyboardInterrupt: - ui_thread.join() - plot_thread.join() - - -if __name__ == "__main__": - main() diff --git a/src/fast3.py b/src/fast3.py deleted file mode 100644 index bb07fc7..0000000 --- a/src/fast3.py +++ /dev/null @@ -1,137 +0,0 @@ -import sounddevice as sd -import numpy as np - -import matplotlib.pyplot as plt -import whisper -import threading -import asyncio -import queue -import sys -import numpy as np -import sounddevice as sd -import asyncio -import torch -from transformers import WhisperProcessor, WhisperForConditionalGeneration -import matplotlib.pyplot as plt - - -# SETTINGS -# MODEL_TYPE = "base.en" -# the model used for transcription. https://github.com/openai/whisper#available-models-and-languages -# LANGUAGE = "English" -# pre-set the language to avoid autodetection -BLOCKSIZE = 24678 // 5 -# this is the base chunk size the audio is split into in samples. blocksize / 16000 = chunk length in seconds. 
-SILENCE_THRESHOLD = 700 -MIN_AUDIO_LENGTH = 8000 -# should be set to the lowest sample amplitude that the speech in the audio material has -SILENCE_RATIO = 300 -# number of samples in one buffer that are allowed to be higher than threshold - -# Initialize model and processor -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -processor = WhisperProcessor.from_pretrained("clu-ling/whisper-large-v2-japanese-5k-steps") -model = WhisperForConditionalGeneration.from_pretrained( - "clu-ling/whisper-large-v2-japanese-5k-steps" -).to(device) -forced_decoder_ids = processor.get_decoder_prompt_ids(language="ja", task="transcribe") - -global_ndarray = None -# model = whisper.load_model(MODEL_TYPE) - - -plt.ion() -fig, ax = plt.subplots() -(line,) = ax.plot(np.random.randn(BLOCKSIZE)) -ax.set_ylim([-(2**15), 2**15 - 1]) -ax.set_xlim(0, BLOCKSIZE) - -audio_queue = queue.Queue() # Use a regular Python queue - -def audio_capture_thread(): - """Thread that captures audio and puts blocks of data into the audio queue.""" - with sd.InputStream(samplerate=16000, channels=1, dtype="int16", blocksize=BLOCKSIZE) as stream: - while True: - indata, status = stream.read(BLOCKSIZE) - audio_queue.put((indata, status)) - -async def inputstream_generator(): - """Generator that yields blocks of input data as NumPy arrays.""" - while True: - indata, status = audio_queue.get() - yield indata, status - -async def inputstream_generator(): - """Generator that yields blocks of input data as NumPy arrays.""" - q_in = asyncio.Queue() - loop = asyncio.get_event_loop() - - def callback(indata, frame_count, time_info, status): - loop.call_soon_threadsafe(q_in.put_nowait, (indata.copy(), status)) - - stream = sd.InputStream( - samplerate=16000, channels=1, dtype="int16", blocksize=BLOCKSIZE, callback=callback - ) - with stream: - while True: - indata, status = await q_in.get() - yield indata, status - - -async def process_audio_buffer(): - global global_ndarray - receiving_audio = False - - async for indata, status in inputstream_generator(): - indata_flattened = abs(indata.flatten()) - line.set_ydata(indata) - plt.draw() - plt.pause(0.001) - - # Check if current chunk has significant audio - is_significant_audio = ( - np.asarray(np.where(indata_flattened > SILENCE_THRESHOLD)).size >= SILENCE_RATIO - ) - - # If it has significant audio - if is_significant_audio: - print("Status: Receiving audio data.") - receiving_audio = True - if global_ndarray is not None: - global_ndarray = np.concatenate((global_ndarray, indata), dtype="int16") - else: - global_ndarray = indata - continue - - # If current chunk is silent and there was audio being received previously - if not is_significant_audio and receiving_audio: - print("Status: Detected silence after receiving audio.") - if len(global_ndarray) < MIN_AUDIO_LENGTH: - print( - f"Status: Audio length {len(global_ndarray)} is insufficient. Awaiting more input." 
- ) - continue - - print("Status: Processing audio data...") - indata_transformed = global_ndarray.flatten().astype(np.float32) / 32768.0 - input_features = processor( - indata_transformed, sampling_rate=16000, return_tensors="pt" - ).input_features - predicted_ids = model.generate( - input_features.to(device), forced_decoder_ids=forced_decoder_ids - ) - transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) - print(f"Transcription: {transcription}") - global_ndarray = None - receiving_audio = False - else: - print("Status: Detected silence.") - - -if __name__ == "__main__": - thread = threading.Thread(target=audio_capture_thread, daemon=True) - thread.start() - try: - asyncio.run(process_audio_buffer()) - except KeyboardInterrupt: - print("\nInterrupted by user") diff --git a/src/fast4.py b/src/fast4.py deleted file mode 100644 index 11ea611..0000000 --- a/src/fast4.py +++ /dev/null @@ -1,133 +0,0 @@ -import sounddevice as sd -import numpy as np -import matplotlib.pyplot as plt -import threading -import torch -from transformers import WhisperProcessor, WhisperForConditionalGeneration -import queue - -# SETTINGS -BLOCKSIZE = 24678 // 5 -SILENCE_THRESHOLD = 700 -MIN_AUDIO_LENGTH = 8000 -SILENCE_RATIO = 300 - -# Initialize model and processor -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -# print(f"Using device: {device}") -# "clu-ling/whisper-large-v2-japanese-5k-steps" -model_name = "vumichien/whisper-small-ja" -# model_name = "kimbochen/whisper-tiny-ja" -# C:\Users\anosillus\.cache\huggingface\hub\models--kimbochen--whisper-tiny-ja -processor = WhisperProcessor.from_pretrained(model_name) -model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device) -model = model.half() -forced_decoder_ids = processor.get_decoder_prompt_ids(language="ja", task="transcribe") - -global_ndarray = None -audio_queue = queue.Queue() - - -def audio_capture_thread(): - """Thread that captures audio and puts blocks of data into the audio queue.""" - with sd.InputStream( - samplerate=16000, channels=1, dtype="int16", blocksize=BLOCKSIZE - ) as stream: - while True: - indata, status = stream.read(BLOCKSIZE) - audio_queue.put((indata, status)) - - -def transcription_and_plotting(): - plt.ion() - fig, ax = plt.subplots() - (line,) = ax.plot(np.random.randn(BLOCKSIZE)) - ax.set_ylim([-(2**15), 2**15 - 1]) - ax.set_xlim(0, BLOCKSIZE) - - global global_ndarray - - while True: - indata, status = audio_queue.get() - indata_flattened = abs(indata.flatten()) - - line.set_ydata(indata) - plt.draw() - plt.pause(0.001) - - is_significant_audio = ( - np.asarray(np.where(indata_flattened > SILENCE_THRESHOLD)).size >= SILENCE_RATIO - ) - - if is_significant_audio: - if global_ndarray is not None: - global_ndarray = np.concatenate((global_ndarray, indata), dtype="int16") - else: - global_ndarray = indata - elif global_ndarray is not None: - if len(global_ndarray) < MIN_AUDIO_LENGTH: - continue - indata_transformed = global_ndarray.flatten().astype(np.float32) / 32768.0 - global_ndarray = None - input_data = processor( - indata_transformed, sampling_rate=16000, return_tensors="pt" - ).input_features - input_data = input_data.half() - predicted_ids = model.generate( - input_data.to(device), forced_decoder_ids=forced_decoder_ids - ) - - transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) - print(f"Transcription: {transcription}") - - -if __name__ == "__main__": - capture_thread = threading.Thread(target=audio_capture_thread) - 
capture_thread.start() - - try: - transcription_and_plotting() - except KeyboardInterrupt: - print("\nInterrupted by user") - -""" - -# ... [上記のコードとインポート文はここに続く] - -# 1. モデルとトークナイザの初期化 -emotion_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") -emotion_model = BertForSequenceClassification.from_pretrained("bert-base-uncased").to(device) - -def predict_emotion(text): - # 2. `predict_emotion`関数を定義 - inputs = emotion_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512) - for key in inputs: - inputs[key] = inputs[key].to(device) - - with torch.no_grad(): - outputs = emotion_model(**inputs) - - probabilities = softmax(outputs.logits, dim=1) - class_id = torch.argmax(probabilities).item() - - return "Positive" if class_id == 1 else "Negative" - -def transcription_and_plotting(): - # ... [関数の中身の初めの部分] - - while True: - # ... [関数の中のループの初めの部分] - - if is_significant_audio: - # ... [この部分の残り] - - # 3. トランスクリプトが得られた後、そのトランスクリプトを感情分析関数に渡す - if transcription: - emotion_result = predict_emotion(transcription[0]) - - # 4. トランスクリプトと感情の結果を表示 - print(f"Transcription: {transcription}") - print(f"Emotion: {emotion_result}") - -# ... [関数の定義の後の部分] -""" diff --git a/src/fast5.py b/src/fast5.py deleted file mode 100644 index 29fda2c..0000000 --- a/src/fast5.py +++ /dev/null @@ -1,126 +0,0 @@ -import sounddevice as sd -import numpy as np -import matplotlib.pyplot as plt -import multiprocessing -import torch -from transformers import WhisperProcessor, WhisperForConditionalGeneration -from transformers import BertTokenizer, BertForSequenceClassification - -# SETTINGS -BLOCKSIZE = 24678 // 5 -SILENCE_THRESHOLD = 700 -MIN_AUDIO_LENGTH = 8000 -SILENCE_RATIO = 300 - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -# Initialize Whisper model and processor -model_name = "vumichien/whisper-small-ja" -processor = WhisperProcessor.from_pretrained(model_name) -model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device) -model = model.half() -forced_decoder_ids = processor.get_decoder_prompt_ids(language="ja", task="transcribe") - -# Initialize BERT model -BERT_MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment" -tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME) -bert_model = BertForSequenceClassification.from_pretrained(BERT_MODEL_NAME).to(device) -bert_model = bert_model.half() - -audio_queue = multiprocessing.Queue() -classification_queue = multiprocessing.Queue() -shared_ndarray_list = None - - -def audio_capture_thread(): - try: - print("Starting audio capture thread...") - with sd.InputStream( - samplerate=16000, channels=1, dtype="int16", blocksize=BLOCKSIZE - ) as stream: - while True: - indata, status = stream.read(BLOCKSIZE) - print(f"Captured audio data: {indata[:10]}") - audio_queue.put((indata, status)) - except Exception as e: - print(f"Error in audio_capture_thread: {e}") - - -def bert_classification(): - try: - print("Starting BERT classification thread...") - while True: - transcription = classification_queue.get() - inputs = tokenizer( - transcription, - return_tensors="pt", - truncation=True, - padding=True, - max_length=256, - ).to(device) - outputs = bert_model(**inputs) - predicted_label_idx = torch.argmax(outputs.logits, dim=1).item() - labels = ["very negative", "negative", "neutral", "positive", "very positive"] - print(f"Predicted emotion: {labels[predicted_label_idx]}") - except Exception as e: - print(f"Error in bert_classification: {e}") - - -def transcription_and_plotting(): - plt.ion() - 
fig, ax = plt.subplots() - (line,) = ax.plot(np.random.randn(BLOCKSIZE)) - ax.set_ylim([-(2**15), 2**15 - 1]) - ax.set_xlim(0, BLOCKSIZE) - - global shared_ndarray_list - while True: - indata, status = audio_queue.get() - indata_flattened = abs(indata.flatten()) - line.set_ydata(indata) - plt.draw() - - is_significant_audio = ( - np.asarray(np.where(indata_flattened > SILENCE_THRESHOLD)).size >= SILENCE_RATIO - ) - if is_significant_audio: - shared_ndarray_list.extend(indata.flatten()) - elif len(shared_ndarray_list) > 0: - global_ndarray = np.array(shared_ndarray_list, dtype="int16") - if len(global_ndarray) < MIN_AUDIO_LENGTH: - continue - indata_transformed = global_ndarray.astype(np.float32) / 32768.0 - shared_ndarray_list.clear() - input_data = processor( - indata_transformed, sampling_rate=16000, return_tensors="pt" - ).input_features - input_data = input_data.half() - predicted_ids = model.generate( - input_data.to(device), forced_decoder_ids=forced_decoder_ids - ) - transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) - print(f"Transcription: {transcription}") - classification_queue.put(transcription[0]) - - -def main(): - global shared_ndarray_list - manager = multiprocessing.Manager() - shared_ndarray_list = manager.list() - - capture_process = multiprocessing.Process(target=audio_capture_thread) - classification_process = multiprocessing.Process(target=bert_classification) - - capture_process.start() - classification_process.start() - - try: - transcription_and_plotting() - except KeyboardInterrupt: - print("\nInterrupted by user") - capture_process.terminate() - classification_process.terminate() - - -if __name__ == "__main__": - main() diff --git a/src/fast6.py b/src/fast6.py deleted file mode 100644 index 21914b6..0000000 --- a/src/fast6.py +++ /dev/null @@ -1,113 +0,0 @@ -import sounddevice as sd -import numpy as np -import matplotlib.pyplot as plt -import threading -import torch -from transformers import WhisperProcessor, WhisperForConditionalGeneration -from transformers import BertTokenizer, BertForSequenceClassification -import queue - -# SETTINGS -BLOCKSIZE = 24678 // 5 -SILENCE_THRESHOLD = 700 -MIN_AUDIO_LENGTH = 8000 -SILENCE_RATIO = 300 - -# Initialize Whisper model and processor -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -model_name = "vumichien/whisper-small-ja" -processor = WhisperProcessor.from_pretrained(model_name) -model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device) -model = model.half() -forced_decoder_ids = processor.get_decoder_prompt_ids(language="ja", task="transcribe") - -# Initialize BERT model and tokenizer for sentiment analysis -BERT_MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment" -tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME) -bert_model = BertForSequenceClassification.from_pretrained(BERT_MODEL_NAME).to(device) -bert_model = bert_model.half() - -global_ndarray = None -audio_queue = queue.Queue() -classification_queue = queue.Queue() - - -def audio_capture_thread(): - with sd.InputStream( - samplerate=16000, channels=1, dtype="int16", blocksize=BLOCKSIZE - ) as stream: - while True: - indata, status = stream.read(BLOCKSIZE) - audio_queue.put((indata, status)) - - -def bert_classification(): - while True: - transcription = classification_queue.get() - inputs = tokenizer( - transcription, return_tensors="pt", truncation=True, padding=True, max_length=256 - ).to(device) - outputs = bert_model(**inputs) - predicted_label_idx = 
torch.argmax(outputs.logits, dim=1).item() - - labels = ["very negative", "negative", "neutral", "positive", "very positive"] - print(f"Predicted emotion: {labels[predicted_label_idx]}") - - -def transcription_and_plotting(): - plt.ion() - fig, ax = plt.subplots() - (line,) = ax.plot(np.random.randn(BLOCKSIZE)) - ax.set_ylim([-(2**15), 2**15 - 1]) - ax.set_xlim(0, BLOCKSIZE) - - global global_ndarray - - while True: - indata, status = audio_queue.get() - indata_flattened = abs(indata.flatten()) - - line.set_ydata(indata) - plt.draw() - plt.pause(0.001) - - is_significant_audio = ( - np.asarray(np.where(indata_flattened > SILENCE_THRESHOLD)).size >= SILENCE_RATIO - ) - - if is_significant_audio: - if global_ndarray is not None: - global_ndarray = np.concatenate((global_ndarray, indata), dtype="int16") - else: - global_ndarray = indata - elif global_ndarray is not None: - if len(global_ndarray) < MIN_AUDIO_LENGTH: - continue - indata_transformed = global_ndarray.flatten().astype(np.float32) / 32768.0 - global_ndarray = None - input_data = processor( - indata_transformed, sampling_rate=16000, return_tensors="pt" - ).input_features - input_data = input_data.half() - predicted_ids = model.generate( - input_data.to(device), forced_decoder_ids=forced_decoder_ids - ) - - transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) - print(f"Transcription: {transcription}") - - # BERT分類スレッドに転送 - classification_queue.put(transcription[0]) - - -if __name__ == "__main__": - capture_thread = threading.Thread(target=audio_capture_thread) - classification_thread = threading.Thread(target=bert_classification) - - capture_thread.start() - classification_thread.start() - - try: - transcription_and_plotting() - except KeyboardInterrupt: - print("\nInterrupted by user") diff --git a/src/fast7.py b/src/fast7.py deleted file mode 100644 index 483278e..0000000 --- a/src/fast7.py +++ /dev/null @@ -1,148 +0,0 @@ -import sounddevice as sd -import numpy as np -import matplotlib.pyplot as plt -import threading -import torch -from transformers import WhisperProcessor, WhisperForConditionalGeneration -from transformers import BertTokenizer, BertForSequenceClassification -import queue - -# SETTINGS -BLOCKSIZE = 24678 // 5 -SILENCE_THRESHOLD = 700 -MIN_AUDIO_LENGTH = 8000 -SILENCE_RATIO = 300 - -audio_queue = queue.Queue() -classification_queue = queue.Queue() -running = True - - -def initialize_models(): - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - # Whisper model and processor - model_name = "vumichien/whisper-small-ja" - processor = WhisperProcessor.from_pretrained(model_name) - model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device) - model = model.half() - forced_decoder_ids = processor.get_decoder_prompt_ids(language="ja", task="transcribe") - - # BERT model and tokenizer - BERT_MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment" - tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME) - bert_model = BertForSequenceClassification.from_pretrained(BERT_MODEL_NAME).to(device) - bert_model = bert_model.half() - - return model, processor, forced_decoder_ids, bert_model, tokenizer, device - - -def audio_capture_thread(): - global running - with sd.InputStream( - samplerate=16000, channels=1, dtype="int16", blocksize=BLOCKSIZE - ) as stream: - while running: - indata, status = stream.read(BLOCKSIZE) - audio_queue.put((indata, status)) - audio_queue.put(None) # Signal termination - - -def bert_classification(tokenizer, bert_model, 
device): - labels = ["very negative", "negative", "neutral", "positive", "very positive"] - while True: - transcription = classification_queue.get() - if transcription is None: # Check for termination signal - break - inputs = tokenizer( - transcription, return_tensors="pt", truncation=True, padding=True, max_length=256 - ).to(device) - outputs = bert_model(**inputs) - predicted_label_idx = torch.argmax(outputs.logits, dim=1).item() - - print(f"Predicted emotion: {labels[predicted_label_idx]}") - - -def process_audio_data(line, global_ndarray, model, processor, forced_decoder_ids, device): - indata, status = audio_queue.get() - if indata is None: # Check for termination signal - return None, None - - indata_flattened = abs(indata.flatten()) - - line.set_ydata(indata) - plt.draw() - plt.pause(0.001) - - is_significant_audio = ( - np.asarray(np.where(indata_flattened > SILENCE_THRESHOLD)).size >= SILENCE_RATIO - ) - - if is_significant_audio: - if global_ndarray is not None: - global_ndarray = np.concatenate((global_ndarray, indata), dtype="int16") - else: - global_ndarray = indata - elif global_ndarray is not None: - if len(global_ndarray) < MIN_AUDIO_LENGTH: - return global_ndarray, None - indata_transformed = global_ndarray.flatten().astype(np.float32) / 32768.0 - global_ndarray = None - input_data = processor( - indata_transformed, sampling_rate=16000, return_tensors="pt" - ).input_features - input_data = input_data.half() - predicted_ids = model.generate( - input_data.to(device), forced_decoder_ids=forced_decoder_ids - ) - - transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] - print(f"Transcription: {transcription}") - - # Send to BERT classification thread - classification_queue.put(transcription) - return global_ndarray, None - - -def update_plot(line): - global running - while running: - plt.draw() - plt.pause(0.01) - - -if __name__ == "__main__": - model, processor, forced_decoder_ids, bert_model, tokenizer, device = initialize_models() - - plt.ion() - fig, ax = plt.subplots() - (line,) = ax.plot(np.random.randn(BLOCKSIZE)) - ax.set_ylim([-(2**15), 2**15 - 1]) - ax.set_xlim(0, BLOCKSIZE) - - capture_thread = threading.Thread(target=audio_capture_thread) - classification_thread = threading.Thread( - target=bert_classification, args=(tokenizer, bert_model, device) - ) - plot_thread = threading.Thread(target=update_plot, args=(line,)) - - capture_thread.start() - classification_thread.start() - plot_thread.start() - - global_ndarray = None - - try: - while running: - global_ndarray, _ = process_audio_data( - line, global_ndarray, model, processor, forced_decoder_ids, device - ) - if global_ndarray is None: - running = False - except KeyboardInterrupt: - running = False - capture_thread.join() - classification_thread.join() - plot_thread.join() # Make sure to join the plot thread as well - classification_queue.put(None) # Signal termination to the classification thread - print("\nInterrupted by user") diff --git a/src/fast8.py b/src/fast8.py deleted file mode 100644 index 90ad3b4..0000000 --- a/src/fast8.py +++ /dev/null @@ -1,125 +0,0 @@ -import sounddevice as sd -import numpy as np -import matplotlib.pyplot as plt -import threading -import torch -from transformers import WhisperProcessor, WhisperForConditionalGeneration -from transformers import BertTokenizer, BertForSequenceClassification -import queue - -# SETTINGS -BLOCKSIZE = 24678 // 5 -SILENCE_THRESHOLD = 700 -MIN_AUDIO_LENGTH = 8000 -SILENCE_RATIO = 300 - -# Initialize Whisper model and processor 
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -model_name = "vumichien/whisper-small-ja" -processor = WhisperProcessor.from_pretrained(model_name) -model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device) -model = model.half() -forced_decoder_ids = processor.get_decoder_prompt_ids(language="ja", task="transcribe") - -# Initialize BERT model and tokenizer for sentiment analysis -BERT_MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment" -tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME) -bert_model = BertForSequenceClassification.from_pretrained(BERT_MODEL_NAME).to(device) -bert_model = bert_model.half() - -global_ndarray = None -audio_queue = queue.Queue() -classification_queue = queue.Queue() - -# 追加: スレッドの動作を制御するフラグ -running = True - - -def audio_capture_thread(): - with sd.InputStream( - samplerate=16000, channels=1, dtype="int16", blocksize=BLOCKSIZE - ) as stream: - while running: - indata, status = stream.read(BLOCKSIZE) - audio_queue.put((indata, status)) - - audio_queue.put(("STOP", None)) - - -def bert_classification(): - while True: - transcription = classification_queue.get() - if transcription == "STOP": - break - inputs = tokenizer( - transcription, return_tensors="pt", truncation=True, padding=True, max_length=256 - ).to(device) - outputs = bert_model(**inputs) - predicted_label_idx = torch.argmax(outputs.logits, dim=1).item() - - labels = ["very negative", "negative", "neutral", "positive", "very positive"] - print(f"Predicted emotion: {labels[predicted_label_idx]}") - - -def transcription_and_plotting(): - plt.ion() - fig, ax = plt.subplots() - (line,) = ax.plot(np.random.randn(BLOCKSIZE)) - ax.set_ylim([-(2**15), 2**15 - 1]) - ax.set_xlim(0, BLOCKSIZE) - - global global_ndarray - - while running: - indata, status = audio_queue.get() - indata_flattened = abs(indata.flatten()) - - line.set_ydata(indata) - plt.draw() - plt.pause(0.001) - - is_significant_audio = ( - np.asarray(np.where(indata_flattened > SILENCE_THRESHOLD)).size >= SILENCE_RATIO - ) - - if is_significant_audio: - if global_ndarray is not None: - global_ndarray = np.concatenate((global_ndarray, indata), dtype="int16") - else: - global_ndarray = indata - elif global_ndarray is not None: - if len(global_ndarray) < MIN_AUDIO_LENGTH: - continue - indata_transformed = global_ndarray.flatten().astype(np.float32) / 32768.0 - global_ndarray = None - input_data = processor( - indata_transformed, sampling_rate=16000, return_tensors="pt" - ).input_features - input_data = input_data.half() - predicted_ids = model.generate( - input_data.to(device), forced_decoder_ids=forced_decoder_ids - ) - - transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) - print(f"Transcription: {transcription}") - - # BERT分類スレッドに転送 - classification_queue.put(transcription[0]) - - -if __name__ == "__main__": - capture_thread = threading.Thread(target=audio_capture_thread) - classification_thread = threading.Thread(target=bert_classification) - - capture_thread.start() - classification_thread.start() - - try: - transcription_and_plotting() - except KeyboardInterrupt: - print("\nInterrupted by user") - running = False - plt.close() - - capture_thread.join() - classification_thread.join() diff --git a/src/fast9.py b/src/fast9.py deleted file mode 100644 index 1cce2b8..0000000 --- a/src/fast9.py +++ /dev/null @@ -1,229 +0,0 @@ -import sounddevice as sd -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.patches as mpatches -import threading 
-import torch -from transformers import WhisperProcessor, WhisperForConditionalGeneration -from transformers import BertTokenizer, BertForSequenceClassification -import queue -from transformers import AutoModelForSequenceClassification, BertJapaneseTokenizer -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.animation import FuncAnimation -from transformers import AutoTokenizer, AutoModelForSequenceClassification -import torch - -plt.rcParams["font.family"] = "Meiryo" -# SETTINGS -BLOCKSIZE = 24678 // 5 -SILENCE_THRESHOLD = 700 -MIN_AUDIO_LENGTH = 8000 -SILENCE_RATIO = 300 - -# Initialize Whisper model and processor -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -model_name = "vumichien/whisper-small-ja" -processor = WhisperProcessor.from_pretrained(model_name) -model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device) -model = model.half() - - -forced_decoder_ids = processor.get_decoder_prompt_ids(language="ja", task="transcribe") - -BERT_MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment" -tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME) -bert_model = BertForSequenceClassification.from_pretrained(BERT_MODEL_NAME).to(device) -bert_model = bert_model.half() - -global_ndarray = None -audio_queue = queue.Queue() -classification_queue = queue.Queue() - -# 追加: スレッドの動作を制御するフラグ -running = True - - -sentiment_model = AutoModelForSequenceClassification.from_pretrained( - "patrickramos/bert-base-japanese-v2-wrime-fine-tune" -).to(device) -sentiment_model = sentiment_model.half() -sentiment_tokenizer = BertJapaneseTokenizer.from_pretrained( - "cl-tohoku/bert-base-japanese-whole-word-masking" -) - -emotion_translation = { - "surprise": "驚き", - "sadness": "悲しみ", - "fear": "恐れ", - "disgust": "嫌悪", - "anger": "怒り", - "anticipation": "期待", - "joy": "喜び", - "trust": "信頼", -} - - -def audio_capture_thread(): - with sd.InputStream( - samplerate=16000, channels=1, dtype="int16", blocksize=BLOCKSIZE - ) as stream: - while running: - indata, status = stream.read(BLOCKSIZE) - audio_queue.put((indata, status)) - - audio_queue.put(("STOP", None)) - - -plotting_queue = queue.Queue() - - -def bert_classification(): - while True: - transcription = classification_queue.get() - if transcription == "STOP": - break - - # Sentiment analysis with the provided model - results = sentiment_tokenizer( - transcription, return_tensors="pt", truncation=True, padding=True, max_length=256 - ).to(device) - outputs = sentiment_model(**results) - sentiment_results = torch.softmax(outputs.logits, dim=1).cpu().detach().numpy() - - reader_results = [ - {"label": label.item(), "score": score} - for label, score in zip( - outputs.logits.argmax(dim=1).cpu().numpy(), sentiment_results[0] - ) - if "reader_" in str(label.item()) - ] - values = [result["score"] for result in reader_results] - - # ラベルを日本語に変換 - labels = [emotion_translation[result["label"]] for result in reader_results] - - # N-gramの設定 (ここでは2-gram) - N = 2 - ngram_values = [np.mean(values[i : i + N]) for i in range(len(values) - N + 1)] - ngram_labels = [f"{labels[i]}-{labels[i+1]}" for i in range(len(labels) - N + 1)] - - # データをキューに追加 - plotting_queue.put((ngram_values, ngram_labels)) - - - -emotions = ["喜び", "悲しみ", "期待", "驚き", "怒り", "恐れ", "嫌悪", "信頼"] - -def get_emotion_probs(text): - token = tokenizer( - text, return_tensors="pt", truncation=True, max_length=512, padding="max_length" - ) - output = model(**token) - normalized_logits = (output.logits - torch.min(output.logits)) / ( - 
torch.max(output.logits) - torch.min(output.logits) - ) - probs = normalized_logits.squeeze().tolist() - probs.append(probs[0]) # 最初の確率を最後にも追加 - return probs - -def main_plotting(): - plt.ion() - fig = plt.figure(figsize=(10, 7)) - ax1 = fig.add_subplot(2, 1, 1) - ax1.set_ylim([-(2**15), 2**15 - 1]) - ax1.set_xlim(0, BLOCKSIZE) - (line,) = ax1.plot(np.zeros(BLOCKSIZE), 'g-') - - ax2 = fig.add_subplot(2, 1, 2, polar=True) - ax2.set_ylim(0, 1) - theta = np.linspace(0, 2 * np.pi, len(emotions) + 1, endpoint=True) - (l,) = ax2.plot([], []) - - index = 0 - - def update(i): - global index - # 音声データの取得と更新 - indata, _ = audio_queue.get() - if isinstance(indata, str) and indata == "STOP": - return - - line.set_ydata(indata) - - # 以下の部分は、感情分析のためのテキストデータを取得するものと仮定しています。 - # もし実際にテキストデータがキューに入れられる場合は、以下の行を有効にしてください。 - text = classification_queue.get() - - # この例では、固定のテキストリストからデータを取得します。 - text = texts[index % len(texts)] - index += 1 - - data = get_emotion_probs(text) - - ax2.clear() - ax2.set_xticks(theta) - ax2.set_xticklabels(emotions + [emotions[0]]) # ラベルも最初のものを最後に追加 - ax2.set_ylim(0, 1) - (l,) = ax2.plot(theta, data, "r-", lw=2) - - ani = FuncAnimation(fig, update, interval=1000, blit=False) - plt.show() - - -# def main_plotting(): -# plt.ion() -# fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 7)) -# -# ax1.set_ylim([-(2**15), 2**15 - 1]) -# ax1.set_xlim(0, BLOCKSIZE) -# (line,) = ax1.plot(np.zeros(BLOCKSIZE), 'g-') -# -# # レーダーチャートの初期設定 -# emotions = list(emotion_translation.values()) -# num_vars = len(emotions) -# angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist() -# ax2.set_theta_offset(np.pi / 2) -# ax2.set_theta_direction(-1) -# ax2.set_rlabel_position(115) -# ax2.set_xticks(angles) -# ax2.set_xticklabels(emotions) -# ax2.set_ylim(0, 1) -# -# while running: -# # 音声データの取得と更新 -# indata, _ = audio_queue.get() -# if isinstance(indata, str) and indata == "STOP": -# break -# line.set_ydata(indata) -# -# # 感情分析データの取得とレーダーチャートの更新 -# ngram_values, ngram_labels = plotting_queue.get() -# ax2.clear() -# ax2.set_xticks(angles) -# ax2.set_xticklabels(emotions) -# ax2.set_ylim(0, 1) -# ax2.plot(angles, ngram_values, color='b', linewidth=2, linestyle='solid') -# ax2.fill(angles, ngram_values, color='skyblue', alpha=0.4) -# -# plt.pause(0.001) -# -# plt.close() - - -if __name__ == "__main__": - capture_thread = threading.Thread(target=audio_capture_thread) - classification_thread = threading.Thread(target=bert_classification) - - capture_thread.start() - classification_thread.start() - - try: - print('start') - main_plotting() - except KeyboardInterrupt: - print("\nInterrupted by user") - running = False - - capture_thread.join() - classification_thread.join() diff --git a/dev/b.py b/src/fast_voice2word.py similarity index 100% rename from dev/b.py rename to src/fast_voice2word.py diff --git a/src/foo.py b/src/foo.py deleted file mode 100644 index 5c392da..0000000 --- a/src/foo.py +++ /dev/null @@ -1,38 +0,0 @@ -import numpy as np -import matplotlib.pyplot as plt -from matplotlib.animation import FuncAnimation - -# 感情の日本語訳 -emotion_translation = { - "surprise": "驚き", - "sadness": "悲しみ", - "fear": "恐れ", - "disgust": "嫌悪", - "anger": "怒り", - "anticipation": "期待", - "joy": "喜び", - "trust": "信頼" -} - -# readerの感情のデータ (仮のデータを設定) -labels = list(emotion_translation.values()) -values = [0.073, 0.075, 0.076, 0.041, 0.023, 0.022, 0.022, 0.020] -# データの最初の値を末尾に追加して閉じる -values.append(values[0]) - -# アニメーションの設定 -fig = plt.figure(figsize=(7, 7)) -ax = plt.subplot(111, polar=True) -ax.set_ylim(0, 0.1) 
- -theta = np.linspace(0, 2 * np.pi, len(values), endpoint=True) -line, = ax.plot(theta, values, "o-", lw=3) -ax.set_thetagrids(np.arange(0, 360, 360/len(labels)), labels) - -def animate(i): - values_shifted = np.roll(values, shift=i) - line.set_ydata(values_shifted) - return line, - -ani = FuncAnimation(fig, animate, frames=len(values)-1, repeat=True, blit=True) -plt.show() \ No newline at end of file diff --git a/dev/a.py b/src/word2emotion_and_plotting.py similarity index 88% rename from dev/a.py rename to src/word2emotion_and_plotting.py index 569302a..e98d5d7 100644 --- a/dev/a.py +++ b/src/word2emotion_and_plotting.py @@ -1,10 +1,11 @@ -import sounddevice as sd -import numpy as np +import queue +import threading + import matplotlib.pyplot as plt +import numpy as np +import sounddevice as sd import torch -import threading -from transformers import WhisperProcessor, WhisperForConditionalGeneration -import queue +from transformers import WhisperForConditionalGeneration, WhisperProcessor # SETTINGS BLOCKSIZE = 24678 // 5 @@ -29,7 +30,10 @@ def audio_capture_thread(): with sd.InputStream( - samplerate=16000, channels=1, dtype="int16", blocksize=BLOCKSIZE + samplerate=16000, + channels=1, + dtype="int16", + blocksize=BLOCKSIZE, ) as stream: while running: indata, status = stream.read(BLOCKSIZE) @@ -73,11 +77,14 @@ def transcription_and_plotting(): indata_transformed = global_ndarray.flatten().astype(np.float32) / 32768.0 global_ndarray = None input_data = processor( - indata_transformed, sampling_rate=16000, return_tensors="pt" + indata_transformed, + sampling_rate=16000, + return_tensors="pt", ).input_features input_data = input_data.half() predicted_ids = model.generate( - input_data.to(device), forced_decoder_ids=forced_decoder_ids + input_data.to(device), + forced_decoder_ids=forced_decoder_ids, ) transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
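Illustrative note: README_jp.md above says the two scripts under src/ run asynchronously and hand the analysis results to each other through file I/O, but the patch does not show that glue code. The sketch below is only a guess at how such a handoff could look; the file name (emotion_result.json), the JSON format, and the polling interval are hypothetical and are not taken from the actual scripts.

# file_io_handoff_sketch.py -- hypothetical example, not part of this commit
import json
import time
from pathlib import Path

RESULT_FILE = Path("emotion_result.json")  # hypothetical handoff file


def write_result(text: str, emotion_scores: dict) -> None:
    """Producer side (speech-to-text process): publish the latest analysis result."""
    tmp = RESULT_FILE.with_suffix(".tmp")
    tmp.write_text(
        json.dumps({"text": text, "scores": emotion_scores}, ensure_ascii=False),
        encoding="utf-8",
    )
    tmp.replace(RESULT_FILE)  # atomic rename so the reader never sees a half-written file


def poll_results(interval: float = 0.5):
    """Consumer side (plotting/nodding process): yield a result whenever the file changes."""
    last_mtime = 0.0
    while True:
        if RESULT_FILE.exists():
            mtime = RESULT_FILE.stat().st_mtime
            if mtime != last_mtime:
                last_mtime = mtime
                yield json.loads(RESULT_FILE.read_text(encoding="utf-8"))
        time.sleep(interval)


if __name__ == "__main__":
    write_result("すごく楽しかった。", {"joy": 0.9, "sadness": 0.1})
    print(next(poll_results()))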