# word2emotion_and_plotting.py
import queue
import threading
import matplotlib.pyplot as plt
import numpy as np
import sounddevice as sd
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# SETTINGS
BLOCKSIZE = 24678 // 5  # samples per audio block (~0.3 s at 16 kHz)
SILENCE_THRESHOLD = 700  # int16 amplitude above which a sample counts as speech
MIN_AUDIO_LENGTH = 8000  # minimum buffered samples (0.5 s at 16 kHz) before transcribing
SILENCE_RATIO = 300  # minimum above-threshold samples for a block to count as speech
SAVE_PATH = "transcriptions.txt"  # transcriptions are appended to this file

# Initialize Whisper model and processor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "vumichien/whisper-small-ja"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
if device.type == "cuda":
    model = model.half()  # fp16 inference on GPU; half precision is generally unsupported on CPU
forced_decoder_ids = processor.get_decoder_prompt_ids(language="ja", task="transcribe")
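
# The helper below is a minimal sketch and is not called anywhere in this script: it shows the
# same processor -> model.generate -> batch_decode flow on a prerecorded file instead of the
# live microphone stream. The `soundfile` package and the 16 kHz mono file "sample.wav" are
# assumptions for illustration only; neither is required by the rest of the script.
def transcribe_file(path="sample.wav"):
    import soundfile as sf  # assumed to be installed; only needed for this example
    audio, sample_rate = sf.read(path, dtype="float32")  # float32 samples in [-1.0, 1.0]
    features = processor(audio, sampling_rate=sample_rate, return_tensors="pt").input_features
    if device.type == "cuda":
        features = features.half()  # match the fp16 model weights on GPU
    predicted_ids = model.generate(features.to(device), forced_decoder_ids=forced_decoder_ids)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
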
global_ndarray = None  # buffer that accumulates speech blocks between silences
audio_queue = queue.Queue()  # hands audio blocks from the capture thread to the main thread
running = True  # set to False to stop both loops

def audio_capture_thread():
    """Continuously read fixed-size blocks from the microphone and hand them to the main thread."""
    with sd.InputStream(
        samplerate=16000,
        channels=1,
        dtype="int16",
        blocksize=BLOCKSIZE,
    ) as stream:
        while running:
            indata, status = stream.read(BLOCKSIZE)
            audio_queue.put(indata)
    audio_queue.put(None)  # Sentinel value to indicate end of stream

def transcription_and_plotting():
    """Plot the live waveform and transcribe buffered speech segments once silence is detected."""
    plt.ion()
    fig, ax = plt.subplots()
    (line,) = ax.plot(np.random.randn(BLOCKSIZE))
    ax.set_ylim([-(2**15), 2**15 - 1])
    ax.set_xlim(0, BLOCKSIZE)

    global global_ndarray
    while running:
        indata = audio_queue.get()
        if indata is None:  # If end-of-stream sentinel is found, break the loop
            break
        indata_flattened = abs(indata.flatten())

        # Update the live waveform plot with the latest block.
        line.set_ydata(indata.flatten())
        plt.draw()
        plt.pause(0.001)

        # A block counts as speech when enough samples exceed the silence threshold.
        is_significant_audio = (
            np.asarray(np.where(indata_flattened > SILENCE_THRESHOLD)).size >= SILENCE_RATIO
        )

        if is_significant_audio:
            # Accumulate speech blocks until silence is detected.
            if global_ndarray is not None:
                global_ndarray = np.concatenate((global_ndarray, indata), dtype="int16")
            else:
                global_ndarray = indata
        elif global_ndarray is not None:
            # Silence after speech: transcribe the buffer once it is long enough.
            if len(global_ndarray) < MIN_AUDIO_LENGTH:
                continue
            # Convert int16 samples to the float32 range [-1, 1] expected by Whisper.
            indata_transformed = global_ndarray.flatten().astype(np.float32) / 32768.0
            global_ndarray = None

            input_data = processor(
                indata_transformed,
                sampling_rate=16000,
                return_tensors="pt",
            ).input_features
            if device.type == "cuda":
                input_data = input_data.half()  # match the fp16 model weights on GPU
            predicted_ids = model.generate(
                input_data.to(device),
                forced_decoder_ids=forced_decoder_ids,
            )
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            print(f"Transcription: {transcription}")

            with open(SAVE_PATH, "a", encoding="utf-8") as file:
                file.write(transcription + "\n")
                file.flush()

if __name__ == "__main__":
    capture_thread = threading.Thread(target=audio_capture_thread)
    capture_thread.start()
    try:
        transcription_and_plotting()
    except KeyboardInterrupt:
        print("\nInterrupted by user")

    # Stop both loops and wait for the capture thread to finish.
    running = False
    plt.close()
    capture_thread.join()
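
# Usage note: run `python word2emotion_and_plotting.py` with a working microphone and the packages
# imported above installed. Detected speech segments are printed to the console and appended to
# transcriptions.txt; press Ctrl+C in the terminal to stop.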