-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspeech_to_text.py
131 lines (103 loc) · 4.4 KB
/
speech_to_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import asyncio
import websockets
import json
import requests
import pyaudio
class SpeechToText:
    """Record a short microphone clip and transcribe it with Watson Speech to Text.

    Constructing an instance opens the default microphone through PyAudio and
    immediately records a clip (see ``record_voice``), so ``__init__`` blocks
    until the recording is finished. Call ``run`` afterwards to send the
    recorded audio over a websocket and obtain the transcript.
    """

    def __init__(self):
        # Variables to use for recording audio
        self.CHUNK = 1024
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 2
        self.RATE = 16000

        # Initialize auth_token; it is fetched lazily by speech_to_text()
        self.auth_token = None
        self.token_header = {"X-Watson-Authorization-Token": None}
        # Returned unchanged by run() when the service sends no transcript
        self.transcript = "No audio was received"

        # This is the language model to use to transcribe the audio
        self.model = "en-US_BroadbandModel"

        # These are the urls we will be using to communicate with Watson
        self.default_url = "https://stream.watsonplatform.net/speech-to-text/api"
        self.token_url = "https://stream.watsonplatform.net/authorization/api/v1/token?" \
                         "url=https://stream.watsonplatform.net/speech-to-text/api"
        # Built from self.model so the two cannot drift apart
        self.url = ("wss://stream.watsonplatform.net/speech-to-text/api/v1/recognize"
                    "?model=" + self.model)

        # Params to use for Watson API; the content type is derived from the
        # recording settings above so it always describes the captured audio
        self.params = {
            "word_confidence": True,
            "content_type": "audio/l16;rate={};channels={}".format(self.RATE, self.CHANNELS),
            "action": "start",
            "interim_results": False,
        }

        self.p = pyaudio.PyAudio()
        # Opens the stream to start recording from the default microphone
        self.stream = self.p.open(format=self.FORMAT,
                                  channels=self.CHANNELS,
                                  rate=self.RATE,
                                  input=True,
                                  output=True,
                                  frames_per_buffer=self.CHUNK)
        # Record last, once every attribute has been initialised
        self.record_voice()

    def record_voice(self, seconds=5):
        """Record ``seconds`` of audio from the open stream into ``self.audio_feed``.

        The stream and the PyAudio instance are released when recording ends,
        including when reading from the stream raises.

        Args:
            seconds: Length of the clip to record. Defaults to 5.
        """
        # Starts recording of microphone
        print("* READY *")
        frames = []
        try:
            for _ in range(int(self.RATE / self.CHUNK * seconds)):
                frames.append(self.stream.read(self.CHUNK))
            print("* FINISH *")
        finally:
            # Stop the stream and terminate the recording, even on failure,
            # so the audio device is not left open
            self.stream.stop_stream()
            self.stream.close()
            self.p.terminate()
        self.audio_feed = b''.join(frames)

    def get_auth(self):
        """Request an authorization token and store it with its header.

        Sets ``self.auth_token`` and ``self.token_header``.

        Raises:
            requests.HTTPError: If the token endpoint answers with an error
                status (for example when the credentials are rejected).
        """
        # BlueMix app credentials
        username = "<username>"
        password = "<password>"
        # Send a request to get an authorization key
        r = requests.get(self.token_url, auth=(username, password))
        # Fail here rather than storing an error page as the token
        r.raise_for_status()
        self.auth_token = r.text
        self.token_header = {"X-Watson-Authorization-Token": self.auth_token}
        print("Authorization was requested")

    async def send_audio(self, audio, ws):
        """Send ``audio`` over ``ws`` followed by a ``stop`` action.

        When ``audio`` is empty only the ``stop`` action is sent.

        Args:
            audio: Raw audio bytes to transmit.
            ws: Open websocket connection exposing an awaitable ``send``.
        """
        stop_message = json.dumps({'action': 'stop'})
        if not audio:
            await ws.send(stop_message)
            # Nothing to transmit; do not send an empty payload and a
            # duplicate stop message
            return
        await ws.send(audio)
        await ws.send(stop_message)

    async def speech_to_text(self, audio, auth_token):
        """Stream ``audio`` to the recognize endpoint and store the transcript.

        The latest transcript received is kept in ``self.transcript``. The
        coroutine finishes when a result marked ``final`` arrives or when a
        message without a ``results`` key is received.

        Args:
            audio: Raw audio bytes to transcribe.
            auth_token: Existing authorization token, or ``None`` to request
                a new one through ``get_auth``.

        Returns:
            Always ``False``.
        """
        self.auth_token = auth_token
        if self.auth_token is None:
            self.get_auth()
        else:
            self.token_header = {"X-Watson-Authorization-Token": self.auth_token}

        async with websockets.connect(self.url, extra_headers=self.token_header) as conn:
            # Send request to watson and waits for the listening response
            await conn.send(json.dumps(self.params))
            rec = await conn.recv()
            print(rec)

            # Upload the audio concurrently while responses are being read
            sender = asyncio.ensure_future(self.send_audio(audio, conn))
            try:
                # Keeps receiving transcript until we have the final transcript
                while True:
                    rec = await conn.recv()
                    parsed = json.loads(rec)

                    results = parsed.get("results")
                    if results is None:
                        # A message without results ends the session
                        return False
                    if not results:
                        # Empty result list: nothing to read yet, keep waiting
                        continue

                    best = results[0]
                    try:
                        self.transcript = best["alternatives"][0]["transcript"]
                    except (KeyError, IndexError):
                        return False
                    print(self.transcript)

                    if best.get("final"):
                        return False
            finally:
                # The connection is closed by the ``async with`` block; make
                # sure the upload task does not outlive it
                if not sender.done():
                    sender.cancel()

    def run(self, auth_token=None):
        """Start the application loop and transcribe the recorded audio.

        Args:
            auth_token: Existing authorization token to reuse. When ``None``
                (the default) a new token is requested.

        Returns:
            A ``(transcript, auth_token)`` tuple.
        """
        loop = asyncio.get_event_loop()
        loop.run_until_complete(self.speech_to_text(self.audio_feed, auth_token))
        return self.transcript, self.auth_token
if __name__ == "__main__":
    # Script entry point: record a clip, transcribe it and print the
    # (transcript, auth_token) tuple returned by run().
    app = SpeechToText()
    # No token is available yet; passing None makes the class request one.
    result = app.run(None)
    print(result)