Commit
Merge remote-tracking branch 'iory/respeaker' into respeaker-multi-pr1040
708yamaguchi committed Jun 14, 2022
2 parents c3a1b34 + 383670f commit 58c4f64
Showing 9 changed files with 274 additions and 16 deletions.
dialogflow_task_executive/launch/dialogflow_task_executive.launch
@@ -1,7 +1,7 @@
<launch>
<arg name="run_app_manager" default="false" />
<arg name="applist" default="" />
<arg name="credential" default="$(optenv GOOGLE_APPLICATION_CREDENTIALS)"/>
<arg name="credential" default="$(optenv GOOGLE_APPLICATION_CREDENTIALS)" doc="Read credentials JSON from this value when use_yaml is false." />
<arg name="project_id" default="$(optenv DIALOGFLOW_PROJECT_ID)"/>
<arg name="enable_hotword" default="true" />

6 changes: 4 additions & 2 deletions dialogflow_task_executive/samples/demo.launch
@@ -1,10 +1,12 @@
<launch>
<arg name="google_cloud_credentials_json" default="" />
<arg name="google_cloud_credentials_json" default="$(optenv GOOGLE_APPLICATION_CREDENTIALS)" doc="Read credentials JSON from this value when use_yaml is false." />
<arg name="project_id" default="eternal-byte-236613" />
<arg name="run_app_manager" default="true" doc="Run app_manager or not"/>

<include file="$(find dialogflow_task_executive)/launch/dialogflow_task_executive.launch" >
<arg name="credential" value="$(arg google_cloud_credentials_json)" />
<arg name="project_id" value="$(arg project_id)" />
<arg name="run_app_manager" value="true" />
<arg name="run_app_manager" value="$(arg run_app_manager)" />
<arg name="enable_hotword" default="false" />
</include>
<node pkg="dialogflow_task_executive" type="client.l"
2 changes: 2 additions & 0 deletions respeaker_ros/CMakeLists.txt
@@ -18,5 +18,7 @@ catkin_install_python(PROGRAMS ${PYTHON_SCRIPTS}

if(CATKIN_ENABLE_TESTING)
find_package(rostest REQUIRED)
find_package(roslaunch REQUIRED)
add_rostest(test/sample_respeaker.test)
roslaunch_add_file_check(launch/sample_respeaker.launch)
endif()
145 changes: 145 additions & 0 deletions respeaker_ros/README.md
@@ -87,6 +87,151 @@ A ROS Package for Respeaker Mic Array
a: 0.3"
```

## Parameters for respeaker_node.py

- ### Publishing topics

- `audio` (`audio_common_msgs/AudioData`)

Processed audio for ASR. 1 channel.

- `audio_info` (`audio_common_msgs/AudioInfo`)

Audio info corresponding to `audio`.

- `audio_raw` (`audio_common_msgs/AudioData`)

Raw microphone-array audio data with 4 channels. You may need to update the respeaker firmware.

If the firmware does not support it, this topic is not published.

- `audio_info_raw` (`audio_common_msgs/AudioInfo`)

Audio info corresponding to `audio_raw`.

If the firmware does not support it, this topic is not published.

- `speech_audio` (`audio_common_msgs/AudioData`)

Audio data recorded while a person is speaking, as detected by the VAD function.

- `speech_audio_raw` (`audio_common_msgs/AudioData`)

4-channel audio data recorded while a person is speaking, as detected by the VAD function.

If the firmware does not support it, this topic is not published.

- `audio_merged_playback` (`audio_common_msgs/AudioData`)

Audio that merges the microphone and speaker sound.

If the firmware does not support it, this topic is not published.

For more details, see https://wiki.seeedstudio.com/ReSpeaker_Mic_Array_v2.0/. A minimal subscriber sketch appears at the end of this section.

- `~is_speeching` (`std_msgs/Bool`)

Whether someone is speaking, as detected by the VAD function.

- `~sound_direction` (`std_msgs/Int32`)

Estimated direction of arrival (DoA) of the sound.

- `~sound_localization` (`geometry_msgs/PoseStamped`)

Localized sound direction. The pose position is placed in the estimated direction at a radius of `~doa_xy_offset`.

- ### Parameters

- `~update_rate` (`Double`, default: `10.0`)

Rate [Hz] at which info topics such as `~is_speeching`, `~sound_direction`, `~sound_localization`, `~speech_audio` and `~speech_audio_raw` are published.

- `~sensor_frame_id` (`String`, default: `respeaker_base`)

Frame id.

- `~doa_xy_offset` (`Double`, default: `0.0`)

Radius used for `~sound_localization`: the pose is placed this far from the sensor in the estimated direction.

- `~doa_yaw_offset` (`Double`, default: `90.0`)

Yaw offset applied to the estimated DoA.

- `~speech_prefetch` (`Double`, default: `0.5`)

How long [s] audio is pre-buffered before speech is detected.

- `~speech_continuation` (`Double`, default: `0.5`)

If the time since speech last stopped is shorter than this value, it is assumed that someone is still speaking.

- `~speech_max_duration` (`Double`, default: `7.0`)
- `~speech_min_duration` (`Double`, default: `0.1`)

If the duration of a detected utterance is within these bounds, `~speech_audio` and `~speech_audio_raw` are published.

- `~suppress_pyaudio_error` (`Bool`, default: `True`)

If this value is `True`, suppress errors from pyaudio.
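
A minimal subscriber sketch for these topics, assuming the default topic names above (the node name `respeaker_listener` is made up for illustration, not part of this package):

```python
#!/usr/bin/env python
import rospy
from audio_common_msgs.msg import AudioData
from std_msgs.msg import Int32


def on_direction(msg):
    # Estimated direction of arrival published by respeaker_node.py.
    rospy.loginfo("sound direction: %d", msg.data)


def on_speech(msg):
    # One VAD-segmented utterance, published after the speech ends.
    rospy.loginfo("speech segment: %d bytes", len(msg.data))


if __name__ == '__main__':
    rospy.init_node('respeaker_listener')  # hypothetical node name
    rospy.Subscriber('sound_direction', Int32, on_direction)
    rospy.Subscriber('speech_audio', AudioData, on_speech)
    rospy.spin()
```
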
## Parameters for speech_to_text.py

- ### Publishing topics

- `~speech_to_text` (`speech_recognition_msgs/SpeechRecognitionCandidates`)

Recognized text (see the subscriber sketch at the end of this section).

- ### Subscribing topics

- `audio` (`audio_common_msgs/AudioData`)

Input audio.

- ### Parameters

- `~audio_info` (`String`, default: `""`)

Name of an audio_info (`audio_common_msgs/AudioInfo`) topic. If this value is specified, the `~sample_rate`, `~sample_width` and `~channels` parameters are obtained from that topic.

- `~sample_rate` (`Int`, default: `16000`)

Sampling rate.

- `~sample_width` (`Int`, default: `2`)

Sample width.

- `~channels` (`Int`, default: `1`)

Number of channels.

- `~target_channel` (`Int`, default: `0`)

Index of the channel used for recognition.

- `~language` (`String`, default: `ja-JP`)

Language of the speech-to-text service. For English, specify `en-US`.

- `~self_cancellation` (`Bool`, default: `True`)

If `True`, ignore voice input while the robot is speaking.

- `~tts_tolerance` (`Double`, default: `1.0`)

Time [s] to keep assuming the robot is speaking after the TTS action finishes.

- `~tts_action_names` (`List[String]`, default: `['sound_play']`)

Used when `~self_cancellation` is `True`. While any of these actions is active, the callback that subscribes to `audio` does nothing.
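
A minimal sketch of a client that prints results from speech_to_text.py, assuming the topic is remapped to `speech_to_text` as in sample_respeaker.launch (the node name is made up for illustration):

```python
#!/usr/bin/env python
import rospy
from speech_recognition_msgs.msg import SpeechRecognitionCandidates


def on_result(msg):
    # transcript holds the candidate texts returned by the service.
    for text in msg.transcript:
        rospy.loginfo("recognized: %s", text)


if __name__ == '__main__':
    rospy.init_node('speech_to_text_listener')  # hypothetical node name
    rospy.Subscriber('speech_to_text', SpeechRecognitionCandidates, on_result)
    rospy.spin()
```
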
## Use cases
### Voice Recognition
6 changes: 5 additions & 1 deletion respeaker_ros/launch/sample_respeaker.launch
@@ -13,14 +13,17 @@
<arg name="language" default="en-US"/>
<!-- self cancellation -->
<arg name="self_cancellation" default="true"/>
<!-- audio info topic name -->
<arg name="audio_info" default="audio_info"/>

<node if="$(arg publish_tf)"
name="static_transformer" pkg="tf" type="static_transform_publisher"
args="0 0 0 0 0 0 1 map respeaker_base 100"/>

<node if="$(arg launch_respeaker)"
name="respeaker_node" pkg="respeaker_ros" type="respeaker_node.py"
respawn="true" respawn_delay="10" />
respawn="true" respawn_delay="10" >
</node>

<node if="$(arg launch_soundplay)"
name="sound_play" pkg="sound_play" type="soundplay_node.py"/>
@@ -30,6 +33,7 @@
<remap from="audio" to="$(arg audio)"/>
<remap from="speech_to_text" to="$(arg speech_to_text)"/>
<rosparam subst_value="true">
audio_info: $(arg audio_info)
language: $(arg language)
self_cancellation: $(arg self_cancellation)
tts_tolerance: 0.5
1 change: 1 addition & 0 deletions respeaker_ros/package.xml
@@ -15,6 +15,7 @@
<exec_depend>flac</exec_depend>
<exec_depend>geometry_msgs</exec_depend>
<exec_depend>std_msgs</exec_depend>
<exec_depend>sound_play</exec_depend>
<exec_depend>speech_recognition_msgs</exec_depend>
<exec_depend>tf</exec_depend>
<exec_depend condition="$ROS_PYTHON_VERSION == 2">python-numpy</exec_depend>
94 changes: 85 additions & 9 deletions respeaker_ros/scripts/respeaker_node.py
@@ -16,6 +16,7 @@
import sys
import time
from audio_common_msgs.msg import AudioData
from audio_common_msgs.msg import AudioInfo
from geometry_msgs.msg import PoseStamped
from std_msgs.msg import Bool, Int32, ColorRGBA
from dynamic_reconfigure.server import Server
@@ -254,7 +255,6 @@ def __init__(self, on_audio, channel=0, suppress_error=True):
if self.channels != 6:
rospy.logwarn("%d channel is found for respeaker" % self.channels)
rospy.logwarn("You may have to update firmware.")
- self.channel = min(self.channels - 1, max(0, self.channel))

self.stream = self.pyaudio.open(
input=True, start=False,
@@ -284,9 +284,8 @@ def stream_callback(self, in_data, frame_count, time_info, status):
data = np.frombuffer(in_data, dtype=np.int16)
chunk_per_channel = int(len(data) / self.channels)
data = np.reshape(data, (chunk_per_channel, self.channels))
- chan_data = data[:, self.channel]
# invoke callback
- self.on_audio(chan_data.tobytes())
self.on_audio(data)
return None, pyaudio.paContinue

def start(self):
@@ -322,21 +321,80 @@ def __init__(self):
self.pub_doa_raw = rospy.Publisher("sound_direction", Int32, queue_size=1, latch=True)
self.pub_doa = rospy.Publisher("sound_localization", PoseStamped, queue_size=1, latch=True)
self.pub_audio = rospy.Publisher("audio", AudioData, queue_size=10)
self.pub_audio_info = rospy.Publisher("audio_info", AudioInfo,
queue_size=1, latch=True)
self.pub_audio_raw_info = rospy.Publisher("audio_info_raw", AudioInfo,
queue_size=1, latch=True)
self.pub_speech_audio = rospy.Publisher("speech_audio", AudioData, queue_size=10)
# init config
self.config = None
self.dyn_srv = Server(RespeakerConfig, self.on_config)
# start
self.respeaker_audio = RespeakerAudio(self.on_audio, suppress_error=suppress_pyaudio_error)
self.n_channel = self.respeaker_audio.channels

self.speech_prefetch_bytes = int(
- self.speech_prefetch * self.respeaker_audio.rate * self.respeaker_audio.bitdepth / 8.0)
1
* self.speech_prefetch
* self.respeaker_audio.rate
* self.respeaker_audio.bitdepth / 8.0)
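# e.g. for the 1-channel ASR stream at 16 kHz, 16-bit:
# 1 * 0.5 s * 16000 Hz * 16 bit / 8 = 16000 bytes of prefetch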
self.speech_prefetch_buffer = b""
self.respeaker_audio.start()
self.info_timer = rospy.Timer(rospy.Duration(1.0 / self.update_rate),
self.on_timer)
self.timer_led = None
self.sub_led = rospy.Subscriber("status_led", ColorRGBA, self.on_status_led)

# processed audio for ASR
info_msg = AudioInfo(
channels=1,
sample_rate=self.respeaker_audio.rate,
sample_format='S16LE',
bitrate=self.respeaker_audio.rate * self.respeaker_audio.bitdepth,
coding_format='WAVE')
self.pub_audio_info.publish(info_msg)

if self.n_channel > 1:
# The respeaker has 4 microphones.
# Multiple microphones can be used for
# beam forming (strengthening the sound in a specific direction)
# and sound localization (the respeaker outputs the azimuth
# direction, but the multichannel can estimate
# the elevation direction). etc.

# Channel 0: processed audio for ASR
# Channel 1: mic1 raw data
# Channel 2: mic2 raw data
# Channel 3: mic3 raw data
# Channel 4: mic4 raw data
# Channel 5: merged playback
# For more detail, please see
# https://wiki.seeedstudio.com/ReSpeaker_Mic_Array_v2.0/
# (self.n_channel - 2) = 4 channels are multiple microphones.
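# e.g. with n_channel == 6, each incoming frame reshapes to
# (chunk, 6); column 0 feeds `audio`, columns 1-4 feed
# `audio_raw`, and column 5 feeds `audio_merged_playback`
# (see on_audio below).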
self.pub_audio_raw = rospy.Publisher("audio_raw", AudioData,
queue_size=10)
self.pub_audio_merged_playback = rospy.Publisher(
"audio_merged_playback", AudioData,
queue_size=10)
info_raw_msg = AudioInfo(
channels=self.n_channel - 2,
sample_rate=self.respeaker_audio.rate,
sample_format='S16LE',
bitrate=(self.respeaker_audio.rate *
self.respeaker_audio.bitdepth),
coding_format='WAVE')
self.pub_audio_raw_info.publish(info_raw_msg)

self.speech_audio_raw_buffer = b""
self.speech_raw_prefetch_buffer = b""
self.pub_speech_audio_raw = rospy.Publisher(
"speech_audio_raw", AudioData, queue_size=10)
self.speech_raw_prefetch_bytes = int(
(self.n_channel - 2)
* self.speech_prefetch
* self.respeaker_audio.rate
* self.respeaker_audio.bitdepth / 8.0)
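# e.g. for the 4 raw channels at 16 kHz, 16-bit:
# 4 * 0.5 s * 16000 Hz * 16 bit / 8 = 64000 bytes of prefetch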

def on_shutdown(self):
try:
self.respeaker.close()
@@ -374,14 +432,30 @@ def on_status_led(self, msg):
oneshot=True)

def on_audio(self, data):
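# `data` is the (chunk, n_channel) int16 array forwarded by stream_callback.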
- self.pub_audio.publish(AudioData(data=data))
# take processed audio for ASR.
processed_data = data[:, 0].tobytes()
self.pub_audio.publish(AudioData(data=processed_data))
if self.n_channel > 1:
raw_audio_data = data[:, 1:5].reshape(-1).tobytes()
self.pub_audio_raw.publish(
AudioData(data=raw_audio_data))
self.pub_audio_merged_playback.publish(
AudioData(data=data[:, 5].tobytes()))
if self.is_speeching:
if len(self.speech_audio_buffer) == 0:
self.speech_audio_buffer = self.speech_prefetch_buffer
- self.speech_audio_buffer += data
if self.n_channel > 1:
self.speech_audio_raw_buffer = self.speech_raw_prefetch_buffer
self.speech_audio_buffer += processed_data
if self.n_channel > 1:
self.speech_audio_raw_buffer += raw_audio_data
else:
- self.speech_prefetch_buffer += data
self.speech_prefetch_buffer += processed_data
self.speech_prefetch_buffer = self.speech_prefetch_buffer[-self.speech_prefetch_bytes:]
if self.n_channel > 1:
self.speech_raw_prefetch_buffer += raw_audio_data
self.speech_raw_prefetch_buffer = self.speech_raw_prefetch_buffer[
-self.speech_raw_prefetch_bytes:]

def on_timer(self, event):
stamp = event.current_real or rospy.Time.now()
@@ -421,13 +495,15 @@ def on_timer(self, event):
elif self.is_speeching:
buf = self.speech_audio_buffer
self.speech_audio_buffer = b""
buf_raw = self.speech_audio_raw_buffer
self.speech_audio_raw_buffer = b""
self.is_speeching = False
duration = 8.0 * len(buf) * self.respeaker_audio.bitwidth
- duration = duration / self.respeaker_audio.rate / self.respeaker_audio.bitdepth
duration = duration / self.respeaker_audio.rate / self.respeaker_audio.bitdepth / self.n_channel
rospy.loginfo("Speech detected for %.3f seconds" % duration)
if self.speech_min_duration <= duration < self.speech_max_duration:

self.pub_speech_audio.publish(AudioData(data=buf))
self.pub_speech_audio_raw.publish(AudioData(data=buf_raw))


if __name__ == '__main__':
