Skip to content

Commit

Permalink
[WIP] Various Improvements (#12)
Browse files Browse the repository at this point in the history
* first pass of framework for resampler

* add resample header

* convert mono to stereo in ResamplerStreamer

* reduce timeout for announcing state

* first pass at sending stream info/stereo output

* working stereo mp3s

* working resampler!

* use mp3s for TTS

* split most streamers into own files

* revert back to using pcm for tts

* functioning flac decoder

* implement dac volume control

* publish state after muting/unmuting

* flac decoder pulls directly from ring buffer

* update todos

* add support for volume up and down commands

* tweak some memory allocations

* add volume support

* define i2c registers as static const

* mixing algorithm - consistent announcement loudness

* clear specific ring buffers in mixer when stopping

* stop announcement on wake word

* add some TODOs

* avoid high freq loopers to prevent stuttering

* apply biquad filters when resampling

* clean up code/variable names

* integrate synesthesiam's wav header parser

* stop active pipeline before starting new stream

* uniform I2C function return behavior

* simplify announcement flag handling

* increase and align buffer sizes

* fix edge case of trying to stop a stopping pipeline

* initial work for playing local media files

* update for release

* fix typo
  • Loading branch information
kahrendt authored Jul 22, 2024
1 parent f93e7d8 commit 4c796e2
Show file tree
Hide file tree
Showing 27 changed files with 3,430 additions and 799 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <driver/i2s.h>

#include "esphome/core/hal.h"
#include "esphome/core/helpers.h"
#include "esphome/core/log.h"
#include "esphome/core/ring_buffer.h"

Expand Down Expand Up @@ -254,12 +255,12 @@ size_t I2SAudioMicrophone::read(int16_t *buf, size_t len) {
}

void I2SAudioMicrophone::read_() {
std::vector<int16_t> samples;
std::vector<int16_t, ExternalRAMAllocator<int16_t>> samples;
samples.resize(BUFFER_SIZE);
// TODO this probably isn't correct
size_t bytes_read = this->read(samples.data(), BUFFER_SIZE / sizeof(int16_t));
samples.resize(bytes_read / sizeof(int16_t));
this->data_callbacks_.call(samples);
// this->data_callbacks_.call(samples);
}

void I2SAudioMicrophone::loop() {
Expand Down
32 changes: 32 additions & 0 deletions esphome/components/media_player/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,28 @@
media_player_ns = cg.esphome_ns.namespace("media_player")

MediaPlayer = media_player_ns.class_("MediaPlayer")
# C++ types declared in the media_player namespace that describe a local media file.
MediaFile = media_player_ns.struct("MediaFile")
MediaFileType = media_player_ns.enum("MediaFileType", is_class=True)

# Maps each upper-case type name to the MediaFileType member of the same name.
MEDIA_FILE_TYPE_ENUM = {
    type_name: getattr(MediaFileType, type_name)
    for type_name in ("NONE", "WAV", "MP3", "FLAC")
}

# Configuration key naming a media file.
CONF_MEDIA_FILE = "media_file"



# Automation action classes from the media_player namespace. Each one is declared as an
# automation.Action that is parented to a MediaPlayer.
PlayAction = media_player_ns.class_(
"PlayAction", automation.Action, cg.Parented.template(MediaPlayer)
)
PlayMediaAction = media_player_ns.class_(
"PlayMediaAction", automation.Action, cg.Parented.template(MediaPlayer)
)
# NOTE(review): the C++ PlayLocalMediaAction template and the
# "media_player.play_local_media_file" registration both appear to be commented out in this
# change, so this declaration looks unused for now - confirm before relying on it.
PlayLocalMediaAction = media_player_ns.class_(
"PlayLocalMediaAction", automation.Action, cg.Parented.template(MediaPlayer)
)
ToggleAction = media_player_ns.class_(
"ToggleAction", automation.Action, cg.Parented.template(MediaPlayer)
)
Expand Down Expand Up @@ -143,6 +158,23 @@ async def media_player_play_media_action(config, action_id, template_arg, args):
cg.add(var.set_media_url(media_url))
return var

# @automation.register_action(
# "media_player.play_local_media_file",
# PlayLocalMediaAction,
# cv.maybe_simple_value(
# {
# cv.GenerateID(): cv.use_id(MediaPlayer),
# cv.Required(CONF_MEDIA_FILE): cv.templatable(cv.string),
# },
# key=CONF_MEDIA_FILE,
# ),
# )
# async def media_player_play_media_action(config, action_id, template_arg, args):
# var = cg.new_Pvariable(action_id, template_arg)
# await cg.register_parented(var, config[CONF_ID])
# media_url = await cg.templatable(config[CONF_MEDIA_URL], args, cg.std_string)
# cg.add(var.set_media_url(media_url))
# return var

@automation.register_action("media_player.play", PlayAction, MEDIA_PLAYER_ACTION_SCHEMA)
@automation.register_action(
Expand Down
5 changes: 5 additions & 0 deletions esphome/components/media_player/automation.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ template<typename... Ts> class PlayMediaAction : public Action<Ts...>, public Pa
void play(Ts... x) override { this->parent_->make_call().set_media_url(this->media_url_.value(x...)).perform(); }
};

// template<typename... Ts> class PlayLocalMediaAction : public Action<Ts...>, public Parented<MediaPlayer> {
// TEMPLATABLE_VALUE(MediaFile, media_file)
// void play(Ts... x) override { this->parent_->make_call().set_local_media_file(this->media_file_.value(x...)).perform(); }
// };

template<typename... Ts> class VolumeSetAction : public Action<Ts...>, public Parented<MediaPlayer> {
TEMPLATABLE_VALUE(float, volume)
void play(Ts... x) override { this->parent_->make_call().set_volume(this->volume_.value(x...)).perform(); }
Expand Down
5 changes: 5 additions & 0 deletions esphome/components/media_player/media_player.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ MediaPlayerCall &MediaPlayerCall::set_media_url(const std::string &media_url) {
return *this;
}

// Records the MediaFile pointer on this call and returns *this so setters can be chained.
// Only the pointer is stored; the MediaFile itself is not copied here.
// NOTE(review): a nullptr argument is stored like any other pointer - confirm that consumers of
// get_local_media_file() check the pointer before dereferencing it.
MediaPlayerCall &MediaPlayerCall::set_local_media_file(MediaFile *media_file) {
this->media_file_ = media_file;
return *this;
}

MediaPlayerCall &MediaPlayerCall::set_volume(float volume) {
this->volume_ = volume;
return *this;
Expand Down
17 changes: 17 additions & 0 deletions esphome/components/media_player/media_player.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,20 @@ enum MediaPlayerCommand : uint8_t {
};
const char *media_player_command_to_string(MediaPlayerCommand command);

// Tag identifying the type of a media file (stored in MediaFile::file_type).
// Backed by a single byte; the numeric values are spelled out so they stay stable.
enum class MediaFileType : uint8_t {
  NONE = 0,
  WAV = 1,
  MP3 = 2,
  FLAC = 3,
};

// Plain aggregate describing one media file: a data pointer, a length and a type tag.
// Passed by pointer to MediaPlayerCall::set_local_media_file().
// There are no default member initializers, so the creator must set every field.
struct MediaFile {
// Read-only pointer to the file data (presumably the encoded file contents - TODO confirm).
const uint8_t *data;
// Presumably the size of `data` in bytes - TODO confirm.
size_t length;
// Type tag for the data (NONE, WAV, MP3 or FLAC).
MediaFileType file_type;
};


class MediaPlayer;

class MediaPlayerTraits {
Expand All @@ -50,6 +64,7 @@ class MediaPlayerCall {
MediaPlayerCall &set_command(const std::string &command);

MediaPlayerCall &set_media_url(const std::string &url);
MediaPlayerCall &set_local_media_file(MediaFile *media_file);

MediaPlayerCall &set_volume(float volume);
MediaPlayerCall &set_announcement(bool announce);
Expand All @@ -60,6 +75,7 @@ class MediaPlayerCall {
const optional<std::string> &get_media_url() const { return media_url_; }
const optional<float> &get_volume() const { return volume_; }
const optional<bool> &get_announcement() const { return announcement_; }
const optional<MediaFile*> &get_local_media_file() const { return media_file_; }

protected:
void validate_();
Expand All @@ -68,6 +84,7 @@ class MediaPlayerCall {
optional<std::string> media_url_;
optional<float> volume_;
optional<bool> announcement_;
optional<MediaFile*> media_file_;
};

class MediaPlayer : public EntityBase {
Expand Down
48 changes: 25 additions & 23 deletions esphome/components/micro_wake_word/micro_wake_word.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ static const char *const TAG = "micro_wake_word";
static const size_t SAMPLE_RATE_HZ = 16000; // 16 kHz
static const size_t BUFFER_LENGTH = 64; // 0.064 seconds
static const size_t BUFFER_SIZE = SAMPLE_RATE_HZ / 1000 * BUFFER_LENGTH;
static const size_t INPUT_BUFFER_SIZE = 16 * SAMPLE_RATE_HZ / 1000; // 16ms * 16kHz / 1000ms
static const size_t INPUT_BUFFER_SIZE = 32 * SAMPLE_RATE_HZ / 1000; // 32ms * 16kHz / 1000ms

float MicroWakeWord::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; }

Expand Down Expand Up @@ -113,7 +113,7 @@ void MicroWakeWord::loop() {
ESP_LOGD(TAG, "Starting Microphone");
this->microphone_->start();
this->set_state_(State::STARTING_MICROPHONE);
this->high_freq_.start();
// this->high_freq_.start();
break;
case State::STARTING_MICROPHONE:
if (this->microphone_->is_running()) {
Expand All @@ -135,7 +135,7 @@ void MicroWakeWord::loop() {
ESP_LOGD(TAG, "Stopping Microphone");
// this->microphone_->stop();
this->set_state_(State::STOPPING_MICROPHONE);
this->high_freq_.stop();
// this->high_freq_.stop();
this->unload_models_();
this->deallocate_buffers_();
break;
Expand Down Expand Up @@ -299,31 +299,33 @@ void MicroWakeWord::unload_models_() {
void MicroWakeWord::update_model_probabilities_() {
int8_t audio_features[PREPROCESSOR_FEATURE_SIZE];

if (!this->generate_features_for_window_(audio_features)) {
return;
}
// if (!this->generate_features_for_window_(audio_features)) {
// return;
// }

// Increase the counter since the last positive detection
this->ignore_windows_ = std::min(this->ignore_windows_ + 1, 0);
while (this->generate_features_for_window_(audio_features)) {
// Increase the counter since the last positive detection
this->ignore_windows_ = std::min(this->ignore_windows_ + 1, 0);

// static size_t total_inference_time = 0;
// static size_t inference_count = 0;
// static size_t total_inference_time = 0;
// static size_t inference_count = 0;

// size_t start_time = millis();
for (auto &model : this->wake_word_models_) {
// Perform inference
model.perform_streaming_inference(audio_features);
}
// size_t start_time = millis();
for (auto &model : this->wake_word_models_) {
// Perform inference
model.perform_streaming_inference(audio_features);
}
#ifdef USE_MICRO_WAKE_WORD_VAD
this->vad_model_->perform_streaming_inference(audio_features);
this->vad_model_->perform_streaming_inference(audio_features);
#endif
// total_inference_time += (millis() - start_time);
// ++inference_count;
// if (inference_count > 500) {
// ESP_LOGD(TAG, "average inference time=%.3f ms", static_cast<float>(total_inference_time) / inference_count);
// total_inference_time = 0;
// inference_count = 0;
// }
// total_inference_time += (millis() - start_time);
// ++inference_count;
// if (inference_count > 500) {
// ESP_LOGD(TAG, "average inference time=%.3f ms", static_cast<float>(total_inference_time) / inference_count);
// total_inference_time = 0;
// inference_count = 0;
// }
}
}

bool MicroWakeWord::detect_wake_words_() {
Expand Down
93 changes: 93 additions & 0 deletions esphome/components/nabu/biquad.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
////////////////////////////////////////////////////////////////////////////
// **** BIQUAD **** //
// Simple Biquad Filter Library //
// Copyright (c) 2021 - 2022 David Bryant. //
// All Rights Reserved. //
// Distributed under the BSD Software License (see license.txt) //
////////////////////////////////////////////////////////////////////////////

// biquad.c

#include "biquad.h"

// Second-order Lowpass
//
// Writes the five coefficients of a second-order lowpass section (Q = sqrt(0.5)) into `filter`.
// `frequency` is used only through tan (M_PI * frequency); presumably it is the cutoff expressed
// as a fraction of the sample rate - TODO confirm against callers.

void biquad_lowpass (BiquadCoefficients *filter, double frequency)
{
    const double q = sqrt (0.5);
    const double k = tan (M_PI * frequency);
    const double k_sq = k * k;
    const double k_over_q = k / q;
    const double norm = 1.0 / (1.0 + k_over_q + k_sq);

    filter->a0 = k_sq * norm;
    // a1 and a2 are derived from the value already stored (as float) in a0
    filter->a1 = 2 * filter->a0;
    filter->a2 = filter->a0;
    filter->b1 = 2.0 * (k_sq - 1.0) * norm;
    filter->b2 = (1.0 - k_over_q + k_sq) * norm;
}

// Second-order Highpass
//
// Writes the five coefficients of a second-order highpass section (Q = sqrt(0.5)) into `filter`.
// `frequency` is used only through tan (M_PI * frequency); presumably it is the cutoff expressed
// as a fraction of the sample rate - TODO confirm against callers.

void biquad_highpass (BiquadCoefficients *filter, double frequency)
{
    const double q = sqrt (0.5);
    const double k = tan (M_PI * frequency);
    const double k_sq = k * k;
    const double k_over_q = k / q;
    const double norm = 1.0 / (1.0 + k_over_q + k_sq);

    filter->a0 = norm;
    filter->a1 = -2.0 * norm;
    // a2 mirrors the value already stored (as float) in a0
    filter->a2 = filter->a0;
    filter->b1 = 2.0 * (k_sq - 1.0) * norm;
    filter->b2 = (1.0 - k_over_q + k_sq) * norm;
}

// Initialize the specified biquad filter with the given parameters. Note that the "gain" parameter
// is supplied here to save a multiply every time the filter is applied: it is folded into the
// a0/a1/a2 coefficients of the filter's private copy. The delayed input/output samples are reset
// to zero, and first_order is set when both a2 and b2 of the supplied coefficients are zero.

void biquad_init (Biquad *f, const BiquadCoefficients *coeffs, float gain)
{
    BiquadCoefficients *scaled = &f->coeffs;

    // take a private copy, then scale the input-side coefficients by the gain
    *scaled = *coeffs;
    scaled->a0 *= gain;
    scaled->a1 *= gain;
    scaled->a2 *= gain;

    // clear the filter history
    f->in_d2 = 0.0F;
    f->in_d1 = 0.0F;
    f->out_d2 = 0.0F;
    f->out_d1 = 0.0F;

    // with no second-order terms the apply functions can use the shorter formula
    f->first_order = (coeffs->a2 == 0.0F && coeffs->b2 == 0.0F);
}

// Apply the supplied sample to the specified biquad filter, which must have been initialized with
// biquad_init(). Returns the filtered sample and advances the filter's delayed input/output state
// by one step.

float biquad_apply_sample (Biquad *f, float input)
{
    const BiquadCoefficients *c = &f->coeffs;
    float output;

    if (!f->first_order) {
        output = (input * c->a0) + (f->in_d1 * c->a1) + (f->in_d2 * c->a2) - (c->b1 * f->out_d1) - (c->b2 * f->out_d2);
    }
    else {
        // a2 and b2 are zero, so the second-order terms are skipped
        output = (input * c->a0) + (f->in_d1 * c->a1) - (c->b1 * f->out_d1);
    }

    // shift the delay lines: the newest values go into *_d1
    f->in_d2 = f->in_d1;
    f->in_d1 = input;
    f->out_d2 = f->out_d1;
    f->out_d1 = output;

    return output;
}

// Apply the supplied buffer to the specified biquad filter, which must have been initialized with
// biquad_init(). The buffer is filtered in place.
//
//   f           - filter whose coefficients are used and whose delayed state is updated
//   buffer      - first sample to process; each sample is overwritten with its filtered value
//   num_samples - number of samples to process; zero or a negative count processes nothing
//   stride      - number of floats to advance between consecutive samples
//
// The loops are bounded with "num_samples > 0" so that a negative count is a no-op instead of
// running past the buffer until the counter overflows.

void biquad_apply_buffer (Biquad *f, float *buffer, int num_samples, int stride)
{
    const BiquadCoefficients *c = &f->coeffs;

    if (f->first_order) {
        // a2 and b2 are zero, so the second-order terms are skipped
        for (; num_samples > 0; --num_samples) {
            const float in = *buffer;
            const float out = (in * c->a0) + (f->in_d1 * c->a1) - (c->b1 * f->out_d1);

            f->out_d2 = f->out_d1;
            f->in_d2 = f->in_d1;
            f->in_d1 = in;
            f->out_d1 = out;
            *buffer = out;
            buffer += stride;
        }
    }
    else {
        for (; num_samples > 0; --num_samples) {
            const float in = *buffer;
            const float out = (in * c->a0) + (f->in_d1 * c->a1) + (f->in_d2 * c->a2) - (c->b1 * f->out_d1) - (c->b2 * f->out_d2);

            f->out_d2 = f->out_d1;
            f->in_d2 = f->in_d1;
            f->in_d1 = in;
            f->out_d1 = out;
            *buffer = out;
            buffer += stride;
        }
    }
}
41 changes: 41 additions & 0 deletions esphome/components/nabu/biquad.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
////////////////////////////////////////////////////////////////////////////
// **** BIQUAD **** //
// Simple Biquad Filter Library //
// Copyright (c) 2021 - 2022 David Bryant. //
// All Rights Reserved. //
// Distributed under the BSD Software License (see license.txt) //
////////////////////////////////////////////////////////////////////////////

// biquad.h

#ifndef BIQUAD_H
#define BIQUAD_H

// The include guard above keeps the anonymous-struct typedefs below from being redefined when
// this header is included more than once in a translation unit.

#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <math.h>

// Coefficients of one filter section. When a filter is applied, a0/a1/a2 multiply the current
// and delayed input samples, and b1/b2 multiply the delayed output samples (which are subtracted).
typedef struct {
    float a0, a1, a2, b1, b2;
} BiquadCoefficients;

// One filter instance: its own copy of the coefficients plus the running state.
typedef struct {
    BiquadCoefficients coeffs;  // coefficients
    float in_d1, in_d2;         // delayed input
    float out_d1, out_d2;       // delayed output
    int first_order;            // optimization
} Biquad;

#ifdef __cplusplus
extern "C" {
#endif

// Copies `coeffs` into `f`, scales a0/a1/a2 by `gain` and clears the delayed samples.
void biquad_init (Biquad *f, const BiquadCoefficients *coeffs, float gain);

// Fill `filter` with second-order lowpass / highpass coefficients for `frequency`.
void biquad_lowpass (BiquadCoefficients *filter, double frequency);
void biquad_highpass (BiquadCoefficients *filter, double frequency);

// Filter `num_samples` samples in place, advancing `stride` floats between samples.
void biquad_apply_buffer (Biquad *f, float *buffer, int num_samples, int stride);
// Filter a single sample and return the result.
float biquad_apply_sample (Biquad *f, float input);

#ifdef __cplusplus
}
#endif

#endif // BIQUAD_H
Loading

0 comments on commit 4c796e2

Please sign in to comment.