[WIP] Various Improvements (#12)

* first pass of framework for resampler * add resample header * convert mono to stereo in ResamplerStreamer * reduce timeout for announcing state * first pass at sending stream info/stereo output * working stereo mp3s * working resampler! * use mp3s for TTS * split most streamers into own files * revert back to using pcm for tts * functioning flac decoder * implement dac volume control * publish state after muting/unmuting * flac decoder pulls directly from ring buffer * update todos * add support for volume up and down commands * tweak some memory allocations * add volume support * define i2c registers as static const * mixing algorithm - consistent announcement loudness * clear specific ring buffers in mixer when stopping * stop announcement on wake word * add some TODOs * avoid high freq loopers to prevent stuttering * apply biquad filters when resampling * clean up code/variable names * integrate synesthesiam's wav header parser * stop active pipeline before starting new stream * uniform I2C function return behavior * simplify announcement flag handling * increase and align buffer sizes * fix edge case of trying to stop a stopping pipeline * initial work for playing local media files * update for release * fix typo
esphome · Jul 22, 2024 · 4c796e2 · 4c796e2
1 parent f93e7d8
commit 4c796e2
Show file tree

Hide file tree

Showing 27 changed files with 3,430 additions and 799 deletions.
diff --git a/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp b/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp
@@ -5,6 +5,7 @@
 #include <driver/i2s.h>
 
 #include "esphome/core/hal.h"
+#include "esphome/core/helpers.h"
 #include "esphome/core/log.h"
 #include "esphome/core/ring_buffer.h"
 
@@ -254,12 +255,12 @@ size_t I2SAudioMicrophone::read(int16_t *buf, size_t len) {
 }
 
 void I2SAudioMicrophone::read_() {
-  std::vector<int16_t> samples;
+  std::vector<int16_t, ExternalRAMAllocator<int16_t>> samples;
   samples.resize(BUFFER_SIZE);
   // TODO this probably isn't correct: the vector holds BUFFER_SIZE samples, but read() is given len = BUFFER_SIZE / sizeof(int16_t) - confirm whether len is a byte count or a sample count
   size_t bytes_read = this->read(samples.data(), BUFFER_SIZE / sizeof(int16_t));
   samples.resize(bytes_read / sizeof(int16_t));
-  this->data_callbacks_.call(samples);
+  // this->data_callbacks_.call(samples);  // NOTE(review): with the callbacks disabled, the samples read above are dropped when this function returns
 }
 
 void I2SAudioMicrophone::loop() {

diff --git a/esphome/components/media_player/__init__.py b/esphome/components/media_player/__init__.py
@@ -22,13 +22,28 @@
 media_player_ns = cg.esphome_ns.namespace("media_player")
 
 MediaPlayer = media_player_ns.class_("MediaPlayer")
+MediaFile = media_player_ns.struct("MediaFile")
+MediaFileType = media_player_ns.enum("MediaFileType", is_class=True)
+MEDIA_FILE_TYPE_ENUM = {
+    "NONE": MediaFileType.NONE,
+    "WAV": MediaFileType.WAV,
+    "MP3": MediaFileType.MP3,
+    "FLAC": MediaFileType.FLAC,
+}
+
+CONF_MEDIA_FILE = "media_file"
+
+
 
 PlayAction = media_player_ns.class_(
     "PlayAction", automation.Action, cg.Parented.template(MediaPlayer)
 )
 PlayMediaAction = media_player_ns.class_(
     "PlayMediaAction", automation.Action, cg.Parented.template(MediaPlayer)
 )
+PlayLocalMediaAction = media_player_ns.class_(
+    "PlayLocalMediaAction", automation.Action, cg.Parented.template(MediaPlayer)
+)
 ToggleAction = media_player_ns.class_(
     "ToggleAction", automation.Action, cg.Parented.template(MediaPlayer)
 )
@@ -143,6 +158,23 @@ async def media_player_play_media_action(config, action_id, template_arg, args):
     cg.add(var.set_media_url(media_url))
     return var
 
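+# NOTE(review): disabled draft. Before enabling it, give the coroutine its own name (it reuses media_player_play_media_action) and read CONF_MEDIA_FILE instead of CONF_MEDIA_URL / set_media_url.
+# @automation.register_action(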
+#     "media_player.play_local_media_file",
+#     PlayLocalMediaAction,
+#     cv.maybe_simple_value(
+#         {
+#             cv.GenerateID(): cv.use_id(MediaPlayer),
+#             cv.Required(CONF_MEDIA_FILE): cv.templatable(cv.string),
+#         },
+#         key=CONF_MEDIA_FILE,
+#     ),
+# )
+# async def media_player_play_media_action(config, action_id, template_arg, args):
+#     var = cg.new_Pvariable(action_id, template_arg)
+#     await cg.register_parented(var, config[CONF_ID])
+#     media_url = await cg.templatable(config[CONF_MEDIA_URL], args, cg.std_string)
+#     cg.add(var.set_media_url(media_url))
+#     return var
 
 @automation.register_action("media_player.play", PlayAction, MEDIA_PLAYER_ACTION_SCHEMA)
 @automation.register_action(

diff --git a/esphome/components/media_player/automation.h b/esphome/components/media_player/automation.h
@@ -37,6 +37,11 @@ template<typename... Ts> class PlayMediaAction : public Action<Ts...>, public Pa
   void play(Ts... x) override { this->parent_->make_call().set_media_url(this->media_url_.value(x...)).perform(); }
 };
 
+// template<typename... Ts> class PlayLocalMediaAction : public Action<Ts...>, public Parented<MediaPlayer> {
+//   TEMPLATABLE_VALUE(MediaFile, media_file)
+//   void play(Ts... x) override { this->parent_->make_call().set_local_media_file(this->media_file_.value(x...)).perform(); }
+// };
+
 template<typename... Ts> class VolumeSetAction : public Action<Ts...>, public Parented<MediaPlayer> {
   TEMPLATABLE_VALUE(float, volume)
   void play(Ts... x) override { this->parent_->make_call().set_volume(this->volume_.value(x...)).perform(); }

diff --git a/esphome/components/media_player/media_player.cpp b/esphome/components/media_player/media_player.cpp
@@ -108,6 +108,11 @@ MediaPlayerCall &MediaPlayerCall::set_media_url(const std::string &media_url) {
   return *this;
 }
 
+MediaPlayerCall &MediaPlayerCall::set_local_media_file(MediaFile *media_file) {  // Selects a local media file for this call and returns *this so setters can be chained.
+  this->media_file_ = media_file;  // only the pointer is stored (media_file_ is optional<MediaFile*>); the MediaFile itself is not copied
+  return *this;
+}
+
 MediaPlayerCall &MediaPlayerCall::set_volume(float volume) {
   this->volume_ = volume;
   return *this;

diff --git a/esphome/components/media_player/media_player.h b/esphome/components/media_player/media_player.h
@@ -27,6 +27,20 @@ enum MediaPlayerCommand : uint8_t {
 };
 const char *media_player_command_to_string(MediaPlayerCommand command);
 
+enum class MediaFileType : uint8_t {  // Format tag carried in MediaFile::file_type; stored as a single byte.
+  NONE = 0,  // explicit zero value; presumably means "no/unknown format" - TODO confirm
+  WAV,
+  MP3,
+  FLAC,
+};
+// Describes one media file: a read-only byte pointer, a length and a format tag. Ownership of the bytes is not expressed here.
+struct MediaFile {
+  const uint8_t *data;  // start of the file contents (read-only)
+  size_t length;  // presumably the number of bytes available at data - TODO confirm with the producer
+  MediaFileType file_type;  // WAV, MP3, FLAC or NONE
+};
+
+
 class MediaPlayer;
 
 class MediaPlayerTraits {
@@ -50,6 +64,7 @@ class MediaPlayerCall {
   MediaPlayerCall &set_command(const std::string &command);
 
   MediaPlayerCall &set_media_url(const std::string &url);
+  MediaPlayerCall &set_local_media_file(MediaFile *media_file);
 
   MediaPlayerCall &set_volume(float volume);
   MediaPlayerCall &set_announcement(bool announce);
@@ -60,6 +75,7 @@ class MediaPlayerCall {
   const optional<std::string> &get_media_url() const { return media_url_; }
   const optional<float> &get_volume() const { return volume_; }
   const optional<bool> &get_announcement() const { return announcement_; }
+  const optional<MediaFile*> &get_local_media_file() const { return media_file_; }
 
  protected:
   void validate_();
@@ -68,6 +84,7 @@ class MediaPlayerCall {
   optional<std::string> media_url_;
   optional<float> volume_;
   optional<bool> announcement_;
+  optional<MediaFile*> media_file_;
 };
 
 class MediaPlayer : public EntityBase {

diff --git a/esphome/components/micro_wake_word/micro_wake_word.cpp b/esphome/components/micro_wake_word/micro_wake_word.cpp
@@ -30,7 +30,7 @@ static const char *const TAG = "micro_wake_word";
 static const size_t SAMPLE_RATE_HZ = 16000;  // 16 kHz
 static const size_t BUFFER_LENGTH = 64;      // 0.064 seconds
 static const size_t BUFFER_SIZE = SAMPLE_RATE_HZ / 1000 * BUFFER_LENGTH;
-static const size_t INPUT_BUFFER_SIZE = 16 * SAMPLE_RATE_HZ / 1000;  // 16ms * 16kHz / 1000ms
+static const size_t INPUT_BUFFER_SIZE = 32 * SAMPLE_RATE_HZ / 1000;  // 32ms * 16kHz / 1000ms
 
 float MicroWakeWord::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; }
 
@@ -113,7 +113,7 @@ void MicroWakeWord::loop() {
       ESP_LOGD(TAG, "Starting Microphone");
       this->microphone_->start();
       this->set_state_(State::STARTING_MICROPHONE);
-      this->high_freq_.start();
+      // this->high_freq_.start();
       break;
     case State::STARTING_MICROPHONE:
       if (this->microphone_->is_running()) {
@@ -135,7 +135,7 @@ void MicroWakeWord::loop() {
       ESP_LOGD(TAG, "Stopping Microphone");
       // this->microphone_->stop();
       this->set_state_(State::STOPPING_MICROPHONE);
-      this->high_freq_.stop();
+      // this->high_freq_.stop();
       this->unload_models_();
       this->deallocate_buffers_();
       break;
@@ -299,31 +299,33 @@ void MicroWakeWord::unload_models_() {
 void MicroWakeWord::update_model_probabilities_() {
   int8_t audio_features[PREPROCESSOR_FEATURE_SIZE];
 
-  if (!this->generate_features_for_window_(audio_features)) {
-    return;
-  }
+  // if (!this->generate_features_for_window_(audio_features)) {
+  //   return;
+  // }
 
-  // Increase the counter since the last positive detection
-  this->ignore_windows_ = std::min(this->ignore_windows_ + 1, 0);
+  while (this->generate_features_for_window_(audio_features)) {  // keep going until no further feature window can be generated, instead of handling at most one per call
+    // Step the counter since the last positive detection toward zero; std::min keeps it from rising above 0
+    this->ignore_windows_ = std::min(this->ignore_windows_ + 1, 0);
 
-  // static size_t total_inference_time = 0;
-  // static size_t inference_count = 0;
+    // static size_t total_inference_time = 0;
+    // static size_t inference_count = 0;
 
-  // size_t start_time = millis();
-  for (auto &model : this->wake_word_models_) {
-    // Perform inference
-    model.perform_streaming_inference(audio_features);
-  }
+    // size_t start_time = millis();
+    for (auto &model : this->wake_word_models_) {
+      // Perform inference
+      model.perform_streaming_inference(audio_features);
+    }
 #ifdef USE_MICRO_WAKE_WORD_VAD
-  this->vad_model_->perform_streaming_inference(audio_features);
+    this->vad_model_->perform_streaming_inference(audio_features);
 #endif
-  // total_inference_time += (millis() - start_time);
-  // ++inference_count;
-  // if (inference_count > 500) {
-  //   ESP_LOGD(TAG, "average inference time=%.3f ms", static_cast<float>(total_inference_time) / inference_count);
-  //   total_inference_time = 0;
-  //   inference_count = 0;
-  // }
+    // total_inference_time += (millis() - start_time);
+    // ++inference_count;
+    // if (inference_count > 500) {
+    //   ESP_LOGD(TAG, "average inference time=%.3f ms", static_cast<float>(total_inference_time) / inference_count);
+    //   total_inference_time = 0;
+    //   inference_count = 0;
+    // }
+  }
 }
 
 bool MicroWakeWord::detect_wake_words_() {

diff --git a/esphome/components/nabu/biquad.c b/esphome/components/nabu/biquad.c
@@ -0,0 +1,93 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** BIQUAD ****                             //
+//                     Simple Biquad Filter Library                       //
+//                Copyright (c) 2021 - 2022 David Bryant.                 //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// biquad.c
+
+#include "biquad.h"
+
+// Second-order lowpass: writes the a0, a1, a2, b1 and b2 coefficients into *filter, with Q fixed at sqrt(0.5).
+// "frequency" is used as tan(M_PI * frequency), so it is presumably cutoff / sample rate (below 0.5) - TODO confirm.
+void biquad_lowpass (BiquadCoefficients *filter, double frequency)
+{
+    double Q = sqrt (0.5), K = tan (M_PI * frequency);
+    double norm = 1.0 / (1.0 + K / Q + K * K);  // normalisation factor shared by all five coefficients
+
+    filter->a0 = K * K * norm;
+    filter->a1 = 2 * filter->a0;
+    filter->a2 = filter->a0;
+    filter->b1 = 2.0 * (K * K - 1.0) * norm;
+    filter->b2 = (1.0 - K / Q + K * K) * norm;
+}
+
+// Second-order highpass: writes the a0, a1, a2, b1 and b2 coefficients into *filter, with Q fixed at sqrt(0.5).
+// "frequency" is used as tan(M_PI * frequency), so it is presumably cutoff / sample rate (below 0.5) - TODO confirm.
+void biquad_highpass (BiquadCoefficients *filter, double frequency)
+{
+    double Q = sqrt (0.5), K = tan (M_PI * frequency);
+    double norm = 1.0 / (1.0 + K / Q + K * K);  // same normalisation expression as biquad_lowpass
+
+    filter->a0 = norm;
+    filter->a1 = -2.0 * norm;
+    filter->a2 = filter->a0;
+    filter->b1 = 2.0 * (K * K - 1.0) * norm;  // feedback terms b1/b2 use the same expressions as biquad_lowpass
+    filter->b2 = (1.0 - K / Q + K * K) * norm;
+}
+
+// Initialize the specified biquad filter with the given parameters. Note that the "gain" parameter is supplied here
+// to save a multiply every time the filter is applied.
+// The coefficients are copied into *f, gain is folded into a0/a1/a2 only, and the input/output delay state is zeroed.
+void biquad_init (Biquad *f, const BiquadCoefficients *coeffs, float gain)
+{
+    f->coeffs = *coeffs;
+    f->coeffs.a0 *= gain;
+    f->coeffs.a1 *= gain;
+    f->coeffs.a2 *= gain;
+    f->in_d1 = f->in_d2 = 0.0F;
+    f->out_d1 = f->out_d2 = 0.0F;
+    f->first_order = (coeffs->a2 == 0.0F && coeffs->b2 == 0.0F);  // tested on the caller's unscaled coefficients
+}
+
+// Apply the supplied sample to the specified biquad filter, which must have been initialized with biquad_init().
+// Returns the filtered value and shifts both the input and the output into the filter's delay state.
+float biquad_apply_sample (Biquad *f, float input)
+{
+    float sum;
+
+    if (f->first_order)  // a2 and b2 are zero, so the in_d2/out_d2 terms are skipped
+        sum = (input * f->coeffs.a0) + (f->in_d1 * f->coeffs.a1) - (f->coeffs.b1 * f->out_d1);
+    else
+        sum = (input * f->coeffs.a0) + (f->in_d1 * f->coeffs.a1) + (f->in_d2 * f->coeffs.a2) - (f->coeffs.b1 * f->out_d1) - (f->coeffs.b2 * f->out_d2);
+
+    f->out_d2 = f->out_d1;
+    f->out_d1 = sum;
+    f->in_d2 = f->in_d1;
+    f->in_d1 = input;
+    return sum;
+}
+
+// Apply the supplied buffer to the specified biquad filter, which must have been initialized with biquad_init().
+// Filters num_samples values in place, advancing by "stride" floats after each one. num_samples must be >= 0: the loops count it down and stop only at zero.
+void biquad_apply_buffer (Biquad *f, float *buffer, int num_samples, int stride)
+{
+    if (f->first_order) while (num_samples--) {  // first-order path: no in_d2/out_d2 terms in the sum
+        float sum = (*buffer * f->coeffs.a0) + (f->in_d1 * f->coeffs.a1) - (f->coeffs.b1 * f->out_d1);
+        f->out_d2 = f->out_d1;
+        f->in_d2 = f->in_d1;
+        f->in_d1 = *buffer;
+        *buffer = f->out_d1 = sum;  // overwrite the input sample with the filtered output
+        buffer += stride;
+    }
+    else while (num_samples--) {
+        float sum = (*buffer * f->coeffs.a0) + (f->in_d1 * f->coeffs.a1) + (f->in_d2 * f->coeffs.a2) - (f->coeffs.b1 * f->out_d1) - (f->coeffs.b2 * f->out_d2);
+        f->out_d2 = f->out_d1;
+        f->in_d2 = f->in_d1;
+        f->in_d1 = *buffer;
+        *buffer = f->out_d1 = sum;  // overwrite the input sample with the filtered output
+        buffer += stride;
+    }
+}
diff --git a/esphome/components/nabu/biquad.h b/esphome/components/nabu/biquad.h
@@ -0,0 +1,41 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** BIQUAD ****                             //
+//                     Simple Biquad Filter Library                       //
+//                Copyright (c) 2021 - 2022 David Bryant.                 //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// biquad.h
+// NOTE(review): no include guard or #pragma once is present; including this header twice in one translation unit would redefine the typedefs below.
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <math.h>
+// Coefficients of one biquad section, stored as float: a0..a2 weight the current and delayed inputs, b1/b2 weight the delayed outputs.
+typedef struct {
+    float a0, a1, a2, b1, b2;
+} BiquadCoefficients;
+// Runtime state of one filter instance: its coefficients plus the two most recent inputs and outputs.
+typedef struct {
+    BiquadCoefficients coeffs;  // coefficients (biquad_init folds the gain into a0/a1/a2)
+    float in_d1, in_d2;	        // previous two input samples (in_d1 is the most recent)
+    float out_d1, out_d2;	// previous two output samples (out_d1 is the most recent)
+    int first_order;            // non-zero when a2 and b2 are both zero, so the apply functions skip those terms
+} Biquad;
+// Public API. The declarations get C linkage when this header is compiled as C++.
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Copies coeffs into f, scales a0/a1/a2 by gain and clears the delay state.
+void biquad_init (Biquad *f, const BiquadCoefficients *coeffs, float gain);
+// Coefficient generators: fill *filter for a second-order lowpass / highpass at the given frequency.
+void biquad_lowpass (BiquadCoefficients *filter, double frequency);
+void biquad_highpass (BiquadCoefficients *filter, double frequency);
+// Filtering: in place over a strided buffer, or one sample at a time (returns the filtered sample).
+void biquad_apply_buffer (Biquad *f, float *buffer, int num_samples, int stride);
+float biquad_apply_sample (Biquad *f, float input);
+
+#ifdef __cplusplus
+}
+#endif