| // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "cobalt/speech/endpointer/energy_endpointer.h" |
| |
| #include <math.h> |
| #include <stddef.h> |
| |
| #include "base/logging.h" |
| |
| namespace { |
| |
| // Returns the RMS (quadratic mean) of the input signal. |
| float RMS(const int16_t* samples, int num_samples) { |
| int64_t ssq_int64 = 0; |
| int64_t sum_int64 = 0; |
| for (int i = 0; i < num_samples; ++i) { |
| sum_int64 += samples[i]; |
| ssq_int64 += samples[i] * samples[i]; |
| } |
| // now convert to floats. |
| double sum = static_cast<double>(sum_int64); |
| sum /= num_samples; |
| double ssq = static_cast<double>(ssq_int64); |
| return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum))); |
| } |
| |
| int64_t Secs2Usecs(float seconds) { |
| return static_cast<int64_t>(0.5 + (1.0e6 * seconds)); |
| } |
| |
| float GetDecibel(float value) { |
| if (value > 1.0e-100) |
| return static_cast<float>(20 * log10(value)); |
| return -2000.0; |
| } |
| |
| } // namespace |
| |
| namespace cobalt { |
| namespace speech { |
| |
| // Stores threshold-crossing histories for making decisions about the speech |
| // state. |
| class EnergyEndpointer::HistoryRing { |
| public: |
| HistoryRing() : insertion_index_(0) {} |
| |
| // Resets the ring to |size| elements each with state |initial_state| |
| void SetRing(int size, bool initial_state); |
| |
| // Inserts a new entry into the ring and drops the oldest entry. |
| void Insert(int64_t time_us, bool decision); |
| |
| // Returns the time in microseconds of the most recently added entry. |
| int64_t EndTime() const; |
| |
| // Returns the sum of all intervals during which 'decision' is true within |
| // the time in seconds specified by 'duration'. The returned interval is |
| // in seconds. |
| float RingSum(float duration_sec); |
| |
| private: |
| struct DecisionPoint { |
| int64_t time_us; |
| bool decision; |
| }; |
| |
| std::vector<DecisionPoint> decision_points_; |
| int insertion_index_; // Index at which the next item gets added/inserted. |
| |
| DISALLOW_COPY_AND_ASSIGN(HistoryRing); |
| }; |
| |
| void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { |
| insertion_index_ = 0; |
| decision_points_.clear(); |
| DecisionPoint init = { -1, initial_state }; |
| decision_points_.resize(static_cast<size_t>(size), init); |
| } |
| |
| void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) { |
| decision_points_[static_cast<size_t>(insertion_index_)].time_us = time_us; |
| decision_points_[static_cast<size_t>(insertion_index_)].decision = decision; |
| insertion_index_ = |
| static_cast<int>((insertion_index_ + 1) % decision_points_.size()); |
| } |
| |
| int64_t EnergyEndpointer::HistoryRing::EndTime() const { |
| int ind = insertion_index_ - 1; |
| if (ind < 0) |
| ind = static_cast<int>(decision_points_.size() - 1); |
| return decision_points_[static_cast<size_t>(ind)].time_us; |
| } |
| |
| float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { |
| if (decision_points_.empty()) |
| return 0.0; |
| |
| int64_t sum_us = 0; |
| int ind = insertion_index_ - 1; |
| if (ind < 0) |
| ind = static_cast<int>(decision_points_.size() - 1); |
| int64_t end_us = decision_points_[static_cast<size_t>(ind)].time_us; |
| bool is_on = decision_points_[static_cast<size_t>(ind)].decision; |
| int64_t start_us = |
| end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec)); |
| if (start_us < 0) |
| start_us = 0; |
| size_t n_summed = 1; // n points ==> (n-1) intervals |
| while ((decision_points_[static_cast<size_t>(ind)].time_us > start_us) && |
| (n_summed < decision_points_.size())) { |
| --ind; |
| if (ind < 0) |
| ind = static_cast<int>(decision_points_.size() - 1); |
| if (is_on) |
| sum_us += end_us - decision_points_[static_cast<size_t>(ind)].time_us; |
| is_on = decision_points_[static_cast<size_t>(ind)].decision; |
| end_us = decision_points_[static_cast<size_t>(ind)].time_us; |
| n_summed++; |
| } |
| |
| return 1.0e-6f * sum_us; // Returns total time that was super threshold. |
| } |
| |
| EnergyEndpointer::EnergyEndpointer() |
| : status_(EP_PRE_SPEECH), |
| offset_confirm_dur_sec_(0), |
| endpointer_time_us_(0), |
| fast_update_frames_(0), |
| frame_counter_(0), |
| max_window_dur_(4.0), |
| sample_rate_(0), |
| history_(new HistoryRing()), |
| decision_threshold_(0), |
| estimating_environment_(false), |
| noise_level_(0), |
| rms_adapt_(0), |
| start_lag_(0), |
| end_lag_(0), |
| user_input_start_time_us_(0) { |
| } |
| |
| EnergyEndpointer::~EnergyEndpointer() { |
| } |
| |
| int EnergyEndpointer::TimeToFrame(float time) const { |
| return static_cast<int32_t>(0.5 + (time / params_.frame_period())); |
| } |
| |
| void EnergyEndpointer::Restart(bool reset_threshold) { |
| status_ = EP_PRE_SPEECH; |
| user_input_start_time_us_ = 0; |
| |
| if (reset_threshold) { |
| decision_threshold_ = params_.decision_threshold(); |
| rms_adapt_ = decision_threshold_; |
| noise_level_ = params_.decision_threshold() / 2.0f; |
| frame_counter_ = 0; // Used for rapid initial update of levels. |
| } |
| |
| // Set up the memories to hold the history windows. |
| history_->SetRing(TimeToFrame(max_window_dur_), false); |
| |
| // Flag that indicates that current input should be used for |
| // estimating the environment. The user has not yet started input |
| // by e.g. pressed the push-to-talk button. By default, this is |
| // false for backward compatibility. |
| estimating_environment_ = false; |
| } |
| |
| void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { |
| params_ = params; |
| |
| // Find the longest history interval to be used, and make the ring |
| // large enough to accommodate that number of frames. NOTE: This |
| // depends upon ep_frame_period being set correctly in the factory |
| // that did this instantiation. |
| max_window_dur_ = params_.onset_window(); |
| if (params_.speech_on_window() > max_window_dur_) |
| max_window_dur_ = params_.speech_on_window(); |
| if (params_.offset_window() > max_window_dur_) |
| max_window_dur_ = params_.offset_window(); |
| Restart(true); |
| |
| offset_confirm_dur_sec_ = params_.offset_window() - |
| params_.offset_confirm_dur(); |
| if (offset_confirm_dur_sec_ < 0.0) |
| offset_confirm_dur_sec_ = 0.0; |
| |
| user_input_start_time_us_ = 0; |
| |
| // Flag that indicates that current input should be used for |
| // estimating the environment. The user has not yet started input |
| // by e.g. pressed the push-to-talk button. By default, this is |
| // false for backward compatibility. |
| estimating_environment_ = false; |
| // The initial value of the noise and speech levels is inconsequential. |
| // The level of the first frame will overwrite these values. |
| noise_level_ = params_.decision_threshold() / 2.0f; |
| fast_update_frames_ = |
| static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period()); |
| |
| frame_counter_ = 0; // Used for rapid initial update of levels. |
| |
| sample_rate_ = params_.sample_rate(); |
| start_lag_ = static_cast<int>(sample_rate_ / |
| params_.max_fundamental_frequency()); |
| end_lag_ = static_cast<int>(sample_rate_ / |
| params_.min_fundamental_frequency()); |
| } |
| |
| void EnergyEndpointer::StartSession() { |
| Restart(true); |
| } |
| |
| void EnergyEndpointer::EndSession() { |
| status_ = EP_POST_SPEECH; |
| } |
| |
| void EnergyEndpointer::SetEnvironmentEstimationMode() { |
| Restart(true); |
| estimating_environment_ = true; |
| } |
| |
| void EnergyEndpointer::SetUserInputMode() { |
| estimating_environment_ = false; |
| user_input_start_time_us_ = endpointer_time_us_; |
| } |
| |
| void EnergyEndpointer::ProcessAudioFrame(int64_t time_us, |
| const int16_t* samples, |
| int num_samples, |
| float* rms_out) { |
| endpointer_time_us_ = time_us; |
| float rms = RMS(samples, num_samples); |
| |
| // Check that this is user input audio vs. pre-input adaptation audio. |
| // Input audio starts when the user indicates start of input, by e.g. |
| // pressing push-to-talk. Audio received prior to that is used to update |
| // noise and speech level estimates. |
| if (!estimating_environment_) { |
| bool decision = false; |
| if ((endpointer_time_us_ - user_input_start_time_us_) < |
| Secs2Usecs(params_.contamination_rejection_period())) { |
| decision = false; |
| DVLOG(1) << "decision: forced to false, time: " << endpointer_time_us_; |
| } else { |
| decision = (rms > decision_threshold_); |
| } |
| |
| history_->Insert(endpointer_time_us_, decision); |
| |
| switch (status_) { |
| case EP_PRE_SPEECH: |
| if (history_->RingSum(params_.onset_window()) > |
| params_.onset_detect_dur()) { |
| status_ = EP_POSSIBLE_ONSET; |
| } |
| break; |
| |
| case EP_POSSIBLE_ONSET: { |
| float tsum = history_->RingSum(params_.onset_window()); |
| if (tsum > params_.onset_confirm_dur()) { |
| status_ = EP_SPEECH_PRESENT; |
| } else { // If signal is not maintained, drop back to pre-speech. |
| if (tsum <= params_.onset_detect_dur()) |
| status_ = EP_PRE_SPEECH; |
| } |
| break; |
| } |
| |
| case EP_SPEECH_PRESENT: { |
| // To induce hysteresis in the state residency, we allow a |
| // smaller residency time in the on_ring, than was required to |
| // enter the SPEECH_PERSENT state. |
| float on_time = history_->RingSum(params_.speech_on_window()); |
| if (on_time < params_.on_maintain_dur()) |
| status_ = EP_POSSIBLE_OFFSET; |
| break; |
| } |
| |
| case EP_POSSIBLE_OFFSET: |
| if (history_->RingSum(params_.offset_window()) <= |
| offset_confirm_dur_sec_) { |
| // Note that this offset time may be beyond the end |
| // of the input buffer in a real-time system. It will be up |
| // to the RecognizerSession to decide what to do. |
| status_ = EP_PRE_SPEECH; // Automatically reset for next utterance. |
| } else { // If speech picks up again we allow return to SPEECH_PRESENT. |
| if (history_->RingSum(params_.speech_on_window()) >= |
| params_.on_maintain_dur()) |
| status_ = EP_SPEECH_PRESENT; |
| } |
| break; |
| |
| case EP_POST_SPEECH: |
| // fall-through |
| default: |
| LOG(WARNING) << "Invalid case in switch: " << status_; |
| break; |
| } |
| |
| // If this is a quiet, non-speech region, slowly adapt the detection |
| // threshold to be about 6dB above the average RMS. |
| if ((!decision) && (status_ == EP_PRE_SPEECH)) { |
| decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms); |
| rms_adapt_ = decision_threshold_; |
| } else { |
| // If this is in a speech region, adapt the decision threshold to |
| // be about 10dB below the average RMS. If the noise level is high, |
| // the threshold is pushed up. |
| // Adaptation up to a higher level is 5 times faster than decay to |
| // a lower level. |
| if ((status_ == EP_SPEECH_PRESENT) && decision) { |
| if (rms_adapt_ > rms) { |
| rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms); |
| } else { |
| rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms); |
| } |
| float target_threshold = 0.3f * rms_adapt_ + noise_level_; |
| decision_threshold_ = (.90f * decision_threshold_) + |
| (0.10f * target_threshold); |
| } |
| } |
| |
| // Set a floor |
| if (decision_threshold_ < params_.min_decision_threshold()) |
| decision_threshold_ = params_.min_decision_threshold(); |
| } |
| |
| // Update speech and noise levels. |
| UpdateLevels(rms); |
| ++frame_counter_; |
| |
| if (rms_out) |
| *rms_out = GetDecibel(rms); |
| } |
| |
| float EnergyEndpointer::GetNoiseLevelDb() const { |
| return GetDecibel(noise_level_); |
| } |
| |
| void EnergyEndpointer::UpdateLevels(float rms) { |
| // Update quickly initially. We assume this is noise and that |
| // speech is 6dB above the noise. |
| if (frame_counter_ < fast_update_frames_) { |
| // Alpha increases from 0 to (k-1)/k where k is the number of time |
| // steps in the initial adaptation period. |
| float alpha = static_cast<float>(frame_counter_) / |
| static_cast<float>(fast_update_frames_); |
| noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); |
| DVLOG(1) << "FAST UPDATE, frame_counter_ " << frame_counter_ |
| << ", fast_update_frames_ " << fast_update_frames_; |
| } else { |
| // Update Noise level. The noise level adapts quickly downward, but |
| // slowly upward. The noise_level_ parameter is not currently used |
| // for threshold adaptation. It is used for UI feedback. |
| if (noise_level_ < rms) |
| noise_level_ = (0.999f * noise_level_) + (0.001f * rms); |
| else |
| noise_level_ = (0.95f * noise_level_) + (0.05f * rms); |
| } |
| if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) { |
| decision_threshold_ = noise_level_ * 2; // 6dB above noise level. |
| // Set a floor |
| if (decision_threshold_ < params_.min_decision_threshold()) |
| decision_threshold_ = params_.min_decision_threshold(); |
| } |
| } |
| |
| EpStatus EnergyEndpointer::Status(int64_t* status_time) const { |
| *status_time = history_->EndTime(); |
| return status_; |
| } |
| |
| } // namespace speech |
| } // namespace cobalt |