src/cobalt/speech/endpointer/energy_endpointer.h - cobalt - Git at Google

 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 // The EnergyEndpointer class finds likely speech onset and offset points.
 //
 // The implementation described here is about the simplest possible.
 // It is based on timings of threshold crossings for overall signal
 // RMS. It is suitable for light weight applications.
 //
 // As written, the basic idea is that one specifies intervals that
 // must be occupied by super- and sub-threshold energy levels, and
 // defers decisions re onset and offset times until these
 // specifications have been met.  Three basic intervals are tested: an
 // onset window, a speech-on window, and an offset window.  We require
 // super-threshold to exceed some minimum total durations in the onset
 // and speech-on windows before declaring the speech onset time, and
 // we specify a required sub-threshold residency in the offset window
 // before declaring speech offset. As the various residency requirements are
 // met, the EnergyEndpointer instance assumes various states, and can return the
 // ID of these states to the client (see EpStatus below).
 //
 // The levels of the speech and background noise are continuously updated. It is
 // important that the background noise level be estimated initially for
 // robustness in noisy conditions. The first frames are assumed to be background
 // noise and a fast update rate is used for the noise level. The duration for
 // fast update is controlled by the fast_update_dur_ parameter.
 //
 // If used in noisy conditions, the endpointer should be started and run in the
 // EnvironmentEstimation mode, for at least 200ms, before switching to
 // UserInputMode.
 // Audio feedback contamination can appear in the input audio, if not cut
 // out or handled by echo cancellation. Audio feedback can trigger a false
 // accept. The false accepts can be ignored by setting
 // ep_contamination_rejection_period.

 #ifndef COBALT_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
 #define COBALT_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_

 #include <stdint.h>

 #include <memory>
 #include <vector>

 #include "base/memory/scoped_ptr.h"
 #include "cobalt/speech/endpointer/energy_endpointer_params.h"

 namespace cobalt {
 namespace speech {

 // Status codes reported by the endpointer (see EnergyEndpointer::Status()).
 // The numeric values are consecutive and start at 10; they are spelled out
 // explicitly so the value of each code can be read directly.
 enum EpStatus {
   EP_PRE_SPEECH = 10,
   EP_POSSIBLE_ONSET = 11,
   EP_SPEECH_PRESENT = 12,
   EP_POSSIBLE_OFFSET = 13,
   EP_POST_SPEECH = 14,
 };

 // Energy-based speech endpointer. Per the description at the top of this file,
 // it finds likely speech onset and offset points from threshold crossings of
 // the overall signal RMS and reports its progress as an EpStatus value.
 //
 // Apart from the inline estimating_environment() accessor, only declarations
 // appear here; the member functions and the nested HistoryRing class are
 // defined elsewhere.
 class EnergyEndpointer {
  public:
   // The default construction MUST be followed by Init(), before any
   // other use can be made of the instance.
   EnergyEndpointer();
   virtual ~EnergyEndpointer();

   // Initializes the endpointer from |params|. Required after construction
   // and before any other use (see the constructor comment above).
   void Init(const EnergyEndpointerParams& params);

   // Start the endpointer. This should be called at the beginning of a session.
   void StartSession();

   // Stop the endpointer.
   void EndSession();

   // Start environment estimation. Audio will be used for environment estimation
   // i.e. noise level estimation.
   void SetEnvironmentEstimationMode();

   // Start user input. This should be called when the user indicates start of
   // input, e.g. by pressing a button.
   void SetUserInputMode();

   // Computes the next input frame and modifies EnergyEndpointer status as
   // appropriate based on the computation.
   //   time_us     - time of this frame; presumably microseconds, going by the
   //                 _us suffix.
   //   samples     - pointer to the frame's 16-bit audio samples.
   //   num_samples - presumably the number of entries readable from |samples|.
   //   rms_out     - output parameter; presumably receives the frame RMS.
   //                 NOTE(review): whether NULL is accepted is not visible
   //                 here -- confirm against the implementation.
   void ProcessAudioFrame(int64_t time_us,
                          const int16_t* samples,
                          int num_samples,
                          float* rms_out);

   // Returns the current state of the EnergyEndpointer and the time
   // corresponding to the most recently computed frame. The time is delivered
   // through |status_time_us| (presumably microseconds, per the _us suffix).
   EpStatus Status(int64_t* status_time_us) const;

   // Accessor for the estimating_environment_ flag.
   bool estimating_environment() const {
     return estimating_environment_;
   }

   // Returns estimated noise level in dB.
   float GetNoiseLevelDb() const;

  private:
   // Forward declaration only; the type backs history_ below and is defined
   // outside this header.
   class HistoryRing;

   // Resets the endpointer internal state.  If reset_threshold is true, the
   // state will be reset completely, including adaptive thresholds and the
   // removal of all history information.
   void Restart(bool reset_threshold);

   // Update internal speech and noise levels.
   //   rms - presumably the RMS of the frame being processed; confirm against
   //         the implementation.
   void UpdateLevels(float rms);

   // Returns the number of frames (or frame number) corresponding to
   // the 'time' (in seconds).
   int TimeToFrame(float time) const;

   EpStatus status_;  // The current state of this instance.
   float offset_confirm_dur_sec_;  // max on time allowed to confirm POST_SPEECH
   int64_t
       endpointer_time_us_;  // Time of the most recently received audio frame.
   int64_t
       fast_update_frames_;  // Number of frames for initial level adaptation.
   int64_t
       frame_counter_;     // Number of frames seen. Used for initial adaptation.
   float max_window_dur_;  // Largest search window size (seconds)
   float sample_rate_;  // Sampling rate.

   // Ring buffers to hold the speech activity history.
   scoped_ptr<HistoryRing> history_;

   // Configuration parameters.
   EnergyEndpointerParams params_;

   // RMS which must be exceeded to conclude frame is speech.
   float decision_threshold_;

   // Flag to indicate that audio should be used to estimate environment, prior
   // to receiving user input.
   bool estimating_environment_;

   // Estimate of the background noise level. Used externally for UI feedback.
   float noise_level_;

   // An adaptive threshold used to update decision_threshold_ when appropriate.
   float rms_adapt_;

   // Start lag corresponds to the highest fundamental frequency.
   int start_lag_;

   // End lag corresponds to the lowest fundamental frequency.
   int end_lag_;

   // Time when mode switched from environment estimation to user input. This
   // is used to time forced rejection of audio feedback contamination.
   int64_t user_input_start_time_us_;

   // Presumably suppresses the copy constructor and assignment operator; the
   // macro is defined outside this file.
   DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer);
 };

 }  // namespace speech
 }  // namespace cobalt

 #endif  // COBALT_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
	// Copyright (c) 2012 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	// The EnergyEndpointer class finds likely speech onset and offset points.
	//
	// The implementation described here is about the simplest possible.
	// It is based on timings of threshold crossings for overall signal
	// RMS. It is suitable for light weight applications.
	//
	// As written, the basic idea is that one specifies intervals that
	// must be occupied by super- and sub-threshold energy levels, and
	// defers decisions re onset and offset times until these
	// specifications have been met. Three basic intervals are tested: an
	// onset window, a speech-on window, and an offset window. We require
	// super-threshold to exceed some minimum total durations in the onset
	// and speech-on windows before declaring the speech onset time, and
	// we specify a required sub-threshold residency in the offset window
	// before declaring speech offset. As the various residency requirements are
	// met, the EnergyEndpointer instance assumes various states, and can return the
	// ID of these states to the client (see EpStatus below).
	//
	// The levels of the speech and background noise are continuously updated. It is
	// important that the background noise level be estimated initially for
	// robustness in noisy conditions. The first frames are assumed to be background
	// noise and a fast update rate is used for the noise level. The duration for
	// fast update is controlled by the fast_update_dur_ parameter.
	//
	// If used in noisy conditions, the endpointer should be started and run in the
	// EnvironmentEstimation mode, for at least 200ms, before switching to
	// UserInputMode.
	// Audio feedback contamination can appear in the input audio, if not cut
	// out or handled by echo cancellation. Audio feedback can trigger a false
	// accept. The false accepts can be ignored by setting
	// ep_contamination_rejection_period.

	#ifndef COBALT_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
	#define COBALT_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_

	#include <stdint.h>

	#include <memory>
	#include <vector>

	#include "base/memory/scoped_ptr.h"
	#include "cobalt/speech/endpointer/energy_endpointer_params.h"

	namespace cobalt {
	namespace speech {

	// Status codes reported by the endpointer (see EnergyEndpointer::Status()).
	// The numeric values are consecutive and start at 10; they are spelled out
	// explicitly so the value of each code can be read directly.
	enum EpStatus {
	  EP_PRE_SPEECH = 10,
	  EP_POSSIBLE_ONSET = 11,
	  EP_SPEECH_PRESENT = 12,
	  EP_POSSIBLE_OFFSET = 13,
	  EP_POST_SPEECH = 14,
	};

	// Energy-based speech endpointer. Per the description preceding this
	// declaration, it finds likely speech onset and offset points from threshold
	// crossings of the overall signal RMS and reports its progress as an EpStatus
	// value.
	//
	// Apart from the inline estimating_environment() accessor, only declarations
	// appear here; the member functions and the nested HistoryRing class are
	// defined elsewhere.
	class EnergyEndpointer {
	public:
	// The default construction MUST be followed by Init(), before any
	// other use can be made of the instance.
	EnergyEndpointer();
	virtual ~EnergyEndpointer();

	// Initializes the endpointer from |params|. Required after construction
	// and before any other use (see the constructor comment above).
	void Init(const EnergyEndpointerParams& params);

	// Start the endpointer. This should be called at the beginning of a session.
	void StartSession();

	// Stop the endpointer.
	void EndSession();

	// Start environment estimation. Audio will be used for environment estimation
	// i.e. noise level estimation.
	void SetEnvironmentEstimationMode();

	// Start user input. This should be called when the user indicates start of
	// input, e.g. by pressing a button.
	void SetUserInputMode();

	// Computes the next input frame and modifies EnergyEndpointer status as
	// appropriate based on the computation.
	//   time_us     - time of this frame; presumably microseconds, going by the
	//                 _us suffix.
	//   samples     - pointer to the frame's 16-bit audio samples.
	//   num_samples - presumably the number of entries readable from |samples|.
	//   rms_out     - output parameter; presumably receives the frame RMS.
	//                 NOTE(review): whether NULL is accepted is not visible
	//                 here -- confirm against the implementation.
	void ProcessAudioFrame(int64_t time_us,
	const int16_t* samples,
	int num_samples,
	float* rms_out);

	// Returns the current state of the EnergyEndpointer and the time
	// corresponding to the most recently computed frame. The time is delivered
	// through |status_time_us| (presumably microseconds, per the _us suffix).
	EpStatus Status(int64_t* status_time_us) const;

	// Accessor for the estimating_environment_ flag.
	bool estimating_environment() const {
	return estimating_environment_;
	}

	// Returns estimated noise level in dB.
	float GetNoiseLevelDb() const;

	private:
	// Forward declaration only; the type backs history_ below and is defined
	// outside this header.
	class HistoryRing;

	// Resets the endpointer internal state. If reset_threshold is true, the
	// state will be reset completely, including adaptive thresholds and the
	// removal of all history information.
	void Restart(bool reset_threshold);

	// Update internal speech and noise levels.
	//   rms - presumably the RMS of the frame being processed; confirm against
	//         the implementation.
	void UpdateLevels(float rms);

	// Returns the number of frames (or frame number) corresponding to
	// the 'time' (in seconds).
	int TimeToFrame(float time) const;

	EpStatus status_; // The current state of this instance.
	float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH
	int64_t
	endpointer_time_us_; // Time of the most recently received audio frame.
	int64_t
	fast_update_frames_; // Number of frames for initial level adaptation.
	int64_t
	frame_counter_; // Number of frames seen. Used for initial adaptation.
	float max_window_dur_; // Largest search window size (seconds)
	float sample_rate_; // Sampling rate.

	// Ring buffers to hold the speech activity history.
	scoped_ptr<HistoryRing> history_;

	// Configuration parameters.
	EnergyEndpointerParams params_;

	// RMS which must be exceeded to conclude frame is speech.
	float decision_threshold_;

	// Flag to indicate that audio should be used to estimate environment, prior
	// to receiving user input.
	bool estimating_environment_;

	// Estimate of the background noise level. Used externally for UI feedback.
	float noise_level_;

	// An adaptive threshold used to update decision_threshold_ when appropriate.
	float rms_adapt_;

	// Start lag corresponds to the highest fundamental frequency.
	int start_lag_;

	// End lag corresponds to the lowest fundamental frequency.
	int end_lag_;

	// Time when mode switched from environment estimation to user input. This
	// is used to time forced rejection of audio feedback contamination.
	int64_t user_input_start_time_us_;

	// Presumably suppresses the copy constructor and assignment operator; the
	// macro is defined outside this file.
	DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer);
	};

	} // namespace speech
	} // namespace cobalt

	#endif // COBALT_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_