// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_

#include <stdint.h>

#include "cobalt/media/base/audio_bus.h"
#include "content/browser/speech/endpointer/energy_endpointer.h"
#include "content/common/content_export.h"

class EpStatus;

namespace content {

class AudioChunk;
// A simple interface to the underlying energy-endpointer implementation. This
// class lets callers feed in audio as it is being recorded and poll to find
// out when the user has stopped speaking.
//
// There are two events that may trigger the end of speech:
//
// speechInputPossiblyComplete event:
//
// Signals that silence/noise has been detected for a *short* amount of
// time after some speech has been detected. It can be used for low latency
// UI feedback. To disable it, set
// speech_input_possibly_complete_silence_length_ to a large value.
//
// speechInputComplete event:
//
// This event is intended to signal end of input and to stop recording.
// The amount of time to wait after speech is set by
// speech_input_complete_silence_length_ and optionally two other
// parameters (see below).
// This time can be held constant, or can change as more speech is detected.
// In the latter case, the time changes after a set amount of time from the
// *beginning* of speech. This is motivated by the expectation that there
// will be two distinct types of inputs: short search queries and longer
// dictation style input.
//
// Three parameters are used to define the piecewise constant timeout function.
// The timeout length is speech_input_complete_silence_length until the
// utterance has lasted long_speech_length (measured from the beginning of
// speech), at which point it changes to
// long_speech_input_complete_silence_length.
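//
// A minimal usage sketch (illustrative only; the NextCapturedAudio() helper
// and the exact driving loop are assumptions, not part of this API; depending
// on the build, ProcessAudio() takes an AudioBus or an AudioChunk):
//
//   Endpointer endpointer(16000 /* sample_rate */);
//   endpointer.StartSession();
//   endpointer.SetEnvironmentEstimationMode();
//   float rms = 0.0f;
//   // Feed some ambient audio so the background noise level can be estimated.
//   endpointer.ProcessAudio(NextCapturedAudio(), &rms);
//   // Switch to user input mode when the user indicates start of input.
//   endpointer.SetUserInputMode();
//   while (!endpointer.speech_input_complete()) {
//     endpointer.ProcessAudio(NextCapturedAudio(), &rms);
//   }
//   endpointer.EndSession();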
class CONTENT_EXPORT Endpointer {
public:
typedef cobalt::media::AudioBus AudioBus;
explicit Endpointer(int sample_rate);
// Start the endpointer. This should be called at the beginning of a session.
void StartSession();
// Stop the endpointer.
void EndSession();
// Start environment estimation. Audio fed in while in this mode is used to
// estimate the environment, i.e. the background noise level.
void SetEnvironmentEstimationMode();
// Start user input. This should be called when the user indicates start of
// input, e.g. by pressing a button.
void SetUserInputMode();
// Process a segment of audio, which may be more than one frame.
// The status of the last frame will be returned.
#if defined(STARBOARD)
EpStatus ProcessAudio(const AudioBus& audio_bus, float* rms_out);
#else
EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out);
#endif
// Get the status of the endpointer.
EpStatus Status(int64_t* time_us);
// Returns true if the endpointer has detected audio levels reasonably above
// the background noise that could be user speech, false if not.
bool DidStartReceivingSpeech() const {
return speech_previously_detected_;
}
bool IsEstimatingEnvironment() const {
return energy_endpointer_.estimating_environment();
}
void set_speech_input_complete_silence_length(int64_t time_us) {
speech_input_complete_silence_length_us_ = time_us;
}
void set_long_speech_input_complete_silence_length(int64_t time_us) {
long_speech_input_complete_silence_length_us_ = time_us;
}
void set_speech_input_possibly_complete_silence_length(int64_t time_us) {
speech_input_possibly_complete_silence_length_us_ = time_us;
}
void set_long_speech_length(int64_t time_us) {
long_speech_length_us_ = time_us;
}
bool speech_input_complete() const {
return speech_input_complete_;
}
#if defined(STARBOARD)
int sample_rate() const { return sample_rate_; }
#endif
// RMS background noise level in dB.
float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); }
private:
// Reset internal state. Helper method common to the initial input utterance
// and subsequent input utterances.
void Reset();
// Minimum allowable length of speech input.
int64_t speech_input_minimum_length_us_;
// The speechInputPossiblyComplete event signals that silence/noise has been
// detected for a *short* amount of time after some speech has been detected.
// This property specifies the time period.
int64_t speech_input_possibly_complete_silence_length_us_;
// The speechInputComplete event signals that silence/noise has been
// detected for a *long* amount of time after some speech has been detected.
// This property specifies the time period.
int64_t speech_input_complete_silence_length_us_;
// Same as above, this specifies the required silence period after speech
// detection. This period is used instead of
// speech_input_complete_silence_length_ when the utterance is longer than
// long_speech_length_. This parameter is optional.
int64_t long_speech_input_complete_silence_length_us_;
// The period of time after which the endpointer should consider
// long_speech_input_complete_silence_length_ as a valid silence period
// instead of speech_input_complete_silence_length_. This parameter is
// optional.
int64_t long_speech_length_us_;
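// For illustration only (hypothetical values, not defaults): with
// speech_input_complete_silence_length_us_ = 0.5 s, long_speech_length_us_ =
// 3 s and long_speech_input_complete_silence_length_us_ = 1 s, input ends
// after 0.5 s of silence during the first 3 s from speech onset, and only
// after 1 s of silence once more than 3 s have elapsed since speech onset.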
// First speech onset time, used in determination of speech complete timeout.
int64_t speech_start_time_us_;
// Most recent end time, used in determination of speech complete timeout.
int64_t speech_end_time_us_;
int64_t audio_frame_time_us_;
EpStatus old_ep_status_;
bool waiting_for_speech_possibly_complete_timeout_;
bool waiting_for_speech_complete_timeout_;
bool speech_previously_detected_;
bool speech_input_complete_;
EnergyEndpointer energy_endpointer_;
int sample_rate_;
int32_t frame_size_;
};
}  // namespace content

#endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_