blob: e510b9765fca41513a65bff8d3a9bec72b043c84 [file] [log] [blame]
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
module media.mojom;
import "media/mojo/mojom/audio_parameters.mojom";
import "media/mojo/mojom/audio_stream_factory.mojom";
import "media/mojo/mojom/media_types.mojom";
import "mojo/public/mojom/base/file_path.mojom";
import "mojo/public/mojom/base/time.mojom";
import "sandbox/policy/mojom/sandbox.mojom";
import "services/network/public/mojom/url_loader_factory.mojom";
// Confidence level attached to a language identification result; this is the
// type of LanguageIdentificationEvent.confidence_level.
// Corresponds to the LangIdEvent.ConfidenceInterval defined in
// http://google3/speech/soda/public/soda_event.proto.
// NOTE(review): no enumerators or closing brace are visible for this enum
// here -- confirm against the upstream definition.
enum ConfidenceLevel {
// The main interface a client uses to interact with a speech recognition
// service process. In Live Caption, every renderer can own one or more
// Remote<SpeechRecognitionContext>, with the receiver bound through the
// BrowserInterfaceBroker. In Chrome OS features like Dictation and Projector,
// every OnDeviceSpeechRecognizer can own a Remote<SpeechRecognitionContext>.
interface SpeechRecognitionContext {
// Binds a recognizer to the speech recognition service.
//   |receiver|: the SpeechRecognitionRecognizer endpoint to bind.
//   |client|:   receives the recognition events produced by the service.
//   |options|:  the options for speech recognition.
// Returns a flag indicating whether multichannel audio is supported by the
// speech recognition service.
BindRecognizer(pending_receiver<SpeechRecognitionRecognizer> receiver,
pending_remote<SpeechRecognitionRecognizerClient> client,
SpeechRecognitionOptions options)
=> (bool is_multichannel_supported);
// Prepares microphone audio to be captured from within the
// SpeechRecognitionService process, with results passed back to the
// SpeechRecognitionRecognizerClient. Returns a flag indicating whether
// multichannel audio is supported.
// NOTE(review): the line carrying the method name is missing before the
// parameter list below; presumably `BindAudioSourceFetcher(`, which the
// AudioSourceFetcher.Start comment refers to -- confirm against upstream.
pending_receiver<AudioSourceFetcher> fetcher_receiver,
pending_remote<SpeechRecognitionRecognizerClient> client,
SpeechRecognitionOptions options)
=> (bool is_multichannel_supported);
// The main interface to a speech recognition service process.
// Used by the browser to issue top-level control requests to the service,
// acquired during process launch.
interface SpeechRecognitionService {
// Binds |context| to a new instance of the speech recognition.
BindContext(pending_receiver<SpeechRecognitionContext> context);
// Sets the URL loader factory used to create network requests.
// NOTE(review): the method-name line is missing before the parameter below
// -- confirm against upstream.
pending_remote<network.mojom.URLLoaderFactory> url_loader_factory);
// Sets the file path to the Speech On-Device API (SODA) binary
// (|binary_path|) and the config file for the language pack
// (|config_path|).
SetSodaPath(mojo_base.mojom.FilePath binary_path,
mojo_base.mojom.FilePath config_path);
// Binds the speech recognition service client used by the speech
// recognition service to send messages back to the client.
// NOTE(review): the method-name line is missing before the parameter below
// -- confirm against upstream.
pending_remote<SpeechRecognitionServiceClient> client);
// The interface used to start and stop fetching audio from the microphone
// for speech recognition.
interface AudioSourceFetcher {
// Begins fetching audio. Results will be returned using the
// Remote<SpeechRecognitionRecognizerClient> which was passed in
// BindAudioSourceFetcher.
//   |factory|:          the AudioStreamFactory remote passed to the fetcher.
//   |device_id|:        identifier of the audio device to use.
//   |audio_parameters|: the audio parameters for the fetched audio.
Start(pending_remote<AudioStreamFactory> factory, string device_id,
media.mojom.AudioParameters audio_parameters);
// Stops audio fetching.
// NOTE(review): no declaration follows this comment here; the corresponding
// method line appears to be missing -- confirm against upstream.
// The interface used to send messages from the speech recognition service
// back to the consumer of the service.
interface SpeechRecognitionServiceClient {
// Executed when the network service crashes, prompting the client to
// reset the URL loader factory.
// NOTE(review): no declaration follows this comment here; the corresponding
// method line appears to be missing -- confirm against upstream.
// The interface used to pass raw audio from the renderer to the speech
// recognition service. The remote lives in the renderer process and the
// receiver lives in the speech recognition process.
interface SpeechRecognitionRecognizer {
// Sends |buffer|, a chunk of audio in AudioDataS16 form, to the speech
// recognition service. The speech recognition client will return the
// recognition events containing the transcribed audio back to the
// originating media.
// NOTE(review): the original comment said this call initializes the speech
// recognition instance; the signature only shows an audio buffer being
// passed -- confirm whether initialization happens here.
SendAudioToSpeechRecognitionService(AudioDataS16 buffer);
// Notifies the speech recognition recognizer that the language changed.
// |language| is the locale string (e.g. "en-US").
OnLanguageChanged(string language);
// The interface used to return speech recognition events from the speech
// recognition service to the client that will display the results to the user.
// The remote lives in the speech recognition process and the receiver lives in
// the browser process.
interface SpeechRecognitionRecognizerClient {
// Triggered by the speech recognition process on a speech recognition event;
// |result| carries the transcription for that event.
// Returns whether the result was received successfully. Speech recognition
// will halt if this returns false.
OnSpeechRecognitionRecognitionEvent(SpeechRecognitionResult result)
=> (bool success);
// Triggered by an error within the speech recognition service.
// NOTE(review): no declaration follows this comment here; the corresponding
// method line appears to be missing -- confirm against upstream.
// Triggered by the speech recognition process on a language identification
// event; |event| carries the identified language and its confidence level.
OnLanguageIdentificationEvent(LanguageIdentificationEvent event);
// The hypothesis parts that provide timing information for each word in
// recognized speech.
struct HypothesisParts {
// A section of the final transcription text. Either an entire word or single
// character (depending on the language) with adjacent punctuation. There will
// usually only be one value here. If formatting is enabled in the speech
// recognition, then the raw text will be included as the second element.
array<string> text;
// Time offset from the enclosing event's |audio_start_time| (see
// TimingInformation). We enforce the following invariant:
// 0 <= hypothesis_part_offset < |audio_end_time - audio_start_time|.
mojo_base.mojom.TimeDelta hypothesis_part_offset;
// The timing information for the transcript.
struct TimingInformation {
// Start time in audio time from the start of the SODA session.
// This time measures the amount of audio input into SODA.
mojo_base.mojom.TimeDelta audio_start_time;
// Elapsed processed audio from first frame after preamble.
mojo_base.mojom.TimeDelta audio_end_time;
// The timing information for each word/letter in the transcription.
// HypothesisPartsInResult was introduced in min version 1 in
// chromeos/services/machine_learning/public/mojom/soda.mojom. Therefore, it
// must be optional. When present, the array may still be empty (a zero-length
// vector) if no words were spoken during the event's time span.
array<HypothesisParts> ? hypothesis_parts;
// A speech recognition result created by the speech service and passed to the
// browser.
struct SpeechRecognitionResult {
// The transcription text for this result.
string transcription;
// A flag indicating whether the result is final. If true, the result is
// locked in and the next result returned will not overlap with the previous
// final result.
bool is_final;
// Timing information for the current transcription. Nullable;
// |timing_information| is expected to be valid if:
// 1. speech recognition is provided by |CrosSodaClient| and
// 2. |is_final| is true.
TimingInformation? timing_information;
// A language identification event created by the speech recognition service
// and passed to the browser and renderer.
struct LanguageIdentificationEvent {
// The locale of the language with the highest confidence.
string language;
// The confidence level (a ConfidenceLevel value) reported with this event.
ConfidenceLevel confidence_level;
// The interface used to notify the speech recognition client of events
// triggered by the browser. The remote lives in the browser process and the
// receiver lives in the renderer process.
interface SpeechRecognitionBrowserObserver {
// Notifies the speech recognition client when speech recognition
// availability changes; |is_speech_recognition_available| is the new
// availability state.
SpeechRecognitionAvailabilityChanged(bool is_speech_recognition_available);
// Notifies the speech recognition client when the speech recognition
// language changes; |language| is the new language.
SpeechRecognitionLanguageChanged(string language);
// The interface between the speech recognition client and the browser.
// The remote lives in the renderer process and the receiver lives in the
// browser process.
interface SpeechRecognitionClientBrowserInterface {
// Binds the speech recognition availability observer (|observer|).
// NOTE(review): the method-name line is missing before the parameter below
// -- confirm against upstream.
pending_remote<SpeechRecognitionBrowserObserver> observer);
// The kind of recognition to perform; this is the type of
// SpeechRecognitionOptions.recognition_mode.
// Corresponds to ExtendedSodaConfigMsg.RecognitionMode in
// chrome/services/speech/soda/proto/soda_api.proto and
// SodaRecognitionMode in
// chromeos/services/machine_learning/public/mojom/soda.mojom.
enum SpeechRecognitionMode {
// NOTE(review): the two comments below describe enumerators whose
// declaration lines are not visible here -- confirm against upstream.
// Intended for voice input for keyboard usage.
// Intended to caption a stream of audio.
// Options for speech recognition.
// TODO: Add option to include timing metrics in the result.
struct SpeechRecognitionOptions {
// What kind of recognition to use.
// In the case of web fallback (not for launch, used for development only),
// this option will be ignored.
SpeechRecognitionMode recognition_mode;
// Whether to enable formatting and punctuation in the recognition results.
bool enable_formatting;
// The BCP-47 localized language code to use (e.g. "en-US"). Nullable.
// TODO: Language needs to be required when multiple
// languages are supported by SODA, so that each SpeechRecognitionRecognizer
// can use its own language. Right now Language is only used by Projector
// and Dictation via OnDeviceSpeechRecognizer in Chrome OS.
string? language;