// Source: media/mojo/mojom/speech_recognition_service.mojom (cobalt, Git at Google)

 // Copyright 2020 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 module media.mojom;

 import "media/mojo/mojom/audio_parameters.mojom";
 import "media/mojo/mojom/audio_stream_factory.mojom";
 import "media/mojo/mojom/media_types.mojom";
 import "mojo/public/mojom/base/file_path.mojom";
 import "mojo/public/mojom/base/time.mojom";
 import "sandbox/policy/mojom/sandbox.mojom";
 import "services/network/public/mojom/url_loader_factory.mojom";

 // Confidence attached to a language identification result (see
 // LanguageIdentificationEvent). Corresponds to the
 // LangIdEvent.ConfidenceInterval defined in
 // http://google3/speech/soda/public/soda_event.proto.
 //
 // NOTE(review): the values carry no explicit numbers, so they are numbered
 // implicitly in declaration order. Presumably this order must stay aligned
 // with the proto enum; confirm before reordering or inserting values.
 enum ConfidenceLevel {
   kUnknown,
   kNotConfident,
   kConfident,
   kHighlyConfident,
 };

 // The main interface a client uses to interact with a speech recognition
 // service process. In Live Caption, every renderer can own one or more
 // Remote<SpeechRecognitionContext>, with the receiver bound through the
 // BrowserInterfaceBroker. In Chrome OS features like Dictation and Projector,
 // every OnDeviceSpeechRecognizer can own a Remote<SpeechRecognitionContext>.
 interface SpeechRecognitionContext {
   // Binds a recognizer to the speech recognition service. |receiver| is the
   // SpeechRecognitionRecognizer endpoint to bind, |client| receives the
   // resulting recognition events, and |options| configures the recognition.
   // Replies with a flag indicating whether multichannel audio is supported by
   // the speech recognition service.
   BindRecognizer(pending_receiver<SpeechRecognitionRecognizer> receiver,
                  pending_remote<SpeechRecognitionRecognizerClient> client,
                  SpeechRecognitionOptions options)
                  => (bool is_multichannel_supported);

   // Prepares microphone audio to be captured from within the
   // SpeechRecognitionService process, with results passed back to the
   // SpeechRecognitionRecognizerClient given by |client|. |fetcher_receiver|
   // is the AudioSourceFetcher endpoint used to start and stop the capture.
   // Replies with |is_multichannel_supported|; presumably this has the same
   // meaning as in BindRecognizer -- TODO confirm.
   BindAudioSourceFetcher(
                  pending_receiver<AudioSourceFetcher> fetcher_receiver,
                  pending_remote<SpeechRecognitionRecognizerClient> client,
                  SpeechRecognitionOptions options)
                  => (bool is_multichannel_supported);
 };

 // The main interface to a speech recognition service process.
 // Used by the browser to issue top-level control requests to the service,
 // acquired during process launch. The interface is tagged with the
 // kSpeechRecognition sandbox type through the ServiceSandbox attribute.
 [ServiceSandbox=sandbox.mojom.Sandbox.kSpeechRecognition]
 interface SpeechRecognitionService {
   // Binds |context| to a new instance of the speech recognition context.
   BindContext(pending_receiver<SpeechRecognitionContext> context);

   // Sets the URL loader factory used to create network requests.
   SetUrlLoaderFactory(
       pending_remote<network.mojom.URLLoaderFactory> url_loader_factory);

   // Sets the file path to the Speech On-Device API (SODA) binary
   // (|binary_path|) and the config file for the language pack
   // (|config_path|).
   SetSodaPath(mojo_base.mojom.FilePath binary_path,
       mojo_base.mojom.FilePath config_path);

   // Binds the speech recognition service client used by the speech
   // recognition service to send messages back to the client.
   BindSpeechRecognitionServiceClient(
       pending_remote<SpeechRecognitionServiceClient> client);
 };

 // The interface used to start and stop fetching audio from the microphone
 // for speech recognition. It is bound through
 // SpeechRecognitionContext.BindAudioSourceFetcher.
 interface AudioSourceFetcher {
   // Begins fetching audio. Results will be returned using the
   // Remote<SpeechRecognitionRecognizerClient> which was passed in
   // BindAudioSourceFetcher.
   // NOTE(review): |factory| presumably creates the input stream for the
   // device named by |device_id|, opened with |audio_parameters| -- confirm
   // against the implementation.
   Start(pending_remote<AudioStreamFactory> factory, string device_id,
         media.mojom.AudioParameters audio_parameters);

   // Stops audio fetching.
   Stop();
 };

 // The interface used to send messages from the speech recognition service
 // back to the consumer of the service. The remote end is handed to the
 // service via SpeechRecognitionService.BindSpeechRecognitionServiceClient.
 interface SpeechRecognitionServiceClient {
   // Executed when the network service crashes, prompting the client to
   // reset the URL loader factory (see
   // SpeechRecognitionService.SetUrlLoaderFactory).
   OnNetworkServiceDisconnect();
 };

 // The interface used to pass raw audio from the renderer to the speech
 // recognition service. The remote lives in the renderer process and the
 // receiver lives in the speech recognition process.
 interface SpeechRecognitionRecognizer {
   // Sends one buffer of audio to the speech recognition service. The speech
   // recognition client will return the recognition events containing the
   // transcribed audio back to the originating media.
   // NOTE(review): AudioDataS16 presumably carries signed 16-bit samples;
   // confirm against media_types.mojom.
   SendAudioToSpeechRecognitionService(AudioDataS16 buffer);

   // Notifies the speech recognition recognizer that the language changed.
   // Takes in the locale string (e.g. "en-US").
   OnLanguageChanged(string language);
 };

 // The interface used to return speech recognition events from the speech
 // recognition service to the client that will display the results to the user.
 // The remote lives in the speech recognition process and the receiver lives in
 // the browser process.
 interface SpeechRecognitionRecognizerClient {
   // Triggered by the speech recognition process on a speech recognition
   // event carrying |result|. Replies with whether the result was received
   // successfully. Speech recognition will halt if the reply is false.
   OnSpeechRecognitionRecognitionEvent(SpeechRecognitionResult result)
       => (bool success);

   // Triggered by an error within the speech recognition service. No error
   // details are passed.
   OnSpeechRecognitionError();

   // Triggered by the speech recognition process on a language identification
   // event.
   OnLanguageIdentificationEvent(LanguageIdentificationEvent event);
 };

 // The hypothesis parts that provide timing information for each word in
 // recognized speech.
 struct HypothesisParts {
   // A section of the final transcription text. Either an entire word or single
   // character (depending on the language) with adjacent punctuation. There will
   // usually only be one value here. If formatting is enabled in the speech
   // recognition, then the raw text will be included as the second element.
   array<string> text;

   // Time offset from the |audio_start_time| of the enclosing
   // TimingInformation. We enforce the following invariant:
   // 0 <= hypothesis_part_offset < |audio_end_time - audio_start_time|.
   mojo_base.mojom.TimeDelta hypothesis_part_offset;
 };

 // The timing information for the transcript.
 struct TimingInformation {
   // Start time in audio time from the start of the SODA session.
   // This time measures the amount of audio input into SODA.
   mojo_base.mojom.TimeDelta audio_start_time;

   // Elapsed processed audio from first frame after preamble.
   mojo_base.mojom.TimeDelta audio_end_time;

   // The timing information for each word/letter in the transcription.
   // HypothesisPartsInResult was introduced in min version 1 in
   // chromeos/services/machine_learning/public/mojom/soda.mojom. Therefore, it
   // must be optional. The optional may be present yet hold a zero-length
   // array if no words were spoken during the event's time span.
   array<HypothesisParts> ? hypothesis_parts;
 };

 // A speech recognition result created by the speech service and passed to the
 // browser.
 struct SpeechRecognitionResult {
   // The transcribed text for this result.
   string transcription;

   // A flag indicating whether the result is final. If true, the result is
   // locked in and the next result returned will not overlap with the previous
   // final result.
   bool is_final;

   // Timing information for the current transcription. |timing_information| is
   // expected to be valid if:
   //   1. speech recognition is provided by |CrosSodaClient| and
   //   2. |is_final| is true.
   // The field is optional, so consumers must handle its absence.
   TimingInformation? timing_information;
 };

 // A language identification event created by the speech recognition service
 // and passed to the browser and renderer.
 struct LanguageIdentificationEvent {
   // The locale of the language with the highest confidence.
   string language;

   // The confidence level reported for |language|.
   ConfidenceLevel confidence_level;
 };

 // The interface used to notify the speech recognition client of events
 // triggered by the browser. The remote lives in the browser process and the
 // receiver lives in the renderer process.
 interface SpeechRecognitionBrowserObserver {
   // Notifies the speech recognition client when speech recognition
   // availability changes; |is_speech_recognition_available| is the new state.
   SpeechRecognitionAvailabilityChanged(bool is_speech_recognition_available);

   // Notifies the speech recognition client when the speech recognition
   // language changes.
   // NOTE(review): |language| is presumably a locale string such as "en-US",
   // as in SpeechRecognitionRecognizer.OnLanguageChanged -- confirm.
   SpeechRecognitionLanguageChanged(string language);
 };

 // The interface between the speech recognition client and the browser.
 // The remote lives in the renderer process and the receiver lives in the
 // browser process.
 interface SpeechRecognitionClientBrowserInterface {
   // Binds |observer|, through which the browser reports speech recognition
   // availability and language changes.
   BindSpeechRecognitionBrowserObserver(
     pending_remote<SpeechRecognitionBrowserObserver> observer);
 };

 // The kind of recognition to perform. Corresponds to
 // ExtendedSodaConfigMsg.RecognitionMode in
 // chrome/services/speech/soda/proto/soda_api.proto and
 // SodaRecognitionMode in
 // chromeos/services/machine_learning/public/mojom/soda.mojom.
 //
 // NOTE(review): values are numbered implicitly in declaration order;
 // presumably they must stay aligned with the enums named above -- confirm
 // before reordering or inserting values.
 enum SpeechRecognitionMode {
   kUnknown,
   // Intended for voice input for keyboard usage.
   kIme,
   // Intended to caption a stream of audio.
   kCaption,
 };

 // Options for speech recognition, passed when binding a recognizer or an
 // audio source fetcher through SpeechRecognitionContext.
 // TODO(crbug.com/1165437): Add option to include timing metrics in the result.
 struct SpeechRecognitionOptions {
   // What kind of recognition to use.
   // In the case of web fallback (not for launch, used for development only),
   // this option will be ignored.
   SpeechRecognitionMode recognition_mode;

   // Whether to enable formatting and punctuation in the recognition results.
   bool enable_formatting;

   // The BCP-47 localized language code to use (e.g. "en-US"). Optional for
   // now.
   // TODO(crbug.com/1161569): Language needs to be required when multiple
   // languages are supported by SODA, so that each SpeechRecognitionRecognizer
   // can use its own language. Right now Language is only used by Projector
   // and Dictation via OnDeviceSpeechRecognizer in Chrome OS.
   string? language;
 };
	// Copyright 2020 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	module media.mojom;

	import "media/mojo/mojom/audio_parameters.mojom";
	import "media/mojo/mojom/audio_stream_factory.mojom";
	import "media/mojo/mojom/media_types.mojom";
	import "mojo/public/mojom/base/file_path.mojom";
	import "mojo/public/mojom/base/time.mojom";
	import "sandbox/policy/mojom/sandbox.mojom";
	import "services/network/public/mojom/url_loader_factory.mojom";

	// Confidence attached to a language identification result (see
	// LanguageIdentificationEvent). Corresponds to the
	// LangIdEvent.ConfidenceInterval defined in
	// http://google3/speech/soda/public/soda_event.proto.
	//
	// NOTE(review): the values carry no explicit numbers, so they are numbered
	// implicitly in declaration order. Presumably this order must stay aligned
	// with the proto enum; confirm before reordering or inserting values.
	enum ConfidenceLevel {
	kUnknown,
	kNotConfident,
	kConfident,
	kHighlyConfident,
	};

	// The main interface a client uses to interact with a speech recognition
	// service process. In Live Caption, every renderer can own one or more
	// Remote<SpeechRecognitionContext>, with the receiver bound through the
	// BrowserInterfaceBroker. In Chrome OS features like Dictation and Projector,
	// every OnDeviceSpeechRecognizer can own a Remote<SpeechRecognitionContext>.
	interface SpeechRecognitionContext {
	// Binds a recognizer to the speech recognition service. |receiver| is the
	// SpeechRecognitionRecognizer endpoint to bind, |client| receives the
	// resulting recognition events, and |options| configures the recognition.
	// Replies with a flag indicating whether multichannel audio is supported by
	// the speech recognition service.
	BindRecognizer(pending_receiver<SpeechRecognitionRecognizer> receiver,
	pending_remote<SpeechRecognitionRecognizerClient> client,
	SpeechRecognitionOptions options)
	=> (bool is_multichannel_supported);

	// Prepares microphone audio to be captured from within the
	// SpeechRecognitionService process, with results passed back to the
	// SpeechRecognitionRecognizerClient given by |client|. |fetcher_receiver|
	// is the AudioSourceFetcher endpoint used to start and stop the capture.
	// Replies with |is_multichannel_supported|; presumably this has the same
	// meaning as in BindRecognizer -- TODO confirm.
	BindAudioSourceFetcher(
	pending_receiver<AudioSourceFetcher> fetcher_receiver,
	pending_remote<SpeechRecognitionRecognizerClient> client,
	SpeechRecognitionOptions options)
	=> (bool is_multichannel_supported);
	};

	// The main interface to a speech recognition service process.
	// Used by the browser to issue top-level control requests to the service,
	// acquired during process launch. The interface is tagged with the
	// kSpeechRecognition sandbox type through the ServiceSandbox attribute.
	[ServiceSandbox=sandbox.mojom.Sandbox.kSpeechRecognition]
	interface SpeechRecognitionService {
	// Binds |context| to a new instance of the speech recognition context.
	BindContext(pending_receiver<SpeechRecognitionContext> context);

	// Sets the URL loader factory used to create network requests.
	SetUrlLoaderFactory(
	pending_remote<network.mojom.URLLoaderFactory> url_loader_factory);

	// Sets the file path to the Speech On-Device API (SODA) binary
	// (|binary_path|) and the config file for the language pack
	// (|config_path|).
	SetSodaPath(mojo_base.mojom.FilePath binary_path,
	mojo_base.mojom.FilePath config_path);

	// Binds the speech recognition service client used by the speech
	// recognition service to send messages back to the client.
	BindSpeechRecognitionServiceClient(
	pending_remote<SpeechRecognitionServiceClient> client);
	};

	// The interface used to start and stop fetching audio from the microphone
	// for speech recognition. It is bound through
	// SpeechRecognitionContext.BindAudioSourceFetcher.
	interface AudioSourceFetcher {
	// Begins fetching audio. Results will be returned using the
	// Remote<SpeechRecognitionRecognizerClient> which was passed in
	// BindAudioSourceFetcher.
	// NOTE(review): |factory| presumably creates the input stream for the
	// device named by |device_id|, opened with |audio_parameters| -- confirm
	// against the implementation.
	Start(pending_remote<AudioStreamFactory> factory, string device_id,
	media.mojom.AudioParameters audio_parameters);

	// Stops audio fetching.
	Stop();
	};

	// The interface used to send messages from the speech recognition service
	// back to the consumer of the service. The remote end is handed to the
	// service via SpeechRecognitionService.BindSpeechRecognitionServiceClient.
	interface SpeechRecognitionServiceClient {
	// Executed when the network service crashes, prompting the client to
	// reset the URL loader factory (see
	// SpeechRecognitionService.SetUrlLoaderFactory).
	OnNetworkServiceDisconnect();
	};

	// The interface used to pass raw audio from the renderer to the speech
	// recognition service. The remote lives in the renderer process and the
	// receiver lives in the speech recognition process.
	interface SpeechRecognitionRecognizer {
	// Sends one buffer of audio to the speech recognition service. The speech
	// recognition client will return the recognition events containing the
	// transcribed audio back to the originating media.
	// NOTE(review): AudioDataS16 presumably carries signed 16-bit samples;
	// confirm against media_types.mojom.
	SendAudioToSpeechRecognitionService(AudioDataS16 buffer);

	// Notifies the speech recognition recognizer that the language changed.
	// Takes in the locale string (e.g. "en-US").
	OnLanguageChanged(string language);
	};

	// The interface used to return speech recognition events from the speech
	// recognition service to the client that will display the results to the user.
	// The remote lives in the speech recognition process and the receiver lives in
	// the browser process.
	interface SpeechRecognitionRecognizerClient {
	// Triggered by the speech recognition process on a speech recognition
	// event carrying |result|. Replies with whether the result was received
	// successfully. Speech recognition will halt if the reply is false.
	OnSpeechRecognitionRecognitionEvent(SpeechRecognitionResult result)
	=> (bool success);

	// Triggered by an error within the speech recognition service. No error
	// details are passed.
	OnSpeechRecognitionError();

	// Triggered by the speech recognition process on a language identification
	// event.
	OnLanguageIdentificationEvent(LanguageIdentificationEvent event);
	};

	// The hypothesis parts that provide timing information for each word in
	// recognized speech.
	struct HypothesisParts {
	// A section of the final transcription text. Either an entire word or single
	// character (depending on the language) with adjacent punctuation. There will
	// usually only be one value here. If formatting is enabled in the speech
	// recognition, then the raw text will be included as the second element.
	array<string> text;

	// Time offset from the |audio_start_time| of the enclosing
	// TimingInformation. We enforce the following invariant:
	// 0 <= hypothesis_part_offset < |audio_end_time - audio_start_time|.
	mojo_base.mojom.TimeDelta hypothesis_part_offset;
	};

	// The timing information for the transcript.
	struct TimingInformation {
	// Start time in audio time from the start of the SODA session.
	// This time measures the amount of audio input into SODA.
	mojo_base.mojom.TimeDelta audio_start_time;

	// Elapsed processed audio from first frame after preamble.
	mojo_base.mojom.TimeDelta audio_end_time;

	// The timing information for each word/letter in the transcription.
	// HypothesisPartsInResult was introduced in min version 1 in
	// chromeos/services/machine_learning/public/mojom/soda.mojom. Therefore, it
	// must be optional. The optional may be present yet hold a zero-length
	// array if no words were spoken during the event's time span.
	array<HypothesisParts> ? hypothesis_parts;
	};

	// A speech recognition result created by the speech service and passed to the
	// browser.
	struct SpeechRecognitionResult {
	// The transcribed text for this result.
	string transcription;

	// A flag indicating whether the result is final. If true, the result is
	// locked in and the next result returned will not overlap with the previous
	// final result.
	bool is_final;

	// Timing information for the current transcription. |timing_information| is
	// expected to be valid if:
	// 1. speech recognition is provided by |CrosSodaClient| and
	// 2. |is_final| is true.
	// The field is optional, so consumers must handle its absence.
	TimingInformation? timing_information;
	};

	// A language identification event created by the speech recognition service
	// and passed to the browser and renderer.
	struct LanguageIdentificationEvent {
	// The locale of the language with the highest confidence.
	string language;

	// The confidence level reported for |language|.
	ConfidenceLevel confidence_level;
	};

	// The interface used to notify the speech recognition client of events
	// triggered by the browser. The remote lives in the browser process and the
	// receiver lives in the renderer process.
	interface SpeechRecognitionBrowserObserver {
	// Notifies the speech recognition client when speech recognition
	// availability changes; |is_speech_recognition_available| is the new state.
	SpeechRecognitionAvailabilityChanged(bool is_speech_recognition_available);

	// Notifies the speech recognition client when the speech recognition
	// language changes.
	// NOTE(review): |language| is presumably a locale string such as "en-US",
	// as in SpeechRecognitionRecognizer.OnLanguageChanged -- confirm.
	SpeechRecognitionLanguageChanged(string language);
	};

	// The interface between the speech recognition client and the browser.
	// The remote lives in the renderer process and the receiver lives in the
	// browser process.
	interface SpeechRecognitionClientBrowserInterface {
	// Binds |observer|, through which the browser reports speech recognition
	// availability and language changes.
	BindSpeechRecognitionBrowserObserver(
	pending_remote<SpeechRecognitionBrowserObserver> observer);
	};

	// The kind of recognition to perform. Corresponds to
	// ExtendedSodaConfigMsg.RecognitionMode in
	// chrome/services/speech/soda/proto/soda_api.proto and
	// SodaRecognitionMode in
	// chromeos/services/machine_learning/public/mojom/soda.mojom.
	//
	// NOTE(review): values are numbered implicitly in declaration order;
	// presumably they must stay aligned with the enums named above -- confirm
	// before reordering or inserting values.
	enum SpeechRecognitionMode {
	kUnknown,
	// Intended for voice input for keyboard usage.
	kIme,
	// Intended to caption a stream of audio.
	kCaption,
	};

	// Options for speech recognition, passed when binding a recognizer or an
	// audio source fetcher through SpeechRecognitionContext.
	// TODO(crbug.com/1165437): Add option to include timing metrics in the result.
	struct SpeechRecognitionOptions {
	// What kind of recognition to use.
	// In the case of web fallback (not for launch, used for development only),
	// this option will be ignored.
	SpeechRecognitionMode recognition_mode;

	// Whether to enable formatting and punctuation in the recognition results.
	bool enable_formatting;

	// The BCP-47 localized language code to use (e.g. "en-US"). Optional for
	// now.
	// TODO(crbug.com/1161569): Language needs to be required when multiple
	// languages are supported by SODA, so that each SpeechRecognitionRecognizer
	// can use its own language. Right now Language is only used by Projector
	// and Dictation via OnDeviceSpeechRecognizer in Chrome OS.
	string? language;
	};