blob: 269388b5f40c21c374cac25efd3e3fa25652dc4b [file] [log] [blame]
// Copyright 2021 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef MEDIA_MOJO_MOJOM_SPEECH_RECOGNITION_RESULT_H_
#define MEDIA_MOJO_MOJOM_SPEECH_RECOGNITION_RESULT_H_
#include <string>
#include <vector>
#include "base/time/time.h"
#include "third_party/abseil-cpp/absl/types/optional.h"
namespace media {
struct HypothesisParts {
HypothesisParts();
HypothesisParts(const std::vector<std::string> part, base::TimeDelta offset);
HypothesisParts(const HypothesisParts&);
HypothesisParts(HypothesisParts&&);
HypothesisParts& operator=(const HypothesisParts&);
HypothesisParts& operator=(HypothesisParts&&);
~HypothesisParts();
bool operator==(const HypothesisParts& rhs) const;
// A section of the final transcription text. Either an entire word or single
// character (depending on the language) with adjacent punctuation. There will
// usually only be one value here. If formatting is enabled in the speech
// recognition, then the raw text will be included as the second element.
std::vector<std::string> text;
// Time offset from this event's |audio_start_time| defined below. Time
// offset from this event's |audio_start_time| defined below. We enforce the
// following invariant: 0 <= hypothesis_part_offset < |audio_end_time -
// audio_start_time|.
base::TimeDelta hypothesis_part_offset;
};
struct TimingInformation {
TimingInformation();
TimingInformation(const TimingInformation&);
TimingInformation(TimingInformation&&);
TimingInformation& operator=(const TimingInformation&);
TimingInformation& operator=(TimingInformation&&);
~TimingInformation();
bool operator==(const TimingInformation& rhs) const;
// Start time in audio time from the start of the SODA session.
// This time measures the amount of audio input into SODA.
base::TimeDelta audio_start_time;
// Elapsed processed audio from first frame after preamble.
base::TimeDelta audio_end_time;
// The timing information for each word/letter in the transription.
// HypothesisPartsInResult was introduced in min version 1 in
// chromeos/services/machine_learning/public/mojom/soda.mojom. Therefore, it
// must be optional. Hypothesis parts maybe non-empty optional containing a
// zero length vector if no words were spoken during the event's time span.
absl::optional<std::vector<HypothesisParts>> hypothesis_parts;
};
// A speech recognition result created by the speech service and passed to the
// SpeechRecognitionRecognizerClient.
struct SpeechRecognitionResult {
SpeechRecognitionResult();
SpeechRecognitionResult(const std::string transcript, bool is_final);
SpeechRecognitionResult(const SpeechRecognitionResult&);
SpeechRecognitionResult(SpeechRecognitionResult&&);
SpeechRecognitionResult& operator=(const SpeechRecognitionResult&);
SpeechRecognitionResult& operator=(SpeechRecognitionResult&&);
~SpeechRecognitionResult();
bool operator==(const SpeechRecognitionResult& rhs) const;
std::string transcription;
// A flag indicating whether the result is final. If true, the result is
// locked in and the next result returned will not overlap with the previous
// final result.
bool is_final = false;
// Timing information for the current transcription.
absl::optional<TimingInformation> timing_information;
};
} // namespace media
#endif // MEDIA_MOJO_MOJOM_SPEECH_RECOGNITION_RESULT_H_