Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
feat: Support for spoken punctuation and spoken emojis (#737)
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 367239272
  • Loading branch information
bcoe authored Apr 29, 2021
1 parent 9b255e9 commit 1a30de0
Show file tree
Hide file tree
Showing 4 changed files with 2,715 additions and 7 deletions.
33 changes: 28 additions & 5 deletions protos/google/cloud/speech/v1p1beta1/cloud_speech.proto
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import "google/longrunning/operations.proto";
import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/protobuf/wrappers.proto";
import "google/rpc/status.proto";

option cc_enable_arenas = true;
Expand Down Expand Up @@ -242,6 +243,12 @@ message RecognitionConfig {
// kbps). When using this encoding, `sample_rate_hertz` has to match the
// sample rate of the file being used.
MP3 = 8;

// Opus encoded audio frames in WebM container
    // ([OggOpus](https://wiki.xiph.org/OggOpus)). This is a Beta feature and is
// only available in v1p1beta1. `sample_rate_hertz` must be one of 8000,
// 12000, 16000, 24000, or 48000.
WEBM_OPUS = 9;
}

// Encoding of audio data sent in all `RecognitionAudio` messages.
Expand Down Expand Up @@ -316,15 +323,15 @@ message RecognitionConfig {
// Speech adaptation configuration improves the accuracy of speech
// recognition. When speech adaptation is set it supersedes the
// `speech_contexts` field. For more information, see the [speech
// adaptation](https://cloud.google.com/speech-to-text/docs/context-strength)
// adaptation](https://cloud.google.com/speech-to-text/docs/adaptation)
// documentation.
SpeechAdaptation adaptation = 20;

// Array of [SpeechContext][google.cloud.speech.v1p1beta1.SpeechContext].
// A means to provide context to assist the speech recognition. For more
// information, see
// [speech
// adaptation](https://cloud.google.com/speech-to-text/docs/context-strength).
// adaptation](https://cloud.google.com/speech-to-text/docs/adaptation).
repeated SpeechContext speech_contexts = 6;

// If `true`, the top result includes a list of words and
Expand All @@ -344,6 +351,22 @@ message RecognitionConfig {
// The default 'false' value does not add punctuation to result hypotheses.
bool enable_automatic_punctuation = 11;

  // The spoken punctuation behavior for the call.
  // If not set, uses default behavior based on model of choice,
  // e.g. command_and_search enables spoken punctuation by default.
// If 'true', replaces spoken punctuation with the corresponding symbols in
// the request. For example, "how are you question mark" becomes "how are
// you?". See https://cloud.google.com/speech-to-text/docs/spoken-punctuation
// for support. If 'false', spoken punctuation is not replaced.
google.protobuf.BoolValue enable_spoken_punctuation = 22;

  // The spoken emoji behavior for the call.
  // If not set, uses default behavior based on model of choice.
// If 'true', adds spoken emoji formatting for the request. This will replace
// spoken emojis with the corresponding Unicode symbols in the final
// transcript. If 'false', spoken emojis are not replaced.
google.protobuf.BoolValue enable_spoken_emojis = 23;

// If 'true', enables speaker detection for each recognized word in
// the top alternative of the recognition result using a speaker_tag provided
// in the WordInfo.
Expand Down Expand Up @@ -388,7 +411,7 @@ message RecognitionConfig {
// </tr>
// <tr>
// <td><code>video</code></td>
// <td>Best for audio that originated from from video or includes multiple
// <td>Best for audio that originated from video or includes multiple
// speakers. Ideally the audio is recorded at a 16khz or greater
// sampling rate. This is a premium model that costs more than the
// standard rate.</td>
Expand Down Expand Up @@ -674,8 +697,8 @@ message LongRunningRecognizeMetadata {
// audio, and `single_utterance` is set to false, then no messages are streamed
// back to the client.
//
// Here's an example of a series of ten `StreamingRecognizeResponse`s that might
// be returned while processing audio:
// Here's an example of a series of `StreamingRecognizeResponse`s that might be
// returned while processing audio:
//
// 1. results { alternatives { transcript: "tube" } stability: 0.01 }
//
Expand Down
Loading

0 comments on commit 1a30de0

Please sign in to comment.