Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
feat: Support for spoken punctuation and spoken emojis (#737)
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 367239272
  • Loading branch information
bcoe authored Apr 29, 2021
1 parent 9b255e9 commit 1a30de0
Show file tree
Hide file tree
Showing 4 changed files with 2,715 additions and 7 deletions.
33 changes: 28 additions & 5 deletions protos/google/cloud/speech/v1p1beta1/cloud_speech.proto
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import "google/longrunning/operations.proto";
import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/protobuf/wrappers.proto";
import "google/rpc/status.proto";

option cc_enable_arenas = true;
Expand Down Expand Up @@ -242,6 +243,12 @@ message RecognitionConfig {
// kbps). When using this encoding, `sample_rate_hertz` has to match the
// sample rate of the file being used.
MP3 = 8;

// Opus encoded audio frames in WebM container
    // ([OggOpus](https://wiki.xiph.org/OggOpus)). This is a Beta feature and is
// only available in v1p1beta1. `sample_rate_hertz` must be one of 8000,
// 12000, 16000, 24000, or 48000.
WEBM_OPUS = 9;
}

// Encoding of audio data sent in all `RecognitionAudio` messages.
Expand Down Expand Up @@ -316,15 +323,15 @@ message RecognitionConfig {
// Speech adaptation configuration improves the accuracy of speech
// recognition. When speech adaptation is set it supersedes the
// `speech_contexts` field. For more information, see the [speech
// adaptation](https://cloud.google.com/speech-to-text/docs/context-strength)
// adaptation](https://cloud.google.com/speech-to-text/docs/adaptation)
// documentation.
SpeechAdaptation adaptation = 20;

// Array of [SpeechContext][google.cloud.speech.v1p1beta1.SpeechContext].
// A means to provide context to assist the speech recognition. For more
// information, see
// [speech
// adaptation](https://cloud.google.com/speech-to-text/docs/context-strength).
// adaptation](https://cloud.google.com/speech-to-text/docs/adaptation).
repeated SpeechContext speech_contexts = 6;

// If `true`, the top result includes a list of words and
Expand All @@ -344,6 +351,22 @@ message RecognitionConfig {
// The default 'false' value does not add punctuation to result hypotheses.
bool enable_automatic_punctuation = 11;

  // The spoken punctuation behavior for the call.
  // If not set, uses default behavior based on model of choice,
  // e.g. command_and_search enables spoken punctuation by default.
// If 'true', replaces spoken punctuation with the corresponding symbols in
// the request. For example, "how are you question mark" becomes "how are
// you?". See https://cloud.google.com/speech-to-text/docs/spoken-punctuation
// for support. If 'false', spoken punctuation is not replaced.
google.protobuf.BoolValue enable_spoken_punctuation = 22;

  // The spoken emoji behavior for the call.
  // If not set, uses default behavior based on model of choice.
// If 'true', adds spoken emoji formatting for the request. This will replace
// spoken emojis with the corresponding Unicode symbols in the final
// transcript. If 'false', spoken emojis are not replaced.
google.protobuf.BoolValue enable_spoken_emojis = 23;

// If 'true', enables speaker detection for each recognized word in
// the top alternative of the recognition result using a speaker_tag provided
// in the WordInfo.
Expand Down Expand Up @@ -388,7 +411,7 @@ message RecognitionConfig {
// </tr>
// <tr>
// <td><code>video</code></td>
// <td>Best for audio that originated from from video or includes multiple
// <td>Best for audio that originated from video or includes multiple
// speakers. Ideally the audio is recorded at a 16khz or greater
// sampling rate. This is a premium model that costs more than the
// standard rate.</td>
Expand Down Expand Up @@ -674,8 +697,8 @@ message LongRunningRecognizeMetadata {
// audio, and `single_utterance` is set to false, then no messages are streamed
// back to the client.
//
// Here's an example of a series of ten `StreamingRecognizeResponse`s that might
// be returned while processing audio:
// Here's an example of a series of `StreamingRecognizeResponse`s that might be
// returned while processing audio:
//
// 1. results { alternatives { transcript: "tube" } stability: 0.01 }
//
Expand Down
Loading

0 comments on commit 1a30de0

Please sign in to comment.