From 5be471c357397f9655b6f5e7fd19c383a7be9fdf Mon Sep 17 00:00:00 2001
From: Yiren Wang <108832030+yrw-google@users.noreply.github.com>
Date: Wed, 19 Feb 2025 17:13:54 -0800
Subject: [PATCH] Add speech recognition context to the Web Speech API

Explainer for speech recognition context is added in
https://github.com/WebAudio/web-speech-api/pull/140
---
 .gitignore |   2 +
 index.bs   | 135 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 126 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index dcaf716..65092a3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
 index.html
+.DS_Store
+.idea/

diff --git a/index.bs b/index.bs
index 25b5c08..83b4511 100644
--- a/index.bs
+++ b/index.bs
@@ -161,12 +161,14 @@ interface SpeechRecognition : EventTarget {
     attribute boolean interimResults;
     attribute unsigned long maxAlternatives;
     attribute SpeechRecognitionMode mode;
+    attribute SpeechRecognitionContext context;

     // methods to drive the speech interaction
     undefined start();
     undefined start(MediaStreamTrack audioTrack);
     undefined stop();
     undefined abort();
+    undefined updateContext(SpeechRecognitionContext context);
     static Promise availableOnDevice(DOMString lang);
     static Promise installOnDevice(DOMString lang);
@@ -191,7 +193,8 @@ enum SpeechRecognitionErrorCode {
     "network",
     "not-allowed",
     "service-not-allowed",
-    "language-not-supported"
+    "language-not-supported",
+    "context-not-supported"
 };

 enum SpeechRecognitionMode {
@@ -246,6 +249,30 @@ dictionary SpeechRecognitionEventInit : EventInit {
     unsigned long resultIndex = 0;
     required SpeechRecognitionResultList results;
 };
+
+// The object representing a phrase for contextual biasing.
+[Exposed=Window]
+interface SpeechRecognitionPhrase {
+    constructor(DOMString phrase, optional float boost = 1.0);
+    readonly attribute DOMString phrase;
+    readonly attribute float boost;
+};
+
+// The object representing a list of biasing phrases.
+[Exposed=Window]
+interface SpeechRecognitionPhraseList {
+    constructor();
+    readonly attribute unsigned long length;
+    SpeechRecognitionPhrase item(unsigned long index);
+    undefined addItem(SpeechRecognitionPhrase item);
+};
+
+// The object representing a recognition context collection.
+[Exposed=Window]
+interface SpeechRecognitionContext {
+    constructor(SpeechRecognitionPhraseList phrases);
+    readonly attribute SpeechRecognitionPhraseList phrases;
+};

SpeechRecognition Attributes

@@ -276,6 +303,9 @@ dictionary SpeechRecognitionEventInit : EventInit {
mode attribute
An enum to determine where speech recognition takes place. The default value is "ondevice-preferred".
context attribute
This attribute sets the speech recognition context that the recognition session starts with.
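For instance, a page might bias an upcoming session toward domain-specific vocabulary before calling start(). The following is a non-normative sketch of using the attribute from script; the phrase strings and boost values are purely illustrative.

    // Sketch: bias an upcoming session toward product names (illustrative values).
    const phrases = new SpeechRecognitionPhraseList();
    phrases.addItem(new SpeechRecognitionPhrase("latte macchiato", 2.0));
    phrases.addItem(new SpeechRecognitionPhrase("oat milk", 1.5));

    const recognition = new SpeechRecognition();
    recognition.context = new SpeechRecognitionContext(phrases);
    recognition.start();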

The group has discussed whether WebRTC might be used to specify selection of audio sources and remote recognizers.

@@ -313,23 +343,43 @@ See
-    "microphone" is [=permission/"denied"=], abort these steps.
-1. Once the system is successfully listening to the recognition, [=fire an event=] named start at [=this=].

When the start session algorithm with |requestMicrophonePermission| is invoked, the user agent MUST run the following steps:

1. If the [=current settings object=]'s [=relevant global object=]'s [=associated Document=] is NOT [=fully active=], throw an {{InvalidStateError}} and abort these steps.
1. If {{[[started]]}} is `true` and no error or end event has fired, throw an {{InvalidStateError}} and abort these steps.
1. Set {{[[started]]}} to `true`.
1. If |requestMicrophonePermission| is `true` and [=request permission to use=] "`microphone`" is [=permission/"denied"=], abort these steps.
1. If {{SpeechRecognition/context}} is not null and the system does not support speech recognition context, throw a {{SpeechRecognitionErrorEvent}} with the {{context-not-supported}} error code and abort these steps.
1. Once the system is successfully listening to the recognition, queue a task to [=fire an event=] named start at [=this=].
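Because the context check happens while the session is starting, a page would observe an unsupported context as an error event rather than an exception from start(). A non-normative sketch, assuming the error surfaces through the existing onerror handler:

    // Sketch: observe a recognizer that cannot apply the supplied context.
    const recognition = new SpeechRecognition();
    recognition.onerror = (event) => {
      if (event.error === "context-not-supported") {
        // The active model cannot use contextual biasing; continue without it,
        // e.g. by starting a session that never had a context assigned.
        console.warn("Speech recognition context is not supported.");
      }
    };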

SpeechRecognition Events

@@ -421,6 +471,9 @@ For example, some implementations may fire audioe
"language-not-supported"
The language was not supported.
"context-not-supported"
The speech recognition model does not support speech recognition context.
@@ -499,6 +552,66 @@ For a non-continuous recognition it will hold only a single value.

Note that when resultIndex equals results.length, no new results are returned; this may occur when the array length decreases to remove one or more interim results.

SpeechRecognitionPhrase


The SpeechRecognitionPhrase object represents a phrase for contextual biasing.

SpeechRecognitionPhrase(|phrase|, |boost|) constructor
When invoked, run the following steps:
1. If |phrase| is an empty string, throw a "{{SyntaxError}}" {{DOMException}}.
1. If |boost| is less than 0.0 or greater than 10.0, throw a "{{SyntaxError}}" {{DOMException}}.
1. Construct a new SpeechRecognitionPhrase object with |phrase| and |boost|.
1. Return the object.
phrase attribute
This attribute is the text string to be boosted.
boost attribute
This attribute is approximately the natural logarithm of how many times more likely the website believes this phrase is than what the speech recognition model already expects.
A valid boost must be a float value in the range [0.0, 10.0], and it defaults to 1.0 if not specified.
A boost of 0.0 means the phrase is not boosted at all, and a higher boost means the phrase is more likely to appear.
A boost of 10.0 means the phrase is extremely likely to appear and should rarely be used.
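To make the scale concrete (the values below are illustrative, not normative): a phrase the page believes is roughly e² ≈ 7.4 times more likely than the model's prior estimate would get a boost near 2.0, while omitting the boost leaves the default of 1.0.

    // Sketch: illustrative boost values on the [0.0, 10.0] scale.
    const brand  = new SpeechRecognitionPhrase("Kubernetes", 2.0);  // ~e^2 times more likely than the model expects
    const common = new SpeechRecognitionPhrase("deployment");       // boost defaults to 1.0
    // new SpeechRecognitionPhrase("anything", 11.0) would throw a SyntaxError (boost out of range).
    console.log(brand.phrase, brand.boost);  // "Kubernetes" 2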

SpeechRecognitionPhraseList


The SpeechRecognitionPhraseList object holds a sequence of phrases for contextual biasing.

SpeechRecognitionPhraseList() constructor
This constructor returns an empty list.
length attribute
This attribute indicates the number of phrases in the list. The user agent must keep it equal to the number of phrases the list currently holds.
item(|index|) method
This method returns the SpeechRecognitionPhrase object at the given |index| of the list.
When invoked, run the following steps:
1. If |index| is less than 0, or greater than or equal to {{SpeechRecognitionPhraseList/length}}, return null.
1. Return the SpeechRecognitionPhrase at |index| in the list.
addItem(|item|) method
This method adds the SpeechRecognitionPhrase object |item| to the end of the list.
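For example (a non-normative sketch; the phrase values are made up), a list could be populated and inspected like this:

    // Sketch: assemble and inspect a list of biasing phrases (illustrative values).
    const list = new SpeechRecognitionPhraseList();
    list.addItem(new SpeechRecognitionPhrase("SpeechRecognitionContext", 3.0));
    list.addItem(new SpeechRecognitionPhrase("Web Speech API"));
    console.log(list.length);          // 2
    console.log(list.item(0).phrase);  // "SpeechRecognitionContext"
    console.log(list.item(5));         // null (index out of range)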

SpeechRecognitionContext


The SpeechRecognitionContext object holds contextual information to provide to the speech recognition models.

SpeechRecognitionContext(|phrases|) constructor
This constructor returns a new SpeechRecognitionContext object containing the SpeechRecognitionPhraseList object |phrases|.
phrases attribute
This attribute represents the phrases to be boosted.
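Tying the pieces together, the following non-normative sketch constructs a context from a phrase list, assigns it before start(), and later pushes an updated context into the ongoing session via updateContext() (added to the SpeechRecognition interface above); all phrase values are illustrative.

    // Sketch: build a context and update it during an ongoing session (illustrative values).
    const phrases = new SpeechRecognitionPhraseList();
    phrases.addItem(new SpeechRecognitionPhrase("contextual biasing", 2.0));

    const recognition = new SpeechRecognition();
    recognition.context = new SpeechRecognitionContext(phrases);
    recognition.start();

    // Later, e.g. after the user navigates to a different view:
    const updated = new SpeechRecognitionPhraseList();
    updated.addItem(new SpeechRecognitionPhrase("text to speech", 1.5));
    recognition.updateContext(new SpeechRecognitionContext(updated));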

The SpeechSynthesis Interface

The SpeechSynthesis interface is the scripted web API for controlling a text-to-speech output.