From e287ced7b7447e75b044e21721ec42983e5a1e60 Mon Sep 17 00:00:00 2001 From: Chai Chaoweeraprasit Date: Wed, 9 Mar 2022 23:51:15 -0800 Subject: [PATCH 01/13] Context-based graph execution methods for different threading models. --- explainer.md | 7 +- index.bs | 523 +++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 390 insertions(+), 140 deletions(-) diff --git a/explainer.md b/explainer.md index 4a456de2..50f29393 100644 --- a/explainer.md +++ b/explainer.md @@ -47,7 +47,7 @@ const bufferB = new Float32Array(4).fill(0.8); const bufferC = new Float32Array(4); const inputs = {'A': bufferA, 'B': bufferB}; const outputs = {'C': bufferC}; -graph.compute(inputs, outputs); +context.compute(inputs, outputs); // The computed result of [[1, 1], [1, 1]] is in the buffer associated with // the output operand. console.log('Output value: ' + bufferC); @@ -99,12 +99,13 @@ There are many important [application use cases](https://webmachinelearning.gith export class NSNet2 { constructor() { this.graph = null; + this.context = null; this.frameSize = 161; this.hiddenSize = 400; } async build(baseUrl, batchSize, frames) { - const context = navigator.ml.createContext(); + this.context = navigator.ml.createContext(); const builder = new MLGraphBuilder(context); // Create constants by loading pre-trained data from .npy files. const weight172 = await buildConstantByNpy(builder, baseUrl + '172.npy'); @@ -153,7 +154,7 @@ export class NSNet2 { 'gru94': gru94Buffer, 'gru157': gru157Buffer }; - return this.graph.compute(inputs, outputs); + return this.context.compute(inputs, outputs); } } ``` diff --git a/index.bs b/index.bs index 7f59d9fb..27327e20 100644 --- a/index.bs +++ b/index.bs @@ -30,6 +30,9 @@ urlPrefix: https://gpuweb.github.io/gpuweb/; spec: WEBGPU text: GPUDevice; url: gpu-device text: GPUBuffer; url: buffer-interface text: GPUTexture; url: texture-interface + text: GPUQueue; url: queues + text: GPUCommandBuffer; url: command-buffers + text: GPUCommandBufferDescriptor; url: dictdef-gpucommandbufferdescriptor
 {
@@ -444,8 +447,7 @@ computer vision, natural language processing, and robotics.
 The WebNN API is a specification for constructing, compiling, and executing computational
 graphs of neural networks.
 
-The {{MLGraph}} interface represents a compiled computational graph (that is, a model) and exposes
-a compute method to perform inference.
+The {{MLGraph}} interface represents a compiled computational graph that is immutable (that is, a model).
 
 The {{MLGraphBuilder}} interface serves as a builder (factory) to create a {{MLGraph}}.
 An {{MLOperand}} is a representation of data that flows within the computational graph,
@@ -466,10 +468,29 @@ the computation graph used to compute one or more specified outputs. The key
 purpose of the compilation step is to enable optimizations that span two or
 more operations, such as operation or loop fusion.
 
-The {{MLGraph/compute()}} method of the {{MLGraph}} interface is used to execute the
-compiled computation graph (to perform inference). The caller supplies the input
-values using {{MLNamedInputs}}, binding the input {{MLOperand}}s to their values.
-The caller supplies pre-allocated buffers for output {{MLOperand}}s using {{MLNamedOutputs}}.
+Once the {{MLGraph}} is constructed, there are multiple ways by which the graph may be executed. The
+{{MLContext/compute()}} method carries out the execution of the graph immediately 
+on the calling thread, which must also be a worker thread. The execution produces the results of the computation 
+from all the inputs bound to the graph. This type of execution is supported only when the computational device 
+bound to the context is a CPU device.
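For illustration, a minimal sketch of this synchronous path, assuming the `context.compute(graph, inputs, outputs)` signature introduced in this patch and a script running inside a dedicated worker; the operand names, shapes and values are made up for the example:

```js
// Runs inside a dedicated worker, so the synchronous call may block.
const context = navigator.ml.createContext();
const builder = new MLGraphBuilder(context);
const desc = {type: 'float32', dimensions: [2, 2]};
const a = builder.input('a', desc);
const b = builder.input('b', desc);
const graph = builder.build({'c': builder.add(a, b)});

const bufferA = new Float32Array(4).fill(1);
const bufferB = new Float32Array(4).fill(2);
const bufferC = new Float32Array(4);
const inputs = {'a': bufferA, 'b': bufferB};
const outputs = {'c': bufferC};
context.compute(graph, inputs, outputs);  // blocks until bufferC is filled
```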
+
+The {{MLContext/computeAsync()}} method performs the execution of the graph asynchronously
+on a separate worker thread. This method returns immediately without blocking the calling thread. This execution 
+method is appropriate when the responsiveness of the calling thread is critical to good user experience. The 
+computation results will be placed in the bound outputs when the operation is completed on a worker thread, 
+at which time the calling thread is signaled. This type of execution supports both the CPU and GPU device, 
+including when the context is created from the {{WebGLRenderingContext}}.
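For illustration, a sketch of the asynchronous path, reusing the `graph`, `inputs` and `outputs` bindings from the synchronous sketch above; only the method name and signature come from this patch, the rest is assumed:

```js
// Returns immediately; the promise resolves once the bound output
// buffers have been written and the calling thread is signaled.
context.computeAsync(graph, inputs, outputs)
    .then(() => console.log(`values: ${outputs['c']}`));
```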
+
+The {{MLCommandEncoder}} interface created by the {{MLContext/createCommandEncoder()}} method supports 
+a graph execution method that provides the maximum flexibility to callers that also utilize WebGPU in their 
+application. It does this by placing the workload required to initialize and compute the results of the 
+operations in the graph onto a {{GPUCommandBuffer}}. The callers are responsible for the eventual submission 
+of this workload on the {{GPUQueue}}. Once the submitted workload is completely executed on the GPU, the 
+queue is signaled with the results filled in the bound output buffers.
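For illustration, a sketch of this WebGPU interop flow, assuming a `context` created from a {{GPUDevice}} (`gpuDevice`), a compiled `graph`, and pre-allocated {{GPUBuffer}} resources bound to its constant, input and output operands; the operand names, buffer variables and the empty descriptor passed to `finish()` are assumptions:

```js
// Record graph initialization (constant weights) and execution into an
// ML command encoder, then hand the recorded work to WebGPU for submission.
const encoder = context.createCommandEncoder();
encoder.initializeGraph(graph, {'weights': weightsBuffer});  // once per graph
encoder.dispatch(graph, {'input': inputBuffer}, {'output': outputBuffer});
const commandBuffer = encoder.finish({});
gpuDevice.queue.submit([commandBuffer]);
// After the queue finishes this work, outputBuffer holds the results.
```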
+
+In each of these various execution methods, the caller supplies the input values using {{MLNamedInputs}}
+or equivalent type, binding the input {{MLOperand}}s to their values. The caller supplies pre-allocated
+buffers for output {{MLOperand}}s using {{MLNamedOutputs}} or equivalent type.
 
 The runtime values (of {{MLOperand}}s) are tensors, which are essentially multidimensional
 arrays. The representation of the tensors is implementation dependent, but it typically
@@ -616,8 +637,32 @@ The power preference indicates preference as related to power consump
 
 
 
 
 {{MLContext}} has the following internal slots:
@@ -634,9 +679,238 @@ interface MLContext {};
         The {{MLContext}}'s [=power preference=].
 
 
+### compute ### {#api-mlcontext-compute}
+Synchronously carries out the computational workload of a compiled graph {{MLGraph}} on the calling thread, which must be a worker thread, to produce results as defined by the operations in the graph. This method of execution requires an {{MLContext}} created with {{MLContextOptions}} with the {{MLDevicePreference}} option set to either "cpu" or "default" resolved to a CPU context. Otherwise, it throws an {{OperationError}} exception.
+
+
+ + **Arguments:** +
+        |graph|: an {{MLGraph}}. The compiled graph to be executed.
+        |inputs|: an {{MLNamedArrayInputs}}. The resources and optional dimensions of inputs.
+        |outputs|: an {{MLNamedArrayOutputs}}. The pre-allocated resources of required outputs.
+    
+ + **Returns:** {{undefined}}. + 1. If any of the following requirements are unmet, then throw a {{DataError}} {{DOMException}} and stop. +
+ 1. For each |key| -> |value| of |inputs|: + 1. |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|] must exist. + 1. Let |inputDesc| be |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|]. + 1. Let |inputSize| be 1. + 1. If |value| is an {{MLArrayInput}}, then: + 1. The length of |value|.{{MLArrayInput/dimensions}} must be the same as the length of |inputDesc|.{{MLOperandDescriptor/dimensions}}. + 1. Let |i| be 0. + 1. While true: + 1. Let |dimension| be |value|.{{MLArrayInput/dimensions}}[|i|]. + 1. |dimension| must be greater than 0. + 1. If |inputDesc|.{{MLOperandDescriptor/dimensions}}[|i|] is greater than 0, then |dimension| must be equal to |inputDesc|.{{MLOperandDescriptor/dimensions}}[|i|]. + 1. Set |inputSize| to the product of |inputSize| and |dimension|. + 1. Increment |i| by 1. + 1. If |i| if equal to the length of |value|.{{MLArrayInput/dimensions}}, then break. + 1. Else: + 1. For each |dimension| of |inputDesc|.{{MLOperandDescriptor/dimensions}}: + 1. The value of |dimension| must be greater than 0. + 1. Set |inputSize| to the product of |inputSize| and |dimension|. + 1. If |value| is an {{MLArrayInput}}, then let |resource| be |value|.{{MLArrayInput/resource}}. + 1. If |value| is an {{ArrayBufferView}}, then let |resource| be |value|. + 1. If |resource| is an {{ArrayBufferView}}, then: + 1. The kind of |resource| must be compatible with |inputDesc|.{{MLOperandDescriptor/type}} according to [this table](#appendices-mloperandtype-arraybufferview-compatibility). + 1. The length of |resource| must be the same as |inputSize|. + 1. For each |key| -> |value| of |outputs|: + 1. |graph|.{{MLGraph/[[outputNames]]}}[|key|] must exist. +
+ + 1. For each |key| -> |value| of |inputs|: + 1. Let |inputDesc| be |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|]. + 1. Let |inputTensor| be a new tensor for |graph|.{{MLGraph/[[implementation]]}} of data type that is compatible with |inputDesc|.{{MLOperandDescriptor/type}}. + 1. If |value| is an {{MLArrayInput}}, then: + 1. Set the dimensions of |inputTensor| to |value|.{{MLArrayInput/dimensions}}. + 1. Else: + 1. Set the dimensions of |inputTensor| to |inputDesc|.{{MLOperandDescriptor/dimensions}}. + 1. If |value| is an {{MLArrayInput}}, then: + 1. Set the values of |inputTensor| to the values of |value|.{{MLArrayInput/resource}}. + 1. If |value| is an {{ArrayBufferView}}, then: + 1. Set the values of |inputTensor| to the values of |value|. + 1. Set the input of |graph|.{{MLGraph/[[implementation]]}} that is associated with |key| to |inputTensor|. + 1. For each |key| -> |value| of |outputs|: + 1. Issue a compute request for output of |graph|.{{MLGraph/[[implementation]]}} that is associated with |key|. + 1. Wait for the compute request to be completed. + 1. If there is an error returned by |graph|.{{MLGraph/[[implementation]]}}, then: + 1. Throw an {{OperationError}} {{DOMException}} and stop. + 1. Else: + 1. Let |outputTensor| be the output tensor returned by |graph|.{{MLGraph/[[implementation]]}}. + 1. If the kind of |value| is not compatible with the value type of |outputTensor|, then throw a {{DataError}} {{DOMException}} and stop. + 1. Let |outputSize| be 1. + 1. For each |dimension| of dimensions of |outputTensor|: + 1. Set |outputSize| to the product of |outputSize| and |dimension|. + 1. If |outputSize| is greater than the length of |value|, then: + 1. Throw a {{DataError}} {{DOMException}} and stop. + 1. Else: + 1. Set the values of |value| to the values of |outputTensor|. + 1. Return {{undefined}}. +
+ +#### Examples #### {#compilation-examples} + +
+The following code showcases the computation with dynamic input dimensions. +
+function sizeOfShape(array) {
+  return array.reduce(
+      (accumulator, currentValue) => accumulator * currentValue);
+}
+
+const context = navigator.ml.createContext();
+
+// Create a graph with dynamic shaped inputs.
+const builder = new MLGraphBuilder(context);
+const descA = {type: 'float32', dimensions: [-1, 4]};
+const a = builder.input('a', descA);
+const descB = {type: 'float32', dimensions: [4, -1]};
+const b = builder.input('b', descB);
+const c = builder.matmul(a, b);
+const graph = builder.build({'c': c});
+
+function allocateAndCompute(shapeA, shapeB, shapeC) {
+  const bufferA = new Float32Array(sizeOfShape(shapeA)).fill(0.5);
+  const bufferB = new Float32Array(sizeOfShape(shapeB)).fill(0.5);
+  const bufferC = new Float32Array(sizeOfShape(shapeC));
+
+  // Specify the shape of inputs when computing.
+  const inputs = {
+    'a': {resource: bufferA, dimensions: shapeA},
+    'b': {resource: bufferB, dimensions: shapeB},
+  };
+  const outputs = {'c': bufferC};
+  context.compute(graph, inputs, outputs);
+  console.log(`values: ${bufferC}`);
+}
+
+allocateAndCompute([3, 4], [4, 3], [3, 3]);
+allocateAndCompute([4, 4], [4, 4], [4, 4]);
+allocateAndCompute([5, 4], [4, 5], [5, 5]);
+
+
+ +
+The following code showcases the computation with optional outputs. +
+const context = navigator.ml.createContext();
+
+// Build a graph with two outputs.
+const builder = new MLGraphBuilder(context);
+const descA = {type: 'float32', dimensions: [3, 4]};
+const a = builder.input('a', descA);
+const descB = {type: 'float32', dimensions: [4, 3]};
+const bufferB = new Float32Array(sizeOfShape(descB.dimensions)).fill(0.5);
+const b = builder.constant(descB, bufferB);
+const descC = {type: 'float32', dimensions: [3, 3]};
+const bufferC = new Float32Array(sizeOfShape(descC.dimensions)).fill(1);
+const c = builder.constant(descC, bufferC);
+const d = builder.matmul(a, b);
+const e = builder.add(d, c);
+const graph = builder.build({'d': d, 'e': e});
+
+const bufferA = new Float32Array(sizeOfShape(descA.dimensions)).fill(0.5);
+const inputs = {'a': bufferA};
+
+// Compute d.
+const bufferD = new Float32Array(sizeOfShape([3, 3]));
+context.compute(graph, inputs, {'d': bufferD});
+console.log(`values: ${bufferD}`);
+
+// Compute e.
+const bufferE = new Float32Array(sizeOfShape([3, 3]));
+context.compute(graph, inputs, {'e': bufferE});
+console.log(`values: ${bufferE}`);
+
+
+ +### computeAsync ### {#api-mlcontext-computeasync} +Asynchronously carries out the computational workload of a compiled graph {{MLGraph}} on a worker thread to avoid blocking the calling thread while producing results as defined by the operations in the graph. This method of execution requires an {{MLContext}} created with {{MLContextOptions}} or {{WebGLRenderingContext}}. Otherwise, it throws an {{OperationError}} exception. + +
+ + **Arguments:** +
+        |graph|: an {{MLGraph}}. The compiled graph to be executed.
+        |inputs|: an {{MLNamedInputs}}. The resources and optional dimensions of inputs.
+        |outputs|: an {{MLNamedOutputs}}. The pre-allocated resources of required outputs.
+    
+ + **Returns:** {{Promise}}<{{undefined}}>. + 1. If any of the following requirements are unmet, then throw a {{DataError}} {{DOMException}} and stop. +
+ 1. For each |key| -> |value| of |inputs|: + 1. |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|] must exist. + 1. Let |inputDesc| be |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|]. + 1. Let |inputSize| be 1. + 1. If |value| is an {{MLInput}}, then: + 1. The length of |value|.{{MLInput/dimensions}} must be the same as the length of |inputDesc|.{{MLOperandDescriptor/dimensions}}. + 1. Let |i| be 0. + 1. While true: + 1. Let |dimension| be |value|.{{MLInput/dimensions}}[|i|]. + 1. |dimension| must be greater than 0. + 1. If |inputDesc|.{{MLOperandDescriptor/dimensions}}[|i|] is greater than 0, then |dimension| must be equal to |inputDesc|.{{MLOperandDescriptor/dimensions}}[|i|]. + 1. Set |inputSize| to the product of |inputSize| and |dimension|. + 1. Increment |i| by 1. + 1. If |i| if equal to the length of |value|.{{MLInput/dimensions}}, then break. + 1. Else: + 1. For each |dimension| of |inputDesc|.{{MLOperandDescriptor/dimensions}}: + 1. The value of |dimension| must be greater than 0. + 1. Set |inputSize| to the product of |inputSize| and |dimension|. + 1. If |value| is an {{MLInput}}, then let |resource| be |value|.{{MLInput/resource}}. + 1. If |value| is an {{MLResource}}, then let |resource| be |value|. + 1. If |resource| is an {{ArrayBufferView}}, then: + 1. The kind of |resource| must be compatible with |inputDesc|.{{MLOperandDescriptor/type}} according to [this table](#appendices-mloperandtype-arraybufferview-compatibility). + 1. The length of |resource| must be the same as |inputSize|. + 1. For each |key| -> |value| of |outputs|: + 1. |graph|.{{MLGraph/[[outputNames]]}}[|key|] must exist. +
+ + 1. For each |key| -> |value| of |inputs|: + 1. Let |inputDesc| be |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|]. + 1. Let |inputTensor| be a new tensor for |graph|.{{MLGraph/[[implementation]]}} of data type that is compatible with |inputDesc|.{{MLOperandDescriptor/type}}. + 1. If |value| is an {{MLInput}}, then: + 1. Set the dimensions of |inputTensor| to |value|.{{MLInput/dimensions}}. + 1. Else: + 1. Set the dimensions of |inputTensor| to |inputDesc|.{{MLOperandDescriptor/dimensions}}. + 1. If |value| is an {{MLInput}}, then: + 1. Set the values of |inputTensor| to the values of |value|.{{MLInput/resource}}. + 1. If |value| is an {{MLResource}}, then: + 1. Set the values of |inputTensor| to the values of |value|. + 1. Set the input of |graph|.{{MLGraph/[[implementation]]}} that is associated with |key| to |inputTensor|. + 1. For each |key| -> |value| of |outputs|: + 1. Issue a compute request for output of |graph|.{{MLGraph/[[implementation]]}} that is associated with |key|. + 1. Wait for the compute request to be completed. + 1. If there is an error returned by |graph|.{{MLGraph/[[implementation]]}}, then: + 1. Throw an {{OperationError}} {{DOMException}} and stop. + 1. Else: + 1. Let |outputTensor| be the output tensor returned by |graph|.{{MLGraph/[[implementation]]}}. + 1. If the kind of |value| is not compatible with the value type of |outputTensor|, then throw a {{DataError}} {{DOMException}} and stop. + 1. Let |outputSize| be 1. + 1. For each |dimension| of dimensions of |outputTensor|: + 1. Set |outputSize| to the product of |outputSize| and |dimension|. + 1. If |outputSize| is greater than the length of |value|, then: + 1. Throw a {{DataError}} {{DOMException}} and stop. + 1. Else: + 1. Set the values of |value| to the values of |outputTensor|. + 1. Return {{Promise}}<{{undefined}}>. +
+ +### createCommandEncoder ### {#api-mlcontext-createcommandencoder} +Create {{MLCommandEncoder}} interface used to record the ML workload onto a {{GPUCommandBuffer}} to allow mixing of ML workload with other GPU workloads in an application that leverages WebGPU. This method only succeeds on an {{MLContext}} created with {{GPUDevice}}. Otherwise, it throws an {{OperationError}} exception. + +
+ + **Returns:** + - an {{MLCommandEncoder}}. The command encoder used to record ML workload on the GPU. +
+ ## MLOperandDescriptor ## {#api-mloperanddescriptor} {{MLGraph}} has the following internal slots: @@ -2202,18 +2464,71 @@ interface MLGraph { The underlying implementation provided by the User Agent. -
- : compute(inputs, outputs) +## MLCommandEncoder ## {#api-mlcommandencoder} +The {{MLCommandEncoder}} interface represents a method of execution that synchronously records the computational workload of a compiled graph {{MLGraph}} to a GPU command buffer {{GPUCommandBuffer}} on the calling thread. Since the workload is not immediately executed, just recorded, this method allows more flexibility for the caller to determine how and when the recorded commands will be submitted for execution on the GPU relative to other GPU workload on the same queue. + + + +{{MLCommandEncoder}} has the following internal slots: + +
+ : \[[context]] of type {{MLContext}} :: - Compute the {{MLGraph}} given {{MLNamedInputs}} and {{MLNamedOutputs}}. Return once the compute has completed and the results in {{MLNamedOutputs}} are ready to be consumed. + The context of type {{MLContext}} associated with this {{MLCommandEncoder}}. -
- **Called on:** {{MLGraph}} |this|. + : \[[implementation]] + :: + The underlying implementation provided by the User Agent. +
+ +
+ : initializeGraph(graph, inputs) + :: + Record the initialization of the graph {{MLGraph}} on the GPU command buffer {{GPUCommandBuffer}} with constant inputs {{MLNamedGPUInputs}} such as weight inputs. This is a necessary step for optimal performance as it allows the underlying platform an opportunity to prepare and optimize constant input data for the following execution of computational wordloads on the queue. It should only be done once per graph. + +
+ **Called on:** {{MLCommandEncoder}} |this|. **Arguments:** -
-                |inputs|: an {{MLNamedInputs}}. The resources and optional dimensions of inputs for the compute.
-                |outputs|: an {{MLNamedOutputs}}. The pre-allocated resources of required outputs for the compute.
+            
+                |graph|: an {{MLGraph}}. The compiled graph to be executed.
+                |inputs|: an {{MLNamedGPUInputs}}. The resources and optional dimensions of constant inputs.
+            
+ + **Returns:** {{undefined}}. +
+ + : dispatch(graph, inputs, outputs) + :: + Record the computational workload of the {{MLGraph}} on the GPU command buffer {{GPUCommandBuffer}} with {{MLNamedGPUInputs}} and {{MLNamedGPUOutputs}}. Return once the recording is completed. + +
+ **Called on:** {{MLCommandEncoder}} |this|. + + **Arguments:** +
+                |graph|: an {{MLGraph}}. The compiled graph to be executed.
+                |inputs|: an {{MLNamedGPUInputs}}. The resources and optional dimensions of inputs.
+                |outputs|: an {{MLNamedGPUOutputs}}. The pre-allocated resources of required outputs.
             
**Returns:** {{undefined}}. @@ -2221,52 +2536,49 @@ interface MLGraph { 1. If any of the following requirements are unmet, then throw a {{DataError}} {{DOMException}} and stop.
1. For each |key| -> |value| of |inputs|: - 1. |this|.{{MLGraph/[[inputDescriptors]]}}[|key|] must exist. - 1. Let |inputDesc| be |this|.{{MLGraph/[[inputDescriptors]]}}[|key|]. + 1. |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|] must exist. + 1. Let |inputDesc| be |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|]. 1. Let |inputSize| be 1. - 1. If |value| is an {{MLInput}}, then: - 1. The length of |value|.{{MLInput/dimensions}} must be the same as the length of |inputDesc|.{{MLOperandDescriptor/dimensions}}. + 1. If |value| is an {{MLGPUInput}}, then: + 1. The length of |value|.{{MLGPUInput/dimensions}} must be the same as the length of |inputDesc|.{{MLOperandDescriptor/dimensions}}. 1. Let |i| be 0. 1. While true: - 1. Let |dimension| be |value|.{{MLInput/dimensions}}[|i|]. + 1. Let |dimension| be |value|.{{MLGPUInput/dimensions}}[|i|]. 1. |dimension| must be greater than 0. 1. If |inputDesc|.{{MLOperandDescriptor/dimensions}}[|i|] is greater than 0, then |dimension| must be equal to |inputDesc|.{{MLOperandDescriptor/dimensions}}[|i|]. 1. Set |inputSize| to the product of |inputSize| and |dimension|. 1. Increment |i| by 1. - 1. If |i| if equal to the length of |value|.{{MLInput/dimensions}}, then break. + 1. If |i| if equal to the length of |value|.{{MLGPUInput/dimensions}}, then break. 1. Else: 1. For each |dimension| of |inputDesc|.{{MLOperandDescriptor/dimensions}}: 1. The value of |dimension| must be greater than 0. 1. Set |inputSize| to the product of |inputSize| and |dimension|. - 1. If |value| is an {{MLInput}}, then let |resource| be |value|.{{MLInput/resource}}. - 1. If |value| is an {{MLResource}}, then let |resource| be |value|. - 1. If |resource| is an {{ArrayBufferView}}, then: - 1. The kind of |resource| must be compatible with |inputDesc|.{{MLOperandDescriptor/type}} according to [this table](#appendices-mloperandtype-arraybufferview-compatibility). - 1. The length of |resource| must be the same as |inputSize|. + 1. If |value| is an {{MLGPUInput}}, then let |resource| be |value|.{{MLGPUInput/resource}}. + 1. If |value| is an {{MLGPUResource}}, then let |resource| be |value|. 1. For each |key| -> |value| of |outputs|: - 1. |this|.{{MLGraph/[[outputNames]]}}[|key|] must exist. + 1. |graph|.{{MLGraph/[[outputNames]]}}[|key|] must exist.
- + 1. For each |key| -> |value| of |inputs|: - 1. Let |inputDesc| be |this|.{{MLGraph/[[inputDescriptors]]}}[|key|]. - 1. Let |inputTensor| be a new tensor for |this|.{{MLGraph/[[implementation]]}} of data type that is compatible with |inputDesc|.{{MLOperandDescriptor/type}}. - 1. If |value| is an {{MLInput}}, then: - 1. Set the dimensions of |inputTensor| to |value|.{{MLInput/dimensions}}. + 1. Let |inputDesc| be |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|]. + 1. Let |inputTensor| be a new tensor for |graph|.{{MLGraph/[[implementation]]}} of data type that is compatible with |inputDesc|.{{MLOperandDescriptor/type}}. + 1. If |value| is an {{MLGPUInput}}, then: + 1. Set the dimensions of |inputTensor| to |value|.{{MLGPUInput/dimensions}}. 1. Else: 1. Set the dimensions of |inputTensor| to |inputDesc|.{{MLOperandDescriptor/dimensions}}. - 1. If |value| is an {{MLInput}}, then: - 1. Set the values of |inputTensor| to the values of |value|.{{MLInput/resource}}. - 1. If |value| is an {{MLResource}}, then: + 1. If |value| is an {{MLGPUInput}}, then: + 1. Set the values of |inputTensor| to the values of |value|.{{MLGPUInput/resource}}. + 1. If |value| is an {{MLGPUResource}}, then: 1. Set the values of |inputTensor| to the values of |value|. - 1. Set the input of |this|.{{MLGraph/[[implementation]]}} that is associated with |key| to |inputTensor|. + 1. Set the input of |graph|.{{MLGraph/[[implementation]]}} that is associated with |key| to |inputTensor|. 1. For each |key| -> |value| of |outputs|: - 1. Issue a compute request for output of |this|.{{MLGraph/[[implementation]]}} that is associated with |key|. + 1. Issue a compute request for output of |graph|.{{MLGraph/[[implementation]]}} that is associated with |key|. 1. Wait for the compute request to be completed. - 1. If there is an error returned by |this|.{{MLGraph/[[implementation]]}}, then: + 1. If there is an error returned by |graph|.{{MLGraph/[[implementation]]}}, then: 1. Throw an {{OperationError}} {{DOMException}} and stop. 1. Else: - 1. Let |outputTensor| be the output tensor returned by |this|.{{MLGraph/[[implementation]]}}. + 1. Let |outputTensor| be the output tensor returned by |graph|.{{MLGraph/[[implementation]]}}. 1. If the kind of |value| is not compatible with the value type of |outputTensor|, then throw a {{DataError}} {{DOMException}} and stop. 1. Let |outputSize| be 1. 1. For each |dimension| of dimensions of |outputTensor|: @@ -2276,86 +2588,23 @@ interface MLGraph { 1. Else: 1. Set the values of |value| to the values of |outputTensor|. 1. Return {{undefined}}. - - Issue: Describe the algorithm steps for |this|.{{MLGraph/[[context]]}} created from {{WebGLRenderingContext}} and {{GPUDevice}}.
-
- -### Examples ### {#compilation-examples} -
-The following code showcases the computation with dynamic input dimensions. -
-function sizeOfShape(array) {
-  return array.reduce(
-      (accumulator, currentValue) => accumulator * currentValue);
-}
-
-const context = navigator.ml.createContext();
-
-// Create a graph with dynamic shaped inputs.
-const builder = new MLGraphBuilder(context);
-const descA = {type: 'float32', dimensions: [-1, 4]};
-const a = builder.input('a', descA);
-const descB = {type: 'float32', dimensions: [4, -1]};
-const b = builder.input('b', descB);
-const c = builder.matmul(a, b);
-const graph = builder.build({'c': c});
-
-function allocateAndCompute(shapeA, shapeB, shapeC) {
-  const bufferA = new Float32Array(sizeOfShape(shapeA)).fill(0.5);
-  const bufferB = new Float32Array(sizeOfShape(shapeB)).fill(0.5);
-  const bufferC = new Float32Array(sizeOfShape(shapeC));
-
-  // Specify the shape of inputs when computing.
-  const inputs = {
-    'a': {resource: bufferA, dimensions: shapeA},
-    'b': {resource: bufferB, dimensions: shapeB},
-  };
-  const outputs = {'c': bufferC};
-  graph.compute(inputs, outputs);
-  console.log(`values: ${bufferC}`);
-}
-
-allocateAndCompute([3, 4], [4, 3], [3, 3]);
-allocateAndCompute([4, 4], [4, 4], [4, 4]);
-allocateAndCompute([5, 4], [4, 5], [5, 5]);
-
-
- -
-The following code showcases the computation with optional outputs. -
-const context = navigator.ml.createContext();
+    : finish(descriptor)
+    ::
+        Complete the recording of the command sequence and return a corresponding {{GPUCommandBuffer}}.
 
-// Build a graph with two outputs.
-const builder = new MLGraphBuilder(context);
-const descA = {type: 'float32', dimensions: [3, 4]};
-const a = builder.input('a', descA);
-const descB = {type: 'float32', dimensions: [4, 3]};
-const bufferB = new Float32Array(sizeOfShape(descB.dimensions)).fill(0.5);
-const b = builder.constant(descB, bufferB);
-const descC = {type: 'float32', dimensions: [3, 3]};
-const bufferC = new Float32Array(sizeOfShape(descC.dimensions)).fill(1);
-const c = builder.constant(descC, bufferC);
-const d = builder.matmul(a, b);
-const e = builder.add(d, c);
-const graph = builder.build({'d': d, 'e': e});
+        
+ **Called on:** {{MLCommandEncoder}} |this|. -const bufferA = new Float32Array(sizeOfShape(descA.dimensions)).fill(0.5); -const inputs = {'a': bufferA}; - -// Compute d. -const bufferD = new Float32Array(sizeOfShape([3, 3])); -graph.compute(inputs, {'d': bufferD}); -console.log(`values: ${bufferD}`); + **Arguments:** +
+                |descriptor|: an {{GPUCommandBufferDescriptor}}. Descriptor of the command buffer.
+            
-// Compute e. -const bufferE = new Float32Array(sizeOfShape([3, 3])); -graph.compute(inputs, {'e': bufferE}); -console.log(`values: ${bufferE}`); -
-
+ **Returns:** {{GPUCommandBuffer}}. + +
Examples {#examples} ===================== @@ -2435,7 +2684,7 @@ const inputs = { 'input2': inputBuffer2, }; const outputs = {'output': outputBuffer}; -graph.compute(inputs, outputs); +context.compute(graph, inputs, outputs); console.log('Output value: ' + outputBuffer); // Output value: 2.25,2.25,2.25,2.25,2.25,2.25,2.25,2.25 From 4bad865e37796d4dffa691a2c0a449e8f7cc7772 Mon Sep 17 00:00:00 2001 From: Chai Chaoweeraprasit Date: Thu, 10 Mar 2022 00:03:13 -0800 Subject: [PATCH 02/13] Fixing multiple refs on Promise type. --- index.bs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index.bs b/index.bs index 27327e20..79bc8bc2 100644 --- a/index.bs +++ b/index.bs @@ -839,7 +839,7 @@ Asynchronously carries out the computational workload of a compiled graph {{MLGr |outputs|: an {{MLNamedOutputs}}. The pre-allocated resources of required outputs.
- **Returns:** {{Promise}}<{{undefined}}>. + **Returns:** Promise<{{undefined}}>. 1. If any of the following requirements are unmet, then throw a {{DataError}} {{DOMException}} and stop.
1. For each |key| -> |value| of |inputs|: @@ -896,7 +896,7 @@ Asynchronously carries out the computational workload of a compiled graph {{MLGr 1. Throw a {{DataError}} {{DOMException}} and stop. 1. Else: 1. Set the values of |value| to the values of |outputTensor|. - 1. Return {{Promise}}<{{undefined}}>. + 1. Return Promise<{{undefined}}>.
### createCommandEncoder ### {#api-mlcontext-createcommandencoder} From 1aeabcab97b8144f43060b8390dfff67faabe557 Mon Sep 17 00:00:00 2001 From: Chai Chaoweeraprasit Date: Thu, 10 Mar 2022 07:15:15 -0800 Subject: [PATCH 03/13] Fix up over-aggressive search/replace. --- index.bs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/index.bs b/index.bs index 79bc8bc2..70243b46 100644 --- a/index.bs +++ b/index.bs @@ -910,7 +910,7 @@ Create {{MLCommandEncoder}} interface used to record the ML workload onto a {{GP ## MLOperandDescriptor ## {#api-mloperanddescriptor} The {{ML/createContext()}} method steps are: 1. If [=this=]'s [=relevant global object=]'s [=associated Document=] is not [=allowed to use=] the [=webnn-feature|webnn=] feature, then throw a "{{SecurityError!!exception}}" {{DOMException}} and abort these steps. 1. Let |context| be a new {{MLContext}} object. -1. Switch on the method's first argument: +1. Switch on the options specified in the {{MLContextOptions}}
-
{{MLContextOptions}} -
Set |context|.{{[[contextType]]}} to [=default-context|default=]. -
Set |context|.{{[[devicePreference]]}} to the value of {{MLContextOptions}}'s {{devicePreference}} member. -
Set |context|.{{[[powerPreference]]}} to the value of {{MLContextOptions}}'s {{powerPreference}} member. - -
{{WebGLRenderingContext}} -
Set |context|.{{[[contextType]]}} to [=webgl-context|webgl=]. -
Set |context|.{{[[devicePreference]]}} to "[=device-preference-gpu|gpu=]". -
Set |context|.{{[[powerPreference]]}} to "[=power-preference-default|default=]". - -
{{GPUDevice}} +
{{MLContextOptions}} != null && {{MLContextOptions}}.{{deviceType}} == "gpu" +
Set |context|.{{[[deviceType]]}} to "[=device-type-gpu|gpu=]". +
Set |context|.{{[[powerPreference]]}} to the value of {{MLContextOptions}}'s {{powerPreference}}. +
+
{{MLContextOptions}}.{{glContext}} != null +
Set |context|.{{[[contextType]]}} to [=webgpu-context|webgl=]. +
Otherwise
Set |context|.{{[[contextType]]}} to [=webgpu-context|webgpu=]. -
Set |context|.{{[[devicePreference]]}} to "[=device-preference-gpu|gpu=]". -
Set |context|.{{[[powerPreference]]}} to "[=power-preference-default|default=]". - +
Otherwise -
Set |context|.{{[[contextType]]}} to [=default-context|default=]. -
Set |context|.{{[[devicePreference]]}} to "[=device-preference-default|default=]". +
Set |context|.{{[[contextType]]}} to [=cpu-context|cpu=]. +
Set |context|.{{[[deviceType]]}} to "[=device-type-cpu|cpu=]".
Set |context|.{{[[powerPreference]]}} to "[=power-preference-default|default=]".
1. Return |context|. -Note: When {{[[contextType]]}} is set to "[=webgl-context|webgl=]" or "[=webgpu-context|webgpu=]", [=device preference=] "[=device-preference-gpu|gpu=]" is implied and {{[[devicePreference]]}} is set to "[=device-preference-gpu|gpu=]" and {{[[powerPreference]]}} is set to "[=power-preference-default|default=]". - ### Permissions Policy Integration ### {#permissions-policy-integration} This specification defines a policy-controlled feature identified by the @@ -604,26 +589,24 @@ string "webnn". Its default allowlist is 'self'. ## MLContext ## {#api-mlcontext} -The {{MLContext}} interface represents a global state of neural network compute workload and execution processes. Each {{MLContext}} object has associated [=context type=], [=device preference=] and [=power preference=]. +The {{MLContext}} interface represents a global state of neural network compute workload and execution processes. Each {{MLContext}} object has associated [=context type=], [=device type=] and [=power preference=]. The context type is the type of the execution context that manages the resources and facilitates the compilation and execution of the neural network graph:
-
"default"
-
Context created per the user agent's preference.
+
"cpu"
+
Context created for CPU execution.
"webgl"
Context created from WebGL rendering context.
"webgpu"
Context created from WebGPU device.
-The device preference indicates the preferred kind of device to be used. It is one of the following: +The device type indicates the kind of device used for the context. It is one of the following:
-
"default"
-
The user agent selects the most suitable device to use.
-
"gpu"
+
"cpu"
+
Provides the broadest compatibility and usability across all client devices with varying degrees of performance.
+
"gpu"
Provides the broadest range of achievable performance across graphics hardware platforms from consumer devices to professional workstations.
-
"cpu"
-
Provides the broadest reach of software compute availability, but with limited scalability of execution performance on the more complex neural networks.
The power preference indicates preference as related to power consumption. It is one of the following: @@ -660,8 +643,6 @@ interface MLContext { undefined compute(MLGraph graph, MLNamedArrayInputs inputs, MLNamedArrayOutputs outputs); Promise computeAsync(MLGraph graph, MLNamedInputs inputs, MLNamedOutputs outputs); - - MLCommandEncoder createCommandEncoder(); }; @@ -671,16 +652,20 @@ interface MLContext { : \[[contextType]] of type [=context type=] :: The {{MLContext}}'s [=context type=]. - : \[[devicePreference]] of type [=device preference=] + : \[[deviceType]] of type [=device type=] :: - The {{MLContext}}'s [=device preference=]. + The {{MLContext}}'s [=device type=]. : \[[powerPreference]] of type [=power preference=] :: The {{MLContext}}'s [=power preference=]. +
+When the {{[[contextType]]}} is set to [=webgpu-context|webgpu=] but the {{MLContextOptions}}.{{gpuDevice}} is not specified, the user agent is responsible for creating an internal GPU device that operates within the context and is capable of ML workload submission on behalf of the calling application. In this setting however, only {{ArrayBufferView}} inputs and outputs are allowed in and out of the graph execution since the application has no way to know what type of internal GPU device is being created on their behalf. In this case, the user agent is responsible for automatic uploads and downloads of the inputs and outputs to and from the GPU memory using this said internal device. +
+ ### compute ### {#api-mlcontext-compute} -Synchronously carries out the computational workload of a compiled graph {{MLGraph}} on the calling thread, which must be a worker thread, to produce results as defined by the operations in the graph. This method of execution requires an {{MLContext}} created with {{MLContextOptions}} with the {{MLDevicePreference}} option set to either "cpu" or "default" resolved to a CPU context. Otherwise, it throws an {{OperationError}} exception. +Synchronously carries out the computational workload of a compiled graph {{MLGraph}} on the calling thread, which must be a worker thread, to produce results as defined by the operations in the graph. This method of execution requires an {{MLContext}} created with {{MLContextOptions}} with the {{MLDeviceType}} option set to "cpu". Otherwise, it throws an {{OperationError}} exception.
@@ -828,7 +813,7 @@ console.log(`values: ${bufferE}`);
### computeAsync ### {#api-mlcontext-computeasync} -Asynchronously carries out the computational workload of a compiled graph {{MLGraph}} on a worker thread to avoid blocking the calling thread while producing results as defined by the operations in the graph. This method of execution requires an {{MLContext}} created with {{MLContextOptions}} or {{WebGLRenderingContext}}. Otherwise, it throws an {{OperationError}} exception. +Asynchronously carries out the computational workload of a compiled graph {{MLGraph}} on a separate timeline, either on a worker thread for the CPU execution, or on a GPU timeline for the submission of GPU workload on the command queue. The asynchronous nature of this call avoids blocking the calling thread while the computation for result is ongoing. This method of execution is supported when the {{MLContext}} is created with {{MLContextOptions}} with the {{MLDeviceType}} option set to either "cpu" or "gpu".
@@ -899,13 +884,10 @@ Asynchronously carries out the computational workload of a compiled graph {{MLGr 1. Return Promise<{{undefined}}>.
-### createCommandEncoder ### {#api-mlcontext-createcommandencoder} -Create {{MLCommandEncoder}} interface used to record the ML workload onto a {{GPUCommandBuffer}} to allow mixing of ML workload with other GPU workloads in an application that leverages WebGPU. This method only succeeds on an {{MLContext}} created with {{GPUDevice}}. Otherwise, it throws an {{OperationError}} exception. - -
+
+When the {{[[contextType]]}} of {{MLContext}} is set to [=webgpu-context|webgpu=] with the {{MLContextOptions}}.{{gpuDevice}} also specified, the user agent queues the ML workload needed to compute the result of the operations in the graph in an internal command buffer before submitting it to the {{gpuDevice}}'s default {{GPUQueue}}. The user agent then returns the call with a {{Promise}} that resolves once the queue finishes processing all the work submitted to it. - **Returns:** - - an {{MLCommandEncoder}}. The command encoder used to record ML workload on the GPU. +It is strongly recommended that the internal command buffer used to submit the workload is reused across {{MLContext/computeAsync()}} calls when it is not in use for efficiency.
## MLOperandDescriptor ## {#api-mloperanddescriptor} @@ -2464,148 +2446,6 @@ interface MLGraph {}; The underlying implementation provided by the User Agent. -## MLCommandEncoder ## {#api-mlcommandencoder} -The {{MLCommandEncoder}} interface represents a method of execution that synchronously records the computational workload of a compiled graph {{MLGraph}} to a GPU command buffer {{GPUCommandBuffer}} on the calling thread. Since the workload is not immediately executed, just recorded, this method allows more flexibility for the caller to determine how and when the recorded commands will be submitted for execution on the GPU relative to other GPU workload on the same queue. - - - -{{MLCommandEncoder}} has the following internal slots: - -
- : \[[context]] of type {{MLContext}} - :: - The context of type {{MLContext}} associated with this {{MLCommandEncoder}}. - - : \[[implementation]] - :: - The underlying implementation provided by the User Agent. -
- -
- : initializeGraph(graph, inputs) - :: - Record the initialization of the graph {{MLGraph}} on the GPU command buffer {{GPUCommandBuffer}} with constant inputs {{MLNamedGPUInputs}} such as weight inputs. This is a necessary step for optimal performance as it allows the underlying platform an opportunity to prepare and optimize constant input data for the following execution of computational wordloads on the queue. It should only be done once per graph. - -
- **Called on:** {{MLCommandEncoder}} |this|. - - **Arguments:** -
-                |graph|: an {{MLGraph}}. The compiled graph to be executed.
-                |inputs|: an {{MLNamedGPUInputs}}. The resources and optional dimensions of constant inputs.
-            
- - **Returns:** {{undefined}}. -
- - : dispatch(graph, inputs, outputs) - :: - Record the computational workload of the {{MLGraph}} on the GPU command buffer {{GPUCommandBuffer}} with {{MLNamedGPUInputs}} and {{MLNamedGPUOutputs}}. Return once the recording is completed. - -
- **Called on:** {{MLCommandEncoder}} |this|. - - **Arguments:** -
-                |graph|: an {{MLGraph}}. The compiled graph to be executed.
-                |inputs|: an {{MLNamedGPUInputs}}. The resources and optional dimensions of inputs.
-                |outputs|: an {{MLNamedGPUOutputs}}. The pre-allocated resources of required outputs.
-            
- - **Returns:** {{undefined}}. - - 1. If any of the following requirements are unmet, then throw a {{DataError}} {{DOMException}} and stop. -
- 1. For each |key| -> |value| of |inputs|: - 1. |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|] must exist. - 1. Let |inputDesc| be |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|]. - 1. Let |inputSize| be 1. - 1. If |value| is an {{MLGPUInput}}, then: - 1. The length of |value|.{{MLGPUInput/dimensions}} must be the same as the length of |inputDesc|.{{MLOperandDescriptor/dimensions}}. - 1. Let |i| be 0. - 1. While true: - 1. Let |dimension| be |value|.{{MLGPUInput/dimensions}}[|i|]. - 1. |dimension| must be greater than 0. - 1. If |inputDesc|.{{MLOperandDescriptor/dimensions}}[|i|] is greater than 0, then |dimension| must be equal to |inputDesc|.{{MLOperandDescriptor/dimensions}}[|i|]. - 1. Set |inputSize| to the product of |inputSize| and |dimension|. - 1. Increment |i| by 1. - 1. If |i| if equal to the length of |value|.{{MLGPUInput/dimensions}}, then break. - 1. Else: - 1. For each |dimension| of |inputDesc|.{{MLOperandDescriptor/dimensions}}: - 1. The value of |dimension| must be greater than 0. - 1. Set |inputSize| to the product of |inputSize| and |dimension|. - 1. If |value| is an {{MLGPUInput}}, then let |resource| be |value|.{{MLGPUInput/resource}}. - 1. If |value| is an {{MLGPUResource}}, then let |resource| be |value|. - - 1. For each |key| -> |value| of |outputs|: - 1. |graph|.{{MLGraph/[[outputNames]]}}[|key|] must exist. -
- - 1. For each |key| -> |value| of |inputs|: - 1. Let |inputDesc| be |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|]. - 1. Let |inputTensor| be a new tensor for |graph|.{{MLGraph/[[implementation]]}} of data type that is compatible with |inputDesc|.{{MLOperandDescriptor/type}}. - 1. If |value| is an {{MLGPUInput}}, then: - 1. Set the dimensions of |inputTensor| to |value|.{{MLGPUInput/dimensions}}. - 1. Else: - 1. Set the dimensions of |inputTensor| to |inputDesc|.{{MLOperandDescriptor/dimensions}}. - 1. If |value| is an {{MLGPUInput}}, then: - 1. Set the values of |inputTensor| to the values of |value|.{{MLGPUInput/resource}}. - 1. If |value| is an {{MLGPUResource}}, then: - 1. Set the values of |inputTensor| to the values of |value|. - 1. Set the input of |graph|.{{MLGraph/[[implementation]]}} that is associated with |key| to |inputTensor|. - 1. For each |key| -> |value| of |outputs|: - 1. Issue a compute request for output of |graph|.{{MLGraph/[[implementation]]}} that is associated with |key|. - 1. Wait for the compute request to be completed. - 1. If there is an error returned by |graph|.{{MLGraph/[[implementation]]}}, then: - 1. Throw an {{OperationError}} {{DOMException}} and stop. - 1. Else: - 1. Let |outputTensor| be the output tensor returned by |graph|.{{MLGraph/[[implementation]]}}. - 1. If the kind of |value| is not compatible with the value type of |outputTensor|, then throw a {{DataError}} {{DOMException}} and stop. - 1. Let |outputSize| be 1. - 1. For each |dimension| of dimensions of |outputTensor|: - 1. Set |outputSize| to the product of |outputSize| and |dimension|. - 1. If |outputSize| is greater than the length of |value|, then: - 1. Throw a {{DataError}} {{DOMException}} and stop. - 1. Else: - 1. Set the values of |value| to the values of |outputTensor|. - 1. Return {{undefined}}. -
- - : finish(descriptor) - :: - Complete the recording of the command sequence and return a corresponding {{GPUCommandBuffer}}. - -
- **Called on:** {{MLCommandEncoder}} |this|. - - **Arguments:** -
-                |descriptor|: an {{GPUCommandBufferDescriptor}}. Descriptor of the command buffer.
-            
- - **Returns:** {{GPUCommandBuffer}}. -
-
- Examples {#examples} ===================== From 26d5bc185144a00ad82cd2f4aefc2e8e98f3e5c2 Mon Sep 17 00:00:00 2001 From: Chai Chaoweeraprasit Date: Sun, 27 Mar 2022 17:04:35 -0700 Subject: [PATCH 05/13] Fix missing graph param --- explainer.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/explainer.md b/explainer.md index 50f29393..4cd1d8b8 100644 --- a/explainer.md +++ b/explainer.md @@ -47,7 +47,7 @@ const bufferB = new Float32Array(4).fill(0.8); const bufferC = new Float32Array(4); const inputs = {'A': bufferA, 'B': bufferB}; const outputs = {'C': bufferC}; -context.compute(inputs, outputs); +context.compute(graph, inputs, outputs); // The computed result of [[1, 1], [1, 1]] is in the buffer associated with // the output operand. console.log('Output value: ' + bufferC); @@ -154,7 +154,7 @@ export class NSNet2 { 'gru94': gru94Buffer, 'gru157': gru157Buffer }; - return this.context.compute(inputs, outputs); + return this.context.compute(this.graph, inputs, outputs); } } ``` From 3feadd7885c1f7e4b8fafea43c705a942cfcdee5 Mon Sep 17 00:00:00 2001 From: Chai Chaoweeraprasit Date: Sun, 27 Mar 2022 17:25:21 -0700 Subject: [PATCH 06/13] Fix build break due to merge conflict. --- index.bs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index.bs b/index.bs index 285c2dcb..ed2d57fe 100644 --- a/index.bs +++ b/index.bs @@ -398,7 +398,7 @@ In order to not allow an attacker to target a specific implementation that may c Issue: Hinting partially mitigates the concern. Investigate additional mitigations. -The API design minimizes the attack surface for the compiled computational graph. The {{MLGraphBuilder}} interface that hosts the various operations is a data definition API and as such doesn't execute anything, only constructs data. What follows, is that the potential for an attack is limited to when binding the data to the graph before executing it by invoking the {{MLGraph/compute()}} method. This enables implementers to focus on hardening the {{MLGraph/compute()}} method. For example, by making sure it honors the boundary of data and fails appropriately when the bounds are not respected. +The API design minimizes the attack surface for the compiled computational graph. The {{MLGraphBuilder}} interface that hosts the various operations is a data definition API and as such doesn't execute anything, only constructs data. What follows, is that the potential for an attack is limited to when binding the data to the graph before executing it by invoking the {{MLContext/compute()}} method. This enables implementers to focus on hardening the {{MLContext/compute()}} method. For example, by making sure it honors the boundary of data and fails appropriately when the bounds are not respected. Purpose-built Web APIs for measuring high-resolution time mitigate against timing attacks using techniques such as resolution reduction, adding jitter, detection of abuse and API call throttling [[hr-time-3]]. The practical deployment of WebNN implementations are likely to bring enough jitter to make timing attacks impractical (e.g. because they would use IPC) but implementers are advised to consider and test their implementations against timing attacks. From 6a24927b3c8212ef4e6316558e66dc8019bce286 Mon Sep 17 00:00:00 2001 From: Chai Chaoweeraprasit Date: Sun, 27 Mar 2022 17:36:13 -0700 Subject: [PATCH 07/13] Fix build break: Disambiguate the WebIDL link to {{Promise}} definition. 
--- index.bs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/index.bs b/index.bs index ed2d57fe..7d6eda03 100644 --- a/index.bs +++ b/index.bs @@ -33,6 +33,9 @@ urlPrefix: https://gpuweb.github.io/gpuweb/; spec: WEBGPU text: GPUQueue; url: queues text: GPUCommandBuffer; url: command-buffers text: GPUCommandBufferDescriptor; url: dictdef-gpucommandbufferdescriptor +urlPrefix: https://webidl.spec.whatwg.org/; spec: WEBIDL + type: interface + text: Promise; url: idl-promise
 {

From ef9262bfdb2c9ba24553d658fa4e238a81bf5f39 Mon Sep 17 00:00:00 2001
From: Chai Chaoweeraprasit 
Date: Sun, 17 Apr 2022 23:02:16 -0700
Subject: [PATCH 08/13] GPU support on sync execution (limited to worker
 thread). Default context only supports CPU inputs and outputs (automatic
 upload/download). Reintroduce MLCommandEncoder for WebGPU interop.

---
 index.bs | 305 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 229 insertions(+), 76 deletions(-)

diff --git a/index.bs b/index.bs
index 7d6eda03..70c182b2 100644
--- a/index.bs
+++ b/index.bs
@@ -466,6 +466,18 @@ semantics, with no side effects.
 Each operation invocation conceptually returns a distinct new value, without
 changing the value of any other {{MLOperand}}.
 
+The runtime values (of {{MLOperand}}s) are tensors, which are essentially multidimensional
+arrays. The representation of the tensors is implementation dependent, but it typically
+includes the array data stored in some buffer (memory) and some metadata describing the
+array data (such as its shape). 
+
+As mentioned above, the operations have a functional semantics. This allows the implementation
+to potentially share the array data between multiple tensors. For example, the implementation
+of operations such as reshape, or slice, or squeeze may return a view of its input tensor
+that shares the same buffer as the input tensor. (In the case of reshape or squeeze,
+the entire data is shared, while in the case of slice, a part of the input data is shared.)
+The implementation may use views, as above, for intermediate values.
+
 The {{MLGraphBuilder/build()}} method of the {{MLGraphBuilder}} interface is used to compile and optimize
 the computation graph used to compute one or more specified outputs. The key
 purpose of the compilation step is to enable optimizations that span two or
@@ -473,9 +485,8 @@ more operations, such as operation or loop fusion.
 
 Once the {{MLGraph}} is constructed, there are multiple ways by which the graph may be executed. The
 {{MLContext/compute()}} method carries out the execution of the graph immediately 
-on the calling thread, which must also be a worker thread. The execution produces the results of the computation 
-from all the inputs bound to the graph. This type of execution is supported only when the computational device 
-bound to the context is a CPU device.
+on the calling thread, which must also be a worker thread, either on a CPU or GPU device. The execution 
+produces the results of the computation from all the inputs bound to the graph.
 
 The {{MLContext/computeAsync()}} method performs the execution of the graph asynchronously
 either on a parallel timeline in a separate worker thread for the CPU execution or on a GPU timeline in a GPU 
@@ -486,38 +497,32 @@ time the operation is successfully completed on the offloaded timeline at which
 signaled. This type of execution supports both the CPU and GPU device, including when the context is created 
 from the {{WebGLRenderingContext}}.
 
-In each of these various execution methods, the caller supplies the input values using {{MLNamedInputs}}
-or equivalent type, binding the input {{MLOperand}}s to their values. The caller supplies pre-allocated
-buffers for output {{MLOperand}}s using {{MLNamedOutputs}} or equivalent type.
+In both the {{MLContext/compute()}} and {{MLContext/computeAsync()}} execution methods, the caller supplies 
+the input values using {{MLNamedArrayInputs}}, binding the input {{MLOperand}}s to their values. The caller
+then supplies pre-allocated buffers for output {{MLOperand}}s using {{MLNamedArrayOutputs}}.
 
-The runtime values (of {{MLOperand}}s) are tensors, which are essentially multidimensional
-arrays. The representation of the tensors is implementation dependent, but it typically
-includes the array data stored in some buffer (memory) and some metadata describing the
-array data (such as its shape). 
-
-As mentioned above, the operations have a functional semantics. This allows the implementation
-to potentially share the array data between multiple tensors. For example, the implementation
-of operations such as reshape, or slice, or squeeze may return a view of its input tensor
-that shares the same buffer as the input tensor. (In the case of reshape or squeeze,
-the entire data is shared, while in the case of slice, a part of the input data is shared.)
-The implementation may use views, as above, for intermediate values.
+The {{MLCommandEncoder}} interface created by the {{MLContext/createCommandEncoder()}} method supports 
+a graph execution method that provides the maximum flexibility to callers that also utilize WebGPU in their 
+application. It does this by placing the workload required to initialize and compute the results of the 
+operations in the graph onto a {{GPUCommandBuffer}}. The callers are responsible for the eventual submission 
+of this workload on the {{GPUQueue}} through the WebGPU queue submission mechanism. Once the submitted workload 
+is completely executed, the result is available in the bound output buffers.
 
 ## Device Selection ## {#programming-model-device-selection}
 
-An {{MLContext}} interface represents a global state of neural network execution. One of the important context states is the underlying execution device that manages the resources and facilitates the compilation and the eventual execution of the neural network graph. An {{MLContext}} could be created from a specific GPU device such as {{GPUDevice}} or {{WebGLRenderingContext}} that is already in use by the application, in which case the corresponding {{GPUBuffer}} or {{WebGLBuffer}} resources used as graph constants, as well as the {{GPUTexture}} and {{WebGLTexture}} as graph inputs must also be created from the same device. In a multi-adapter configuration, the device used for {{MLContext}} must be created from the same adapter as the device used to allocate the resources referenced in the graph.
+An {{MLContext}} interface represents a global state of neural network execution. One of the important context states is the underlying execution device that manages the resources and facilitates the compilation and the eventual execution of the neural network graph. In addition to the default method of creation with {{MLContextOptions}}, an {{MLContext}} could also be created from a specific GPU device such as {{GPUDevice}} or {{WebGLRenderingContext}} that is already in use by the application, in which case the corresponding {{GPUBuffer}} or {{WebGLBuffer}} resources used as graph constants, as well as the {{GPUTexture}} and {{WebGLTexture}} as graph inputs must also be created from the same device. In a multi-adapter configuration, the device used for {{MLContext}} must be created from the same adapter as the device used to allocate the resources referenced in the graph.
 
 In a situation when a GPU context executes a graph with a constant or an input in the system memory as an {{ArrayBufferView}}, the input content is automatically uploaded from the system memory to the GPU memory, and downloaded back to the system memory of an {{ArrayBufferView}} output buffer at the end of the graph execution. These data upload and download cycles only occur when the execution device requires the data to be copied out of and back into the system memory, such as in the case of the GPU. They don't occur when the device is a CPU device. Additionally, the result of the graph execution is in a known layout format. While the execution may be optimized for a native memory access pattern in an intermediate result within the graph, the output of the last operation of the graph must convert the content back to a known layout format at the end of the graph in order to maintain the expected behavior from the caller's perspective.
 
 When an {{MLContext}} is created with {{MLContextOptions}}, the user agent selects and creates the underlying execution device by taking into account the application's [=power preference=] and [=device type=] specified in the {{MLPowerPreference}} and {{MLDeviceType}} options.
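
<div class="example">
For example, an application that prefers GPU execution with minimal power draw might pass hints like the following; the specific {{MLPowerPreference}} value shown is assumed to be one of the defined enumeration values.
<pre highlight="js">
// The user agent treats these options as hints when selecting the device.
const context = navigator.ml.createContext({
  deviceType: 'gpu',            // an MLDeviceType value
  powerPreference: 'low-power'  // an MLPowerPreference value
});
</pre>
</div>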
 
-The following table summarizes the types of resource supported by the device option.
+The following table summarizes the types of resource supported by a context created through the different methods of creation:
 
 
-  <tr><td>Device Option<td>ArrayBufferView<td>GPUBuffer<td>GPUTexture<td>WebGLBuffer<td>WebGLTexture
-  <tr><td>cpu<td>Yes<td>No<td>No<td>No<td>No
-  <tr><td>gpu (GPUDevice == null)<td>Yes<td>No<td>No<td>No<td>No
-  <tr><td>gpu (GPUDevice != null)<td>Yes<td>Yes<td>Yes<td>No<td>No
+  <tr><td>Creation method<td>ArrayBufferView<td>GPUBuffer<td>GPUTexture<td>WebGLBuffer<td>WebGLTexture
+  <tr><td>MLContextOptions<td>Yes<td>No<td>No<td>No<td>No
+  <tr><td>GPUDevice<td>Yes<td>Yes<td>Yes<td>No<td>No
   <tr><td>WebGLRenderingContext<td>Yes<td>No<td>No<td>Yes<td>Yes
 </table>
@@ -554,33 +559,34 @@ enum MLPowerPreference {
 dictionary MLContextOptions {
   MLDeviceType deviceType = "cpu";
   MLPowerPreference powerPreference = "default";
-  GPUDevice gpuDevice = null;
-  WebGLRenderingContext glContext = null;
 };

 [SecureContext, Exposed=(Window, DedicatedWorker)]
 interface ML {
   MLContext createContext(optional MLContextOptions options = {});
+  MLContext createContext(GPUDevice gpuDevice);
+  MLContext createContext(WebGLRenderingContext glContext);
 };

 The {{ML/createContext()}} method steps are:

 1. If [=this=]'s [=relevant global object=]'s [=associated Document=] is not [=allowed to use=] the [=webnn-feature|webnn=] feature, then throw a "{{SecurityError!!exception}}" {{DOMException}} and abort these steps.
 1. Let |context| be a new {{MLContext}} object.
-1. Switch on the options specified in the {{MLContextOptions}}
+1. Switch on the method's first argument:
     <dl>
-    <dt>{{MLContextOptions}} != null && {{MLContextOptions}}.{{deviceType}} == "gpu"</dt>
-    <dd>
-        Set |context|.{{[[deviceType]]}} to "[=device-type-gpu|gpu=]".
+    <dt>{{MLContextOptions}}</dt>
+    <dd>
+        Set |context|.{{[[contextType]]}} to [=default-context|default=].
+        Set |context|.{{[[deviceType]]}} to the value of {{MLContextOptions}}'s {{deviceType}}.
         Set |context|.{{[[powerPreference]]}} to the value of {{MLContextOptions}}'s {{powerPreference}}.
-    </dd>
-    <dt>{{MLContextOptions}}.{{glContext}} != null</dt>
-    <dd>
-        Set |context|.{{[[contextType]]}} to [=webgpu-context|webgl=].
-    </dd>
-    <dt>Otherwise</dt>
+    </dd>
+    <dt>{{GPUDevice}}</dt>
+    <dd>
         Set |context|.{{[[contextType]]}} to [=webgpu-context|webgpu=].
-    </dd>
-    <dt>Otherwise</dt>
-    <dd>
-        Set |context|.{{[[contextType]]}} to [=cpu-context|cpu=].
-        Set |context|.{{[[deviceType]]}} to "[=device-type-cpu|cpu=]".
+        Set |context|.{{[[deviceType]]}} to "[=device-type-gpu|gpu=]".
+        Set |context|.{{[[powerPreference]]}} to "[=power-preference-default|default=]".
+    </dd>
+    <dt>{{WebGLRenderingContext}}</dt>
+    <dd>
+        Set |context|.{{[[contextType]]}} to [=webgl-context|webgl=].
+        Set |context|.{{[[deviceType]]}} to "[=device-type-gpu|gpu=]".
         Set |context|.{{[[powerPreference]]}} to "[=power-preference-default|default=]".
+    </dd>
     </dl>
 1. Return |context|.

@@ -596,8 +602,8 @@ The {{MLContext}} interface represents a global state of neural network compute
 The context type is the type of the execution context that manages the resources and facilitates the compilation and execution of the neural network graph:
-  <dt>"cpu"</dt>
-  <dd>Context created for CPU execution.</dd>
+  <dt>"default"</dt>
+  <dd>Context created per user preference options.</dd>
   <dt>"webgl"</dt>
   <dd>Context created from WebGL rendering context.</dd>
   <dt>"webgpu"</dt>
@@ -623,30 +629,16 @@ The power preference indicates preference as related to power consump
 {{MLContext}} has the following internal slots:

@@ -664,22 +656,28 @@ interface MLContext {
-When the {{[[contextType]]}} is set to [=webgpu-context|webgpu=] but the {{MLContextOptions}}.{{gpuDevice}} is not specified, the user agent is responsible for creating an internal GPU device that operates within the context and is capable of ML workload submission on behalf of the calling application. In this setting however, only {{ArrayBufferView}} inputs and outputs are allowed in and out of the graph execution since the application has no way to know what type of internal GPU device is being created on their behalf. In this case, the user agent is responsible for automatic uploads and downloads of the inputs and outputs to and from the GPU memory using this said internal device.
+When the {{[[contextType]]}} is set to [=default-context|default=] with the {{MLContextOptions}}.{{deviceType}} set to [=device-type-gpu|gpu=], the user agent is responsible for creating an internal GPU device that operates within the context and is capable of ML workload submission on behalf of the calling application. In this setting, however, only {{ArrayBufferView}} inputs and outputs are allowed in and out of the graph execution, since the application has no way to know what type of internal GPU device is created on its behalf. In this case, the user agent is responsible for the automatic uploads and downloads of the inputs and outputs to and from the GPU memory using this internal device.
-### compute ### {#api-mlcontext-compute} -Synchronously carries out the computational workload of a compiled graph {{MLGraph}} on the calling thread, which must be a worker thread, to produce results as defined by the operations in the graph. This method of execution requires an {{MLContext}} created with {{MLContextOptions}} with the {{MLDeviceType}} option set to "cpu". Otherwise, it throws an {{OperationError}} exception. +### Synchronous Execution ### {#api-mlcontext-sync-execution} +Synchronously carries out the computational workload of a compiled graph {{MLGraph}} on the calling thread, which must be a worker thread, to produce results as defined by the operations in the graph. This method of execution requires an {{MLContext}} created with {{MLContextOptions}}. Otherwise, it throws an {{OperationError}} exception. + +
**Arguments:** -
-        |graph|: an {{MLGraph}}. The compiled graph to be executed.
-        |inputs|: an {{MLNamedArrayInputs}}. The resources and optional dimensions of inputs.
-        |outputs|: an {{MLNamedArrayOutputs}}. The pre-allocated resources of required outputs.
-    
+ - *graph*: an {{MLGraph}}. The compiled graph to be executed. + - *inputs*: an {{MLNamedArrayInputs}}. The resources and optional dimensions of inputs. + - *outputs*: an {{MLNamedArrayOutputs}}. The pre-allocated resources of required outputs. **Returns:** {{undefined}}. + 1. If any of the following requirements are unmet, then throw a {{DataError}} {{DOMException}} and stop.
1. For each |key| -> |value| of |inputs|: @@ -815,40 +813,46 @@ console.log(`values: ${bufferE}`);
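
<div class="example">
A non-normative sketch of this synchronous path inside a dedicated worker, assuming a previously built graph with inputs 'a' and 'b' and output 'c' of four elements each (names, shapes, and values are illustrative):
<pre highlight="js">
// Runs on a worker thread; compute() blocks until the results are ready.
const bufferA = new Float32Array(4).fill(1.0);
const bufferB = new Float32Array(4).fill(0.5);
const bufferC = new Float32Array(4);

context.compute(graph, {'a': bufferA, 'b': bufferB}, {'c': bufferC});

// bufferC now holds the computed output values.
console.log(`values: ${bufferC}`);
</pre>
</div>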
-### computeAsync ### {#api-mlcontext-computeasync}
-Asynchronously carries out the computational workload of a compiled graph {{MLGraph}} on a separate timeline, either on a worker thread for the CPU execution, or on a GPU timeline for the submission of GPU workload on the command queue. The asynchronous nature of this call avoids blocking the calling thread while the computation for result is ongoing. This method of execution is supported when the {{MLContext}} is created with {{MLContextOptions}} with the {{MLDeviceType}} option set to either "cpu" or "gpu".
+### Asynchronous Execution ### {#api-mlcontext-async-execution}
+Asynchronously carries out the computational workload of a compiled graph {{MLGraph}} on a separate timeline, either on a worker thread for CPU execution, or on a GPU timeline for the submission of a GPU workload on the command queue. The asynchronous nature of this call avoids blocking the calling thread while the computation of the result is ongoing. This method of execution requires an {{MLContext}} created with {{MLContextOptions}}. Otherwise, it throws an {{OperationError}} exception.
+
**Arguments:** -
-        |graph|: an {{MLGraph}}. The compiled graph to be executed.
-        |inputs|: an {{MLNamedInputs}}. The resources and optional dimensions of inputs.
-        |outputs|: an {{MLNamedOutputs}}. The pre-allocated resources of required outputs.
-    
+ - *graph*: an {{MLGraph}}. The compiled graph to be executed. + - *inputs*: an {{MLNamedArrayInputs}}. The resources and optional dimensions of inputs. + - *outputs*: an {{MLNamedArrayOutputs}}. The pre-allocated resources of required outputs. **Returns:** Promise<{{undefined}}>. + 1. If any of the following requirements are unmet, then throw a {{DataError}} {{DOMException}} and stop.
1. For each |key| -> |value| of |inputs|: 1. |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|] must exist. 1. Let |inputDesc| be |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|]. 1. Let |inputSize| be 1. - 1. If |value| is an {{MLInput}}, then: - 1. The length of |value|.{{MLInput/dimensions}} must be the same as the length of |inputDesc|.{{MLOperandDescriptor/dimensions}}. + 1. If |value| is an {{MLArrayInput}}, then: + 1. The length of |value|.{{MLArrayInput/dimensions}} must be the same as the length of |inputDesc|.{{MLOperandDescriptor/dimensions}}. 1. Let |i| be 0. 1. While true: - 1. Let |dimension| be |value|.{{MLInput/dimensions}}[|i|]. + 1. Let |dimension| be |value|.{{MLArrayInput/dimensions}}[|i|]. 1. |dimension| must be greater than 0. 1. If |inputDesc|.{{MLOperandDescriptor/dimensions}}[|i|] is greater than 0, then |dimension| must be equal to |inputDesc|.{{MLOperandDescriptor/dimensions}}[|i|]. 1. Set |inputSize| to the product of |inputSize| and |dimension|. 1. Increment |i| by 1. - 1. If |i| if equal to the length of |value|.{{MLInput/dimensions}}, then break. + 1. If |i| if equal to the length of |value|.{{MLArrayInput/dimensions}}, then break. 1. Else: 1. For each |dimension| of |inputDesc|.{{MLOperandDescriptor/dimensions}}: 1. The value of |dimension| must be greater than 0. 1. Set |inputSize| to the product of |inputSize| and |dimension|. - 1. If |value| is an {{MLInput}}, then let |resource| be |value|.{{MLInput/resource}}. + 1. If |value| is an {{MLArrayInput}}, then let |resource| be |value|.{{MLArrayInput/resource}}. 1. If |value| is an {{MLResource}}, then let |resource| be |value|. 1. If |resource| is an {{ArrayBufferView}}, then: 1. The kind of |resource| must be compatible with |inputDesc|.{{MLOperandDescriptor/type}} according to [this table](#appendices-mloperandtype-arraybufferview-compatibility). @@ -861,11 +865,11 @@ Asynchronously carries out the computational workload of a compiled graph {{MLGr 1. Let |inputDesc| be |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|]. 1. Let |inputTensor| be a new tensor for |graph|.{{MLGraph/[[implementation]]}} of data type that is compatible with |inputDesc|.{{MLOperandDescriptor/type}}. 1. If |value| is an {{MLInput}}, then: - 1. Set the dimensions of |inputTensor| to |value|.{{MLInput/dimensions}}. + 1. Set the dimensions of |inputTensor| to |value|.{{MLArrayInput/dimensions}}. 1. Else: 1. Set the dimensions of |inputTensor| to |inputDesc|.{{MLOperandDescriptor/dimensions}}. - 1. If |value| is an {{MLInput}}, then: - 1. Set the values of |inputTensor| to the values of |value|.{{MLInput/resource}}. + 1. If |value| is an {{MLArrayInput}}, then: + 1. Set the values of |inputTensor| to the values of |value|.{{MLArrayInput/resource}}. 1. If |value| is an {{MLResource}}, then: 1. Set the values of |inputTensor| to the values of |value|. 1. Set the input of |graph|.{{MLGraph/[[implementation]]}} that is associated with |key| to |inputTensor|. @@ -893,6 +897,19 @@ When the {{[[contextType]]}} of {{MLContext}} is set to [=webgpu-context|webgpu= It is strongly recommended that the internal command buffer used to submit the workload is reused across {{MLContext/computeAsync()}} calls when it is not in use for efficiency.
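
<div class="example">
A non-normative sketch of the asynchronous path, using the same illustrative graph, input, and output names as the synchronous example; the returned promise resolves once the offloaded execution completes:
<pre highlight="js">
// Does not block the calling thread; await the promise before reading outputs.
const inputs = {'a': new Float32Array(4).fill(1.0),
                'b': new Float32Array(4).fill(0.5)};
const outputs = {'c': new Float32Array(4)};

await context.computeAsync(graph, inputs, outputs);
console.log(`values: ${outputs['c']}`);
</pre>
</div>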
+### WebGPU Interoperability ### {#api-mlcontext-webgpu-interop}
+Create an {{MLCommandEncoder}} interface used to record the ML workload onto a WebGPU-compatible {{GPUCommandBuffer}}, allowing the ML workload to be mixed with other GPU workloads in an application that leverages WebGPU. This method only succeeds on an {{MLContext}} created with a {{GPUDevice}}. Otherwise, it throws an {{OperationError}} exception.
+
+ **Returns:** {{MLCommandEncoder}}. The command encoder used to record ML workload on the GPU. +
+ ## MLOperandDescriptor ## {#api-mloperanddescriptor} + +{{MLCommandEncoder}} has the following internal slots: + +
+ : \[[context]] of type {{MLContext}} + :: + The context of type {{MLContext}} associated with this {{MLCommandEncoder}}. + + : \[[implementation]] + :: + The underlying implementation provided by the User Agent. +
+ +### Weight Preprocessing ### {#api-mlcommandencoder-graph-preprocessing} +Record the initialization of the {{MLGraph}} with constant inputs {{MLNamedGPUInputs}} such as weight tensors. This is a necessary step for optimal performance as it gives the platform an opportunity to prepare and optimize constant input data for the subsequent execution of the graph. This method should only be called once per graph. + + + +
+ **Arguments:** + - *graph*: an {{MLGraph}}. The compiled graph to be initialized with constants. + - *constants*: an {{MLNamedGPUInputs}}. The resources and optional dimensions of constant inputs. + + **Returns:** {{undefined}}. +
+ +### Dispatch Execution Commands ### {#api-mlcommandencoder-dispatch-commands} +Record the {{MLGraph}} execution with the inputs {{MLNamedGPUInputs}} and outputs {{MLNamedGPUOutputs}}. + + + +
+ **Arguments:** + - *graph*: an {{MLGraph}}. The compiled graph to be executed. + - *inputs*: an {{MLNamedGPUInputs}}. The resources and optional dimensions of inputs. + - *outputs*: an {{MLNamedGPUOutputs}}. The pre-allocated resources of required outputs. + + **Returns:** {{undefined}}. + + 1. If any of the following requirements are unmet, then throw a {{DataError}} {{DOMException}} and stop. +
+ 1. For each |key| -> |value| of |inputs|: + 1. |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|] must exist. + 1. Let |inputDesc| be |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|]. + 1. Let |inputSize| be 1. + 1. If |value| is an {{MLGPUInput}}, then: + 1. The length of |value|.{{MLGPUInput/dimensions}} must be the same as the length of |inputDesc|.{{MLOperandDescriptor/dimensions}}. + 1. Let |i| be 0. + 1. While true: + 1. Let |dimension| be |value|.{{MLGPUInput/dimensions}}[|i|]. + 1. |dimension| must be greater than 0. + 1. If |inputDesc|.{{MLOperandDescriptor/dimensions}}[|i|] is greater than 0, then |dimension| must be equal to |inputDesc|.{{MLOperandDescriptor/dimensions}}[|i|]. + 1. Set |inputSize| to the product of |inputSize| and |dimension|. + 1. Increment |i| by 1. + 1. If |i| if equal to the length of |value|.{{MLGPUInput/dimensions}}, then break. + 1. Else: + 1. For each |dimension| of |inputDesc|.{{MLOperandDescriptor/dimensions}}: + 1. The value of |dimension| must be greater than 0. + 1. Set |inputSize| to the product of |inputSize| and |dimension|. + 1. If |value| is an {{MLGPUInput}}, then let |resource| be |value|.{{MLGPUInput/resource}}. + 1. If |value| is an {{MLGPUResource}}, then let |resource| be |value|. + 1. For each |key| -> |value| of |outputs|: + 1. |graph|.{{MLGraph/[[outputNames]]}}[|key|] must exist. +
+ + 1. For each |key| -> |value| of |inputs|: + 1. Let |inputDesc| be |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|]. + 1. Let |inputTensor| be a new tensor for |graph|.{{MLGraph/[[implementation]]}} of data type that is compatible with |inputDesc|.{{MLOperandDescriptor/type}}. + 1. If |value| is an {{MLGPUInput}}, then: + 1. Set the dimensions of |inputTensor| to |value|.{{MLGPUInput/dimensions}}. + 1. Else: + 1. Set the dimensions of |inputTensor| to |inputDesc|.{{MLOperandDescriptor/dimensions}}. + 1. If |value| is an {{MLGPUInput}}, then: + 1. Set the values of |inputTensor| to the values of |value|.{{MLGPUInput/resource}}. + 1. If |value| is an {{MLGPUResource}}, then: + 1. Set the values of |inputTensor| to the values of |value|. + 1. Set the input of |graph|.{{MLGraph/[[implementation]]}} that is associated with |key| to |inputTensor|. + 1. For each |key| -> |value| of |outputs|: + 1. Issue a compute request for output of |graph|.{{MLGraph/[[implementation]]}} that is associated with |key|. + 1. Wait for the compute request to be completed. + 1. If there is an error returned by |graph|.{{MLGraph/[[implementation]]}}, then: + 1. Throw an {{OperationError}} {{DOMException}} and stop. + 1. Else: + 1. Let |outputTensor| be the output tensor returned by |graph|.{{MLGraph/[[implementation]]}}. + 1. If the kind of |value| is not compatible with the value type of |outputTensor|, then throw a {{DataError}} {{DOMException}} and stop. + 1. Let |outputSize| be 1. + 1. For each |dimension| of dimensions of |outputTensor|: + 1. Set |outputSize| to the product of |outputSize| and |dimension|. + 1. If |outputSize| is greater than the length of |value|, then: + 1. Throw a {{DataError}} {{DOMException}} and stop. + 1. Else: + 1. Set the values of |value| to the values of |outputTensor|. + 1. Return {{undefined}}. +
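
<div class="example">
A non-normative sketch of recording a dispatch, assuming an already-created encoder, a graph with a dynamically-sized input 'x' and an output 'y', and pre-allocated {{GPUBuffer}} resources on the same device; the dispatch() method name and the shape shown are illustrative.
<pre highlight="js">
// An input with dynamic dimensions is bound with both its resource and shape;
// a fully-specified output can be bound with the GPUBuffer resource directly.
encoder.dispatch(
    graph,
    {'x': {resource: inputBuffer, dimensions: [1, 3, 224, 224]}},
    {'y': outputBuffer});
</pre>
</div>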
+ +### Generate GPU Command Buffer ### {#api-mlcommandencoder-generate-gpu-command-buffer} +Complete the recording of ML workload and return a WebGPU-compatible {{GPUCommandBuffer}} containing the recorded workload. + + + +
+ **Arguments:** + - *descriptor*: an optional {{GPUCommandBufferDescriptor}}. Descriptor of the command buffer. + + **Returns:** {{GPUCommandBuffer}}. +
+ Examples {#examples} ===================== From 3710e713c9da15fcac6d55a84a6c69e530d207be Mon Sep 17 00:00:00 2001 From: Chai Chaoweeraprasit Date: Sun, 17 Apr 2022 23:25:10 -0700 Subject: [PATCH 09/13] Remove undefined `MLResource` type. --- index.bs | 3 --- 1 file changed, 3 deletions(-) diff --git a/index.bs b/index.bs index 70c182b2..0a3f1001 100644 --- a/index.bs +++ b/index.bs @@ -853,7 +853,6 @@ partial interface MLContext { 1. The value of |dimension| must be greater than 0. 1. Set |inputSize| to the product of |inputSize| and |dimension|. 1. If |value| is an {{MLArrayInput}}, then let |resource| be |value|.{{MLArrayInput/resource}}. - 1. If |value| is an {{MLResource}}, then let |resource| be |value|. 1. If |resource| is an {{ArrayBufferView}}, then: 1. The kind of |resource| must be compatible with |inputDesc|.{{MLOperandDescriptor/type}} according to [this table](#appendices-mloperandtype-arraybufferview-compatibility). 1. The length of |resource| must be the same as |inputSize|. @@ -870,8 +869,6 @@ partial interface MLContext { 1. Set the dimensions of |inputTensor| to |inputDesc|.{{MLOperandDescriptor/dimensions}}. 1. If |value| is an {{MLArrayInput}}, then: 1. Set the values of |inputTensor| to the values of |value|.{{MLArrayInput/resource}}. - 1. If |value| is an {{MLResource}}, then: - 1. Set the values of |inputTensor| to the values of |value|. 1. Set the input of |graph|.{{MLGraph/[[implementation]]}} that is associated with |key| to |inputTensor|. 1. For each |key| -> |value| of |outputs|: 1. Issue a compute request for output of |graph|.{{MLGraph/[[implementation]]}} that is associated with |key|. From ff7e4f2295be6a2bbb3a0c5e84f789b2850ca961 Mon Sep 17 00:00:00 2001 From: Chai Chaoweeraprasit Date: Sun, 17 Apr 2022 23:32:29 -0700 Subject: [PATCH 10/13] Remove references to undefined MLInput type. --- index.bs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/index.bs b/index.bs index 0a3f1001..9bbb8350 100644 --- a/index.bs +++ b/index.bs @@ -706,7 +706,6 @@ partial interface MLContext { 1. For each |key| -> |value| of |outputs|: 1. |graph|.{{MLGraph/[[outputNames]]}}[|key|] must exist.
- 1. For each |key| -> |value| of |inputs|: 1. Let |inputDesc| be |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|]. 1. Let |inputTensor| be a new tensor for |graph|.{{MLGraph/[[implementation]]}} of data type that is compatible with |inputDesc|.{{MLOperandDescriptor/type}}. @@ -859,11 +858,10 @@ partial interface MLContext { 1. For each |key| -> |value| of |outputs|: 1. |graph|.{{MLGraph/[[outputNames]]}}[|key|] must exist. - 1. For each |key| -> |value| of |inputs|: 1. Let |inputDesc| be |graph|.{{MLGraph/[[inputDescriptors]]}}[|key|]. 1. Let |inputTensor| be a new tensor for |graph|.{{MLGraph/[[implementation]]}} of data type that is compatible with |inputDesc|.{{MLOperandDescriptor/type}}. - 1. If |value| is an {{MLInput}}, then: + 1. If |value| is an {{MLArrayInput}}, then: 1. Set the dimensions of |inputTensor| to |value|.{{MLArrayInput/dimensions}}. 1. Else: 1. Set the dimensions of |inputTensor| to |inputDesc|.{{MLOperandDescriptor/dimensions}}. From 59bbbe9a4821c6e9b6b5c11384bf9c6e50c60533 Mon Sep 17 00:00:00 2001 From: Chai Chaoweeraprasit Date: Wed, 20 Apr 2022 21:43:27 -0700 Subject: [PATCH 11/13] Remove outdated note section in the computeAsync description. Further clarify graph initialization stage and remove the unnecessary second param. --- index.bs | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/index.bs b/index.bs index 9bbb8350..c358a791 100644 --- a/index.bs +++ b/index.bs @@ -401,7 +401,7 @@ In order to not allow an attacker to target a specific implementation that may c Issue: Hinting partially mitigates the concern. Investigate additional mitigations. -The API design minimizes the attack surface for the compiled computational graph. The {{MLGraphBuilder}} interface that hosts the various operations is a data definition API and as such doesn't execute anything, only constructs data. What follows, is that the potential for an attack is limited to when binding the data to the graph before executing it by invoking the {{MLContext/compute()}} method. This enables implementers to focus on hardening the {{MLContext/compute()}} method. For example, by making sure it honors the boundary of data and fails appropriately when the bounds are not respected. +The API design minimizes the attack surface for the compiled computational graph. The {{MLGraphBuilder}} interface that hosts the various operations is a data definition API and as such doesn't execute anything, only constructs data. What follows, is that the potential for an attack is limited to when binding the data to the graph before executing it by invoking the {{MLContext}}.{{MLContext/compute()}} method. This enables implementers to focus on hardening the {{MLContext}}.{{MLContext/compute()}} method. For example, by making sure it honors the boundary of data and fails appropriately when the bounds are not respected. Purpose-built Web APIs for measuring high-resolution time mitigate against timing attacks using techniques such as resolution reduction, adding jitter, detection of abuse and API call throttling [[hr-time-3]]. The practical deployment of WebNN implementations are likely to bring enough jitter to make timing attacks impractical (e.g. because they would use IPC) but implementers are advised to consider and test their implementations against timing attacks. @@ -461,7 +461,7 @@ At inference time, every {{MLOperand}} will be bound to a tensor (the actual dat The {{MLGraphBuilder}} interface enables the creation of {{MLOperand}}s. 
A key part of the {{MLGraphBuilder}} interface are the operations (such as -{{MLGraphBuilder/gemm()}} and {{MLGraphBuilder/softmax()}}). The operations have a functional +{{MLGraphBuilder}}.{{MLGraphBuilder/gemm()}} and {{MLGraphBuilder}}.{{MLGraphBuilder/softmax()}}). The operations have a functional semantics, with no side effects. Each operation invocation conceptually returns a distinct new value, without changing the value of any other {{MLOperand}}. @@ -478,17 +478,17 @@ that shares the same buffer as the input tensor. (In the case of reshape or sque the entire data is shared, while in the case of slice, a part of the input data is shared.) The implementation may use views, as above, for intermediate values. -The {{MLGraphBuilder/build()}} method of the {{MLGraphBuilder}} interface is used to compile and optimize +The {{MLGraphBuilder}}.{{MLGraphBuilder/build()}} method of the {{MLGraphBuilder}} interface is used to compile and optimize the computation graph used to compute one or more specified outputs. The key purpose of the compilation step is to enable optimizations that span two or more operations, such as operation or loop fusion. Once the {{MLGraph}} is constructed, there are multiple ways by which the graph may be executed. The -{{MLContext/compute()}} method represents a way the execution of the graph is carried out immediately +{{MLContext}}.{{MLContext/compute()}} method represents a way the execution of the graph is carried out immediately on the calling thread, which must also be a worker thread, either on a CPU or GPU device. The execution produces the results of the computation from all the inputs bound to the graph. -The {{MLContext/computeAsync()}} method represents a way the execution of the graph is performed asynchronously +The {{MLContext}}.{{MLContext/computeAsync()}} method represents a way the execution of the graph is performed asynchronously either on a parallel timeline in a separate worker thread for the CPU execution or on a GPU timeline in a GPU command queue. This method returns immediately without blocking the calling thread while the actual execution is offloaded to a different timeline. This type of execution is appropriate when the responsiveness of the calling @@ -497,11 +497,11 @@ time the operation is successfully completed on the offloaded timeline at which signaled. This type of execution supports both the CPU and GPU device, including when the context is created from the {{WebGLRenderingContext}}. -In both the {{MLContext/compute()}} and {{MLContext/computeAsync()}} execution methods, the caller supplies +In both the {{MLContext}}.{{MLContext/compute()}} and {{MLContext}}.{{MLContext/computeAsync()}} execution methods, the caller supplies the input values using {{MLNamedArrayInputs}}, binding the input {{MLOperand}}s to their values. The caller then supplies pre-allocated buffers for output {{MLOperand}}s using {{MLNamedArrayOutputs}}. -The {{MLCommandEncoder}} interface created by the {{MLContext/createCommandEncoder()}} method supports +The {{MLCommandEncoder}} interface created by the {{MLContext}}.{{MLContext/createCommandEncoder()}} method supports a graph execution method that provides the maximum flexibility to callers that also utilize WebGPU in their application. It does this by placing the workload required to initialize and compute the results of the operations in the graph onto a {{GPUCommandBuffer}}. The callers are responsible for the eventual submission @@ -886,12 +886,6 @@ partial interface MLContext { 1. 
Return Promise<{{undefined}}>. -
-When the {{[[contextType]]}} of {{MLContext}} is set to [=webgpu-context|webgpu=] with the {{MLContextOptions}}.{{gpuDevice}} also specified, the user agent queues the ML workload needed to compute the result of the operations in the graph in an internal command buffer before submitting it to the {{gpuDevice}}'s default {{GPUQueue}}. The user agent then returns the call with a {{Promise}} that resolves once the queue finishes processing all the work submitted to it. - -It is strongly recommended that the internal command buffer used to submit the workload is reused across {{MLContext/computeAsync()}} calls when it is not in use for efficiency. -
- ### WebGPU Interoperability ### {#api-mlcontext-webgpu-interop} Create {{MLCommandEncoder}} interface used to record the ML workload onto a WebGPU-compatible {{GPUCommandBuffer}} to allow mixing of ML workload with other GPU workload in an application that leverages WebGPU. This method only succeeds on an {{MLContext}} created with {{GPUDevice}}. Otherwise, it throws an {{OperationError}} exception. @@ -2491,23 +2485,26 @@ interface MLCommandEncoder {}; The underlying implementation provided by the User Agent. -### Weight Preprocessing ### {#api-mlcommandencoder-graph-preprocessing} -Record the initialization of the {{MLGraph}} with constant inputs {{MLNamedGPUInputs}} such as weight tensors. This is a necessary step for optimal performance as it gives the platform an opportunity to prepare and optimize constant input data for the subsequent execution of the graph. This method should only be called once per graph. +### Graph Initialization ### {#api-mlcommandencoder-graph-initialization} +Record the initialization of the {{MLGraph}}. This is a necessary step for optimal performance during graph execution as it gives the platform an opportunity to prepare and optimize constant input data for the subsequent execution of the graph. This method should only be called once per graph.
**Arguments:** - - *graph*: an {{MLGraph}}. The compiled graph to be initialized with constants. - - *constants*: an {{MLNamedGPUInputs}}. The resources and optional dimensions of constant inputs. + - *graph*: an {{MLGraph}}. The compiled graph to be initialized with graph constant inputs. **Returns:** {{undefined}}.
+
+Graph initialization stage typically involves a process known as "weight preprocessing" where all the constant inputs to the graph are preprocessed and cached at the operating system level for subsequent graph execution calls. The initializing inputs are typically the constant weight data specified through the {{MLGraphBuilder}}.{{MLGraphBuilder/constant(desc, bufferView)}} method as constant operands during graph construction time. +
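
<div class="example">
A non-normative sketch of the kind of constant data this stage preprocesses (operand shapes and values are illustrative):
<pre highlight="js">
const builder = new MLGraphBuilder(context);

// Constant weight data supplied at graph construction time ...
const weights = builder.constant(
    {type: 'float32', dimensions: [4, 4]}, new Float32Array(16).fill(0.1));
const input = builder.input('input', {type: 'float32', dimensions: [1, 4]});
const output = builder.matmul(input, weights);
const graph = builder.build({'output': output});

// ... is what graph initialization preprocesses and caches, either through the
// command encoder's graph initialization on a WebGPU-based context, or as part
// of build() on a default context.
</pre>
</div>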
+ ### Dispatch Execution Commands ### {#api-mlcommandencoder-dispatch-commands} Record the {{MLGraph}} execution with the inputs {{MLNamedGPUInputs}} and outputs {{MLNamedGPUOutputs}}. From 0ed918fac4da4ada3501fffe2513da429e8e1a11 Mon Sep 17 00:00:00 2001 From: Chai Chaoweeraprasit Date: Thu, 21 Apr 2022 21:09:45 -0700 Subject: [PATCH 12/13] Add a note section describing when graph initialization occurs on a default GPU context. --- index.bs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/index.bs b/index.bs index c358a791..704923a0 100644 --- a/index.bs +++ b/index.bs @@ -989,6 +989,10 @@ interface MLGraphBuilder { }; +
+The {{MLGraphBuilder}}.{{MLGraphBuilder/build()}} method compiles the graph builder state up to the specified output operands into a compiled graph according to the type of {{MLContext}} backing it. When the {{[[contextType]]}} of the {{MLContext}} is set to [=default-context|default=] with the {{MLContextOptions}}.{{deviceType}} set to [=device-type-gpu|gpu=], the compiled graph is initialized right before the {{MLGraphBuilder/build()}} method call returns. This initialization stage is important for optimal performance of the subsequent graph executions. See [[#api-mlcommandencoder-graph-initialization]] for more detail. +
+ ### batchNormalization ### {#api-mlgraphbuilder-batchnorm} Normalize the tensor values of input features across the batch dimension using [[Batch-Normalization]]. For each input feature, the mean and variance values of that feature supplied in this calculation as parameters are previously computed across the batch dimension of the input during the model training phase of this operation.
-The {{MLGraphBuilder}}.{{MLGraphBuilder/build()}} method compiles the graph builder state up to the specified output operands into a compiled graph according to the type of {{MLContext}} backing it. When the {{[[contextType]]}} of the {{MLContext}} is set to [=default-context|default=] with the {{MLContextOptions}}.{{deviceType}} set to [=device-type-gpu|gpu=], the compiled graph is initialized right before the {{MLGraphBuilder/build()}} method call returns. This initialization stage is important for optimal performance of the subsequent graph executions. See [[#api-mlcommandencoder-graph-initialization]] for more detail.
+The {{MLGraphBuilder}}.{{MLGraphBuilder/build()}} method compiles the graph builder state up to the specified output operands into a compiled graph according to the type of {{MLContext}} that creates it. Since this operation can be costly in some machine configurations, the calling thread must be a worker thread to avoid potential disruption of the user experience. When the {{[[contextType]]}} of the {{MLContext}} is set to [=default-context|default=], the compiled graph is initialized right before the {{MLGraphBuilder/build()}} method call returns. This graph initialization stage is important for optimal performance of the subsequent graph executions. See [[#api-mlcommandencoder-graph-initialization]] for more detail.
### batchNormalization ### {#api-mlgraphbuilder-batchnorm}