tensorflow
diff --git a/‎tfjs-backend-webgpu/src/kernels/argminmax_webgpu.ts
+135-1 b/‎tfjs-backend-webgpu/src/kernels/argminmax_webgpu.ts
+135-1
diff --git a/‎tfjs-backend-webgpu/src/kernels/depthwise_conv2d_3x3_webgpu.ts
+104-1 b/‎tfjs-backend-webgpu/src/kernels/depthwise_conv2d_3x3_webgpu.ts
+104-1
diff --git a/‎tfjs-backend-webgpu/src/kernels/depthwise_conv2d_webgpu.ts
+1-2 b/‎tfjs-backend-webgpu/src/kernels/depthwise_conv2d_webgpu.ts
+1-2
@@ -18,9 +18,10 @@
 import {backend_util, util} from '@tensorflow/tfjs-core';
 
 import {getCoordsDataType} from '../shader_preprocessor';
+import {getCoordsDataTypeWgsl, getGlobalIndexStringWgsl, getMainHeaderStringWgsl} from '../shader_preprocessor_wgsl';
 import {computeDispatch} from '../webgpu_util';
 
-import {WebGPUProgram} from './webgpu_program';
+import {getUseWgsl, WebGPUProgram} from './webgpu_program';
 
 export class ArgMinMaxProgram implements WebGPUProgram {
   outputShape: number[];
@@ -30,9 +31,11 @@ export class ArgMinMaxProgram implements WebGPUProgram {
   workGroupSize: [number, number, number];
   variableNames = ['x'];
   uniforms = 'int axis;';
+  uniformsWgsl = 'axis : u32;';
   inputShape: number[];
   reductionFactor: number;
   op: string;
+  useWgsl: boolean;
 
   constructor(inputShape: number[], axis: number, reduceType: 'min'|'max') {
     const axes = [axis];
@@ -67,6 +70,7 @@ export class ArgMinMaxProgram implements WebGPUProgram {
 
     this.inputShape = inputShape;
     this.shaderKey = `argMinMax${this.op}`;
+    this.useWgsl = getUseWgsl();
   }
 
   getUserCode(): string {
@@ -192,4 +196,134 @@ export class ArgMinMaxProgram implements WebGPUProgram {
     `;
     return userCode;
   }
+
+  getUserCodeWgsl(): string {  // Builds the WGSL shader source for this program.
+    // When this.workGroupSize[0] > 1, each thread reduces Length /
+    // this.workGroupSize[0] values. These results are stored in shared memory
+    // and iteratively reduced.
+    const reduceInSharedMemory = this.workGroupSize[0] > 1;
+    const sharedMemorySnippet = `
+      var<workgroup> xBestIndices : array<u32, ${this.workGroupSize[0]}>;
+      var<workgroup> xBestValues : array<f32, ${this.workGroupSize[0]}>;
+    `;
+
+    const sharedMemoryReduceSnippet = `
+      xBestIndices[localId.x] = bestIndex;
+      xBestValues[localId.x] = bestValue;
+
+      for(var currentSize = WorkGroupSize; currentSize > 1u; currentSize = DIV_CEIL(currentSize, ${
+        this.reductionFactor}u)) {
+        workgroupBarrier();
+
+        for (var w = 0u; w < ${this.reductionFactor}u; w = w + 1u) {
+          let i = localId.x * ${this.reductionFactor}u + w;
+          if (i < currentSize) {
+            let candidateIndex = xBestIndices[i];
+            let candidate = xBestValues[i];
+            if(candidate ${this.op} bestValue && !isNanCustom(candidate)) {
+              bestValue = candidate;
+              bestIndex = candidateIndex;
+            }
+          }
+        }
+
+        xBestIndices[localId.x] = bestIndex;
+        xBestValues[localId.x] = bestValue;
+      }
+
+      if (localId.x == 0u) {
+        setOutputFlatI32(flatOutputIndex, i32(bestIndex));
+      }
+    `;
+
+    const outputCoordsType = getCoordsDataTypeWgsl(this.outputShape.length);
+
+    const indexOutputCoords = (outputCoords: string, index: string) => {
+      if (this.outputShape.length === 1) {  // rank 1: coords are used unindexed
+        return outputCoords;
+      } else {
+        return `${outputCoords}[${index}]`;
+      }
+    };
+
+    const indexInputShape = (index: string) => {
+      if (this.inputShape.length === 1) {  // rank 1: xShape is used unindexed
+        return 'uniforms.xShape';
+      } else {
+        return `uniforms.xShape[${index}]`;
+      }
+    };
+
+    const userCode = `
+      fn DIV_CEIL(a : u32, b : u32) -> u32 {
+        return ((a - 1u) / b + 1u);
+      }
+
+      let WorkGroupSize = ${this.workGroupSize[0]}u;
+
+      ${reduceInSharedMemory ? sharedMemorySnippet : ''}
+
+      // In order to get a flattened index into the input tensor, we need to
+      // add back the index along the reduced dimension to |outputCoords|.
+      // This function outputs the offset to the first value along
+      // |axis| and the stride to get the next value of the input along |axis|.
+      fn getInputCoordInfo(globalId : vec3<u32>, globalIndex : u32) -> vec2<u32>{
+        let outputCoords : ${
+        outputCoordsType} = getOutputCoords(globalId, globalIndex);
+        var i = ${this.outputShape.length - 1}u;
+
+        var stride = 1u;
+        var inputStride = 1u;
+        var offset = 0u;
+
+        for (var r = 1u; r <= ${this.inputShape.length}u; r = r + 1u) {
+          let length = ${indexInputShape(`${this.inputShape.length}u - r`)};
+          if (${this.inputShape.length}u - r == uniforms.axis) {
+            inputStride = stride;
+          } else {
+            offset = offset + ${
+        indexOutputCoords('outputCoords', 'i')} * stride;
+            i = i - 1u;
+          }
+          stride = stride * length;
+        }
+
+        return vec2<u32>(offset, inputStride);
+      }
+
+      fn getInputIndex(coordInfo : vec2<u32>, index : u32) -> u32{
+        return coordInfo[0] + coordInfo[1] * index;
+      }
+
+      ${getMainHeaderStringWgsl(this.workGroupSize)} {
+        ${getGlobalIndexStringWgsl(this.workGroupSize)}
+        let coordInfo = getInputCoordInfo(globalId, index);
+
+        var bestIndex = 0u;
+        var bestValue = x.numbers[getInputIndex(coordInfo, bestIndex)];
+
+        let Length = ${indexInputShape('uniforms.axis')};
+        let WorkPerThread = DIV_CEIL(Length, WorkGroupSize);
+
+        for (var w = 0u; w < WorkPerThread; w = w + 1u) {
+          let i = globalId.x * WorkPerThread + w;
+          if (i < Length) {
+            let candidate = x.numbers[getInputIndex(coordInfo, i)];
+            if (candidate ${
+        this.op} bestValue && !isNanCustom(f32(candidate))) {
+              bestValue = candidate;
+              bestIndex = i;
+            }
+          }
+        }
+
+        let flatOutputIndex = globalId.y;
+        ${
+        reduceInSharedMemory ?
+            sharedMemoryReduceSnippet :
+            'setOutputFlatI32(flatOutputIndex, i32(bestIndex));'}
+      }
+    `;
+    return userCode;
+  }
 }
@@ -16,9 +16,12 @@
  */
 
 import {backend_util, util} from '@tensorflow/tfjs-core';
+
+import {getGlobalIndexStringWgsl, getMainHeaderStringWgsl} from '../shader_preprocessor_wgsl';
 import {computeDispatch} from '../webgpu_util';
+
 import {mapActivationToShaderProgram} from './activation_util';
-import {WebGPUProgram} from './webgpu_program';
+import {getUseWgsl, WebGPUProgram} from './webgpu_program';
 
 export class DepthwiseConv2D3x3Program implements WebGPUProgram {
   outputShape: number[];
@@ -27,12 +30,15 @@ export class DepthwiseConv2D3x3Program implements WebGPUProgram {
   dispatch: [number, number, number];
   variableNames = ['x', 'W'];
   uniforms = 'ivec2 pad, stride, dilation, inDims;';
+  uniformsWgsl =
+      'pad : vec2<u32>; stride : vec2<u32>; dilation : vec2<u32>; inDims : vec2<u32>;';
   workGroupSize: [number, number, number] = [4, 4, 4];
   convInfo: backend_util.Conv2DInfo;
   addBias: boolean;
   activation: backend_util.Activation;
   hasPreluActivation: boolean;
   isVec4 = true;
+  useWgsl: boolean;
 
   constructor(
       convInfo: backend_util.Conv2DInfo, addBias = false,
@@ -59,6 +65,7 @@ export class DepthwiseConv2D3x3Program implements WebGPUProgram {
     this.hasPreluActivation = hasPreluActivation;
 
     this.shaderKey = `depthwise3x3_${activation}`;
+    this.useWgsl = getUseWgsl();
   }
 
   getUserCode(): string {
@@ -153,4 +160,100 @@ export class DepthwiseConv2D3x3Program implements WebGPUProgram {
     `;
     return userCode;
   }
+
+  getUserCodeWgsl(): string {  // Builds the WGSL shader source for this program.
+    let activationSnippet = '', applyActivationSnippet = '';  // both stay empty without an activation
+    if (this.activation) {
+      const activationOp = mapActivationToShaderProgram(
+          this.activation, this.isVec4, this.useWgsl);
+      if (this.hasPreluActivation) {  // PReLU variant also loads weights into b
+        activationSnippet =
+            `fn activation(a : vec4<f32>, globalId : vec3<u32>, globalIndex : u32) -> vec4<f32> {
+          let b = getPreluActivationWeightsAtOutCoordsByGlobalId(globalId, globalIndex);
+          ${activationOp}
+        }`;
+      } else {
+        activationSnippet = `
+        fn activation(a : vec4<f32>, globalId : vec3<u32>, globalIndex : u32) -> vec4<f32> {
+            ${activationOp}
+          }
+        `;
+      }
+
+      applyActivationSnippet =
+          `dotProd[i] = activation(dotProd[i], globalId, index);`;
+    }
+
+    const addBiasSnippet = this.addBias ?
+        'dotProd[i] = dotProd[i] + getBiasAtOutCoordsByCoords(coords);' :
+        '';
+
+    const userCode = `
+      ${activationSnippet}
+
+      ${getMainHeaderStringWgsl(this.workGroupSize)} {
+        ${getGlobalIndexStringWgsl(this.workGroupSize)}
+        let batch = 0u;
+        let r = globalId.x;
+        let c = globalId.y * 4u;
+        let d2 = globalId.z * 4u;
+        let xRCCorner = vec2<i32>(vec2<u32>(r, c) * uniforms.stride - uniforms.pad);
+        let d1 = d2;
+        let q = 0u;
+
+        let xRCorner = xRCCorner.x;
+        let xCCorner = xRCCorner.y;
+
+        var wVals : array<vec4<f32>, 9>;
+        wVals[0] = getW(0u, 0u, d1, q);
+        wVals[1] = getW(0u, 1u, d1, q);
+        wVals[2] = getW(0u, 2u, d1, q);
+        wVals[3] = getW(1u, 0u, d1, q);
+        wVals[4] = getW(1u, 1u, d1, q);
+        wVals[5] = getW(1u, 2u, d1, q);
+        wVals[6] = getW(2u, 0u, d1, q);
+        wVals[7] = getW(2u, 1u, d1, q);
+        wVals[8] = getW(2u, 2u, d1, q);
+
+        var xVals : array<array<vec4<f32>, 6>, 3>;
+        for (var wR = 0u; wR < 3u; wR = wR + 1u) {
+          let xR = xRCorner + i32(wR * uniforms.dilation[0]);
+          for (var wC = 0u; wC < 6u; wC = wC + 1u) {
+            let xC = xCCorner + i32(wC * uniforms.dilation[1]);
+            if (xR < 0 || xR >= i32(uniforms.inDims[0]) || xC < 0 || xC >= i32(uniforms.inDims[1])) {
+              xVals[wR][wC] = vec4<f32>(0.0);
+            } else {
+              xVals[wR][wC] = getX(batch, u32(xR), u32(xC), d1);
+            }
+          }
+        }
+
+        var dotProd : array<vec4<f32>, 4>;
+        dotProd[0] = vec4<f32>(0.0);
+        dotProd[1] = vec4<f32>(0.0);
+        dotProd[2] = vec4<f32>(0.0);
+        dotProd[3] = vec4<f32>(0.0);
+
+        for (var wR = 0u; wR < 3u; wR = wR + 1u) {
+          for (var wC = 0u; wC < 3u; wC = wC + 1u) {
+            let indexW = wR * 3u + wC;
+            dotProd[0] = dotProd[0] + xVals[wR][0u + wC] * wVals[indexW];
+            dotProd[1] = dotProd[1] + xVals[wR][1u + wC] * wVals[indexW];
+            dotProd[2] = dotProd[2] + xVals[wR][2u + wC] * wVals[indexW];
+            dotProd[3] = dotProd[3] + xVals[wR][3u + wC] * wVals[indexW];
+          }
+        }
+
+        for (var i = 0u; i < 4u; i = i + 1u) {
+          let coords = vec4<u32>(batch, r, c + i, d2);
+          if (coordsInBounds4D(coords, uniforms.outShape)) {
+            ${addBiasSnippet}
+            ${applyActivationSnippet}
+            setOutput(coords[0], coords[1], coords[2], coords[3], dotProd[i]);
+          }
+        }
+      }
+    `;
+    return userCode;
+  }
 }
@@ -230,8 +230,7 @@ export class DepthwiseConv2DProgram implements WebGPUProgram {
 
         // Extract if checking out of for loop for performance.
         if (inputRowStart >= 0 && inputColStart >= 0 &&
-          inputRowEnd < i32(uniforms.inDims[0]) && inputColEnd < i32(uniforms.inDims[1]))
-          {
+          inputRowEnd < i32(uniforms.inDims[0]) && inputColEnd < i32(uniforms.inDims[1])) {
             // Here using a constant value |this.convInfo.filterHeight| instead
             // of uniform value is in order to loop unrolling.
             for (var wR = 0u; wR < ${