Commit d1d7d64

liudangyi authored and copybara-github committed
Implement quantized jax.lax.ragged_dot.
This change introduces `qwix._src.core.ragged_dot`, which provides a quantized version of `jax.lax.ragged_dot`. It includes a "fast" path that performs the dot product on quantized values and applies the scales afterwards, and a "slow" path that dequantizes before calling the standard `ragged_dot`. The choice between the fast and slow paths depends on whether the quantization involves zero points or certain channelwise scale shapes.

This is a very basic version; notable limitations are:

* Only the default dimension_numbers are supported (thus it's ragged_dot rather than ragged_dot_general).
* Tiling is not supported.

We will address these later.

Also move the dtype-handling logic into a new function, `qarray.get_accumulator_and_result_type`, which is shared by multiple ops.

PiperOrigin-RevId: 811488890
1 parent c6f5032 commit d1d7d64
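
To make the fast/slow distinction concrete, here is a minimal sketch of the algebra the fast path relies on, written in plain JAX for a non-ragged dot (all names below are illustrative, not qwix APIs): per-row and per-column scales factor out of the contraction, so the dot product can run on the quantized values and the scales can be applied to the result afterwards.

import jax
from jax import numpy as jnp

# Toy per-row/per-column int8 quantization of an [m, k] x [k, n] dot.
lhs = jax.random.normal(jax.random.key(0), (4, 8), jnp.float32)
rhs = jax.random.normal(jax.random.key(1), (8, 2), jnp.float32)
lhs_scale = jnp.abs(lhs).max(axis=1, keepdims=True) / 127  # [m, 1]
rhs_scale = jnp.abs(rhs).max(axis=0, keepdims=True) / 127  # [1, n]
lhs_q = jnp.round(lhs / lhs_scale).astype(jnp.int8)
rhs_q = jnp.round(rhs / rhs_scale).astype(jnp.int8)

# Slow path: dequantize first, then one floating-point dot.
slow = (lhs_q * lhs_scale) @ (rhs_q * rhs_scale)

# Fast path: exact integer dot on qvalues, scales applied to the result.
fast = lhs_q.astype(jnp.int32) @ rhs_q.astype(jnp.int32) * lhs_scale * rhs_scale

assert jnp.allclose(slow, fast, rtol=1e-4)

Zero points break this factorization (the cross terms no longer separate), which is why quantization with zero points falls back to the slow path.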

File tree

7 files changed: +271 -36 lines changed

  qwix/_src/core/conv_general.py
  qwix/_src/core/dot_general.py
  qwix/_src/core/einsum.py
  qwix/_src/core/qarray.py
  qwix/_src/core/ragged_dot.py
  tests/core/einsum_test.py
  tests/core/ragged_dot_test.py

qwix/_src/core/conv_general.py

Lines changed: 1 addition & 0 deletions
@@ -177,6 +177,7 @@ def conv_general_dilated(
     dimension_numbers: jax.lax.ConvGeneralDilatedDimensionNumbers = None,
     feature_group_count: int = 1,
     batch_group_count: int = 1,
+    # TODO(dangyi): Add preferred_element_type.
 ) -> jax.Array:
   """Dispatches to fast or slow conv_general_dilated depending on the inputs."""
   if isinstance(lhs, qarray.QArray) and isinstance(rhs, qarray.QArray):

qwix/_src/core/dot_general.py

Lines changed: 9 additions & 28 deletions
@@ -203,20 +203,9 @@ def _fast_dot_general(
     rhs_scale = qarray.split_axis(rhs_scale, {a: 1 for a in rhs_tiled_ca})
     rhs_scale = qarray.transpose_array(rhs_scale, rhs_scale_transpose)

-  if preferred_element_type is None:
-    # We want to override the preferred_element_type to int32 for int8 x int8
-    # dot_general, or bfloat16/float32 for fp8 x fp8 dot_general.
-    if all('int' in x.dtype.name for x in (lhs_value, rhs_value)):
-      preferred_element_type = jnp.int32
-    elif lhs_scale is not None:
-      preferred_element_type = lhs_scale.dtype
-    elif rhs_scale is not None:
-      preferred_element_type = rhs_scale.dtype
-  else:
-    if lhs_scale is not None:
-      lhs_scale = lhs_scale.astype(preferred_element_type)
-    if rhs_scale is not None:
-      rhs_scale = rhs_scale.astype(preferred_element_type)
+  preferred_element_type, result_type = qarray.get_accumulator_and_result_type(
+      lhs, rhs, preferred_element_type=preferred_element_type
+  )

   res = jax.lax.dot_general(
       lhs_value,

@@ -259,7 +248,7 @@ def _fast_dot_general(
     res = qarray.call_with_generic_broadcast(jnp.multiply, res, rhs_scale)
   if sum_axes:
     res = jnp.sum(res, axis=sum_axes)
-  return res
+  return res.astype(result_type)


 def _slow_dot_general(

@@ -321,15 +310,9 @@ def loop_dot_general(
     else:
       ca_tile_counts.append(1)

-  acc_dtype = None
-  if all('int' in x.dtype.name for x in (lhs_value, rhs_value)):
-    acc_dtype = jnp.int32
-  elif preferred_element_type is not None:
-    acc_dtype = preferred_element_type
-  elif lhs_scale is not None:
-    acc_dtype = lhs_scale.dtype
-  elif rhs_scale is not None:
-    acc_dtype = rhs_scale.dtype
+  preferred_element_type, result_type = qarray.get_accumulator_and_result_type(
+      lhs, rhs, preferred_element_type=preferred_element_type
+  )

   lhs_scale_transpose, rhs_scale_transpose = _get_scale_transpose(
       dimension_numbers, (len(lhs_value.shape), len(rhs_value.shape))

@@ -357,7 +340,7 @@ def take_slice(
         take_slice(lhs_value, lhs_ca, ca_tile_indices),
         take_slice(rhs_value, rhs_ca, ca_tile_indices),
         dimension_numbers=dimension_numbers,
-        preferred_element_type=acc_dtype,
+        preferred_element_type=preferred_element_type,
         **kwargs,
     )
     if lhs_scale is not None:

@@ -370,9 +353,7 @@ def take_slice(
       out = qarray.call_with_generic_broadcast(jnp.multiply, out, scale)
     acc = out if acc is None else acc + out
   assert acc is not None
-  if preferred_element_type is not None:
-    acc = acc.astype(preferred_element_type)
-  return acc
+  return acc.astype(result_type)


 # If a contracting dimension has a tile size smaller than this threshold, tiled
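
As a side note on the dtype logic being factored out here: the int32 accumulator for int x int matters because, without preferred_element_type, an int8 dot_general stays in int8 and wraps around. A quick illustration in plain JAX (not part of the commit):

import jax
from jax import numpy as jnp

a = jnp.full((1, 64), 100, jnp.int8)
b = jnp.full((64, 1), 100, jnp.int8)
dn = (((1,), (0,)), ((), ()))  # contract a's axis 1 with b's axis 0

wrapped = jax.lax.dot_general(a, b, dn)  # int8 output; 64 * 100 * 100 wraps
widened = jax.lax.dot_general(a, b, dn, preferred_element_type=jnp.int32)
print(wrapped.dtype, int(widened[0, 0]))  # int8 vs. 640000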

qwix/_src/core/einsum.py

Lines changed: 7 additions & 7 deletions
@@ -108,23 +108,23 @@ def einsum(
   Returns:
     The result of the einsum, a floating-point jax.Array.
   """
+  # preferred_element_type has to be set for jnp.einsum so that it won't infer
+  # the type from qvalue x qvalue.
+  _, preferred_element_type = qarray.get_accumulator_and_result_type(
+      *[a for a in args if isinstance(a, qarray.MaybeQArray)],
+      preferred_element_type=preferred_element_type,
+  )
+
   # We want to use jnp.einsum with quantized dot_general to avoid duplicating
   # the implementation. However, jnp.einsum will check the inputs to be
   # jax Arrays. To work around this, we send the qvalue to jnp.einsum and
   # restore the actual QArray before actually passing them to dot_general.
   args = list(args)
   qvalue_to_qarray = {}
-
-  # preferred_element_type needs to be set for jnp.einsum so that it won't infer
-  # the type from qvalue x qvalue.
-  scale_dtypes = []
   for i, arg in enumerate(args):
     if isinstance(arg, qarray.QArray):
       args[i] = arg.qvalue
       qvalue_to_qarray[id(arg.qvalue)] = arg
-      scale_dtypes.append(arg.scale.dtype)
-  if preferred_element_type is None and scale_dtypes:
-    preferred_element_type = jnp.result_type(*scale_dtypes)

   def _dot_general(*args, **kwargs):
     args = [qvalue_to_qarray.pop(id(a), a) for a in args]
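
Pinning preferred_element_type before the jnp.einsum call matters for the same reason: jnp.einsum infers its output dtype from the inputs it sees, and here those inputs are the raw int8 qvalues, so it would otherwise accumulate in int8 with the wrap-around shown above. A two-line check, independent of qwix:

from jax import numpy as jnp

a = b = jnp.ones((8, 8), jnp.int8)
print(jnp.einsum('mk,kn->mn', a, b).dtype)  # int8
print(jnp.einsum('mk,kn->mn', a, b, preferred_element_type=jnp.bfloat16).dtype)  # bfloat16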

qwix/_src/core/qarray.py

Lines changed: 45 additions & 0 deletions
@@ -297,6 +297,8 @@ def transpose_array(
   Returns:
     The transposed array.
   """
+  if any(l > 1 for a, l in enumerate(array.shape) if a not in transpose):
+    raise ValueError(f'Cannot transpose {array.shape} as {transpose}.')
   used_axes = [a for a in transpose if a is not None and array.shape[a] != 1]
   # If used_axes is already in order, no actual transpose is needed and we can
   # just reshape the array.

@@ -553,3 +555,46 @@ def clip_to_calibration(
   else:
     raise ValueError(f'Unsupported calibration: {calibration}')
   return array.reshape(original_shape)
+
+
+def get_accumulator_and_result_type(
+    *args: MaybeQArray,
+    preferred_element_type: jax.typing.DTypeLike | None,
+) -> tuple[jax.typing.DTypeLike, jax.typing.DTypeLike]:
+  """jnp.result_type for QArray.
+
+  Accumulator type is the dtype used for the dot_general computation.
+  Result type is the dtype of the final result.
+
+  Args:
+    *args: The arguments to dot_general.
+    preferred_element_type: The preferred element type for dot_general.
+
+  Returns:
+    A tuple of the accumulator type and the result type.
+  """
+  qvalue_dtypes, dequant_dtypes = [], []
+  for arg in args:
+    if isinstance(arg, QArray):
+      qvalue_dtypes.append(arg.qvalue.dtype)  # note qtype can be different.
+      dequant_dtypes.append(arg.scale.dtype)
+    else:
+      qvalue_dtypes.append(arg.dtype)
+      dequant_dtypes.append(arg.dtype)
+
+  # Result type should only depend on dequant_dtype and preferred_element_type.
+  result_type = preferred_element_type
+  if result_type is None:
+    # There's no dtype promotion path for fp8 or lower, and int4 or lower.
+    # We manually upcast them to bf16 or int32.
+    for i, t in enumerate(dequant_dtypes):
+      if t.itemsize <= 1:
+        dequant_dtypes[i] = jnp.int32 if 'int' in t.name else jnp.bfloat16
+    result_type = jnp.result_type(*dequant_dtypes)
+
+  # Accumulator type should be the same as result type except for int x int.
+  accumulator_type = result_type
+  if all('int' in t.name for t in qvalue_dtypes):
+    accumulator_type = jnp.int32
+
+  return accumulator_type, result_type
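
A quick sketch of what the new helper returns in two common cases, using quantize/HowToQuantize the same way the new test below does (this assumes, as in that test, that the scale dtype follows the bfloat16 input):

import jax
from jax import numpy as jnp
from qwix._src.core import qarray

x = jax.random.normal(jax.random.key(0), (256, 16), jnp.bfloat16)
how = qarray.HowToQuantize(qtype=jnp.int8, channelwise_axes=[])
qx = qarray.quantize(x, how)

# int8 x int8: accumulate in int32, return in the scale dtype.
acc, res = qarray.get_accumulator_and_result_type(qx, qx, preferred_element_type=None)
print(jnp.dtype(acc), jnp.dtype(res))  # int32 bfloat16

# Quantized x unquantized: no int x int accumulation, so both are bfloat16.
acc, res = qarray.get_accumulator_and_result_type(qx, x, preferred_element_type=None)
print(jnp.dtype(acc), jnp.dtype(res))  # bfloat16 bfloat16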

qwix/_src/core/ragged_dot.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Quantized jax.lax.ragged_dot."""
+
+import jax
+from jax import numpy as jnp
+from qwix._src.core import qarray
+
+
+def _fast_ragged_dot(
+    lhs: qarray.MaybeQArray,
+    rhs: qarray.MaybeQArray,
+    group_sizes: jax.Array,
+    precision: jax.lax.PrecisionLike = None,
+    preferred_element_type: jax.typing.DTypeLike | None = None,
+    group_offset: jax.Array | None = None,
+) -> jax.Array:
+  """Quantized jax.lax.ragged_dot."""
+  if isinstance(lhs, qarray.QArray):
+    assert lhs.zero_point is None, 'not supported yet'
+    lhs_value = lhs.qvalue
+    lhs_scale = lhs.scale
+  else:
+    lhs_value = lhs
+    lhs_scale = None
+  if isinstance(rhs, qarray.QArray):
+    assert rhs.zero_point is None, 'not supported yet'
+    rhs_value = rhs.qvalue
+    rhs_scale = rhs.scale
+  else:
+    rhs_value = rhs
+    rhs_scale = None
+
+  preferred_element_type, result_type = qarray.get_accumulator_and_result_type(
+      lhs, rhs, preferred_element_type=preferred_element_type
+  )
+
+  out = jax.lax.ragged_dot(
+      lhs_value,
+      rhs_value,
+      group_sizes,
+      precision=precision,
+      preferred_element_type=preferred_element_type,
+      group_offset=group_offset,
+  )
+
+  # ragged_dot has fixed dimension numbers which makes the implementation a
+  # lot easier, i.e., lhs: [m, k], rhs: [g, k, n], res: [m, n].
+  # TODO(dangyi): support arbitrary dimension numbers.
+  if lhs_scale is not None:  # [m, 1]
+    lhs_scale = qarray.transpose_array(lhs_scale, (0, None))
+    out = qarray.call_with_generic_broadcast(jnp.multiply, out, lhs_scale)
+  if rhs_scale is not None:  # [1, 1, n] or [g, 1, n]
+    if rhs_scale.shape[0] == 1:
+      # It's possible to apply the scale to the out directly.
+      rhs_scale = qarray.transpose_array(rhs_scale, (None, 2))
+    else:
+      # We need another ragged_dot to apply the scale to the out.
+      rhs_scale = jax.lax.ragged_dot(
+          jnp.ones((out.shape[0], 1), rhs_scale.dtype),
+          rhs_scale,
+          group_sizes,
+          group_offset=group_offset,
+      )
+    out = qarray.call_with_generic_broadcast(jnp.multiply, out, rhs_scale)
+
+  return out.astype(result_type)
+
+
+def _slow_ragged_dot(
+    lhs: qarray.MaybeQArray,
+    rhs: qarray.MaybeQArray,
+    group_sizes: jax.Array,
+    **kwargs,
+) -> jax.Array:
+  """Quantized jax.lax.ragged_dot which dequantizes first."""
+  if isinstance(lhs, qarray.QArray):
+    lhs = qarray.dequantize(lhs)
+  if isinstance(rhs, qarray.QArray):
+    rhs = qarray.dequantize(rhs)
+  return jax.lax.ragged_dot(lhs, rhs, group_sizes, **kwargs)
+
+
+def ragged_dot(
+    lhs: qarray.MaybeQArray,
+    rhs: qarray.MaybeQArray,
+    group_sizes: jax.Array,
+    precision: jax.lax.PrecisionLike = None,
+    preferred_element_type: jax.typing.DTypeLike | None = None,
+    group_offset: jax.Array | None = None,
+) -> jax.Array:
+  """Quantized jax.lax.ragged_dot."""
+  use_fast_ragged_dot = True
+
+  # _fast_ragged_dot doesn't support channelwise scales on the group axis,
+  # tiled scales on contracting axes, or zero_point.
+  if isinstance(lhs, qarray.QArray):  # [m, k]
+    if lhs.zero_point is not None or lhs.scale.shape[1] > 1:
+      use_fast_ragged_dot = False
+  if isinstance(rhs, qarray.QArray):  # [g, k, n]
+    if (
+        rhs.zero_point is not None
+        or rhs.scale.shape[0] > 1
+        or rhs.scale.shape[1] > 1
+    ):
+      use_fast_ragged_dot = False
+
+  if use_fast_ragged_dot:
+    return _fast_ragged_dot(
+        lhs,
+        rhs,
+        group_sizes,
+        precision=precision,
+        preferred_element_type=preferred_element_type,
+        group_offset=group_offset,
+    )
+  else:
+    return _slow_ragged_dot(
+        lhs,
+        rhs,
+        group_sizes,
+        precision=precision,
+        preferred_element_type=preferred_element_type,
+        group_offset=group_offset,
+    )
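
The second ragged_dot inside _fast_ragged_dot is effectively a per-group gather: contracting an [m, 1] column of ones against [g, 1, n] scales with the same group_sizes yields an [m, n] array whose row i is the scale row of the group that owns row i, which is exactly what the output needs to be multiplied by. A toy check of that identity (shapes and values are illustrative):

import jax
from jax import numpy as jnp

g, n = 3, 4
group_sizes = jnp.array([2, 0, 3], jnp.int32)  # m = 5 rows in total
scales = jax.random.uniform(jax.random.key(0), (g, 1, n))

ones = jnp.ones((int(group_sizes.sum()), 1), scales.dtype)
per_row = jax.lax.ragged_dot(ones, scales, group_sizes)  # [m, n]

# Row i of per_row equals the scale row of the group that owns row i.
group_ids = jnp.repeat(jnp.arange(g), group_sizes, total_repeat_length=5)
assert jnp.allclose(per_row, scales[group_ids, 0, :])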

tests/core/einsum_test.py

Lines changed: 1 addition & 1 deletion
@@ -200,7 +200,7 @@ def _einsum(lhs, rhs):
         lhs_shape=(10, 256, 16),
         rhs_shape=(256, 16, 128),
         lhs_asymmetric=True,
-        expected_rel_mae=0.0130005,
+        expected_rel_mae=0.0129395,
     ),
     dict(
         testcase_name='lhs_asymmetric_subchannel',

tests/core/ragged_dot_test.py

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from absl.testing import absltest
+from absl.testing import parameterized
+import jax
+from jax import numpy as jnp
+from qwix._src.core import qarray
+from qwix._src.core import ragged_dot
+
+
+def mae(a, b):
+  assert a.dtype == b.dtype and a.shape == b.shape
+  return jnp.abs(a - b).mean() / jnp.abs(a).mean()
+
+
+class RaggedDotTest(parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      dict(
+          testcase_name='no_channelwise',
+          lhs_how=qarray.HowToQuantize(qtype=jnp.int8, channelwise_axes=[]),
+          rhs_how=qarray.HowToQuantize(qtype=jnp.int8, channelwise_axes=[]),
+      ),
+      dict(
+          testcase_name='channelwise',
+          lhs_how=qarray.HowToQuantize(qtype=jnp.int8, channelwise_axes=[0]),
+          rhs_how=qarray.HowToQuantize(qtype=jnp.int8, channelwise_axes=[2]),
+      ),
+      dict(
+          testcase_name='more_channelwise',
+          lhs_how=qarray.HowToQuantize(qtype=jnp.int8, channelwise_axes=[0]),
+          rhs_how=qarray.HowToQuantize(qtype=jnp.int8, channelwise_axes=[0, 2]),
+      ),
+  )
+  def test_ragged_dot(
+      self,
+      lhs_how,
+      rhs_how,
+      disable_fast_path=False,
+  ):
+    lhs = jax.random.normal(jax.random.key(0), (256, 16), jnp.bfloat16)
+    rhs = jax.random.normal(jax.random.key(1), (10, 16, 64), jnp.bfloat16)
+    group_sizes = jnp.array([10, 20, 30, 40, 0, 115, 6, 7, 1, 27], jnp.int32)
+
+    fp_res = jax.lax.ragged_dot(lhs, rhs, group_sizes)
+
+    qlhs = qarray.quantize(lhs, lhs_how)
+    qrhs = qarray.quantize(rhs, rhs_how)
+
+    slow_res = ragged_dot._slow_ragged_dot(qlhs, qrhs, group_sizes)
+    self.assertLess(mae(slow_res, fp_res), 0.02)
+
+    if not disable_fast_path:
+      fast_res = ragged_dot._fast_ragged_dot(qlhs, qrhs, group_sizes)
+      self.assertLess(mae(fast_res, slow_res), 0.005)
+
+
+if __name__ == '__main__':
+  absltest.main()
