Support stochastic rounding in Qwix.

nicolagp · copybara-github · commit b3143b3e1d57 · 2025-09-25T17:47:52.000-07:00
PiperOrigin-RevId: 811569429
diff --git a/qwix/_src/core/dot_general_qt.py b/qwix/_src/core/dot_general_qt.py
@@ -53,6 +53,10 @@ class DotGeneralQtConfig:
   disable_channelwise_axes: bool = False
   bwd_use_original_residuals: bool = False  # what to use as residuals
 
+  # Configs for stochastic rounding.
+  dlhs_stochastic_rounding_noise_fn: numerics.NoiseFn | None = None
+  drhs_stochastic_rounding_noise_fn: numerics.NoiseFn | None = None
+
   # Deprecated. No longer used.
   dlhs_lhs_qtype: jax.typing.DTypeLike | None = None  # incoming gradient
   dlhs_rhs_qtype: jax.typing.DTypeLike | None = None  # residual rhs
@@ -227,6 +231,17 @@ def _compute_gradient_for_operand(
       )
       if config.disable_channelwise_axes:
         g_how = dataclasses.replace(g_how, channelwise_axes=[])
+
+      if for_dlhs and config.dlhs_stochastic_rounding_noise_fn:
+        g_how = dataclasses.replace(
+            g_how,
+            noise_fn=config.dlhs_stochastic_rounding_noise_fn,
+        )
+      if not for_dlhs and config.drhs_stochastic_rounding_noise_fn:
+        g_how = dataclasses.replace(
+            g_how,
+            noise_fn=config.drhs_stochastic_rounding_noise_fn,
+        )
       g = qarray.quantize(g, g_how)
 
     grad_res = dot_general.dot_general(g, y, bwd_dnums)
diff --git a/qwix/_src/core/numerics.py b/qwix/_src/core/numerics.py
@@ -13,9 +13,15 @@
 # limitations under the License.
 """Numerics for quantization."""
 
+from typing import Callable, Sequence
 import jax
 from jax import numpy as jnp
 
+# A function that generates noise for stochastic rounding.
+# args:  shape: The shape of the array being rounded.
+# returns: Noise that broadcasts against `shape` (shared axes may have size 1).
+NoiseFn = Callable[[Sequence[int]], jax.Array]
+
 
 def should_quantize(dtype: jax.typing.DTypeLike) -> bool:
   """Returns True if the dtype should be quantized."""
@@ -64,7 +70,11 @@ def get_symmetric_bound(qtype: jax.typing.DTypeLike) -> float:
         return jnp.iinfo(qtype).max + 0.5
 
 
-def convert_to(x: jax.Array, qtype: jax.typing.DTypeLike) -> jax.Array:
+def convert_to(
+    x: jax.Array,
+    qtype: jax.typing.DTypeLike,
+    noise_fn: NoiseFn | None = None,
+) -> jax.Array:
   """Rounds and converts x to the given qtype."""
   match qtype:
     case 'nf4':
@@ -84,8 +94,15 @@ def convert_to(x: jax.Array, qtype: jax.typing.DTypeLike) -> jax.Array:
       try:
         finfo = jnp.finfo(qtype)
       except ValueError:
+        finfo = None
+      if finfo is None:
         # dtype is an integer type. We need to round manually but clipping can
         # be handled by "astype".
+        if noise_fn is not None:
+          # Stochastic rounding is done in fp32 to avoid bias from bf16, e.g.
+          # round(bf16(41)-bf16(0.4)) ~= round(40.5) = 40, rather than
+          # round(41-0.4) = round(40.6) = 41.
+          x = x.astype(jnp.float32) + noise_fn(x.shape)
         return jnp.round(x).astype(qtype)
       # dtype is a floating point type. No rounding needed, but we need to
       # clip to the range to avoid inf or nan (e.g. for e4m3fn).
diff --git a/qwix/_src/core/qarray.py b/qwix/_src/core/qarray.py
@@ -254,6 +254,8 @@ class HowToQuantize:
   # The calibration method to use. The format is <method>[,<args>], e.g.
   # "absmax" or "fixed,-10,10". Check calibrate() for supported methods.
   calibration_method: str = 'absmax'
+  # Noise function to use for stochastic rounding.
+  noise_fn: numerics.NoiseFn | None = None
 
 
 ShapeT: TypeAlias = Sequence[int]
@@ -476,6 +478,7 @@ def quantize_with_scale_zero_point(
     qtype: jax.typing.DTypeLike,
     scale: jax.Array,
     zero_point: jax.Array | None,
+    noise_fn: numerics.NoiseFn | None = None,
 ) -> QArray:
   """Quantizes an array with the given scale and zero_point.
 
@@ -484,6 +487,8 @@ def quantize_with_scale_zero_point(
     qtype: The logical type used for quantization.
     scale: The scale to use.
     zero_point: The zero_point to use.
+    noise_fn: The noise function to add to the quantized array for stochastic
+      rounding.
 
   Returns:
     The quantized array.
@@ -504,7 +509,7 @@ def quantize_with_scale_zero_point(
     qvalue = call_with_generic_broadcast(
         jnp.add, qvalue, zero_point.astype(qvalue.dtype)
     )
-  qvalue = numerics.convert_to(qvalue, qtype)
+  qvalue = numerics.convert_to(qvalue, qtype, noise_fn)
   return QArray(qvalue, scale, zero_point, qtype)
 
 
@@ -515,7 +520,9 @@ def quantize(
   """Quantizes an array using a dynamic range."""
   calibration = calibrate(array, how)
   scale, zero_point = compute_scale_zero_point(calibration, how.qtype)
-  return quantize_with_scale_zero_point(array, how.qtype, scale, zero_point)
+  return quantize_with_scale_zero_point(
+      array, how.qtype, scale, zero_point, how.noise_fn
+  )
 
 
 def dequantize(array: QArray) -> jax.Array:
diff --git a/qwix/_src/core/stochastic_rounding.py b/qwix/_src/core/stochastic_rounding.py
@@ -0,0 +1,33 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Stochastic rounding utilities."""
+
+from typing import Sequence
+import jax
+
+
+def uniform_noise(
+    shape: tuple[int, ...],
+    *,
+    key: jax.Array,
+    channelwise_noise_axes: Sequence[int] = (0,),
+) -> jax.Array:
+  """Returns uniform [-0.5, 0.5) noise that broadcasts against `shape`.
+
+  Axes listed in `channelwise_noise_axes` (negative values count from the end)
+  keep their size; all other axes get size 1, so noise is shared along them.
+  """
+  kept = {a + len(shape) if a < 0 else a for a in channelwise_noise_axes}
+  noise_shape = tuple(d if i in kept else 1 for i, d in enumerate(shape))
+  return jax.random.uniform(key, noise_shape) - 0.5
diff --git a/qwix/_src/flax_util.py b/qwix/_src/flax_util.py
@@ -327,3 +327,20 @@ def _check_shape(value: Any, init_fn: Callable[[], Any]):
   abs_init_value = jax.eval_shape(lambda: unbox(init_fn()))
   if abs_value != abs_init_value:
     raise ValueError(f'{abs_value} != {abs_init_value}')
+
+
+def make_rng(rng_stream: str) -> jax.Array:
+  """Returns a PRNG key from `rng_stream` of the current Flax module."""
+
+  current = get_current_module()
+  if isinstance(current, nn.Module):
+    # Linen: forward the stream name to the module's own make_rng.
+    return current.make_rng(rng_stream)
+  if not isinstance(current, nnx.Module):
+    raise ValueError('Current module is not known.')
+
+  # NNX: the key is read from the module's `rngs` attribute, and only the
+  # 'stochastic_rounding' stream is supported.
+  if rng_stream != 'stochastic_rounding':
+    raise ValueError(f'Unsupported nnx rng_stream: {rng_stream}')
+  return current.rngs.stochastic_rounding()
diff --git a/qwix/_src/providers/qt.py b/qwix/_src/providers/qt.py
@@ -27,6 +27,7 @@
 from qwix._src import qconfig
 from qwix._src.core import conv_general_qt
 from qwix._src.core import dot_general_qt
+from qwix._src.core import stochastic_rounding
 
 
 @dataclasses.dataclass(frozen=True, kw_only=True)
@@ -52,6 +53,13 @@ class QtRule(qconfig.QuantizationRule):
   # residuals for backward pass.
   bwd_use_original_residuals: bool = False
 
+  # Use stochastic rounding for the gradients. (Only 'uniform' is supported.)
+  bwd_stochastic_rounding: str | None = None
+
+  # Use channelwise noise for stochastic rounding. By default, it will generate
+  # noise for the 0th dimension and broadcast it over remaining dimensions.
+  channelwise_noise_axes: Sequence[int] = (0,)
+
   # Override any fields in DotGeneralQtConfig.
   additional_qt_config: Mapping[str, Any] | None = None
 
@@ -385,6 +393,26 @@ def _create_dot_general_qt_config(
       if rhs_is_weight:
         drhs_tile_size = rule.bwd_weight_grad_tile_size
 
+    if rule.bwd_stochastic_rounding == 'uniform':
+      dlhs_stochastic_rounding_noise_fn = functools.partial(
+          stochastic_rounding.uniform_noise,
+          key=flax_util.make_rng('stochastic_rounding'),
+          channelwise_noise_axes=rule.channelwise_noise_axes,
+      )
+      drhs_stochastic_rounding_noise_fn = functools.partial(
+          stochastic_rounding.uniform_noise,
+          key=flax_util.make_rng('stochastic_rounding'),
+          channelwise_noise_axes=rule.channelwise_noise_axes,
+      )
+    elif rule.bwd_stochastic_rounding is not None:
+      raise ValueError(
+          'Stochastic rounding should be "uniform" or None, got:'
+          f' {rule.bwd_stochastic_rounding}'
+      )
+    else:
+      dlhs_stochastic_rounding_noise_fn = None
+      drhs_stochastic_rounding_noise_fn = None
+
     qt_config = dot_general_qt.DotGeneralQtConfig(
         # fwd configs.
         lhs_qtype=lhs_qtype,
@@ -405,6 +433,8 @@ def _create_dot_general_qt_config(
         # misc.
         disable_channelwise_axes=rule.disable_channelwise_axes,
         bwd_use_original_residuals=rule.bwd_use_original_residuals,
+        dlhs_stochastic_rounding_noise_fn=dlhs_stochastic_rounding_noise_fn,
+        drhs_stochastic_rounding_noise_fn=drhs_stochastic_rounding_noise_fn,
     )
 
     if rule.additional_qt_config:
diff --git a/tests/core/numerics_test.py b/tests/core/numerics_test.py
@@ -12,9 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import functools
 from absl.testing import absltest
+import jax
 from jax import numpy as jnp
 from qwix._src.core import numerics
+from qwix._src.core import stochastic_rounding
 
 
 class NumericsTest(absltest.TestCase):
@@ -77,6 +80,24 @@ def test_uint(self):
         jnp.array([0, 4, 129, 255], jnp.uint8),
     )
 
+  def test_stochastic_rounding(self):
+    """Stochastic rounding keeps the mean of +/-0.5 inputs near the input."""
+    pos_key = jax.random.PRNGKey(0)
+    # The negative case draws its noise from a subkey split off the first key.
+    neg_key = jax.random.split(pos_key)[1]
+
+    for value, noise_key in ((0.5, pos_key), (-0.5, neg_key)):
+      noise_fn = functools.partial(
+          stochastic_rounding.uniform_noise, key=noise_key
+      )
+      rounded = numerics.convert_to(
+          jnp.full((10000,), value), jnp.int8, noise_fn=noise_fn
+      )
+      # Without stochastic rounding, every element would be rounded to zero
+      # based on round-half-to-even, so the mean would be 0 rather than
+      # close to the input value.
+      self.assertAlmostEqual(jnp.mean(rounded), value, delta=0.1)
+
   def test_nf4(self):
     self._assert_equal(
         numerics.convert_to(jnp.array([-1.0, -0.5, 0.0, 0.8, 1.0]), "nf4"),
diff --git a/tests/core/stochastic_rounding_test.py b/tests/core/stochastic_rounding_test.py
@@ -0,0 +1,43 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import functools
+from absl.testing import absltest
+from absl.testing import parameterized
+import jax
+import jax.numpy as jnp
+from qwix._src.core import stochastic_rounding
+
+
+class StochasticRoundingTest(parameterized.TestCase):
+
+  def test_uniform_noise(self):
+    full_shape = (2, 3)
+    make_noise = functools.partial(
+        stochastic_rounding.uniform_noise,
+        key=jax.random.PRNGKey(0),
+        channelwise_noise_axes=(0,),
+    )
+    compact = make_noise(full_shape)
+    self.assertEqual(compact.shape, (2, 1))
+    tiled = jnp.broadcast_to(compact, full_shape)
+    # Within each row the value repeats along the shared axis ...
+    self.assertTrue(jnp.all(tiled[:, 0] == tiled[:, 1]))
+    # ... while the two rows along the non-shared axis differ.
+    self.assertFalse(jnp.all(tiled[0, 0] == tiled[1, 0]))
+
+
+if __name__ == "__main__":
+  absltest.main()
diff --git a/tests/flax_util_test.py b/tests/flax_util_test.py
@@ -186,6 +186,37 @@ def test_update_boxed(self):
       self.assertIsInstance(updated, nnx.Param)
       self.assertEqual(updated.sharding_names, ("b", "a", None))
 
+  def test_make_rng_linen(self):
+    """make_rng returns a key from the linen 'stochastic_rounding' stream."""
+
+    class MyModule(nn.Module):
+
+      @nn.compact
+      def __call__(self, x):
+        return flax_util.make_rng("stochastic_rounding")
+
+    seed = jax.random.PRNGKey(0)
+    inputs = jnp.ones((1,))
+    model = MyModule()
+    # The same seed feeds both the params and the stochastic_rounding streams.
+    init_rngs = {"params": seed, "stochastic_rounding": seed}
+    variables = model.init(init_rngs, inputs)
+    out_key = model.apply(variables, inputs, rngs={"stochastic_rounding": seed})
+    self.assertEqual(out_key.shape, (2,))
+
+  def test_make_rng_nnx(self):
+    class MyModule(nnx.Module):
+
+      def __init__(self, *, rngs: nnx.Rngs):
+        # Stored under the attribute name `rngs`, matching this test's setup.
+        self.rngs = rngs
+
+      def __call__(self):
+        return flax_util.make_rng("stochastic_rounding")
+
+    model = MyModule(rngs=nnx.Rngs(stochastic_rounding=0))
+    self.assertEqual(model().shape, ())
+
 
 if __name__ == "__main__":
   absltest.main()