
Commit cd605e8

jiawenliu64 authored and facebook-github-bot committed
Add MXFP4 PT reference quantization kernel and refactor CUTLASS FP4 GEMM (#4117)
Summary:
Pull Request resolved: #4117
X-link: facebookresearch/FBGEMM#1199

Refactor the FP4 CUTLASS GEMM to be general across MXFP4 and NVFP4 and easier to extend in the future. Also add an MXFP4 PyTorch reference quantization kernel for MXFP4 GEMM numeric verification.

Reviewed By: q10

Differential Revision: D74270499

fbshipit-source-id: 621a8e97a45b1bfccc7ca05d952b14a4a480e149
1 parent c1abe76 commit cd605e8

30 files changed: +498 additions, -131 deletions
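
For orientation, the sketch below (not part of the commit) shows how the two quantization paths end up feeding the single torch.ops.fbgemm.f4f4bf16 op after this refactor. The tensor shapes, dtypes, and the per-tensor global scales for the NVFP4 path are placeholders, and the op's Python-side defaults are assumed to mirror the C++ signatures changed further down.

import torch

from fbgemm_gpu.experimental.gen_ai.quantize import (
    scale_mxfp4_quant,
    scale_nvfp4_quant,
)

x = torch.randn(128, 256, device="cuda", dtype=torch.float32)
w = torch.randn(512, 256, device="cuda", dtype=torch.float32)

# MXFP4 path: per-32-element E8M0 block scales, no global scale; use_mx now
# defaults to the MX path at the op boundary.
xq_mx, x_scale_mx = scale_mxfp4_quant(x)
wq_mx, w_scale_mx = scale_mxfp4_quant(w)
y_mx = torch.ops.fbgemm.f4f4bf16(xq_mx, wq_mx, x_scale_mx, w_scale_mx)

# NVFP4 path: block scales plus an explicit global rescale factor. The
# per-tensor global scales are placeholder values here; the benchmark
# computes them from the input data (elided in the hunks below).
x_global_scale = torch.tensor(1.0, device="cuda")  # placeholder value
w_global_scale = torch.tensor(1.0, device="cuda")  # placeholder value
global_scale = 1 / (x_global_scale * w_global_scale)
xq_nv, x_scale_nv = scale_nvfp4_quant(x, x_global_scale)
wq_nv, w_scale_nv = scale_nvfp4_quant(w, w_global_scale)
y_nv = torch.ops.fbgemm.f4f4bf16(
    xq_nv, wq_nv, x_scale_nv, w_scale_nv, global_scale=global_scale, use_mx=False
)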

fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py

Lines changed: 44 additions & 7 deletions
@@ -27,7 +27,8 @@
 )
 from fbgemm_gpu.experimental.gen_ai.quantize import (
     quantize_int4_preshuffle,
-    scaled_fp4_quant,
+    scale_mxfp4_quant,
+    scale_nvfp4_quant,
 )
 
 try:
@@ -2005,9 +2006,9 @@ def cuda(self) -> bool:
 
 
 @register_quantize_op
-class FP4Gemm(QuantizeOpBase):
+class NVFP4Gemm(QuantizeOpBase):
     """
-    FP4 matmul with block-wise scaling.
+    NVFP4 matmul with block-wise scaling.
     """
 
     def quantize(self, x, w):
@@ -2019,16 +2020,52 @@ def quantize(self, x, w):
         )
         global_scale = 1 / (x_global_scale * w_global_scale)
 
-        xq, x_scale = scaled_fp4_quant(x, x_global_scale)
-        wq, w_scale = scaled_fp4_quant(w, w_global_scale)
+        xq, x_scale = scale_nvfp4_quant(x, x_global_scale)
+        wq, w_scale = scale_nvfp4_quant(w, w_global_scale)
         return xq, wq, x_scale, w_scale, global_scale
 
     def compute(self, xq, wq, x_scale, w_scale, global_scale):
-        return torch.ops.fbgemm.f4f4bf16(xq, wq, x_scale, w_scale, global_scale)
+        return torch.ops.fbgemm.f4f4bf16(
+            xq, wq, x_scale, w_scale, global_scale=global_scale, use_mx=False
+        )
 
     def quantize_and_compute(self, x, w):
         xq, wq, x_scale, w_scale, global_scale = self.quantize(x, w)
-        return self.compute(xq, wq, x_scale, w_scale, global_scale)
+        return self.compute(
+            xq, wq, x_scale, w_scale, global_scale=global_scale, use_mx=False
+        )
+
+    @property
+    def name(self) -> str:
+        return "cutlass_nv_f4f4bf16"
+
+    @property
+    def hip(self) -> bool:
+        # F4F4BF16 only supported for cuda.
+        return False
+
+    @property
+    def cuda(self) -> bool:
+        return True
+
+
+@register_quantize_op
+class MXFP4Gemm(QuantizeOpBase):
+    """
+    MXFP4 matmul with block-wise scaling.
+    """
+
+    def quantize(self, x, w):
+        xq, x_scale = scale_mxfp4_quant(x)
+        wq, w_scale = scale_mxfp4_quant(w)
+        return xq, wq, x_scale, w_scale
+
+    def compute(self, xq, wq, x_scale, w_scale):
+        return torch.ops.fbgemm.f4f4bf16(xq, wq, x_scale, w_scale)
+
+    def quantize_and_compute(self, x, w):
+        xq, wq, x_scale, w_scale = self.quantize(x, w)
+        return self.compute(xq, wq, x_scale, w_scale)
 
     @property
     def name(self) -> str:
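
The commit summary describes the new PyTorch kernel as a reference for MXFP4 GEMM numeric verification. A hedged sketch of what such a check could look like, using only the functions visible in this commit; the shapes and the loose error tolerance are illustrative, not taken from FBGEMM's tests:

import torch

from fbgemm_gpu.experimental.gen_ai.quantize import scale_mxfp4_quant

x = torch.randn(256, 512, device="cuda", dtype=torch.float32)
w = torch.randn(1024, 512, device="cuda", dtype=torch.float32)

# Quantize both operands with the PyTorch reference kernel added in
# gen_ai/quantize.py below.
xq, x_scale = scale_mxfp4_quant(x)
wq, w_scale = scale_mxfp4_quant(w)

# Run the MXFP4 CUTLASS GEMM and compare against a plain float matmul.
y_fp4 = torch.ops.fbgemm.f4f4bf16(xq, wq, x_scale, w_scale)
y_ref = x @ w.t()

# FP4 with 32-element block scales is coarse, so only a loose relative-error
# check is meaningful here.
rel_err = (y_fp4.float() - y_ref).norm() / y_ref.norm()
print(f"MXFP4 vs. reference relative error: {rel_err.item():.3f}")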

fbgemm_gpu/experimental/gen_ai/gen_ai/quantize.py

Lines changed: 205 additions & 1 deletion
@@ -164,7 +164,7 @@ def _quantize(
     return wq, scales
 
 
-def scaled_fp4_quant(
+def scale_nvfp4_quant(
     input: torch.Tensor, input_global_scale: torch.Tensor
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
@@ -216,3 +216,207 @@ def round_up(x: int, y: int) -> int:
     torch.ops.fbgemm.scaled_fp4_quant(output, input, output_scale, input_global_scale)
     output_scale = output_scale.view(torch.float8_e4m3fn)
     return output, output_scale
+
+
+def _fp32_to_fp4_unpacked(x: torch.Tensor, ebits: int, mbits: int) -> torch.Tensor:
+    """Converts a float32 tensor to an unpacked float4 tensor.
+    Args:
+        x (torch.Tensor): The input float32 tensor.
+        ebits (int): The number of bits in the exponent.
+        mbits (int): The number of bits in the mantissa.
+    Returns:
+        torch.Tensor: The resulting unpacked float4 tensor.
+    """
+
+    def _n_ones(n: int) -> int:
+        return (1 << n) - 1
+
+    EBITS_F32, MBITS_F32 = 8, 23
+    F32_EXP_BIAS = _n_ones(EBITS_F32 - 1)
+
+    assert x.dtype == torch.float
+    assert 1 + ebits + mbits <= 8
+
+    # calculate constants
+    exp_bias = _n_ones(ebits - 1)
+    max_int = _n_ones(ebits + mbits)
+    sign_mask = 1 << (ebits + mbits)
+
+    magic_adder = _n_ones(MBITS_F32 - mbits - 1)
+
+    # all E bits and M bits are 1s
+    max_normal = 2 ** (_n_ones(ebits) - exp_bias) * (_n_ones(mbits + 1) / (2**mbits))
+
+    # E bits = 1, M bits = 0
+    min_normal = 2 ** (1 - exp_bias)
+
+    denorm_exp = (
+        # exp bias conversion between formats
+        (F32_EXP_BIAS - exp_bias)
+        # mantissa length difference between formats
+        + (MBITS_F32 - mbits)
+        # add one to encoded exponent for denormalized numbers
+        + 1
+    )
+    denorm_mask_int = denorm_exp << MBITS_F32
+
+    # reinterpret int32 as float32
+    denorm_mask_float = torch.tensor(denorm_mask_int, dtype=torch.int32).view(
+        torch.float32
+    )
+
+    # save the sign
+    # Note that we have torch.uint32, but some ops like cpu bit shifts
+    # do not work on it. So, we stay in int32.
+    x = x.view(torch.int32)
+    sign = x & 0x80000000
+
+    # set everything to positive, will add sign back at the end
+    x = x ^ sign
+    x = x.view(torch.float)
+
+    # rewrite saturate/denorm/norm branches without explicit data dependent
+    # control flow, to be more compiler friendly
+    saturate_mask = x >= max_normal
+    denormal_mask = torch.logical_and(torch.logical_not(saturate_mask), x < min_normal)
+    normal_mask = torch.logical_not(torch.logical_or(saturate_mask, denormal_mask))
+
+    denormal_x = x + denorm_mask_float
+    denormal_x = denormal_x.view(torch.int32)
+    denormal_x -= denorm_mask_int
+    denormal_x = denormal_x.to(torch.uint8)
+
+    normal_x = x.view(torch.int32)
+    # resulting mantissa is odd
+    mant_odd = (normal_x >> (MBITS_F32 - mbits)) & 1
+    # update exponent, rounding bias part 1
+    val_to_add = ((exp_bias - F32_EXP_BIAS) << MBITS_F32) + magic_adder
+    normal_x += val_to_add
+    # rounding bias part 2
+    normal_x += mant_odd
+    # take the bits!
+    normal_x = normal_x >> (MBITS_F32 - mbits)
+    normal_x = normal_x.to(torch.uint8)
+
+    x = torch.full_like(x, max_int, dtype=torch.uint8)
+    x = torch.where(denormal_mask, denormal_x, x)
+    x = torch.where(normal_mask, normal_x, x)
+
+    # add sign back
+    sign_lp = sign >> (MBITS_F32 + EBITS_F32 - mbits - ebits)
+    sign_lp = sign_lp.to(torch.uint8)
+    # Right shift of a negative signed integer can fill the least significant
+    # bits with either 1s or 0s, depending on the implementation. Since PyTorch
+    # doesn't have an uint32 dtype, we mask out these bits to get just the
+    # f4 sign bit
+    sign_lp = sign_lp & sign_mask
+    x = x | sign_lp
+
+    return x.to(torch.uint8)
+
+
+def _to_blocked(x: torch.Tensor) -> torch.Tensor:
+    """Converts a tensor to the blocked layout.
+    Args:
+        x (torch.Tensor): The input tensor in non-blocked layout.
+    Returns:
+        torch.Tensor: The output tensor in the blocked layout.
+    """
+
+    def ceil_div(a: int, b: int) -> int:
+        return (a + b - 1) // b
+
+    rows, cols = x.shape
+    n_row_blocks = ceil_div(rows, 128)
+    n_col_blocks = ceil_div(cols, 4)
+
+    # Calculate the padded shape
+    padded_rows = n_row_blocks * 128
+    padded_cols = n_col_blocks * 4
+
+    padded = x
+    if (rows, cols) != (padded_rows, padded_cols):
+        padded = torch.zeros(
+            (padded_rows, padded_cols),
+            device=x.device,
+            dtype=x.dtype,
+        )
+        padded[:rows, :cols] = x
+
+    # Rearrange the blocks
+    blocks = padded.view(n_row_blocks, 128, n_col_blocks, 4).permute(0, 2, 1, 3)
+    rearranged = blocks.reshape(-1, 4, 32, 4).transpose(1, 2).reshape(-1, 32, 16)
+
+    return rearranged.flatten()
+
+
+# This PyTorch version refers to https://github.com/pytorch/ao/blob/v0.10.0/torchao/prototype/mx_formats/mx_tensor.py#L146
+def scale_mxfp4_quant(
+    x: torch.Tensor, block_size: int = 32
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize input tensor to FP4 and return quantized tensor and scale.
+    Args:
+        x (torch.Tensor): The input tensor to be quantized to FP4
+        block_size (int): The block size to use for quantization. Default is 32.
+    Returns:
+        xq (torch.Tensor): Quantized FP4 output tensor
+        scale (torch.Tensor): Scale E8M0 tensor
+    """
+
+    F4_E2M1_MAX = 6.0
+    E8M0_EXPONENT_BIAS = 127
+    EBITS_F4_E2M1, MBITS_F4_E2M1 = 2, 1
+
+    # calculate the scale in e8m0 format
+    orig_shape = x.shape
+    x = x.reshape(-1, block_size)
+
+    # find max value of the data
+    # Note: this only implements the `minimally supported` version of
+    # https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+    # section 6.3.
+    max_abs = torch.amax(torch.abs(x), 1)
+    max_pos = F4_E2M1_MAX
+
+    descale = max_abs / max_pos
+    scale = torch.where(
+        torch.isnan(descale),
+        0xFF,  # Handle biased exponent for nan
+        # NOTE: descale < (torch.finfo(torch.float32).smallest_normal / 2) is handled through clamping
+        (
+            torch.clamp(
+                torch.ceil(torch.log2(descale)),
+                min=-E8M0_EXPONENT_BIAS,
+                max=E8M0_EXPONENT_BIAS,
+            )
+            + E8M0_EXPONENT_BIAS
+        ).to(torch.uint8),
+    )
+
+    descale_fp = torch.where(
+        scale == 0,
+        1.0,
+        torch.exp2(E8M0_EXPONENT_BIAS - scale.to(torch.float32)),
+    )
+
+    # scale and saturated cast the data elements to max of target dtype
+    xq = torch.clamp(x * descale_fp.unsqueeze(1), min=-1 * max_pos, max=max_pos)
+
+    xq = xq.reshape(orig_shape)
+    xq = _fp32_to_fp4_unpacked(xq, EBITS_F4_E2M1, MBITS_F4_E2M1)
+    orig_shape = [*orig_shape[:-1], orig_shape[-1] // 2]
+
+    shape = xq.shape
+    assert shape[-1] % 2 == 0
+    xq = xq.contiguous().view(-1)
+    xq = (xq[::2] << 4 | xq[1::2]).view((*shape[:-1], shape[-1] // 2))
+
+    target_numel = scale.numel() * block_size / 2
+    assert target_numel == xq.numel(), f"{target_numel} != {xq.numel()}"
+
+    scale = scale.view(torch.float8_e8m0fnu)
+    scale = scale.view(orig_shape[0], -1)
+    scale = _to_blocked(scale)
+
+    return xq, scale
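
To make the E8M0 block-scale arithmetic in scale_mxfp4_quant concrete, here is a small worked example (not part of the commit) for a single 32-element block whose largest magnitude is 10.0; the numbers follow the clamp/ceil/log2 path above.

import math

F4_E2M1_MAX = 6.0          # largest magnitude representable in FP4 E2M1
E8M0_EXPONENT_BIAS = 127

max_abs = 10.0
descale = max_abs / F4_E2M1_MAX                            # ~1.667
exp = min(max(math.ceil(math.log2(descale)), -127), 127)   # 1
scale_u8 = exp + E8M0_EXPONENT_BIAS                        # 128, the stored E8M0 byte
descale_fp = 2.0 ** (E8M0_EXPONENT_BIAS - scale_u8)        # 0.5, applied before rounding
assert (scale_u8, descale_fp) == (128, 0.5)

# Each element of the block is multiplied by 0.5, clamped to [-6, 6], rounded
# to E2M1, and two 4-bit codes are packed per uint8. So for an input of shape
# (M, K), xq is (M, K // 2) uint8 and scale holds M * K / 32 E8M0 bytes before
# _to_blocked pads and rearranges them into the 128x4 blocked layout.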

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f4f4bf16.cu

Lines changed: 6 additions & 6 deletions
@@ -24,8 +24,8 @@ at::Tensor dispatch_f4f4bf16_kernel(
     at::Tensor WQ, // FP4
     at::Tensor x_scale,
     at::Tensor w_scale,
-    at::Tensor global_scale,
-    bool use_mx = false) {
+    std::optional<at::Tensor> global_scale,
+    bool use_mx = true) {
   auto M = XQ.size(0);
   auto K = XQ.size(1);
   auto N = WQ.size(0);
@@ -173,8 +173,8 @@ at::Tensor f4f4bf16(
     at::Tensor WQ, // FP4
     at::Tensor x_scale,
     at::Tensor w_scale,
-    at::Tensor global_scale,
-    bool use_mx = false) {
+    std::optional<at::Tensor> global_scale,
+    bool use_mx = true) {
   return dispatch_f4f4bf16_kernel(
       XQ, WQ, x_scale, w_scale, global_scale, use_mx);
 }
@@ -186,8 +186,8 @@ at::Tensor f4f4bf16(
     at::Tensor WQ, // FP4
     at::Tensor x_scale,
     at::Tensor w_scale,
-    at::Tensor global_scale,
-    bool use_mx = false) {
+    std::optional<at::Tensor> global_scale,
+    bool use_mx = true) {
   throw std::runtime_error(
       "CUDA version is older than 12.8"); // requires CUDA>=12.8
 }

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f4f4bf16/f4f4bf16_128_128_4_1_1_f.cu

Lines changed: 8 additions & 3 deletions
@@ -17,10 +17,15 @@ at::Tensor f4f4bf16_128_128_4_1_1_f(
     at::Tensor WQ, // FP4
     at::Tensor x_scale,
     at::Tensor w_scale,
-    at::Tensor global_scale) {
+    std::optional<at::Tensor> global_scale = std::nullopt) {
   // Dispatch this kernel to the correct underlying implementation.
-  return _f4f4bf16<128, 128, 4, 1, 1, false>(
-      XQ, WQ, x_scale, w_scale, global_scale);
+  return _f4f4bf16<
+      cutlass::nv_float4_t<cutlass::float_e2m1_t>,
+      128,
+      128,
+      4,
+      1,
+      1>(XQ, WQ, x_scale, w_scale, global_scale);
 }
 
 #endif

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f4f4bf16/f4f4bf16_128_128_4_1_1_t.cu

Lines changed: 8 additions & 3 deletions
@@ -17,10 +17,15 @@ at::Tensor f4f4bf16_128_128_4_1_1_t(
     at::Tensor WQ, // FP4
     at::Tensor x_scale,
     at::Tensor w_scale,
-    at::Tensor global_scale) {
+    std::optional<at::Tensor> global_scale = std::nullopt) {
   // Dispatch this kernel to the correct underlying implementation.
-  return _f4f4bf16<128, 128, 4, 1, 1, true>(
-      XQ, WQ, x_scale, w_scale, global_scale);
+  return _f4f4bf16<
+      cutlass::mx_float4_t<cutlass::float_e2m1_t>,
+      128,
+      128,
+      4,
+      1,
+      1>(XQ, WQ, x_scale, w_scale, global_scale);
 }
 
 #endif

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f4f4bf16/f4f4bf16_128_192_2_2_1_f.cu

Lines changed: 8 additions & 3 deletions
@@ -17,10 +17,15 @@ at::Tensor f4f4bf16_128_192_2_2_1_f(
     at::Tensor WQ, // FP4
     at::Tensor x_scale,
     at::Tensor w_scale,
-    at::Tensor global_scale) {
+    std::optional<at::Tensor> global_scale = std::nullopt) {
   // Dispatch this kernel to the correct underlying implementation.
-  return _f4f4bf16<128, 192, 2, 2, 1, false>(
-      XQ, WQ, x_scale, w_scale, global_scale);
+  return _f4f4bf16<
+      cutlass::nv_float4_t<cutlass::float_e2m1_t>,
+      128,
+      192,
+      2,
+      2,
+      1>(XQ, WQ, x_scale, w_scale, global_scale);
 }
 
 #endif
