# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch

# pyre-ignore[21]: Could not find a module corresponding to import `triton`
import triton
from torch import cuda
from torch.distributed.distributed_c10d import (
    AllgatherOptions,
    AllreduceOptions,
    AllToAllOptions,
    ReduceOp,
)
from torch.futures import Future

from torchft.process_group import ProcessGroup
from torchft.quantization import (
    fused_dequantize_from_fp8,
    fused_quantize_into_fp8,
    fused_reduce_fp8,
)


def _to_alltoall_options(opts: AllreduceOptions) -> AllToAllOptions:
    alltoall_opts = AllToAllOptions()
    alltoall_opts.timeout = opts.timeout
    return alltoall_opts


def _to_allgather_options(opts: AllreduceOptions) -> AllgatherOptions:
    allgather_opts = AllgatherOptions()
    allgather_opts.timeout = opts.timeout
    return allgather_opts


def allreduce_quantized(
    tensors: list[torch.Tensor],
    opts: AllreduceOptions | ReduceOp,
    process_group: ProcessGroup,
    sync_stream: cuda.Stream | None = None,
) -> Future[None]:
    """
    Performs a quantized all-reduce operation on a list of tensors.

    This function implements an optimized all-reduce that reduces communication
    overhead by quantizing tensors to FP8 format before sending them over the
    network. The algorithm works as follows:

    1. Quantize input tensors to FP8 format
    2. Distribute chunks of quantized tensors to all ranks using all-to-all
    3. Reduce chunks locally in higher precision after dequantization
    4. Collect reduced chunks from all ranks using all-gather
    5. Dequantize the result back to the original precision

    This implementation only supports the AVG reduce operation.

    Args:
        tensors: List of tensors to be reduced. All tensors must be on the same
            CUDA device and have the same dtype.
        opts: Options for the all-reduce operation. Can be either an
            AllreduceOptions object or a ReduceOp enum. If a ReduceOp is
            provided, it must be ReduceOp.AVG.
        process_group: The process group to perform the all-reduce on.
        sync_stream: Optional CUDA stream to use for synchronization. If None,
            a new stream will be created.

    Returns:
        A Future that can be used to wait for the operation to complete and
        clean up intermediate buffers.

    Raises:
        NotImplementedError: If the reduce operation is not ReduceOp.AVG.
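
    Example (illustrative usage sketch; ``pg`` and ``grads`` are assumed to be
    an already-initialized torchft ``ProcessGroup`` and a list of CUDA gradient
    tensors)::

        fut = allreduce_quantized(grads, ReduceOp.AVG, pg)
        # Makes the current stream wait for the reduce and drops the
        # references to the FP8 scratch buffers.
        fut.wait()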
    """
    if isinstance(opts, ReduceOp):
        allreduce_opts = AllreduceOptions()
        allreduce_opts.reduceOp = opts
    else:
        allreduce_opts = opts

    # Check that the reduceOp is AVG, as only AVG is supported
    if allreduce_opts.reduceOp != ReduceOp.AVG:
        raise NotImplementedError(
            f"ReduceOp {allreduce_opts.reduceOp} is not supported "
            f"for quantized allreduce, only AVG is supported"
        )

    rank = process_group.rank()
    world_size = process_group.size()

    if sync_stream is None:
        sync_stream = cuda.Stream()

    assert sync_stream is not None
    # Ensure that all operations on the current stream are completed
    # before proceeding with the all-reduce
    sync_stream.wait_stream(cuda.current_stream())
    with cuda.stream(sync_stream):
        # Quantize the tensors and compute their scales, both packed inline
        # into the output tensor.
        quantized_tensors = fused_quantize_into_fp8(tensors, world_size)

        # Allocate the buffer that will receive the all-to-all results
        quantized_tensors_out = torch.zeros_like(quantized_tensors)
        # Exchange chunks and their scales with the other ranks
        process_group.alltoall_base(
            quantized_tensors_out.view(world_size, -1),
            quantized_tensors.view(world_size, -1),
            [],
            [],
            _to_alltoall_options(allreduce_opts),
        ).wait()
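        # After the exchange, row j of quantized_tensors_out holds rank j's
        # copy of the chunk assigned to this rank (a reduce-scatter style
        # layout), assuming fused_quantize_into_fp8 packs the data into
        # world_size equally sized chunks as the (world_size, -1) view requires.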

        # Reduce chunks locally in higher precision after dequantization.
        # The output is again quantized.
        fused_reduce_fp8(
            tensors,
            quantized_tensors_out,
            world_size,
            rank,
        )

        # Collect reduced chunks from other ranks.
        process_group.allgather_into_tensor_coalesced(
            [quantized_tensors.view(world_size, -1)],
            [torch.split(quantized_tensors_out.view(world_size, -1), 1)[rank]],
            _to_allgather_options(allreduce_opts),
        ).wait()
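        # quantized_tensors now holds, in row j, the reduced FP8 chunk owned
        # by rank j, so the full reduced result can be dequantized locally.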

        # Dequantize and copy to output buffer.
        fused_dequantize_from_fp8(tensors, quantized_tensors, world_size)

    class QuantizedAllReduceFuture(Future[None]):
        def __init__(
            self,
            sync_stream: cuda.Stream,
            quantized_tensors: torch.Tensor,
            quantized_tensors_out: torch.Tensor,
        ) -> None:
            super().__init__()
            self._sync_stream = sync_stream
            self._quantized_tensors = quantized_tensors
            self._quantized_tensors_out = quantized_tensors_out

        def wait(self) -> None:
            # Wait for the synchronization to complete.
            cuda.current_stream().wait_stream(self._sync_stream)
            # Clean up intermediate buffers.
            del self._quantized_tensors_out
            del self._quantized_tensors

    # pyre-ignore[29]
    return QuantizedAllReduceFuture(
        sync_stream, quantized_tensors, quantized_tensors_out
    )