Open-sourced update on 12/19/2024 (facebookresearch#63)

Summary: Pull Request resolved: facebookresearch#63 1. Add `HybridShardDistributor` (i.e., per-parameter HSDP or HSDP2) implemented by wz337 and hjmshi into `DistributedShampoo`. 2. Disable quantization functionality for now. Differential Revision: D67398314
tsunghsienlee · Dec 19, 2024 · cca8b37 · cca8b37
1 parent b5dd2f2
commit cca8b37
Show file tree

Hide file tree

Showing 24 changed files with 1,597 additions and 1,218 deletions.
diff --git a/distributed_shampoo/__init__.py b/distributed_shampoo/__init__.py
@@ -22,7 +22,6 @@
     FullyShardShampooConfig,
     GraftingConfig,
     HSDPShampooConfig,
-    PrecisionConfig,
     PreconditionerConfig,
     RMSpropGraftingConfig,
     SGDGraftingConfig,
@@ -58,7 +57,6 @@
     "FullyShardShampooConfig",
     "HSDPShampooConfig",
     # `precision_config`.
-    "PrecisionConfig",
     # `preconditioner_config` options.
     "PreconditionerConfig",  # Abstract base class.
     "ShampooPreconditionerConfig",  # Based on `PreconditionerConfig`.

diff --git a/distributed_shampoo/distributed_shampoo.py b/distributed_shampoo/distributed_shampoo.py
diff --git a/distributed_shampoo/examples/ddp_cifar10_example.py b/distributed_shampoo/examples/ddp_cifar10_example.py
@@ -13,7 +13,7 @@
 import torch.distributed as dist
 import torch.distributed.checkpoint as dist_checkpoint
 
-from distributed_shampoo import DDPShampooConfig, DistributedShampoo, PrecisionConfig
+from distributed_shampoo import DDPShampooConfig, DistributedShampoo
 from distributed_shampoo.examples.trainer_utils import (
     get_data_loader_and_sampler,
     get_model_and_loss_fn,
@@ -117,23 +117,12 @@
         grafting_beta2=args.grafting_beta2,
         grafting_epsilon=args.grafting_epsilon,
         use_merge_dims=args.use_merge_dims,
-        use_pytorch_compile=args.use_pytorch_compile,
         distributed_config=DDPShampooConfig(
             communication_dtype=args.communication_dtype,
             num_trainers_per_group=args.num_trainers_per_group,
             communicate_params=args.communicate_params,
         ),
-        precision_config=PrecisionConfig(
-            computation_dtype=args.computation_dtype.value,
-            factor_matrix_dtype=args.factor_matrix_dtype.value,
-            inv_factor_matrix_dtype=args.inv_factor_matrix_dtype.value,
-            corrected_eigenvalues_dtype=args.corrected_eigenvalues_dtype.value,
-            factor_matrix_eigenvectors_dtype=args.factor_matrix_eigenvectors_dtype.value,
-            filtered_grad_dtype=args.filtered_grad_dtype.value,
-            momentum_dtype=args.momentum_dtype.value,
-            grafting_state_dtype=args.grafting_state_dtype.value,
-        ),
-        use_protected_eigh=args.use_protected_eigh,
+        preconditioner_dtype=args.preconditioner_dtype,
         track_root_inv_residuals=args.track_root_inv_residuals,
         preconditioner_computation_type=args.preconditioner_computation_type,
     )

diff --git a/distributed_shampoo/examples/default_cifar10_example.py b/distributed_shampoo/examples/default_cifar10_example.py
@@ -11,7 +11,6 @@
 import os
 
 import torch
-from distributed_shampoo import PrecisionConfig
 
 from distributed_shampoo.examples.trainer_utils import (
     get_data_loader_and_sampler,
@@ -132,19 +131,8 @@ def train_default_model(
         grafting_epsilon=args.grafting_epsilon,
         grafting_beta2=args.grafting_beta2,
         use_merge_dims=args.use_merge_dims,
-        use_pytorch_compile=args.use_pytorch_compile,
         distributed_config=None,
-        precision_config=PrecisionConfig(
-            computation_dtype=args.computation_dtype.value,
-            factor_matrix_dtype=args.factor_matrix_dtype.value,
-            inv_factor_matrix_dtype=args.inv_factor_matrix_dtype.value,
-            corrected_eigenvalues_dtype=args.corrected_eigenvalues_dtype.value,
-            factor_matrix_eigenvectors_dtype=args.factor_matrix_eigenvectors_dtype.value,
-            filtered_grad_dtype=args.filtered_grad_dtype.value,
-            momentum_dtype=args.momentum_dtype.value,
-            grafting_state_dtype=args.grafting_state_dtype.value,
-        ),
-        use_protected_eigh=args.use_protected_eigh,
+        preconditioner_dtype=args.preconditioner_dtype,
         track_root_inv_residuals=args.track_root_inv_residuals,
         preconditioner_computation_type=args.preconditioner_computation_type,
     )

diff --git a/distributed_shampoo/examples/fsdp_cifar10_example.py b/distributed_shampoo/examples/fsdp_cifar10_example.py
@@ -12,11 +12,7 @@
 
 import torch.distributed as dist
 
-from distributed_shampoo import (
-    compile_fsdp_parameter_metadata,
-    FSDPShampooConfig,
-    PrecisionConfig,
-)
+from distributed_shampoo import compile_fsdp_parameter_metadata, FSDPShampooConfig
 from distributed_shampoo.examples.trainer_utils import (
     get_data_loader_and_sampler,
     get_model_and_loss_fn,
@@ -115,21 +111,10 @@
         grafting_epsilon=args.grafting_epsilon,
         grafting_beta2=args.grafting_beta2,
         use_merge_dims=args.use_merge_dims,
-        use_pytorch_compile=args.use_pytorch_compile,
         distributed_config=FSDPShampooConfig(
             param_to_metadata=compile_fsdp_parameter_metadata(model),
         ),
-        precision_config=PrecisionConfig(
-            computation_dtype=args.computation_dtype.value,
-            factor_matrix_dtype=args.factor_matrix_dtype.value,
-            inv_factor_matrix_dtype=args.inv_factor_matrix_dtype.value,
-            corrected_eigenvalues_dtype=args.corrected_eigenvalues_dtype.value,
-            factor_matrix_eigenvectors_dtype=args.factor_matrix_eigenvectors_dtype.value,
-            filtered_grad_dtype=args.filtered_grad_dtype.value,
-            momentum_dtype=args.momentum_dtype.value,
-            grafting_state_dtype=args.grafting_state_dtype.value,
-        ),
-        use_protected_eigh=args.use_protected_eigh,
+        preconditioner_dtype=args.preconditioner_dtype,
         track_root_inv_residuals=args.track_root_inv_residuals,
         preconditioner_computation_type=args.preconditioner_computation_type,
     )

diff --git a/distributed_shampoo/examples/fully_shard_cifar10_example.py b/distributed_shampoo/examples/fully_shard_cifar10_example.py
@@ -14,11 +14,7 @@
 import torch.distributed as dist
 import torch.distributed.checkpoint as dist_checkpoint
 
-from distributed_shampoo import (
-    DistributedShampoo,
-    FullyShardShampooConfig,
-    PrecisionConfig,
-)
+from distributed_shampoo import DistributedShampoo, FullyShardShampooConfig
 from distributed_shampoo.examples.trainer_utils import (
     get_data_loader_and_sampler,
     get_model_and_loss_fn,
@@ -137,19 +133,8 @@ def create_model_and_optimizer_and_loss_fn(args, device):
         grafting_epsilon=args.grafting_epsilon,
         grafting_beta2=args.grafting_beta2,
         use_merge_dims=args.use_merge_dims,
-        use_pytorch_compile=args.use_pytorch_compile,
         distributed_config=FullyShardShampooConfig(),
-        precision_config=PrecisionConfig(
-            computation_dtype=args.computation_dtype.value,
-            factor_matrix_dtype=args.factor_matrix_dtype.value,
-            inv_factor_matrix_dtype=args.inv_factor_matrix_dtype.value,
-            corrected_eigenvalues_dtype=args.corrected_eigenvalues_dtype.value,
-            factor_matrix_eigenvectors_dtype=args.factor_matrix_eigenvectors_dtype.value,
-            filtered_grad_dtype=args.filtered_grad_dtype.value,
-            momentum_dtype=args.momentum_dtype.value,
-            grafting_state_dtype=args.grafting_state_dtype.value,
-        ),
-        use_protected_eigh=args.use_protected_eigh,
+        preconditioner_dtype=args.preconditioner_dtype,
         track_root_inv_residuals=args.track_root_inv_residuals,
         preconditioner_computation_type=args.preconditioner_computation_type,
     )

diff --git a/distributed_shampoo/examples/hsdp_cifar10_example.py b/distributed_shampoo/examples/hsdp_cifar10_example.py
@@ -12,11 +12,7 @@
 
 import torch.distributed as dist
 
-from distributed_shampoo import (
-    compile_fsdp_parameter_metadata,
-    HSDPShampooConfig,
-    PrecisionConfig,
-)
+from distributed_shampoo import compile_fsdp_parameter_metadata, HSDPShampooConfig
 
 from distributed_shampoo.examples.trainer_utils import (
     get_data_loader_and_sampler,
@@ -128,23 +124,12 @@
         grafting_epsilon=args.grafting_epsilon,
         grafting_beta2=args.grafting_beta2,
         use_merge_dims=args.use_merge_dims,
-        use_pytorch_compile=args.use_pytorch_compile,
         distributed_config=HSDPShampooConfig(
             param_to_metadata=compile_fsdp_parameter_metadata(model),
             device_mesh=device_mesh,
             num_trainers_per_group=args.num_trainers_per_group,
         ),
-        precision_config=PrecisionConfig(
-            computation_dtype=args.computation_dtype.value,
-            factor_matrix_dtype=args.factor_matrix_dtype.value,
-            inv_factor_matrix_dtype=args.inv_factor_matrix_dtype.value,
-            corrected_eigenvalues_dtype=args.corrected_eigenvalues_dtype.value,
-            factor_matrix_eigenvectors_dtype=args.factor_matrix_eigenvectors_dtype.value,
-            filtered_grad_dtype=args.filtered_grad_dtype.value,
-            momentum_dtype=args.momentum_dtype.value,
-            grafting_state_dtype=args.grafting_state_dtype.value,
-        ),
-        use_protected_eigh=args.use_protected_eigh,
+        preconditioner_dtype=args.preconditioner_dtype,
         track_root_inv_residuals=args.track_root_inv_residuals,
         preconditioner_computation_type=args.preconditioner_computation_type,
     )

diff --git a/distributed_shampoo/examples/trainer_utils.py b/distributed_shampoo/examples/trainer_utils.py
@@ -30,7 +30,6 @@
     DistributedConfig,
     DistributedShampoo,
     GraftingConfig,
-    PrecisionConfig,
     PreconditionerConfig,
     RMSpropGraftingConfig,
     SGDGraftingConfig,
@@ -195,16 +194,6 @@ def get_args():
             action="store_true",
             help="Use merge dims for Shampoo.",
         )
-        parser.add_argument(
-            "--use-pytorch-compile",
-            action="store_true",
-            help="Use PyTorch compile for Shampoo.",
-        )
-        parser.add_argument(
-            "--use-protected-eigh",
-            action="store_true",
-            help="Uses protected eigendecomposition.",
-        )
         parser.add_argument(
             "--track-root-inv-residuals",
             action="store_true",
@@ -239,52 +228,10 @@ def get_args():
 
         # Arguments for mixed-precision.
         parser.add_argument(
-            "--computation-dtype",
-            type=lambda t: enum_type_parse(t, DType),
-            default=DType.FP32,
-            help="Data type for all computation in Shampoo.",
-        )
-        parser.add_argument(
-            "--factor-matrix-dtype",
-            type=lambda t: enum_type_parse(t, DType),
-            default=DType.FP32,
-            help="Data type for storing Shampoo factor matrices.",
-        )
-        parser.add_argument(
-            "--inv-factor-matrix-dtype",
-            type=lambda t: enum_type_parse(t, DType),
-            default=DType.FP32,
-            help="Data type for storing Shampoo inverse factor matrices.",
-        )
-        parser.add_argument(
-            "--corrected-eigenvalues-dtype",
-            type=lambda t: enum_type_parse(t, DType),
-            default=DType.FP32,
-            help="Data type for storing corrected eigenvalues of Shampoo preconditioner.",
-        )
-        parser.add_argument(
-            "--factor-matrix-eigenvectors-dtype",
-            type=lambda t: enum_type_parse(t, DType),
-            default=DType.FP32,
-            help="Data type for storing Shampoo factor matrices eigenvectors.",
-        )
-        parser.add_argument(
-            "--filtered-grad-dtype",
-            type=lambda t: enum_type_parse(t, DType),
-            default=DType.FP32,
-            help="Data type for storing filtered gradients.",
-        )
-        parser.add_argument(
-            "--momentum-dtype",
-            type=lambda t: enum_type_parse(t, DType),
-            default=DType.FP32,
-            help="Data type for storing momentum states.",
-        )
-        parser.add_argument(
-            "--grafting-state-dtype",
+            "--preconditioner-dtype",
             type=lambda t: enum_type_parse(t, DType),
             default=DType.FP32,
-            help="Data type for storing grafting preconditioners.",
+            help="Preconditioner dtype for Shampoo.",
         )
 
         # Arguments for DDP Shampoo.
@@ -438,10 +385,8 @@ def instantiate_optimizer(
     grafting_beta2: float,
     grafting_epsilon: float,
     use_merge_dims: bool,
-    use_pytorch_compile: bool,
     distributed_config: DistributedConfig | None,
-    precision_config: PrecisionConfig | None,
-    use_protected_eigh: bool,
+    preconditioner_dtype: DType,
     track_root_inv_residuals: bool,
     preconditioner_computation_type: PreconditionerComputationType,
 ) -> torch.optim.Optimizer:
@@ -493,10 +438,8 @@ def instantiate_optimizer(
                 grafting_type, grafting_beta2, grafting_epsilon
             ),
             use_merge_dims=use_merge_dims,
-            use_pytorch_compile=use_pytorch_compile,
             distributed_config=distributed_config,
-            precision_config=precision_config,
-            use_protected_eigh=use_protected_eigh,
+            preconditioner_dtype=preconditioner_dtype.value,
             track_root_inv_residuals=track_root_inv_residuals,
             preconditioner_config=instantiate_preconditioner_config(
                 preconditioner_computation_type

diff --git a/distributed_shampoo/gpu_tests/shampoo_pt2_test.py b/distributed_shampoo/gpu_tests/shampoo_pt2_test.py
@@ -42,7 +42,8 @@ def _shampoo_optim_factory(
             parameters,
             lr=0.01,
             betas=betas,
-            beta3=betas[0] * betas[0],
+            # TODO: comment out beta3 to unblock quantization changes; need to fix PT2 FMA changes for this test
+            # beta3=betas[0] * betas[0],
             epsilon=1e-10,
             momentum=0.9,
             dampening=0.9,