Commit 6528014

Add multi-gpu support to MonaiAlgo (#5228)
Signed-off-by: Holger Roth <[email protected]>

Fixes #5195.

### Description
Add support for MonaiAlgo to be run with torchrun for multi-gpu training.

### Types of changes
- [x] Non-breaking change (fix or new feature that would not break existing functionality).
- [ ] Breaking change (fix or new feature that would cause existing functionality to change).
- [ ] New tests added to cover the changes.
- [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`.
- [ ] In-line docstrings updated.
- [ ] Documentation updated, tested `make html` command in the `docs/` folder.
1 parent ef68f16 commit 6528014
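
For context, a hedged usage sketch of the new options (not part of the commit): `multi_gpu`, `backend`, and `init_method` come from this change, while `bundle_root`, the `ExtraItems` import path, and the `extra` contents below are illustrative assumptions; an FL runtime would normally supply them.

```python
# Hedged usage sketch; launch one process per GPU, e.g.:
#   torchrun --nproc_per_node=2 fl_client.py
from monai.fl.client.monai_algo import MonaiAlgo
from monai.fl.utils.constants import ExtraItems  # assumed import path for ExtraItems

algo = MonaiAlgo(
    bundle_root="./my_bundle",   # assumed: path to the MONAI bundle used for training
    multi_gpu=True,              # new in this commit: enables the torch.distributed code path
    backend="nccl",              # new: torch.distributed backend (default)
    init_method="env://",        # new: reads the env vars set by torchrun
)
algo.initialize(extra={ExtraItems.CLIENT_NAME: "site-1"})  # client name otherwise defaults to "noname"
# ... FL rounds: algo.train(...), algo.get_weights(...), algo.evaluate(...) ...
algo.finalize()  # tears down the process group when multi_gpu=True
```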

File tree

2 files changed: +36 -0 lines changed

monai/fl/client/monai_algo.py

Lines changed: 34 additions & 0 deletions
```diff
@@ -15,6 +15,7 @@
 from typing import Optional, Union
 
 import torch
+import torch.distributed as dist
 
 import monai
 from monai.bundle import ConfigParser
@@ -112,6 +113,9 @@ class MonaiAlgo(ClientAlgo):
         benchmark: set benchmark to `False` for full deterministic behavior in cuDNN components.
             Note, full determinism in federated learning depends also on deterministic behavior of other FL components,
             e.g., the aggregator, which is not controlled by this class.
+        multi_gpu: whether to run MonaiAlgo in a multi-GPU setting; defaults to `False`.
+        backend: backend to use for torch.distributed; defaults to "nccl".
+        init_method: init_method for torch.distributed; defaults to "env://".
     """
 
     def __init__(
@@ -128,6 +132,9 @@ def __init__(
         save_dict_key: Optional[str] = "model",
         seed: Optional[int] = None,
         benchmark: bool = True,
+        multi_gpu: bool = False,
+        backend: str = "nccl",
+        init_method: str = "env://",
     ):
         self.logger = logging.getLogger(self.__class__.__name__)
         if config_evaluate_filename == "default":
@@ -144,6 +151,9 @@ def __init__(
         self.save_dict_key = save_dict_key
         self.seed = seed
         self.benchmark = benchmark
+        self.multi_gpu = multi_gpu
+        self.backend = backend
+        self.init_method = init_method
 
         self.app_root = None
         self.train_parser = None
@@ -156,6 +166,7 @@ def __init__(
         self.post_evaluate_filters = None
         self.iter_of_start_time = 0
         self.global_weights = None
+        self.rank = 0
 
         self.phase = FlPhase.IDLE
         self.client_name = None
@@ -174,6 +185,15 @@ def initialize(self, extra=None):
         self.client_name = extra.get(ExtraItems.CLIENT_NAME, "noname")
         self.logger.info(f"Initializing {self.client_name} ...")
 
+        if self.multi_gpu:
+            dist.init_process_group(backend=self.backend, init_method=self.init_method)
+            self._set_cuda_device()
+            self.logger.info(
+                f"Using multi-gpu training on rank {self.rank} (available devices: {torch.cuda.device_count()})"
+            )
+            if self.rank > 0:
+                self.logger.setLevel(logging.WARNING)
+
         if self.seed:
             monai.utils.set_determinism(seed=self.seed)
         torch.backends.cudnn.benchmark = self.benchmark
@@ -243,6 +263,8 @@ def train(self, data: ExchangeObject, extra=None):
             extra: Dict with additional information that can be provided by FL system.
 
         """
+        self._set_cuda_device()
+
         if extra is None:
             extra = {}
         if not isinstance(data, ExchangeObject):
@@ -284,6 +306,8 @@ def get_weights(self, extra=None):
                 or load requested model type from disk (`ModelType.BEST_MODEL` or `ModelType.FINAL_MODEL`).
 
         """
+        self._set_cuda_device()
+
         if extra is None:
             extra = {}
 
@@ -361,6 +385,8 @@ def evaluate(self, data: ExchangeObject, extra=None):
             return_metrics: `ExchangeObject` containing evaluation metrics.
 
         """
+        self._set_cuda_device()
+
         if extra is None:
             extra = {}
         if not isinstance(data, ExchangeObject):
@@ -421,6 +447,9 @@ def finalize(self, extra=None):
             self.logger.info(f"Terminating {self.client_name} evaluator...")
             self.evaluator.terminate()
 
+        if self.multi_gpu:
+            dist.destroy_process_group()
+
     def _check_converted(self, global_weights, local_var_dict, n_converted):
         if n_converted == 0:
             self.logger.warning(
@@ -447,3 +476,8 @@ def _add_config_files(self, config_files):
                     f"Expected config files to be of type str or list but got {type(config_files)}: {config_files}"
                 )
         return files
+
+    def _set_cuda_device(self):
+        if self.multi_gpu:
+            self.rank = int(os.environ["LOCAL_RANK"])
+            torch.cuda.set_device(self.rank)
```
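
For readers unfamiliar with the `env://` pattern, here is a minimal standalone sketch of the initialization flow added above, independent of MONAI; it assumes the script is launched with `torchrun` and that NCCL-capable GPUs are available.

```python
import os

import torch
import torch.distributed as dist

# Assumed launch: torchrun --nproc_per_node=<num_gpus> this_script.py
# torchrun sets LOCAL_RANK, RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT,
# which init_method="env://" reads to form the process group.
dist.init_process_group(backend="nccl", init_method="env://")

local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)  # pin this process to its own GPU, as _set_cuda_device does above

# ... per-rank training would run here; MonaiAlgo raises non-zero ranks to WARNING-level logging ...

dist.destroy_process_group()  # mirrors the cleanup added to MonaiAlgo.finalize()
```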

monai/networks/utils.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -516,6 +516,8 @@ def copy_model_state(
     unchanged_keys = sorted(set(all_keys).difference(updated_keys))
     logger.info(f"'dst' model updated: {len(updated_keys)} of {len(dst_dict)} variables.")
     if inplace and isinstance(dst, torch.nn.Module):
+        if isinstance(dst, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
+            dst = dst.module
         dst.load_state_dict(dst_dict)
     return dst_dict, updated_keys, unchanged_keys
 
```
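
A small CPU-only sketch (using plain `torch.nn` modules, no MONAI involved) of why `copy_model_state` now unwraps the parallel wrapper: `DataParallel`/`DistributedDataParallel` prefix parameter names with `module.`, so a state dict built for the plain model has to be loaded into `dst.module`.

```python
import torch.nn as nn

model = nn.Linear(4, 2)
wrapped = nn.DataParallel(model)  # DistributedDataParallel renames keys the same way

print(list(model.state_dict()))    # ['weight', 'bias']
print(list(wrapped.state_dict()))  # ['module.weight', 'module.bias']

plain_sd = model.state_dict()
wrapped.module.load_state_dict(plain_sd)   # what the patched copy_model_state effectively does
# wrapped.load_state_dict(plain_sd) would fail with missing "module.*" keys
```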
