Skip to content

Commit b1aa49c

Browse files
emasap authored and facebook-github-bot committed
Add MTIA info to sharder (#3032)
Summary: Pull Request resolved: #3032. Reviewed By: kausv. Differential Revision: D74064134. fbshipit-source-id: 3a56ced167b2cf0a0559ef106a902c68bd241eae
1 parent 45223e0 commit b1aa49c

File tree

6 files changed

+12
-12
lines changed

6 files changed

+12
-12
lines changed

torchrec/distributed/embedding_types.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -519,8 +519,7 @@ def storage_usage(
519519
storage_map = {
520520
"cuda": ParameterStorage.HBM,
521521
"cpu": ParameterStorage.DDR,
522-
# TODO: Update it later. Setting for MTIA is same as CPU's for now.
523-
"mtia": ParameterStorage.DDR,
522+
"mtia": ParameterStorage.HBM,
524523
}
525524
return {
526525
storage_map[compute_device_type].value: get_tensor_size_bytes(tensor)

torchrec/distributed/planner/enumerators.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,9 @@ def __init__(
8080
self._use_exact_enumerate_order: bool = (
8181
use_exact_enumerate_order if use_exact_enumerate_order else False
8282
)
83-
memory_type = "hbm_cap" if topology.compute_device == "cuda" else "ddr_cap"
83+
memory_type = (
84+
"hbm_cap" if topology.compute_device in {"cuda", "mtia"} else "ddr_cap"
85+
)
8486
self._device_memory_sizes: Optional[
8587
List[int]
8688
] = ( # only used with custom topology where memory is different within a topology

torchrec/distributed/planner/shard_estimators.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1261,7 +1261,7 @@ def calculate_shard_storages(
12611261
count_ephemeral_storage_cost=count_ephemeral_storage_cost,
12621262
is_inference=is_inference,
12631263
)
1264-
if compute_device == "cuda"
1264+
if compute_device in {"cuda", "mtia"}
12651265
else 0
12661266
)
12671267
for input_size, output_size, hbm_specific_size in zip(
@@ -1273,7 +1273,7 @@ def calculate_shard_storages(
12731273
ddr_sizes: List[int] = [
12741274
(
12751275
input_size + output_size + ddr_specific_size
1276-
if compute_device in {"cpu", "mtia"} and not is_inference
1276+
if compute_device == "cpu" and not is_inference
12771277
else ddr_specific_size
12781278
)
12791279
for input_size, output_size, ddr_specific_size in zip(

torchrec/distributed/planner/storage_reservations.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ def _reserve_dense_storage(
7373
dense_tensor_size = dense_tensor_estimate
7474

7575
dense_tensor_storage = Storage(
76-
hbm=dense_tensor_size if topology.compute_device == "cuda" else 0,
77-
ddr=dense_tensor_size if topology.compute_device in {"cpu", "mtia"} else 0,
76+
hbm=dense_tensor_size if topology.compute_device in {"cuda", "mtia"} else 0,
77+
ddr=dense_tensor_size if topology.compute_device == "cpu" else 0,
7878
)
7979

8080
for device in topology.devices:
@@ -93,8 +93,8 @@ def _reserve_kjt_storage(
9393
kjt_size = math.ceil(sum(batch_inputs) * float(input_data_type_size)) * multiplier
9494

9595
kjt_storage = Storage(
96-
hbm=kjt_size if topology.compute_device == "cuda" else 0,
97-
ddr=kjt_size if topology.compute_device in {"cpu", "mtia"} else 0,
96+
hbm=kjt_size if topology.compute_device in {"cuda", "mtia"} else 0,
97+
ddr=kjt_size if topology.compute_device == "cpu" else 0,
9898
)
9999

100100
for device in topology.devices:

torchrec/distributed/planner/types.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ def __init__(
284284
self._world_size = world_size
285285

286286
hbm_per_device = [0] * world_size
287-
if self._compute_device == "cuda":
287+
if self._compute_device == "cuda" or self._compute_device == "mtia":
288288
hbm_per_device = [hbm_cap if hbm_cap else HBM_CAP] * world_size
289289
ddr_cap_per_rank = [ddr_cap if ddr_cap else DDR_CAP] * world_size
290290

torchrec/distributed/types.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1197,8 +1197,7 @@ def storage_usage(
11971197
storage_map = {
11981198
"cuda": ParameterStorage.HBM,
11991199
"cpu": ParameterStorage.DDR,
1200-
# TODO: Update it later. Setting for MTIA is same as CPU's for now.
1201-
"mtia": ParameterStorage.DDR,
1200+
"mtia": ParameterStorage.HBM,
12021201
}
12031202
return {storage_map[compute_device_type].value: get_tensor_size_bytes(tensor)}
12041203

0 commit comments

Comments (0)