
Commit d5a014c

emlin authored and facebook-github-bot committed
support get state dict and apply state dict (pytorch#2976)
Summary:
Pull Request resolved: pytorch#2976

X-link: pytorch/FBGEMM#4145
X-link: facebookresearch/FBGEMM#1226

# Functions

**Saving State Dict**

When saving the state dict, we convert IDs from local to global (see the first sketch below). This allows us to avoid shifting IDs based on sharding decisions when tables are resharded.

**Checkpoint Loading Mode**

We have enabled a load-state-dict mode for checkpoint loading, which allows us to cache all ID, weight, and bucket tensors in memory before applying them to the backend. This approach ensures that all data is loaded correctly, even though the checkpoint client does not support ordered tensor loading.

# Current Solution

The current solution caches all data in Python tensors, following these steps (see the load-flow sketch below):

- Set self.local_weight_counts based on the checkpoint bucket tensor size.
- Enable load-state-dict mode to initialize the local cache tensors.
- Call state_dict to get empty tensors for the checkpoint loader.
- Have the checkpoint loader write data from the persisted checkpoint into the cached tensors.
- Call apply_state_dict to write all cached tensors to the backend.

**Apply State Dict Flow**

During the apply_state_dict step, we perform the following operations (see the chunked-write sketch below):

- If optimizer offloading is enabled:
  - Loop through chunks of weight and optimizer state.
  - Concatenate the weight and optimizer state together.
  - Write them to the backend through the KVTensorWrapper interface.
- If optimizer offloading is disabled:
  - Set the optimizer state to a device tensor based on IDs.
  - Write the IDs and weights to the backend for each table.

# Limitations

The current solution has two limitations:

- Memory overhead: when writing data to the backend, the Python tensor's memory cannot be released until the whole tensor has been duplicated in the backend. This can lead to high memory usage, especially with a single large table.
- Performance regression: with optimizer offloading, we need to concatenate the weight and optimizer state before writing to the backend. To avoid tripling one large tensor's memory footprint, we loop through smaller chunks during writing, which can cause a performance regression.

# Future Improvements

After the first end-to-end version is ready, we plan to support unordered loading from the backend to improve performance and reduce memory overhead.

Reviewed By: bobbyliujb

Differential Revision: D74790154
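As an illustration of the ID conversion described under "Saving State Dict", the sketch below shifts bucket-local row IDs into a global ID space by adding a per-bucket offset. This is a minimal sketch, not code from this PR; the helper name and the fixed-bucket-size assumption are hypothetical.

```python
import torch


def local_to_global_ids(
    local_ids: torch.Tensor, bucket_id: int, bucket_size: int
) -> torch.Tensor:
    """Hypothetical helper: shift bucket-local IDs into the global ID space.

    Saving global IDs means a later resharding only changes which buckets a
    rank owns, not the IDs stored in the checkpoint.
    """
    return local_ids + bucket_id * bucket_size


# e.g. rows 0, 3, 7 of bucket 4 with an assumed bucket size of 1024
print(local_to_global_ids(torch.tensor([0, 3, 7]), bucket_id=4, bucket_size=1024))
# tensor([4096, 4099, 4103])
```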
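The "Current Solution" steps map to roughly the following call sequence from the checkpoint loader's side. This is a hedged sketch: apply_state_dict and local_weight_counts come from the summary above, while enable_load_state_dict_mode and checkpoint_reader.read_into are assumed names used only for illustration.

```python
def load_from_checkpoint(emb_kernel, checkpoint_reader, bucket_sizes) -> None:
    # 1. Size the local caches from the checkpoint bucket tensors.
    emb_kernel.local_weight_counts = bucket_sizes
    # 2. Enter load-state-dict mode so empty cache tensors are allocated.
    emb_kernel.enable_load_state_dict_mode(True)
    # 3. state_dict() now exposes those (empty) cached tensors.
    state = emb_kernel.state_dict()
    # 4. The checkpoint loader fills the cached tensors in place, in any order,
    #    since the checkpoint client does not guarantee ordered tensor loading.
    checkpoint_reader.read_into(state)
    # 5. Push every cached ID/weight/bucket tensor into the backend KV store.
    emb_kernel.apply_state_dict()
    emb_kernel.enable_load_state_dict_mode(False)
```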
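For the optimizer-offloading branch of apply_state_dict, the chunked concatenation could look like the sketch below: only one chunk of rows is fused and written at a time, trading extra write calls for never materializing a full weight-plus-optimizer copy of the table. write_to_backend and the chunk size are placeholders, not the KVTensorWrapper API.

```python
from typing import Callable

import torch


def apply_chunked(
    weights: torch.Tensor,  # [num_rows, emb_dim]
    opt_state: torch.Tensor,  # [num_rows, opt_dim]
    write_to_backend: Callable[[int, torch.Tensor], None],
    chunk_rows: int = 65536,
) -> None:
    for start in range(0, weights.shape[0], chunk_rows):
        end = min(start + chunk_rows, weights.shape[0])
        # Fuse [emb_dim | opt_dim] columns for this chunk of rows only, so at
        # most one chunk-sized copy exists alongside the source tensors.
        fused = torch.cat([weights[start:end], opt_state[start:end]], dim=1)
        write_to_backend(start, fused)
```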
1 parent a2b1ee6 commit d5a014c

File tree

1 file changed (+81, −58 lines)


torchrec/distributed/batched_embedding_kernel.py
Lines changed: 81 additions & 58 deletions

@@ -29,6 +29,10 @@

 import torch
 import torch.distributed as dist
+from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
+    BackendType,
+    KVZCHParams,
+)
 from fbgemm_gpu.split_table_batched_embeddings_ops_inference import (
     IntNBitTableBatchedEmbeddingBagsCodegen,
 )
@@ -42,7 +46,6 @@
     SplitTableBatchedEmbeddingBagsCodegen,
 )
 from fbgemm_gpu.tbe.ssd import ASSOC, SSDTableBatchedEmbeddingBags
-from fbgemm_gpu.tbe.ssd.training import BackendType, KVZCHParams
 from fbgemm_gpu.tbe.ssd.utils.partially_materialized_tensor import (
     PartiallyMaterializedTensor,
 )
@@ -861,6 +864,7 @@ def _gen_named_parameters_by_table_fused(
         table_count = table_name_to_count.pop(table_name)
         if emb_module.weights_precision == SparseType.INT8:
             dim += emb_module.int8_emb_row_dim_offset
+        # pyre-ignore [29]
         offset = emb_module.weights_physical_offsets[t_idx]
         weights: torch.Tensor
         if location == EmbeddingLocation.DEVICE.value:
@@ -1253,6 +1257,16 @@ def __init__(
         compute_kernel = config.embedding_tables[0].compute_kernel
         embedding_location = compute_kernel_to_embedding_location(compute_kernel)

+        # every split_embeding_weights call is expensive, since it iterates over all the elements in the backend kv db
+        # use split weights result cache so that multiple calls in the same train iteration will only trigger once
+        self._split_weights_res: Optional[
+            Tuple[
+                List[ShardedTensor],
+                List[ShardedTensor],
+                List[ShardedTensor],
+            ]
+        ] = None
+
         self._emb_module: SSDTableBatchedEmbeddingBags = SSDTableBatchedEmbeddingBags(
             embedding_specs=list(zip(self._num_embeddings, self._local_cols)),
             feature_table_map=self._feature_table_map,
@@ -1265,11 +1279,18 @@ def __init__(
         logger.info(
             f"tbe_unique_id:{self._emb_module.tbe_unique_id} => table name to count dict:{self.table_name_to_count}"
         )
-
-        self._optim: KeyValueEmbeddingFusedOptimizer = KeyValueEmbeddingFusedOptimizer(
-            config,
-            self._emb_module,
-            pg,
+        self._table_name_to_weight_count_per_rank: Dict[str, List[int]] = {}
+        self._init_sharded_split_embedding_weights()  # this will populate self._split_weights_res
+        self._optim: ZeroCollisionKeyValueEmbeddingFusedOptimizer = (
+            ZeroCollisionKeyValueEmbeddingFusedOptimizer(
+                config,
+                self._emb_module,
+                # pyre-ignore[16]
+                sharded_embedding_weights_by_table=self._split_weights_res[0],
+                table_name_to_weight_count_per_rank=self._table_name_to_weight_count_per_rank,
+                sharded_embedding_weight_ids=self._split_weights_res[1],
+                pg=pg,
+            )
         )
         self._param_per_table: Dict[str, nn.Parameter] = dict(
             _gen_named_parameters_by_table_ssd_pmt(
@@ -1281,16 +1302,6 @@ def __init__(
         )
         self.init_parameters()

-        # every split_embeding_weights call is expensive, since it iterates over all the elements in the backend kv db
-        # use split weights result cache so that multiple calls in the same train iteration will only trigger once
-        self._split_weights_res: Optional[
-            Tuple[
-                List[ShardedTensor],
-                List[ShardedTensor],
-                List[ShardedTensor],
-            ]
-        ] = None
-
     def init_parameters(self) -> None:
         """
         An advantage of KV TBE is that we don't need to init weights. Hence skipping.
@@ -1393,7 +1404,7 @@ def named_parameters(
     # pyre-ignore [15]
     def named_split_embedding_weights(
         self, prefix: str = "", recurse: bool = True, remove_duplicate: bool = True
-    ) -> Iterator[Tuple[str, PartiallyMaterializedTensor]]:
+    ) -> Iterator[Tuple[str, Union[PartiallyMaterializedTensor, torch.Tensor]]]:
         assert (
             remove_duplicate
         ), "remove_duplicate=False not supported in BaseBatchedEmbedding.named_split_embedding_weights"
@@ -1404,50 +1415,41 @@ def named_split_embedding_weights(
             key = append_prefix(prefix, f"{config.name}.weight")
             yield key, tensor

-    def get_named_split_embedding_weights_snapshot(self, prefix: str = "") -> Iterator[
-        Tuple[
-            str,
-            Union[ShardedTensor, PartiallyMaterializedTensor],
-            Optional[ShardedTensor],
-            Optional[ShardedTensor],
-        ]
-    ]:
-        """
-        Return an iterator over embedding tables, for each table yielding
-            table name,
-            PMT for embedding table with a valid RocksDB snapshot to support tensor IO
-            optional ShardedTensor for weight_id
-            optional ShardedTensor for bucket_cnt
-        """
-        if self._split_weights_res is not None:
-            pmt_sharded_t_list = self._split_weights_res[0]
-            # pyre-ignore
-            weight_id_sharded_t_list = self._split_weights_res[1]
-            bucket_cnt_sharded_t_list = self._split_weights_res[2]
-            for table_idx, pmt_sharded_t in enumerate(pmt_sharded_t_list):
-                table_config = self._config.embedding_tables[table_idx]
-                key = append_prefix(prefix, f"{table_config.name}")
-
-                yield key, pmt_sharded_t, weight_id_sharded_t_list[
-                    table_idx
-                ], bucket_cnt_sharded_t_list[table_idx]
+    # initialize sharded _split_weights_res if it's None
+    # this method is used to generate sharded embedding weights once for all following state_dict
+    # calls in checkpointing and publishing.
+    # When training is resumed, the cached value will be reset to None and the value needs to be
+    # rebuilt for next checkpointing and publishing, as the weight id, weight embedding will be updated
+    # during training in backend k/v store.
+    def _init_sharded_split_embedding_weights(
+        self, prefix: str = "", force_regenerate: bool = False
+    ) -> None:
+        if not force_regenerate and self._split_weights_res is not None:
             return

         pmt_list, weight_ids_list, bucket_cnt_list = self.split_embedding_weights(
-            no_snapshot=False, should_flush=True
+            no_snapshot=False,
+            should_flush=True,
         )
         emb_table_config_copy = copy.deepcopy(self._config.embedding_tables)
         for emb_table in emb_table_config_copy:
             emb_table.local_metadata.placement._device = torch.device("cpu")

         pmt_sharded_t_list = create_virtual_sharded_tensors(
-            emb_table_config_copy, pmt_list, self._pg, prefix
+            emb_table_config_copy,
+            pmt_list,
+            self._pg,
+            prefix,
         )
         weight_id_sharded_t_list = create_virtual_sharded_tensors(
             emb_table_config_copy, weight_ids_list, self._pg, prefix  # pyre-ignore
         )
         bucket_cnt_sharded_t_list = create_virtual_sharded_tensors(
-            emb_table_config_copy, bucket_cnt_list, self._pg, prefix  # pyre-ignore
+            emb_table_config_copy,
+            # pyre-ignore [6]
+            bucket_cnt_list,
+            self._pg,
+            prefix,
         )
         # pyre-ignore
         assert len(pmt_list) == len(weight_ids_list) == len(bucket_cnt_list)
@@ -1456,6 +1458,34 @@ def get_named_split_embedding_weights_snapshot(self, prefix: str = "") -> Iterat
             == len(weight_id_sharded_t_list)
             == len(bucket_cnt_sharded_t_list)
         )
+        self._split_weights_res = (
+            pmt_sharded_t_list,
+            weight_id_sharded_t_list,
+            bucket_cnt_sharded_t_list,
+        )
+
+    def get_named_split_embedding_weights_snapshot(self, prefix: str = "") -> Iterator[
+        Tuple[
+            str,
+            Union[ShardedTensor, PartiallyMaterializedTensor],
+            Optional[ShardedTensor],
+            Optional[ShardedTensor],
+        ]
+    ]:
+        """
+        Return an iterator over embedding tables, for each table yielding
+            table name,
+            PMT for embedding table with a valid RocksDB snapshot to support tensor IO
+            optional ShardedTensor for weight_id
+            optional ShardedTensor for bucket_cnt
+        """
+        self._init_sharded_split_embedding_weights()
+        # pyre-ignore[16]
+        self._optim.set_sharded_embedding_weight_ids(self._split_weights_res[1])
+
+        pmt_sharded_t_list = self._split_weights_res[0]
+        weight_id_sharded_t_list = self._split_weights_res[1]
+        bucket_cnt_sharded_t_list = self._split_weights_res[2]
         for table_idx, pmt_sharded_t in enumerate(pmt_sharded_t_list):
             table_config = self._config.embedding_tables[table_idx]
             key = append_prefix(prefix, f"{table_config.name}")
@@ -1464,12 +1494,6 @@ def get_named_split_embedding_weights_snapshot(self, prefix: str = "") -> Iterat
                 table_idx
             ], bucket_cnt_sharded_t_list[table_idx]

-        self._split_weights_res = (
-            pmt_sharded_t_list,
-            weight_id_sharded_t_list,
-            bucket_cnt_sharded_t_list,
-        )
-
     def flush(self) -> None:
         """
         Flush the embeddings in cache back to SSD. Should be pretty expensive.
@@ -1486,19 +1510,18 @@ def purge(self) -> None:

     # pyre-ignore [15]
     def split_embedding_weights(
-        self, no_snapshot: bool = True, should_flush: bool = False
+        self, no_snapshot: bool = True, should_flush: bool = True
     ) -> Tuple[
-        List[PartiallyMaterializedTensor],
+        Union[List[PartiallyMaterializedTensor], List[torch.Tensor]],
         Optional[List[torch.Tensor]],
         Optional[List[torch.Tensor]],
     ]:
-        return self.emb_module.split_embedding_weights(
-            no_snapshot, should_flush=should_flush
-        )
+        return self.emb_module.split_embedding_weights(no_snapshot, should_flush)

     def forward(self, features: KeyedJaggedTensor) -> torch.Tensor:
         # reset split weights during training
         self._split_weights_res = None
+        self._optim.set_sharded_embedding_weight_ids(sharded_embedding_weight_ids=None)

         return self.emb_module(
             indices=features.values().long(),

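For context on how the new cache in this diff is exercised, here is a hedged usage sketch (assumed caller code, not part of the diff): a checkpointing pass iterates get_named_split_embedding_weights_snapshot once per table, repeated calls within the same iteration reuse the cached _split_weights_res instead of re-walking the backend KV store, and the next forward() clears the cache so a fresh snapshot is built after training resumes.

```python
def collect_snapshot_tensors(kernel, prefix: str = ""):
    # kernel is assumed to be the KV/SSD embedding kernel modified in this diff.
    out = {}
    for name, pmt_t, weight_id_t, bucket_cnt_t in (
        kernel.get_named_split_embedding_weights_snapshot(prefix)
    ):
        # name         -> table name
        # pmt_t        -> sharded PMT weights backed by a RocksDB snapshot
        # weight_id_t  -> optional sharded weight IDs
        # bucket_cnt_t -> optional sharded bucket counts
        out[name] = (pmt_t, weight_id_t, bucket_cnt_t)
    return out
```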