
Commit e051cad

q10 authored and facebook-github-bot committed

Append columns to the SSD cache for storing optimizer data, v3 (#4125)

Summary:
X-link: facebookresearch/FBGEMM#1206
Pull Request resolved: #4125

- Append columns to the SSD cache for storing optimizer data. For backwards compatibility, this feature is disabled by default, unless the client builds `SSDTableBatchedEmbeddingBags` with `KVZCHParams` that has `enable_optimizer_offloading` set to `true`.
- This is a graft of D74051349 to land to main, since D74051349 is based on the ZCH WIP stack.

Reviewed By: sryap, duduyi2013

Differential Revision: D74748058

fbshipit-source-id: 67f086884f6d4204e551d96a8d52b16527fd3ce2

1 parent d377dd4 · commit e051cad
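For context, opting in happens at construction time. Below is a minimal sketch of how a client might enable the feature; the import paths and the constructor arguments other than kv_zch_params are assumptions for illustration only, and other KVZCHParams fields are assumed to take defaults.

    # Hypothetical opt-in sketch; only the kv_zch_params / enable_optimizer_offloading
    # switch is the mechanism described by this commit.
    from fbgemm_gpu.split_table_batched_embeddings_ops_common import KVZCHParams  # assumed path
    from fbgemm_gpu.tbe.ssd import SSDTableBatchedEmbeddingBags  # assumed path

    emb = SSDTableBatchedEmbeddingBags(
        embedding_specs=[(1000, 128)],         # (rows, dim) per table; illustrative
        cache_sets=1024,                       # illustrative L1 cache size
        ssd_storage_directory="/tmp/ssd_tbe",  # illustrative path
        # Optimizer state offloading stays disabled unless this is set:
        kv_zch_params=KVZCHParams(enable_optimizer_offloading=True),
    )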

File tree: 5 files changed, +111 −51 lines

  fbgemm_gpu/fbgemm_gpu/split_embedding_configs.py
  fbgemm_gpu/fbgemm_gpu/tbe/ssd/common.py
  fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
  fbgemm_gpu/test/tbe/ssd/ssd_l2_cache_test.py
  fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py

fbgemm_gpu/fbgemm_gpu/split_embedding_configs.py
Lines changed: 18 additions & 0 deletions

@@ -8,6 +8,7 @@
 # pyre-strict
 
 import enum
+import math
 from typing import Any, Dict  # noqa: F401
 
 import torch
@@ -40,6 +41,23 @@ class EmbOptimType(enum.Enum):
     def __str__(self) -> str:
         return self.value
 
+    def state_size(self) -> int:
+        """
+        Returns the size of the data (in bytes) required to hold the optimizer
+        state (per table row), or 0 if none needed
+        """
+        return {
+            # Only holds the momentum float value per row
+            EmbOptimType.EXACT_ROWWISE_ADAGRAD: torch.float32.itemsize,
+        }.get(self, 0)
+
+    def state_size_dim(self, dtype: torch.dtype) -> int:
+        """
+        Returns the size of the data (in units of elements of dtype) required to
+        hold optimizer information (per table row)
+        """
+        return int(math.ceil(self.state_size() / dtype.itemsize))
+

 # Base class for quantization configuration (in case other numeric types have
 # configs)
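As a quick sanity check of the arithmetic above (a standalone sketch that only mirrors the new methods, not the fbgemm_gpu API): EXACT_ROWWISE_ADAGRAD keeps a single fp32 momentum value per row, so state_size() is 4 bytes and state_size_dim() rounds that up to whole elements of the cache dtype.

    import math
    import torch

    def state_size_dim(state_size_bytes: int, dtype: torch.dtype) -> int:
        # Round the optimizer-state byte count up to whole elements of `dtype`,
        # mirroring EmbOptimType.state_size_dim above.
        return int(math.ceil(state_size_bytes / dtype.itemsize))

    momentum_bytes = torch.float32.itemsize  # 4 bytes per row for rowwise Adagrad
    assert state_size_dim(momentum_bytes, torch.float32) == 1  # one fp32 element
    assert state_size_dim(momentum_bytes, torch.float16) == 2  # two fp16 elements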

fbgemm_gpu/fbgemm_gpu/tbe/ssd/common.py
Lines changed: 22 additions & 0 deletions

@@ -18,3 +18,25 @@
     pass
 
 ASSOC = 32
+
+
+def pad4(value: int) -> int:
+    """
+    Compute the smallest multiple of 4 that is greater than or equal to the given value.
+
+    Parameters:
+        value (int): The integer to align (must be non-negative).
+
+    Returns:
+        int: The aligned value.
+
+    Raises:
+        ValueError: If the input is negative.
+        TypeError: If the input is not an integer.
+    """
+    if not isinstance(value, int):
+        raise TypeError("Input must be an integer")
+    if value < 0:
+        raise ValueError("Input must be a non-negative integer")
+
+    return (value + 3) & ~3
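The bit trick in pad4 is worth spelling out: adding 3 and then masking off the low two bits rounds any non-negative integer up to the next multiple of 4. A standalone sketch of the behavior:

    def pad4(value: int) -> int:
        # Add 3, then clear the low two bits: rounds up to the nearest multiple of 4.
        return (value + 3) & ~3

    assert pad4(0) == 0
    assert pad4(1) == 4
    assert pad4(2) == 4
    assert pad4(4) == 4
    assert pad4(5) == 8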

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
Lines changed: 57 additions & 33 deletions

@@ -16,6 +16,7 @@
 import tempfile
 import threading
 import time
+from functools import cached_property
 from math import floor, log2
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 import torch  # usort:skip
@@ -57,7 +58,7 @@
 
 from ..cache import get_unique_indices_v2
 
-from .common import ASSOC
+from .common import ASSOC, pad4
 from .utils.partially_materialized_tensor import PartiallyMaterializedTensor
 
 
@@ -172,6 +173,29 @@ def __init__(
     ) -> None:
         super(SSDTableBatchedEmbeddingBags, self).__init__()
 
+        assert optimizer in (
+            OptimType.EXACT_ROWWISE_ADAGRAD,
+        ), f"Optimizer {optimizer} is not supported by SSDTableBatchedEmbeddingBags"
+        self.optimizer = optimizer
+
+        assert weights_precision in (SparseType.FP32, SparseType.FP16)
+        self.weights_precision = weights_precision
+        self.output_dtype: int = output_dtype.as_int()
+
+        # Zero collision TBE configurations
+        self.kv_zch_params = kv_zch_params
+        self.backend_type = backend_type
+        self.enable_optimizer_offloading: bool = False
+        if self.kv_zch_params:
+            self.kv_zch_params.validate()
+            self.enable_optimizer_offloading = (
+                # pyre-ignore [16]
+                self.kv_zch_params.enable_optimizer_offloading
+            )
+
+        if self.enable_optimizer_offloading:
+            logging.info("Optimizer state offloading is enabled")
+
         self.pooling_mode = pooling_mode
         self.bounds_check_mode_int: int = bounds_check_mode.value
         self.embedding_specs = embedding_specs
@@ -207,7 +231,11 @@ def __init__(
         feature_dims = [dims[t] for t in self.feature_table_map]
         D_offsets = [dims[t] for t in self.feature_table_map]
         D_offsets = [0] + list(itertools.accumulate(D_offsets))
+
+        # Sum of row length of all tables
         self.total_D: int = D_offsets[-1]
+
+        # Max number of elements required to store a row in the cache
         self.max_D: int = max(dims)
         self.register_buffer(
             "D_offsets",
@@ -273,15 +301,15 @@ def __init__(
         assert (
             element_size == 4 or element_size == 2
         ), f"Invalid element size {element_size}"
-        cache_size = cache_sets * ASSOC * element_size * self.max_D
+        cache_size = cache_sets * ASSOC * element_size * self.cache_row_dim
         logging.info(
             f"Using cache for SSD with admission algorithm "
             f"{CacheAlgorithm.LRU}, {cache_sets} sets, stored on {'DEVICE' if ssd_cache_location is EmbeddingLocation.DEVICE else 'MANAGED'} with {ssd_rocksdb_shards} shards, "
             f"SSD storage directory: {ssd_storage_directory}, "
             f"Memtable Flush Period: {ssd_memtable_flush_period}, "
             f"Memtable Flush Offset: {ssd_memtable_flush_offset}, "
             f"Desired L0 files per compaction: {ssd_l0_files_per_compact}, "
-            f"{cache_size / 1024.0 / 1024.0 / 1024.0 : .2f}GB, "
+            f"Cache size: {cache_size / 1024.0 / 1024.0 / 1024.0 : .2f}GB, "
             f"weights precision: {weights_precision}, "
             f"output dtype: {output_dtype}, "
             f"chunk size in bulk init: {bulk_init_chunk_size} bytes, backend_type: {backend_type}, "
@@ -331,10 +359,6 @@ def __init__(
             EmbeddingLocation.DEVICE,
         )
 
-        assert weights_precision in (SparseType.FP32, SparseType.FP16)
-        self.weights_precision = weights_precision
-        self.output_dtype: int = output_dtype.as_int()
-
         cache_dtype = weights_precision.as_dtype()
         if ssd_cache_location == EmbeddingLocation.MANAGED:
             self.register_buffer(
@@ -345,7 +369,7 @@ def __init__(
                     device=self.current_device,
                     dtype=cache_dtype,
                 ),
-                [cache_sets * ASSOC, self.max_D],
+                [cache_sets * ASSOC, self.cache_row_dim],
                 is_host_mapped=self.uvm_host_mapped,
             ),
         )
@@ -354,7 +378,7 @@ def __init__(
                 "lxu_cache_weights",
                 torch.zeros(
                     cache_sets * ASSOC,
-                    self.max_D,
+                    self.cache_row_dim,
                     device=self.current_device,
                     dtype=cache_dtype,
                 ),
@@ -457,17 +481,6 @@ def __init__(
         )
         # logging.info("DEBUG: weights_precision {}".format(weights_precision))
 
-        # zero collision TBE configurations
-        self.kv_zch_params = kv_zch_params
-        self.backend_type = backend_type
-        self.enable_optimizer_offloading: bool = False
-        if self.kv_zch_params:
-            self.kv_zch_params.validate()
-            self.enable_optimizer_offloading = (
-                # pyre-ignore [16]
-                self.kv_zch_params.enable_optimizer_offloading
-            )
-
         """
         ##################### for ZCH v.Next loading checkpoints Short Term Solution #######################
         weight_id tensor is the weight and optimizer keys, to load from checkpoint, weight_id tensor
@@ -510,7 +523,7 @@ def __init__(
                 f"Logging SSD offloading setup, tbe_unique_id:{tbe_unique_id}, l2_cache_size:{l2_cache_size}GB, enable_async_update:{enable_async_update}"
                 f"passed_in_path={ssd_directory}, num_shards={ssd_rocksdb_shards},num_threads={ssd_rocksdb_shards},"
                 f"memtable_flush_period={ssd_memtable_flush_period},memtable_flush_offset={ssd_memtable_flush_offset},"
-                f"l0_files_per_compact={ssd_l0_files_per_compact},max_D={self.max_D},rate_limit_mbps={ssd_rate_limit_mbps},"
+                f"l0_files_per_compact={ssd_l0_files_per_compact},max_D={self.max_D},cache_row_dim={self.cache_row_dim},rate_limit_mbps={ssd_rate_limit_mbps},"
                 f"size_ratio={ssd_size_ratio},compaction_trigger={ssd_compaction_trigger}, lazy_bulk_init_enabled={lazy_bulk_init_enabled},"
                 f"write_buffer_size_per_tbe={ssd_rocksdb_write_buffer_size},max_write_buffer_num_per_db_shard={ssd_max_write_buffer_num},"
                 f"uniform_init_lower={ssd_uniform_init_lower},uniform_init_upper={ssd_uniform_init_upper},"
@@ -526,7 +539,7 @@ def __init__(
                 ssd_memtable_flush_period,
                 ssd_memtable_flush_offset,
                 ssd_l0_files_per_compact,
-                self.max_D,
+                self.cache_row_dim,
                 ssd_rate_limit_mbps,
                 ssd_size_ratio,
                 ssd_compaction_trigger,
@@ -567,7 +580,7 @@ def __init__(
                 ps_client_thread_num if ps_client_thread_num is not None else 32,
                 ps_max_key_per_request if ps_max_key_per_request is not None else 500,
                 l2_cache_size,
-                self.max_D,
+                self.cache_row_dim,
             )
         else:
             raise AssertionError(f"Invalid backend type {self.backend_type}")
@@ -707,11 +720,6 @@ def __init__(
                 self._update_cache_counter_and_pointers
             )
 
-        assert optimizer in (
-            OptimType.EXACT_ROWWISE_ADAGRAD,
-        ), f"Optimizer {optimizer} is not supported by SSDTableBatchedEmbeddingBags"
-        self.optimizer = optimizer
-
         # stats reporter
         self.gather_ssd_cache_stats = gather_ssd_cache_stats
         self.stats_reporter: Optional[TBEStatsReporter] = (
@@ -798,6 +806,22 @@ def __init__(
 
         self.bounds_check_version: int = get_bounds_check_version_for_platform()
 
+    @cached_property
+    def cache_row_dim(self) -> int:
+        """
+        Compute the effective physical cache row size taking into account
+        padding to the nearest 4 elements and the optimizer state appended to
+        the back of the row
+        """
+        if self.enable_optimizer_offloading:
+            return self.max_D + pad4(
+                # Compute the number of elements of cache_dtype needed to store the
+                # optimizer state
+                self.optimizer.state_size_dim(self.weights_precision.as_dtype())
+            )
+        else:
+            return self.max_D
+
     @property
     # pyre-ignore
     def ssd_db(self):
@@ -854,7 +878,7 @@ def _insert_all_kv(self) -> None:
         row_offset = 0
         row_count = floor(
             self.bulk_init_chunk_size
-            / (self.max_D * self.weights_precision.as_dtype().itemsize)
+            / (self.cache_row_dim * self.weights_precision.as_dtype().itemsize)
         )
         total_dim0 = 0
         for dim0, _ in self.embedding_specs:
@@ -863,7 +887,7 @@ def _insert_all_kv(self) -> None:
             start_ts = time.time()
            chunk_tensor = torch.empty(
                 row_count,
-                self.max_D,
+                self.cache_row_dim,
                 dtype=self.weights_precision.as_dtype(),
                 device="cuda",
             )
@@ -1397,7 +1421,7 @@ def _prefetch(  # noqa C901
 
         # Allocation a scratch pad for the current iteration. The scratch
         # pad is a UVA tensor
-        inserted_rows_shape = (assigned_cache_slots.numel(), self.max_D)
+        inserted_rows_shape = (assigned_cache_slots.numel(), self.cache_row_dim)
         if linear_cache_indices.numel() > 0:
             inserted_rows = torch.ops.fbgemm.new_unified_tensor(
                 torch.zeros(
@@ -2093,7 +2117,7 @@ def flush(self, force: bool = False) -> None:
         active_slots_mask = self.lxu_cache_state != -1
 
         active_weights_gpu = self.lxu_cache_weights[active_slots_mask.view(-1)].view(
-            -1, self.max_D
+            -1, self.cache_row_dim
         )
         active_ids_gpu = self.lxu_cache_state.view(-1)[active_slots_mask.view(-1)]
 
@@ -2195,7 +2219,7 @@ def _report_ssd_l1_cache_stats(self) -> None:
                 data_bytes=int(
                     ssd_cache_stats_delta[stat_index.value]
                     * element_size
-                    * self.max_D
+                    * self.cache_row_dim
                     / passed_steps
                 ),
             )
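Putting the two helpers together: cache_row_dim widens each physical cache row by the optimizer state, converted to cache-dtype elements and padded to a multiple of 4, and only when offloading is enabled. A standalone sketch of the arithmetic (mirroring the cached property above; the 4-byte state assumes EXACT_ROWWISE_ADAGRAD):

    import math

    def pad4(value: int) -> int:
        return (value + 3) & ~3

    def cache_row_dim(max_d: int, state_bytes: int, cache_dtype_bytes: int,
                      enable_optimizer_offloading: bool) -> int:
        # Effective physical row width: embedding elements plus the optimizer
        # state converted to cache-dtype elements and padded to a multiple of 4.
        if not enable_optimizer_offloading:
            return max_d
        state_elems = int(math.ceil(state_bytes / cache_dtype_bytes))
        return max_d + pad4(state_elems)

    # 128-dim table, FP16 cache, one fp32 momentum per row:
    assert cache_row_dim(128, 4, 2, enable_optimizer_offloading=False) == 128
    assert cache_row_dim(128, 4, 2, enable_optimizer_offloading=True) == 132  # 128 + pad4(2)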

fbgemm_gpu/test/tbe/ssd/ssd_l2_cache_test.py
Lines changed: 10 additions & 16 deletions

@@ -57,7 +57,7 @@ def generate_fbgemm_ssd_tbe(
         mixed: bool,
         enable_l2: bool = True,
         ssd_rocksdb_shards: int = 1,
-    ) -> Tuple[SSDTableBatchedEmbeddingBags, List[int], List[int], int]:
+    ) -> Tuple[SSDTableBatchedEmbeddingBags, List[int], List[int]]:
         E = int(10**log_E)
         D = D * 4
         if not mixed:
@@ -84,7 +84,7 @@ def generate_fbgemm_ssd_tbe(
             l2_cache_size=1 if enable_l2 else 0,
             ssd_rocksdb_shards=ssd_rocksdb_shards,
         )
-        return emb, Es, Ds, max(Ds)
+        return emb, Es, Ds
 
     @given(**default_st, do_flush=st.sampled_from([True, False]))
     @settings(**default_settings)
@@ -97,12 +97,10 @@ def test_l2_flush(
         weights_precision: SparseType,
         do_flush: bool,
     ) -> None:
-        emb, Es, Ds, max_D = self.generate_fbgemm_ssd_tbe(
-            T, D, log_E, weights_precision, mixed
-        )
+        emb, Es, _ = self.generate_fbgemm_ssd_tbe(T, D, log_E, weights_precision, mixed)
         indices = torch.arange(start=0, end=sum(Es))
         weights = torch.randn(
-            indices.numel(), max_D, dtype=weights_precision.as_dtype()
+            indices.numel(), emb.cache_row_dim, dtype=weights_precision.as_dtype()
         )
         weights_from_l2 = torch.empty_like(weights)
         count = torch.as_tensor([indices.numel()])
@@ -134,7 +132,7 @@ def test_l2_io(
         weights_precision: SparseType,
         enable_l2: bool,
     ) -> None:
-        emb, _, _, max_D = self.generate_fbgemm_ssd_tbe(
+        emb, _, _ = self.generate_fbgemm_ssd_tbe(
             T, D, log_E, weights_precision, mixed, enable_l2
         )
         E = int(10**log_E)
@@ -146,7 +144,7 @@ def test_l2_io(
             np.random.choice(E, replace=False, size=(N,)), dtype=torch.int64
         )
         weights = torch.randn(
-            indices.numel(), max_D, dtype=weights_precision.as_dtype()
+            indices.numel(), emb.cache_row_dim, dtype=weights_precision.as_dtype()
         )
         sub_N = N // num_rounds
 
@@ -191,15 +189,13 @@ def test_l2_prefetch_compatibility(
         mixed: bool,
         weights_precision: SparseType,
     ) -> None:
-        emb, _, _, max_D = self.generate_fbgemm_ssd_tbe(
-            T, D, log_E, weights_precision, mixed
-        )
+        emb, _, _ = self.generate_fbgemm_ssd_tbe(T, D, log_E, weights_precision, mixed)
         E = int(10**log_E)
         N = E
         indices = torch.as_tensor(
             np.random.choice(E, replace=False, size=(N,)), dtype=torch.int64
         )
-        weights = torch.randn(N, max_D, dtype=weights_precision.as_dtype())
+        weights = torch.randn(N, emb.cache_row_dim, dtype=weights_precision.as_dtype())
         new_weights = weights + 1
         weights_out = torch.empty_like(weights)
         count = torch.as_tensor([E])
@@ -242,9 +238,7 @@ def test_l2_multiple_flush_at_same_train_iter(
         mixed: bool,
         weights_precision: SparseType,
     ) -> None:
-        emb, _, _, _ = self.generate_fbgemm_ssd_tbe(
-            T, D, log_E, weights_precision, mixed
-        )
+        emb, _, _ = self.generate_fbgemm_ssd_tbe(T, D, log_E, weights_precision, mixed)
 
         with patch.object(torch.cuda, "synchronize") as mock_calls:
             mock_calls.side_effect = None
@@ -269,7 +263,7 @@ def test_rocksdb_get_discrete_ids(
         mixed: bool,
         weights_precision: SparseType,
     ) -> None:
-        emb, Es, Ds, max_D = self.generate_fbgemm_ssd_tbe(
+        emb, Es, _ = self.generate_fbgemm_ssd_tbe(
             T, D, log_E, weights_precision, mixed, False, 8
         )
         E = int(10**log_E)
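The test updates above show the caller-visible consequence: any host buffer exchanged with emb.ssd_db must now be sized by emb.cache_row_dim rather than the embedding dimension, since offloaded rows carry extra trailing columns. A hedged sketch of that pattern (assuming an already-constructed emb and an indices tensor of row ids):

    import torch

    # `emb` is an SSDTableBatchedEmbeddingBags instance; `indices` holds row ids.
    weights_out = torch.empty(
        indices.numel(),
        emb.cache_row_dim,                       # embedding dim + padded optimizer state
        dtype=emb.weights_precision.as_dtype(),
    )
    count = torch.as_tensor([indices.numel()])
    emb.ssd_db.get_cuda(indices, weights_out, count)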

fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py
Lines changed: 4 additions & 2 deletions

@@ -109,8 +109,6 @@ def test_ssd(self, indice_int64_t: bool, weights_precision: SparseType) -> None:
         indices = torch.as_tensor(
             np.random.choice(E, replace=False, size=(N,)), dtype=torch.int32
         )
-        weights = torch.randn(N, D, dtype=weights_precision.as_dtype())
-        output_weights = torch.empty_like(weights)
         count = torch.tensor([N])
 
         feature_table_map = list(range(1))
@@ -124,6 +122,10 @@ def test_ssd(self, indice_int64_t: bool, weights_precision: SparseType) -> None:
             weights_precision=weights_precision,
             l2_cache_size=8,
         )
+
+        weights = torch.randn(N, emb.cache_row_dim, dtype=weights_precision.as_dtype())
+        output_weights = torch.empty_like(weights)
+
         emb.ssd_db.get_cuda(indices, output_weights, count)
         torch.cuda.synchronize()
         assert (output_weights <= 0.1).all().item()
