Commit 7f793b8

emlin authored and facebook-github-bot committed
add auto feature score collection to EC (pytorch#5030)
Summary:
X-link: meta-pytorch/torchrec#3474
X-link: facebookresearch/FBGEMM#2043

Enable automatic feature score collection in ShardedEmbeddingCollection based on a static feature-to-score mapping. If a user needs a custom score for specific ids, they can disable auto collection and instead change the model code explicitly to collect a score for each id.

Here is a sample eviction policy config in the embedding_table config that enables auto score collection:

virtual_table_eviction_policy=FeatureScoreBasedEvictionPolicy(
    training_id_eviction_trigger_count=260_000_000,  # 260M
    training_id_keep_count=160_000_000,  # 160M
    enable_auto_feature_score_collection=True,
    feature_score_mapping={
        "sparse_public_original_content_creator": 1.0,
    },
    feature_score_default_value=0.5,
),

Additionally, the counter previously collected during EC dedup is not used by the kvzch backend, so this diff removes that counter and lets the KJT transfer a single float32 weight tensor to the backend. This also makes feature score collection possible for EBC, which may already carry another float weight for pooling.

Reviewed By: EddyLXJ

Differential Revision: D83945722
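For illustration, a minimal sketch of how this policy could be attached to a table config. Only the FeatureScoreBasedEvictionPolicy arguments come from the summary above; the import path, the EmbeddingConfig wrapper, and the other field values are assumptions, not part of this commit.

# Sketch only: the import path, the EmbeddingConfig fields, and the table sizes
# are assumed for illustration; the eviction policy arguments mirror the summary.
from torchrec.modules.embedding_configs import (  # assumed import path
    EmbeddingConfig,
    FeatureScoreBasedEvictionPolicy,
)

table_config = EmbeddingConfig(
    name="creator_table",          # hypothetical table name
    embedding_dim=128,             # hypothetical
    num_embeddings=1_000_000_000,  # hypothetical
    feature_names=["sparse_public_original_content_creator"],
    virtual_table_eviction_policy=FeatureScoreBasedEvictionPolicy(
        training_id_eviction_trigger_count=260_000_000,  # 260M
        training_id_keep_count=160_000_000,              # 160M
        enable_auto_feature_score_collection=True,
        feature_score_mapping={
            "sparse_public_original_content_creator": 1.0,
        },
        feature_score_default_value=0.5,
    ),
)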
1 parent 61b22f5 commit 7f793b8

File tree

2 files changed, 2 additions(+), 4 deletions(-)

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py (1 addition, 1 deletion)

@@ -2088,7 +2088,7 @@ def _prefetch(  # noqa C901
                 torch.tensor(
                     [weights.shape[0]], device="cpu", dtype=torch.long
                 ),
-                weights.cpu().view(torch.float32).view(-1, 2),
+                weights.cpu(),
             )

         # Generate row addresses (pointing to either L1 or the current
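The removed line reinterpreted each id's weight payload as two packed float32 values (feature score plus the now-removed dedup counter); after this change the KJT weight tensor carries a single float32 score per id and is passed through unchanged. A standalone sketch of the two layouts (tensor contents are hypothetical):

import torch

# Old layout (illustrative): two float32 values per id, [feature_score, counter],
# reinterpreted before the backend call via .view(torch.float32).view(-1, 2).
old_weights = torch.tensor([0.5, 1.0, 1.0, 3.0, 0.5, 2.0], dtype=torch.float32)
score_and_counter = old_weights.view(torch.float32).view(-1, 2)  # shape (3, 2)

# New layout: the counter is dropped, so the buffer is one float32 score per id
# and is handed to the backend as-is (weights.cpu() in the diff above).
new_weights = torch.tensor([0.5, 1.0, 0.5], dtype=torch.float32)  # shape (3,)

print(score_and_counter[:, 0])  # feature scores under the old layout
print(new_weights)              # feature scores under the new layout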

fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h (1 addition, 3 deletions)

@@ -768,7 +768,6 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
   CHECK_EQ(indices.size(0), engege_rates.size(0));
   auto indices_data_ptr = indices.data_ptr<index_t>();
   auto engage_rate_ptr = engege_rates.data_ptr<float>();
-  int64_t stride = 2;
   {
     auto before_write_lock_ts =
         facebook::WallClockUtil::NowInUsecFast();
@@ -783,8 +782,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
          index_iter++) {
       const auto& id_index = *index_iter;
       auto id = int64_t(indices_data_ptr[id_index]);
-      float engege_rate =
-          float(engage_rate_ptr[id_index * stride + 0]);
+      float engege_rate = float(engage_rate_ptr[id_index]);
       // use mempool
       weight_type* block = nullptr;
       auto before_lookup_cache_ts =
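On the DRAM KV cache side, the same layout change removes the stride-2 read: the engage rate for the i-th id now sits directly at index i of a flat float32 tensor. A small Python analogue of the indexing change (tensor contents are hypothetical):

import torch

engage_rates = torch.tensor([0.5, 1.0, 0.5], dtype=torch.float32)  # one score per id

# Before: scores were interleaved with counters, so id i was read at offset 2 * i.
# After: the tensor is flat, so id i is read directly at index i.
for id_index in range(engage_rates.numel()):
    engage_rate = float(engage_rates[id_index])
    # ... look up or insert the cache block for this id using engage_rate ...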
