Support DI sharding for FPE_EBC #2968

Open · wants to merge 1 commit into main

27 changes: 23 additions & 4 deletions torchrec/distributed/quant_embeddingbag.py
@@ -430,7 +430,7 @@ def __init__(
self,
module: EmbeddingBagCollectionInterface,
table_name_to_parameter_sharding: Dict[str, ParameterSharding],
env: ShardingEnv,
env: Union[ShardingEnv, Dict[str, ShardingEnv]], # support for hybrid sharding
fused_params: Optional[Dict[str, Any]] = None,
device: Optional[torch.device] = None,
feature_processor: Optional[FeatureProcessorsCollection] = None,
@@ -462,11 +462,30 @@ def __init__(
f"Feature processor has inconsistent devices. Expected {feature_processor_device}, got {param.device}"
)

if isinstance(env, Dict):
expected_device_type = "cuda"
for (
embedding_configs
) in self._sharding_type_device_group_to_sharding_infos.values():
# throws if not all shards only have the expected device type
shard_device_type = get_device_from_sharding_infos(embedding_configs)
if isinstance(shard_device_type, tuple):
raise RuntimeError(
f"Sharding across multiple device types for FeatureProcessedEmbeddingBagCollection is not supported yet, got {shard_device_type}",
)
assert (
shard_device_type == expected_device_type
), f"Expected {expected_device_type} but got {shard_device_type} for FeatureProcessedEmbeddingBagCollection sharding device type"

Review comment (Member): Since we assert here and expected_device_type is always cuda, why don't we keep the method's env signature as a single ShardingEnv and just pass the cuda sharding env to it?

# TODO(hcxu): support hybrid sharding with feature_processors_per_rank: ModuleList(ModuleList()), if compatible
world_size = env[expected_device_type].world_size
else:
world_size = env.world_size

if feature_processor_device is None:
for _ in range(env.world_size):
self.feature_processors_per_rank.append(feature_processor)
self.feature_processors_per_rank += [feature_processor] * world_size
else:
for i in range(env.world_size):
for i in range(world_size):
# Generic copy, for example initialized on cpu but -> sharding as meta
self.feature_processors_per_rank.append(
copy.deepcopy(feature_processor)
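To make the new env handling concrete, below is a minimal, self-contained sketch (not the PR's actual code) of how world_size is resolved from either a single ShardingEnv or a per-device-type dict, and how the per-rank feature processor list is then built. SimpleShardingEnv and the helper names are stand-ins introduced here for illustration; the device-type assertions from the diff are only noted in comments.

import copy
from dataclasses import dataclass
from typing import Dict, List, Union


@dataclass
class SimpleShardingEnv:
    # Stand-in for torchrec's ShardingEnv; only world_size is needed here.
    world_size: int


def resolve_world_size(
    env: Union[SimpleShardingEnv, Dict[str, SimpleShardingEnv]],
    expected_device_type: str = "cuda",
) -> int:
    # Hybrid (per-device-type) sharding passes a dict; only the cuda env is
    # consulted, mirroring the assertion in the diff above.
    if isinstance(env, dict):
        return env[expected_device_type].world_size
    return env.world_size


def build_feature_processors_per_rank(
    feature_processor, world_size: int, device_known: bool
) -> List:
    if not device_known:
        # Shared instance per rank, as in `+= [feature_processor] * world_size`.
        return [feature_processor] * world_size
    # Generic per-rank deep copy, e.g. a processor initialized on cpu that is
    # later sharded as meta.
    return [copy.deepcopy(feature_processor) for _ in range(world_size)]


# Both call forms the new signature is meant to accept.
print(resolve_world_size(SimpleShardingEnv(world_size=4)))            # 4
print(resolve_world_size({"cuda": SimpleShardingEnv(world_size=8)}))  # 8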
22 changes: 17 additions & 5 deletions torchrec/distributed/quant_state.py
@@ -419,6 +419,16 @@ def get_bucket_offsets_per_virtual_table(
}


def get_param_id_from_type(is_sqebc: bool, is_sqmcec: bool, is_sfpebc: bool) -> str:
if is_sqebc:
return "embedding_bags"
elif is_sqmcec:
return "_embedding_module.embeddings"
elif is_sfpebc:
return "_embedding_bag_collection.embedding_bags"
return "embeddings"


def sharded_tbes_weights_spec(
sharded_model: torch.nn.Module,
virtual_table_name_to_bucket_lengths: Optional[Dict[str, list[int]]] = None,
@@ -454,11 +464,14 @@ def sharded_tbes_weights_spec(
is_sqebc: bool = "ShardedQuantEmbeddingBagCollection" in type_name
is_sqec: bool = "ShardedQuantEmbeddingCollection" in type_name
is_sqmcec: bool = "ShardedQuantManagedCollisionEmbeddingCollection" in type_name
is_sfpebc: bool = (
"ShardedQuantFeatureProcessedEmbeddingBagCollection" in type_name
)

if is_sqebc or is_sqec or is_sqmcec:
if is_sqebc or is_sqec or is_sqmcec or is_sfpebc:
assert (
is_sqec + is_sqebc + is_sqmcec == 1
), "Cannot have any two of ShardedQuantEmbeddingBagCollection, ShardedQuantEmbeddingCollection and ShardedQuantManagedCollisionEmbeddingCollection are true"
is_sqec + is_sqebc + is_sqmcec + is_sfpebc == 1
), "Cannot have any two of ShardedQuantEmbeddingBagCollection, ShardedQuantEmbeddingCollection, ShardedQuantManagedCollisionEmbeddingCollection and ShardedQuantFeatureProcessedEmbeddingBagCollection are true"
tbes_configs: Dict[
IntNBitTableBatchedEmbeddingBagsCodegen, GroupedEmbeddingConfig
] = module.tbes_configs()
@@ -550,8 +563,7 @@ def sharded_tbes_weights_spec(
row_offsets,
table_metadata.shard_offsets[1],
]
s: str = "embedding_bags" if is_sqebc else "embeddings"
s = ("_embedding_module." if is_sqmcec else "") + s
s: str = get_param_id_from_type(is_sqebc, is_sqmcec, is_sfpebc)
unsharded_fqn_weight_prefix: str = f"{module_fqn}.{s}.{table_name}"
unsharded_fqn_weight: str = unsharded_fqn_weight_prefix + ".weight"

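For reference, a small usage sketch (illustrative values only, not from the PR) of the new helper: the prefix returned by get_param_id_from_type is joined with the module FQN and table name to form the unsharded weight FQN, mirroring the tail of sharded_tbes_weights_spec above. The module FQN and table name below are hypothetical.

def get_param_id_from_type(is_sqebc: bool, is_sqmcec: bool, is_sfpebc: bool) -> str:
    # Same mapping as the helper added in quant_state.py above.
    if is_sqebc:
        return "embedding_bags"
    elif is_sqmcec:
        return "_embedding_module.embeddings"
    elif is_sfpebc:
        return "_embedding_bag_collection.embedding_bags"
    return "embeddings"


# Hypothetical module FQN and table name, for illustration only.
module_fqn = "sparse.fp_ebc"
table_name = "table_0"
s = get_param_id_from_type(is_sqebc=False, is_sqmcec=False, is_sfpebc=True)
unsharded_fqn_weight = f"{module_fqn}.{s}.{table_name}.weight"
print(unsharded_fqn_weight)
# -> sparse.fp_ebc._embedding_bag_collection.embedding_bags.table_0.weight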