
Commit 0877643

aliafzal authored and facebook-github-bot committed

Test commit (#3084)

Summary: Pull Request resolved: #3084
Rollback Plan:
Differential Revision: D76457454
1 parent c8495ec commit 0877643

File tree

2 files changed: +264, -7 lines


torchrec/distributed/model_tracker/model_delta_tracker.py

Lines changed: 71 additions & 7 deletions
@@ -6,7 +6,9 @@
 # LICENSE file in the root directory of this source tree.
 
 # pyre-strict
-from typing import Dict, List, Optional, Union
+import logging as logger
+from collections import Counter, OrderedDict
+from typing import Dict, Iterable, List, Optional, Union
 
 import torch
 
@@ -49,6 +51,8 @@ class ModelDeltaTracker:
             call.
         delete_on_read (bool, optional): whether to delete the tracked ids after all consumers have read them.
         mode (TrackingMode, optional): tracking mode to use from supported tracking modes. Default: TrackingMode.ID_ONLY.
+        fqns_to_skip (Iterable[str], optional): iterable of FQNs to skip tracking. Default: ().
+
     """
 
     DEFAULT_CONSUMER: str = "default"
@@ -59,11 +63,15 @@ def __init__(
         consumers: Optional[List[str]] = None,
         delete_on_read: bool = True,
         mode: TrackingMode = TrackingMode.ID_ONLY,
+        fqns_to_skip: Iterable[str] = (),
     ) -> None:
         self._model = model
         self._consumers: List[str] = consumers or [self.DEFAULT_CONSUMER]
         self._delete_on_read = delete_on_read
         self._mode = mode
+        self._fqn_to_feature_map: Dict[str, List[str]] = {}
+        self._fqns_to_skip: Iterable[str] = fqns_to_skip
+        self.fqn_to_feature_names()
         pass
 
     def record_lookup(self, kjt: KeyedJaggedTensor, states: torch.Tensor) -> None:
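To make the new argument concrete, here is a minimal construction sketch based only on the signature added above. The variable my_model and the consumer name are placeholders, and the import path is taken from the file path shown for this diff:

from torchrec.distributed.model_tracker.model_delta_tracker import ModelDeltaTracker

# `my_model` stands in for whatever nn.Module the tracker should watch.
tracker = ModelDeltaTracker(
    model=my_model,
    consumers=["downstream_publisher"],  # defaults to ["default"] if omitted
    delete_on_read=True,
    fqns_to_skip=["sparse_arch"],  # modules whose FQN contains this segment are not tracked
)

# fqn_to_feature_names() is now invoked once in __init__ and its result is cached in
# self._fqn_to_feature_map, so later calls return the cached mapping.
feature_map = tracker.fqn_to_feature_names()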
@@ -85,14 +93,70 @@ def get_delta(self, consumer: Optional[str] = None) -> Dict[str, DeltaRows]:
         """
         return {}
 
-    def fqn_to_feature_names(self, module: nn.Module) -> Dict[str, List[str]]:
+    def fqn_to_feature_names(self) -> Dict[str, List[str]]:
         """
-        Returns a mapping from FQN to feature names for a given module.
-
-        Args:
-            module (nn.Module): the module to retrieve feature names for.
+        Returns a mapping of FQN to feature names from all supported modules (EmbeddingCollection and EmbeddingBagCollection) present in the given model.
         """
-        return {}
+        if (self._fqn_to_feature_map is not None) and len(self._fqn_to_feature_map) > 0:
+            return self._fqn_to_feature_map
+
+        table_to_feature_names: Dict[str, List[str]] = OrderedDict()
+        table_to_fqn: Dict[str, str] = OrderedDict()
+        for fqn, named_module in self._model.named_modules():
+            split_fqn = fqn.split(".")
+            # Skipping partial FQNs present in fqns_to_skip
+            # TODO: Validate if we need to support more complex patterns for skipping fqns
+            should_skip = False
+            for fqn_to_skip in self._fqns_to_skip:
+                if fqn_to_skip in split_fqn:
+                    logger.info(f"Skipping {fqn} because it is part of fqns_to_skip")
+                    should_skip = True
+                    break
+            if should_skip:
+                continue
+
+            # Using FQNs of the embedding and mapping them to features as state_dict() API uses these to key states.
+            if isinstance(named_module, SUPPORTED_MODULES):
+                for table_name, config in named_module._table_name_to_config.items():
+                    logger.info(
+                        f"Found {table_name} for {fqn} with features {config.feature_names}"
+                    )
+                    table_to_feature_names[table_name] = config.feature_names
+            for table_name in table_to_feature_names:
+                # Using the split FQN to get the exact table name matching. Otherwise, checking "table_name in fqn"
+                # will incorrectly match fqn with all the table names that have the same prefix
+                if table_name in split_fqn:
+                    embedding_fqn = fqn.replace("_dmp_wrapped_module.module.", "")
+                    if table_name in table_to_fqn:
+                        # Sanity check for validating that we don't have more than one table mapping to the same fqn.
+                        logger.warning(
+                            f"Override {table_to_fqn[table_name]} with {embedding_fqn} for entry {table_name}"
+                        )
+                    table_to_fqn[table_name] = embedding_fqn
+        logger.info(f"Table to fqn: {table_to_fqn}")
+        flatten_names = [
+            name for names in table_to_feature_names.values() for name in names
+        ]
+        # TODO: Validate if there is a better way to handle duplicate feature names.
+        # Logging a warning if duplicate feature names are found across tables, but continue execution as this could be a valid case.
+        if len(set(flatten_names)) != len(flatten_names):
+            counts = Counter(flatten_names)
+            duplicates = [item for item, count in counts.items() if count > 1]
+            logger.warning(f"duplicate feature names found: {duplicates}")
+
+        fqn_to_feature_names: Dict[str, List[str]] = OrderedDict()
+        for table_name in table_to_feature_names:
+            if table_name not in table_to_fqn:
+                # This is likely unexpected, where we can't locate the FQN associated with this table.
+                logger.warning(
+                    f"Table {table_name} not found in {table_to_fqn}, skipping"
+                )
+                continue
+            fqn_to_feature_names[table_to_fqn[table_name]] = table_to_feature_names[
+                table_name
+            ]
+        self._fqn_to_feature_map = fqn_to_feature_names
+        return fqn_to_feature_names
 
     def clear(self, consumer: Optional[str] = None) -> None:
         """
Lines changed: 193 additions & 0 deletions
@@ -0,0 +1,193 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

from dataclasses import dataclass
from typing import cast, Dict, Iterable, List, Optional, Union

import torch

from torch import nn
from torchrec.distributed.embedding_types import EmbeddingComputeKernel
from torchrec.distributed.planner import ParameterConstraints
from torchrec.distributed.types import ShardingType
from torchrec.modules.embedding_configs import EmbeddingBagConfig, EmbeddingConfig
from torchrec.modules.embedding_modules import (
    EmbeddingBagCollection,
    EmbeddingCollection,
)
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

@dataclass
class EmbeddingTableProps:
    """
    Properties of an embedding table.

    Args:
        embedding_table_config (Union[EmbeddingConfig, EmbeddingBagConfig]): config of the embedding table
        sharding (ShardingType): sharding type of the table
        is_weighted (bool): whether the table takes weighted inputs
    """

    embedding_table_config: Union[EmbeddingConfig, EmbeddingBagConfig]
    sharding: ShardingType
    is_weighted: bool = False

class TestECModel(nn.Module):
    """
    Test model with EmbeddingCollection and Linear layers.

    Args:
        tables (List[EmbeddingConfig]): list of embedding tables
        device (Optional[torch.device]): device on which buffers will be initialized

    Example:
        TestECModel(tables=[EmbeddingConfig(...)])
    """

    def __init__(
        self, tables: List[EmbeddingConfig], device: Optional[torch.device] = None
    ) -> None:
        super().__init__()
        self.ec: EmbeddingCollection = EmbeddingCollection(
            tables=tables,
            device=device if device else torch.device("meta"),
        )

        embedding_dim = tables[0].embedding_dim

        self.seq: nn.Sequential = nn.Sequential(
            *[nn.Linear(embedding_dim, embedding_dim) for _ in range(3)]
        )

    def forward(self, features: KeyedJaggedTensor) -> torch.Tensor:
        """
        Forward pass of the TestECModel.

        Args:
            features (KeyedJaggedTensor): Input features for the model.

        Returns:
            torch.Tensor: Output tensor after processing through the model.
        """

        lookup_result = self.ec(features)
        return self.seq(torch.cat([jt.values() for _, jt in lookup_result.items()]))

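As a usage sketch (table and feature names invented for illustration, and assuming the definitions above are in scope), the following runs on CPU end to end: EmbeddingCollection returns one JaggedTensor per feature, and its values feed the Linear stack.

import torch
from torchrec.modules.embedding_configs import EmbeddingConfig
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

tables = [
    EmbeddingConfig(
        name="user_table",
        embedding_dim=8,
        num_embeddings=100,
        feature_names=["user_id"],
    )
]
model = TestECModel(tables=tables, device=torch.device("cpu"))

# Two samples for feature "user_id": ids [3, 7] and [11].
kjt = KeyedJaggedTensor.from_lengths_sync(
    keys=["user_id"],
    values=torch.tensor([3, 7, 11]),
    lengths=torch.tensor([2, 1]),
)
out = model(kjt)  # one 8-dim row per looked-up id -> shape [3, 8]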
class TestEBCModel(nn.Module):
    """
    Test model with EmbeddingBagCollection and Linear layers.

    Args:
        tables (List[EmbeddingBagConfig]): list of embedding tables
        device (Optional[torch.device]): device on which buffers will be initialized

    Example:
        TestEBCModel(tables=[EmbeddingBagConfig(...)])
    """

    def __init__(
        self, tables: List[EmbeddingBagConfig], device: Optional[torch.device] = None
    ) -> None:
        super().__init__()
        self.ebc: EmbeddingBagCollection = EmbeddingBagCollection(
            tables=tables,
            device=device if device else torch.device("meta"),
        )

        embedding_dim = tables[0].embedding_dim
        self.seq: nn.Sequential = nn.Sequential(
            *[nn.Linear(embedding_dim, embedding_dim) for _ in range(3)]
        )

    def forward(self, features: KeyedJaggedTensor) -> torch.Tensor:
        """
        Forward pass of the TestEBCModel.

        Args:
            features (KeyedJaggedTensor): Input features for the model.

        Returns:
            torch.Tensor: Output tensor after processing through the model.
        """

        lookup_result = self.ebc(features).to_dict()
        return self.seq(torch.cat(tuple(lookup_result.values())))

def create_ec_model(
    tables: Iterable[EmbeddingTableProps],
    device: Optional[torch.device] = None,
) -> nn.Module:
    """
    Create an EmbeddingCollection model with the given tables.

    Args:
        tables (Iterable[EmbeddingTableProps]): embedding tables
        device (Optional[torch.device]): device on which buffers will be initialized

    Returns:
        nn.Module: EmbeddingCollection model
    """
    return TestECModel(
        tables=[
            cast(EmbeddingConfig, table.embedding_table_config) for table in tables
        ],
        device=device,
    )

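A short sketch of wiring EmbeddingTableProps into create_ec_model; the names are illustrative and ShardingType.TABLE_WISE is just one valid choice:

import torch
from torchrec.distributed.types import ShardingType
from torchrec.modules.embedding_configs import EmbeddingConfig

table_props = [
    EmbeddingTableProps(
        embedding_table_config=EmbeddingConfig(
            name="user_table",
            embedding_dim=8,
            num_embeddings=100,
            feature_names=["user_id"],
        ),
        sharding=ShardingType.TABLE_WISE,
    )
]
# Builds a TestECModel whose EmbeddingCollection holds the configs above.
ec_model = create_ec_model(table_props, device=torch.device("cpu"))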
def create_ebc_model(
    tables: Iterable[EmbeddingTableProps],
    device: Optional[torch.device] = None,
) -> nn.Module:
    """
    Create an EmbeddingBagCollection model with the given tables.

    Args:
        tables (Iterable[EmbeddingTableProps]): embedding tables
        device (Optional[torch.device]): device on which buffers will be initialized

    Returns:
        nn.Module: EmbeddingBagCollection model
    """
    return TestEBCModel(
        tables=[
            cast(EmbeddingBagConfig, table.embedding_table_config) for table in tables
        ],
        device=device,
    )

def generate_planner_constraints(
    tables: Iterable[EmbeddingTableProps],
) -> Dict[str, ParameterConstraints]:
    """
    Generate planner constraints for the given tables.

    Args:
        tables (Iterable[EmbeddingTableProps]): embedding tables

    Returns:
        Dict[str, ParameterConstraints]: planner constraints keyed by table name
    """
    constraints: Dict[str, ParameterConstraints] = {}
    for table in tables:
        sharding_types = [table.sharding.value]
        constraints[table.embedding_table_config.name] = ParameterConstraints(
            sharding_types=sharding_types,
            compute_kernels=[EmbeddingComputeKernel.FUSED.value],
            feature_names=table.embedding_table_config.feature_names,
            pooling_factors=[1.0],
        )
    return constraints
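A sketch of generate_planner_constraints on the same kind of table list (names invented, definitions above assumed in scope); the printed values follow from ShardingType.ROW_WISE.value and EmbeddingComputeKernel.FUSED.value:

from torchrec.distributed.types import ShardingType
from torchrec.modules.embedding_configs import EmbeddingBagConfig

table_props = [
    EmbeddingTableProps(
        embedding_table_config=EmbeddingBagConfig(
            name="ad_table",
            embedding_dim=16,
            num_embeddings=1000,
            feature_names=["ad_id"],
        ),
        sharding=ShardingType.ROW_WISE,
    )
]
constraints = generate_planner_constraints(table_props)
print(constraints["ad_table"].sharding_types)   # ["row_wise"]
print(constraints["ad_table"].compute_kernels)  # ["fused"]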
