
Commit 061bcad

EddyLXJ authored and facebook-github-bot committed
Skip load metadata tensor
Summary: X-link: facebookresearch/FBGEMM#1856 The metadata tensor is newly added for the kvzch table, and some old checkpoints may not have this fqn, so directly loading an old checkpoint can cause an fqn-missing error. This diff skips initializing the metadata tensor in the load-checkpoint function. The metadata tensor is not used in training, so it is okay to skip loading it; it will be created when the checkpoint is saved. Differential Revision: D81811024
1 parent 1b1e2b3 commit 061bcad
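
For orientation, a minimal sketch of the guard pattern this commit applies (the function name build_optional_metadata and the make_sharded callback are illustrative, not the actual torchrec API): when an old checkpoint carries no metadata tensor, the sharded metadata list is simply left as None and recreated at the next checkpoint save.

    from typing import Callable, List, Optional

    import torch
    from torch.distributed._shard.sharded_tensor import ShardedTensor


    def build_optional_metadata(
        metadata_list: Optional[List[torch.Tensor]],
        make_sharded: Callable[[List[torch.Tensor]], List[ShardedTensor]],
    ) -> Optional[List[ShardedTensor]]:
        # Old checkpoints created before the kvzch metadata tensor existed
        # pass None here; skip building the sharded view instead of failing
        # on a missing fqn. The tensor is recreated at the next checkpoint save.
        if metadata_list is None:
            return None
        return make_sharded(metadata_list)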

3 files changed: +52 -36 lines changed

torchrec/distributed/batched_embedding_kernel.py

Lines changed: 38 additions & 22 deletions
@@ -1957,7 +1957,7 @@ def __init__(
                 List[ShardedTensor],
                 List[ShardedTensor],
                 List[ShardedTensor],
-                List[ShardedTensor],
+                Optional[List[ShardedTensor]],
             ]
         ] = None

@@ -2126,26 +2126,31 @@ def _init_sharded_split_embedding_weights(
             self._table_name_to_weight_count_per_rank,
             use_param_size_as_rows=True,
         )
-        metadata_sharded_t_list = create_virtual_sharded_tensors(
-            emb_table_config_copy,
-            metadata_list,  # pyre-ignore [6]
-            self._pg,
-            prefix,
-            self._table_name_to_weight_count_per_rank,
-        )
+        metadata_sharded_t_list = None
+        if metadata_list is not None:
+            metadata_sharded_t_list = create_virtual_sharded_tensors(
+                emb_table_config_copy,
+                metadata_list,
+                self._pg,
+                prefix,
+                self._table_name_to_weight_count_per_rank,
+            )

         assert (
             len(pmt_list)
             == len(weight_ids_list)  # pyre-ignore
             == len(bucket_cnt_list)  # pyre-ignore
-            == len(metadata_list)  # pyre-ignore
         )
         assert (
             len(pmt_sharded_t_list)
             == len(weight_id_sharded_t_list)
             == len(bucket_cnt_sharded_t_list)
-            == len(metadata_sharded_t_list)
         )
+        if metadata_list is not None:
+            assert metadata_sharded_t_list is not None
+            assert len(pmt_list) == len(metadata_list)
+            assert len(pmt_sharded_t_list) == len(metadata_sharded_t_list)
+
         self._split_weights_res = (
             pmt_sharded_t_list,
             weight_id_sharded_t_list,

@@ -2181,10 +2186,13 @@ def get_named_split_embedding_weights_snapshot(self, prefix: str = "") -> Iterat
         for table_idx, pmt_sharded_t in enumerate(pmt_sharded_t_list):
             table_config = self._config.embedding_tables[table_idx]
             key = append_prefix(prefix, f"{table_config.name}")
+            metadata_sharded_t = None
+            if metadata_sharded_t_list is not None:
+                metadata_sharded_t = metadata_sharded_t_list[table_idx]

             yield key, pmt_sharded_t, weight_id_sharded_t_list[
                 table_idx
-            ], bucket_cnt_sharded_t_list[table_idx], metadata_sharded_t_list[table_idx]
+            ], bucket_cnt_sharded_t_list[table_idx], metadata_sharded_t

     def flush(self) -> None:
         """

@@ -2849,7 +2857,7 @@ def __init__(
                 List[ShardedTensor],
                 List[ShardedTensor],
                 List[ShardedTensor],
-                List[ShardedTensor],
+                Optional[List[ShardedTensor]],
             ]
         ] = None

@@ -3018,26 +3026,31 @@ def _init_sharded_split_embedding_weights(
             self._table_name_to_weight_count_per_rank,
             use_param_size_as_rows=True,
         )
-        metadata_sharded_t_list = create_virtual_sharded_tensors(
-            emb_table_config_copy,
-            metadata_list,  # pyre-ignore [6]
-            self._pg,
-            prefix,
-            self._table_name_to_weight_count_per_rank,
-        )
+        metadata_sharded_t_list = None
+        if metadata_list is not None:
+            metadata_sharded_t_list = create_virtual_sharded_tensors(
+                emb_table_config_copy,
+                metadata_list,
+                self._pg,
+                prefix,
+                self._table_name_to_weight_count_per_rank,
+            )

         assert (
             len(pmt_list)
             == len(weight_ids_list)  # pyre-ignore
             == len(bucket_cnt_list)  # pyre-ignore
-            == len(metadata_list)  # pyre-ignore
         )
         assert (
             len(pmt_sharded_t_list)
             == len(weight_id_sharded_t_list)
             == len(bucket_cnt_sharded_t_list)
-            == len(metadata_sharded_t_list)
         )
+        if metadata_list is not None:
+            assert metadata_sharded_t_list is not None
+            assert len(pmt_list) == len(metadata_list)
+            assert len(pmt_sharded_t_list) == len(metadata_sharded_t_list)
+
         self._split_weights_res = (
             pmt_sharded_t_list,
             weight_id_sharded_t_list,

@@ -3073,10 +3086,13 @@ def get_named_split_embedding_weights_snapshot(self, prefix: str = "") -> Iterat
         for table_idx, pmt_sharded_t in enumerate(pmt_sharded_t_list):
             table_config = self._config.embedding_tables[table_idx]
             key = append_prefix(prefix, f"{table_config.name}")
+            metadata_sharded_t = None
+            if metadata_sharded_t_list is not None:
+                metadata_sharded_t = metadata_sharded_t_list[table_idx]

             yield key, pmt_sharded_t, weight_id_sharded_t_list[
                 table_idx
-            ], bucket_cnt_sharded_t_list[table_idx], metadata_sharded_t_list[table_idx]
+            ], bucket_cnt_sharded_t_list[table_idx], metadata_sharded_t

     def flush(self) -> None:
         """

torchrec/distributed/embedding.py

Lines changed: 7 additions & 7 deletions
@@ -1067,7 +1067,6 @@ def post_state_dict_hook(
             assert (
                 weight_ids_sharded_t is not None
                 and id_cnt_per_bucket_sharded_t is not None
-                and metadata_sharded_t is not None
             )
             # The logic here assumes there is only one shard per table on any particular rank
             # if there are cases each rank has >1 shards, we need to update here accordingly

@@ -1121,12 +1120,13 @@ def update_destination(
                 destination,
                 virtual_table_sharded_t_map[table_name][1],
             )
-            update_destination(
-                table_name,
-                "metadata",
-                destination,
-                virtual_table_sharded_t_map[table_name][2],
-            )
+            if virtual_table_sharded_t_map[table_name][2] is not None:
+                update_destination(
+                    table_name,
+                    "metadata",
+                    destination,
+                    virtual_table_sharded_t_map[table_name][2],
+                )

     def _post_load_state_dict_hook(
         module: "ShardedEmbeddingCollection",
torchrec/distributed/embeddingbag.py

Lines changed: 7 additions & 7 deletions
@@ -1213,7 +1213,6 @@ def post_state_dict_hook(
             assert (
                 weight_ids_sharded_t is not None
                 and id_cnt_per_bucket_sharded_t is not None
-                and metadata_sharded_t is not None
             )
             # The logic here assumes there is only one shard per table on any particular rank
             # if there are cases each rank has >1 shards, we need to update here accordingly

@@ -1267,12 +1266,13 @@ def update_destination(
                 destination,
                 virtual_table_sharded_t_map[table_name][1],
             )
-            update_destination(
-                table_name,
-                "metadata",
-                destination,
-                virtual_table_sharded_t_map[table_name][2],
-            )
+            if virtual_table_sharded_t_map[table_name][2] is not None:
+                update_destination(
+                    table_name,
+                    "metadata",
+                    destination,
+                    virtual_table_sharded_t_map[table_name][2],
+                )

     def _post_load_state_dict_hook(
         module: "ShardedEmbeddingBagCollection",
