st publish mode only load weight (#3538)

EddyLXJ · meta-codesync[bot] · commit 22baa4f108d2 · 2025-11-14T22:17:46.000-08:00
Summary: X-link: pytorch/FBGEMM#5116 Pull Request resolved: #3538 X-link: https://github.com/facebookresearch/FBGEMM/pull/2122 For silvertorch (st) publish, we don't want to load the optimizer state (opt) into the backend, because CPU memory on the publish host is limited. So, while loading the checkpoint in st publish, we need to load the whole row into the state dict and then save only the weight into the backend; after that, the backend will only hold metaheader + weight. For the first load, we need to set dim to metaheader_dim + emb_dim + optimizer_state_dim, otherwise checkpoint loading will throw a size mismatch error. After the first load, we only need to get metaheader + weight from the backend for the state dict, so we can set dim to metaheader_dim + emb_dim. Reviewed By: emlin Differential Revision: D85830053 fbshipit-source-id: 0eddbe9e69ea8271e8c77dc0147e87a08f0b3934
diff --git a/torchrec/distributed/batched_embedding_kernel.py b/torchrec/distributed/batched_embedding_kernel.py
@@ -477,13 +477,15 @@ def _populate_zero_collision_tbe_params(
             else False
         )
     )
+
     tbe_params["kv_zch_params"] = KVZCHParams(
         bucket_offsets=bucket_offsets,
         bucket_sizes=bucket_sizes,
         enable_optimizer_offloading=True,
         backend_return_whole_row=(backend_type == BackendType.DRAM),
         eviction_policy=eviction_policy,
         embedding_cache_mode=embedding_cache_mode_,
+        load_ckpt_without_opt=eviction_tbe_config.load_ckpt_without_opt,
     )
 
 
diff --git a/torchrec/distributed/types.py b/torchrec/distributed/types.py
@@ -664,6 +664,7 @@ class KeyValueParams:
         enable_raw_embedding_streaming: Optional[bool]: enable raw embedding streaming for SSD TBE
         res_store_shards: Optional[int] = None: the number of shards to store the raw embeddings
         kvzch_tbe_config: Optional[KVZCHTBEConfig]: KVZCH config for TBE
+        load_ckpt_without_opt: bool: whether it is st publish
 
         # Parameter Server (PS) Attributes
         ps_hosts (Optional[Tuple[Tuple[str, int]]]): List of PS host ip addresses
@@ -690,6 +691,7 @@ class KeyValueParams:
     )
     res_store_shards: Optional[int] = None  # shards to store the raw embeddings
     kvzch_tbe_config: Optional[KVZCHTBEConfig] = None
+    load_ckpt_without_opt: bool = False  # is st publish
 
     # Parameter Server (PS) Attributes
     ps_hosts: Optional[Tuple[Tuple[str, int], ...]] = None
@@ -719,6 +721,7 @@ def __hash__(self) -> int:
                 self.enable_raw_embedding_streaming,
                 self.res_store_shards,
                 self.kvzch_tbe_config,
+                self.load_ckpt_without_opt,
             )
         )
 

Original file line number	Diff line number	Diff line change
`@@ -477,13 +477,15 @@ def _populate_zero_collision_tbe_params(`
`477`	`477`	`else False`
`478`	`478`	`)`
`479`	`479`	`)`
	`480`	`+`
`480`	`481`	`tbe_params["kv_zch_params"] = KVZCHParams(`
`481`	`482`	`bucket_offsets=bucket_offsets,`
`482`	`483`	`bucket_sizes=bucket_sizes,`
`483`	`484`	`enable_optimizer_offloading=True,`
`484`	`485`	`backend_return_whole_row=(backend_type == BackendType.DRAM),`
`485`	`486`	`eviction_policy=eviction_policy,`
`486`	`487`	`embedding_cache_mode=embedding_cache_mode_,`
	`488`	`+ load_ckpt_without_opt=eviction_tbe_config.load_ckpt_without_opt,`
`487`	`489`	`)`
`488`	`490`
`489`	`491`
Original file line number	Diff line number	Diff line change
`@@ -664,6 +664,7 @@ class KeyValueParams:`
`664`	`664`	`enable_raw_embedding_streaming: Optional[bool]: enable raw embedding streaming for SSD TBE`
`665`	`665`	`res_store_shards: Optional[int] = None: the number of shards to store the raw embeddings`
`666`	`666`	`kvzch_tbe_config: Optional[KVZCHTBEConfig]: KVZCH config for TBE`
	`667`	`+ load_ckpt_without_opt: bool: whether it is st publish`
`667`	`668`
`668`	`669`	`# Parameter Server (PS) Attributes`
`669`	`670`	`ps_hosts (Optional[Tuple[Tuple[str, int]]]): List of PS host ip addresses`
`@@ -690,6 +691,7 @@ class KeyValueParams:`
`690`	`691`	`)`
`691`	`692`	`res_store_shards: Optional[int] = None # shards to store the raw embeddings`
`692`	`693`	`kvzch_tbe_config: Optional[KVZCHTBEConfig] = None`
	`694`	`+ load_ckpt_without_opt: bool = False # is st publish`
`693`	`695`
`694`	`696`	`# Parameter Server (PS) Attributes`
`695`	`697`	`ps_hosts: Optional[Tuple[Tuple[str, int], ...]] = None`
`@@ -719,6 +721,7 @@ def __hash__(self) -> int:`
`719`	`721`	`self.enable_raw_embedding_streaming,`
`720`	`722`	`self.res_store_shards,`
`721`	`723`	`self.kvzch_tbe_config,`
	`724`	`+ self.load_ckpt_without_opt,`
`722`	`725`	`)`
`723`	`726`	`)`
`724`	`727`