@@ -465,8 +465,9 @@ def fsdp2_load_full_state_dict(accelerator, model: torch.nn.Module, full_sd: dict
     """
     import torch.distributed as dist
     from torch.distributed.tensor import distribute_tensor
-
-    # Model was previously copied to meta device
+    from accelerate.state import PartialState
+
+    # Model was previously copied to meta device
     meta_sharded_sd = model.state_dict()
     sharded_sd = {}
 
@@ -498,8 +499,8 @@ def _cast_and_contiguous(tensor, to_contiguous, dtype):
 
     if accelerator.is_main_process:
         for (param_name, full_param), sharded_param in zip(full_sd.items(), meta_sharded_sd.values()):
-            full_param = full_param.detach().cuda()
             mesh = sharded_param.device_mesh
+            full_param = full_param.detach().to(mesh.device_type)
             dist.broadcast(full_param, src=0, group=mesh.get_group())
             sharded_tensor = distribute_tensor(full_param, mesh, sharded_param.placements)
             to_contiguous, casting_dtype = _infer_parameter_dtype(
@@ -512,8 +513,8 @@ def _cast_and_contiguous(tensor, to_contiguous, dtype):
     # We need this else to have a matching `broadcast` for all of the ranks, else we deadlock
     else:
         for param_name, sharded_param in meta_sharded_sd.items():
-            full_tensor = torch.empty(sharded_param.size(), device="cuda", dtype=sharded_param.dtype)
             mesh = sharded_param.device_mesh
+            full_tensor = torch.empty(sharded_param.size(), device=mesh.device_type, dtype=sharded_param.dtype)
             dist.broadcast(full_tensor, src=0, group=mesh.get_group())
             sharded_tensor = distribute_tensor(full_tensor, mesh, sharded_param.placements)
             to_contiguous, casting_dtype = _infer_parameter_dtype(
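For context, here is a minimal, self-contained sketch (not part of the PR) of the broadcast-and-reshard pattern the diff makes device-agnostic: rank 0 moves each full weight to the mesh's device type instead of hard-coding `.cuda()`, every other rank allocates an empty buffer of matching shape and dtype, and all ranks join the same `broadcast` before re-sharding with `distribute_tensor`. The helper name `broadcast_and_shard` and its standalone signature are illustrative only; the real function also handles dtype casting via `_infer_parameter_dtype` and `_cast_and_contiguous`, which are omitted here.

```python
import torch
import torch.distributed as dist
from torch.distributed.tensor import distribute_tensor


def broadcast_and_shard(full_sd, meta_sharded_sd, is_main_process):
    """Illustrative helper: broadcast full weights from rank 0 and re-shard them.

    `meta_sharded_sd` is assumed to be a state dict of DTensors (e.g. from a model
    materialized on the meta device under FSDP2), each carrying a `device_mesh`
    and `placements`. `full_sd` only needs real data on the main process.
    """
    sharded_sd = {}
    for param_name, sharded_param in meta_sharded_sd.items():
        mesh = sharded_param.device_mesh
        if is_main_process:
            # Move the full weight to the mesh's device type ("cuda", "xpu",
            # "cpu", ...) rather than hard-coding .cuda(), as the diff does.
            full_tensor = full_sd[param_name].detach().to(mesh.device_type)
        else:
            # Non-main ranks allocate an empty buffer of matching shape/dtype so
            # every rank participates in the same broadcast (avoids a deadlock).
            full_tensor = torch.empty(
                sharded_param.size(), device=mesh.device_type, dtype=sharded_param.dtype
            )
        dist.broadcast(full_tensor, src=0, group=mesh.get_group())
        # Turn the now-replicated full tensor into this rank's shard of a DTensor.
        sharded_sd[param_name] = distribute_tensor(full_tensor, mesh, sharded_param.placements)
    return sharded_sd
```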