4 changes: 2 additions & 2 deletions src/accelerate/utils/fsdp_utils.py
@@ -498,8 +498,8 @@ def _cast_and_contiguous(tensor, to_contiguous, dtype):
 
     if accelerator.is_main_process:
         for (param_name, full_param), sharded_param in zip(full_sd.items(), meta_sharded_sd.values()):
-            full_param = full_param.detach().cuda()
             mesh = sharded_param.device_mesh
+            full_param = full_param.detach().to(mesh.device_type)
             dist.broadcast(full_param, src=0, group=mesh.get_group())
             sharded_tensor = distribute_tensor(full_param, mesh, sharded_param.placements)
             to_contiguous, casting_dtype = _infer_parameter_dtype(
@@ -512,8 +512,8 @@ def _cast_and_contiguous(tensor, to_contiguous, dtype):
     # We need this else to have a matching `broadcast` for all of the ranks, else we deadlock
     else:
         for param_name, sharded_param in meta_sharded_sd.items():
-            full_tensor = torch.empty(sharded_param.size(), device="cuda", dtype=sharded_param.dtype)
             mesh = sharded_param.device_mesh
+            full_tensor = torch.empty(sharded_param.size(), device=mesh.device_type, dtype=sharded_param.dtype)
             dist.broadcast(full_tensor, src=0, group=mesh.get_group())
             sharded_tensor = distribute_tensor(full_tensor, mesh, sharded_param.placements)
             to_contiguous, casting_dtype = _infer_parameter_dtype(
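Both hunks replace the hard-coded CUDA placement with the device type of the parameter's `DeviceMesh`, so FSDP2 full-state-dict loading works on whatever accelerator the mesh was built for (XPU, HPU, ...) rather than CUDA only. Below is a minimal standalone sketch of the same pattern, assuming a recent PyTorch with the public `torch.distributed.tensor` API; the mesh size, tensor shape, and `Shard(0)` placement are illustrative, not taken from the PR:

```python
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor

dist.init_process_group()  # backend inferred from the environment
device_type = "cuda" if torch.cuda.is_available() else "cpu"  # could also be "xpu", "hpu", ...
mesh = init_device_mesh(device_type, (dist.get_world_size(),))

# Rank 0 holds the full tensor; every rank allocates on mesh.device_type
# instead of calling .cuda(), which is the point of the change.
if dist.get_rank() == 0:
    full = torch.randn(8, 8).to(mesh.device_type)
else:
    full = torch.empty(8, 8, device=mesh.device_type)

# Matching broadcast on all ranks, then shard the result across the mesh,
# mirroring the dist.broadcast + distribute_tensor sequence in the diff.
dist.broadcast(full, src=0, group=mesh.get_group())
sharded = distribute_tensor(full, mesh, [Shard(0)])
print(f"rank {dist.get_rank()}: local shard {sharded.to_local().shape}")
dist.destroy_process_group()
```

Launch with e.g. `torchrun --nproc_per_node=2 sketch.py`; the same script then runs unmodified on CPU (gloo) or CUDA (nccl).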
10 changes: 9 additions & 1 deletion src/accelerate/utils/memory.py
@@ -153,7 +153,15 @@ def find_executable_batch_size(
 
     def reduce_batch_size_fn():
         nonlocal batch_size
-        batch_size = batch_size // 2
+        new_batch_size = int(batch_size * 0.9)
+        if new_batch_size == 0:
+            new_batch_size = 1
+        if new_batch_size == batch_size:
+            if batch_size > 1:
+                new_batch_size = batch_size - 1
+            else:
+                raise RuntimeError("Batch size reduced to 1 and still out of memory.")
+        batch_size = new_batch_size
         return batch_size
 
     def decorator(*args, **kwargs):
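For context: the old schedule halved the batch size on every out-of-memory retry; the new one cuts it by 10% (floored at 1, and raising once a batch size of 1 itself fails), so from a starting size of 128 the retries go 115, 103, 92, ... instead of 64, 32, 16, landing much closer to the largest size that actually fits. The public API is unchanged; a minimal usage sketch, with a placeholder training loop:

```python
from accelerate.utils import find_executable_batch_size

@find_executable_batch_size(starting_batch_size=128)
def training_loop(batch_size):
    # The decorator injects `batch_size` as the first argument and, on a
    # CUDA out-of-memory error, frees cached memory and retries with the
    # reduced size (now 90% of the previous attempt instead of half).
    print(f"trying batch size {batch_size}")
    ...  # build the dataloader and train with `batch_size` here

training_loop()  # called without batch_size; the decorator supplies it
```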