code
diff --git a/comfy/cli_args.py b/comfy/cli_args.py
Lines changed: 1 addition & 1 deletion
diff --git a/comfy/controlnet.py b/comfy/controlnet.py
Lines changed: 60 additions & 5 deletions
diff --git a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py
Lines changed: 12 additions & 8 deletions
diff --git a/comfy/memory_management.py b/comfy/memory_management.py
Lines changed: 18 additions & 18 deletions
@@ -49,7 +49,7 @@ def __call__(self, parser, namespace, values, option_string=None):
 parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.")
 parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
 parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.")
-parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use. All other devices will not be visible.")
+parser.add_argument("--cuda-device", type=str, default=None, metavar="DEVICE_ID", help="Set the ids of cuda devices this instance will use, as a comma-separated list (e.g. '0' or '0,1'). All other devices will not be visible.")
 parser.add_argument("--default-device", type=int, default=None, metavar="DEFAULT_DEVICE_ID", help="Set the id of the default device, all other devices will stay visible.")
 cm_group = parser.add_mutually_exclusive_group()
 cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
 
@@ -15,13 +15,14 @@
     You should have received a copy of the GNU General Public License
     along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """
-
+from __future__ import annotations
 
 import torch
 from enum import Enum
 import math
 import os
 import logging
+import copy
 import comfy.utils
 import comfy.model_management
 import comfy.model_detection
@@ -38,7 +39,7 @@
 import comfy.ldm.flux.controlnet
 import comfy.ldm.qwen_image.controlnet
 import comfy.cldm.dit_embedder
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Union
 if TYPE_CHECKING:
     from comfy.hooks import HookGroup
 
@@ -64,6 +65,18 @@ class StrengthType(Enum):
     CONSTANT = 1
     LINEAR_UP = 2
 
+class ControlIsolation:
+    '''Context manager that sets a ControlBase object's previous_controlnet to None on enter and restores the value captured at construction on exit, to prevent cascading calls.'''
+    def __init__(self, control: ControlBase):
+        self.control = control
+        self.orig_previous_controlnet = control.previous_controlnet  # captured at construction, not in __enter__
+
+    def __enter__(self):
+        self.control.previous_controlnet = None  # nothing is returned, so `with ... as x` would bind None
+
+    def __exit__(self, *args):
+        self.control.previous_controlnet = self.orig_previous_controlnet  # restores the link; returns None, so exceptions propagate
+
 class ControlBase:
     def __init__(self):
         self.cond_hint_original = None
@@ -77,14 +90,15 @@ def __init__(self):
         self.compression_ratio = 8
         self.upscale_algorithm = 'nearest-exact'
         self.extra_args = {}
-        self.previous_controlnet = None
+        self.previous_controlnet: Union[ControlBase, None] = None
         self.extra_conds = []
         self.strength_type = StrengthType.CONSTANT
         self.concat_mask = False
         self.extra_concat_orig = []
         self.extra_concat = None
         self.extra_hooks: HookGroup = None
         self.preprocess_image = lambda a: a
+        self.multigpu_clones: dict[torch.device, ControlBase] = {}
 
     def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None, extra_concat=[]):
         self.cond_hint_original = cond_hint
@@ -111,17 +125,38 @@ def set_previous_controlnet(self, controlnet):
     def cleanup(self):
         if self.previous_controlnet is not None:
             self.previous_controlnet.cleanup()
-
+        for device_cnet in self.multigpu_clones.values():
+            with ControlIsolation(device_cnet):
+                device_cnet.cleanup()
         self.cond_hint = None
         self.extra_concat = None
         self.timestep_range = None
 
     def get_models(self):
         out = []
+        for device_cnet in self.multigpu_clones.values():
+            out += device_cnet.get_models_only_self()
         if self.previous_controlnet is not None:
             out += self.previous_controlnet.get_models()
         return out
 
+    def get_models_only_self(self):
+        'Calls get_models with previous_controlnet temporarily set to None, so the previous_controlnet chain is not followed; the link is restored afterwards.'
+        with ControlIsolation(self):
+            return self.get_models()
+
+    def get_instance_for_device(self, device):
+        'Returns the clone registered in multigpu_clones for the selected device, or this object itself if no clone is registered for it.'
+        return self.multigpu_clones.get(device, self)
+
+    def deepclone_multigpu(self, load_device, autoregister=False):
+        '''
+        Create a deep clone of this Control object whose model(s) are set to load_device.
+
+        When autoregister is True, the clone is also added to the multigpu_clones dict. The base implementation always raises NotImplementedError; subclasses must override it.
+        '''
+        raise NotImplementedError("Classes inheriting from ControlBase should define their own deepclone_multigpu function.")
+
     def get_extra_hooks(self):
         out = []
         if self.extra_hooks is not None:
@@ -130,7 +165,7 @@ def get_extra_hooks(self):
             out += self.previous_controlnet.get_extra_hooks()
         return out
 
-    def copy_to(self, c):
+    def copy_to(self, c: ControlBase):
         c.cond_hint_original = self.cond_hint_original
         c.strength = self.strength
         c.timestep_percent_range = self.timestep_percent_range
@@ -284,6 +319,14 @@ def copy(self):
         self.copy_to(c)
         return c
 
+    def deepclone_multigpu(self, load_device, autoregister=False):
+        c = self.copy()  # start from a regular copy() of this control
+        c.control_model = copy.deepcopy(c.control_model)  # `copy` is the stdlib module here; the clone gets its own model object
+        c.control_model_wrapped = comfy.model_patcher.ModelPatcher(c.control_model, load_device=load_device, offload_device=comfy.model_management.unet_offload_device())
+        if autoregister:
+            self.multigpu_clones[load_device] = c  # registered on the original (self), keyed by load_device
+        return c
+
     def get_models(self):
         out = super().get_models()
         out.append(self.control_model_wrapped)
@@ -314,6 +357,10 @@ def pre_run(self, model, percent_to_timestep_function):
         super().pre_run(model, percent_to_timestep_function)
         self.set_extra_arg("base_model", model.diffusion_model)
 
+    def cleanup(self):
+        self.extra_args.pop("base_model", None)  # drop the "base_model" extra arg (presumably stored by set_extra_arg in pre_run); no error if absent
+        super().cleanup()
+
     def copy(self):
         c = QwenFunControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
         c.control_model = self.control_model
@@ -906,6 +953,14 @@ def copy(self):
         self.copy_to(c)
         return c
 
+    def deepclone_multigpu(self, load_device, autoregister=False):
+        c = self.copy()  # start from a regular copy() of this adapter
+        c.t2i_model = copy.deepcopy(c.t2i_model)  # `copy` is the stdlib module here; the clone gets its own model object
+        c.device = load_device
+        if autoregister:
+            self.multigpu_clones[load_device] = c  # registered on the original (self), keyed by load_device
+        return c
+
 def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options
     compression_ratio = 8
     upscale_algorithm = 'nearest-exact'
 
@@ -607,9 +607,13 @@ def __init__(
     def forward(self, x, t, context, transformer_options = {}, **kwargs):
 
         x = x.movedim(-1, -2)
-        if context.shape[0] >= 2:
-            uncond_emb, cond_emb = context.chunk(2, dim = 0)
-            context = torch.cat([cond_emb, uncond_emb], dim = 0)
+
+        swap_cfg_halves = context.shape[0] >= 2
+
+        if swap_cfg_halves:
+            first_half, second_half = context.chunk(2, dim = 0)
+            context = torch.cat([second_half, first_half], dim = 0)
+
         main_condition = context
 
         t = 1.0 - t
@@ -657,8 +661,8 @@ def block_wrap(args):
         output = self.final_layer(combined)
         output =  output.movedim(-2, -1) * (-1.0)
 
-        if output.shape[0] >= 2:
-            cond_emb, uncond_emb = output.chunk(2, dim = 0)
-            return torch.cat([uncond_emb, cond_emb])
-        else:
-            return output
+        if swap_cfg_halves:
+            first_half, second_half = output.chunk(2, dim = 0)
+            output = torch.cat([second_half, first_half], dim = 0)
+
+        return output
@@ -1,6 +1,5 @@
 import math
 import ctypes
-import threading
 import dataclasses
 import torch
 from typing import NamedTuple
@@ -10,7 +9,7 @@
 
 class TensorFileSlice(NamedTuple):
     file_ref: object
-    thread_id: int
+    lock: object
     offset: int
     size: int
 
@@ -43,7 +42,6 @@ def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=N
     file_obj = info.file_ref
     if (destination.device.type != "cpu"
             or file_obj is None
-            or threading.get_ident() != info.thread_id
             or destination.numel() * destination.element_size() < info.size
             or tensor.numel() * tensor.element_size() != info.size
             or tensor.storage_offset() != 0
@@ -57,27 +55,29 @@ def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=N
     if hostbuf is not None:
         stream_ptr = getattr(stream, "cuda_stream", 0) if stream is not None else 0
         device_ptr = destination2.data_ptr() if destination2 is not None else 0
-        hostbuf.read_file_slice(file_obj, info.offset, info.size,
-                                offset=destination.data_ptr() - hostbuf.get_raw_address(),
-                                stream=stream_ptr,
-                                device_ptr=device_ptr,
-                                device=None if destination2 is None else destination2.device.index)
+        with info.lock:
+            hostbuf.read_file_slice(file_obj, info.offset, info.size,
+                                    offset=destination.data_ptr() - hostbuf.get_raw_address(),
+                                    stream=stream_ptr,
+                                    device_ptr=device_ptr,
+                                    device=None if destination2 is None else destination2.device.index)
         return True
 
     buf_type = ctypes.c_ubyte * info.size
     view = memoryview(buf_type.from_address(destination.data_ptr()))
 
     try:
-        file_obj.seek(info.offset)
-        done = 0
-        while done < info.size:
-            try:
-                n = file_obj.readinto(view[done:])
-            except OSError:
-                return False
-            if n <= 0:
-                return False
-            done += n
+        with info.lock:
+            file_obj.seek(info.offset)
+            done = 0
+            while done < info.size:
+                try:
+                    n = file_obj.readinto(view[done:])
+                except OSError:
+                    return False
+                if n <= 0:
+                    return False
+                done += n
         return True
     finally:
         view.release()