code · pull · May 13, 2026 · May 12, 2026 · May 13, 2026 · May 13, 2026
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
@@ -1443,7 +1443,7 @@ class HiDreamO1(supported_models_base.BASE):
     }
 
     latent_format = latent_formats.HiDreamO1Pixel
-    memory_usage_factor = 0.6
+    memory_usage_factor = 0.033
     # fp16 not supported: LM MLP down_proj activations fp16 overflow, causing NaNs
     supported_inference_dtypes = [torch.bfloat16, torch.float32]
 

diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py
@@ -297,6 +297,7 @@ class LoadAudio(IO.ComfyNode):
     @classmethod
     def define_schema(cls):
         input_dir = folder_paths.get_input_directory()
+        os.makedirs(input_dir, exist_ok=True)
         files = folder_paths.filter_files_content_types(os.listdir(input_dir), ["audio", "video"])
         return IO.Schema(
             node_id="LoadAudio",

diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py
@@ -338,8 +338,25 @@ def execute(cls, positive, negative, vae, latent, image, frame_idx, strength) ->
         noise_mask = get_noise_mask(latent)
 
         _, _, latent_length, latent_height, latent_width = latent_image.shape
+
+        # For multi-frame guides that do not start at frame 0, prepend a duplicate of the first frame before encoding and drop its latent (and the duplicate frame) afterwards, so the VAE's "first latent = 1 pixel frame" asymmetry lands on the discarded slot
+        time_scale_factor = scale_factors[0]
+        num_frames_to_keep = ((image.shape[0] - 1) // time_scale_factor) * time_scale_factor + 1
+        resolved_frame_idx = frame_idx
+        if frame_idx < 0:
+            _, num_keyframes = get_keyframe_idxs(positive)
+            resolved_frame_idx = max((latent_length - num_keyframes - 1) * time_scale_factor + 1 + frame_idx, 0)
+        causal_fix = resolved_frame_idx == 0 or num_frames_to_keep == 1
+
+        if not causal_fix:
+            image = torch.cat([image[:1], image], dim=0)
+
         image, t = cls.encode(vae, latent_width, latent_height, image, scale_factors)
 
+        if not causal_fix:
+            t = t[:, :, 1:, :, :]
+            image = image[1:]
+
         frame_idx, latent_idx = cls.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors)
         assert latent_idx + t.shape[2] <= latent_length, "Conditioning frames exceed the length of the latent sequence."
 
@@ -352,6 +369,7 @@ def execute(cls, positive, negative, vae, latent, image, frame_idx, strength) ->
             t,
             strength,
             scale_factors,
+            causal_fix=causal_fix,
         )
 
         # Track this guide for per-reference attention control.

diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py
@@ -40,23 +40,13 @@ def composite(destination, source, x, y, mask = None, multiplier = 8, resize_sou
 
     inverse_mask = torch.ones_like(mask) - mask
 
-    source_rgb = source[:, :3, :visible_height, :visible_width]
-    dest_slice = destination[..., top:bottom, left:right]
-
-    if destination.shape[1] == 4:
-        if torch.max(dest_slice) == 0:
-            destination[:, :3, top:bottom, left:right] = source_rgb
-            destination[:, 3:4, top:bottom, left:right] = mask
-        else:
-            destination[:, :3, top:bottom, left:right] = (mask * source_rgb) + (inverse_mask * dest_slice[:, :3])
-            destination[:, 3:4, top:bottom, left:right] = torch.max(mask, dest_slice[:, 3:4])
-    else:
-        source_portion = mask * source_rgb
-        destination_portion = inverse_mask * dest_slice
-        destination[..., top:bottom, left:right] = source_portion + destination_portion
+    source_portion = mask * source[..., :visible_height, :visible_width]
+    destination_portion = inverse_mask  * destination[..., top:bottom, left:right]
 
+    destination[..., top:bottom, left:right] = source_portion + destination_portion
     return destination
 
+
 class LatentCompositeMasked(IO.ComfyNode):
     @classmethod
     def define_schema(cls):
@@ -95,23 +85,18 @@ def define_schema(cls):
             display_name="Image Composite Masked",
             category="image",
             inputs=[
+                IO.Image.Input("destination"),
                 IO.Image.Input("source"),
                 IO.Int.Input("x", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1),
                 IO.Int.Input("y", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1),
                 IO.Boolean.Input("resize_source", default=False),
-                IO.Image.Input("destination", optional=True),
                 IO.Mask.Input("mask", optional=True),
             ],
             outputs=[IO.Image.Output()],
         )
 
     @classmethod
-    def execute(cls, source, x, y, resize_source, destination = None, mask = None) -> IO.NodeOutput:
-        if destination is None: # transparent rgba
-            B, H, W, C = source.shape
-            destination = torch.zeros((B, H, W, 4), dtype=source.dtype, device=source.device)
-            if C == 3:
-                source = torch.nn.functional.pad(source, (0, 1), value=1.0)
+    def execute(cls, destination, source, x, y, resize_source, mask = None) -> IO.NodeOutput:
         destination, source = node_helpers.image_alpha_fix(destination, source)
         destination = destination.clone().movedim(-1, 1)
         output = composite(destination, source.movedim(-1, 1), x, y, mask, 1, resize_source).movedim(1, -1)