Skip to content

Commit 78b3096

Browse files
authored
Void model - pass 1 & 2 (CORE-38) (Comfy-Org#13403)
1 parent 2b63add commit 78b3096

9 files changed

Lines changed: 1070 additions & 2 deletions

File tree

comfy/latent_formats.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -793,9 +793,27 @@ class ZImagePixelSpace(ChromaRadiance):
793793
pass
794794

795795
class CogVideoX(LatentFormat):
    """Latent space description for the CogVideoX-2b checkpoint (THUDM/CogVideoX-2b).

    The scale factor set in ``__init__`` is the ``scaling_factor`` found in the
    2b variant's vae/config.json. The 5b-class checkpoints (CogVideoX-5b,
    CogVideoX-1.5-5B, CogVideoX-Fun-V1.5-*) use a different value and are
    covered by the CogVideoX1_5 subclass.
    """
    latent_dimensions = 3
    latent_channels = 16
    temporal_downscale_ratio = 4

    def __init__(self):
        # VAE scaling_factor of the 2b checkpoint.
        self.scale_factor = 1.15258426
808+
809+
810+
class CogVideoX1_5(CogVideoX):
    """Latent space description for the 5b-class CogVideoX checkpoints.

    Applies to THUDM/CogVideoX-5b, THUDM/CogVideoX-1.5-5B and the
    CogVideoX-Fun V1.5-5b family (VOID inpainting included); each of them
    declares scaling_factor=0.7 in its vae/config.json.

    supported_models.CogVideoX_T2V selects this format automatically when the
    transformer config reports at least 48 attention heads.
    """
    def __init__(self):
        # Only the scale factor differs from the 2b format; the channel count,
        # dimensionality and temporal ratio are inherited class attributes.
        self.scale_factor = 0.7

comfy/sd.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
import comfy.text_encoders.qwen35
6767
import comfy.text_encoders.ernie
6868
import comfy.text_encoders.gemma4
69+
import comfy.text_encoders.cogvideo
6970

7071
import comfy.model_patcher
7172
import comfy.lora
@@ -1224,6 +1225,7 @@ class CLIPType(Enum):
12241225
NEWBIE = 24
12251226
FLUX2 = 25
12261227
LONGCAT_IMAGE = 26
1228+
COGVIDEOX = 27
12271229

12281230

12291231

@@ -1428,6 +1430,9 @@ class EmptyClass:
14281430
clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data),
14291431
clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None)
14301432
clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
1433+
elif clip_type == CLIPType.COGVIDEOX:
1434+
clip_target.clip = comfy.text_encoders.cogvideo.cogvideo_te(**t5xxl_detect(clip_data))
1435+
clip_target.tokenizer = comfy.text_encoders.cogvideo.CogVideoXTokenizer
14311436
else: #CLIPType.MOCHI
14321437
clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data))
14331438
clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer

comfy/supported_models.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1872,6 +1872,14 @@ class CogVideoX_T2V(supported_models_base.BASE):
18721872
vae_key_prefix = ["vae."]
18731873
text_encoder_key_prefix = ["text_encoders."]
18741874

1875+
def __init__(self, unet_config):
    """Pick the latent format from the transformer size, then run the base initialiser.

    Args:
        unet_config: Detected transformer configuration mapping. Only
            ``num_attention_heads`` is inspected here; a missing key is
            treated as 0 and therefore keeps the class-level latent format.
    """
    # 2b-class (dim=1920, heads=30) keeps scale_factor=1.15258426.
    # 5b-class (dim=3072, heads=48), which includes CogVideoX-5b, 1.5-5B and
    # Fun-V1.5 inpainting, needs scale_factor=0.7 per vae/config.json.
    head_count = unet_config.get("num_attention_heads", 0)
    is_5b_class = head_count >= 48
    if is_5b_class:
        # Assigned before super().__init__ runs; presumably the base
        # initialiser instantiates self.latent_format (confirm in
        # supported_models_base.BASE).
        self.latent_format = latent_formats.CogVideoX1_5
    super().__init__(unet_config)
1882+
18751883
def get_model(self, state_dict, prefix="", device=None):
18761884
# CogVideoX 1.5 (patch_size_t=2) has different training base dimensions for RoPE
18771885
if self.unet_config.get("patch_size_t") is not None:
@@ -1898,6 +1906,20 @@ def get_model(self, state_dict, prefix="", device=None):
18981906
out = model_base.CogVideoX(self, image_to_video=True, device=device)
18991907
return out
19001908

1909+
class CogVideoX_Inpaint(CogVideoX_T2V):
    """CogVideoX model definition for checkpoints with 48 input channels."""
    unet_config = {
        "image_model": "cogvideox",
        "in_channels": 48,
    }

    def get_model(self, state_dict, prefix="", device=None):
        """Create the CogVideoX model in image-to-video mode.

        When the config carries a ``patch_size_t`` value, sample height,
        width and frame count are filled in with defaults unless they are
        already present in the config.
        """
        config = self.unet_config
        if config.get("patch_size_t") is not None:
            fallback_dims = (
                ("sample_height", 96),
                ("sample_width", 170),
                ("sample_frames", 81),
            )
            for key, fallback in fallback_dims:
                # setdefault keeps any value already present in the config.
                config.setdefault(key, fallback)
        return model_base.CogVideoX(self, image_to_video=True, device=device)
1922+
19011923

19021924
models = [
19031925
LotusD,
@@ -1978,6 +2000,7 @@ def get_model(self, state_dict, prefix="", device=None):
19782000
ErnieImage,
19792001
SAM3,
19802002
SAM31,
2003+
CogVideoX_Inpaint,
19812004
CogVideoX_I2V,
19822005
CogVideoX_T2V,
19832006
SVD_img2vid,

comfy/text_encoders/cogvideo.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,48 @@
11
import comfy.text_encoders.sd3_clip
2+
from comfy import sd1_clip
23

34

45
class CogVideoXT5Tokenizer(comfy.text_encoders.sd3_clip.T5XXLTokenizer):
    """T5-XXL tokenizer configured for CogVideoX prompts.

    CogVideoX was trained on T5 embeddings padded to 226 tokens (SD3 uses 77),
    so the minimum length is raised accordingly. The class is used directly by
    supported_models.CogVideoX_T2V.clip_target (paired with the raw
    T5XXLModel) and as the inner tokenizer of the CogVideoXTokenizer wrapper.
    """
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            min_length=226,
            tokenizer_data=tokenizer_data,
            embedding_directory=embedding_directory,
        )
14+
15+
16+
class CogVideoXTokenizer(sd1_clip.SD1Tokenizer):
    """Outer tokenizer wrapper used by CLIPLoader with type="cogvideox".

    Registers CogVideoXT5Tokenizer under the clip name "t5xxl".
    """
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            tokenizer=CogVideoXT5Tokenizer,
            clip_name="t5xxl",
            tokenizer_data=tokenizer_data,
            embedding_directory=embedding_directory,
        )
21+
22+
23+
class CogVideoXT5XXL(sd1_clip.SD1ClipModel):
    """Outer T5-XXL model wrapper used by CLIPLoader with type="cogvideox".

    Exposes the raw T5-XXL model through the SD1ClipModel interface so that
    CLIP.__init__, which reads ``self.dtypes``, works correctly. The wrapped
    model is the stock sd3_clip.T5XXLModel; CogVideoX needs no attention-mask
    change.
    """
    def __init__(self, device="cpu", dtype=None, model_options={}):
        inner_model_cls = comfy.text_encoders.sd3_clip.T5XXLModel
        super().__init__(
            device=device,
            dtype=dtype,
            name="t5xxl",
            clip_model=inner_model_cls,
            model_options=model_options,
        )
34+
35+
36+
def cogvideo_te(dtype_t5=None, t5_quantization_metadata=None):
    """Build a CogVideoXT5XXL subclass bound to the detected T5 settings.

    Intended for load_text_encoder_state_dicts.

    Args:
        dtype_t5: When not None, replaces whatever ``dtype`` the returned
            class is constructed with.
        t5_quantization_metadata: When not None, stored under
            ``"t5xxl_quantization_metadata"`` in a copy of the model options.

    Returns:
        The configured class (not an instance).
    """
    class CogVideoXTEModel_(CogVideoXT5XXL):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            options = model_options
            if t5_quantization_metadata is not None:
                # Copy first so the caller's dict (or the shared default) is
                # never mutated.
                options = model_options.copy()
                options["t5xxl_quantization_metadata"] = t5_quantization_metadata
            chosen_dtype = dtype if dtype_t5 is None else dtype_t5
            super().__init__(device=device, dtype=chosen_dtype, model_options=options)

    return CogVideoXTEModel_

0 commit comments

Comments
 (0)