
Commit 9e9c129

Merge remote-tracking branch 'origin/master' into worksplit-multigpu
2 parents: ac14ee6 + 885015e


64 files changed (+4439, -1055 lines)

README.md

Lines changed: 2 additions & 3 deletions
@@ -65,18 +65,17 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
 - [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
 - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/)
 - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/)
-- [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/)
 - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/)
 - Image Editing Models
 - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
 - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
 - [HiDream E1.1](https://comfyanonymous.github.io/ComfyUI_examples/hidream/#hidream-e11)
+- [Qwen Image Edit](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/#edit-model)
 - Video Models
 - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/)
 - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/)
 - [LTX-Video](https://comfyanonymous.github.io/ComfyUI_examples/ltxv/)
 - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
-- [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/) and [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/)
 - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
 - [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/)
 - Audio Models
@@ -191,7 +190,7 @@ comfy install
 
 ## Manual Install (Windows, Linux)
 
-python 3.13 is supported but using 3.12 is recommended because some custom nodes and their dependencies might not support it yet.
+Python 3.13 is very well supported. If you have trouble with some custom node dependencies you can try 3.12
 
 Git clone this repo.

app/user_manager.py

Lines changed: 10 additions & 3 deletions
@@ -363,10 +363,17 @@ async def post_userdata(request):
        if not overwrite and os.path.exists(path):
            return web.Response(status=409, text="File already exists")

-       body = await request.read()
+       try:
+           body = await request.read()

-       with open(path, "wb") as f:
-           f.write(body)
+           with open(path, "wb") as f:
+               f.write(body)
+       except OSError as e:
+           logging.warning(f"Error saving file '{path}': {e}")
+           return web.Response(
+               status=400,
+               reason="Invalid filename. Please avoid special characters like :\\/*?\"<>|"
+           )

        user_path = self.get_request_user_filepath(request, None)
        if full_info:
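
For context, an invalid target filename previously made post_userdata raise out of the handler; the change above catches the OSError from open()/write() and surfaces it as HTTP 400. A minimal client-side probe of the new error path (a sketch: the /userdata/{file} route and port 8188 are assumptions about a stock local server, and a ':' in the name only trips OSError on Windows filesystems):

    # Hypothetical probe, not part of the commit.
    import requests

    resp = requests.post(
        "http://127.0.0.1:8188/userdata/bad%3Aname.json",  # ':' is illegal in Windows filenames
        data=b"{}",
    )
    print(resp.status_code)  # expect 400 (new OSError branch) on Windows, 409 if the file already exists
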
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+from .wav2vec2 import Wav2Vec2Model
+import comfy.model_management
+import comfy.ops
+import comfy.utils
+import logging
+import torchaudio
+
+
+class AudioEncoderModel():
+    def __init__(self, config):
+        self.load_device = comfy.model_management.text_encoder_device()
+        offload_device = comfy.model_management.text_encoder_offload_device()
+        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
+        self.model = Wav2Vec2Model(dtype=self.dtype, device=offload_device, operations=comfy.ops.manual_cast)
+        self.model.eval()
+        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
+        self.model_sample_rate = 16000
+
+    def load_sd(self, sd):
+        return self.model.load_state_dict(sd, strict=False)
+
+    def get_sd(self):
+        return self.model.state_dict()
+
+    def encode_audio(self, audio, sample_rate):
+        comfy.model_management.load_model_gpu(self.patcher)
+        audio = torchaudio.functional.resample(audio, sample_rate, self.model_sample_rate)
+        out, all_layers = self.model(audio.to(self.load_device))
+        outputs = {}
+        outputs["encoded_audio"] = out
+        outputs["encoded_audio_all_layers"] = all_layers
+        return outputs
+
+
+def load_audio_encoder_from_sd(sd, prefix=""):
+    audio_encoder = AudioEncoderModel(None)
+    sd = comfy.utils.state_dict_prefix_replace(sd, {"wav2vec2.": ""})
+    m, u = audio_encoder.load_sd(sd)
+    if len(m) > 0:
+        logging.warning("missing audio encoder: {}".format(m))
+
+    return audio_encoder
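
A hypothetical usage sketch for the new loader (the diff header above does not show a filename, so the module path below is an assumption, and the checkpoint path is a placeholder). Note that encode_audio resamples whatever rate it is given down to the model's fixed 16 kHz:

    import torch
    import comfy.utils
    from comfy.audio_encoders.audio_encoders import load_audio_encoder_from_sd  # assumed module path

    sd = comfy.utils.load_torch_file("wav2vec2.safetensors")  # placeholder checkpoint
    encoder = load_audio_encoder_from_sd(sd)

    waveform = torch.randn(1, 2, 48000)  # (batch, channels, samples): 1 s of stereo at 48 kHz
    out = encoder.encode_audio(waveform, sample_rate=48000)
    print(out["encoded_audio"].shape)  # per-frame features; all layers under "encoded_audio_all_layers"
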

comfy/audio_encoders/wav2vec2.py

Lines changed: 207 additions & 0 deletions
@@ -0,0 +1,207 @@
+import torch
+import torch.nn as nn
+from comfy.ldm.modules.attention import optimized_attention_masked
+
+
+class LayerNormConv(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride, bias=False, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.conv = operations.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, bias=bias, device=device, dtype=dtype)
+        self.layer_norm = operations.LayerNorm(out_channels, elementwise_affine=True, device=device, dtype=dtype)
+
+    def forward(self, x):
+        x = self.conv(x)
+        return torch.nn.functional.gelu(self.layer_norm(x.transpose(-2, -1)).transpose(-2, -1))
+
+
+class ConvFeatureEncoder(nn.Module):
+    def __init__(self, conv_dim, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.conv_layers = nn.ModuleList([
+            LayerNormConv(1, conv_dim, kernel_size=10, stride=5, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+            LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations),
+        ])
+
+    def forward(self, x):
+        x = x.unsqueeze(1)
+
+        for conv in self.conv_layers:
+            x = conv(x)
+
+        return x.transpose(1, 2)
+
+
+class FeatureProjection(nn.Module):
+    def __init__(self, conv_dim, embed_dim, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.layer_norm = operations.LayerNorm(conv_dim, eps=1e-05, device=device, dtype=dtype)
+        self.projection = operations.Linear(conv_dim, embed_dim, device=device, dtype=dtype)
+
+    def forward(self, x):
+        x = self.layer_norm(x)
+        x = self.projection(x)
+        return x
+
+
+class PositionalConvEmbedding(nn.Module):
+    def __init__(self, embed_dim=768, kernel_size=128, groups=16):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            embed_dim,
+            embed_dim,
+            kernel_size=kernel_size,
+            padding=kernel_size // 2,
+            groups=groups,
+        )
+        self.conv = torch.nn.utils.parametrizations.weight_norm(self.conv, name="weight", dim=2)
+        self.activation = nn.GELU()
+
+    def forward(self, x):
+        x = x.transpose(1, 2)
+        x = self.conv(x)[:, :, :-1]
+        x = self.activation(x)
+        x = x.transpose(1, 2)
+        return x
+
+
+class TransformerEncoder(nn.Module):
+    def __init__(
+        self,
+        embed_dim=768,
+        num_heads=12,
+        num_layers=12,
+        mlp_ratio=4.0,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+
+        self.pos_conv_embed = PositionalConvEmbedding(embed_dim=embed_dim)
+        self.layers = nn.ModuleList([
+            TransformerEncoderLayer(
+                embed_dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                device=device, dtype=dtype, operations=operations
+            )
+            for _ in range(num_layers)
+        ])
+
+        self.layer_norm = operations.LayerNorm(embed_dim, eps=1e-05, device=device, dtype=dtype)
+
+    def forward(self, x, mask=None):
+        x = x + self.pos_conv_embed(x)
+        all_x = ()
+        for layer in self.layers:
+            all_x += (x,)
+            x = layer(x, mask)
+        x = self.layer_norm(x)
+        all_x += (x,)
+        return x, all_x
+
+
+class Attention(nn.Module):
+    def __init__(self, embed_dim, num_heads, bias=True, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+
+        self.k_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
+        self.v_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
+        self.q_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
+        self.out_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
+
+    def forward(self, x, mask=None):
+        assert (mask is None)  # TODO?
+        q = self.q_proj(x)
+        k = self.k_proj(x)
+        v = self.v_proj(x)
+
+        out = optimized_attention_masked(q, k, v, self.num_heads)
+        return self.out_proj(out)
+
+
+class FeedForward(nn.Module):
+    def __init__(self, embed_dim, mlp_ratio, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.intermediate_dense = operations.Linear(embed_dim, int(embed_dim * mlp_ratio), device=device, dtype=dtype)
+        self.output_dense = operations.Linear(int(embed_dim * mlp_ratio), embed_dim, device=device, dtype=dtype)
+
+    def forward(self, x):
+        x = self.intermediate_dense(x)
+        x = torch.nn.functional.gelu(x)
+        x = self.output_dense(x)
+        return x
+
+
+class TransformerEncoderLayer(nn.Module):
+    def __init__(
+        self,
+        embed_dim=768,
+        num_heads=12,
+        mlp_ratio=4.0,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+
+        self.attention = Attention(embed_dim, num_heads, device=device, dtype=dtype, operations=operations)
+
+        self.layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype)
+        self.feed_forward = FeedForward(embed_dim, mlp_ratio, device=device, dtype=dtype, operations=operations)
+        self.final_layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype)
+
+    def forward(self, x, mask=None):
+        residual = x
+        x = self.layer_norm(x)
+        x = self.attention(x, mask=mask)
+        x = residual + x
+
+        x = x + self.feed_forward(self.final_layer_norm(x))
+        return x
+
+
+class Wav2Vec2Model(nn.Module):
+    """Complete Wav2Vec 2.0 model."""
+
+    def __init__(
+        self,
+        embed_dim=1024,
+        final_dim=256,
+        num_heads=16,
+        num_layers=24,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+
+        conv_dim = 512
+        self.feature_extractor = ConvFeatureEncoder(conv_dim, device=device, dtype=dtype, operations=operations)
+        self.feature_projection = FeatureProjection(conv_dim, embed_dim, device=device, dtype=dtype, operations=operations)
+
+        self.masked_spec_embed = nn.Parameter(torch.empty(embed_dim, device=device, dtype=dtype))
+
+        self.encoder = TransformerEncoder(
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            num_layers=num_layers,
+            device=device, dtype=dtype, operations=operations
+        )
+
+    def forward(self, x, mask_time_indices=None, return_dict=False):
+
+        x = torch.mean(x, dim=1)
+
+        x = (x - x.mean()) / torch.sqrt(x.var() + 1e-7)
+
+        features = self.feature_extractor(x)
+        features = self.feature_projection(features)
+
+        batch_size, seq_len, _ = features.shape
+
+        x, all_x = self.encoder(features)
+
+        return x, all_x
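
A rough shape check, not from the commit: the seven conv layers downsample by 5*2*2*2*2*2*2 = 320, so one second of 16 kHz audio becomes about 49 feature frames, and the encoder returns the final hidden states plus a 25-tuple (the input to each of the 24 transformer layers and the final normed output). With comfy.ops.manual_cast the weights stay uninitialized, so this sketch only validates geometry:

    import torch
    import comfy.ops
    from comfy.audio_encoders.wav2vec2 import Wav2Vec2Model

    model = Wav2Vec2Model(dtype=torch.float32, device="cpu", operations=comfy.ops.manual_cast)
    audio = torch.randn(1, 1, 16000)  # (batch, channels, samples): 1 s of mono at 16 kHz
    feats, all_layers = model(audio)
    print(feats.shape)      # torch.Size([1, 49, 1024])
    print(len(all_layers))  # 25
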

comfy/controlnet.py

Lines changed: 15 additions & 2 deletions
@@ -37,6 +37,7 @@
 import comfy.cldm.mmdit
 import comfy.ldm.hydit.controlnet
 import comfy.ldm.flux.controlnet
+import comfy.ldm.qwen_image.controlnet
 import comfy.cldm.dit_embedder
 from typing import TYPE_CHECKING, Union
 if TYPE_CHECKING:
@@ -271,11 +272,11 @@ def get_control(self, x_noisy, t, cond, batched_number, transformer_options):
            self.cond_hint = None
            compression_ratio = self.compression_ratio
            if self.vae is not None:
-               compression_ratio *= self.vae.downscale_ratio
+               compression_ratio *= self.vae.spacial_compression_encode()
            else:
                if self.latent_format is not None:
                    raise ValueError("This Controlnet needs a VAE but none was provided, please use a ControlNetApply node with a VAE input and connect it.")
-           self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * compression_ratio, x_noisy.shape[2] * compression_ratio, self.upscale_algorithm, "center")
+           self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[-1] * compression_ratio, x_noisy.shape[-2] * compression_ratio, self.upscale_algorithm, "center")
            self.cond_hint = self.preprocess_image(self.cond_hint)
            if self.vae is not None:
                loaded_models = comfy.model_management.loaded_models(only_currently_used=True)
@@ -625,6 +626,15 @@ def load_controlnet_flux_instantx(sd, model_options={}):
    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
    return control

+def load_controlnet_qwen_instantx(sd, model_options={}):
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options)
+    control_model = comfy.ldm.qwen_image.controlnet.QwenImageControlNetModel(operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
+    control_model = controlnet_load_state_dict(control_model, sd)
+    latent_format = comfy.latent_formats.Wan21()
+    extra_conds = []
+    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
+    return control
+
 def convert_mistoline(sd):
    return comfy.utils.state_dict_prefix_replace(sd, {"single_controlnet_blocks.": "controlnet_single_blocks."})

@@ -698,8 +708,11 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}):
            return load_controlnet_sd35(controlnet_data, model_options=model_options) #Stability sd3.5 format
        else:
            return load_controlnet_mmdit(controlnet_data, model_options=model_options) #SD3 diffusers controlnet
+    elif "transformer_blocks.0.img_mlp.net.0.proj.weight" in controlnet_data:
+        return load_controlnet_qwen_instantx(controlnet_data, model_options=model_options)
    elif "controlnet_x_embedder.weight" in controlnet_data:
        return load_controlnet_flux_instantx(controlnet_data, model_options=model_options)
+
    elif "controlnet_blocks.0.linear.weight" in controlnet_data: #mistoline flux
        return load_controlnet_flux_xlabs_mistoline(convert_mistoline(controlnet_data), mistoline=True, model_options=model_options)
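
The dispatcher change means load_controlnet_state_dict now recognizes Qwen-Image InstantX controlnets by the transformer_blocks.0.img_mlp.net.0.proj.weight key and routes them to the new loader, which reuses the Wan21 latent format. A minimal load sketch (the checkpoint path is a placeholder):

    import comfy.utils
    import comfy.controlnet

    sd = comfy.utils.load_torch_file("qwen_image_controlnet.safetensors")  # placeholder path
    control = comfy.controlnet.load_controlnet_state_dict(sd)  # dispatches to load_controlnet_qwen_instantx
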

comfy/k_diffusion/sampling.py

Lines changed: 15 additions & 0 deletions
@@ -853,6 +853,11 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
     return x


+@torch.no_grad()
+def sample_dpmpp_2m_sde_heun(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='heun'):
+    return sample_dpmpp_2m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type)
+
+
 @torch.no_grad()
 def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
     """DPM-Solver++(3M) SDE."""
@@ -925,6 +930,16 @@ def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di
     return sample_dpmpp_3m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler)


+@torch.no_grad()
+def sample_dpmpp_2m_sde_heun_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='heun'):
+    if len(sigmas) <= 1:
+        return x
+    extra_args = {} if extra_args is None else extra_args
+    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
+    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
+    return sample_dpmpp_2m_sde_heun(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type)
+
+
 @torch.no_grad()
 def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'):
     if len(sigmas) <= 1:
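
Both additions are thin wrappers: sample_dpmpp_2m_sde_heun just forwards to sample_dpmpp_2m_sde with solver_type='heun', and the _gpu variant only swaps in a GPU-backed BrownianTreeNoiseSampler (cpu=False). A direct-call sketch in the k-diffusion convention (the identity denoiser and the sigma schedule are stand-ins, not part of the commit):

    import torch
    from comfy.k_diffusion import sampling

    def denoiser(x, sigma, **extra):
        return x  # identity stand-in for a real denoising model wrapper

    sigmas = sampling.get_sigmas_karras(n=20, sigma_min=0.03, sigma_max=14.6, device="cpu")
    x = torch.randn(1, 4, 64, 64) * sigmas[0]
    out = sampling.sample_dpmpp_2m_sde_heun(denoiser, x, sigmas)
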
