Changes from all commits (37 commits)
254622d  init  (yousef-rafat, Sep 5, 2025)
1cff9b8  Merge branch 'master' into yousef-higgsv2  (yousef-rafat, Sep 5, 2025)
df4b6a2  removed test files  (yousef-rafat, Sep 5, 2025)
6e9335d  Merge branch 'yousef-higgsv2' of https://github.com/yousef-rafat/Comf…  (yousef-rafat, Sep 5, 2025)
57c15f9  styling fixes  (yousef-rafat, Sep 5, 2025)
f8d4891  additional styling  (yousef-rafat, Sep 5, 2025)
233e441  .  (yousef-rafat, Sep 5, 2025)
6412422  bug fixes + added some features  (yousef-rafat, Sep 8, 2025)
5191fb2  Merge branch 'master' into yousef-higgsv2  (yousef-rafat, Sep 9, 2025)
2ac8999  final  (yousef-rafat, Sep 9, 2025)
fee1e57  Merge branch 'yousef-higgsv2' of https://github.com/yousef-rafat/Comf…  (yousef-rafat, Sep 9, 2025)
12824ea  init  (yousef-rafat, Sep 27, 2025)
a480271  Delete comfy/autoregressive_sampling.py  (yousef-rafat, Sep 27, 2025)
786c386  ...  (yousef-rafat, Sep 27, 2025)
1d24e63  Merge branch 'yousef-hunyuan-foley' of https://github.com/yousef-rafa…  (yousef-rafat, Sep 27, 2025)
c951e8f  .  (yousef-rafat, Sep 27, 2025)
73cdb32  .  (yousef-rafat, Sep 27, 2025)
3773d0d  Merge branch 'master' into yousef-hunyuan-foley  (yousef-rafat, Sep 27, 2025)
aaa3bcc  fixed a small bug  (yousef-rafat, Sep 27, 2025)
f85e1cf  Merge branch 'yousef-hunyuan-foley' of https://github.com/yousef-rafa…  (yousef-rafat, Sep 27, 2025)
8311b15  allowed returning frames  (yousef-rafat, Sep 27, 2025)
2ceb9f0  added clap tokenizer  (yousef-rafat, Sep 28, 2025)
a6dabd2  fixed clap location  (yousef-rafat, Sep 28, 2025)
42a265c  fixed multiple errors in nodes and model loading  (yousef-rafat, Sep 29, 2025)
ab01ace  removed additional code in video_types  (yousef-rafat, Sep 29, 2025)
cc3a138  some fixes in model loading and nodes  (yousef-rafat, Sep 30, 2025)
4241f10  clip vision base support + small fixes  (yousef-rafat, Oct 1, 2025)
663d971  work on the conditioning  (yousef-rafat, Oct 3, 2025)
4b6c081  large optimizations and some fixes  (yousef-rafat, Oct 4, 2025)
95d2aae  syncformer fix + some fixes  (yousef-rafat, Oct 6, 2025)
220c65d  fixed the syncform logic + condition-related fixes  (yousef-rafat, Oct 6, 2025)
4c782e3  fixes to make the model work  (yousef-rafat, Oct 8, 2025)
e684ff2  a lot of fixes + siglip2_base support  (yousef-rafat, Oct 10, 2025)
89fc51f  small bug fixes  (yousef-rafat, Oct 10, 2025)
4908e74  bug fixes for siglip2 to work  (yousef-rafat, Oct 11, 2025)
4653b90  final changes  (yousef-rafat, Oct 13, 2025)
25f7bbe  ruff check  (yousef-rafat, Oct 13, 2025)
35 changes: 33 additions & 2 deletions comfy/clip_model.py
@@ -1,7 +1,28 @@
 import torch
-from comfy.ldm.modules.attention import optimized_attention_for_device
+from comfy.ldm.modules.attention import optimized_attention_for_device, MultiheadAttentionComfyv
 import comfy.ops
 
+class SiglipMultiheadAttentionPoolingHead(torch.nn.Module):
+    def __init__(self, hidden_size, num_attention_heads, layer_norm_eps, intermediate_size, activation, device=None, dtype=None, operations=None):
+        super().__init__()
+
+        self.probe = torch.nn.Parameter(torch.randn(1, 1, hidden_size, device=device, dtype=dtype))
+        self.attention = MultiheadAttentionComfyv(hidden_size, num_attention_heads, batch_first=True, device=device, dtype=dtype, operations=operations)
+        self.layernorm = operations.LayerNorm(hidden_size, eps=layer_norm_eps, device=device, dtype=dtype)
+        self.mlp = CLIPMLP(hidden_size, intermediate_size, activation = activation, device=device, dtype=dtype, operations=operations)
+
+    def forward(self, hidden_state):
+        batch_size = hidden_state.shape[0]
+        probe = self.probe.repeat(batch_size, 1, 1)
+
+        hidden_state = self.attention(probe, hidden_state, hidden_state)
+
+        residual = hidden_state
+        hidden_state = self.layernorm(hidden_state)
+        hidden_state = residual + self.mlp(hidden_state)
+
+        return hidden_state[:, 0]
+
 class CLIPAttention(torch.nn.Module):
     def __init__(self, embed_dim, heads, dtype, device, operations):
         super().__init__()
@@ -198,6 +219,8 @@ def __init__(self, config_dict, dtype, device, operations):
         intermediate_size = config_dict["intermediate_size"]
         intermediate_activation = config_dict["hidden_act"]
         model_type = config_dict["model_type"]
+        use_head = config_dict.get("use_head", False)
+        layer_norm_eps = config_dict.get("layer_norm_eps", 1e-6)
 
         self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], model_type=model_type, dtype=dtype, device=device, operations=operations)
         if model_type == "siglip_vision_model":
@@ -208,6 +231,11 @@ def __init__(self, config_dict, dtype, device, operations):
             self.output_layernorm = False
         self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
         self.post_layernorm = operations.LayerNorm(embed_dim)
+        self.use_head = use_head
+        if use_head:
+            self.head = SiglipMultiheadAttentionPoolingHead(
+                hidden_size=embed_dim, num_attention_heads=heads, layer_norm_eps=layer_norm_eps, intermediate_size=intermediate_size, activation=intermediate_activation, device=device, dtype=dtype, operations=operations
+            )
 
     def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
         x = self.embeddings(pixel_values)
@@ -216,7 +244,10 @@ def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
         x, i = self.encoder(x, mask=None, intermediate_output=intermediate_output)
         if self.output_layernorm:
             x = self.post_layernorm(x)
-            pooled_output = x
+            if self.use_head:
+                pooled_output = self.head(x)
+            else:
+                pooled_output = x
         else:
             pooled_output = self.post_layernorm(x[:, 0, :])
         return x, i, pooled_output
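Reviewer note: for readers unfamiliar with SigLIP-style attention pooling, below is a minimal standalone sketch of the pattern the new SiglipMultiheadAttentionPoolingHead implements: a learned probe token cross-attends over all patch tokens, and the first output position becomes the pooled embedding. It uses plain torch.nn modules instead of ComfyUI's operations / MultiheadAttentionComfyv wrappers, so it illustrates the idea rather than reproducing the PR's exact code; the default sizes (768 hidden, 12 heads) are assumptions taken from the siglip2 base config added later in this PR.

import torch

class AttentionPoolSketch(torch.nn.Module):
    def __init__(self, hidden_size=768, num_heads=12, mlp_ratio=4, eps=1e-6):
        super().__init__()
        # Learned query ("probe") that attends over the patch tokens.
        self.probe = torch.nn.Parameter(torch.randn(1, 1, hidden_size))
        self.attention = torch.nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.layernorm = torch.nn.LayerNorm(hidden_size, eps=eps)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, hidden_size * mlp_ratio),
            torch.nn.GELU(approximate="tanh"),  # "gelu_pytorch_tanh" in the config
            torch.nn.Linear(hidden_size * mlp_ratio, hidden_size),
        )

    def forward(self, tokens):  # tokens: [batch, num_patches, hidden_size]
        probe = self.probe.repeat(tokens.shape[0], 1, 1)
        pooled, _ = self.attention(probe, tokens, tokens)  # probe is the query, tokens are key/value
        pooled = pooled + self.mlp(self.layernorm(pooled))  # residual MLP block, as in the head above
        return pooled[:, 0]  # [batch, hidden_size]

tokens = torch.randn(2, 1024, 768)  # e.g. (512 // 16) ** 2 = 1024 patch tokens
print(AttentionPoolSketch()(tokens).shape)  # torch.Size([2, 768])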
17 changes: 11 additions & 6 deletions comfy/clip_vision.py
@@ -17,8 +17,10 @@ def __getitem__(self, key):
     def __setitem__(self, key, item):
         setattr(self, key, item)
 
-def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True):
+def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True, resize_mode="bicubic"):
     image = image[:, :, :, :3] if image.shape[3] > 3 else image
+    if image.dtype == torch.uint8:
+        image = image.float() / 255.0
     mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
     std = torch.tensor(std, device=image.device, dtype=image.dtype)
     image = image.movedim(-1, 1)
@@ -29,7 +31,7 @@ def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], s
     else:
         scale_size = (size, size)
 
-    image = torch.nn.functional.interpolate(image, size=scale_size, mode="bicubic", antialias=True)
+    image = torch.nn.functional.interpolate(image, size=scale_size, mode=resize_mode, antialias=True)
     h = (image.shape[2] - size)//2
     w = (image.shape[3] - size)//2
     image = image[:,:,h:h+size,w:w+size]
@@ -71,9 +73,9 @@ def load_sd(self, sd):
     def get_sd(self):
         return self.model.state_dict()
 
-    def encode_image(self, image, crop=True):
+    def encode_image(self, image, crop=True, resize_mode = "bicubic"):
         comfy.model_management.load_model_gpu(self.patcher)
-        pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float()
+        pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop, resize_mode=resize_mode).float()
         out = self.model(pixel_values=pixel_values, intermediate_output='all' if self.return_all_hidden_states else -2)
 
         outputs = Output()
@@ -122,9 +124,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_g.json")
     elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
-    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
+    elif "vision_model.encoder.layers.11.layer_norm1.weight" in sd:
         embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
-        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
+        norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]
+        if norm_weight == 1152:
             if embed_shape == 729:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
             elif embed_shape == 1024:
@@ -134,6 +137,8 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
             else:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
+        elif embed_shape == 1024 and norm_weight == 768:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip2_base_512.json")
         else:
             json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
 
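Reviewer note: a hedged usage sketch of the new resize_mode plumbing from a workflow's point of view. The encode_image and clip_preprocess signatures are the ones shown in this diff; the use of comfy.clip_vision.load() mirrors how the CLIPVisionLoader node constructs a ClipVisionModel elsewhere in ComfyUI, and the checkpoint path is hypothetical.

import torch
import comfy.clip_vision

# Hypothetical checkpoint path; any supported CLIP/SigLIP vision checkpoint would do.
clip_vision = comfy.clip_vision.load("models/clip_vision/sigclip_vision_patch14_384.safetensors")

# ComfyUI IMAGE layout: [batch, height, width, channels] with float values in 0..1.
image = torch.rand(1, 512, 512, 3)

# Default behaviour is unchanged (bicubic); other torch.nn.functional.interpolate
# modes such as "bilinear" can now be passed through to clip_preprocess.
out_default = clip_vision.encode_image(image)
out_bilinear = clip_vision.encode_image(image, crop=True, resize_mode="bilinear")
# Each call returns ComfyUI's Output wrapper holding the pooled/hidden-state tensors.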
15 changes: 15 additions & 0 deletions comfy/clip_vision_siglip2_base_512.json
@@ -0,0 +1,15 @@
+{
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 768,
+    "image_size": 512,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 12,
+    "num_channels": 3,
+    "num_hidden_layers": 12,
+    "patch_size": 16,
+    "image_mean": [0.5, 0.5, 0.5],
+    "image_std": [0.5, 0.5, 0.5],
+    "use_head": true
+}
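Reviewer note: a quick sanity check (not part of the PR) of why the loader branch added in clip_vision.py above resolves to this file. A SigLIP2-base-512 checkpoint has 12 encoder layers, so layer index 11 exists while 22 does not, and its tensor shapes match the two values tested there; the geometry below is assumed from the config above.

# Assumed SigLIP2-base-512 geometry, taken from the config above.
image_size, patch_size, hidden_size, num_layers = 512, 16, 768, 12

embed_shape = (image_size // patch_size) ** 2  # position_embedding.weight.shape[0] -> 1024
norm_weight = hidden_size                      # layers.0.layer_norm1.weight.shape[0] -> 768

# Routes to: elif embed_shape == 1024 and norm_weight == 768 -> clip_vision_siglip2_base_512.json
assert (embed_shape, norm_weight) == (1024, 768)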
4 changes: 4 additions & 0 deletions comfy/latent_formats.py
@@ -630,6 +630,10 @@ class ACEAudio(LatentFormat):
     latent_channels = 8
     latent_dimensions = 2
 
+class HunyuanFoley(LatentFormat):
+    latent_dimensions = 128
+    latent_channels = 1024
+
 class ChromaRadiance(LatentFormat):
     latent_channels = 3
 