Changes from all commits (37 commits)
254622d  init  (yousef-rafat, Sep 5, 2025)
1cff9b8  Merge branch 'master' into yousef-higgsv2  (yousef-rafat, Sep 5, 2025)
df4b6a2  removed test files  (yousef-rafat, Sep 5, 2025)
6e9335d  Merge branch 'yousef-higgsv2' of https://github.com/yousef-rafat/Comf…  (yousef-rafat, Sep 5, 2025)
57c15f9  styling fixes  (yousef-rafat, Sep 5, 2025)
f8d4891  additional styling  (yousef-rafat, Sep 5, 2025)
233e441  .  (yousef-rafat, Sep 5, 2025)
6412422  bug fixes + added some features  (yousef-rafat, Sep 8, 2025)
5191fb2  Merge branch 'master' into yousef-higgsv2  (yousef-rafat, Sep 9, 2025)
2ac8999  final  (yousef-rafat, Sep 9, 2025)
fee1e57  Merge branch 'yousef-higgsv2' of https://github.com/yousef-rafat/Comf…  (yousef-rafat, Sep 9, 2025)
12824ea  init  (yousef-rafat, Sep 27, 2025)
a480271  Delete comfy/autoregressive_sampling.py  (yousef-rafat, Sep 27, 2025)
786c386  ...  (yousef-rafat, Sep 27, 2025)
1d24e63  Merge branch 'yousef-hunyuan-foley' of https://github.com/yousef-rafa…  (yousef-rafat, Sep 27, 2025)
c951e8f  .  (yousef-rafat, Sep 27, 2025)
73cdb32  .  (yousef-rafat, Sep 27, 2025)
3773d0d  Merge branch 'master' into yousef-hunyuan-foley  (yousef-rafat, Sep 27, 2025)
aaa3bcc  fixed a small bug  (yousef-rafat, Sep 27, 2025)
f85e1cf  Merge branch 'yousef-hunyuan-foley' of https://github.com/yousef-rafa…  (yousef-rafat, Sep 27, 2025)
8311b15  allowed returning frames  (yousef-rafat, Sep 27, 2025)
2ceb9f0  added clap tokenizer  (yousef-rafat, Sep 28, 2025)
a6dabd2  fixed clap location  (yousef-rafat, Sep 28, 2025)
42a265c  fixed multiple errors in nodes and model loading  (yousef-rafat, Sep 29, 2025)
ab01ace  removed additional code in video_types  (yousef-rafat, Sep 29, 2025)
cc3a138  some fixes in model loading and nodes  (yousef-rafat, Sep 30, 2025)
4241f10  clip vision base support + small fixes  (yousef-rafat, Oct 1, 2025)
663d971  work on the conditioning  (yousef-rafat, Oct 3, 2025)
4b6c081  large optimizations and some fixes  (yousef-rafat, Oct 4, 2025)
95d2aae  syncformer fix + some fixes  (yousef-rafat, Oct 6, 2025)
220c65d  fixed the syncform logic + condition-related fixes  (yousef-rafat, Oct 6, 2025)
4c782e3  fixes to make the model work  (yousef-rafat, Oct 8, 2025)
e684ff2  a lot of fixes + siglip2_base support  (yousef-rafat, Oct 10, 2025)
89fc51f  small bug fixes  (yousef-rafat, Oct 10, 2025)
4908e74  bug fixes for siglip2 to work  (yousef-rafat, Oct 11, 2025)
4653b90  final changes  (yousef-rafat, Oct 13, 2025)
25f7bbe  ruff check  (yousef-rafat, Oct 13, 2025)
35 changes: 33 additions & 2 deletions comfy/clip_model.py
@@ -1,7 +1,28 @@
 import torch
-from comfy.ldm.modules.attention import optimized_attention_for_device
+from comfy.ldm.modules.attention import optimized_attention_for_device, MultiheadAttentionComfyv
 import comfy.ops
 
+class SiglipMultiheadAttentionPoolingHead(torch.nn.Module):
+    def __init__(self, hidden_size, num_attention_heads, layer_norm_eps, intermediate_size, activation, device=None, dtype=None, operations=None):
+        super().__init__()
+
+        self.probe = torch.nn.Parameter(torch.randn(1, 1, hidden_size, device=device, dtype=dtype))
+        self.attention = MultiheadAttentionComfyv(hidden_size, num_attention_heads, batch_first=True, device=device, dtype=dtype, operations=operations)
+        self.layernorm = operations.LayerNorm(hidden_size, eps=layer_norm_eps, device=device, dtype=dtype)
+        self.mlp = CLIPMLP(hidden_size, intermediate_size, activation = activation, device=device, dtype=dtype, operations=operations)
+
+    def forward(self, hidden_state):
+        batch_size = hidden_state.shape[0]
+        probe = self.probe.repeat(batch_size, 1, 1)
+
+        hidden_state = self.attention(probe, hidden_state, hidden_state)
+
+        residual = hidden_state
+        hidden_state = self.layernorm(hidden_state)
+        hidden_state = residual + self.mlp(hidden_state)
+
+        return hidden_state[:, 0]
+
 class CLIPAttention(torch.nn.Module):
     def __init__(self, embed_dim, heads, dtype, device, operations):
         super().__init__()
@@ -198,6 +219,8 @@ def __init__(self, config_dict, dtype, device, operations):
         intermediate_size = config_dict["intermediate_size"]
         intermediate_activation = config_dict["hidden_act"]
         model_type = config_dict["model_type"]
+        use_head = config_dict.get("use_head", False)
+        layer_norm_eps = config_dict.get("layer_norm_eps", 1e-6)
 
         self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], model_type=model_type, dtype=dtype, device=device, operations=operations)
         if model_type == "siglip_vision_model":
@@ -208,6 +231,11 @@ def __init__(self, config_dict, dtype, device, operations):
             self.output_layernorm = False
         self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
         self.post_layernorm = operations.LayerNorm(embed_dim)
+        self.use_head = use_head
+        if use_head:
+            self.head = SiglipMultiheadAttentionPoolingHead(
+                hidden_size=embed_dim, num_attention_heads=heads, layer_norm_eps=layer_norm_eps, intermediate_size=intermediate_size, activation=intermediate_activation, device=device, dtype=dtype, operations=operations
+            )
 
     def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
         x = self.embeddings(pixel_values)
@@ -216,7 +244,10 @@ def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
         x, i = self.encoder(x, mask=None, intermediate_output=intermediate_output)
         if self.output_layernorm:
             x = self.post_layernorm(x)
-            pooled_output = x
+            if self.use_head:
+                pooled_output = self.head(x)
+            else:
+                pooled_output = x
         else:
             pooled_output = self.post_layernorm(x[:, 0, :])
         return x, i, pooled_output
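Reviewer note: for readers unfamiliar with SigLIP-style attention pooling, below is a minimal standalone sketch of the pattern the new SiglipMultiheadAttentionPoolingHead implements: a learned probe token cross-attends over all patch tokens, and the first output position becomes the pooled embedding. It uses plain torch.nn modules instead of ComfyUI's operations / MultiheadAttentionComfyv wrappers, so it illustrates the idea rather than reproducing the PR's exact code; the default sizes (768 hidden, 12 heads) are assumptions taken from the siglip2 base config added later in this PR.

import torch

class AttentionPoolSketch(torch.nn.Module):
    def __init__(self, hidden_size=768, num_heads=12, mlp_ratio=4, eps=1e-6):
        super().__init__()
        # Learned query ("probe") that attends over the patch tokens.
        self.probe = torch.nn.Parameter(torch.randn(1, 1, hidden_size))
        self.attention = torch.nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.layernorm = torch.nn.LayerNorm(hidden_size, eps=eps)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, hidden_size * mlp_ratio),
            torch.nn.GELU(approximate="tanh"),  # "gelu_pytorch_tanh" in the config
            torch.nn.Linear(hidden_size * mlp_ratio, hidden_size),
        )

    def forward(self, tokens):  # tokens: [batch, num_patches, hidden_size]
        probe = self.probe.repeat(tokens.shape[0], 1, 1)
        pooled, _ = self.attention(probe, tokens, tokens)  # probe is the query, tokens are key/value
        pooled = pooled + self.mlp(self.layernorm(pooled))  # residual MLP block, as in the head above
        return pooled[:, 0]  # [batch, hidden_size]

tokens = torch.randn(2, 1024, 768)  # e.g. (512 // 16) ** 2 = 1024 patch tokens
print(AttentionPoolSketch()(tokens).shape)  # torch.Size([2, 768])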
17 changes: 11 additions & 6 deletions comfy/clip_vision.py
@@ -17,8 +17,10 @@ def __getitem__(self, key):
     def __setitem__(self, key, item):
         setattr(self, key, item)
 
-def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True):
+def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True, resize_mode="bicubic"):
     image = image[:, :, :, :3] if image.shape[3] > 3 else image
+    if image.dtype == torch.uint8:
+        image = image.float() / 255.0
     mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
     std = torch.tensor(std, device=image.device, dtype=image.dtype)
     image = image.movedim(-1, 1)
@@ -29,7 +31,7 @@ def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], s
     else:
         scale_size = (size, size)
 
-    image = torch.nn.functional.interpolate(image, size=scale_size, mode="bicubic", antialias=True)
+    image = torch.nn.functional.interpolate(image, size=scale_size, mode=resize_mode, antialias=True)
     h = (image.shape[2] - size)//2
     w = (image.shape[3] - size)//2
     image = image[:,:,h:h+size,w:w+size]
@@ -71,9 +73,9 @@ def load_sd(self, sd):
     def get_sd(self):
         return self.model.state_dict()
 
-    def encode_image(self, image, crop=True):
+    def encode_image(self, image, crop=True, resize_mode = "bicubic"):
         comfy.model_management.load_model_gpu(self.patcher)
-        pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float()
+        pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop, resize_mode=resize_mode).float()
         out = self.model(pixel_values=pixel_values, intermediate_output='all' if self.return_all_hidden_states else -2)
 
         outputs = Output()
@@ -122,9 +124,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_g.json")
     elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
-    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
+    elif "vision_model.encoder.layers.11.layer_norm1.weight" in sd:
         embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
-        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
+        norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]
+        if norm_weight == 1152:
             if embed_shape == 729:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
             elif embed_shape == 1024:
@@ -134,6 +137,8 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
             else:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
+        elif embed_shape == 1024 and norm_weight == 768:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip2_base_512.json")
         else:
             json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
 
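Reviewer note: a hedged usage sketch of the new resize_mode plumbing from a workflow's point of view. The encode_image and clip_preprocess signatures are the ones shown in this diff; the use of comfy.clip_vision.load() mirrors how the CLIPVisionLoader node constructs a ClipVisionModel elsewhere in ComfyUI, and the checkpoint path is hypothetical.

import torch
import comfy.clip_vision

# Hypothetical checkpoint path; any supported CLIP/SigLIP vision checkpoint would do.
clip_vision = comfy.clip_vision.load("models/clip_vision/sigclip_vision_patch14_384.safetensors")

# ComfyUI IMAGE layout: [batch, height, width, channels] with float values in 0..1.
image = torch.rand(1, 512, 512, 3)

# Default behaviour is unchanged (bicubic); other torch.nn.functional.interpolate
# modes such as "bilinear" can now be passed through to clip_preprocess.
out_default = clip_vision.encode_image(image)
out_bilinear = clip_vision.encode_image(image, crop=True, resize_mode="bilinear")
# Each call returns ComfyUI's Output wrapper holding the pooled/hidden-state tensors.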
15 changes: 15 additions & 0 deletions comfy/clip_vision_siglip2_base_512.json
@@ -0,0 +1,15 @@
+{
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 768,
+    "image_size": 512,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 12,
+    "num_channels": 3,
+    "num_hidden_layers": 12,
+    "patch_size": 16,
+    "image_mean": [0.5, 0.5, 0.5],
+    "image_std": [0.5, 0.5, 0.5],
+    "use_head": true
+}
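Reviewer note: a quick sanity check (not part of the PR) of why the loader branch added in clip_vision.py above resolves to this file. A SigLIP2-base-512 checkpoint has 12 encoder layers, so layer index 11 exists while 22 does not, and its tensor shapes match the two values tested there; the geometry below is assumed from the config above.

# Assumed SigLIP2-base-512 geometry, taken from the config above.
image_size, patch_size, hidden_size, num_layers = 512, 16, 768, 12

embed_shape = (image_size // patch_size) ** 2  # position_embedding.weight.shape[0] -> 1024
norm_weight = hidden_size                      # layers.0.layer_norm1.weight.shape[0] -> 768

# Routes to: elif embed_shape == 1024 and norm_weight == 768 -> clip_vision_siglip2_base_512.json
assert (embed_shape, norm_weight) == (1024, 768)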
4 changes: 4 additions & 0 deletions comfy/latent_formats.py
@@ -630,6 +630,10 @@ class ACEAudio(LatentFormat):
     latent_channels = 8
     latent_dimensions = 2
 
+class HunyuanFoley(LatentFormat):
+    latent_dimensions = 128
+    latent_channels = 1024
+
 class ChromaRadiance(LatentFormat):
     latent_channels = 3
 