
Commit f71feac

Merge branch 'master' into attention-select
2 parents 66c4eb0 + 18de0b2

75 files changed: +7117 / -1294 lines changed

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+name: Execution Tests
+
+on:
+  push:
+    branches: [ main, master ]
+  pull_request:
+    branches: [ main, master ]
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+    runs-on: ${{ matrix.os }}
+    continue-on-error: true
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.12'
+    - name: Install requirements
+      run: |
+        python -m pip install --upgrade pip
+        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+        pip install -r requirements.txt
+        pip install -r tests-unit/requirements.txt
+    - name: Run Execution Tests
+      run: |
+        python -m pytest tests/execution -v --skip-timing-checks
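
The workflow's pytest invocation passes --skip-timing-checks, which is not a built-in pytest flag; it has to be registered by the test suite itself. Below is a minimal sketch of how such a flag is typically wired up in a conftest.py; the option name is taken from the command above, but the fixture name and the repository's actual conftest contents are assumptions, not part of this commit.

# conftest.py (illustrative sketch, not the repository's actual file)
import pytest

def pytest_addoption(parser):
    # Registers the custom flag so "pytest --skip-timing-checks" is accepted.
    parser.addoption("--skip-timing-checks", action="store_true", default=False,
                     help="Skip assertions that depend on wall-clock timing.")

@pytest.fixture
def skip_timing_checks(request):
    # Tests can depend on this fixture and branch on the flag's value.
    return request.config.getoption("--skip-timing-checks")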

comfy/cli_args.py

Lines changed: 2 additions & 1 deletion
@@ -143,8 +143,9 @@ class PerformanceFeature(enum.Enum):
     Fp16Accumulation = "fp16_accumulation"
     Fp8MatrixMultiplication = "fp8_matrix_mult"
     CublasOps = "cublas_ops"
+    AutoTune = "autotune"

-parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops")
+parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature))))

 parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
 parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.")

comfy/clip_model.py

Lines changed: 11 additions & 1 deletion
@@ -61,15 +61,25 @@ def __init__(self, num_layers, embed_dim, heads, intermediate_size, intermediate
     def forward(self, x, mask=None, intermediate_output=None):
         optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True)

+        all_intermediate = None
         if intermediate_output is not None:
-            if intermediate_output < 0:
+            if intermediate_output == "all":
+                all_intermediate = []
+                intermediate_output = None
+            elif intermediate_output < 0:
                 intermediate_output = len(self.layers) + intermediate_output

         intermediate = None
         for i, l in enumerate(self.layers):
             x = l(x, mask, optimized_attention)
             if i == intermediate_output:
                 intermediate = x.clone()
+            if all_intermediate is not None:
+                all_intermediate.append(x.unsqueeze(1).clone())
+
+        if all_intermediate is not None:
+            intermediate = torch.cat(all_intermediate, dim=1)
+
         return x, intermediate

 class CLIPEmbeddings(torch.nn.Module):
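
With intermediate_output == "all", every layer output is unsqueezed to add a layer axis and the results are concatenated, so the second return value becomes a [batch, num_layers, tokens, dim] tensor instead of a single layer's activations. A toy shape illustration (the loop stands in for the real CLIP layers, it is not part of the commit):

import torch

batch, tokens, dim, num_layers = 2, 5, 8, 3
x = torch.zeros(batch, tokens, dim)
all_intermediate = []
for _ in range(num_layers):
    x = x + 1                                        # stand-in for a transformer layer
    all_intermediate.append(x.unsqueeze(1).clone())  # each entry is [batch, 1, tokens, dim]
intermediate = torch.cat(all_intermediate, dim=1)
print(intermediate.shape)  # torch.Size([2, 3, 5, 8])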

comfy/clip_vision.py

Lines changed: 20 additions & 4 deletions
@@ -50,7 +50,13 @@ def __init__(self, json_config):
         self.image_size = config.get("image_size", 224)
         self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073])
         self.image_std = config.get("image_std", [0.26862954, 0.26130258, 0.27577711])
-        model_class = IMAGE_ENCODERS.get(config.get("model_type", "clip_vision_model"))
+        model_type = config.get("model_type", "clip_vision_model")
+        model_class = IMAGE_ENCODERS.get(model_type)
+        if model_type == "siglip_vision_model":
+            self.return_all_hidden_states = True
+        else:
+            self.return_all_hidden_states = False
+
         self.load_device = comfy.model_management.text_encoder_device()
         offload_device = comfy.model_management.text_encoder_offload_device()
         self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
@@ -68,12 +74,18 @@ def get_sd(self):
     def encode_image(self, image, crop=True):
         comfy.model_management.load_model_gpu(self.patcher)
         pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float()
-        out = self.model(pixel_values=pixel_values, intermediate_output=-2)
+        out = self.model(pixel_values=pixel_values, intermediate_output='all' if self.return_all_hidden_states else -2)

         outputs = Output()
         outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device())
         outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device())
-        outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device())
+        if self.return_all_hidden_states:
+            all_hs = out[1].to(comfy.model_management.intermediate_device())
+            outputs["penultimate_hidden_states"] = all_hs[:, -2]
+            outputs["all_hidden_states"] = all_hs
+        else:
+            outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device())
+
         outputs["mm_projected"] = out[3]
         return outputs

@@ -124,8 +136,12 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
             json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
         else:
             json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
-    elif "embeddings.patch_embeddings.projection.weight" in sd:
+
+    # Dinov2
+    elif 'encoder.layer.39.layer_scale2.lambda1' in sd:
         json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_giant.json")
+    elif 'encoder.layer.23.layer_scale2.lambda1' in sd:
+        json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_large.json")
     else:
         return None

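When return_all_hidden_states is set (the SigLIP path), out[1] carries the stacked per-layer states, and the familiar penultimate output is just index -2 along the layer axis. A shape-only sketch, with dimensions that are illustrative rather than taken from any specific checkpoint:

import torch

batch, num_layers, tokens, dim = 1, 27, 256, 1152  # illustrative sizes only
all_hs = torch.randn(batch, num_layers, tokens, dim)

penultimate = all_hs[:, -2]   # same tensor the old intermediate_output=-2 path returned
print(penultimate.shape)      # torch.Size([1, 256, 1152])
print(all_hs.shape)           # every layer stays available via outputs["all_hidden_states"]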

comfy/controlnet.py

Lines changed: 13 additions & 3 deletions
@@ -253,7 +253,10 @@ def get_control(self, x_noisy, t, cond, batched_number, transformer_options):
                 to_concat = []
                 for c in self.extra_concat_orig:
                     c = c.to(self.cond_hint.device)
-                    c = comfy.utils.common_upscale(c, self.cond_hint.shape[3], self.cond_hint.shape[2], self.upscale_algorithm, "center")
+                    c = comfy.utils.common_upscale(c, self.cond_hint.shape[-1], self.cond_hint.shape[-2], self.upscale_algorithm, "center")
+                    if c.ndim < self.cond_hint.ndim:
+                        c = c.unsqueeze(2)
+                        c = comfy.utils.repeat_to_batch_size(c, self.cond_hint.shape[2], dim=2)
                     to_concat.append(comfy.utils.repeat_to_batch_size(c, self.cond_hint.shape[0]))
                 self.cond_hint = torch.cat([self.cond_hint] + to_concat, dim=1)

@@ -585,11 +588,18 @@ def load_controlnet_flux_instantx(sd, model_options={}):

 def load_controlnet_qwen_instantx(sd, model_options={}):
     model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options)
-    control_model = comfy.ldm.qwen_image.controlnet.QwenImageControlNetModel(operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
+    control_latent_channels = sd.get("controlnet_x_embedder.weight").shape[1]
+
+    extra_condition_channels = 0
+    concat_mask = False
+    if control_latent_channels == 68: #inpaint controlnet
+        extra_condition_channels = control_latent_channels - 64
+        concat_mask = True
+    control_model = comfy.ldm.qwen_image.controlnet.QwenImageControlNetModel(extra_condition_channels=extra_condition_channels, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
     control_model = controlnet_load_state_dict(control_model, sd)
     latent_format = comfy.latent_formats.Wan21()
     extra_conds = []
-    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
+    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
     return control

 def convert_mistoline(sd):
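
The extra-concat fix handles the case where cond_hint is a 5D video latent while the extra conditioning image is still 4D: the image gains a temporal axis and is repeated along it before concatenation. A rough sketch with plain torch.Tensor.repeat standing in for comfy.utils.repeat_to_batch_size (the shapes are made up for illustration):

import torch

cond_hint = torch.randn(1, 4, 16, 64, 64)  # hypothetical [b, c, t, h, w] video hint
c = torch.randn(1, 4, 64, 64)              # image-only extra concat, [b, c, h, w]

if c.ndim < cond_hint.ndim:
    c = c.unsqueeze(2)                            # -> [1, 4, 1, 64, 64]
    c = c.repeat(1, 1, cond_hint.shape[2], 1, 1)  # tile along the temporal axis

print(torch.cat([cond_hint, c], dim=1).shape)     # torch.Size([1, 8, 16, 64, 64])

The Qwen-Image loader change follows a similar infer-from-weights pattern: a 68-channel controlnet_x_embedder means 64 latent channels plus 4 extra condition channels, so the loader enables the inpaint mask-concat path.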

comfy/image_encoders/dino2.py

Lines changed: 26 additions & 7 deletions
@@ -31,6 +31,20 @@ def __init__(self, dim, dtype, device, operations):
     def forward(self, x):
         return x * comfy.model_management.cast_to_device(self.lambda1, x.device, x.dtype)

+class Dinov2MLP(torch.nn.Module):
+    def __init__(self, hidden_size: int, dtype, device, operations):
+        super().__init__()
+
+        mlp_ratio = 4
+        hidden_features = int(hidden_size * mlp_ratio)
+        self.fc1 = operations.Linear(hidden_size, hidden_features, bias = True, device=device, dtype=dtype)
+        self.fc2 = operations.Linear(hidden_features, hidden_size, bias = True, device=device, dtype=dtype)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.fc1(hidden_state)
+        hidden_state = torch.nn.functional.gelu(hidden_state)
+        hidden_state = self.fc2(hidden_state)
+        return hidden_state

 class SwiGLUFFN(torch.nn.Module):
     def __init__(self, dim, dtype, device, operations):
@@ -50,12 +64,15 @@ def forward(self, x):


 class Dino2Block(torch.nn.Module):
-    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations):
+    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn):
         super().__init__()
         self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations)
         self.layer_scale1 = LayerScale(dim, dtype, device, operations)
         self.layer_scale2 = LayerScale(dim, dtype, device, operations)
-        self.mlp = SwiGLUFFN(dim, dtype, device, operations)
+        if use_swiglu_ffn:
+            self.mlp = SwiGLUFFN(dim, dtype, device, operations)
+        else:
+            self.mlp = Dinov2MLP(dim, dtype, device, operations)
         self.norm1 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
         self.norm2 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)

@@ -66,9 +83,10 @@ def forward(self, x, optimized_attention):


 class Dino2Encoder(torch.nn.Module):
-    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations):
+    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn):
         super().__init__()
-        self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations) for _ in range(num_layers)])
+        self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn)
+                                          for _ in range(num_layers)])

     def forward(self, x, intermediate_output=None):
         optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
@@ -78,8 +96,8 @@ def forward(self, x, intermediate_output=None):
                 intermediate_output = len(self.layer) + intermediate_output

         intermediate = None
-        for i, l in enumerate(self.layer):
-            x = l(x, optimized_attention)
+        for i, layer in enumerate(self.layer):
+            x = layer(x, optimized_attention)
             if i == intermediate_output:
                 intermediate = x.clone()
         return x, intermediate
@@ -128,9 +146,10 @@ def __init__(self, config_dict, dtype, device, operations):
         dim = config_dict["hidden_size"]
         heads = config_dict["num_attention_heads"]
         layer_norm_eps = config_dict["layer_norm_eps"]
+        use_swiglu_ffn = config_dict["use_swiglu_ffn"]

         self.embeddings = Dino2Embeddings(dim, dtype, device, operations)
-        self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations)
+        self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn)
         self.layernorm = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)

     def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
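
The new Dinov2MLP is an ordinary two-layer GELU MLP with a 4x expansion, selected whenever the config sets use_swiglu_ffn to false, while the existing SwiGLUFFN path is kept for checkpoints that use it. A minimal stand-alone equivalent using plain torch.nn layers (ComfyUI's operations wrapper is deliberately omitted, so this is a sketch rather than the repository's class):

import torch

class TinyDinov2MLP(torch.nn.Module):
    def __init__(self, hidden_size: int, mlp_ratio: int = 4):
        super().__init__()
        self.fc1 = torch.nn.Linear(hidden_size, hidden_size * mlp_ratio)
        self.fc2 = torch.nn.Linear(hidden_size * mlp_ratio, hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # fc1 -> GELU -> fc2, mirroring the Dinov2MLP forward pass in the diff above
        return self.fc2(torch.nn.functional.gelu(self.fc1(x)))

x = torch.randn(1, 10, 1024)
print(TinyDinov2MLP(1024)(x).shape)  # torch.Size([1, 10, 1024])
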
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+{
+    "hidden_size": 1024,
+    "use_mask_token": true,
+    "patch_size": 14,
+    "image_size": 518,
+    "num_channels": 3,
+    "num_attention_heads": 16,
+    "initializer_range": 0.02,
+    "attention_probs_dropout_prob": 0.0,
+    "hidden_dropout_prob": 0.0,
+    "hidden_act": "gelu",
+    "mlp_ratio": 4,
+    "model_type": "dinov2",
+    "num_hidden_layers": 24,
+    "layer_norm_eps": 1e-6,
+    "qkv_bias": true,
+    "use_swiglu_ffn": false,
+    "layerscale_value": 1.0,
+    "drop_path_rate": 0.0,
+    "image_mean": [0.485, 0.456, 0.406],
+    "image_std": [0.229, 0.224, 0.225]
+}
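
Judging by the loader change in comfy/clip_vision.py above, this new config is most likely the dino2_large.json file it references: hidden_size 1024 and 24 layers match a DINOv2-large ViT, and "use_swiglu_ffn": false routes its blocks to the plain Dinov2MLP. A small sketch of how the relevant keys feed the model constructor (the file path is assumed from that diff, not confirmed by this page):

import json

# Path inferred from the loader diff; the commit page itself does not show the filename.
with open("comfy/image_encoders/dino2_large.json") as f:
    cfg = json.load(f)

print(cfg["hidden_size"], cfg["num_hidden_layers"], cfg["use_swiglu_ffn"])
# 1024 24 False  -> Dino2Block is built with the plain GELU MLP rather than SwiGLU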
