adding CoCa #256

Closed: wants to merge 123 commits
1189487
initial setup
gpucce Nov 25, 2022
91d01fa
add coca loss
gpucce Nov 27, 2022
efb6540
remove loss from the model
gpucce Nov 27, 2022
669a3a0
fix loss
gpucce Nov 27, 2022
f081dc4
add underscores
gpucce Nov 27, 2022
0b1c895
name changes
gpucce Nov 27, 2022
27369b6
Merge remote-tracking branch 'upstream/main' into add_coca
gpucce Nov 27, 2022
d518dd0
add cross attention to Residual and CustomResidual
gpucce Nov 29, 2022
11bf57c
fix if
gpucce Nov 29, 2022
f3dedf6
add transformer 'decoder'
gpucce Nov 29, 2022
50c4726
minor fix
gpucce Nov 29, 2022
1e41d83
looks better
gpucce Nov 29, 2022
0d91609
initialize coca model structure
gpucce Nov 29, 2022
50e0cbe
clean
gpucce Nov 29, 2022
93b4236
typo and format
gpucce Nov 29, 2022
97e3c0f
checkpoint signature
gpucce Nov 29, 2022
6ae6f8c
adjust multimodal decoder and add CoCaTransformer
gpucce Nov 30, 2022
0975dfe
keep older logic
gpucce Dec 1, 2022
f2265ec
remove chunk
gpucce Dec 1, 2022
9d47f0e
typo
gpucce Dec 1, 2022
6a101ec
fix
gpucce Dec 1, 2022
e259851
make chunk dim explicit
gpucce Dec 1, 2022
7fff61d
adjust cfg names
gpucce Dec 1, 2022
abd132d
add attentionalpooling
gpucce Dec 1, 2022
452d7d2
add attentional pooling to coca
gpucce Dec 1, 2022
43ce18f
small change
gpucce Dec 1, 2022
3f0f012
add cocatransformer variants and AttentionPooling
gpucce Dec 2, 2022
3e745ec
remove older attention pooler
gpucce Dec 2, 2022
4f4d3b7
adapt embed text to coca text transformer
gpucce Dec 2, 2022
42539aa
rm coca layers
gpucce Dec 2, 2022
914a570
rename and remove useless CoCa models
gpucce Dec 3, 2022
6215d4a
make attentionpooler pooler only
gpucce Dec 3, 2022
b97db74
refactor for one transformer only
gpucce Dec 5, 2022
d89f018
coca forward works
Dec 5, 2022
9a8c15d
separate context and n_queries
Dec 5, 2022
c8b9236
add initial coca_base config
Dec 5, 2022
d0f995a
remove config
Dec 5, 2022
5260774
small loss change
Dec 5, 2022
7a2b84e
init training file
Dec 5, 2022
3ef1d17
make variable order right
gpucce Dec 5, 2022
86f47bb
remove print
gpucce Dec 5, 2022
c6834b5
uniform names
gpucce Dec 5, 2022
7489c68
renaming
gpucce Dec 5, 2022
59503df
add coca funcs to init
gpucce Dec 5, 2022
504febd
add coca config and exclude from testing
gpucce Dec 5, 2022
72a7e96
add and comment simple test (no trained model)
gpucce Dec 5, 2022
d8a94be
add L2 norm
Dec 6, 2022
d250eac
make L2 same as in clip
Dec 6, 2022
8d9dfa6
remove unused temperature
Dec 6, 2022
1f2578c
type
Dec 6, 2022
d8ff1bd
clean
Dec 6, 2022
fa24047
fix config
Dec 6, 2022
f61f9d5
make rename and move cfg
Dec 6, 2022
4b76187
rename
Dec 6, 2022
b8777fe
temptative add coca to factory
Dec 6, 2022
42aa408
fix config
Dec 6, 2022
1044f36
update config
Dec 6, 2022
dab7d7d
embed contrastive cls token in model
Dec 7, 2022
d0ae683
remove unused arg
Dec 7, 2022
5a40804
import create_loss
Dec 7, 2022
6789438
make factory accept coca
Dec 7, 2022
60865ef
make caption loss distributed
Dec 7, 2022
ac617bf
make loss customizable
Dec 7, 2022
b9c2b25
pass loss through training_epoch
Dec 7, 2022
ccfd1e4
add coca specific params to params
Dec 7, 2022
c1556d4
removed decoder unused parameters
Dec 7, 2022
68d608a
remove unused attributes
Dec 7, 2022
59d4db4
adjust coca_config
Dec 7, 2022
4ee12e1
Merge remote-tracking branch 'upstream/main' into add_coca
Dec 7, 2022
732f15f
fix config and remove unused parameters
Dec 7, 2022
17072c6
remove comment
Dec 7, 2022
74d5e37
remove more comments
gpucce Dec 7, 2022
578aadf
rename attention pooler
Dec 8, 2022
08f43a3
rename TransformerDecoder
Dec 8, 2022
812a8bb
make AttentionalPooler clearer
Dec 8, 2022
f69f4e0
add local loss logic to cocaloss
Dec 8, 2022
3c02aa5
only create loss if train in data
Dec 8, 2022
979cef4
remove wrong file
Dec 8, 2022
2ec204b
fix attentional pooler call
Dec 8, 2022
29c7dfa
not ready for testing
Dec 8, 2022
5a4126b
really not ready for testing
Dec 8, 2022
6e49474
eof line
Dec 8, 2022
288ddf3
Merge remote-tracking branch 'upstream/main' into add_coca
Dec 9, 2022
599d448
uniform names
Dec 9, 2022
d7953da
add possible generative loss to evaluate
Dec 9, 2022
e2042d4
change _build function names
Dec 9, 2022
15c69f8
remove wrong import
Dec 9, 2022
c219381
remove local_loss from captioning loss
Dec 9, 2022
360408e
Merge branch 'main' into add_coca
rom1504 Dec 9, 2022
5c77e4d
indexing error
Dec 9, 2022
3f095a6
finish renaming
Dec 9, 2022
60f35f3
adjust configs
Dec 9, 2022
a53f477
add training test for coca
rom1504 Dec 9, 2022
b3f3d68
simplify captioning loss
Dec 9, 2022
8eb4772
remove hf
Dec 9, 2022
cf0f857
fix evaluate and loss
Dec 9, 2022
d547017
remove print
Dec 9, 2022
75be611
move projection
Dec 9, 2022
356fb7d
add coca vit 32 config
Dec 9, 2022
8008f25
test on new config
Dec 9, 2022
5b54a4b
adjust coca_base config
Dec 9, 2022
292fa6e
Merge branch 'main' into add_coca
rom1504 Dec 10, 2022
720dabf
remove coca from test_inference
gpucce Dec 10, 2022
bcb82c4
maybe fix regression test
gpucce Dec 10, 2022
d0f4947
make logits and labels contiguous
gpucce Dec 10, 2022
39f20e6
simpler logic
gpucce Dec 10, 2022
2dde78d
make contiguous after transpose
gpucce Dec 10, 2022
de4c063
last test
gpucce Dec 10, 2022
00aa464
try fix loss
Dec 12, 2022
b7bea09
Merge remote-tracking branch 'upstream/main' into add_coca
Dec 12, 2022
27bfc7d
CoCa PR: loss fix + rename file
Dec 17, 2022
e694999
wait for feedback on this
Dec 17, 2022
5427b0a
cleanup
Dec 17, 2022
cc6d13f
Merge pull request #1 from iejMac/add_coca2
gpucce Dec 17, 2022
abd7849
CoCa PR: add set_grad_checkpointing + fix checkpoint API
Dec 17, 2022
19300ad
Merge pull request #2 from iejMac/grad_checkpointing
gpucce Dec 17, 2022
919f5a0
CoCa PR: fix eval (which uses encode_x instead of forward)
Dec 18, 2022
5b29ec0
move making space for CLS token into encode_text
Dec 18, 2022
752de0a
revert zs changes + fix
Dec 18, 2022
1360fcd
Merge pull request #3 from iejMac/eval_fix
gpucce Dec 18, 2022
cd91d32
Merge remote-tracking branch 'upstream/main' into add_coca
gpucce Dec 18, 2022
64c33d8
add cls mask for pad ids
Dec 20, 2022
17813eb
simplify encode image
Dec 20, 2022
17 changes: 1 addition & 16 deletions in src/open_clip/coca_model.py

@@ -123,22 +123,7 @@ def set_grad_checkpointing(self, enable=True):
         self.multimodal_decoder.grad_checkpointing = enable

     def encode_image(self, images, normalize=True, return_tokens=False):
-        x = self.visual.conv1(images)  # shape = [*, width, grid, grid]
-        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
-        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
-        x = torch.cat(
-            [
-                self.visual.class_embedding.to(x.dtype)
-                + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
-                x,
-            ],
-            dim=1,
-        )  # shape = [*, grid ** 2 + 1, width]
-        x = x + self.visual.positional_embedding.to(x.dtype)
-        x = self.visual.ln_pre(x)
-        x = x.permute(1, 0, 2)  # NLD -> LND
-        x = self.visual.transformer(x)
-        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.visual(images, output_tokens=True)
         x = self.visual.ln_post(x)
Collaborator comment: this ln_post call makes a big assumption about the API of the visual encoder.
if self.visual.proj is not None:
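The deleted block re-implemented the ViT stem (patch embedding, class token, positional embedding, transformer) inside CoCa; the single added line delegates all of that to the visual tower and only asks for the pre-pool token sequence. The following is a hedged, toy sketch of that delegation pattern, assuming only torch; `ToyVisualTower` and its fields are illustrative stand-ins, not the real open_clip `VisionTransformer`:

```python
import torch
import torch.nn as nn

class ToyVisualTower(nn.Module):
    """Illustrative stand-in for the visual encoder (not the real open_clip class)."""

    def __init__(self, width=8):
        super().__init__()
        self.transformer = nn.Identity()   # placeholder for the transformer blocks
        self.ln_post = nn.LayerNorm(width)

    def forward(self, x, output_tokens=False):
        x = self.transformer(x)            # [batch, seq, width]
        if output_tokens:
            return x                       # raw per-token features: no pooling, no ln_post
        return self.ln_post(x.mean(dim=1)) # default path: pooled embedding

visual = ToyVisualTower()
images_as_tokens = torch.randn(2, 5, 8)    # pretend already-patchified input

# CoCa-side encode_image after the refactor: one delegated call, then ln_post
x = visual(images_as_tokens, output_tokens=True)
x = visual.ln_post(x)
print(x.shape)  # torch.Size([2, 5, 8])
```

Note the caller still applies `ln_post` itself, which is exactly the coupling the review comment above flags.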
5 changes: 4 additions & 1 deletion in src/open_clip/transformer.py

@@ -448,7 +448,7 @@ def init_parameters(self):
     def set_grad_checkpointing(self, enable=True):
         self.transformer.grad_checkpointing = enable

-    def forward(self, x: torch.Tensor):
+    def forward(self, x: torch.Tensor, output_tokens = False):
         x = self.conv1(x)  # shape = [*, width, grid, grid]
         x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
         x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]

@@ -465,6 +465,9 @@ def forward(self, x: torch.Tensor):
         x = self.transformer(x)
         x = x.permute(1, 0, 2)  # LND -> NLD

+        if output_tokens:
+            return x
+
         if self.global_average_pool:
             x = x.mean(dim=1)
Collaborator comment: I'm wondering if this can be done after the ln_post; if yes, then it will make it possible to do the ln_post only here and not in CoCa.

else:
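The reviewer's suggestion can be sketched as follows: move the `output_tokens` early return to after `ln_post`, so the tower owns the final layer norm and CoCa never has to call it. This is a hedged, illustrative sketch (not the open_clip implementation); assuming torch only:

```python
import torch
import torch.nn as nn

width = 8
ln_post = nn.LayerNorm(width)
x = torch.randn(2, 5, width)       # tokens after the transformer, [batch, seq, width]

# Suggested ordering inside the tower's forward():
x = ln_post(x)                     # normalize every token first
output_tokens = True
if output_tokens:
    tokens = x                     # CoCa could consume these directly, no extra ln_post
else:
    tokens = x.mean(dim=1)         # global-average-pool branch

print(tokens.shape)  # torch.Size([2, 5, 8])
```

One caveat worth checking before adopting this: LayerNorm and mean-pooling do not commute in general, so normalizing per token before pooling is not numerically identical to pooling first and normalizing the pooled vector.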