Add coca trained (#307) #308
Changes from 9 commits
@@ -0,0 +1,206 @@ (new file: the CoCa model)
from typing import Optional

import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
from dataclasses import dataclass

from .transformer import (
    LayerNormFp32,
    LayerNorm,
    QuickGELU,
    MultimodalTransformer,
)
from .model import CLIPTextCfg, CLIPVisionCfg, _build_vision_tower, _build_text_tower
from .generation_utils import top_a, top_k, top_p


@dataclass
class MultimodalCfg(CLIPTextCfg):
    mlp_ratio: int = 4
    dim_head: int = 64
    heads: int = 8
    n_queries: int = 256
    attn_pooler_heads: int = 8
    latent_dim: int = 512


class CoCaEncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        self.encoder.set_grad_checkpointing(enable)
        self.decoder.set_grad_checkpointing(enable)


def _build_encoder_decoder_tower(
    embed_dim,
    multimodal_cfg,
    text_cfg,
    quick_gelu: bool = False,
    cast_dtype: Optional[torch.dtype] = None,
):
Reviewer comment: As per the Encoder/Decoder module above, with those split, this can be split to have text(_encoder) + text_decoder.
(A sketch of this kind of split appears after the review discussion at the end of this page.)

    multimodal_cfg = MultimodalCfg(**multimodal_cfg) if isinstance(multimodal_cfg, dict) else multimodal_cfg
    text_cfg = CLIPTextCfg(**text_cfg) if isinstance(text_cfg, dict) else text_cfg

    encoder = _build_text_tower(
        multimodal_cfg.latent_dim,
        text_cfg=text_cfg,
        quick_gelu=quick_gelu,
        cast_dtype=cast_dtype
    )

    vocab_size = (
        encoder.config.vocab_size  # for hf models
        if hasattr(text_cfg, "hf_model_name") and text_cfg.hf_model_name is not None
        else multimodal_cfg.vocab_size
    )

    act_layer = QuickGELU if quick_gelu else nn.GELU
    norm_layer = (
        LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
    )

    decoder = MultimodalTransformer(
        context_length=multimodal_cfg.context_length,
        width=multimodal_cfg.width,
        heads=multimodal_cfg.heads,
        layers=multimodal_cfg.layers,
        ls_init_value=multimodal_cfg.ls_init_value,
        output_dim=embed_dim,
        act_layer=act_layer,
        norm_layer=norm_layer,
    )

    return CoCaEncoderDecoder(encoder, decoder), multimodal_cfg, vocab_size


class CoCa(nn.Module):
    def __init__(
        self,
        embed_dim,
        multimodal_cfg: MultimodalCfg,
        text_cfg: CLIPTextCfg,
        vision_cfg: CLIPVisionCfg,
        quick_gelu: bool = False,
        cast_dtype: Optional[torch.dtype] = None,
        pad_id: int = 0
    ):
        super().__init__()
        multimodal_cfg = MultimodalCfg(**multimodal_cfg) if isinstance(multimodal_cfg, dict) else multimodal_cfg
        text_cfg = CLIPTextCfg(**text_cfg) if isinstance(text_cfg, dict) else text_cfg
        vision_cfg = CLIPVisionCfg(**vision_cfg) if isinstance(vision_cfg, dict) else vision_cfg

        norm_layer = (
            LayerNormFp32
            if cast_dtype in (torch.float16, torch.bfloat16)
            else LayerNorm
        )

        self.text, multimodal_cfg, vocab_size = _build_encoder_decoder_tower(
            embed_dim, multimodal_cfg, text_cfg, quick_gelu, cast_dtype
        )
        self.visual = _build_vision_tower(
            multimodal_cfg.latent_dim, vision_cfg, quick_gelu, cast_dtype
        )

        self.to_logits = nn.Sequential(
            norm_layer(multimodal_cfg.width), nn.Linear(multimodal_cfg.width, vocab_size, bias=False)
        )

Reviewer comment: I'm still not a fan of these being in a nn.Sequential with no names, should either be separate attributes like …

        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
        self.pad_id = pad_id

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        self.visual.set_grad_checkpointing(enable)
        self.text.set_grad_checkpointing(enable)

    def encode_image(self, images, normalize=True, return_tokens=False):
        image_latent, tokens_embs = self.visual(images, output_tokens=True)
        image_latent = F.normalize(image_latent, dim=-1) if normalize else image_latent
        return (image_latent, tokens_embs) if return_tokens else image_latent

    def encode_text(self, text, normalize=True, return_tokens=False):
        text = text[:, :-1]  # make space for CLS token
        text_latent, token_emb = self.text.encoder(text, output_tokens=True)
        text_latent = F.normalize(text_latent, dim=-1) if normalize else text_latent
        return (text_latent, token_emb) if return_tokens else text_latent

    def forward(self, image, text, output_dict=False):
        text_latent, token_embs = self.encode_text(text, return_tokens=True)
        image_latent, image_embs = self.encode_image(image, return_tokens=True)

        # TODO: add assertion to avoid bugs?
        labels = text[:, -token_embs.shape[1]:]

        token_embs = self.text.decoder(image_embs, token_embs)
        logits = self.to_logits(token_embs)
        if output_dict:
            return {
                "image_features": image_latent,
                "text_features": text_latent,
                "logits": logits,
                "labels": labels,
                "logit_scale": self.logit_scale.exp()
            }

        return image_latent, text_latent, logits, labels, self.logit_scale.exp()

    def generate(
        self,
        image,
        text,
        seq_len,
        max_seq_len=77,
        mask_prob=0.0,
        temperature=1.,
        filter_logits_fn=top_k,
        filter_thres=0.9,
        min_p_pow=2.0,
        min_p_ratio=0.02,
    ):
        assert mask_prob < 1, "mask_prob must be smaller than 1."

        was_training = self.training
        num_dims = len(text.shape)

        if num_dims == 1:
            text = text[None, :]

        _, t = text.shape
        self.eval()
        out = text

        for _ in range(seq_len):
            x = out[:, -max_seq_len:]

            # TODO: adjust for dict output
            logits = self(image, x)[2][:, -1]

            if filter_logits_fn in {top_k, top_p}:
                filtered_logits = filter_logits_fn(logits, thres=filter_thres)
                probs = F.softmax(filtered_logits / temperature, dim=-1)

            elif filter_logits_fn is top_a:
                filtered_logits = filter_logits_fn(
                    logits, min_p_pow=min_p_pow, min_p_ratio=min_p_ratio
                )
                probs = F.softmax(filtered_logits / temperature, dim=-1)

            sample = torch.multinomial(probs, 1)

            out = torch.cat((out, sample), dim=-1)

        out = out[:, t:]

        if num_dims == 1:
            out = out.squeeze(0)

        self.train(was_training)
        return out
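
For context, the generate() method above is a plain autoregressive sampling loop: it repeatedly calls forward, takes the logits of the last position, filters them with one of the functions from generation_utils, and samples the next token; the prompt is stripped from the output via `out[:, t:]`. Below is a minimal usage sketch, not part of the diff; `model`, `image`, and `prompt` are assumed to exist already (a CoCa instance, a preprocessed image tensor, and a tensor of prompt token ids).

```python
import torch
from open_clip.generation_utils import top_k  # top_p / top_a also work here

# Assumed to exist already: `model` (a CoCa instance), `image` of shape (1, 3, H, W),
# and `prompt` of shape (1, T) holding token ids. These names are placeholders.
with torch.no_grad():
    generated = model.generate(
        image,
        prompt,
        seq_len=20,              # number of new tokens to sample
        filter_logits_fn=top_k,  # logit filter applied before sampling
        filter_thres=0.9,
        temperature=1.0,
    )
# `generated` holds only the newly sampled tokens; the prompt has been stripped.
```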
@@ -0,0 +1,38 @@ (new file: generation utilities — top_p / top_k / top_a)
from math import ceil
import torch
from torch import nn
import torch.nn.functional as F


def exists(val):
    return val is not None


# nucleus

def top_p(logits, thres = 0.9):
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

    sorted_indices_to_remove = cum_probs > (1 - thres)
    sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
    sorted_indices_to_remove[:, 0] = 0

    sorted_logits[sorted_indices_to_remove] = float('-inf')
    return sorted_logits.scatter(1, sorted_indices, sorted_logits)


# topk

def top_k(logits, thres = 0.9):
    k = ceil((1 - thres) * logits.shape[-1])
    val, ind = torch.topk(logits, k)
    probs = torch.full_like(logits, float('-inf'))
    probs.scatter_(1, ind, val)
    return probs


# top_a

def top_a(logits, min_p_pow=2.0, min_p_ratio=0.02):
    probs = F.softmax(logits, dim=-1)
    limit = torch.pow(torch.max(probs), min_p_pow) * min_p_ratio
    logits[probs < limit] = float('-inf')
    logits[probs >= limit] = 1
    return logits
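
A quick, self-contained look at how these filters behave on random logits (not part of the diff; the batch and vocabulary sizes are arbitrary). Note the inverted `thres` convention: for both top_k and top_p, the kept fraction is roughly `1 - thres`.

```python
import torch
import torch.nn.functional as F

# Assumes the functions above live at this package path.
from open_clip.generation_utils import top_k, top_p, top_a

logits = torch.randn(2, 100)  # (batch, vocab_size) next-token logits

# top_k: keeps ceil((1 - 0.9) * 100) = 10 candidates per row, rest set to -inf.
kept = top_k(logits, thres=0.9)
print((kept > float('-inf')).sum(dim=-1))   # tensor([10, 10])

# top_p: keeps the leading sorted tokens covering ~(1 - thres) of the probability mass
# (plus the token that crosses the boundary, due to the shifted mask).
nucleus = F.softmax(top_p(logits.clone(), thres=0.9), dim=-1)
print((nucleus > 0).sum(dim=-1))            # nucleus size per row

# top_a: thresholds on max(prob) ** min_p_pow * min_p_ratio; it modifies its input
# in place and sets every surviving logit to 1, so pass a clone.
flat = top_a(logits.clone(), min_p_pow=2.0, min_p_ratio=0.02)
```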
Reviewer comment: Is there a reason for this module to exist, versus a text_encoder and text_decoder in the CoCa model? Having modules organized like this (two modules in a class with no forward that exercises them) is atypical and breaks some assumptions for profilers, etc.
Reviewer comment: FYI, checkpoints can easily be remapped, so existing checkpoints are not a reason to avoid making changes right now.
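
To illustrate the remapping point, here is a minimal sketch of renaming state-dict keys, assuming a hypothetical move from the wrapper layout in this PR (text.encoder.* / text.decoder.*) to a flattened layout (text.* / text_decoder.*). The target prefixes and file names are placeholders, not something defined by the PR.

```python
import torch

def remap_coca_state_dict(state_dict):
    # Rename keys saved under the CoCaEncoderDecoder wrapper (text.encoder.* / text.decoder.*)
    # to a hypothetical flattened layout (text.* / text_decoder.*).
    remapped = {}
    for key, value in state_dict.items():
        if key.startswith("text.encoder."):
            key = "text." + key[len("text.encoder."):]
        elif key.startswith("text.decoder."):
            key = "text_decoder." + key[len("text.decoder."):]
        remapped[key] = value
    return remapped

# Usage (paths and key names illustrative):
# checkpoint = torch.load("coca_checkpoint.pt", map_location="cpu")
# model.load_state_dict(remap_coca_state_dict(checkpoint["state_dict"]))
```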
Reviewer comment: More thoughts, to keep it a bit closer to the CLIP model on the encoder side …
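
The comments above converge on flattening the text tower: keep the text encoder CLIP-like, hold the multimodal decoder as its own attribute, and give the output head named sub-modules instead of an anonymous nn.Sequential. Below is a minimal sketch of that layout; it is not code from the PR or from the reviewers. Names such as text_decoder, ln_final, and to_logits are placeholders, and the towers are assumed to come from the existing builders (_build_text_tower, MultimodalTransformer, _build_vision_tower) with the same output_tokens=True interface used in the diff.

```python
import torch
from torch import nn
from torch.nn import functional as F


class CoCaFlat(nn.Module):
    # Sketch only: same pieces as the PR's CoCa, but without the CoCaEncoderDecoder
    # wrapper and without the anonymous nn.Sequential head.
    def __init__(self, text_encoder, text_decoder, visual, width, vocab_size, norm_layer=nn.LayerNorm):
        super().__init__()
        self.visual = visual              # vision tower (unchanged)
        self.text = text_encoder          # unimodal text encoder, CLIP-like
        self.text_decoder = text_decoder  # multimodal decoder
        self.ln_final = norm_layer(width)                           # named, replaces Sequential[0]
        self.to_logits = nn.Linear(width, vocab_size, bias=False)   # named, replaces Sequential[1]

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        # Toggled per tower; no wrapper module needed.
        self.visual.set_grad_checkpointing(enable)
        self.text.set_grad_checkpointing(enable)
        self.text_decoder.set_grad_checkpointing(enable)

    def forward(self, image, text):
        # Same flow as the PR's forward(), just with flattened attributes.
        text_latent, token_embs = self.text(text[:, :-1], output_tokens=True)
        image_latent, image_embs = self.visual(image, output_tokens=True)
        token_embs = self.text_decoder(image_embs, token_embs)
        logits = self.to_logits(self.ln_final(token_embs))
        return F.normalize(image_latent, dim=-1), F.normalize(text_latent, dim=-1), logits
```

With this layout, checkpoint keys shift from text.encoder.* / text.decoder.* to text.* / text_decoder.*, which is exactly the kind of rename the remapping sketch above handles.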