Add TextTextCLIP #323
`src/open_clip/factory.py`

```diff
@@ -10,7 +10,7 @@
 import torch

 from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
-from .model import CLIP, CustomTextCLIP, convert_weights_to_lp, convert_to_custom_text_state_dict,\
+from .model import CLIP, CustomTextCLIP, TextTextCLIP, convert_weights_to_lp, convert_to_custom_text_state_dict,\
     resize_pos_embed, get_cast_dtype
 from .openai import load_openai_model
 from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained, list_pretrained_tags_by_model
```
```diff
@@ -41,7 +41,7 @@ def _rescan_model_configs():
     for cf in config_files:
         with open(cf, 'r') as f:
             model_cfg = json.load(f)
-            if all(a in model_cfg for a in ('embed_dim', 'vision_cfg', 'text_cfg')):
+            if all(a in model_cfg for a in ('embed_dim', 'vision_cfg', 'text_cfg')) or all(a in model_cfg for a in ('embed_dim', 'query_cfg', 'doc_cfg')):
                 _MODEL_CONFIGS[cf.stem] = model_cfg

     _MODEL_CONFIGS = {k: v for k, v in sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0]))}
```
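The updated predicate accepts either config schema. A minimal self-contained check mirroring the condition above (sub-config contents elided for brevity):

```python
# Both minimal config shapes now pass the registration check
# in _rescan_model_configs:
image_text_cfg = {'embed_dim': 512, 'vision_cfg': {}, 'text_cfg': {}}
text_text_cfg = {'embed_dim': 512, 'query_cfg': {}, 'doc_cfg': {}}

for cfg in (image_text_cfg, text_text_cfg):
    assert (all(a in cfg for a in ('embed_dim', 'vision_cfg', 'text_cfg'))
            or all(a in cfg for a in ('embed_dim', 'query_cfg', 'doc_cfg')))
```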
```diff
@@ -72,7 +72,11 @@ def get_model_config(model_name):

 def get_tokenizer(model_name):
     config = get_model_config(model_name)
-    tokenizer = HFTokenizer(config['text_cfg']['hf_tokenizer_name']) if 'hf_tokenizer_name' in config['text_cfg'] else tokenize
+    if 'text_cfg' in config.keys():
+        key = 'text_cfg'
+    elif 'query_cfg' in config.keys():
+        key = 'query_cfg'
+    tokenizer = HFTokenizer(config[key]['hf_tokenizer_name']) if 'hf_tokenizer_name' in config[key] else tokenize
     return tokenizer
```
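For illustration, a minimal sketch of what the new branch does, assuming a registered config whose JSON contains `query_cfg`/`doc_cfg` (the model name `roberta-roberta` is hypothetical):

```python
from open_clip import get_tokenizer

# Hypothetical model name; it must match the stem of a JSON file in
# model_configs/ that contains 'query_cfg' and 'doc_cfg'.
tokenizer = get_tokenizer('roberta-roberta')

# Because query_cfg sets 'hf_tokenizer_name': 'roberta-base', this returns
# an HFTokenizer wrapping the roberta-base tokenizer instead of the default
# CLIP BPE `tokenize` function.
```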
```diff
@@ -109,6 +113,7 @@ def create_model(
     pretrained_image: bool = False,
     pretrained_hf: bool = True,
     cache_dir: Optional[str] = None,
+    text_to_text: Optional[bool] = False,
 ):
     model_name = model_name.replace('/', '-')  # for callers using old naming with / in ViT names
     if isinstance(device, str):
```
```diff
@@ -148,13 +153,22 @@
     cast_dtype = get_cast_dtype(precision)
     custom_text = model_cfg.pop('custom_text', False) or force_custom_text or ('hf_model_name' in model_cfg.get('text_cfg', {}))

-    if custom_text:
-        if 'hf_model_name' in model_cfg.get('text_cfg', {}):
-            model_cfg['text_cfg']['hf_model_pretrained'] = pretrained_hf
-        model = CustomTextCLIP(**model_cfg, cast_dtype=cast_dtype)
+    # switch to TextTextCLIP
+    if text_to_text:
+        if 'hf_model_name' in model_cfg.get('doc_cfg', {}):
+            model_cfg['doc_cfg']['hf_model_pretrained'] = pretrained_hf
+        if 'hf_model_name' in model_cfg.get('query_cfg', {}):
+            model_cfg['query_cfg']['hf_model_pretrained'] = pretrained_hf
+
+        model = TextTextCLIP(**model_cfg, cast_dtype=cast_dtype)
     else:
-        model = CLIP(**model_cfg, cast_dtype=cast_dtype)
+        if custom_text:
+            if 'hf_model_name' in model_cfg.get('text_cfg', {}):
+                model_cfg['text_cfg']['hf_model_pretrained'] = pretrained_hf
+            model = CustomTextCLIP(**model_cfg, cast_dtype=cast_dtype)
+        else:
+            model = CLIP(**model_cfg, cast_dtype=cast_dtype)

     pretrained_cfg = {}
     if pretrained:
```
```diff
@@ -179,9 +193,10 @@
     if precision in ("fp16", "bf16"):
         convert_weights_to_lp(model, dtype=torch.bfloat16 if precision == 'bf16' else torch.float16)

-    # set image / mean metadata from pretrained_cfg if available, or use default
-    model.visual.image_mean = pretrained_cfg.get('mean', None) or OPENAI_DATASET_MEAN
-    model.visual.image_std = pretrained_cfg.get('std', None) or OPENAI_DATASET_STD
+    if not text_to_text:
+        # set image / mean metadata from pretrained_cfg if available, or use default
+        model.visual.image_mean = pretrained_cfg.get('mean', None) or OPENAI_DATASET_MEAN
+        model.visual.image_std = pretrained_cfg.get('std', None) or OPENAI_DATASET_STD

     if jit:
         model = torch.jit.script(model)
```
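Putting the pieces together, a minimal sketch of the new construction path (model name hypothetical, as above):

```python
import open_clip

# text_to_text=True routes construction to TextTextCLIP, which has
# query/doc text towers and no model.visual attribute.
model = open_clip.create_model(
    'roberta-roberta',   # hypothetical query_cfg/doc_cfg config name
    precision='fp32',
    device='cpu',
    text_to_text=True,
)
```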
```diff
@@ -203,6 +218,7 @@ def create_model_and_transforms(
     image_mean: Optional[Tuple[float, ...]] = None,
     image_std: Optional[Tuple[float, ...]] = None,
     cache_dir: Optional[str] = None,
+    text_to_text: Optional[bool] = False,
 ):
     model = create_model(
         model_name,
```
```diff
@@ -216,22 +232,27 @@
         pretrained_image=pretrained_image,
         pretrained_hf=pretrained_hf,
         cache_dir=cache_dir,
+        text_to_text=text_to_text,
     )

-    image_mean = image_mean or getattr(model.visual, 'image_mean', None)
-    image_std = image_std or getattr(model.visual, 'image_std', None)
-    preprocess_train = image_transform(
-        model.visual.image_size,
-        is_train=True,
-        mean=image_mean,
-        std=image_std
-    )
-    preprocess_val = image_transform(
-        model.visual.image_size,
-        is_train=False,
-        mean=image_mean,
-        std=image_std
-    )
+    if not text_to_text:
+        image_mean = image_mean or getattr(model.visual, 'image_mean', None)
+        image_std = image_std or getattr(model.visual, 'image_std', None)
+        preprocess_train = image_transform(
+            model.visual.image_size,
+            is_train=True,
+            mean=image_mean,
+            std=image_std
+        )
+        preprocess_val = image_transform(
+            model.visual.image_size,
+            is_train=False,
+            mean=image_mean,
+            std=image_std
+        )
+    else:
+        preprocess_val = None
+        preprocess_train = None

     return model, preprocess_train, preprocess_val
```

Review comment on `if not text_to_text:`: what about checking if `model.visual` exists instead?
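For reference, the reviewer's alternative might look something like this sketch (not part of the PR), keying off the model itself rather than threading a flag through every factory function:

```python
# Sketch of the suggested alternative: detect text-only models by the
# absence of a visual tower rather than by an explicit text_to_text flag.
if getattr(model, 'visual', None) is not None:
    image_mean = image_mean or getattr(model.visual, 'image_mean', None)
    image_std = image_std or getattr(model.visual, 'image_std', None)
    preprocess_train = image_transform(
        model.visual.image_size, is_train=True, mean=image_mean, std=image_std)
    preprocess_val = image_transform(
        model.visual.image_size, is_train=False, mean=image_mean, std=image_std)
else:
    preprocess_train = preprocess_val = None
```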
```diff
@@ -248,6 +269,7 @@ def create_model_from_pretrained(
     image_mean: Optional[Tuple[float, ...]] = None,
     image_std: Optional[Tuple[float, ...]] = None,
     cache_dir: Optional[str] = None,
+    text_to_text: Optional[bool] = False,
 ):
     if not is_pretrained_cfg(model_name, pretrained) and not os.path.exists(pretrained):
         raise RuntimeError(
```
```diff
@@ -263,18 +285,20 @@
         force_quick_gelu=force_quick_gelu,
         force_custom_text=force_custom_text,
         cache_dir=cache_dir,
+        text_to_text=text_to_text,
     )

     if not return_transform:
         return model

-    image_mean = image_mean or getattr(model.visual, 'image_mean', None)
-    image_std = image_std or getattr(model.visual, 'image_std', None)
-    preprocess = image_transform(
-        model.visual.image_size,
-        is_train=False,
-        mean=image_mean,
-        std=image_std
-    )
+    if not text_to_text:
+        image_mean = image_mean or getattr(model.visual, 'image_mean', None)
+        image_std = image_std or getattr(model.visual, 'image_std', None)
+        preprocess = image_transform(
+            model.visual.image_size,
+            is_train=False,
+            mean=image_mean,
+            std=image_std
+        )

     return model, preprocess
```

Review comments on `if not text_to_text:`:
- what about checking if `model.visual` exists instead?
- If we merge …
`src/open_clip/model.py`

```diff
@@ -248,6 +248,47 @@ def forward(self, image, text):
         return image_features, text_features, self.logit_scale.exp()


+class TextTextCLIP(nn.Module):
+    def __init__(
+            self,
+            embed_dim: int,
+            query_cfg: CLIPTextCfg,
+            doc_cfg: CLIPTextCfg,
+            quick_gelu: bool = False,
+            cast_dtype: Optional[torch.dtype] = None,
+    ):
+        super().__init__()
+        self.doc = _build_text_tower(embed_dim, doc_cfg, quick_gelu, cast_dtype)
+        self.query = _build_text_tower(embed_dim, query_cfg, quick_gelu, cast_dtype)
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+
+    def lock_query_tower(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
+        self.query.lock(unlocked_layers, freeze_layer_norm)
+
+    def lock_doc_tower(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
+        self.doc.lock(unlocked_layers, freeze_layer_norm)
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.doc.set_grad_checkpointing(enable)
+        self.query.set_grad_checkpointing(enable)
+
+    def encode_doc(self, text, normalize: bool = False):
+        features = self.doc(text)
+        return F.normalize(features, dim=-1) if normalize else features
+
+    def encode_query(self, text, normalize: bool = False):
+        features = self.query(text)
+        return F.normalize(features, dim=-1) if normalize else features
+
+    def forward(self, query, doc):
+        query_features = self.encode_query(query, normalize=True)
+        doc_features = self.encode_doc(doc, normalize=True)
+        return query_features, doc_features, self.logit_scale.exp()
+
+
 def convert_weights_to_lp(model: nn.Module, dtype=torch.float16):
     """Convert applicable model parameters to low-precision (bf16 or fp16)"""
```

Review comments on `class TextTextCLIP(nn.Module):`:
- this is completely duplicated from the class above, I wonder if we could reconcile it
- Maybe we could use this one as the general model? It's not specific about modality. We can refer to …
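A short usage sketch of the class as it stands (model and tokenizer names hypothetical, as above; the logits follow the usual CLIP contrastive pattern):

```python
import torch
import open_clip

tokenizer = open_clip.get_tokenizer('roberta-roberta')   # hypothetical config name
model = open_clip.create_model('roberta-roberta', text_to_text=True)

queries = tokenizer(["what is contrastive learning?"])
docs = tokenizer([
    "Contrastive learning trains encoders to align paired samples.",
    "A recipe for sourdough bread.",
])

with torch.no_grad():
    q, d, logit_scale = model(queries, docs)   # both outputs L2-normalized
    logits = logit_scale * q @ d.t()           # (n_queries, n_docs) similarities
    probs = logits.softmax(dim=-1)             # query-to-doc matching probabilities
```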
New model config JSON (added under `src/open_clip/model_configs/`; the filename is not shown in this view):

```json
{
    "embed_dim": 512,
    "quick_gelu": true,
    "query_cfg": {
        "hf_model_name": "roberta-base",
        "hf_tokenizer_name": "roberta-base",
        "proj": "mlp",
        "pooler_type": "mean_pooler"
    },
    "doc_cfg": {
        "hf_model_name": "roberta-base",
        "hf_tokenizer_name": "roberta-base",
        "proj": "mlp",
        "pooler_type": "mean_pooler"
    }
}
```
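Note that `_rescan_model_configs` keys configs by file stem, so whatever name this file is given becomes the model name. For example, assuming it were saved as `roberta-roberta.json` (hypothetical):

```python
import open_clip

# In text-to-text mode the image transforms come back as None.
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(
    'roberta-roberta',
    text_to_text=True,
)
assert preprocess_train is None and preprocess_val is None
```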