
Commit 329bc36

add siglip2 and llama3.1-8b
1 parent 71839f1 commit 329bc36

13 files changed (+1721, -2 lines)

llm2clip/eva_clip/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -8,4 +8,5 @@
 from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model,\
     get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
 from .tokenizer import SimpleTokenizer, tokenize
-from .transform import image_transform
+from .transform import image_transform
+from .llm_model import LLM2VecTextTransformer

llm2clip/eva_clip/llm_model.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+import torch
+from torch import nn
+from llm2vec import LLM2Vec
+
+class LLM2VecTextTransformer(nn.Module):
+    def __init__(self, text_proj=None):
+        super().__init__()
+        enable_bidirectional = True
+        base_model_name_or_path = "meta-llama/Llama-3.1-8B-Instruct"
+        extra_model_name_or_path = None
+        peft_path = "checkpoints/LLM2CLIP-Llama-3.1-8B"
+        self.text = LLM2Vec.from_pretrained(
+            base_model_name_or_path,
+            peft_path,
+            merge_peft=True,
+            extra_model_name_or_path=extra_model_name_or_path,
+            enable_bidirectional=enable_bidirectional,
+            attn_implementation="flash_attention_2",
+            torch_dtype=torch.bfloat16
+        )
+        self.text_proj = text_proj
+
+    def lock(self, **kwargs):
+        for param in self.text.parameters():
+            param.requires_grad = False
+
+    def forward(self, text, batch_size=32):
+        with torch.autocast("cuda"):
+            x = self.text.encode(text, batch_size=batch_size).to(torch.float16)
+        if self.text_proj is not None:
+            x = self.text_proj(x, l2_norm=False)
+        return x
+
+    def set_grad_checkpointing(self, enable=True):
+        # Not implemented
+        pass
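
Note: the new tower hard-codes the Llama-3.1-8B-Instruct base weights, the PEFT adapter at checkpoints/LLM2CLIP-Llama-3.1-8B, flash_attention_2, and bfloat16, so using it requires those checkpoints plus a CUDA GPU with flash-attn installed. A minimal usage sketch, assuming the llm2clip directory is on PYTHONPATH; with text_proj=None the raw LLM2Vec embeddings are returned, since the projection is normally supplied by _build_text_tower in model.py:

import torch
from eva_clip import LLM2VecTextTransformer  # re-exported by eva_clip/__init__.py in this commit

# Loads Llama-3.1-8B-Instruct, merges the LLM2CLIP PEFT adapter, enables bidirectional attention.
text_tower = LLM2VecTextTransformer(text_proj=None).cuda()
text_tower.lock()  # freeze every LLM parameter (requires_grad = False)

captions = ["a photo of a cat", "an aerial view of a city at night"]
with torch.no_grad():
    feats = text_tower(captions, batch_size=2)  # float16 tensor, one 4096-d embedding per caption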

llm2clip/eva_clip/model.py

Lines changed: 4 additions & 1 deletion
@@ -20,7 +20,7 @@
 from .timm_model import TimmModel
 from .eva_vit_model import EVAVisionTransformer
 from .transformer import LayerNorm, QuickGELU, Attention, VisionTransformer, TextTransformer, LayerNormFp32
-
+from .llm_model import LLM2VecTextTransformer
 try:
     from apex.normalization import FusedLayerNorm
 except:
@@ -191,6 +191,9 @@ def _build_text_tower(
         )
     elif text_cfg.use_embedding:
         text = TextProj(embedding_dim=text_cfg.llm_embedding_dim, output_dim=embed_dim)
+    elif not text_cfg.use_embedding and text_cfg.llm_embedding_dim:
+        text_proj = TextProj(embedding_dim=text_cfg.llm_embedding_dim, output_dim=embed_dim)
+        text = LLM2VecTextTransformer(text_proj)
     else:
         act_layer = QuickGELU if quick_gelu else nn.GELU
         norm_layer = LayerNorm
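
The added branch slots in between the existing use_embedding path (text tower is just a TextProj over presumably precomputed LLM embeddings) and the default TextTransformer path. A minimal sketch of the dispatch, with a SimpleNamespace standing in for the real text_cfg object; only the branches visible in this hunk are mirrored:

from types import SimpleNamespace

def pick_text_tower(text_cfg):
    # Mirrors the elif chain above, returning a label instead of building modules.
    if text_cfg.use_embedding:
        return "TextProj only (embeddings computed offline)"
    elif not text_cfg.use_embedding and text_cfg.llm_embedding_dim:
        return "LLM2VecTextTransformer(TextProj) - LLM encodes raw captions in the forward pass"
    else:
        return "standard TextTransformer"

print(pick_text_tower(SimpleNamespace(use_embedding=False, llm_embedding_dim=4096)))
# -> LLM2VecTextTransformer(TextProj) - LLM encodes raw captions in the forward pass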
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+{
+    "embed_dim": 1152,
+    "vision_cfg": {
+        "timm_model_name": "vit_so400m_patch14_siglip_224.v2_webli",
+        "timm_model_pretrained": true,
+        "timm_pool": "map",
+        "timm_proj": "none",
+        "image_size": 224,
+        "layers": 27,
+        "width": 1152,
+        "patch_size": 14
+    },
+    "text_cfg": {
+        "use_embedding": false,
+        "llm_embedding_dim": 4096,
+        "context_length": 77,
+        "vocab_size": 49408,
+        "width": 768,
+        "heads": 12,
+        "layers": 12
+    }
+}
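
This new config pairs a SigLIP2 SO400M vision tower (width 1152) with the Llama text tower: use_embedding is false and llm_embedding_dim is 4096, so _build_text_tower takes the branch added in model.py and TextProj maps 4096-d LLM2Vec embeddings into the shared 1152-d space. A small sketch of reading those fields; the config's file name is not visible in this diff, so the path below is a placeholder:

import json

# Placeholder path - the actual config file name added by this commit is not shown here.
with open("model_configs/siglip2_llama3.1_8b.json") as f:
    cfg = json.load(f)

text_cfg = cfg["text_cfg"]
assert text_cfg["use_embedding"] is False and text_cfg["llm_embedding_dim"] == 4096
# Projection goes 4096 (Llama-3.1-8B hidden size) -> 1152 (embed_dim, matching the SigLIP2 vision width).
print(text_cfg["llm_embedding_dim"], "->", cfg["embed_dim"])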

llm2clip/llm2vec/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+from .llm2vec import LLM2Vec
