diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..bc7bcf7
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,9 @@
+License for Non-commercial Scientific Research Purposes
+
+IDEA grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under IDEA’s copyright interests to reproduce, distribute, and create derivative works of the text, videos, codes solely for your non-commercial research purposes.
+
+Any other use, in particular any use for commercial, pornographic, military, or surveillance purposes, is prohibited.
+
+Text and visualization results are owned by International Digital Economy Academy (IDEA).
+
+You also need to obey the original license of the dependency models/data used in this service.
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..664d53f
--- /dev/null
+++ b/app.py
@@ -0,0 +1,661 @@
+import shutil
+import subprocess
+
+import torch
+import gradio as gr
+from fastapi import FastAPI
+import os
+from PIL import Image
+import tempfile
+from decord import VideoReader, cpu
+import uvicorn
+from transformers import TextStreamer
+
+import hashlib
+import sys
+import time
+import warnings
+from pathlib import Path
+from typing import Dict, List, Literal, Optional, Tuple
+from lit_gpt.lora import GPT, Block, Config, lora_filter, mark_only_lora_as_trainable
+
+import lightning as L
+import numpy as np
+import torch.nn as nn
+import torch.nn.functional as F
+
+from generate import generate as generate_
+from lit_llama import Tokenizer, LLaMA, LLaMAConfig
+from lit_llama.lora import lora
+from lit_llama.utils import EmptyInitOnDevice
+from lit_gpt.utils import lazy_load
+from scripts.video_dataset.prepare_video_dataset_video_llava import generate_prompt_mlp
+from options import option
+import imageio
+from tqdm import tqdm
+
+from models.multimodal_encoder.builder import build_image_tower, build_video_tower
+from models.multimodal_projector.builder import build_vision_projector
+
+
+title_markdown = ("""
+
MotionLLM: Understanding Human Behaviors from Human Motions and Videos
+
+
😎 Co-first author. Listing order is random. 🤗 Corresponding author.
+
+ 1 THU
+ 2 CUHK (SZ)
+ 3 IDEA Research
+ 4 HKUST
+
+
+
+
+
+
+""")
+
+block_css = """
+#buttons button {
+ min-width: min(120px,100%);
+}
+"""
+
+
+tos_markdown = ("""
+*We are now working to support the motion branch of the MotionLLM model.
+
+### Terms of use
+By using this service, users are required to agree to the following terms:
+The service is a research preview intended for non-commercial use only. It provides only limited safety measures and may generate offensive content.
+It is forbidden to use the service to generate content that is illegal, harmful, violent, racist, or sexual.
+The usage of this service is subject to the IDEA License.
+""")
+
+
+learn_more_markdown = ("""
+### License
+License for Non-commercial Scientific Research Purposes
+
+IDEA grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under IDEA’s copyright interests to reproduce, distribute, and create derivative works of the text, videos, codes solely for your non-commercial research purposes.
+
+Any other use, in particular any use for commercial, pornographic, military, or surveillance purposes, is prohibited.
+
+Text and visualization results are owned by International Digital Economy Academy (IDEA).
+
+You also need to obey the original license of the dependency models/data used in this service.
+""")
+
+
+
+class LlavaMetaModel:
+
+ def __init__(self, config, pretrained_checkpoint):
+ super(LlavaMetaModel, self).__init__()
+ # import pdb; pdb.set_trace()
+ if hasattr(config, "mm_image_tower") or hasattr(config, "image_tower"):
+ self.image_tower = build_image_tower(config, delay_load=True)
+ self.mm_projector = build_vision_projector(config)
+ if hasattr(config, "mm_video_tower") or hasattr(config, "video_tower"):
+ self.video_tower = build_video_tower(config, delay_load=True)
+ self.mm_projector = build_vision_projector(config)
+ self.load_video_tower_pretrained(pretrained_checkpoint)
+
+ def get_image_tower(self):
+ image_tower = getattr(self, 'image_tower', None)
+ if type(image_tower) is list:
+ image_tower = image_tower[0]
+ return image_tower
+
+ def get_video_tower(self):
+ video_tower = getattr(self, 'video_tower', None)
+
+ if type(video_tower) is list:
+ video_tower = video_tower[0]
+ return video_tower
+
+
+ def get_all_tower(self, keys):
+ tower = {key: getattr(self, f'get_{key}_tower') for key in keys}
+ return tower
+
+
+ def load_video_tower_pretrained(self, pretrained_checkpoint):
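+        # The checkpoint passed in here is expected to hold the mm_projector (vision-to-LLM projection) weights.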
+ self.mm_projector.load_state_dict(pretrained_checkpoint, strict=True)
+
+
+ def initialize_image_modules(self, model_args, fsdp=None):
+ image_tower = model_args.image_tower
+ mm_vision_select_layer = model_args.mm_vision_select_layer
+ mm_vision_select_feature = model_args.mm_vision_select_feature
+ pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
+
+ self.config.mm_image_tower = image_tower
+
+ image_tower = build_image_tower(model_args)
+
+ if fsdp is not None and len(fsdp) > 0:
+ self.image_tower = [image_tower]
+ else:
+ self.image_tower = image_tower
+
+ self.config.use_mm_proj = True
+ self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
+ self.config.mm_hidden_size = image_tower.hidden_size
+ self.config.mm_vision_select_layer = mm_vision_select_layer
+ self.config.mm_vision_select_feature = mm_vision_select_feature
+
+ self.mm_projector = build_vision_projector(self.config)
+
+ if pretrain_mm_mlp_adapter is not None:
+ mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
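+            # get_w keeps only the entries whose key contains `keyword` and strips the "<keyword>." prefix from each key.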
+ def get_w(weights, keyword):
+ return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
+
+ self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
+
+ def initialize_video_modules(self, model_args, fsdp=None):
+ video_tower = model_args.video_tower
+ mm_vision_select_layer = model_args.mm_vision_select_layer
+ mm_vision_select_feature = model_args.mm_vision_select_feature
+ pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
+
+ self.config.mm_video_tower = video_tower
+
+ video_tower = build_video_tower(model_args)
+
+ if fsdp is not None and len(fsdp) > 0:
+ self.video_tower = [video_tower]
+ else:
+ self.video_tower = video_tower
+
+ self.config.use_mm_proj = True
+ self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
+ self.config.mm_hidden_size = video_tower.hidden_size
+ self.config.mm_vision_select_layer = mm_vision_select_layer
+ self.config.mm_vision_select_feature = mm_vision_select_feature
+
+ self.mm_projector = build_vision_projector(self.config)
+
+ if pretrain_mm_mlp_adapter is not None:
+ mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
+ def get_w(weights, keyword):
+ return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
+
+ self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
+
+ def encode_images(self, images):
+ image_features = self.get_image_tower()(images)
+ image_features = self.mm_projector(image_features)
+ return image_features
+
+ def encode_videos(self, videos):
+ # import pdb; pdb.set_trace()
+ # videos: torch.Size([1, 3, 8, 224, 224])
+ video_features = self.get_video_tower()(videos) # torch.Size([1, 2048, 1024])
+ video_features = self.mm_projector(video_features.float()) # torch.Size([1, 2048, 4096])
+ return video_features
+
+ def get_multimodal_embeddings(self, X_modalities):
+ Xs, keys= X_modalities
+
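+        # Dispatch to encode_images / encode_videos based on the first modality key (e.g. 'video' -> encode_videos).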
+ X_features = getattr(self, f'encode_{keys[0]}s')(Xs) # expand to get batchsize
+
+ return X_features
+
+
+class Projection(nn.Module):
+ def __init__(self, ):
+ super().__init__()
+ self.linear_proj = nn.Linear(512, 4096)
+ def forward(self, x):
+ return self.linear_proj(x)
+
+
+class ProjectionNN(nn.Module):
+ def __init__(self, ):
+ super().__init__()
+ self.proj = nn.Sequential(
+ nn.Linear(512, 4096),
+ nn.GELU(),
+ nn.Linear(4096, 4096)
+ )
+ def forward(self, x):
+ return self.proj(x)
+
+
+class Conversation():
+    def __init__(self, output=None, input_prompt=None, prompt=None):
+        self.messages = []
+        if output is not None:
+            # keep the same record layout as append_message: (output, input_prompt, prompt, show_images)
+            self.append_message(output, input_prompt, prompt, None)
+
+ def append_message(self, output, input_prompt, prompt, show_images):
+ # print(output)
+ # print(input_prompt)
+ # print(prompt)
+ # print(show_images)
+ self.messages.append((output, input_prompt, prompt, show_images))
+
+ def to_gradio_chatbot(self, show_images=None, output_text=None):
+ # return a list
+ if show_images is None:
+ show_images = self.messages[-1][3]
+ output_text = self.messages[-1][0]
+ return [
+ [show_images, output_text]
+ ]
+
+ def get_info(self):
+ return self.messages[-1][0], self.messages[-1][1]
+
+
+class ConversationBuffer():
+    def __init__(self, input_text):
+        self.buffer_ = []
+        self.buffer_.append(input_text)
+
+
+def init_conv():
+ conv = Conversation()
+ return conv
+
+
+def get_processor(X, config, device, pretrained_checkpoint_tower, model_path = 'LanguageBind/MotionLLM-7B'):
+ mm_backbone_mlp_model = LlavaMetaModel(config, pretrained_checkpoint_tower)
+
+ processor = {}
+ if 'Image' in X:
+ image_tower = mm_backbone_mlp_model.get_image_tower() # LanguageBindImageTower()
+ if not image_tower.is_loaded:
+ image_tower.load_model()
+ image_tower.to(device=device, dtype=torch.float16)
+ image_processor = image_tower.image_processor
+ processor['image'] = image_processor
+ if 'Video' in X:
+ video_tower = mm_backbone_mlp_model.get_video_tower()
+ if not video_tower.is_loaded:
+ video_tower.load_model()
+ video_tower.to(device=device, dtype=torch.float16)
+ video_processor = video_tower.video_processor
+ processor['video'] = video_processor
+
+ return mm_backbone_mlp_model, processor
+
+
+def motionllm(
+ args,
+ input_video_path: str,
+ text_en_in: str,
+ quantize: Optional[str] = None,
+ dtype: str = "float32",
+ max_new_tokens: int = 200,
+ top_k: int = 200,
+ temperature: float = 0.8,
+ accelerator: str = "auto",):
+
+ video_tensor = video_processor(input_video_path, return_tensors='pt')['pixel_values']
+
+ if type(video_tensor) is list:
+ tensor = [video.to('cuda', dtype=torch.float16) for video in video_tensor]
+ else:
+ tensor = video_tensor.to('cuda', dtype=torch.float16) # (1,3,8,224,224)
+
+ X_modalities = [tensor,['video']]
+ video_feature = mm_backbone_mlp_model.get_multimodal_embeddings(X_modalities)
+ prompt = text_en_in
+ input_prompt = prompt
+
+ sample = {"instruction": prompt, "input": input_video_path}
+
+ prefix = generate_prompt_mlp(sample)
+ pre = torch.cat((tokenizer.encode(prefix.split('INPUT_VIDEO: ')[0] + "\n", bos=True, eos=False, device=model.device).view(1, -1), tokenizer.encode("INPUT_VIDEO: ", bos=False, eos=False, device=model.device).view(1, -1)), dim=1)
+
+ prompt = (pre, ". ASSISTANT: ")
+ encoded = (prompt[0], video_feature[0], tokenizer.encode(prompt[1], bos=False, eos=False, device=model.device).view(1, -1))
+
+ t0 = time.perf_counter()
+
+ output_seq = generate_(
+ model,
+ idx=encoded,
+ max_seq_length=4096,
+ max_new_tokens=max_new_tokens,
+ temperature=temperature,
+ top_k=top_k,
+ eos_id=tokenizer.eos_id,
+ tokenizer = tokenizer,
+ )
+ outputfull = tokenizer.decode(output_seq)
+ output = outputfull.split("ASSISTANT:")[-1].strip()
+ print("================================")
+ print(output)
+ print("================================")
+
+ return output, input_prompt, prompt
+
+
+def save_image_to_local(image):
+ filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.jpg')
+ image = Image.open(image)
+ image.save(filename)
+ # print(filename)
+ return filename
+
+
+def save_video_to_local(video_path):
+ filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.mp4')
+ shutil.copyfile(video_path, filename)
+ return filename
+
+
+def generate(image1, video, textbox_in, first_run, state, images_tensor):
+ flag = 1
+
+ image1 = image1 if image1 else "none"
+ video = video if video else "none"
+
+ if type(state) is not Conversation:
+ state = init_conv()
+ images_tensor = [[], []]
+
+ first_run = False if len(state.messages) > 0 else True
+ text_en_in = textbox_in.replace("picture", "image")
+ output, input_prompt, prompt = motionllm(args, video, text_en_in)
+
+ text_en_out = output
+ textbox_out = text_en_out
+
+ show_images = ""
+ if os.path.exists(image1):
+ filename = save_image_to_local(image1)
+        show_images += f'<img src="./file={filename}" style="display: inline-block; width: 250px; max-height: 400px;">'
+
+ if os.path.exists(video):
+ filename = save_video_to_local(video)
+        show_images += f'<video controls playsinline width="500" style="display: inline-block;" src="./file={filename}"></video>'
+
+ show_images = textbox_in + "\n" + show_images
+ state.append_message(output, input_prompt, prompt, show_images)
+
+ torch.cuda.empty_cache()
+
+ return (state, state.to_gradio_chatbot(show_images, output), False, gr.update(value=None, interactive=True), images_tensor, gr.update(value=image1 if os.path.exists(image1) else None, interactive=True), gr.update(value=video if os.path.exists(video) else None, interactive=True))
+
+def regenerate(state):
+ if len(state.messages) > 0:
+ tobot = state.to_gradio_chatbot()
+ tobot[-1][1] = None
+ textbox = state.messages[-1][1]
+ state.messages.pop(-1)
+ return state, tobot, False, textbox
+    return (state, [], True, None)
+
+
+def clear_history(state):
+ state = init_conv()
+ try:
+ tgt = state.to_gradio_chatbot()
+ except:
+ tgt = [None, None]
+    return (gr.update(value=None, interactive=True),
+            gr.update(value=None, interactive=True),
+            gr.update(value=None, interactive=True),
+            True, state, tgt, [[], []])
+
+
+def get_md5(file_path):
+ hash_md5 = hashlib.md5()
+ with open(file_path, "rb") as f:
+ for chunk in iter(lambda: f.read(4096), b""):
+ hash_md5.update(chunk)
+ return hash_md5.hexdigest()
+
+
+def logging_up(video, state):
+ try:
+ state.get_info()
+ except:
+ return False
+ action = "upvote"
+ # Get the current time
+ current_time = str(time.time())
+
+ # Create an md5 object
+ hash_object = hashlib.md5(current_time.encode())
+
+ # Get the hexadecimal representation of the hash
+ md5_hash = get_md5(video) + "-" + hash_object.hexdigest()
+
+ command = f"cp {video} ./feedback/{action}/mp4/{md5_hash}.mp4"
+ os.system(command)
+ with open (f"./feedback/{action}/txt/{md5_hash}.txt", "w") as f:
+ out, prp = state.get_info()
+ f.write(f"==========\nPrompt: {prp}\n==========\nOutput: {out}==========\n")
+ return True
+
+
+def logging_down(video, state):
+ try:
+ state.get_info()
+ except:
+ return False
+ action = "downvote"
+ # Get the current time
+ current_time = str(time.time())
+
+ # Create an md5 object
+ hash_object = hashlib.md5(current_time.encode())
+
+ # Get the hexadecimal representation of the hash
+ md5_hash = get_md5(video) + "-" + hash_object.hexdigest()
+
+ command = f"cp {video} ./feedback/{action}/mp4/{md5_hash}.mp4"
+ os.system(command)
+ with open (f"./feedback/{action}/txt/{md5_hash}.txt", "w") as f:
+ out, prp = state.get_info()
+ f.write(f"==========\nPrompt: {prp}\n==========\nOutput: {out}==========\n")
+ return True
+
+
+torch.set_float32_matmul_precision("high")
+warnings.filterwarnings('ignore')
+args = option.get_args_parser()
+
+conv_mode = "llava_v1"
+model_path = 'LanguageBind/Video-LLaVA-7B'
+device = 'cuda'
+load_8bit = False
+load_4bit = True
+dtype = torch.float16
+
+if not os.path.exists("temp"):
+ os.makedirs("temp")
+
+lora_path = Path(args.lora_path)
+pretrained_llm_path = Path(f"./checkpoints/vicuna-7b-v1.5/lit_model.pth")
+tokenizer_llm_path = Path("./checkpoints/vicuna-7b-v1.5/tokenizer.model")
+
+# assert lora_path.is_file()
+assert pretrained_llm_path.is_file()
+assert tokenizer_llm_path.is_file()
+
+accelerator = "auto"
+fabric = L.Fabric(accelerator=accelerator, devices=1)
+
+dtype = "float32"
+dt = getattr(torch, dtype, None)
+if not isinstance(dt, torch.dtype):
+ raise ValueError(f"{dtype} is not a valid dtype.")
+dtype = dt
+
+quantize = None
+t0 = time.time()
+
+with EmptyInitOnDevice(
+ device=fabric.device, dtype=dtype, quantization_mode=quantize
+), lora(r=args.lora_r, alpha=args.lora_alpha, dropout=args.lora_dropout, enabled=True):
+ checkpoint_dir = Path("checkpoints/vicuna-7b-v1.5")
+ lora_query = True
+ lora_key = False
+ lora_value = True
+ lora_projection = False
+ lora_mlp = False
+ lora_head = False
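+    # LoRA adapters are enabled only for the attention query and value projections here.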
+ config = Config.from_name(
+ name=checkpoint_dir.name,
+ r=args.lora_r,
+ alpha=args.lora_alpha,
+ dropout=args.lora_dropout,
+ to_query=lora_query,
+ to_key=lora_key,
+ to_value=lora_value,
+ to_projection=lora_projection,
+ to_mlp=lora_mlp,
+ to_head=lora_head,
+ )
+ model = GPT(config).bfloat16()
+
+mlp_path = args.mlp_path
+pretrained_checkpoint_mlp = torch.load(mlp_path)
+
+X = ['Video']
+
+mm_backbone_mlp_model, processor = get_processor(X, args, 'cuda', pretrained_checkpoint_mlp, model_path = 'LanguageBind/Video-LLaVA-7B')
+video_processor = processor['video']
+
+linear_proj = mm_backbone_mlp_model.mm_projector
+
+# 1. Load the pretrained weights
+pretrained_llm_checkpoint = lazy_load(pretrained_llm_path)
+# 2. Load the fine-tuned LoRA weights
+lora_checkpoint = lazy_load(lora_path)
+# 3. merge the two checkpoints
+model_state_dict = {**pretrained_llm_checkpoint, **lora_checkpoint}
+model.load_state_dict(model_state_dict, strict=True)
+print('Load llm base model from', pretrained_llm_path)
+print('Load lora model from', lora_path)
+
+# Load the MLP projector weights once more to be safe; not strictly necessary.
+linear_proj.load_state_dict(pretrained_checkpoint_mlp)
+linear_proj = linear_proj.cuda()
+print('Load mlp model again from', mlp_path)
+print(f"Time to load model: {time.time() - t0:.02f} seconds.", file=sys.stderr)
+
+model.eval()
+model = fabric.setup_module(model)
+linear_proj.eval()
+
+tokenizer = Tokenizer(tokenizer_llm_path)
+print('Load tokenizer from', tokenizer_llm_path)
+
+print(torch.cuda.memory_allocated())
+print(torch.cuda.max_memory_allocated())
+
+
+app = FastAPI()
+
+textbox = gr.Textbox(
+ show_label=False, placeholder="Enter text and press ENTER", container=False
+ )
+
+with gr.Blocks(title='MotionLLM', theme=gr.themes.Default(), css=block_css) as demo:
+ gr.Markdown(title_markdown)
+ state = gr.State()
+ buffer_ = gr.State()
+ first_run = gr.State()
+ images_tensor = gr.State()
+
+ with gr.Row():
+ with gr.Column(scale=3):
+ image1 = gr.State()
+ video = gr.Video(label="Input Video")
+
+ cur_dir = os.path.dirname(os.path.abspath(__file__))
+ gr.Examples(
+ examples=[
+ [
+ f"{cur_dir}/examples/Play_Electric_guitar_16_clip1.mp4",
+ "why is the girl so happy",
+ ],
+ [
+ f"{cur_dir}/examples/guoyoucai.mov",
+ "what is the feeling of him",
+ ],
+ [
+ f"{cur_dir}/examples/sprint_run_18_clip1.mp4",
+ "Why is the man running so fast?",
+ ],
+ [
+ f"{cur_dir}/examples/lift_weight.mp4",
+ "Assume you are a fitness coach, refer to the video of the professional athlete, please analyze specific action essentials in steps and give detailed instruction.",
+ ],
+ [
+ f"{cur_dir}/examples/Shaolin_Kung_Fu_Wushu_Selfdefense_Sword_Form_Session_22_clip3.mp4",
+ "wow, can you teach me the motion, step by step in detail",
+ ],
+ [
+ f"{cur_dir}/examples/mabaoguo.mp4",
+ "why is the video funny?",
+ ],
+ [
+ f"{cur_dir}/examples/COBRA_PUSH_UPS_clip2.mp4",
+ "describe the body movement of the woman",
+ ],
+ [
+ f"{cur_dir}/examples/sample_demo_1.mp4",
+ "Why is this video interesting?",
+ ],
+ ],
+ inputs=[video, textbox],
+ )
+
+ with gr.Column(scale=7):
+ chatbot = gr.Chatbot(label="MotionLLM", bubble_full_width=True).style(height=875)
+ with gr.Row():
+ with gr.Column(scale=8):
+ textbox.render()
+ with gr.Column(scale=1, min_width=50):
+ submit_btn = gr.Button(
+ value="Send", variant="primary", interactive=True
+ )
+ with gr.Row(elem_id="buttons") as button_row:
+ upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
+ downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
+ flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
+ # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
+ regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
+ clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)
+
+ gr.Markdown(tos_markdown)
+ gr.Markdown(learn_more_markdown)
+
+ tmp = gr.State()
+ upvote_btn.click(logging_up, [video, state], [tmp])
+
+ downvote_btn.click(logging_down, [video, state], [tmp])
+
+ submit_btn.click(generate, [image1, video, textbox, first_run, state, images_tensor],
+ [state, chatbot, first_run, textbox, images_tensor, image1, video])
+
+ regenerate_btn.click(regenerate, [state], [state, chatbot, first_run, textbox]).then(
+ generate, [image1, video, textbox, first_run, state, images_tensor], [state, chatbot, first_run, textbox, images_tensor, image1, video])
+
+ clear_btn.click(clear_history, [state],
+ [image1, video, textbox, first_run, state, chatbot, images_tensor])
+
+app = gr.mount_gradio_app(app, demo, path="/")
+uvicorn.run(app, host="0.0.0.0", port=6657)
\ No newline at end of file
diff --git a/examples/COBRA_PUSH_UPS_clip2.mp4 b/examples/COBRA_PUSH_UPS_clip2.mp4
new file mode 100644
index 0000000..de3486d
Binary files /dev/null and b/examples/COBRA_PUSH_UPS_clip2.mp4 differ
diff --git a/examples/Play_Electric_guitar_16_clip1.mp4 b/examples/Play_Electric_guitar_16_clip1.mp4
new file mode 100644
index 0000000..9fe2bf8
Binary files /dev/null and b/examples/Play_Electric_guitar_16_clip1.mp4 differ
diff --git a/examples/Shaolin_Kung_Fu_Wushu_Selfdefense_Sword_Form_Session_22_clip3.mp4 b/examples/Shaolin_Kung_Fu_Wushu_Selfdefense_Sword_Form_Session_22_clip3.mp4
new file mode 100644
index 0000000..ce83a78
Binary files /dev/null and b/examples/Shaolin_Kung_Fu_Wushu_Selfdefense_Sword_Form_Session_22_clip3.mp4 differ
diff --git a/examples/guoyoucai.mov b/examples/guoyoucai.mov
new file mode 100644
index 0000000..b1a2aa8
Binary files /dev/null and b/examples/guoyoucai.mov differ
diff --git a/examples/guoyoucai.mp4 b/examples/guoyoucai.mp4
new file mode 100644
index 0000000..aa1c944
Binary files /dev/null and b/examples/guoyoucai.mp4 differ
diff --git a/examples/lift_weight.mp4 b/examples/lift_weight.mp4
new file mode 100644
index 0000000..dc1766b
Binary files /dev/null and b/examples/lift_weight.mp4 differ
diff --git a/examples/mabaoguo.mp4 b/examples/mabaoguo.mp4
new file mode 100644
index 0000000..a03aa2b
Binary files /dev/null and b/examples/mabaoguo.mp4 differ
diff --git a/examples/sample_demo_1.mp4 b/examples/sample_demo_1.mp4
new file mode 100644
index 0000000..8afbc6c
Binary files /dev/null and b/examples/sample_demo_1.mp4 differ
diff --git a/examples/sprint_run_18_clip1.mp4 b/examples/sprint_run_18_clip1.mp4
new file mode 100644
index 0000000..9845521
Binary files /dev/null and b/examples/sprint_run_18_clip1.mp4 differ
diff --git a/generate.py b/generate.py
new file mode 100755
index 0000000..677a171
--- /dev/null
+++ b/generate.py
@@ -0,0 +1,199 @@
+import sys
+import time
+import warnings
+from pathlib import Path
+from typing import Optional
+
+import lightning as L
+import torch
+
+from lit_llama import LLaMA, Tokenizer
+from lit_llama.utils import EmptyInitOnDevice, lazy_load
+
+
+@torch.no_grad()
+def generate(
+ model: torch.nn.Module,
+ idx: torch.Tensor,
+ max_new_tokens: int,
+ max_seq_length: int,
+ temperature: float = 1.0,
+ top_k: Optional[int] = None,
+ eos_id: Optional[int] = None,
+ tokenizer = None,
+) -> torch.Tensor:
+ """Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
+
+ The implementation of this function is modified from A. Karpathy's nanoGPT.
+
+ Args:
+ model: The model to use.
+ idx: Tensor of shape (T) with indices of the prompt sequence.
+ max_new_tokens: The number of new tokens to generate.
+ max_seq_length: The maximum sequence length allowed.
+        temperature: Scales the predicted logits by 1 / temperature.
+        top_k: If specified, only sample among the tokens with the k highest probabilities.
+        eos_id: If specified, stop generation as soon as this token is produced.
+ """
+ # create an empty tensor of the expected final shape and fill in the current tokens
+ # import pdb; pdb.set_trace()
+ if type(idx) == tuple:
+ # import pdb; pdb.set_trace()
+ T = idx[0].shape[-1] + idx[2].shape[-1] + len(idx[1])
+ before_len = idx[0].shape[-1]
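+        # Build the full prompt: prefix ids, one zero placeholder per video-feature token, then suffix ids.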
+ catted = torch.cat((idx[0], torch.zeros((1, len(idx[1]))).cuda(), idx[2]), dim=1).long()
+ idx = (catted, idx[1], before_len)
+ T_new = T + max_new_tokens
+ # import pdb; pdb.set_trace()
+        empty = torch.empty(T_new, dtype=idx[0].dtype, device=idx[0].device)
+ empty[:T] = idx[0]
+ idx = (empty, idx[1], [before_len])
+ # import pdb; pdb.set_trace()
+ else:
+ # import pdb; pdb.set_trace()
+ T = idx.size(0)
+ T_new = T + max_new_tokens
+ empty = torch.empty(T_new, dtype=idx.dtype, device=idx.device)
+ empty[:T] = idx
+ idx = empty
+
+ # generate max_new_tokens tokens
+ # import pdb; pdb.set_trace()
+ for t in range(T, T_new):
+ if type(idx) == tuple:
+ idx_cond = idx[0][:t]
+ tmp = idx_cond if T <= max_seq_length else idx_cond[-max_seq_length:]
+ # import pdb; pdb.set_trace()
+ idx_cond = (tmp.view(1, -1), idx[1].unsqueeze(0), idx[2])
+ else:
+ # ignore the not-filled-yet tokens
+ idx_cond = idx[:t]
+ # if the sequence context is growing too long we must crop it at max_seq_length
+ idx_cond = idx_cond if T <= max_seq_length else idx_cond[-max_seq_length:]
+
+ # forward
+ if type(idx) == tuple:
+ logits = model(idx_cond, maxlen=idx_cond[0].size(1))
+ else:
+ logits = model(idx_cond.view(1, -1))
+ logits = logits[0, -1] / temperature
+
+ # import pdb; pdb.set_trace()
+ # optionally crop the logits to only the top k options
+ if top_k is not None:
+ v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+ logits[logits < v[[-1]]] = -float("Inf")
+
+ probs = torch.nn.functional.softmax(logits, dim=-1)
+ idx_next = torch.multinomial(probs, num_samples=1)
+
+ # concatenate the new generation
+ if type(idx) == tuple:
+ seq = idx[0]
+ seq[t] = idx_next
+ idx = (seq, idx[1], idx[2])
+ else:
+ idx[t] = idx_next
+
+ # if token is triggered, return the output (stop generation)
+ if idx_next == eos_id:
+ if type(idx) == tuple:
+ return idx[0][:t+1]
+ else:
+ return idx[:t + 1] # include the EOS token
+ if type(idx) == tuple:
+ return idx[0]
+ else:
+ return idx
+
+
+def main(
+ prompt: str = "Hello, my name is",
+ *,
+ num_samples: int = 1,
+ max_new_tokens: int = 50,
+ top_k: int = 200,
+ temperature: float = 0.8,
+ checkpoint_path: Optional[Path] = None,
+ tokenizer_path: Optional[Path] = None,
+ model_size: str = "7B",
+ quantize: Optional[str] = None,
+) -> None:
+ """Generates text samples based on a pre-trained LLaMA model and tokenizer.
+
+ Args:
+ prompt: The prompt string to use for generating the samples.
+ num_samples: The number of text samples to generate.
+ max_new_tokens: The number of generation steps to take.
+ top_k: The number of top most probable tokens to consider in the sampling process.
+ temperature: A value controlling the randomness of the sampling process. Higher values result in more random
+ samples.
+ checkpoint_path: The checkpoint path to load.
+ tokenizer_path: The tokenizer path to load.
+ model_size: The model size to load.
+        quantize: Whether to quantize the model and, if so, which method to use:
+ ``"llm.int8"``: LLM.int8() mode,
+ ``"gptq.int4"``: GPTQ 4-bit mode.
+ """
+ if not checkpoint_path:
+ checkpoint_path = Path(f"./checkpoints/lit-llama/{model_size}/lit-llama.pth")
+ if not tokenizer_path:
+ tokenizer_path = Path("./checkpoints/lit-llama/tokenizer.model")
+ assert checkpoint_path.is_file(), checkpoint_path
+ assert tokenizer_path.is_file(), tokenizer_path
+
+ fabric = L.Fabric(accelerator="cuda", devices=1)
+ dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
+
+ print("Loading model ...", file=sys.stderr)
+ t0 = time.time()
+ with EmptyInitOnDevice(
+ device=fabric.device, dtype=dtype, quantization_mode=quantize
+ ):
+ model = LLaMA.from_name(model_size)
+
+ checkpoint = lazy_load(checkpoint_path)
+ model.load_state_dict(checkpoint)
+ print(f"Time to load model: {time.time() - t0:.02f} seconds.", file=sys.stderr)
+
+ model.eval()
+ model = fabric.setup_module(model)
+
+ tokenizer = Tokenizer(tokenizer_path)
+ encoded_prompt = tokenizer.encode(prompt, bos=True, eos=False, device=fabric.device)
+
+ L.seed_everything(1234)
+ t0 = time.perf_counter()
+
+ for _ in range(num_samples):
+ y = generate(
+ model,
+ encoded_prompt,
+ max_new_tokens,
+ model.config.block_size, # type: ignore[union-attr,arg-type]
+ temperature=temperature,
+ top_k=top_k,
+ )
+ print(tokenizer.decode(y))
+
+ t = time.perf_counter() - t0
+ print(f"\n\nTime for inference: {t:.02f} sec total, {num_samples * max_new_tokens / t:.02f} tokens/sec", file=sys.stderr)
+ print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB", file=sys.stderr)
+
+
+if __name__ == "__main__":
+ from jsonargparse import CLI
+
+ torch.set_float32_matmul_precision("high")
+ warnings.filterwarnings(
+ # Triggered internally at ../aten/src/ATen/EmptyTensor.cpp:31
+ "ignore",
+ message="ComplexHalf support is experimental and many operators don't support it yet"
+ )
+ warnings.filterwarnings(
+ # Triggered in bitsandbytes/autograd/_functions.py:298
+ "ignore",
+ message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization",
+ )
+ CLI(main)
diff --git a/models/__init__.py b/models/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/models/constants.py b/models/constants.py
new file mode 100755
index 0000000..f1bcfae
--- /dev/null
+++ b/models/constants.py
@@ -0,0 +1,18 @@
+CONTROLLER_HEART_BEAT_EXPIRATION = 30
+WORKER_HEART_BEAT_INTERVAL = 15
+
+LOGDIR = "."
+
+# Model Constants
+IGNORE_INDEX = -100
+X_TOKEN_INDEX = {'IMAGE': -200, 'VIDEO': -201, 'AUDIO': -202, 'THERMAL': -203, 'DEPTH': -204}
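+# Negative indices are multimodal placeholder token ids; they are replaced by the corresponding encoder features at embedding time.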
+X_INDEX_TOKEN = {v: k for k, v in X_TOKEN_INDEX.items()}
+# IMAGE_TOKEN_INDEX = -200
+DEFAULT_X_TOKEN = {'IMAGE': "<image>", 'VIDEO': "<video>", 'AUDIO': "<audio>", 'THERMAL': "<thermal>", 'DEPTH': "<depth>"}
+# DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_X_PATCH_TOKEN = {'IMAGE': "<im_patch>", 'VIDEO': "<vi_patch>", 'AUDIO': "<au_patch>", 'THERMAL': "<th_patch>", 'DEPTH': "<de_patch>"}
+# DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_X_START_TOKEN = {'IMAGE': "<im_start>", 'VIDEO': "<vi_start>", 'AUDIO': "<au_start>", 'THERMAL': "<th_start>", 'DEPTH': "<de_start>"}
+# DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_X_END_TOKEN = {'IMAGE': "<im_end>", 'VIDEO': "<vi_end>", 'AUDIO': "<au_end>", 'THERMAL': "<th_end>", 'DEPTH': "<de_end>"}
+# DEFAULT_IM_END_TOKEN = "<im_end>"
diff --git a/models/encdec.py b/models/encdec.py
new file mode 100755
index 0000000..ae72afa
--- /dev/null
+++ b/models/encdec.py
@@ -0,0 +1,67 @@
+import torch.nn as nn
+from models.resnet import Resnet1D
+
+class Encoder(nn.Module):
+ def __init__(self,
+ input_emb_width = 3,
+ output_emb_width = 512,
+ down_t = 3,
+ stride_t = 2,
+ width = 512,
+ depth = 3,
+ dilation_growth_rate = 3,
+ activation='relu',
+ norm=None):
+ super().__init__()
+
+ blocks = []
+ filter_t, pad_t = stride_t * 2, stride_t // 2
+ blocks.append(nn.Conv1d(input_emb_width, width, 3, 1, 1))
+ blocks.append(nn.ReLU())
+
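+        # Each of the down_t blocks downsamples the temporal axis with a strided Conv1d (stride_t) followed by a Resnet1D stack.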
+ for i in range(down_t):
+ input_dim = width
+ block = nn.Sequential(
+ nn.Conv1d(input_dim, width, filter_t, stride_t, pad_t),
+ Resnet1D(width, depth, dilation_growth_rate, activation=activation, norm=norm),
+ )
+ blocks.append(block)
+ blocks.append(nn.Conv1d(width, output_emb_width, 3, 1, 1))
+ self.model = nn.Sequential(*blocks)
+
+ def forward(self, x):
+ return self.model(x)
+
+class Decoder(nn.Module):
+ def __init__(self,
+ input_emb_width = 3,
+ output_emb_width = 512,
+ down_t = 3,
+ stride_t = 2,
+ width = 512,
+ depth = 3,
+ dilation_growth_rate = 3,
+ activation='relu',
+ norm=None):
+ super().__init__()
+ blocks = []
+
+ filter_t, pad_t = stride_t * 2, stride_t // 2
+ blocks.append(nn.Conv1d(output_emb_width, width, 3, 1, 1))
+ blocks.append(nn.ReLU())
+ for i in range(down_t):
+ out_dim = width
+ block = nn.Sequential(
+ Resnet1D(width, depth, dilation_growth_rate, reverse_dilation=True, activation=activation, norm=norm),
+ nn.Upsample(scale_factor=2, mode='nearest'),
+ nn.Conv1d(width, out_dim, 3, 1, 1)
+ )
+ blocks.append(block)
+ blocks.append(nn.Conv1d(width, width, 3, 1, 1))
+ blocks.append(nn.ReLU())
+ blocks.append(nn.Conv1d(width, input_emb_width, 3, 1, 1))
+ self.model = nn.Sequential(*blocks)
+
+ def forward(self, x):
+ return self.model(x)
+
diff --git a/models/evaluator_wrapper.py b/models/evaluator_wrapper.py
new file mode 100755
index 0000000..fe4558a
--- /dev/null
+++ b/models/evaluator_wrapper.py
@@ -0,0 +1,92 @@
+
+import torch
+from os.path import join as pjoin
+import numpy as np
+from models.modules import MovementConvEncoder, TextEncoderBiGRUCo, MotionEncoderBiGRUCo
+from utils.word_vectorizer import POS_enumerator
+
+def build_models(opt):
+ movement_enc = MovementConvEncoder(opt.dim_pose-4, opt.dim_movement_enc_hidden, opt.dim_movement_latent)
+ text_enc = TextEncoderBiGRUCo(word_size=opt.dim_word,
+ pos_size=opt.dim_pos_ohot,
+ hidden_size=opt.dim_text_hidden,
+ output_size=opt.dim_coemb_hidden,
+ device=opt.device)
+
+ motion_enc = MotionEncoderBiGRUCo(input_size=opt.dim_movement_latent,
+ hidden_size=opt.dim_motion_hidden,
+ output_size=opt.dim_coemb_hidden,
+ device=opt.device)
+
+ checkpoint = torch.load(pjoin(opt.checkpoints_dir, opt.dataset_name, 'text_mot_match', 'model', 'finest.tar'),
+ map_location=opt.device)
+ movement_enc.load_state_dict(checkpoint['movement_encoder'])
+ text_enc.load_state_dict(checkpoint['text_encoder'])
+ motion_enc.load_state_dict(checkpoint['motion_encoder'])
+ print('Loading Evaluation Model Wrapper (Epoch %d) Completed!!' % (checkpoint['epoch']))
+ return text_enc, motion_enc, movement_enc
+
+
+class EvaluatorModelWrapper(object):
+
+ def __init__(self, opt):
+
+ if opt.dataset_name == 't2m':
+ opt.dim_pose = 263
+ elif opt.dataset_name == 'kit':
+ opt.dim_pose = 251
+ else:
+ raise KeyError('Dataset not Recognized!!!')
+
+ opt.dim_word = 300
+ opt.max_motion_length = 196
+ opt.dim_pos_ohot = len(POS_enumerator)
+ opt.dim_motion_hidden = 1024
+ opt.max_text_len = 20
+ opt.dim_text_hidden = 512
+ opt.dim_coemb_hidden = 512
+
+ # print(opt)
+
+ self.text_encoder, self.motion_encoder, self.movement_encoder = build_models(opt)
+ self.opt = opt
+ self.device = opt.device
+
+ self.text_encoder.to(opt.device)
+ self.motion_encoder.to(opt.device)
+ self.movement_encoder.to(opt.device)
+
+ self.text_encoder.eval()
+ self.motion_encoder.eval()
+ self.movement_encoder.eval()
+
+    # Please note that the results do not follow the order of the inputs.
+ def get_co_embeddings(self, word_embs, pos_ohot, cap_lens, motions, m_lens):
+ with torch.no_grad():
+ word_embs = word_embs.detach().to(self.device).float()
+ pos_ohot = pos_ohot.detach().to(self.device).float()
+ motions = motions.detach().to(self.device).float()
+
+ '''Movement Encoding'''
+ movements = self.movement_encoder(motions[..., :-4]).detach()
+ m_lens = m_lens // self.opt.unit_length
+ motion_embedding = self.motion_encoder(movements, m_lens)
+
+ '''Text Encoding'''
+ text_embedding = self.text_encoder(word_embs, pos_ohot, cap_lens)
+ return text_embedding, motion_embedding
+
+    # Please note that the results do not follow the order of the inputs.
+ def get_motion_embeddings(self, motions, m_lens):
+ with torch.no_grad():
+ motions = motions.detach().to(self.device).float()
+
+ align_idx = np.argsort(m_lens.data.tolist())[::-1].copy()
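+            # Sort motions by length (descending) before encoding; this is why the outputs do not follow the input order.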
+ motions = motions[align_idx]
+ m_lens = m_lens[align_idx]
+
+ '''Movement Encoding'''
+ movements = self.movement_encoder(motions[..., :-4]).detach()
+ m_lens = m_lens // self.opt.unit_length
+ motion_embedding = self.motion_encoder(movements, m_lens)
+ return motion_embedding
diff --git a/models/modules.py b/models/modules.py
new file mode 100755
index 0000000..4f06cd9
--- /dev/null
+++ b/models/modules.py
@@ -0,0 +1,109 @@
+import torch
+import torch.nn as nn
+from torch.nn.utils.rnn import pack_padded_sequence
+
+def init_weight(m):
+ if isinstance(m, nn.Conv1d) or isinstance(m, nn.Linear) or isinstance(m, nn.ConvTranspose1d):
+ nn.init.xavier_normal_(m.weight)
+ # m.bias.data.fill_(0.01)
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+
+
+class MovementConvEncoder(nn.Module):
+ def __init__(self, input_size, hidden_size, output_size):
+ super(MovementConvEncoder, self).__init__()
+ self.main = nn.Sequential(
+ nn.Conv1d(input_size, hidden_size, 4, 2, 1),
+ nn.Dropout(0.2, inplace=True),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Conv1d(hidden_size, output_size, 4, 2, 1),
+ nn.Dropout(0.2, inplace=True),
+ nn.LeakyReLU(0.2, inplace=True),
+ )
+ self.out_net = nn.Linear(output_size, output_size)
+ self.main.apply(init_weight)
+ self.out_net.apply(init_weight)
+
+ def forward(self, inputs):
+ inputs = inputs.permute(0, 2, 1)
+ outputs = self.main(inputs).permute(0, 2, 1)
+ # print(outputs.shape)
+ return self.out_net(outputs)
+
+
+
+class TextEncoderBiGRUCo(nn.Module):
+ def __init__(self, word_size, pos_size, hidden_size, output_size, device):
+ super(TextEncoderBiGRUCo, self).__init__()
+ self.device = device
+
+ self.pos_emb = nn.Linear(pos_size, word_size)
+ self.input_emb = nn.Linear(word_size, hidden_size)
+ self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
+ self.output_net = nn.Sequential(
+ nn.Linear(hidden_size * 2, hidden_size),
+ nn.LayerNorm(hidden_size),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Linear(hidden_size, output_size)
+ )
+
+ self.input_emb.apply(init_weight)
+ self.pos_emb.apply(init_weight)
+ self.output_net.apply(init_weight)
+ self.hidden_size = hidden_size
+ self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True))
+
+ # input(batch_size, seq_len, dim)
+ def forward(self, word_embs, pos_onehot, cap_lens):
+ num_samples = word_embs.shape[0]
+
+ pos_embs = self.pos_emb(pos_onehot)
+ inputs = word_embs + pos_embs
+ input_embs = self.input_emb(inputs)
+ hidden = self.hidden.repeat(1, num_samples, 1)
+
+ cap_lens = cap_lens.data.tolist()
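+        # pack_padded_sequence is called without enforce_sorted=False, so cap_lens must already be sorted in descending order.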
+ emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True)
+
+ gru_seq, gru_last = self.gru(emb, hidden)
+
+ gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)
+
+ return self.output_net(gru_last)
+
+
+class MotionEncoderBiGRUCo(nn.Module):
+ def __init__(self, input_size, hidden_size, output_size, device):
+ super(MotionEncoderBiGRUCo, self).__init__()
+ self.device = device
+
+ self.input_emb = nn.Linear(input_size, hidden_size)
+ self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
+ self.output_net = nn.Sequential(
+ nn.Linear(hidden_size*2, hidden_size),
+ nn.LayerNorm(hidden_size),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Linear(hidden_size, output_size)
+ )
+
+ self.input_emb.apply(init_weight)
+ self.output_net.apply(init_weight)
+ self.hidden_size = hidden_size
+ self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True))
+
+ # input(batch_size, seq_len, dim)
+ def forward(self, inputs, m_lens):
+ num_samples = inputs.shape[0]
+
+ input_embs = self.input_emb(inputs)
+ hidden = self.hidden.repeat(1, num_samples, 1)
+
+ cap_lens = m_lens.data.tolist()
+ emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True, enforce_sorted=False)
+
+ gru_seq, gru_last = self.gru(emb, hidden)
+
+ gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1)
+
+ return self.output_net(gru_last)
diff --git a/models/multimodal_encoder/builder.py b/models/multimodal_encoder/builder.py
new file mode 100755
index 0000000..85f6eda
--- /dev/null
+++ b/models/multimodal_encoder/builder.py
@@ -0,0 +1,49 @@
+import os
+from .clip_encoder import CLIPVisionTower
+from .languagebind import LanguageBindImageTower, LanguageBindVideoTower
+from .mae_encoder import MAEVisionTower
+from transformers import CLIPModel
+
+def build_image_tower(image_tower_cfg, **kwargs):
+ image_tower = getattr(image_tower_cfg, 'mm_image_tower', getattr(image_tower_cfg, 'image_tower', None))
+ is_absolute_path_exists = os.path.exists(image_tower)
+ if is_absolute_path_exists or image_tower.startswith("openai") or image_tower.startswith("laion"):
+ return CLIPVisionTower(image_tower, args=image_tower_cfg, **kwargs)
+ if image_tower.endswith('LanguageBind_Image'):
+ return LanguageBindImageTower(image_tower, args=image_tower_cfg, cache_dir='./cache_dir', **kwargs)
+    if 'mae' in image_tower:
+        print(f'Using MAE vision tower: {image_tower}')
+        return MAEVisionTower(image_tower, args=image_tower_cfg, cache_dir='./cache_dir', **kwargs)
+ raise ValueError(f'Unknown image tower: {image_tower}')
+
+def build_video_tower(video_tower_cfg, **kwargs):
+ video_tower = getattr(video_tower_cfg, 'mm_video_tower', getattr(video_tower_cfg, 'video_tower', None))
+ if video_tower.endswith('LanguageBind_Video_merge'):
+ return LanguageBindVideoTower(video_tower, args=video_tower_cfg, cache_dir='./cache_dir', **kwargs)
+ raise ValueError(f'Unknown video tower: {video_tower}')
+
diff --git a/models/multimodal_encoder/clip_encoder.py b/models/multimodal_encoder/clip_encoder.py
new file mode 100755
index 0000000..dbb9015
--- /dev/null
+++ b/models/multimodal_encoder/clip_encoder.py
@@ -0,0 +1,78 @@
+import torch
+import torch.nn as nn
+
+from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
+
+
+class CLIPVisionTower(nn.Module):
+ def __init__(self, vision_tower, args, delay_load=False):
+ super().__init__()
+
+ self.is_loaded = False
+
+ self.vision_tower_name = vision_tower
+ self.select_layer = args.mm_vision_select_layer
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+ if not delay_load:
+ self.load_model()
+ else:
+ self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
+
+ def load_model(self):
+ self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
+ self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
+ self.vision_tower.requires_grad_(False)
+
+ self.is_loaded = True
+
+ def feature_select(self, image_forward_outs):
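+        # Take the hidden states of the configured layer; 'patch' drops the leading CLS token, 'cls_patch' keeps it.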
+ image_features = image_forward_outs.hidden_states[self.select_layer]
+ if self.select_feature == 'patch':
+ image_features = image_features[:, 1:]
+ elif self.select_feature == 'cls_patch':
+ image_features = image_features
+ else:
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
+ return image_features
+
+ @torch.no_grad()
+ def forward(self, images):
+ if type(images) is list:
+ image_features = []
+ for image in images:
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
+ image_features.append(image_feature)
+ else:
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
+
+ return image_features
+
+ @property
+ def dummy_feature(self):
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+ @property
+ def dtype(self):
+ return self.vision_tower.dtype
+
+ @property
+ def device(self):
+ return self.vision_tower.device
+
+ @property
+ def config(self):
+ if self.is_loaded:
+ return self.vision_tower.config
+ else:
+ return self.cfg_only
+
+ @property
+ def hidden_size(self):
+ return self.config.hidden_size
+
+ @property
+ def num_patches(self):
+ return (self.config.image_size // self.config.patch_size) ** 2
diff --git a/models/multimodal_encoder/languagebind/__init__.py b/models/multimodal_encoder/languagebind/__init__.py
new file mode 100755
index 0000000..3a4e50d
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/__init__.py
@@ -0,0 +1,285 @@
+import torch
+from torch import nn
+from transformers import AutoConfig
+
+from .image.configuration_image import LanguageBindImageConfig
+from .image.modeling_image import LanguageBindImage
+from .image.tokenization_image import LanguageBindImageTokenizer
+from .image.processing_image import LanguageBindImageProcessor
+
+from .video.configuration_video import LanguageBindVideoConfig
+from .video.modeling_video import LanguageBindVideo
+from .video.tokenization_video import LanguageBindVideoTokenizer
+from .video.processing_video import LanguageBindVideoProcessor
+
+from .depth.configuration_depth import LanguageBindDepthConfig
+from .depth.modeling_depth import LanguageBindDepth
+from .depth.tokenization_depth import LanguageBindDepthTokenizer
+from .depth.processing_depth import LanguageBindDepthProcessor
+
+from .audio.configuration_audio import LanguageBindAudioConfig
+from .audio.modeling_audio import LanguageBindAudio
+from .audio.tokenization_audio import LanguageBindAudioTokenizer
+from .audio.processing_audio import LanguageBindAudioProcessor
+
+from .thermal.configuration_thermal import LanguageBindThermalConfig
+from .thermal.modeling_thermal import LanguageBindThermal
+from .thermal.tokenization_thermal import LanguageBindThermalTokenizer
+from .thermal.processing_thermal import LanguageBindThermalProcessor
+
+
+
+config_dict = {
+ 'thermal': LanguageBindThermalConfig,
+ 'image': LanguageBindImageConfig,
+ 'video': LanguageBindVideoConfig,
+ 'depth': LanguageBindDepthConfig,
+ 'audio': LanguageBindAudioConfig
+}
+model_dict = {
+ 'thermal': LanguageBindThermal,
+ 'image': LanguageBindImage,
+ 'video': LanguageBindVideo,
+ 'depth': LanguageBindDepth,
+ 'audio': LanguageBindAudio
+}
+transform_dict = {
+ 'video': LanguageBindVideoProcessor,
+ 'audio': LanguageBindAudioProcessor,
+ 'depth': LanguageBindDepthProcessor,
+ 'thermal': LanguageBindThermalProcessor,
+ 'image': LanguageBindImageProcessor,
+}
+
+class LanguageBind(nn.Module):
+ def __init__(self, clip_type=('thermal', 'image', 'video', 'depth', 'audio'), use_temp=True, cache_dir='./cache_dir'):
+ super(LanguageBind, self).__init__()
+ self.use_temp = use_temp
+ self.modality_encoder = {}
+ self.modality_proj = {}
+ self.modality_scale = {}
+ self.modality_config = {}
+ for c in clip_type:
+ pretrained_ckpt = f'LanguageBind/LanguageBind_{c.capitalize()}'
+ model = model_dict[c].from_pretrained(pretrained_ckpt, cache_dir=cache_dir)
+ self.modality_encoder[c] = model.vision_model
+ self.modality_proj[c] = model.visual_projection
+ self.modality_scale[c] = model.logit_scale
+ self.modality_config[c] = model.config
+ self.modality_encoder['language'] = model.text_model
+ self.modality_proj['language'] = model.text_projection
+
+ self.modality_encoder = nn.ModuleDict(self.modality_encoder)
+ self.modality_proj = nn.ModuleDict(self.modality_proj)
+
+ def forward(self, inputs):
+ outputs = {}
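+        # For each modality: encode, take the pooled output, project to the shared space, L2-normalize,
+        # and (for non-language inputs) scale by the learned logit temperature.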
+ for key, value in inputs.items():
+ value = self.modality_encoder[key](**value)[1]
+ value = self.modality_proj[key](value)
+ value = value / value.norm(p=2, dim=-1, keepdim=True)
+ if self.use_temp:
+ if key != 'language':
+ value = value * self.modality_scale[key].exp()
+ outputs[key] = value
+ return outputs
+
+def to_device(x, device):
+ out_dict = {k: v.to(device) for k, v in x.items()}
+ return out_dict
+
+
+
+
+class LanguageBindImageTower(nn.Module):
+ def __init__(self, image_tower, args, delay_load=False, cache_dir='./cache_dir'):
+ super().__init__()
+ # import pdb; pdb.set_trace()
+ self.is_loaded = False
+
+ self.image_tower_name = image_tower
+ self.select_layer = args.mm_vision_select_layer
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+ self.cache_dir = cache_dir
+
+ if not delay_load:
+ self.load_model()
+ else:
+ # import pdb; pdb.set_trace()
+ self.cfg_only = LanguageBindImageConfig.from_pretrained(self.image_tower_name, cache_dir=self.cache_dir)
+
+ ############################################################
+ def load_model(self):
+ model = LanguageBindImage.from_pretrained(self.image_tower_name, cache_dir=self.cache_dir)
+ self.image_tower = model.vision_model
+ self.image_tower.requires_grad_(False)
+
+ self.image_processor = LanguageBindImageProcessor(model.config)
+
+ self.is_loaded = True
+
+ def feature_select(self, image_forward_outs):
+ image_features = image_forward_outs.hidden_states[self.select_layer]
+ if self.select_feature == 'patch':
+ image_features = image_features[:, 1:]
+ elif self.select_feature == 'cls_patch':
+ image_features = image_features
+ else:
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
+ return image_features
+
+ @torch.no_grad()
+ def forward(self, images):
+ if type(images) is list:
+ image_features = []
+ for image in images:
+ image_forward_out = self.image_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
+ image_features.append(image_feature)
+ else:
+ # print('images', images.shape)
+ image_forward_outs = self.image_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+ # print('image_forward_outs', len(image_forward_outs), image_forward_outs[0].shape)
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
+ # print('image_features', image_features.shape)
+
+ return image_features
+
+ @property
+ def dummy_feature(self):
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+ @property
+ def dtype(self):
+ return self.image_tower.embeddings.class_embedding.dtype #############
+
+ @property
+ def device(self):
+ return self.image_tower.embeddings.class_embedding.device ##############
+
+ @property
+ def config(self):
+ if self.is_loaded:
+ return self.image_tower.config
+ else:
+ return self.cfg_only
+
+ @property
+ def hidden_size(self):
+ return self.config.hidden_size
+
+ @property
+ def num_patches(self):
+ return (self.config.image_size // self.config.patch_size) ** 2
+
+class temp_model(nn.Module):
+ def __init__(self):
+ super(temp_model, self).__init__()
+ def forward(self, **kwargs):
+ return torch.randn(25, 1, 256, 1024)
+
+
+class LanguageBindVideoTower(nn.Module):
+ def __init__(self, video_tower, args, delay_load=False, cache_dir='./cache_dir'):
+ super().__init__()
+
+ self.is_loaded = False
+
+ self.video_tower_name = video_tower
+ self.select_layer = args.mm_vision_select_layer
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+ self.cache_dir = cache_dir
+
+ if not delay_load:
+ self.load_model()
+ else:
+ self.cfg_only = LanguageBindVideoConfig.from_pretrained(self.video_tower_name, cache_dir=self.cache_dir)
+
+        ## With delay_load, self.is_loaded remains False after from_pretrained until load_model() is called.
+ # import pdb; pdb.set_trace()
+
+ ############################################################
+ def load_model(self):
+ model = LanguageBindVideo.from_pretrained(self.video_tower_name, cache_dir=self.cache_dir)
+ self.video_processor = LanguageBindVideoProcessor(model.config)
+
+
+ # model = LanguageBindImage.from_pretrained('LanguageBind/LanguageBind_Image', cache_dir=self.cache_dir)
+ self.video_tower = model.vision_model
+ self.video_tower.requires_grad_(False)
+
+
+ self.is_loaded = True
+
+ def feature_select(self, video_forward_outs):
+ # print('len(video_forward_outs.hidden_states)', len(video_forward_outs.hidden_states))
+ video_features = video_forward_outs.hidden_states[self.select_layer] # b t n c
+ b, t, n, c = video_features.shape
+ # print('video_features', video_features.shape)
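+        # 'patch': drop each frame's CLS token, then flatten (frames x patches) into a single token sequence per video.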
+ if self.select_feature == 'patch':
+ # video_features = video_features[:, 1:]
+ video_features = video_features[:, :, 1:]
+ video_features = video_features.reshape(b, -1, c)
+ elif self.select_feature == 'cls_patch':
+ # video_features = video_features
+ video_features = video_features.reshape(b, -1, c)
+ else:
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
+ return video_features
+
+ @torch.no_grad()
+ def forward(self, videos):
+ # import pdb; pdb.set_trace()
+ if type(videos) is list:
+ video_features = []
+ for video in videos:
+ video_forward_out = self.video_tower(video.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+ video_feature = self.feature_select(video_forward_out).to(video.dtype)
+ video_features.append(video_feature)
+ else:
+ # print(11111111111, videos.shape)
+ video_forward_outs = self.video_tower(videos.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+ video_features = self.feature_select(video_forward_outs).to(videos.dtype)
+
+ return video_features
+
+ @property
+ def dummy_feature(self):
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+ @property
+ def dtype(self):
+ return self.video_tower.embeddings.class_embedding.dtype #############
+ # return torch.randn(1).cuda().dtype
+
+ @property
+ def device(self):
+ return self.video_tower.embeddings.class_embedding.device ##############
+ # return torch.randn(1).cuda().device
+
+ @property
+ def config(self):
+ if self.is_loaded:
+ return self.video_tower.config
+ else:
+ return self.cfg_only
+
+ @property
+ def hidden_size(self):
+ return self.config.hidden_size
+
+ @property
+ def num_patches(self):
+ return (self.config.image_size // self.config.patch_size) ** 2
diff --git a/models/multimodal_encoder/languagebind/audio/configuration_audio.py b/models/multimodal_encoder/languagebind/audio/configuration_audio.py
new file mode 100755
index 0000000..865a496
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/audio/configuration_audio.py
@@ -0,0 +1,430 @@
+import copy
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+
+
+
+
+
+class CLIPTextConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
+ text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the text encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 49408):
+ Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`CLIPModel`].
+ hidden_size (`int`, *optional*, defaults to 512):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 2048):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ max_position_embeddings (`int`, *optional*, defaults to 77):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPTextConfig, CLIPTextModel
+
+ >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPTextConfig()
+
+ >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPTextModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+ model_type = "clip_text_model"
+
+ def __init__(
+ self,
+ vocab_size=49408,
+ hidden_size=512,
+ intermediate_size=2048,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=8,
+ max_position_embeddings=77,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+ # This differs from `CLIPTokenizer`'s default and from openai/clip
+ # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
+ pad_token_id=1,
+ bos_token_id=49406,
+ eos_token_id=49407,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.max_position_embeddings = max_position_embeddings
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+        self.add_time_attn = False  # the text tower never uses temporal attention
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the text config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["text_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+
+
+class CLIPVisionConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
+ CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 32):
+ The size (resolution) of each patch.
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPVisionConfig, CLIPVisionModel
+
+ >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPVisionConfig()
+
+ >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPVisionModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "clip_vision_model"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ intermediate_size=3072,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ num_channels=3,
+ image_size=224,
+ patch_size=32,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+
+        # LanguageBind-specific arguments (not in the vanilla CLIPVisionConfig)
+        add_time_attn=False,
+        num_frames=1,
+        force_patch_dropout=0.0,
+        lora_r=2,
+        lora_alpha=16,
+        lora_dropout=0.0,
+        num_mel_bins=0.0,
+        target_length=0.0,
+        video_decode_backend='decord',
+ audio_sample_rate=16000,
+ audio_mean=0.5,
+ audio_std=0.5,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_channels = num_channels
+ self.patch_size = patch_size
+ self.image_size = image_size
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+
+        # LanguageBind-specific fields
+        self.add_time_attn = add_time_attn
+        self.num_frames = num_frames
+        self.force_patch_dropout = force_patch_dropout
+        self.lora_r = lora_r
+        self.lora_alpha = lora_alpha
+        self.lora_dropout = lora_dropout
+        self.num_mel_bins = num_mel_bins
+        self.target_length = target_length
+        self.video_decode_backend = video_decode_backend
+
+ self.audio_sample_rate = audio_sample_rate
+ self.audio_mean = audio_mean
+ self.audio_std = audio_std
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the vision config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["vision_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+class LanguageBindAudioConfig(PretrainedConfig):
+ r"""
+ [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
+ a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
+ a configuration with the defaults will yield a similar configuration to that of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ text_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPTextConfig`].
+ vision_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
+ projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
+ logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
+ kwargs (*optional*):
+ Dictionary of keyword arguments.
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPConfig, CLIPModel
+
+ >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPConfig()
+
+ >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+
+ >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
+ >>> from transformers import CLIPTextConfig, CLIPVisionConfig
+
+ >>> # Initializing a CLIPText and CLIPVision configuration
+ >>> config_text = CLIPTextConfig()
+ >>> config_vision = CLIPVisionConfig()
+
+ >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
+ ```"""
+
+ model_type = "LanguageBindAudio"
+ is_composition = True
+
+ def __init__(
+ self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
+ ):
+ # If `_config_dict` exist, we use them for the backward compatibility.
+ # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
+ # of confusion!).
+ text_config_dict = kwargs.pop("text_config_dict", None)
+ vision_config_dict = kwargs.pop("vision_config_dict", None)
+
+ super().__init__(**kwargs)
+
+ # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
+ # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
+ # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
+ if text_config_dict is not None:
+ if text_config is None:
+ text_config = {}
+
+ # This is the complete result when using `text_config_dict`.
+ _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict()
+
+            # Warn if a key exists in both `_text_config_dict` and `text_config` with different values.
+ for key, value in _text_config_dict.items():
+ if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
+ # If specified in `text_config_dict`
+ if key in text_config_dict:
+ message = (
+ f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
+ f'The value `text_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
+                            f'value `text_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `text_config` with the ones in `_text_config_dict`.
+ text_config.update(_text_config_dict)
+
+ if vision_config_dict is not None:
+ if vision_config is None:
+ vision_config = {}
+
+ # This is the complete result when using `vision_config_dict`.
+ _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict()
+ # convert keys to string instead of integer
+ if "id2label" in _vision_config_dict:
+ _vision_config_dict["id2label"] = {
+ str(key): value for key, value in _vision_config_dict["id2label"].items()
+ }
+
+            # Warn if a key exists in both `_vision_config_dict` and `vision_config` with different values.
+ for key, value in _vision_config_dict.items():
+ if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
+ # If specified in `vision_config_dict`
+ if key in vision_config_dict:
+ message = (
+ f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
+ f'values. The value `vision_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
+                            f'The value `vision_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `vision_config` with the ones in `_vision_config_dict`.
+ vision_config.update(_vision_config_dict)
+
+ if text_config is None:
+ text_config = {}
+ logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")
+
+ if vision_config is None:
+ vision_config = {}
+ logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.")
+
+ self.text_config = CLIPTextConfig(**text_config)
+ self.vision_config = CLIPVisionConfig(**vision_config)
+
+ self.projection_dim = projection_dim
+ self.logit_scale_init_value = logit_scale_init_value
+ self.initializer_factor = 1.0
+
+ @classmethod
+ def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
+ r"""
+ Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
+ configuration.
+
+ Returns:
+ [`CLIPConfig`]: An instance of a configuration object
+ """
+
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
+
+ def to_dict(self):
+ """
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+ Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
+ """
+ output = copy.deepcopy(self.__dict__)
+ output["text_config"] = self.text_config.to_dict()
+ output["vision_config"] = self.vision_config.to_dict()
+ output["model_type"] = self.__class__.model_type
+ return output
+
+
+
+
+
+
+
+
+
+
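+if __name__ == "__main__":
+    # Minimal composition sketch (illustrative only; the numeric values below
+    # are placeholders, not the released LanguageBind-Audio settings).
+    text_cfg = CLIPTextConfig()
+    vision_cfg = CLIPVisionConfig(
+        add_time_attn=True,    # enable the temporal-attention branch
+        num_frames=3,
+        num_mel_bins=128,      # placeholder spectrogram height
+        target_length=1024,    # placeholder spectrogram width
+    )
+    cfg = LanguageBindAudioConfig.from_text_vision_configs(text_cfg, vision_cfg)
+    print(cfg.model_type, cfg.vision_config.num_mel_bins)  # LanguageBindAudio 128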
diff --git a/models/multimodal_encoder/languagebind/audio/modeling_audio.py b/models/multimodal_encoder/languagebind/audio/modeling_audio.py
new file mode 100755
index 0000000..908ab43
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/audio/modeling_audio.py
@@ -0,0 +1,1030 @@
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+from einops import rearrange
+from peft import LoraConfig, get_peft_model
+from torch import nn
+from torch.nn import functional as F
+from transformers import PreTrainedModel, add_start_docstrings
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from transformers.models.clip.modeling_clip import CLIPMLP, CLIPAttention, CLIPTextEmbeddings, CLIPVisionEmbeddings, \
+ CLIPVisionModelWithProjection, CLIPTextModelWithProjection, _expand_mask, CLIPOutput, clip_loss
+from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings
+
+from .configuration_audio import LanguageBindAudioConfig, CLIPVisionConfig, CLIPTextConfig
+
+
+
+class PatchDropout(nn.Module):
+ """
+ https://arxiv.org/abs/2212.00794
+ """
+
+ def __init__(self, prob, exclude_first_token=True):
+ super().__init__()
+ assert 0 <= prob < 1.
+ self.prob = prob
+ self.exclude_first_token = exclude_first_token # exclude CLS token
+
+ def forward(self, x, B, T):
+ if not self.training or self.prob == 0.:
+ return x
+
+ if self.exclude_first_token:
+ cls_tokens, x = x[:, :1], x[:, 1:]
+ else:
+ cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
+
+ batch = x.size()[0]
+ num_tokens = x.size()[1]
+
+ batch_indices = torch.arange(batch)
+ batch_indices = batch_indices[..., None]
+
+ keep_prob = 1 - self.prob
+ num_patches_keep = max(1, int(num_tokens * keep_prob))
+
+ if T == 1:
+ rand = torch.randn(batch, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ else:
+ rand = torch.randn(B, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ patch_indices_keep = patch_indices_keep.unsqueeze(1).repeat(1, T, 1)
+ patch_indices_keep = rearrange(patch_indices_keep, 'b t n -> (b t) n')
+
+
+ x = x[batch_indices, patch_indices_keep]
+
+ if self.exclude_first_token:
+ x = torch.cat((cls_tokens, x), dim=1)
+
+ return x
+
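+# Illustrative shape check for PatchDropout (not part of the original file):
+# with prob=0.5 in training mode, roughly half of the non-CLS tokens are kept:
+#
+#   pd = PatchDropout(prob=0.5); pd.train()
+#   out = pd(torch.randn(4, 257, 768), B=4, T=1)
+#   # out.shape == (4, 129, 768): the CLS token + 128 of the 256 patch tokens
+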
+class CLIPEncoderLayer(nn.Module):
+ def __init__(self, config: LanguageBindAudioConfig):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.self_attn = CLIPAttention(config)
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.mlp = CLIPMLP(config)
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ self.add_time_attn = config.add_time_attn
+ if self.add_time_attn:
+ self.t = config.num_frames
+ self.temporal_embedding = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
+ nn.init.normal_(self.temporal_embedding, std=config.hidden_size ** -0.5)
+
+ self.embed_dim = config.hidden_size
+ self.temporal_attn = CLIPAttention(config)
+ self.temporal_layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.temporal_mlp = CLIPMLP(config)
+ self.temporal_layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ causal_attention_mask: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+
+
+ if self.add_time_attn:
+ bt, n, d = hidden_states.shape
+ t = self.t
+
+ # time embed
+ if t != 1:
+ n = hidden_states.shape[1]
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ hidden_states = hidden_states + self.temporal_embedding[:, :t, :]
+ hidden_states = rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # time attn
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm1(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.temporal_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm2(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm2(hidden_states)
+ hidden_states = self.temporal_mlp(hidden_states)
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # spatial attn
+ residual = hidden_states
+
+ hidden_states = self.layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+ hidden_states = self.layer_norm2(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
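+# Note on the temporal branch above (illustrative, not part of the original
+# file): when add_time_attn is enabled the tokens arrive flattened as
+# (batch*frames, patches, dim). The layer rearranges them to
+# (batch*patches, frames, dim) so `temporal_attn` attends across the T frames
+# of each spatial position, rearranges back, and only then applies the usual
+# spatial self-attention over the patch tokens of each frame.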
+
+
+
+
+
+
+
+
+class CLIPPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = LanguageBindAudioConfig
+ base_model_prefix = "clip"
+ supports_gradient_checkpointing = True
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ factor = self.config.initializer_factor
+ if isinstance(module, CLIPTextEmbeddings):
+ module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ elif isinstance(module, CLIPVisionEmbeddings):
+ factor = self.config.initializer_factor
+ nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+ nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+ nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+ elif isinstance(module, CLIPAttention):
+ factor = self.config.initializer_factor
+ in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ out_proj_std = (module.embed_dim**-0.5) * factor
+ nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+ elif isinstance(module, CLIPMLP):
+ factor = self.config.initializer_factor
+ in_proj_std = (
+ (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ )
+ fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+ nn.init.normal_(module.fc1.weight, std=fc_std)
+ nn.init.normal_(module.fc2.weight, std=in_proj_std)
+ elif isinstance(module, LanguageBindAudio):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPVisionModelWithProjection):
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPTextModelWithProjection):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+
+ if isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, CLIPEncoder):
+ module.gradient_checkpointing = value
+
+
+CLIP_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CLIP_TEXT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ return_loss (`bool`, *optional*):
+ Whether or not to return the contrastive loss.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class CLIPEncoder(nn.Module):
+ """
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+ [`CLIPEncoderLayer`].
+
+ Args:
+ config: CLIPConfig
+ """
+
+ def __init__(self, config: LanguageBindAudioConfig):
+ super().__init__()
+ self.config = config
+ self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ inputs_embeds,
+ attention_mask: Optional[torch.Tensor] = None,
+ causal_attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ hidden_states = inputs_embeds
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(encoder_layer),
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for bi-directional self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
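+# Illustrative note (not part of the original file): for an input shape of
+# (1, 3) and dtype float32, _make_causal_mask returns a (1, 1, 3, 3) tensor
+# with 0 on and below the diagonal and torch.finfo(torch.float32).min above
+# it, so token i may only attend to positions <= i.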
+
+class CLIPTextTransformer(nn.Module):
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+ self.embeddings = CLIPTextEmbeddings(config)
+ self.encoder = CLIPEncoder(config)
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is None:
+ raise ValueError("You have to specify input_ids")
+
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+ # CLIP's text model uses causal mask, prepare it here.
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+ causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
+ # expand attention_mask
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+ # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
+ # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+ pooled_output = last_hidden_state[
+ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+ input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
+ ]
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """The text model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPTextModel(CLIPPreTrainedModel):
+ config_class = CLIPTextConfig
+
+ _no_split_modules = ["CLIPEncoderLayer"]
+
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__(config)
+ self.text_model = CLIPTextTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.text_model.embeddings.token_embedding
+
+ def set_input_embeddings(self, value):
+ self.text_model.embeddings.token_embedding = value
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPTextModel
+
+ >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+class CLIPVisionTransformer(nn.Module):
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+
+ self.embeddings = CLIPVisionEmbeddings(config)
+ self.patch_dropout = PatchDropout(config.force_patch_dropout)
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+ self.encoder = CLIPEncoder(config)
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+        # Flatten any extra leading dimensions so every frame / mel chunk is
+        # encoded as an independent (C, H, W) image; B and T are kept to
+        # restore the temporal grouping after encoding.
+        if len(pixel_values.shape) == 7:
+            b_new, pair_new, T, bs_new, channel_new, h_new, w_new = pixel_values.shape
+            B = b_new * pair_new * bs_new
+            pixel_values = pixel_values.reshape(B * T, channel_new, h_new, w_new)
+        elif len(pixel_values.shape) == 5:
+            B, _, T, _, _ = pixel_values.shape
+            pixel_values = rearrange(pixel_values, 'b c t h w -> (b t) c h w')
+        else:
+            B, _, _, _ = pixel_values.shape
+            T = 1
+ hidden_states = self.embeddings(pixel_values)
+
+        hidden_states = self.patch_dropout(hidden_states, B, T)  # optional PatchDropout over the patch tokens
+
+ hidden_states = self.pre_layrnorm(hidden_states)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ pooled_output = last_hidden_state[:, 0, :]
+ pooled_output = self.post_layernorm(pooled_output)
+
+        pooled_output = pooled_output.reshape(B, T, -1).mean(1)  # average the pooled CLS features over the T frames / chunks
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """The vision model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPVisionModel(CLIPPreTrainedModel):
+ config_class = CLIPVisionConfig
+ main_input_name = "pixel_values"
+
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__(config)
+ self.vision_model = CLIPVisionTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.vision_model.embeddings.patch_embedding
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPVisionModel
+
+ >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled CLS states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+@add_start_docstrings(CLIP_START_DOCSTRING)
+class LanguageBindAudio(CLIPPreTrainedModel):
+ config_class = LanguageBindAudioConfig
+
+ def __init__(self, config: LanguageBindAudioConfig):
+ super().__init__(config)
+
+ if not isinstance(config.text_config, CLIPTextConfig):
+ raise ValueError(
+ "config.text_config is expected to be of type CLIPTextConfig but is of type"
+ f" {type(config.text_config)}."
+ )
+
+ if not isinstance(config.vision_config, CLIPVisionConfig):
+ raise ValueError(
+ "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+ f" {type(config.vision_config)}."
+ )
+
+ text_config = config.text_config
+ vision_config = config.vision_config
+ self.add_time_attn = vision_config.add_time_attn
+ self.lora_r = vision_config.lora_r
+ self.lora_alpha = vision_config.lora_alpha
+ self.lora_dropout = vision_config.lora_dropout
+
+ self.projection_dim = config.projection_dim
+ self.text_embed_dim = text_config.hidden_size
+ self.vision_embed_dim = vision_config.hidden_size
+
+ self.text_model = CLIPTextTransformer(text_config)
+ self.vision_model = CLIPVisionTransformer(vision_config)
+
+ self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+ self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+ self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+ # Initialize weights and apply final processing
+ self.post_init()
+ self.convert_to_lora()
+ self.resize_pos(self.vision_model.embeddings, vision_config)
+
+ def convert_to_lora(self):
+ if self.lora_r == 0:
+ return
+ if self.add_time_attn:
+ target_modules = ["temporal_attn.k_proj", "temporal_attn.v_proj",
+ "temporal_attn.q_proj", "temporal_attn.out_proj",
+ "temporal_mlp.fc1", "temporal_mlp.fc2"]
+ else:
+ target_modules = ["k_proj", "v_proj", "q_proj", "out_proj"]
+ config = LoraConfig(
+ r=self.lora_r, # 16
+ lora_alpha=self.lora_alpha, # 16
+ target_modules=target_modules, # self_attn.out_proj
+ lora_dropout=self.lora_dropout, # 0.1
+ bias="none",
+ modules_to_save=[],
+ )
+ self.vision_model.encoder.is_gradient_checkpointing = False
+ self.vision_model.encoder = get_peft_model(self.vision_model.encoder, config)
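+        # Illustrative note (not part of the original file): with the defaults
+        # (lora_r=2, lora_alpha=16) and add_time_attn=True, LoRA adapters are
+        # attached only to the temporal attention / temporal MLP projections;
+        # the returned PeftModel keeps the pretrained CLIP weights frozen, so
+        # only the low-rank A/B matrices remain trainable.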
+
+ def resize_pos(self, m, vision_config):
+ # convert embedding
+ if vision_config.num_mel_bins!=0 and vision_config.target_length!=0:
+ m.image_size = [vision_config.num_mel_bins, vision_config.target_length]
+ m.config.image_size = [m.image_size, m.image_size] if isinstance(m.image_size, int) else m.image_size
+ # pos resize
+ old_pos_embed_state_dict = m.position_embedding.state_dict()
+ old_pos_embed = old_pos_embed_state_dict['weight']
+ dtype = old_pos_embed.dtype
+ grid_size = [m.config.image_size[0] // m.patch_size, m.config.image_size[1] // m.patch_size]
+ extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
+ new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
+ if new_seq_len == old_pos_embed.shape[0]:
+ # m.to(args.device)
+ return
+
+ m.num_patches = grid_size[0] * grid_size[1]
+ m.num_positions = m.num_patches + 1
+ m.register_buffer("position_ids", torch.arange(m.num_positions).expand((1, -1)))
+ new_position_embedding = nn.Embedding(m.num_positions, m.embed_dim)
+
+ if extra_tokens:
+ pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
+ else:
+ pos_emb_tok, pos_emb_img = None, old_pos_embed
+ old_grid_size = [int(math.sqrt(len(pos_emb_img)))] * 2
+
+ pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
+ pos_emb_img = F.interpolate(
+ pos_emb_img,
+ size=grid_size,
+ mode='bicubic',
+ antialias=True,
+ align_corners=False,
+ )
+ pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
+ if pos_emb_tok is not None:
+ new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
+ else:
+ new_pos_embed = pos_emb_img
+ old_pos_embed_state_dict['weight'] = new_pos_embed.to(dtype)
+ m.position_embedding = new_position_embedding
+ m.position_embedding.load_state_dict(old_pos_embed_state_dict)
+
+ # m.to(args.device)
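+
+        # Illustrative arithmetic (not part of the original file), using the
+        # placeholder values num_mel_bins=128, target_length=1024 and the
+        # default patch_size=32: the pretrained (7, 7) grid (49 patches + CLS
+        # = 50 positions) is bicubically interpolated to a (4, 32) grid
+        # (128 patches + CLS = 129 positions).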
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ def get_text_features(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+ >>> text_features = model.get_text_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = text_outputs[1]
+ text_features = self.text_projection(pooled_output)
+
+ return text_features
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ def get_image_features(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPVisionModel`].
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> image_features = model.get_image_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = vision_outputs[1] # pooled_output
+ image_features = self.visual_projection(pooled_output)
+
+ return image_features
+
+ @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CLIPOutput, config_class=LanguageBindAudioConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ return_loss: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CLIPOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+ ... )
+
+ >>> outputs = model(**inputs)
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+ >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ image_embeds = vision_outputs[1]
+ image_embeds = self.visual_projection(image_embeds)
+
+ text_embeds = text_outputs[1]
+ text_embeds = self.text_projection(text_embeds)
+
+ # normalized features
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+ # cosine similarity as logits
+ logit_scale = self.logit_scale.exp()
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+ logits_per_image = logits_per_text.t()
+
+ loss = None
+ if return_loss:
+ loss = clip_loss(logits_per_text)
+
+ if not return_dict:
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+ return ((loss,) + output) if loss is not None else output
+
+ return CLIPOutput(
+ loss=loss,
+ logits_per_image=logits_per_image,
+ logits_per_text=logits_per_text,
+ text_embeds=text_embeds,
+ image_embeds=image_embeds,
+ text_model_output=text_outputs,
+ vision_model_output=vision_outputs,
+ )
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/audio/processing_audio.py b/models/multimodal_encoder/languagebind/audio/processing_audio.py
new file mode 100755
index 0000000..8c9baec
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/audio/processing_audio.py
@@ -0,0 +1,190 @@
+import cv2
+import numpy as np
+import torch
+try:
+    # torchaudio is needed by the audio branch below (waveform loading + kaldi fbank);
+    # guard the import so this module stays importable when only video/image are used.
+    import torchaudio
+except ImportError:
+    torchaudio = None
+from torchvision import transforms
+from transformers import ProcessorMixin, BatchEncoding
+from transformers.image_processing_utils import BatchFeature
+from torch.nn import functional as F
+
+
+def make_list_of_images(x):
+ if not isinstance(x, list):
+ return [x]
+ return x
+
+
+#torchaudio.set_audio_backend("soundfile")
+
+def torchaudio_loader(path):
+ return torchaudio.load(path)
+
+def int16_to_float32_torch(x):
+ return (x / 32767.0).type(torch.float32)
+
+def float32_to_int16_torch(x):
+ x = torch.clamp(x, min=-1., max=1.)
+ return (x * 32767.).type(torch.int16)
+
+DEFAULT_AUDIO_FRAME_SHIFT_MS = 10
+
+class AudioTransform:
+ def __init__(self, config):
+ self.sample_rate = config.audio_sample_rate
+ self.num_mel_bins = config.num_mel_bins
+ self.target_length = config.target_length
+ self.audio_mean = config.audio_mean
+ self.audio_std = config.audio_std
+ # mean=-4.2677393
+ # std=4.5689974
+ self.norm = transforms.Normalize(mean=self.audio_mean, std=self.audio_std)
+
+ def __call__(self, audio_data_and_origin_sr):
+ audio_data, origin_sr = audio_data_and_origin_sr
+ if self.sample_rate != origin_sr:
+ # print(audio_data.shape, origin_sr)
+ audio_data = torchaudio.functional.resample(audio_data, orig_freq=origin_sr, new_freq=self.sample_rate)
+ waveform_melspec = self.waveform2melspec(audio_data[0])
+ return self.norm(waveform_melspec)
+
+ def waveform2melspec(self, audio_data):
+ max_len = self.target_length * self.sample_rate // 100
+ if audio_data.shape[-1] > max_len:
+ mel = self.get_mel(audio_data)
+ # split to three parts
+ chunk_frames = self.target_length
+ total_frames = mel.shape[0]
+ ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3)
+ # print('total_frames-chunk_frames:', total_frames-chunk_frames,
+ # 'len(audio_data):', len(audio_data),
+ # 'chunk_frames:', chunk_frames,
+ # 'total_frames:', total_frames)
+ if len(ranges[1]) == 0: # if the audio is too short, we just use the first chunk
+ ranges[1] = [0]
+ if len(ranges[2]) == 0: # if the audio is too short, we just use the first chunk
+ ranges[2] = [0]
+ # randomly choose index for each part
+ # idx_front = np.random.choice(ranges[0])
+ # idx_middle = np.random.choice(ranges[1])
+ # idx_back = np.random.choice(ranges[2])
+ idx_front = ranges[0][0] # fixed
+ idx_middle = ranges[1][0]
+ idx_back = ranges[2][0]
+ # select mel
+ mel_chunk_front = mel[idx_front:idx_front + chunk_frames, :]
+ mel_chunk_middle = mel[idx_middle:idx_middle + chunk_frames, :]
+ mel_chunk_back = mel[idx_back:idx_back + chunk_frames, :]
+ # stack
+ mel_fusion = torch.stack([mel_chunk_front, mel_chunk_middle, mel_chunk_back], dim=0)
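+            # mel_fusion: (3, chunk_frames, num_mel_bins), one window taken from each third of the long clip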
+ elif audio_data.shape[-1] < max_len: # padding if too short
+ n_repeat = int(max_len / len(audio_data))
+ audio_data = audio_data.repeat(n_repeat)
+ audio_data = F.pad(
+ audio_data,
+ (0, max_len - len(audio_data)),
+ mode="constant",
+ value=0,
+ )
+ mel = self.get_mel(audio_data)
+ mel_fusion = torch.stack([mel, mel, mel], dim=0)
+ else: # if equal
+ mel = self.get_mel(audio_data)
+ mel_fusion = torch.stack([mel, mel, mel], dim=0)
+
+ # twice check
+ p = self.target_length - mel_fusion.shape[1]
+
+ # if abs(p) / self.target_length > 0.2:
+ # logging.warning(
+ # "Large gap between audio n_frames(%d) and "
+ # "target_length (%d). Is the audio_target_length "
+ # "setting correct?",
+ # mel_fusion.shape[1],
+ # self.target_length,
+ # )
+
+ # cut and pad
+ if p > 0:
+ m = torch.nn.ZeroPad2d((0, 0, 0, p))
+ mel_fusion = m(mel_fusion)
+ elif p < 0:
+ mel_fusion = mel_fusion[:, 0: self.target_length, :]
+
+ mel_fusion = mel_fusion.transpose(1, 2) # [3, target_length, mel_bins] -> [3, mel_bins, target_length]
+ return mel_fusion
+
+ def get_mel(self, audio_data):
+        # kaldi fbank returns (num_frames, num_mel_bins), one row per 25 ms window
+ audio_data -= audio_data.mean()
+ mel = torchaudio.compliance.kaldi.fbank(
+ audio_data.unsqueeze(0),
+ htk_compat=True,
+ sample_frequency=self.sample_rate,
+ use_energy=False,
+ window_type="hanning",
+ num_mel_bins=self.num_mel_bins,
+ dither=0.0,
+ frame_length=25,
+ frame_shift=DEFAULT_AUDIO_FRAME_SHIFT_MS,
+ )
+ return mel # (T, n_mels)
+
+def get_audio_transform(config):
+ config = config.vision_config
+ return AudioTransform(config)
+
+
+def load_and_transform_audio(
+ audio_path,
+ transform,
+):
+ waveform_and_sr = torchaudio_loader(audio_path)
+ audio_outputs = transform(waveform_and_sr)
+
+ return audio_outputs
+
+class LanguageBindAudioProcessor(ProcessorMixin):
+ attributes = []
+ tokenizer_class = ("LanguageBindAudioTokenizer")
+
+ def __init__(self, config, tokenizer=None, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.transform = get_audio_transform(config)
+ self.image_processor = load_and_transform_audio
+ self.tokenizer = tokenizer
+
+ def __call__(self, images=None, text=None, context_length=77, return_tensors=None, **kwargs):
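+        # NOTE: despite the CLIP-style naming, `images` is expected to be an audio path (or a list of paths);
+        # each file is loaded with torchaudio and converted to a mel-spectrogram tensor returned as "pixel_values".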
+ if text is None and images is None:
+ raise ValueError("You have to specify either text or images. Both cannot be none.")
+
+ if text is not None:
+ encoding = self.tokenizer(text, max_length=context_length, padding='max_length',
+ truncation=True, return_tensors=return_tensors, **kwargs)
+
+ if images is not None:
+ images = make_list_of_images(images)
+ image_features = [self.image_processor(image, self.transform) for image in images]
+ image_features = torch.stack(image_features)
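+            # image_features: (num_audios, 3, num_mel_bins, target_length) mel-spectrogram "pixel_values"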
+
+ if text is not None and images is not None:
+ encoding["pixel_values"] = image_features
+ return encoding
+ elif text is not None:
+ return encoding
+ else:
+ return {"pixel_values": image_features}
+
+ def batch_decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+ refer to the docstring of this method for more information.
+ """
+ return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
+
+ def decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+ the docstring of this method for more information.
+ """
+ return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
diff --git a/models/multimodal_encoder/languagebind/audio/tokenization_audio.py b/models/multimodal_encoder/languagebind/audio/tokenization_audio.py
new file mode 100755
index 0000000..6bc40be
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/audio/tokenization_audio.py
@@ -0,0 +1,77 @@
+from transformers import CLIPTokenizer
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+ "vocab_file": "vocab.json",
+ "merges_file": "merges.txt",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+ "vocab_file": {
+ "lb203/LanguageBind-Audio": "https://huggingface.co/lb203/LanguageBind-Audio/resolve/main/vocab.json",
+ },
+ "merges_file": {
+ "lb203/LanguageBind-Audio": "https://huggingface.co/lb203/LanguageBind-Audio/resolve/main/merges.txt",
+ },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "lb203/LanguageBind-Audio": 77,
+}
+
+
+PRETRAINED_INIT_CONFIGURATION = {
+ "lb203/LanguageBind-Audio": {},
+}
+
+class LanguageBindAudioTokenizer(CLIPTokenizer):
+ """
+ Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The end of sequence token.
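+        pad_token (`str`, *optional*, defaults to `<|endoftext|>`):
+            The token used for padding; CLIP has no dedicated pad token, so `<|endoftext|>` is reused.
+
+    Example (illustrative sketch; assumes the tokenizer files referenced above under `lb203/LanguageBind-Audio` are reachable):
+
+    ```python
+    >>> tokenizer = LanguageBindAudioTokenizer.from_pretrained("lb203/LanguageBind-Audio")
+    >>> inputs = tokenizer(["a dog barking"], max_length=77, padding="max_length", truncation=True, return_tensors="pt")
+    ```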
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ merges_file,
+ errors="replace",
+ unk_token="<|endoftext|>",
+ bos_token="<|startoftext|>",
+ eos_token="<|endoftext|>",
+ pad_token="<|endoftext|>", # hack to enable padding
+ **kwargs,
+ ):
+ super(LanguageBindAudioTokenizer, self).__init__(
+ vocab_file,
+ merges_file,
+ errors,
+ unk_token,
+ bos_token,
+ eos_token,
+ pad_token, # hack to enable padding
+ **kwargs,)
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/depth/configuration_depth.py b/models/multimodal_encoder/languagebind/depth/configuration_depth.py
new file mode 100755
index 0000000..0d3901b
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/depth/configuration_depth.py
@@ -0,0 +1,425 @@
+import copy
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class CLIPTextConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
+ text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the text encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 49408):
+ Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`CLIPModel`].
+ hidden_size (`int`, *optional*, defaults to 512):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 2048):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ max_position_embeddings (`int`, *optional*, defaults to 77):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPTextConfig, CLIPTextModel
+
+ >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPTextConfig()
+
+ >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPTextModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+ model_type = "clip_text_model"
+
+ def __init__(
+ self,
+ vocab_size=49408,
+ hidden_size=512,
+ intermediate_size=2048,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=8,
+ max_position_embeddings=77,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+ # This differs from `CLIPTokenizer`'s default and from openai/clip
+ # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
+ pad_token_id=1,
+ bos_token_id=49406,
+ eos_token_id=49407,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.max_position_embeddings = max_position_embeddings
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+        self.add_time_attn = False  # the text encoder never uses the temporal-attention branch
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the text config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["text_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+
+
+class CLIPVisionConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
+ CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 32):
+ The size (resolution) of each patch.
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPVisionConfig, CLIPVisionModel
+
+ >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPVisionConfig()
+
+ >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPVisionModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "clip_vision_model"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ intermediate_size=3072,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ num_channels=3,
+ image_size=224,
+ patch_size=32,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+
+        # LanguageBind-specific additions on top of the standard CLIP vision config
+        add_time_attn=False,
+        num_frames=1,
+        force_patch_dropout=0.0,
+        lora_r=2,
+        lora_alpha=16,
+        lora_dropout=0.0,
+        num_mel_bins=0.0,
+        target_length=0.0,
+        max_depth=10,
+        video_decode_backend='decord',
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_channels = num_channels
+ self.patch_size = patch_size
+ self.image_size = image_size
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+
+        self.add_time_attn = add_time_attn
+        self.num_frames = num_frames
+        self.force_patch_dropout = force_patch_dropout
+        self.lora_r = lora_r
+        self.lora_alpha = lora_alpha
+        self.lora_dropout = lora_dropout
+        self.num_mel_bins = num_mel_bins
+        self.target_length = target_length
+        self.max_depth = max_depth
+        self.video_decode_backend = video_decode_backend
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the vision config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["vision_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+class LanguageBindDepthConfig(PretrainedConfig):
+ r"""
+ [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
+ a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
+ a configuration with the defaults will yield a similar configuration to that of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ text_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPTextConfig`].
+ vision_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
+ projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
+ kwargs (*optional*):
+ Dictionary of keyword arguments.
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPConfig, CLIPModel
+
+ >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPConfig()
+
+ >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+
+ >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
+ >>> from transformers import CLIPTextConfig, CLIPVisionConfig
+
+ >>> # Initializing a CLIPText and CLIPVision configuration
+ >>> config_text = CLIPTextConfig()
+ >>> config_vision = CLIPVisionConfig()
+
+ >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
+ ```"""
+
+ model_type = "LanguageBindDepth"
+ is_composition = True
+
+ def __init__(
+ self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
+ ):
+ # If `_config_dict` exist, we use them for the backward compatibility.
+ # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
+ # of confusion!).
+ text_config_dict = kwargs.pop("text_config_dict", None)
+ vision_config_dict = kwargs.pop("vision_config_dict", None)
+
+ super().__init__(**kwargs)
+
+ # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
+ # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
+ # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
+ if text_config_dict is not None:
+ if text_config is None:
+ text_config = {}
+
+ # This is the complete result when using `text_config_dict`.
+ _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict()
+
+ # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
+ for key, value in _text_config_dict.items():
+ if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
+ # If specified in `text_config_dict`
+ if key in text_config_dict:
+ message = (
+ f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
+ f'The value `text_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
+                            f'value `text_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `text_config` with the ones in `_text_config_dict`.
+ text_config.update(_text_config_dict)
+
+ if vision_config_dict is not None:
+ if vision_config is None:
+ vision_config = {}
+
+ # This is the complete result when using `vision_config_dict`.
+ _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict()
+ # convert keys to string instead of integer
+ if "id2label" in _vision_config_dict:
+ _vision_config_dict["id2label"] = {
+ str(key): value for key, value in _vision_config_dict["id2label"].items()
+ }
+
+ # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different.
+ for key, value in _vision_config_dict.items():
+ if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
+ # If specified in `vision_config_dict`
+ if key in vision_config_dict:
+ message = (
+ f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
+ f'values. The value `vision_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
+                            f'The value `vision_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `vision_config` with the ones in `_vision_config_dict`.
+ vision_config.update(_vision_config_dict)
+
+ if text_config is None:
+ text_config = {}
+ logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")
+
+ if vision_config is None:
+ vision_config = {}
+            logger.info("`vision_config` is `None`. Initializing the `CLIPVisionConfig` with default values.")
+
+ self.text_config = CLIPTextConfig(**text_config)
+ self.vision_config = CLIPVisionConfig(**vision_config)
+
+ self.projection_dim = projection_dim
+ self.logit_scale_init_value = logit_scale_init_value
+ self.initializer_factor = 1.0
+
+ @classmethod
+ def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
+ r"""
+ Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
+ configuration.
+
+ Returns:
+ [`CLIPConfig`]: An instance of a configuration object
+ """
+
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
+
+ def to_dict(self):
+ """
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+ Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
+ """
+ output = copy.deepcopy(self.__dict__)
+ output["text_config"] = self.text_config.to_dict()
+ output["vision_config"] = self.vision_config.to_dict()
+ output["model_type"] = self.__class__.model_type
+ return output
+
diff --git a/models/multimodal_encoder/languagebind/depth/modeling_depth.py b/models/multimodal_encoder/languagebind/depth/modeling_depth.py
new file mode 100755
index 0000000..849eade
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/depth/modeling_depth.py
@@ -0,0 +1,1030 @@
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+from einops import rearrange
+from peft import LoraConfig, get_peft_model
+from torch import nn
+from torch.nn import functional as F
+from transformers import PreTrainedModel, add_start_docstrings
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from transformers.models.clip.modeling_clip import CLIPMLP, CLIPAttention, CLIPTextEmbeddings, CLIPVisionEmbeddings, \
+ CLIPVisionModelWithProjection, CLIPTextModelWithProjection, _expand_mask, CLIPOutput, clip_loss
+from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings
+
+from .configuration_depth import LanguageBindDepthConfig, CLIPVisionConfig, CLIPTextConfig
+
+
+
+class PatchDropout(nn.Module):
+ """
+ https://arxiv.org/abs/2212.00794
+ """
+
+ def __init__(self, prob, exclude_first_token=True):
+ super().__init__()
+ assert 0 <= prob < 1.
+ self.prob = prob
+ self.exclude_first_token = exclude_first_token # exclude CLS token
+
+ def forward(self, x, B, T):
+ if not self.training or self.prob == 0.:
+ return x
+
+ if self.exclude_first_token:
+ cls_tokens, x = x[:, :1], x[:, 1:]
+ else:
+ cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
+
+ batch = x.size()[0]
+ num_tokens = x.size()[1]
+
+ batch_indices = torch.arange(batch)
+ batch_indices = batch_indices[..., None]
+
+ keep_prob = 1 - self.prob
+ num_patches_keep = max(1, int(num_tokens * keep_prob))
+
+ if T == 1:
+ rand = torch.randn(batch, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ else:
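+            # video (T > 1): sample one keep-mask per clip and repeat it over the T frames,
+            # so every frame of a clip keeps the same patch positions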
+ rand = torch.randn(B, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ patch_indices_keep = patch_indices_keep.unsqueeze(1).repeat(1, T, 1)
+ patch_indices_keep = rearrange(patch_indices_keep, 'b t n -> (b t) n')
+
+
+ x = x[batch_indices, patch_indices_keep]
+
+ if self.exclude_first_token:
+ x = torch.cat((cls_tokens, x), dim=1)
+
+ return x
+
+class CLIPEncoderLayer(nn.Module):
+ def __init__(self, config: LanguageBindDepthConfig):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.self_attn = CLIPAttention(config)
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.mlp = CLIPMLP(config)
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ self.add_time_attn = config.add_time_attn
+ if self.add_time_attn:
+ self.t = config.num_frames
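+            # learnable per-frame position embedding, added to the tokens before temporal attention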
+ self.temporal_embedding = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
+ nn.init.normal_(self.temporal_embedding, std=config.hidden_size ** -0.5)
+
+ self.embed_dim = config.hidden_size
+ self.temporal_attn = CLIPAttention(config)
+ self.temporal_layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.temporal_mlp = CLIPMLP(config)
+ self.temporal_layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ causal_attention_mask: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+
+
+ if self.add_time_attn:
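+            # tokens arrive flattened as (batch*frames, patches, dim); temporal attention regroups them to
+            # (batch*patches, frames, dim) so each spatial position attends over time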
+ bt, n, d = hidden_states.shape
+ t = self.t
+
+ # time embed
+ if t != 1:
+ n = hidden_states.shape[1]
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ hidden_states = hidden_states + self.temporal_embedding[:, :t, :]
+ hidden_states = rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # time attn
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm1(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.temporal_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm2(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm2(hidden_states)
+ hidden_states = self.temporal_mlp(hidden_states)
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # spatial attn
+ residual = hidden_states
+
+ hidden_states = self.layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+ hidden_states = self.layer_norm2(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
+
+class CLIPPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = LanguageBindDepthConfig
+ base_model_prefix = "clip"
+ supports_gradient_checkpointing = True
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ factor = self.config.initializer_factor
+ if isinstance(module, CLIPTextEmbeddings):
+ module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ elif isinstance(module, CLIPVisionEmbeddings):
+ factor = self.config.initializer_factor
+ nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+ nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+ nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+ elif isinstance(module, CLIPAttention):
+ factor = self.config.initializer_factor
+ in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ out_proj_std = (module.embed_dim**-0.5) * factor
+ nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+ elif isinstance(module, CLIPMLP):
+ factor = self.config.initializer_factor
+ in_proj_std = (
+ (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ )
+ fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+ nn.init.normal_(module.fc1.weight, std=fc_std)
+ nn.init.normal_(module.fc2.weight, std=in_proj_std)
+ elif isinstance(module, LanguageBindDepth):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPVisionModelWithProjection):
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPTextModelWithProjection):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+
+ if isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, CLIPEncoder):
+ module.gradient_checkpointing = value
+
+
+CLIP_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CLIP_TEXT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ return_loss (`bool`, *optional*):
+ Whether or not to return the contrastive loss.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class CLIPEncoder(nn.Module):
+ """
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+ [`CLIPEncoderLayer`].
+
+ Args:
+ config: CLIPConfig
+ """
+
+ def __init__(self, config: LanguageBindDepthConfig):
+ super().__init__()
+ self.config = config
+ self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ inputs_embeds,
+ attention_mask: Optional[torch.Tensor] = None,
+ causal_attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ hidden_states = inputs_embeds
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(encoder_layer),
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for bi-directional self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+class CLIPTextTransformer(nn.Module):
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+ self.embeddings = CLIPTextEmbeddings(config)
+ self.encoder = CLIPEncoder(config)
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is None:
+ raise ValueError("You have to specify input_ids")
+
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+ # CLIP's text model uses causal mask, prepare it here.
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+ causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
+ # expand attention_mask
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+ # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
+ # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+ pooled_output = last_hidden_state[
+ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+ input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
+ ]
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """The text model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPTextModel(CLIPPreTrainedModel):
+ config_class = CLIPTextConfig
+
+ _no_split_modules = ["CLIPEncoderLayer"]
+
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__(config)
+ self.text_model = CLIPTextTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.text_model.embeddings.token_embedding
+
+ def set_input_embeddings(self, value):
+ self.text_model.embeddings.token_embedding = value
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPTextModel
+
+ >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+class CLIPVisionTransformer(nn.Module):
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+
+ self.embeddings = CLIPVisionEmbeddings(config)
+ self.patch_dropout = PatchDropout(config.force_patch_dropout)
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+ self.encoder = CLIPEncoder(config)
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+        # Flatten any multi-frame input layout to (B*T, C, H, W) so the 2D patch embedding is applied per frame.
+        if len(pixel_values.shape) == 7:
+            b_new, pair_new, T, bs_new, channel_new, h_new, w_new = pixel_values.shape
+            B = b_new * pair_new * bs_new
+            pixel_values = pixel_values.reshape(B * T, channel_new, h_new, w_new)
+        elif len(pixel_values.shape) == 5:
+            B, _, T, _, _ = pixel_values.shape
+            pixel_values = rearrange(pixel_values, 'b c t h w -> (b t) c h w')
+        else:
+            B, _, _, _ = pixel_values.shape
+            T = 1
+ hidden_states = self.embeddings(pixel_values)
+
+        hidden_states = self.patch_dropout(hidden_states, B, T)  # randomly drop patch tokens during training (https://arxiv.org/abs/2212.00794)
+
+ hidden_states = self.pre_layrnorm(hidden_states)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ pooled_output = last_hidden_state[:, 0, :]
+ pooled_output = self.post_layernorm(pooled_output)
+
+        pooled_output = pooled_output.reshape(B, T, -1).mean(1)  # average the per-frame CLS features over time
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """The vision model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPVisionModel(CLIPPreTrainedModel):
+ config_class = CLIPVisionConfig
+ main_input_name = "pixel_values"
+
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__(config)
+ self.vision_model = CLIPVisionTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.vision_model.embeddings.patch_embedding
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPVisionModel
+
+ >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled CLS states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+@add_start_docstrings(CLIP_START_DOCSTRING)
+class LanguageBindDepth(CLIPPreTrainedModel):
+ config_class = LanguageBindDepthConfig
+
+ def __init__(self, config: LanguageBindDepthConfig):
+ super().__init__(config)
+
+ if not isinstance(config.text_config, CLIPTextConfig):
+ raise ValueError(
+ "config.text_config is expected to be of type CLIPTextConfig but is of type"
+ f" {type(config.text_config)}."
+ )
+
+ if not isinstance(config.vision_config, CLIPVisionConfig):
+ raise ValueError(
+ "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+ f" {type(config.vision_config)}."
+ )
+
+ text_config = config.text_config
+ vision_config = config.vision_config
+ self.add_time_attn = vision_config.add_time_attn
+ self.lora_r = vision_config.lora_r
+ self.lora_alpha = vision_config.lora_alpha
+ self.lora_dropout = vision_config.lora_dropout
+
+ self.projection_dim = config.projection_dim
+ self.text_embed_dim = text_config.hidden_size
+ self.vision_embed_dim = vision_config.hidden_size
+
+ self.text_model = CLIPTextTransformer(text_config)
+ self.vision_model = CLIPVisionTransformer(vision_config)
+
+ self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+ self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+ self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+ # Initialize weights and apply final processing
+ self.post_init()
+ self.convert_to_lora()
+ self.resize_pos(self.vision_model.embeddings, vision_config)
+
+ def convert_to_lora(self):
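+        # Wrap the vision encoder with PEFT LoRA adapters: on the temporal attention/MLP when time attention
+        # is enabled, otherwise on the spatial self-attention projections. lora_r == 0 disables LoRA entirely.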
+ if self.lora_r == 0:
+ return
+ if self.add_time_attn:
+ target_modules = ["temporal_attn.k_proj", "temporal_attn.v_proj",
+ "temporal_attn.q_proj", "temporal_attn.out_proj",
+ "temporal_mlp.fc1", "temporal_mlp.fc2"]
+ else:
+ target_modules = ["k_proj", "v_proj", "q_proj", "out_proj"]
+ config = LoraConfig(
+ r=self.lora_r, # 16
+ lora_alpha=self.lora_alpha, # 16
+ target_modules=target_modules, # self_attn.out_proj
+ lora_dropout=self.lora_dropout, # 0.1
+ bias="none",
+ modules_to_save=[],
+ )
+ self.vision_model.encoder.is_gradient_checkpointing = False
+ self.vision_model.encoder = get_peft_model(self.vision_model.encoder, config)
+
+ def resize_pos(self, m, vision_config):
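+        # When the configured input resolution (e.g. num_mel_bins x target_length for spectrogram-like inputs)
+        # yields a patch grid different from the pretrained checkpoint, the position embeddings are bicubically
+        # interpolated to the new grid; the class-token position embedding is carried over unchanged.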
+ # convert embedding
+ if vision_config.num_mel_bins!=0 and vision_config.target_length!=0:
+ m.image_size = [vision_config.num_mel_bins, vision_config.target_length]
+ m.config.image_size = [m.image_size, m.image_size] if isinstance(m.image_size, int) else m.image_size
+ # pos resize
+ old_pos_embed_state_dict = m.position_embedding.state_dict()
+ old_pos_embed = old_pos_embed_state_dict['weight']
+ dtype = old_pos_embed.dtype
+ grid_size = [m.config.image_size[0] // m.patch_size, m.config.image_size[1] // m.patch_size]
+ extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
+ new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
+ if new_seq_len == old_pos_embed.shape[0]:
+ # m.to(args.device)
+ return
+
+ m.num_patches = grid_size[0] * grid_size[1]
+ m.num_positions = m.num_patches + 1
+ m.register_buffer("position_ids", torch.arange(m.num_positions).expand((1, -1)))
+ new_position_embedding = nn.Embedding(m.num_positions, m.embed_dim)
+
+ if extra_tokens:
+ pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
+ else:
+ pos_emb_tok, pos_emb_img = None, old_pos_embed
+ old_grid_size = [int(math.sqrt(len(pos_emb_img)))] * 2
+
+ # if is_master(args):
+ # logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
+ pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
+ pos_emb_img = F.interpolate(
+ pos_emb_img,
+ size=grid_size,
+ mode='bicubic',
+ antialias=True,
+ align_corners=False,
+ )
+ pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
+ if pos_emb_tok is not None:
+ new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
+ else:
+ new_pos_embed = pos_emb_img
+ old_pos_embed_state_dict['weight'] = new_pos_embed.to(dtype)
+ m.position_embedding = new_position_embedding
+ m.position_embedding.load_state_dict(old_pos_embed_state_dict)
+
+ # m.to(args.device)
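+        # Worked example (assuming a 224x224 checkpoint with patch_size=32, i.e. a 7x7 grid
+        # plus one class token = 50 positions): switching to image_size=448 gives a 14x14 grid,
+        # so the 49 patch embeddings are bicubically interpolated to 196 and the class-token
+        # embedding is concatenated back in front, yielding 197 positions.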
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ def get_text_features(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+ >>> text_features = model.get_text_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = text_outputs[1]
+ text_features = self.text_projection(pooled_output)
+
+ return text_features
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ def get_image_features(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPVisionModel`].
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> image_features = model.get_image_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = vision_outputs[1] # pooled_output
+ image_features = self.visual_projection(pooled_output)
+
+ return image_features
+
+ @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CLIPOutput, config_class=LanguageBindDepthConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ return_loss: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CLIPOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+ ... )
+
+ >>> outputs = model(**inputs)
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+ >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ image_embeds = vision_outputs[1]
+ image_embeds = self.visual_projection(image_embeds)
+
+ text_embeds = text_outputs[1]
+ text_embeds = self.text_projection(text_embeds)
+
+ # normalized features
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+ # cosine similarity as logits
+ logit_scale = self.logit_scale.exp()
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+ logits_per_image = logits_per_text.t()
+
+ loss = None
+ if return_loss:
+ loss = clip_loss(logits_per_text)
+
+ if not return_dict:
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+ return ((loss,) + output) if loss is not None else output
+
+ return CLIPOutput(
+ loss=loss,
+ logits_per_image=logits_per_image,
+ logits_per_text=logits_per_text,
+ text_embeds=text_embeds,
+ image_embeds=image_embeds,
+ text_model_output=text_outputs,
+ vision_model_output=vision_outputs,
+ )
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/depth/processing_depth.py b/models/multimodal_encoder/languagebind/depth/processing_depth.py
new file mode 100755
index 0000000..1019e0c
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/depth/processing_depth.py
@@ -0,0 +1,108 @@
+import cv2
+import torch
+from PIL import Image
+from torch import nn
+from torchvision import transforms
+from transformers import ProcessorMixin, BatchEncoding
+from transformers.image_processing_utils import BatchFeature
+
+OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+def make_list_of_images(x):
+ if not isinstance(x, list):
+ return [x]
+ return x
+
+def opencv_loader(path):
+ return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype('float32')
+
+
+class DepthNorm(nn.Module):
+ def __init__(
+ self,
+ max_depth=0,
+ min_depth=0.01,
+ ):
+ super().__init__()
+ self.max_depth = max_depth
+ self.min_depth = min_depth
+ self.scale = 1000.0 # nyuv2 abs.depth
+
+ def forward(self, image):
+ # image = np.array(image)
+ depth_img = image / self.scale # (H, W) in meters
+ depth_img = depth_img.clip(min=self.min_depth)
+ if self.max_depth != 0:
+ depth_img = depth_img.clip(max=self.max_depth)
+ depth_img /= self.max_depth # 0-1
+ else:
+ depth_img /= depth_img.max()
+ depth_img = torch.from_numpy(depth_img).unsqueeze(0).repeat(3, 1, 1) # assume image
+ return depth_img.to(torch.get_default_dtype())
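+    # Worked example (assuming max_depth=10): a raw NYUv2 value of 3000 (millimetres) becomes
+    # 3000 / 1000 = 3.0 m, is clipped to [0.01, 10], divided by 10 to give 0.3, and finally
+    # repeated to a 3-channel tensor so the RGB CLIP stem can be reused unchanged.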
+
+def get_depth_transform(config):
+ config = config.vision_config
+ transform = transforms.Compose(
+ [
+ DepthNorm(max_depth=config.max_depth),
+ transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
+ transforms.CenterCrop(224),
+ transforms.Normalize(OPENAI_DATASET_MEAN, OPENAI_DATASET_STD), # assume image
+ # transforms.Normalize((0.5, ), (0.5, )) # 0-1 to norm distribution
+ # transforms.Normalize((0.0418, ), (0.0295, )) # sun rgb-d imagebind
+ # transforms.Normalize((0.02, ), (0.00295, )) # nyuv2
+ ]
+ )
+ return transform
+
+def load_and_transform_depth(depth_path, transform):
+ depth = opencv_loader(depth_path)
+ depth_outputs = transform(depth)
+ return depth_outputs
+
+class LanguageBindDepthProcessor(ProcessorMixin):
+ attributes = []
+ tokenizer_class = ("LanguageBindDepthTokenizer")
+
+ def __init__(self, config, tokenizer=None, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.transform = get_depth_transform(config)
+ self.image_processor = load_and_transform_depth
+ self.tokenizer = tokenizer
+
+ def __call__(self, images=None, text=None, context_length=77, return_tensors=None, **kwargs):
+ if text is None and images is None:
+ raise ValueError("You have to specify either text or images. Both cannot be none.")
+
+ if text is not None:
+ encoding = self.tokenizer(text, max_length=context_length, padding='max_length',
+ truncation=True, return_tensors=return_tensors, **kwargs)
+
+ if images is not None:
+ images = make_list_of_images(images)
+ image_features = [self.image_processor(image, self.transform) for image in images]
+ image_features = torch.stack(image_features)
+
+ if text is not None and images is not None:
+ encoding["pixel_values"] = image_features
+ return encoding
+ elif text is not None:
+ return encoding
+ else:
+ return {"pixel_values": image_features}
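+    # Illustrative usage (file name and prompt are hypothetical):
+    #   processor = LanguageBindDepthProcessor(config, tokenizer)
+    #   batch = processor(images="room.png", text=["a depth map of a room"], return_tensors="pt")
+    #   batch["input_ids"].shape     # (1, 77)  -- padded to context_length
+    #   batch["pixel_values"].shape  # (1, 3, 224, 224)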
+
+ def batch_decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+ refer to the docstring of this method for more information.
+ """
+ return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
+
+ def decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+ the docstring of this method for more information.
+ """
+ return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
diff --git a/models/multimodal_encoder/languagebind/depth/tokenization_depth.py b/models/multimodal_encoder/languagebind/depth/tokenization_depth.py
new file mode 100755
index 0000000..eda9905
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/depth/tokenization_depth.py
@@ -0,0 +1,77 @@
+from transformers import CLIPTokenizer
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+ "vocab_file": "vocab.json",
+ "merges_file": "merges.txt",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+ "vocab_file": {
+ "lb203/LanguageBind-Depth": "https://huggingface.co/lb203/LanguageBind-Depth/resolve/main/vocab.json",
+ },
+ "merges_file": {
+ "lb203/LanguageBind-Depth": "https://huggingface.co/lb203/LanguageBind-Depth/resolve/main/merges.txt",
+ },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "lb203/LanguageBind-Depth": 77,
+}
+
+
+PRETRAINED_INIT_CONFIGURATION = {
+    "lb203/LanguageBind-Depth": {},
+}
+
+class LanguageBindDepthTokenizer(CLIPTokenizer):
+ """
+ Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The end of sequence token.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ merges_file,
+ errors="replace",
+ unk_token="<|endoftext|>",
+ bos_token="<|startoftext|>",
+ eos_token="<|endoftext|>",
+ pad_token="<|endoftext|>", # hack to enable padding
+ **kwargs,
+ ):
+ super(LanguageBindDepthTokenizer, self).__init__(
+ vocab_file,
+ merges_file,
+ errors,
+ unk_token,
+ bos_token,
+ eos_token,
+ pad_token, # hack to enable padding
+ **kwargs,)
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/image/configuration_image.py b/models/multimodal_encoder/languagebind/image/configuration_image.py
new file mode 100755
index 0000000..c1c7b0f
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/image/configuration_image.py
@@ -0,0 +1,423 @@
+import copy
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+
+
+
+
+
+class CLIPTextConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
+ text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the text encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 49408):
+ Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`CLIPModel`].
+ hidden_size (`int`, *optional*, defaults to 512):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 2048):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ max_position_embeddings (`int`, *optional*, defaults to 77):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPTextConfig, CLIPTextModel
+
+ >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPTextConfig()
+
+ >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPTextModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+ model_type = "clip_text_model"
+
+ def __init__(
+ self,
+ vocab_size=49408,
+ hidden_size=512,
+ intermediate_size=2048,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=8,
+ max_position_embeddings=77,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+ # This differs from `CLIPTokenizer`'s default and from openai/clip
+ # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
+ pad_token_id=1,
+ bos_token_id=49406,
+ eos_token_id=49407,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.max_position_embeddings = max_position_embeddings
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+        self.add_time_attn = False  # LanguageBind addition: the text tower never uses temporal attention
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the text config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["text_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+
+
+class CLIPVisionConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
+ CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 32):
+ The size (resolution) of each patch.
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPVisionConfig, CLIPVisionModel
+
+ >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPVisionConfig()
+
+ >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPVisionModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "clip_vision_model"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ intermediate_size=3072,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ num_channels=3,
+ image_size=224,
+ patch_size=32,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+
+        # LanguageBind-specific additions (not present in the stock CLIPVisionConfig):
+        add_time_attn=False,
+        num_frames=1,
+        force_patch_dropout=0.0,
+        lora_r=2,
+        lora_alpha=16,
+        lora_dropout=0.0,
+        num_mel_bins=0.0,
+        target_length=0.0,
+        video_decode_backend='decord',
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_channels = num_channels
+ self.patch_size = patch_size
+ self.image_size = image_size
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+
+        # LanguageBind-specific fields
+        self.add_time_attn = add_time_attn
+        self.num_frames = num_frames
+        self.force_patch_dropout = force_patch_dropout
+        self.lora_r = lora_r
+        self.lora_alpha = lora_alpha
+        self.lora_dropout = lora_dropout
+        self.num_mel_bins = num_mel_bins
+        self.target_length = target_length
+        self.video_decode_backend = video_decode_backend
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the vision config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["vision_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
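+    # Illustrative (values are arbitrary): the extra keyword arguments above are what this class
+    # adds on top of the stock transformers CLIPVisionConfig.
+    #   cfg = CLIPVisionConfig(add_time_attn=True, num_frames=8, lora_r=16, lora_alpha=16)
+    #   (cfg.num_frames, cfg.lora_r)  # -> (8, 16)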
+
+
+class LanguageBindImageConfig(PretrainedConfig):
+ r"""
+ [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
+ a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
+ a configuration with the defaults will yield a similar configuration to that of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ text_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPTextConfig`].
+ vision_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
+ projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
+ kwargs (*optional*):
+ Dictionary of keyword arguments.
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPConfig, CLIPModel
+
+ >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPConfig()
+
+ >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+
+ >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
+ >>> from transformers import CLIPTextConfig, CLIPVisionConfig
+
+ >>> # Initializing a CLIPText and CLIPVision configuration
+ >>> config_text = CLIPTextConfig()
+ >>> config_vision = CLIPVisionConfig()
+
+ >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
+ ```"""
+
+ model_type = "LanguageBindImage"
+ is_composition = True
+
+ def __init__(
+ self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
+ ):
+ # If `_config_dict` exist, we use them for the backward compatibility.
+ # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
+ # of confusion!).
+ text_config_dict = kwargs.pop("text_config_dict", None)
+ vision_config_dict = kwargs.pop("vision_config_dict", None)
+
+ super().__init__(**kwargs)
+
+ # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
+ # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
+ # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
+ if text_config_dict is not None:
+ if text_config is None:
+ text_config = {}
+
+ # This is the complete result when using `text_config_dict`.
+ _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict()
+
+ # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
+ for key, value in _text_config_dict.items():
+ if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
+ # If specified in `text_config_dict`
+ if key in text_config_dict:
+ message = (
+ f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
+ f'The value `text_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
+                            f'value `text_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `text_config` with the ones in `_text_config_dict`.
+ text_config.update(_text_config_dict)
+
+ if vision_config_dict is not None:
+ if vision_config is None:
+ vision_config = {}
+
+ # This is the complete result when using `vision_config_dict`.
+ _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict()
+ # convert keys to string instead of integer
+ if "id2label" in _vision_config_dict:
+ _vision_config_dict["id2label"] = {
+ str(key): value for key, value in _vision_config_dict["id2label"].items()
+ }
+
+ # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different.
+ for key, value in _vision_config_dict.items():
+ if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
+ # If specified in `vision_config_dict`
+ if key in vision_config_dict:
+ message = (
+ f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
+ f'values. The value `vision_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
+                            f'The value `vision_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `vision_config` with the ones in `_vision_config_dict`.
+ vision_config.update(_vision_config_dict)
+
+ if text_config is None:
+ text_config = {}
+ logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")
+
+ if vision_config is None:
+ vision_config = {}
+            logger.info("`vision_config` is `None`. Initializing the `CLIPVisionConfig` with default values.")
+
+ self.text_config = CLIPTextConfig(**text_config)
+ self.vision_config = CLIPVisionConfig(**vision_config)
+
+ self.projection_dim = projection_dim
+ self.logit_scale_init_value = logit_scale_init_value
+ self.initializer_factor = 1.0
+
+ @classmethod
+ def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
+ r"""
+ Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
+ configuration.
+
+ Returns:
+ [`CLIPConfig`]: An instance of a configuration object
+ """
+
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
+
+ def to_dict(self):
+ """
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+ Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
+ """
+ output = copy.deepcopy(self.__dict__)
+ output["text_config"] = self.text_config.to_dict()
+ output["vision_config"] = self.vision_config.to_dict()
+ output["model_type"] = self.__class__.model_type
+ return output
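+    # Illustrative round trip (hypothetical values):
+    #   cfg = LanguageBindImageConfig.from_text_vision_configs(
+    #       CLIPTextConfig(), CLIPVisionConfig(num_frames=8))
+    #   d = cfg.to_dict()
+    #   d["model_type"]                   # -> "LanguageBindImage"
+    #   d["vision_config"]["num_frames"]  # -> 8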
+
+
+
+
+
+
+
+
+
+
diff --git a/models/multimodal_encoder/languagebind/image/modeling_image.py b/models/multimodal_encoder/languagebind/image/modeling_image.py
new file mode 100755
index 0000000..e95ac47
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/image/modeling_image.py
@@ -0,0 +1,1030 @@
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+from einops import rearrange
+from peft import LoraConfig, get_peft_model
+from torch import nn
+from torch.nn import functional as F
+from transformers import PreTrainedModel, add_start_docstrings
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from transformers.models.clip.modeling_clip import CLIPMLP, CLIPAttention, CLIPTextEmbeddings, CLIPVisionEmbeddings, \
+ CLIPVisionModelWithProjection, CLIPTextModelWithProjection, _expand_mask, CLIPOutput, clip_loss
+from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings
+
+from .configuration_image import LanguageBindImageConfig, CLIPVisionConfig, CLIPTextConfig
+
+
+
+class PatchDropout(nn.Module):
+ """
+ https://arxiv.org/abs/2212.00794
+ """
+
+ def __init__(self, prob, exclude_first_token=True):
+ super().__init__()
+ assert 0 <= prob < 1.
+ self.prob = prob
+ self.exclude_first_token = exclude_first_token # exclude CLS token
+
+ def forward(self, x, B, T):
+ if not self.training or self.prob == 0.:
+ return x
+
+ if self.exclude_first_token:
+ cls_tokens, x = x[:, :1], x[:, 1:]
+ else:
+ cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
+
+ batch = x.size()[0]
+ num_tokens = x.size()[1]
+
+ batch_indices = torch.arange(batch)
+ batch_indices = batch_indices[..., None]
+
+ keep_prob = 1 - self.prob
+ num_patches_keep = max(1, int(num_tokens * keep_prob))
+
+ if T == 1:
+ rand = torch.randn(batch, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ else:
+ rand = torch.randn(B, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ patch_indices_keep = patch_indices_keep.unsqueeze(1).repeat(1, T, 1)
+ patch_indices_keep = rearrange(patch_indices_keep, 'b t n -> (b t) n')
+
+
+ x = x[batch_indices, patch_indices_keep]
+
+ if self.exclude_first_token:
+ x = torch.cat((cls_tokens, x), dim=1)
+
+ return x
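+    # Illustrative shapes (assuming prob=0.5 and a ViT sequence of 1 CLS + 196 patch tokens):
+    # during training each sample keeps the CLS token plus max(1, int(196 * 0.5)) = 98 randomly
+    # chosen patches, so (B*T, 197, D) becomes (B*T, 99, D); at eval time, or with prob=0,
+    # the input is returned unchanged.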
+
+class CLIPEncoderLayer(nn.Module):
+ def __init__(self, config: LanguageBindImageConfig):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.self_attn = CLIPAttention(config)
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.mlp = CLIPMLP(config)
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ self.add_time_attn = config.add_time_attn
+ if self.add_time_attn:
+ self.t = config.num_frames
+ self.temporal_embedding = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
+ nn.init.normal_(self.temporal_embedding, std=config.hidden_size ** -0.5)
+
+ self.embed_dim = config.hidden_size
+ self.temporal_attn = CLIPAttention(config)
+ self.temporal_layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.temporal_mlp = CLIPMLP(config)
+ self.temporal_layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ causal_attention_mask: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+
+
+ if self.add_time_attn:
+ bt, n, d = hidden_states.shape
+ t = self.t
+
+ # time embed
+ if t != 1:
+ n = hidden_states.shape[1]
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ hidden_states = hidden_states + self.temporal_embedding[:, :t, :]
+ hidden_states = rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # time attn
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm1(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.temporal_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm2(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm2(hidden_states)
+ hidden_states = self.temporal_mlp(hidden_states)
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # spatial attn
+ residual = hidden_states
+
+ hidden_states = self.layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+ hidden_states = self.layer_norm2(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
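+    # Illustrative example of the temporal-attention reshape above (assuming B=2 videos of
+    # T=8 frames with N=50 tokens of width D): spatial attention sees (B*T, N, D) = (16, 50, D),
+    # while the temporal branch rearranges to (B*N, T, D) = (100, 8, D) so that each spatial
+    # position attends across its 8 frames before being rearranged back.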
+
+
+
+
+
+
+
+
+
+class CLIPPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = LanguageBindImageConfig
+ base_model_prefix = "clip"
+ supports_gradient_checkpointing = True
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ factor = self.config.initializer_factor
+ if isinstance(module, CLIPTextEmbeddings):
+ module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ elif isinstance(module, CLIPVisionEmbeddings):
+ factor = self.config.initializer_factor
+ nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+ nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+ nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+ elif isinstance(module, CLIPAttention):
+ factor = self.config.initializer_factor
+ in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ out_proj_std = (module.embed_dim**-0.5) * factor
+ nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+ elif isinstance(module, CLIPMLP):
+ factor = self.config.initializer_factor
+ in_proj_std = (
+ (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ )
+ fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+ nn.init.normal_(module.fc1.weight, std=fc_std)
+ nn.init.normal_(module.fc2.weight, std=in_proj_std)
+ elif isinstance(module, LanguageBindImage):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPVisionModelWithProjection):
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPTextModelWithProjection):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+
+ if isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, CLIPEncoder):
+ module.gradient_checkpointing = value
+
+
+CLIP_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CLIP_TEXT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ return_loss (`bool`, *optional*):
+ Whether or not to return the contrastive loss.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class CLIPEncoder(nn.Module):
+ """
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+ [`CLIPEncoderLayer`].
+
+ Args:
+ config: CLIPConfig
+ """
+
+ def __init__(self, config: LanguageBindImageConfig):
+ super().__init__()
+ self.config = config
+ self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ inputs_embeds,
+ attention_mask: Optional[torch.Tensor] = None,
+ causal_attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ hidden_states = inputs_embeds
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(encoder_layer),
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for bi-directional self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
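+# Illustrative output (tgt_len=3, no cached keys): the mask is 0 on and below the diagonal and
+# the dtype's most negative value above it, broadcast to shape (bsz, 1, 3, 3):
+#   [[0, min, min],
+#    [0,   0, min],
+#    [0,   0,   0]]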
+
+
+class CLIPTextTransformer(nn.Module):
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+ self.embeddings = CLIPTextEmbeddings(config)
+ self.encoder = CLIPEncoder(config)
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is None:
+ raise ValueError("You have to specify input_ids")
+
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+ # CLIP's text model uses causal mask, prepare it here.
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+ causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
+ # expand attention_mask
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+ # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
+ # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+ pooled_output = last_hidden_state[
+ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+ input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
+ ]
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """The text model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPTextModel(CLIPPreTrainedModel):
+ config_class = CLIPTextConfig
+
+ _no_split_modules = ["CLIPEncoderLayer"]
+
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__(config)
+ self.text_model = CLIPTextTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.text_model.embeddings.token_embedding
+
+ def set_input_embeddings(self, value):
+ self.text_model.embeddings.token_embedding = value
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPTextModel
+
+ >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+class CLIPVisionTransformer(nn.Module):
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+
+ self.embeddings = CLIPVisionEmbeddings(config)
+ self.patch_dropout = PatchDropout(config.force_patch_dropout)
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+ self.encoder = CLIPEncoder(config)
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+        # LanguageBind modification: flatten video (and paired-video) inputs so that every
+        # frame is encoded as an independent image of shape (B*T, C, H, W).
+        if len(pixel_values.shape) == 7:
+            b_new, pair_new, T, bs_new, channel_new, h_new, w_new = pixel_values.shape
+            B = b_new * pair_new * bs_new
+            pixel_values = pixel_values.reshape(B * T, channel_new, h_new, w_new)
+        elif len(pixel_values.shape) == 5:
+            B, _, T, _, _ = pixel_values.shape
+            pixel_values = rearrange(pixel_values, 'b c t h w -> (b t) c h w')
+        else:
+            B, _, _, _ = pixel_values.shape
+            T = 1
+ hidden_states = self.embeddings(pixel_values)
+
+        hidden_states = self.patch_dropout(hidden_states, B, T)  # LanguageBind modification: optional patch dropout
+
+ hidden_states = self.pre_layrnorm(hidden_states)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ pooled_output = last_hidden_state[:, 0, :]
+ pooled_output = self.post_layernorm(pooled_output)
+
+ # Average the pooled features over the T frames of each sample.
+ pooled_output = pooled_output.reshape(B, T, -1).mean(1)
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """The vision model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPVisionModel(CLIPPreTrainedModel):
+ config_class = CLIPVisionConfig
+ main_input_name = "pixel_values"
+
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__(config)
+ self.vision_model = CLIPVisionTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.vision_model.embeddings.patch_embedding
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPVisionModel
+
+ >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled CLS states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+@add_start_docstrings(CLIP_START_DOCSTRING)
+class LanguageBindImage(CLIPPreTrainedModel):
+ config_class = LanguageBindImageConfig
+
+ def __init__(self, config: LanguageBindImageConfig):
+ super().__init__(config)
+
+ if not isinstance(config.text_config, CLIPTextConfig):
+ raise ValueError(
+ "config.text_config is expected to be of type CLIPTextConfig but is of type"
+ f" {type(config.text_config)}."
+ )
+
+ if not isinstance(config.vision_config, CLIPVisionConfig):
+ raise ValueError(
+ "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+ f" {type(config.vision_config)}."
+ )
+
+ text_config = config.text_config
+ vision_config = config.vision_config
+ self.add_time_attn = vision_config.add_time_attn
+ self.lora_r = vision_config.lora_r
+ self.lora_alpha = vision_config.lora_alpha
+ self.lora_dropout = vision_config.lora_dropout
+
+ self.projection_dim = config.projection_dim
+ self.text_embed_dim = text_config.hidden_size
+ self.vision_embed_dim = vision_config.hidden_size
+
+ self.text_model = CLIPTextTransformer(text_config)
+ self.vision_model = CLIPVisionTransformer(vision_config)
+
+ self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+ self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+ self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+ # Initialize weights and apply final processing
+ self.post_init()
+ self.convert_to_lora()
+ # self.resize_pos(self.vision_model.embeddings, vision_config)
+
+ def convert_to_lora(self):
+ if self.lora_r == 0:
+ return
+ if self.add_time_attn:
+ target_modules = ["temporal_attn.k_proj", "temporal_attn.v_proj",
+ "temporal_attn.q_proj", "temporal_attn.out_proj",
+ "temporal_mlp.fc1", "temporal_mlp.fc2"]
+ else:
+ target_modules = ["k_proj", "v_proj", "q_proj", "out_proj"]
+ config = LoraConfig(
+ r=self.lora_r, # 16
+ lora_alpha=self.lora_alpha, # 16
+ target_modules=target_modules, # self_attn.out_proj
+ lora_dropout=self.lora_dropout, # 0.1
+ bias="none",
+ modules_to_save=[],
+ )
+ self.vision_model.encoder.is_gradient_checkpointing = False
+ self.vision_model.encoder = get_peft_model(self.vision_model.encoder, config)
+
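+ # Illustrative sketch (not part of the original code; names here are only for
+ # illustration): with the default vision config (lora_r=2, lora_alpha=16,
+ # add_time_attn=False), the call above behaves roughly like
+ #
+ #     peft_cfg = LoraConfig(r=2, lora_alpha=16, lora_dropout=0.0, bias="none",
+ #                           target_modules=["k_proj", "v_proj", "q_proj", "out_proj"])
+ #     self.vision_model.encoder = get_peft_model(self.vision_model.encoder, peft_cfg)
+ #
+ # i.e. LoRA adapters are injected only into the encoder's attention projections, and by
+ # default PEFT marks only those adapter weights as trainable.
+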
+ def resize_pos(self, m, vision_config):
+ # convert embedding
+ if vision_config.num_mel_bins != 0 and vision_config.target_length != 0:
+ m.image_size = [vision_config.num_mel_bins, vision_config.target_length]
+ m.config.image_size = [m.image_size, m.image_size] if isinstance(m.image_size, int) else m.image_size
+ # pos resize
+ old_pos_embed_state_dict = m.position_embedding.state_dict()
+ old_pos_embed = old_pos_embed_state_dict['weight']
+ dtype = old_pos_embed.dtype
+ grid_size = [m.config.image_size[0] // m.patch_size, m.config.image_size[1] // m.patch_size]
+ extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
+ new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
+ if new_seq_len == old_pos_embed.shape[0]:
+ # m.to(args.device)
+ return
+
+ m.num_patches = grid_size[0] * grid_size[1]
+ m.num_positions = m.num_patches + 1
+ m.register_buffer("position_ids", torch.arange(m.num_positions).expand((1, -1)))
+ new_position_embedding = nn.Embedding(m.num_positions, m.embed_dim)
+
+ if extra_tokens:
+ pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
+ else:
+ pos_emb_tok, pos_emb_img = None, old_pos_embed
+ old_grid_size = [int(math.sqrt(len(pos_emb_img)))] * 2
+
+ # if is_master(args):
+ # logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
+ pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
+ pos_emb_img = F.interpolate(
+ pos_emb_img,
+ size=grid_size,
+ mode='bicubic',
+ antialias=True,
+ align_corners=False,
+ )
+ pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
+ if pos_emb_tok is not None:
+ new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
+ else:
+ new_pos_embed = pos_emb_img
+ old_pos_embed_state_dict['weight'] = new_pos_embed.to(dtype)
+ m.position_embedding = new_position_embedding
+ m.position_embedding.load_state_dict(old_pos_embed_state_dict)
+
+ # m.to(args.device)
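+
+ # Worked example (illustration only): resizing a 224x224 checkpoint with patch_size=32
+ # (7x7 grid, 1 + 49 positions) to 448x448 inputs (14x14 grid, 1 + 196 positions) keeps the
+ # class-token embedding as-is and bicubically interpolates the 49 patch embeddings up to
+ # 196 before loading them into the new nn.Embedding.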
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ def get_text_features(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+ text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+ >>> text_features = model.get_text_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = text_outputs[1]
+ text_features = self.text_projection(pooled_output)
+
+ return text_features
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ def get_image_features(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+ image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPVisionModel`].
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> image_features = model.get_image_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = vision_outputs[1] # pooled_output
+ image_features = self.visual_projection(pooled_output)
+
+ return image_features
+
+ @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CLIPOutput, config_class=LanguageBindImageConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ return_loss: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CLIPOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+ ... )
+
+ >>> outputs = model(**inputs)
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+ >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ image_embeds = vision_outputs[1]
+ image_embeds = self.visual_projection(image_embeds)
+
+ text_embeds = text_outputs[1]
+ text_embeds = self.text_projection(text_embeds)
+
+ # normalized features
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+ # cosine similarity as logits
+ logit_scale = self.logit_scale.exp()
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+ logits_per_image = logits_per_text.t()
+
+ loss = None
+ if return_loss:
+ loss = clip_loss(logits_per_text)
+
+ if not return_dict:
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+ return ((loss,) + output) if loss is not None else output
+
+ return CLIPOutput(
+ loss=loss,
+ logits_per_image=logits_per_image,
+ logits_per_text=logits_per_text,
+ text_embeds=text_embeds,
+ image_embeds=image_embeds,
+ text_model_output=text_outputs,
+ vision_model_output=vision_outputs,
+ )
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/image/processing_image.py b/models/multimodal_encoder/languagebind/image/processing_image.py
new file mode 100755
index 0000000..1aafc79
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/image/processing_image.py
@@ -0,0 +1,82 @@
+import torch
+from PIL import Image
+from torchvision import transforms
+from transformers import ProcessorMixin, BatchEncoding
+from transformers.image_processing_utils import BatchFeature
+
+OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+def make_list_of_images(x):
+ if not isinstance(x, list):
+ return [x]
+ return x
+
+def get_image_transform(config):
+ config = config.vision_config
+ transform = transforms.Compose(
+ [
+ transforms.ToTensor(),
+ transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
+ transforms.CenterCrop(224),
+ transforms.Normalize(OPENAI_DATASET_MEAN, OPENAI_DATASET_STD) # assume image
+ ]
+ )
+ return transform
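+
+ # Rough output sketch (assumption: a standard RGB PIL image as input): the composed
+ # transform returns a float tensor of shape (3, 224, 224), bicubically resized,
+ # center-cropped and normalized with the OpenAI CLIP statistics above, e.g.
+ #
+ #     transform = get_image_transform(config)        # `config` as used by the processor below
+ #     tensor = transform(Image.open("example.jpg"))  # hypothetical path
+ #     tensor.shape                                   # torch.Size([3, 224, 224])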
+
+
+def load_and_transform_image(image_path, transform):
+ image = Image.open(image_path).convert('RGB') if isinstance(image_path, str) else image_path
+ image_outputs = transform(image)
+ return image_outputs
+
+class LanguageBindImageProcessor(ProcessorMixin):
+ attributes = []
+ tokenizer_class = "LanguageBindImageTokenizer"
+
+ def __init__(self, config, tokenizer=None, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.transform = get_image_transform(config)
+ self.image_processor = load_and_transform_image
+ self.tokenizer = tokenizer
+ self.image_mean = OPENAI_DATASET_MEAN
+ self.crop_size = {'height': 224, 'width': 224}
+
+ def __call__(self, images=None, text=None, context_length=77, return_tensors=None, **kwargs):
+ if text is None and images is None:
+ raise ValueError("You have to specify either text or images. Both cannot be none.")
+
+ if text is not None:
+ encoding = self.tokenizer(text, max_length=context_length, padding='max_length',
+ truncation=True, return_tensors=return_tensors, **kwargs)
+
+ if images is not None:
+ images = make_list_of_images(images)
+ image_features = [self.image_processor(image, self.transform) for image in images]
+ image_features = torch.stack(image_features)
+
+ if text is not None and images is not None:
+ encoding["pixel_values"] = image_features
+ return encoding
+ elif text is not None:
+ return encoding
+ else:
+ return {"pixel_values": image_features}
+
+ def preprocess(self, images, return_tensors):
+ return self.__call__(images=images, return_tensors=return_tensors)
+
+ def batch_decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+ refer to the docstring of this method for more information.
+ """
+ return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
+
+ def decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+ the docstring of this method for more information.
+ """
+ return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
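+
+ # Minimal usage sketch (hypothetical objects/paths, for orientation only):
+ #
+ #     processor = LanguageBindImageProcessor(model.config, tokenizer=tokenizer)
+ #     batch = processor(images=Image.open("cat.jpg"), text=["a photo of a cat"],
+ #                       return_tensors="pt")
+ #     # batch["pixel_values"]: (1, 3, 224, 224); batch["input_ids"]/["attention_mask"]: (1, 77)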
diff --git a/models/multimodal_encoder/languagebind/image/tokenization_image.py b/models/multimodal_encoder/languagebind/image/tokenization_image.py
new file mode 100755
index 0000000..593423d
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/image/tokenization_image.py
@@ -0,0 +1,77 @@
+from transformers import CLIPTokenizer
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+ "vocab_file": "vocab.json",
+ "merges_file": "merges.txt",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+ "vocab_file": {
+ "lb203/LanguageBind-Image": "https://huggingface.co/lb203/LanguageBind-Image/resolve/main/vocab.json",
+ },
+ "merges_file": {
+ "lb203/LanguageBind-Image": "https://huggingface.co/lb203/LanguageBind-Image/resolve/main/merges.txt",
+ },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "lb203/LanguageBind-Image": 77,
+}
+
+
+PRETRAINED_INIT_CONFIGURATION = {
+ "lb203/LanguageBind-Image": {},
+}
+
+class LanguageBindImageTokenizer(CLIPTokenizer):
+ """
+ Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The end of sequence token.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ merges_file,
+ errors="replace",
+ unk_token="<|endoftext|>",
+ bos_token="<|startoftext|>",
+ eos_token="<|endoftext|>",
+ pad_token="<|endoftext|>", # hack to enable padding
+ **kwargs,
+ ):
+ super(LanguageBindImageTokenizer, self).__init__(
+ vocab_file,
+ merges_file,
+ errors,
+ unk_token,
+ bos_token,
+ eos_token,
+ pad_token, # hack to enable padding
+ **kwargs,)
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/thermal/configuration_thermal.py b/models/multimodal_encoder/languagebind/thermal/configuration_thermal.py
new file mode 100755
index 0000000..fd6cedd
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/thermal/configuration_thermal.py
@@ -0,0 +1,423 @@
+import copy
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+
+
+
+
+
+class CLIPTextConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
+ text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the text encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 49408):
+ Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`CLIPModel`].
+ hidden_size (`int`, *optional*, defaults to 512):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 2048):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ max_position_embeddings (`int`, *optional*, defaults to 77):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPTextConfig, CLIPTextModel
+
+ >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPTextConfig()
+
+ >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPTextModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+ model_type = "clip_text_model"
+
+ def __init__(
+ self,
+ vocab_size=49408,
+ hidden_size=512,
+ intermediate_size=2048,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=8,
+ max_position_embeddings=77,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+ # This differs from `CLIPTokenizer`'s default and from openai/clip
+ # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
+ pad_token_id=1,
+ bos_token_id=49406,
+ eos_token_id=49407,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.max_position_embeddings = max_position_embeddings
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+ self.add_time_attn = False  # temporal attention is never used in the text tower
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the text config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["text_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+
+
+class CLIPVisionConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
+ CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 32):
+ The size (resolution) of each patch.
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPVisionConfig, CLIPVisionModel
+
+ >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPVisionConfig()
+
+ >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPVisionModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "clip_vision_model"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ intermediate_size=3072,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ num_channels=3,
+ image_size=224,
+ patch_size=32,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+
+ # LanguageBind-specific options (not part of the upstream CLIPVisionConfig)
+ add_time_attn=False,
+ num_frames=1,
+ force_patch_dropout=0.0,
+ lora_r=2,
+ lora_alpha=16,
+ lora_dropout=0.0,
+ num_mel_bins=0.0,
+ target_length=0.0,
+ video_decode_backend='decord',
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_channels = num_channels
+ self.patch_size = patch_size
+ self.image_size = image_size
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+
+ # LanguageBind-specific attributes
+ self.add_time_attn = add_time_attn
+ self.num_frames = num_frames
+ self.force_patch_dropout = force_patch_dropout
+ self.lora_r = lora_r
+ self.lora_alpha = lora_alpha
+ self.lora_dropout = lora_dropout
+ self.num_mel_bins = num_mel_bins
+ self.target_length = target_length
+ self.video_decode_backend = video_decode_backend
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the vision config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["vision_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+class LanguageBindThermalConfig(PretrainedConfig):
+ r"""
+ [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
+ a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
+ a configuration with the defaults will yield a similar configuration to that of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ text_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPTextConfig`].
+ vision_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
+ projection_dim (`int`, *optional*, defaults to 512):
+ Dimensionality of text and vision projection layers.
+ logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+ The initial value of the *logit_scale* parameter. The default is used as per the original CLIP implementation.
+ kwargs (*optional*):
+ Dictionary of keyword arguments.
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPConfig, CLIPModel
+
+ >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPConfig()
+
+ >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+
+ >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
+ >>> from transformers import CLIPTextConfig, CLIPVisionConfig
+
+ >>> # Initializing a CLIPText and CLIPVision configuration
+ >>> config_text = CLIPTextConfig()
+ >>> config_vision = CLIPVisionConfig()
+
+ >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
+ ```"""
+
+ model_type = "LanguageBindThermal"
+ is_composition = True
+
+ def __init__(
+ self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
+ ):
+ # If `text_config_dict`/`vision_config_dict` exist, use them for backward compatibility.
+ # They are popped out before calling `super().__init__` to avoid them being saved (which
+ # causes a lot of confusion!).
+ text_config_dict = kwargs.pop("text_config_dict", None)
+ vision_config_dict = kwargs.pop("vision_config_dict", None)
+
+ super().__init__(**kwargs)
+
+ # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
+ # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
+ # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
+ if text_config_dict is not None:
+ if text_config is None:
+ text_config = {}
+
+ # This is the complete result when using `text_config_dict`.
+ _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict()
+
+ # Give a warning if the values exist in both `_text_config_dict` and `text_config` but are different.
+ for key, value in _text_config_dict.items():
+ if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
+ # If specified in `text_config_dict`
+ if key in text_config_dict:
+ message = (
+ f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
+ f'The value `text_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
+ f'value `text_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `text_config` with the ones in `_text_config_dict`.
+ text_config.update(_text_config_dict)
+
+ if vision_config_dict is not None:
+ if vision_config is None:
+ vision_config = {}
+
+ # This is the complete result when using `vision_config_dict`.
+ _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict()
+ # convert keys to string instead of integer
+ if "id2label" in _vision_config_dict:
+ _vision_config_dict["id2label"] = {
+ str(key): value for key, value in _vision_config_dict["id2label"].items()
+ }
+
+ # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but are different.
+ for key, value in _vision_config_dict.items():
+ if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
+ # If specified in `vision_config_dict`
+ if key in vision_config_dict:
+ message = (
+ f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
+ f'values. The value `vision_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
+ f'The value `vision_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `vision_config` with the ones in `_vision_config_dict`.
+ vision_config.update(_vision_config_dict)
+
+ if text_config is None:
+ text_config = {}
+ logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")
+
+ if vision_config is None:
+ vision_config = {}
+ logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.")
+
+ self.text_config = CLIPTextConfig(**text_config)
+ self.vision_config = CLIPVisionConfig(**vision_config)
+
+ self.projection_dim = projection_dim
+ self.logit_scale_init_value = logit_scale_init_value
+ self.initializer_factor = 1.0
+
+ @classmethod
+ def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
+ r"""
+ Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
+ configuration.
+
+ Returns:
+ [`CLIPConfig`]: An instance of a configuration object
+ """
+
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
+
+ def to_dict(self):
+ """
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+ Returns:
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
+ """
+ output = copy.deepcopy(self.__dict__)
+ output["text_config"] = self.text_config.to_dict()
+ output["vision_config"] = self.vision_config.to_dict()
+ output["model_type"] = self.__class__.model_type
+ return output
+
+
+
+
+
+
+
+
+
+
diff --git a/models/multimodal_encoder/languagebind/thermal/modeling_thermal.py b/models/multimodal_encoder/languagebind/thermal/modeling_thermal.py
new file mode 100755
index 0000000..f0323b3
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/thermal/modeling_thermal.py
@@ -0,0 +1,1030 @@
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+from einops import rearrange
+from peft import LoraConfig, get_peft_model
+from torch import nn
+from torch.nn import functional as F
+from transformers import PreTrainedModel, add_start_docstrings
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from transformers.models.clip.modeling_clip import CLIPMLP, CLIPAttention, CLIPTextEmbeddings, CLIPVisionEmbeddings, \
+ CLIPVisionModelWithProjection, CLIPTextModelWithProjection, _expand_mask, CLIPOutput, clip_loss
+from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings
+
+from .configuration_thermal import LanguageBindThermalConfig, CLIPVisionConfig, CLIPTextConfig
+
+
+
+class PatchDropout(nn.Module):
+ """
+ https://arxiv.org/abs/2212.00794
+ """
+
+ def __init__(self, prob, exclude_first_token=True):
+ super().__init__()
+ assert 0 <= prob < 1.
+ self.prob = prob
+ self.exclude_first_token = exclude_first_token # exclude CLS token
+
+ def forward(self, x, B, T):
+ if not self.training or self.prob == 0.:
+ return x
+
+ if self.exclude_first_token:
+ cls_tokens, x = x[:, :1], x[:, 1:]
+ else:
+ cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
+
+ batch = x.size()[0]
+ num_tokens = x.size()[1]
+
+ batch_indices = torch.arange(batch)
+ batch_indices = batch_indices[..., None]
+
+ keep_prob = 1 - self.prob
+ num_patches_keep = max(1, int(num_tokens * keep_prob))
+
+ if T == 1:
+ rand = torch.randn(batch, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ else:
+ rand = torch.randn(B, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ patch_indices_keep = patch_indices_keep.unsqueeze(1).repeat(1, T, 1)
+ patch_indices_keep = rearrange(patch_indices_keep, 'b t n -> (b t) n')
+
+
+ x = x[batch_indices, patch_indices_keep]
+
+ if self.exclude_first_token:
+ x = torch.cat((cls_tokens, x), dim=1)
+
+ return x
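+
+ # Worked example (assumption, for illustration): with prob=0.5 and an input of shape
+ # (B*T, 1 + 196, d), the CLS token is kept together with a random 98 of the 196 patch
+ # tokens, giving an output of shape (B*T, 1 + 98, d); when T > 1 the same patch indices
+ # are reused for all T frames of a sample.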
+
+class CLIPEncoderLayer(nn.Module):
+ def __init__(self, config: LanguageBindThermalConfig):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.self_attn = CLIPAttention(config)
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.mlp = CLIPMLP(config)
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ self.add_time_attn = config.add_time_attn
+ if self.add_time_attn:
+ self.t = config.num_frames
+ self.temporal_embedding = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
+ nn.init.normal_(self.temporal_embedding, std=config.hidden_size ** -0.5)
+
+ self.embed_dim = config.hidden_size
+ self.temporal_attn = CLIPAttention(config)
+ self.temporal_layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.temporal_mlp = CLIPMLP(config)
+ self.temporal_layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ causal_attention_mask: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+
+
+ if self.add_time_attn:
+ bt, n, d = hidden_states.shape
+ t = self.t
+
+ # time embed
+ if t != 1:
+ n = hidden_states.shape[1]
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ hidden_states = hidden_states + self.temporal_embedding[:, :t, :]
+ hidden_states = rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # time attn
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm1(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.temporal_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm2(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm2(hidden_states)
+ hidden_states = self.temporal_mlp(hidden_states)
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # spatial attn
+ residual = hidden_states
+
+ hidden_states = self.layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+ hidden_states = self.layer_norm2(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
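+
+ # Shape sketch (illustrative, assuming add_time_attn=True): hidden_states arrive as
+ # (B*T, N, D); the temporal branch rearranges them to (B*N, T, D) so attention runs across
+ # the T frames of each spatial token, then rearranges back to (B*T, N, D) before the usual
+ # spatial self-attention and MLP blocks.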
+
+
+
+
+
+
+
+
+
+class CLIPPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = LanguageBindThermalConfig
+ base_model_prefix = "clip"
+ supports_gradient_checkpointing = True
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ factor = self.config.initializer_factor
+ if isinstance(module, CLIPTextEmbeddings):
+ module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ elif isinstance(module, CLIPVisionEmbeddings):
+ factor = self.config.initializer_factor
+ nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+ nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+ nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+ elif isinstance(module, CLIPAttention):
+ factor = self.config.initializer_factor
+ in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ out_proj_std = (module.embed_dim**-0.5) * factor
+ nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+ elif isinstance(module, CLIPMLP):
+ factor = self.config.initializer_factor
+ in_proj_std = (
+ (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ )
+ fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+ nn.init.normal_(module.fc1.weight, std=fc_std)
+ nn.init.normal_(module.fc2.weight, std=in_proj_std)
+ elif isinstance(module, LanguageBindThermal):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPVisionModelWithProjection):
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPTextModelWithProjection):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+
+ if isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, CLIPEncoder):
+ module.gradient_checkpointing = value
+
+
+CLIP_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CLIP_TEXT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ return_loss (`bool`, *optional*):
+ Whether or not to return the contrastive loss.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class CLIPEncoder(nn.Module):
+ """
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+ [`CLIPEncoderLayer`].
+
+ Args:
+ config: CLIPConfig
+ """
+
+ def __init__(self, config: LanguageBindThermalConfig):
+ super().__init__()
+ self.config = config
+ self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ inputs_embeds,
+ attention_mask: Optional[torch.Tensor] = None,
+ causal_attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ hidden_states = inputs_embeds
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(encoder_layer),
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+ Make causal mask used for bi-directional self-attention.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
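+
+ # Worked example (illustration only): for input_ids_shape == (1, 3), no past key values and
+ # float32, the returned mask has shape (1, 1, 3, 3) and looks like
+ #
+ #     [[0, m, m],
+ #      [0, 0, m],
+ #      [0, 0, 0]]
+ #
+ # where m = torch.finfo(torch.float32).min, i.e. each position may only attend to itself
+ # and to earlier positions.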
+
+
+class CLIPTextTransformer(nn.Module):
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+ self.embeddings = CLIPTextEmbeddings(config)
+ self.encoder = CLIPEncoder(config)
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is None:
+ raise ValueError("You have to specify input_ids")
+
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+ # CLIP's text model uses causal mask, prepare it here.
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+ causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
+ # expand attention_mask
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+ # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
+ # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+ pooled_output = last_hidden_state[
+ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+ input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
+ ]
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
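+    # Note on the pooling above (illustrative, token ids are hypothetical): the EOT
+    # token has the largest id in CLIP's vocabulary, so `argmax(dim=-1)` locates it
+    # in each sequence; e.g. for input_ids [[49406, 320, 1125, 49407]] the argmax is
+    # position 3 and the pooled output is last_hidden_state[:, 3, :].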
+
+
+@add_start_docstrings(
+ """The text model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPTextModel(CLIPPreTrainedModel):
+ config_class = CLIPTextConfig
+
+ _no_split_modules = ["CLIPEncoderLayer"]
+
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__(config)
+ self.text_model = CLIPTextTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.text_model.embeddings.token_embedding
+
+ def set_input_embeddings(self, value):
+ self.text_model.embeddings.token_embedding = value
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPTextModel
+
+ >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+class CLIPVisionTransformer(nn.Module):
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+
+ self.embeddings = CLIPVisionEmbeddings(config)
+ self.patch_dropout = PatchDropout(config.force_patch_dropout)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)  # (sic) misspelling kept to match the pretrained CLIP checkpoint keys
+ self.encoder = CLIPEncoder(config)
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+        # Modified from the original CLIP vision tower: flatten any leading
+        # batch/pair/time dimensions so every frame goes through the shared
+        # 2-D patch embedding as an ordinary image.
+        if len(pixel_values.shape) == 7:
+            b_new, pair_new, T, bs_new, channel_new, h_new, w_new = pixel_values.shape
+            B = b_new * pair_new * bs_new
+            pixel_values = pixel_values.reshape(B*T, channel_new, h_new, w_new)
+
+        elif len(pixel_values.shape) == 5:
+            # video input: (b, c, t, h, w) -> (b*t, c, h, w)
+            B, _, T, _, _ = pixel_values.shape
+            pixel_values = rearrange(pixel_values, 'b c t h w -> (b t) c h w')
+        else:
+            # plain image input: (b, c, h, w), treated as a single frame (T = 1)
+            B, _, _, _ = pixel_values.shape
+            T = 1
+ hidden_states = self.embeddings(pixel_values)
+
+        hidden_states = self.patch_dropout(hidden_states, B, T)  # modified: optional patch dropout (no-op at eval time or when force_patch_dropout == 0)
+
+ hidden_states = self.pre_layrnorm(hidden_states)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ pooled_output = last_hidden_state[:, 0, :]
+ pooled_output = self.post_layernorm(pooled_output)
+
+        pooled_output = pooled_output.reshape(B, T, -1).mean(1)  # modified: average per-frame pooled features over time
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
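+    # Shape sketch for the forward pass above (illustrative, sizes hypothetical):
+    # a video batch of shape (2, 3, 8, 224, 224) is flattened to 16 frames, encoded
+    # frame by frame, and the per-frame pooled features are averaged over the 8
+    # frames, so pooler_output comes back with shape (2, hidden_size) while
+    # last_hidden_state stays per-frame with shape (16, num_patches + 1, hidden_size).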
+
+
+@add_start_docstrings(
+ """The vision model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPVisionModel(CLIPPreTrainedModel):
+ config_class = CLIPVisionConfig
+ main_input_name = "pixel_values"
+
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__(config)
+ self.vision_model = CLIPVisionTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.vision_model.embeddings.patch_embedding
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPVisionModel
+
+ >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled CLS states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+@add_start_docstrings(CLIP_START_DOCSTRING)
+class LanguageBindThermal(CLIPPreTrainedModel):
+ config_class = LanguageBindThermalConfig
+
+ def __init__(self, config: LanguageBindThermalConfig):
+ super().__init__(config)
+
+ if not isinstance(config.text_config, CLIPTextConfig):
+ raise ValueError(
+ "config.text_config is expected to be of type CLIPTextConfig but is of type"
+ f" {type(config.text_config)}."
+ )
+
+ if not isinstance(config.vision_config, CLIPVisionConfig):
+ raise ValueError(
+ "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+ f" {type(config.vision_config)}."
+ )
+
+ text_config = config.text_config
+ vision_config = config.vision_config
+ self.add_time_attn = vision_config.add_time_attn
+ self.lora_r = vision_config.lora_r
+ self.lora_alpha = vision_config.lora_alpha
+ self.lora_dropout = vision_config.lora_dropout
+
+ self.projection_dim = config.projection_dim
+ self.text_embed_dim = text_config.hidden_size
+ self.vision_embed_dim = vision_config.hidden_size
+
+ self.text_model = CLIPTextTransformer(text_config)
+ self.vision_model = CLIPVisionTransformer(vision_config)
+
+ self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+ self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+ self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+ # Initialize weights and apply final processing
+ self.post_init()
+ self.convert_to_lora()
+ self.resize_pos(self.vision_model.embeddings, vision_config)
+
+ def convert_to_lora(self):
+ if self.lora_r == 0:
+ return
+ if self.add_time_attn:
+ target_modules = ["temporal_attn.k_proj", "temporal_attn.v_proj",
+ "temporal_attn.q_proj", "temporal_attn.out_proj",
+ "temporal_mlp.fc1", "temporal_mlp.fc2"]
+ else:
+ target_modules = ["k_proj", "v_proj", "q_proj", "out_proj"]
+ config = LoraConfig(
+ r=self.lora_r, # 16
+ lora_alpha=self.lora_alpha, # 16
+ target_modules=target_modules, # self_attn.out_proj
+ lora_dropout=self.lora_dropout, # 0.1
+ bias="none",
+ modules_to_save=[],
+ )
+ self.vision_model.encoder.is_gradient_checkpointing = False
+ self.vision_model.encoder = get_peft_model(self.vision_model.encoder, config)
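+    # Descriptive note: when lora_r > 0, get_peft_model wraps only the vision
+    # encoder, making the low-rank adapters on the listed projections the sole
+    # trainable weights there (the spatial q/k/v/out projections by default, or the
+    # temporal attention/MLP layers when add_time_attn is enabled); lora_r == 0
+    # leaves the encoder untouched.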
+
+ def resize_pos(self, m, vision_config):
+ # convert embedding
+ if vision_config.num_mel_bins!=0 and vision_config.target_length!=0:
+ m.image_size = [vision_config.num_mel_bins, vision_config.target_length]
+ m.config.image_size = [m.image_size, m.image_size] if isinstance(m.image_size, int) else m.image_size
+ # pos resize
+ old_pos_embed_state_dict = m.position_embedding.state_dict()
+ old_pos_embed = old_pos_embed_state_dict['weight']
+ dtype = old_pos_embed.dtype
+ grid_size = [m.config.image_size[0] // m.patch_size, m.config.image_size[1] // m.patch_size]
+ extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
+ new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
+ if new_seq_len == old_pos_embed.shape[0]:
+ # m.to(args.device)
+ return
+
+ m.num_patches = grid_size[0] * grid_size[1]
+ m.num_positions = m.num_patches + 1
+ m.register_buffer("position_ids", torch.arange(m.num_positions).expand((1, -1)))
+ new_position_embedding = nn.Embedding(m.num_positions, m.embed_dim)
+
+ if extra_tokens:
+ pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
+ else:
+ pos_emb_tok, pos_emb_img = None, old_pos_embed
+ old_grid_size = [int(math.sqrt(len(pos_emb_img)))] * 2
+
+ # if is_master(args):
+ # logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
+ pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
+ pos_emb_img = F.interpolate(
+ pos_emb_img,
+ size=grid_size,
+ mode='bicubic',
+ antialias=True,
+ align_corners=False,
+ )
+ pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
+ if pos_emb_tok is not None:
+ new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
+ else:
+ new_pos_embed = pos_emb_img
+ old_pos_embed_state_dict['weight'] = new_pos_embed.to(dtype)
+ m.position_embedding = new_position_embedding
+ m.position_embedding.load_state_dict(old_pos_embed_state_dict)
+
+ # m.to(args.device)
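+    # Worked example for the resize above (illustrative numbers): a checkpoint
+    # trained at image_size 224 with patch_size 32 stores a 7x7 grid plus one class
+    # token (50 positions); reconfiguring to a 14x14 grid keeps the class-token row
+    # and bicubically interpolates the 7x7 grid to 14x14, giving 197 positions.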
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ def get_text_features(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+ >>> text_features = model.get_text_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = text_outputs[1]
+ text_features = self.text_projection(pooled_output)
+
+ return text_features
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ def get_image_features(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPVisionModel`].
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> image_features = model.get_image_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = vision_outputs[1] # pooled_output
+ image_features = self.visual_projection(pooled_output)
+
+ return image_features
+
+ @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CLIPOutput, config_class=LanguageBindThermalConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ return_loss: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CLIPOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+ ... )
+
+ >>> outputs = model(**inputs)
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+ >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ image_embeds = vision_outputs[1]
+ image_embeds = self.visual_projection(image_embeds)
+
+ text_embeds = text_outputs[1]
+ text_embeds = self.text_projection(text_embeds)
+
+ # normalized features
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+ # cosine similarity as logits
+ logit_scale = self.logit_scale.exp()
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+ logits_per_image = logits_per_text.t()
+
+ loss = None
+ if return_loss:
+ loss = clip_loss(logits_per_text)
+
+ if not return_dict:
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+ return ((loss,) + output) if loss is not None else output
+
+ return CLIPOutput(
+ loss=loss,
+ logits_per_image=logits_per_image,
+ logits_per_text=logits_per_text,
+ text_embeds=text_embeds,
+ image_embeds=image_embeds,
+ text_model_output=text_outputs,
+ vision_model_output=vision_outputs,
+ )
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/thermal/processing_thermal.py b/models/multimodal_encoder/languagebind/thermal/processing_thermal.py
new file mode 100755
index 0000000..36ed1f0
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/thermal/processing_thermal.py
@@ -0,0 +1,77 @@
+import torch
+from PIL import Image
+from torchvision import transforms
+from transformers import ProcessorMixin, BatchEncoding
+from transformers.image_processing_utils import BatchFeature
+
+OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+def make_list_of_images(x):
+ if not isinstance(x, list):
+ return [x]
+ return x
+
+def get_thermal_transform(config):
+ config = config.vision_config
+ transform = transforms.Compose(
+ [
+ transforms.ToTensor(),
+ transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
+ transforms.CenterCrop(224),
+ transforms.Normalize(OPENAI_DATASET_MEAN, OPENAI_DATASET_STD) # assume image
+ ]
+ )
+ return transform
+
+
+def load_and_transform_thermal(thermal_path, transform):
+ thermal = Image.open(thermal_path)
+ thermal_outputs = transform(thermal)
+ return thermal_outputs
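+# Minimal usage sketch (the path and config are placeholders, not part of this
+# repo): for an RGB thermal image the transform resizes the short side to 224,
+# center-crops, and normalizes with the OpenAI CLIP statistics.
+# >>> transform = get_thermal_transform(config)
+# >>> pixels = load_and_transform_thermal("example_thermal.jpg", transform)
+# >>> pixels.shape
+# torch.Size([3, 224, 224])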
+
+class LanguageBindThermalProcessor(ProcessorMixin):
+ attributes = []
+ tokenizer_class = ("LanguageBindThermalTokenizer")
+
+ def __init__(self, config, tokenizer=None, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.transform = get_thermal_transform(config)
+ self.image_processor = load_and_transform_thermal
+ self.tokenizer = tokenizer
+
+ def __call__(self, images=None, text=None, context_length=77, return_tensors=None, **kwargs):
+ if text is None and images is None:
+ raise ValueError("You have to specify either text or images. Both cannot be none.")
+
+ if text is not None:
+ encoding = self.tokenizer(text, max_length=context_length, padding='max_length',
+ truncation=True, return_tensors=return_tensors, **kwargs)
+
+ if images is not None:
+ images = make_list_of_images(images)
+ image_features = [self.image_processor(image, self.transform) for image in images]
+ image_features = torch.stack(image_features)
+
+ if text is not None and images is not None:
+ encoding["pixel_values"] = image_features
+ return encoding
+ elif text is not None:
+ return encoding
+ else:
+ return {"pixel_values": image_features}
+
+ def batch_decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+ refer to the docstring of this method for more information.
+ """
+ return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
+
+ def decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+ the docstring of this method for more information.
+ """
+ return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
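+# Minimal usage sketch (identifiers below are placeholders): given a loaded
+# LanguageBindThermalConfig and tokenizer, the processor returns token ids plus a
+# (num_images, 3, 224, 224) pixel_values tensor when both modalities are passed.
+# >>> processor = LanguageBindThermalProcessor(config, tokenizer=tokenizer)
+# >>> batch = processor(images=["example_thermal.jpg"], text=["a person walking"],
+# ...                   return_tensors="pt")
+# >>> sorted(batch.keys()), batch["pixel_values"].shape
+# (['attention_mask', 'input_ids', 'pixel_values'], torch.Size([1, 3, 224, 224]))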
diff --git a/models/multimodal_encoder/languagebind/thermal/tokenization_thermal.py b/models/multimodal_encoder/languagebind/thermal/tokenization_thermal.py
new file mode 100755
index 0000000..a4ebb56
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/thermal/tokenization_thermal.py
@@ -0,0 +1,77 @@
+from transformers import CLIPTokenizer
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+ "vocab_file": "vocab.json",
+ "merges_file": "merges.txt",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+ "vocab_file": {
+ "lb203/LanguageBind-Thermal": "https://huggingface.co/lb203/LanguageBind-Thermal/resolve/main/vocab.json",
+ },
+ "merges_file": {
+ "lb203/LanguageBind-Thermal": "https://huggingface.co/lb203/LanguageBind-Thermal/resolve/main/merges.txt",
+ },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "lb203/LanguageBind-Thermal": 77,
+}
+
+
+PRETRAINED_INIT_CONFIGURATION = {
+ "lb203/LanguageBind-Thermal": {},
+}
+
+class LanguageBindThermalTokenizer(CLIPTokenizer):
+ """
+ Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The end of sequence token.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ merges_file,
+ errors="replace",
+ unk_token="<|endoftext|>",
+ bos_token="<|startoftext|>",
+ eos_token="<|endoftext|>",
+ pad_token="<|endoftext|>", # hack to enable padding
+ **kwargs,
+ ):
+ super(LanguageBindThermalTokenizer, self).__init__(
+ vocab_file,
+ merges_file,
+ errors,
+ unk_token,
+ bos_token,
+ eos_token,
+ pad_token, # hack to enable padding
+ **kwargs,)
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/video/check_video_chatgpt_videos.py b/models/multimodal_encoder/languagebind/video/check_video_chatgpt_videos.py
new file mode 100755
index 0000000..eaddfff
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/video/check_video_chatgpt_videos.py
@@ -0,0 +1,56 @@
+import torch
+import cv2
+import decord
+from decord import VideoReader, cpu
+decord.bridge.set_bridge('torch')
+import numpy as np
+from PIL import Image
+from torchvision import transforms
+from transformers import ProcessorMixin, BatchEncoding
+from transformers.image_processing_utils import BatchFeature
+from pytorchvideo.data.encoded_video import EncodedVideo
+from torchvision.transforms import Compose, Lambda, ToTensor
+from torchvision.transforms._transforms_video import NormalizeVideo, RandomCropVideo, RandomHorizontalFlipVideo, CenterCropVideo
+from pytorchvideo.transforms import ApplyTransformToKey, ShortSideScale, UniformTemporalSubsample
+
+import os
+import glob
+from tqdm import tqdm
+
+OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+def get_video_transform():
+ # import pdb; pdb.set_trace()
+
+
+ transform = Compose(
+ [
+ # UniformTemporalSubsample(num_frames),
+ Lambda(lambda x: x / 255.0),
+ NormalizeVideo(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
+ ShortSideScale(size=224),
+ CenterCropVideo(224),
+ RandomHorizontalFlipVideo(p=0.5),
+ ]
+ )
+
+ return transform
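+# Illustrative shape check (sizes are hypothetical): the transform expects a
+# (C, T, H, W) clip and returns the same layout at 224x224 resolution.
+# >>> clip = torch.randint(0, 256, (3, 8, 360, 640), dtype=torch.uint8)
+# >>> get_video_transform()(clip).shape
+# torch.Size([3, 8, 224, 224])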
+
+if __name__ == '__main__':
+ directory = '/comp_robot/lushunlin/MotionGPT/video_datasets/videochatgpt/videochatgpt_tune'
+ mp4_files = glob.glob(os.path.join(directory, '*.mp4'))
+ # import pdb; pdb.set_trace()
+ transform = get_video_transform()
+ for video_path in tqdm(mp4_files):
+ try:
+ decord.bridge.set_bridge('torch')
+ decord_vr = VideoReader(video_path, ctx=cpu(0))
+ duration = len(decord_vr)
+ frame_id_list = np.linspace(0, duration-1, 8, dtype=int)
+ video_data = decord_vr.get_batch(frame_id_list)
+ video_data = video_data.permute(3, 0, 1, 2) # (T, H, W, C) -> (C, T, H, W)
+ video_outputs = transform(video_data)
+        except Exception:
+ with open('/comp_robot/lushunlin/MotionGPT/records/decord_error.txt', 'a') as f:
+ f.write(video_path+'\n')
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/video/configuration_video.py b/models/multimodal_encoder/languagebind/video/configuration_video.py
new file mode 100755
index 0000000..4b108ec
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/video/configuration_video.py
@@ -0,0 +1,423 @@
+import copy
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+
+
+
+
+
+class CLIPTextConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
+ text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the text encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 49408):
+ Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`CLIPModel`].
+ hidden_size (`int`, *optional*, defaults to 512):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 2048):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ max_position_embeddings (`int`, *optional*, defaults to 77):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPTextConfig, CLIPTextModel
+
+ >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPTextConfig()
+
+ >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPTextModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+ model_type = "clip_text_model"
+
+ def __init__(
+ self,
+ vocab_size=49408,
+ hidden_size=512,
+ intermediate_size=2048,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=8,
+ max_position_embeddings=77,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+ # This differs from `CLIPTokenizer`'s default and from openai/clip
+ # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
+ pad_token_id=1,
+ bos_token_id=49406,
+ eos_token_id=49407,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.max_position_embeddings = max_position_embeddings
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+        self.add_time_attn = False  # LanguageBind addition: the text tower never uses temporal attention
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the text config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["text_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+
+
+class CLIPVisionConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
+ CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 32):
+ The size (resolution) of each patch.
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ initializer_factor (`float`, *optional*, defaults to 1):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPVisionConfig, CLIPVisionModel
+
+ >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPVisionConfig()
+
+ >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPVisionModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "clip_vision_model"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ intermediate_size=3072,
+ projection_dim=512,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ num_channels=3,
+ image_size=224,
+ patch_size=32,
+ hidden_act="quick_gelu",
+ layer_norm_eps=1e-5,
+ attention_dropout=0.0,
+ initializer_range=0.02,
+ initializer_factor=1.0,
+
+        # LanguageBind-specific additions (not in the original CLIP vision config):
+        add_time_attn=False,
+        num_frames=1,
+        force_patch_dropout=0.0,
+        lora_r=2,
+        lora_alpha=16,
+        lora_dropout=0.0,
+        num_mel_bins=0.0,
+        target_length=0.0,
+        video_decode_backend='decord',
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.projection_dim = projection_dim
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_channels = num_channels
+ self.patch_size = patch_size
+ self.image_size = image_size
+ self.initializer_range = initializer_range
+ self.initializer_factor = initializer_factor
+ self.attention_dropout = attention_dropout
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+
+        # LanguageBind-specific additions
+        self.add_time_attn = add_time_attn
+        self.num_frames = num_frames
+        self.force_patch_dropout = force_patch_dropout
+        self.lora_r = lora_r
+        self.lora_alpha = lora_alpha
+        self.lora_dropout = lora_dropout
+        self.num_mel_bins = num_mel_bins
+        self.target_length = target_length
+        self.video_decode_backend = video_decode_backend
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the vision config dict if we are loading from CLIPConfig
+ if config_dict.get("model_type") == "clip":
+ config_dict = config_dict["vision_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+class LanguageBindVideoConfig(PretrainedConfig):
+ r"""
+ [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
+ a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
+ a configuration with the defaults will yield a similar configuration to that of the CLIP
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ text_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPTextConfig`].
+ vision_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
+ projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
+ logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
+ kwargs (*optional*):
+ Dictionary of keyword arguments.
+
+ Example:
+
+ ```python
+ >>> from transformers import CLIPConfig, CLIPModel
+
+ >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
+ >>> configuration = CLIPConfig()
+
+ >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
+ >>> model = CLIPModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+
+ >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
+ >>> from transformers import CLIPTextConfig, CLIPVisionConfig
+
+ >>> # Initializing a CLIPText and CLIPVision configuration
+ >>> config_text = CLIPTextConfig()
+ >>> config_vision = CLIPVisionConfig()
+
+ >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
+ ```"""
+
+ model_type = "LanguageBindVideo"
+ is_composition = True
+
+ def __init__(
+ self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
+ ):
+ # If `_config_dict` exist, we use them for the backward compatibility.
+ # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
+ # of confusion!).
+ text_config_dict = kwargs.pop("text_config_dict", None)
+ vision_config_dict = kwargs.pop("vision_config_dict", None)
+
+ super().__init__(**kwargs)
+
+ # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
+ # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
+ # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
+ if text_config_dict is not None:
+ if text_config is None:
+ text_config = {}
+
+ # This is the complete result when using `text_config_dict`.
+ _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict()
+
+ # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
+ for key, value in _text_config_dict.items():
+ if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
+ # If specified in `text_config_dict`
+ if key in text_config_dict:
+ message = (
+ f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
+ f'The value `text_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
+                            f'value `text_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `text_config` with the ones in `_text_config_dict`.
+ text_config.update(_text_config_dict)
+
+ if vision_config_dict is not None:
+ if vision_config is None:
+ vision_config = {}
+
+ # This is the complete result when using `vision_config_dict`.
+ _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict()
+ # convert keys to string instead of integer
+ if "id2label" in _vision_config_dict:
+ _vision_config_dict["id2label"] = {
+ str(key): value for key, value in _vision_config_dict["id2label"].items()
+ }
+
+ # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different.
+ for key, value in _vision_config_dict.items():
+ if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
+ # If specified in `vision_config_dict`
+ if key in vision_config_dict:
+ message = (
+ f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
+ f'values. The value `vision_config_dict["{key}"]` will be used instead.'
+ )
+ # If inferred from default argument values (just to be super careful)
+ else:
+ message = (
+ f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
+                            f'The value `vision_config["{key}"]` will be overridden.'
+ )
+ logger.warning(message)
+
+ # Update all values in `vision_config` with the ones in `_vision_config_dict`.
+ vision_config.update(_vision_config_dict)
+
+ if text_config is None:
+ text_config = {}
+ logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")
+
+ if vision_config is None:
+ vision_config = {}
+            logger.info("`vision_config` is `None`. Initializing the `CLIPVisionConfig` with default values.")
+
+ self.text_config = CLIPTextConfig(**text_config)
+ self.vision_config = CLIPVisionConfig(**vision_config)
+
+ self.projection_dim = projection_dim
+ self.logit_scale_init_value = logit_scale_init_value
+ self.initializer_factor = 1.0
+
+ @classmethod
+ def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
+ r"""
+ Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
+ configuration.
+
+ Returns:
+ [`CLIPConfig`]: An instance of a configuration object
+ """
+
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
+
+ def to_dict(self):
+ """
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+ Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
+ """
+ output = copy.deepcopy(self.__dict__)
+ output["text_config"] = self.text_config.to_dict()
+ output["vision_config"] = self.vision_config.to_dict()
+ output["model_type"] = self.__class__.model_type
+ return output
+
+
+
+
+
+
+
+
+
+
diff --git a/models/multimodal_encoder/languagebind/video/modeling_video.py b/models/multimodal_encoder/languagebind/video/modeling_video.py
new file mode 100755
index 0000000..cb5c621
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/video/modeling_video.py
@@ -0,0 +1,1033 @@
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+from einops import rearrange
+from peft import LoraConfig, get_peft_model
+from torch import nn
+from torch.nn import functional as F
+from transformers import PreTrainedModel, add_start_docstrings
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from transformers.models.clip.modeling_clip import CLIPMLP, CLIPAttention, CLIPTextEmbeddings, CLIPVisionEmbeddings, \
+ CLIPVisionModelWithProjection, CLIPTextModelWithProjection, _expand_mask, CLIPOutput, clip_loss
+from transformers.utils import add_start_docstrings_to_model_forward, replace_return_docstrings
+
+from .configuration_video import LanguageBindVideoConfig, CLIPVisionConfig, CLIPTextConfig
+
+
+
+class PatchDropout(nn.Module):
+ """
+ https://arxiv.org/abs/2212.00794
+ """
+
+ def __init__(self, prob, exclude_first_token=True):
+ super().__init__()
+ assert 0 <= prob < 1.
+ self.prob = prob
+ self.exclude_first_token = exclude_first_token # exclude CLS token
+
+ def forward(self, x, B, T):
+ if not self.training or self.prob == 0.:
+ return x
+
+ if self.exclude_first_token:
+ cls_tokens, x = x[:, :1], x[:, 1:]
+ else:
+ cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
+
+ batch = x.size()[0]
+ num_tokens = x.size()[1]
+
+ batch_indices = torch.arange(batch)
+ batch_indices = batch_indices[..., None]
+
+ keep_prob = 1 - self.prob
+ num_patches_keep = max(1, int(num_tokens * keep_prob))
+
+ if T == 1:
+ rand = torch.randn(batch, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ else:
+ rand = torch.randn(B, num_tokens)
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+ patch_indices_keep = patch_indices_keep.unsqueeze(1).repeat(1, T, 1)
+ patch_indices_keep = rearrange(patch_indices_keep, 'b t n -> (b t) n')
+
+
+ x = x[batch_indices, patch_indices_keep]
+
+ if self.exclude_first_token:
+ x = torch.cat((cls_tokens, x), dim=1)
+
+ return x
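+    # Illustrative behaviour (hypothetical sizes): with prob=0.5 and a sequence of
+    # 1 CLS token plus 256 patches, training keeps the CLS token plus
+    # max(1, int(256 * 0.5)) = 128 randomly chosen patches per sample; for video
+    # inputs (T > 1) the same patch indices are reused across the T frames of a
+    # clip, and at eval time (or when prob == 0) the input passes through unchanged.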
+
+class CLIPEncoderLayer(nn.Module):
+ def __init__(self, config: LanguageBindVideoConfig):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.self_attn = CLIPAttention(config)
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.mlp = CLIPMLP(config)
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ self.add_time_attn = config.add_time_attn
+ if self.add_time_attn:
+ self.t = config.num_frames
+ self.temporal_embedding = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
+ nn.init.normal_(self.temporal_embedding, std=config.hidden_size ** -0.5)
+
+ self.embed_dim = config.hidden_size
+ self.temporal_attn = CLIPAttention(config)
+ self.temporal_layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ # self.temporal_mlp = CLIPMLP(config)
+ # self.temporal_layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ causal_attention_mask: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.FloatTensor]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ `(config.encoder_attention_heads,)`.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+
+
+ if self.add_time_attn:
+ bt, n, d = hidden_states.shape
+ t = self.t
+
+ # time embed
+ if t != 1:
+ n = hidden_states.shape[1]
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ hidden_states = hidden_states + self.temporal_embedding[:, :t, :]
+ hidden_states = rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # time attn
+ residual = hidden_states
+ hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # hidden_states = self.layer_norm1(hidden_states) # share layernorm
+ hidden_states = self.temporal_layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.temporal_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # residual = hidden_states
+ # hidden_states = rearrange(hidden_states, '(b t) n d -> (b n) t d', t=t)
+ # # hidden_states = self.layer_norm2(hidden_states) # share layernorm
+ # hidden_states = self.temporal_layer_norm2(hidden_states)
+ # hidden_states = self.temporal_mlp(hidden_states)
+ # hidden_states = residual + rearrange(hidden_states, '(b n) t d -> (b t) n d', n=n)
+
+ # spatial attn
+ residual = hidden_states
+
+ hidden_states = self.layer_norm1(hidden_states)
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+ hidden_states = self.layer_norm2(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
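+    # Shape sketch for the temporal branch above (illustrative): tokens arrive as
+    # (b*t, n, d); the temporal attention rearranges them to (b*n, t, d), adds the
+    # learned temporal_embedding, and lets each spatial location attend across its
+    # t frames before rearranging back to (b*t, n, d) for the usual spatial
+    # self-attention and MLP over the n token axis.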
+
+
+
+
+
+
+
+
+
+class CLIPPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = LanguageBindVideoConfig
+ base_model_prefix = "clip"
+ supports_gradient_checkpointing = True
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ factor = self.config.initializer_factor
+ if isinstance(module, CLIPTextEmbeddings):
+ module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+ elif isinstance(module, CLIPVisionEmbeddings):
+ factor = self.config.initializer_factor
+ nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+ nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+ nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+ elif isinstance(module, CLIPAttention):
+ factor = self.config.initializer_factor
+ in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ out_proj_std = (module.embed_dim**-0.5) * factor
+ nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+ nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+ elif isinstance(module, CLIPMLP):
+ factor = self.config.initializer_factor
+ in_proj_std = (
+ (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+ )
+ fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+ nn.init.normal_(module.fc1.weight, std=fc_std)
+ nn.init.normal_(module.fc2.weight, std=in_proj_std)
+ elif isinstance(module, LanguageBindVideo):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPVisionModelWithProjection):
+ nn.init.normal_(
+ module.visual_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+ elif isinstance(module, CLIPTextModelWithProjection):
+ nn.init.normal_(
+ module.text_projection.weight,
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
+ )
+
+ if isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ def _set_gradient_checkpointing(self, module, value=False):
+ if isinstance(module, CLIPEncoder):
+ module.gradient_checkpointing = value
+
+
+CLIP_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CLIP_TEXT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+CLIP_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+ return_loss (`bool`, *optional*):
+ Whether or not to return the contrastive loss.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class CLIPEncoder(nn.Module):
+ """
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+ [`CLIPEncoderLayer`].
+
+ Args:
+ config: CLIPConfig
+ """
+
+ def __init__(self, config: LanguageBindVideoConfig):
+ super().__init__()
+ self.config = config
+ self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ inputs_embeds,
+ attention_mask: Optional[torch.Tensor] = None,
+ causal_attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutput]:
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ hidden_states = inputs_embeds
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(encoder_layer),
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+ """
+    Make the causal (uni-directional) attention mask used by CLIP's text encoder.
+ """
+ bsz, tgt_len = input_ids_shape
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+ mask_cond = torch.arange(mask.size(-1), device=device)
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+ mask = mask.to(dtype)
+
+ if past_key_values_length > 0:
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
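+# Example sketch (illustration only, not part of the upstream CLIP code):
+# >>> mask = _make_causal_mask(torch.Size([2, 3]), torch.float32, torch.device("cpu"))
+# >>> mask.shape
+# torch.Size([2, 1, 3, 3])
+# Entries at (i, j) with j <= i are 0 and entries with j > i are the most negative float32
+# value, so after softmax each position attends only to itself and earlier positions.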
+
+class CLIPTextTransformer(nn.Module):
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+ self.embeddings = CLIPTextEmbeddings(config)
+ self.encoder = CLIPEncoder(config)
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is None:
+ raise ValueError("You have to specify input_ids")
+
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+ # CLIP's text model uses causal mask, prepare it here.
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+ causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
+ # expand attention_mask
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+ # text_embeds.shape = [batch_size, sequence_length, transformer.width]
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
+ # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+ pooled_output = last_hidden_state[
+ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+ input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
+ ]
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
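+# Pooling note (sketch, token ids are illustrative): the EOT token has the largest id in CLIP's
+# vocabulary, so `input_ids.argmax(dim=-1)` above locates it per sequence, e.g. for
+# input_ids = [[49406, 320, 1125, 49407, 49407]] the pooled feature is taken at index 3, the
+# first <|endoftext|> position, even when padding reuses the same token.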
+
+@add_start_docstrings(
+ """The text model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPTextModel(CLIPPreTrainedModel):
+ config_class = CLIPTextConfig
+
+ _no_split_modules = ["CLIPEncoderLayer"]
+
+ def __init__(self, config: CLIPTextConfig):
+ super().__init__(config)
+ self.text_model = CLIPTextTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.text_model.embeddings.token_embedding
+
+ def set_input_embeddings(self, value):
+ self.text_model.embeddings.token_embedding = value
+
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPTextModel
+
+ >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+class CLIPVisionTransformer(nn.Module):
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+
+ self.embeddings = CLIPVisionEmbeddings(config)
+ self.patch_dropout = PatchDropout(config.force_patch_dropout)
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
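+        # NOTE: the 'pre_layrnorm' spelling above is kept as-is so that parameter names match the
+        # upstream CLIP implementation and its pretrained checkpoints.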
+ self.encoder = CLIPEncoder(config)
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # Flatten the temporal (and pairing) dimensions so every frame is encoded as an image.
+        if len(pixel_values.shape) == 7:
+            # (b, pair, T, bs, C, H, W) -> (b * pair * bs * T, C, H, W)
+            b_new, pair_new, T, bs_new, channel_new, h_new, w_new = pixel_values.shape
+            B = b_new * pair_new * bs_new
+            pixel_values = pixel_values.reshape(B * T, channel_new, h_new, w_new)
+        elif len(pixel_values.shape) == 5:
+            # (B, C, T, H, W) -> (B * T, C, H, W)
+            B, _, T, _, _ = pixel_values.shape
+            pixel_values = rearrange(pixel_values, 'b c t h w -> (b t) c h w')
+        else:
+            # Plain image batch: (B, C, H, W)
+            B, _, _, _ = pixel_values.shape
+            T = 1
+
+        hidden_states = self.embeddings(pixel_values)
+        hidden_states = self.patch_dropout(hidden_states, B, T)
+
+ hidden_states = self.pre_layrnorm(hidden_states)
+
+ encoder_outputs = self.encoder(
+ inputs_embeds=hidden_states,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+        last_hidden_state = encoder_outputs[0]
+        # Pool the CLS token of every frame, then average the pooled features over time.
+        pooled_output = last_hidden_state[:, 0, :]
+        pooled_output = self.post_layernorm(pooled_output)
+        pooled_output = pooled_output.reshape(B, T, -1).mean(1)
+
+        # Restore the (batch, time, tokens, channels) layout of the per-layer hidden states.
+        if return_dict and encoder_outputs.hidden_states is not None:
+            encoder_outputs.hidden_states = [rearrange(i, '(b t) n c -> b t n c', b=B) for i in encoder_outputs.hidden_states]
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
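+# Shape-flow sketch for the video path above (sizes are illustrative): pixel_values of shape
+# (B, C, T, H, W) = (2, 3, 8, 224, 224) are rearranged to (B*T, C, H, W) = (16, 3, 224, 224),
+# every frame is encoded independently, and the per-frame CLS features are reshaped back to
+# (B, T, hidden) and averaged over T, yielding one pooled embedding per video.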
+
+@add_start_docstrings(
+ """The vision model from CLIP without any head or projection on top.""",
+ CLIP_START_DOCSTRING,
+)
+class CLIPVisionModel(CLIPPreTrainedModel):
+ config_class = CLIPVisionConfig
+ main_input_name = "pixel_values"
+
+ def __init__(self, config: CLIPVisionConfig):
+ super().__init__(config)
+ self.vision_model = CLIPVisionTransformer(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.vision_model.embeddings.patch_embedding
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPVisionModel
+
+ >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled CLS states
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ return self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+
+@add_start_docstrings(CLIP_START_DOCSTRING)
+class LanguageBindVideo(CLIPPreTrainedModel):
+ config_class = LanguageBindVideoConfig
+
+ def __init__(self, config: LanguageBindVideoConfig):
+ super().__init__(config)
+
+ if not isinstance(config.text_config, CLIPTextConfig):
+ raise ValueError(
+ "config.text_config is expected to be of type CLIPTextConfig but is of type"
+ f" {type(config.text_config)}."
+ )
+
+ if not isinstance(config.vision_config, CLIPVisionConfig):
+ raise ValueError(
+ "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+ f" {type(config.vision_config)}."
+ )
+
+ text_config = config.text_config
+ vision_config = config.vision_config
+ self.add_time_attn = vision_config.add_time_attn
+ self.lora_r = vision_config.lora_r
+ self.lora_alpha = vision_config.lora_alpha
+ self.lora_dropout = vision_config.lora_dropout
+
+ self.projection_dim = config.projection_dim
+ self.text_embed_dim = text_config.hidden_size
+ self.vision_embed_dim = vision_config.hidden_size
+
+ self.text_model = CLIPTextTransformer(text_config)
+ self.vision_model = CLIPVisionTransformer(vision_config)
+
+ self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+ self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+ self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+ # Initialize weights and apply final processing
+ self.post_init()
+        # NOTE: LoRA conversion and positional-embedding resizing are implemented below but are
+        # not applied here; call self.convert_to_lora() and/or
+        # self.resize_pos(self.vision_model.embeddings, vision_config) explicitly if needed.
+
+ def convert_to_lora(self):
+ if self.lora_r == 0:
+ return
+ if self.add_time_attn:
+ target_modules = ["temporal_attn.k_proj", "temporal_attn.v_proj",
+ "temporal_attn.q_proj", "temporal_attn.out_proj",
+ "temporal_mlp.fc1", "temporal_mlp.fc2"]
+ else:
+ target_modules = ["k_proj", "v_proj", "q_proj", "out_proj"]
+ config = LoraConfig(
+ r=self.lora_r, # 16
+ lora_alpha=self.lora_alpha, # 16
+ target_modules=target_modules, # self_attn.out_proj
+ lora_dropout=self.lora_dropout, # 0.1
+ bias="none",
+ modules_to_save=[],
+ )
+ self.vision_model.encoder.is_gradient_checkpointing = False
+ self.vision_model.encoder = get_peft_model(self.vision_model.encoder, config)
+
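+    # Usage sketch (illustrative; not called by default, see the note at the end of __init__):
+    # >>> model = LanguageBindVideo(config)
+    # >>> model.convert_to_lora()   # no-op when vision_config.lora_r == 0
+    # After the call, get_peft_model has wrapped the vision encoder so that only the injected
+    # low-rank adapters on the listed projection/MLP modules receive gradients.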
+ def resize_pos(self, m, vision_config):
+ # convert embedding
+ if vision_config.num_mel_bins!=0 and vision_config.target_length!=0:
+ m.image_size = [vision_config.num_mel_bins, vision_config.target_length]
+ m.config.image_size = [m.image_size, m.image_size] if isinstance(m.image_size, int) else m.image_size
+ # pos resize
+ old_pos_embed_state_dict = m.position_embedding.state_dict()
+ old_pos_embed = old_pos_embed_state_dict['weight']
+ dtype = old_pos_embed.dtype
+ grid_size = [m.config.image_size[0] // m.patch_size, m.config.image_size[1] // m.patch_size]
+ extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
+ new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
+ if new_seq_len == old_pos_embed.shape[0]:
+ # m.to(args.device)
+ return
+
+ m.num_patches = grid_size[0] * grid_size[1]
+ m.num_positions = m.num_patches + 1
+ m.register_buffer("position_ids", torch.arange(m.num_positions).expand((1, -1)))
+ new_position_embedding = nn.Embedding(m.num_positions, m.embed_dim)
+
+ if extra_tokens:
+ pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
+ else:
+ pos_emb_tok, pos_emb_img = None, old_pos_embed
+ old_grid_size = [int(math.sqrt(len(pos_emb_img)))] * 2
+
+ # if is_master(args):
+ # logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
+ pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
+ pos_emb_img = F.interpolate(
+ pos_emb_img,
+ size=grid_size,
+ mode='bicubic',
+ antialias=True,
+ align_corners=False,
+ )
+ pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
+ if pos_emb_tok is not None:
+ new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
+ else:
+ new_pos_embed = pos_emb_img
+ old_pos_embed_state_dict['weight'] = new_pos_embed.to(dtype)
+ m.position_embedding = new_position_embedding
+ m.position_embedding.load_state_dict(old_pos_embed_state_dict)
+
+ # m.to(args.device)
+
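+    # Worked example for the resize above (values are illustrative, not from a specific
+    # checkpoint): with image_size = [128, 1024] and patch_size = 16 the new grid is
+    # [128 // 16, 1024 // 16] = [8, 64], so new_seq_len = 8 * 64 + 1 = 513; the old square
+    # position-embedding grid is interpolated bicubically to 8 x 64 while the class-token
+    # embedding is carried over unchanged.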
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+ def get_text_features(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+ text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoTokenizer, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+ >>> text_features = model.get_text_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = text_outputs[1]
+ text_features = self.text_projection(pooled_output)
+
+ return text_features
+
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
+ def get_image_features(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.FloatTensor:
+ r"""
+ Returns:
+ image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
+ applying the projection layer to the pooled output of [`CLIPVisionModel`].
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="pt")
+
+ >>> image_features = model.get_image_features(**inputs)
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = vision_outputs[1] # pooled_output
+ image_features = self.visual_projection(pooled_output)
+
+ return image_features
+
+ @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CLIPOutput, config_class=LanguageBindVideoConfig)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ return_loss: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, CLIPOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, CLIPModel
+
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+ ... )
+
+ >>> outputs = model(**inputs)
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+ >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+ ```"""
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ image_embeds = vision_outputs[1]
+ image_embeds = self.visual_projection(image_embeds)
+
+ text_embeds = text_outputs[1]
+ text_embeds = self.text_projection(text_embeds)
+
+ # normalized features
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+ # cosine similarity as logits
+ logit_scale = self.logit_scale.exp()
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+ logits_per_image = logits_per_text.t()
+
+ loss = None
+ if return_loss:
+ loss = clip_loss(logits_per_text)
+
+ if not return_dict:
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+ return ((loss,) + output) if loss is not None else output
+
+ return CLIPOutput(
+ loss=loss,
+ logits_per_image=logits_per_image,
+ logits_per_text=logits_per_text,
+ text_embeds=text_embeds,
+ image_embeds=image_embeds,
+ text_model_output=text_outputs,
+ vision_model_output=vision_outputs,
+ )
\ No newline at end of file
diff --git a/models/multimodal_encoder/languagebind/video/processing_video.py b/models/multimodal_encoder/languagebind/video/processing_video.py
new file mode 100755
index 0000000..92682ef
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/video/processing_video.py
@@ -0,0 +1,213 @@
+
+import torch
+import cv2
+import decord
+from decord import VideoReader, cpu
+decord.bridge.set_bridge('torch')
+import numpy as np
+from PIL import Image
+from torchvision import transforms
+from transformers import ProcessorMixin, BatchEncoding
+from transformers.image_processing_utils import BatchFeature
+from pytorchvideo.data.encoded_video import EncodedVideo
+from torchvision.transforms import Compose, Lambda, ToTensor
+from torchvision.transforms._transforms_video import NormalizeVideo, RandomCropVideo, RandomHorizontalFlipVideo, CenterCropVideo
+from pytorchvideo.transforms import ApplyTransformToKey, ShortSideScale, UniformTemporalSubsample
+
+
+OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+def make_list_of_images(x):
+ if not isinstance(x, list):
+ return [x]
+ return x
+
+def get_video_transform(config):
+ config = config.vision_config
+ if config.video_decode_backend == 'pytorchvideo':
+ transform = ApplyTransformToKey(
+ key="video",
+ transform=Compose(
+ [
+ UniformTemporalSubsample(config.num_frames),
+ Lambda(lambda x: x / 255.0),
+ NormalizeVideo(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
+ ShortSideScale(size=224),
+ CenterCropVideo(224),
+ RandomHorizontalFlipVideo(p=0.5),
+ ]
+ ),
+ )
+
+ elif config.video_decode_backend == 'decord':
+
+ transform = Compose(
+ [
+ # UniformTemporalSubsample(num_frames),
+ Lambda(lambda x: x / 255.0),
+ NormalizeVideo(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
+ ShortSideScale(size=224),
+ CenterCropVideo(224),
+ RandomHorizontalFlipVideo(p=0.5),
+ ]
+ )
+
+ elif config.video_decode_backend == 'opencv':
+ transform = Compose(
+ [
+ # UniformTemporalSubsample(num_frames),
+ Lambda(lambda x: x / 255.0),
+ NormalizeVideo(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
+ ShortSideScale(size=224),
+ CenterCropVideo(224),
+ RandomHorizontalFlipVideo(p=0.5),
+ ]
+ )
+ else:
+        raise NameError("video_decode_backend should be one of ('pytorchvideo', 'decord', 'opencv')")
+ return transform
+
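+# Usage sketch (assumes a config whose vision_config.video_decode_backend == 'decord'):
+# >>> transform = get_video_transform(config)
+# >>> frames = torch.randint(0, 256, (3, 8, 360, 640)).float()   # (C, T, H, W), uint8 range
+# >>> clip = transform(frames)                                    # -> (3, 8, 224, 224), normalized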
+
+def load_and_transform_video(
+ video_path,
+ transform,
+ video_decode_backend='opencv',
+ clip_start_sec=0.0,
+ clip_end_sec=None,
+ num_frames=8,
+):
+ if video_decode_backend == 'pytorchvideo':
+ # decord pyav
+ video = EncodedVideo.from_path(video_path, decoder="decord", decode_audio=False)
+ duration = video.duration
+ start_sec = clip_start_sec # secs
+ end_sec = clip_end_sec if clip_end_sec is not None else duration # secs
+ video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
+ video_outputs = transform(video_data)
+
+ elif video_decode_backend == 'decord':
+        decord.bridge.set_bridge('torch')
+        decord_vr = VideoReader(video_path, ctx=cpu(0))
+        start_idx = 0
+        end_idx = len(decord_vr) - 1
+
+        # Optionally restrict sampling to the [clip_start_sec, clip_end_sec] window.
+        if clip_end_sec is not None:
+            fps = float(decord_vr.get_avg_fps())
+            start_idx = max(start_idx, round(clip_start_sec * fps))
+            end_idx = min(round(clip_end_sec * fps), end_idx)
+
+        # Sample `num_frames` evenly spaced frame indices and decode them in a single batch.
+        frame_id_list = np.linspace(start_idx, end_idx, num_frames, dtype=int)
+        video_data = decord_vr.get_batch(frame_id_list)
+        video_data = video_data.permute(3, 0, 1, 2)  # (T, H, W, C) -> (C, T, H, W)
+        video_outputs = transform(video_data)
+
+ elif video_decode_backend == 'opencv':
+ cv2_vr = cv2.VideoCapture(video_path)
+ duration = int(cv2_vr.get(cv2.CAP_PROP_FRAME_COUNT))
+ frame_id_list = np.linspace(0, duration-5, num_frames, dtype=int)
+
+ video_data = []
+ for frame_idx in frame_id_list:
+            cv2_vr.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)  # seek to the sampled frame
+ ret, frame = cv2_vr.read()
+ if not ret:
+ raise ValueError(f'video error at {video_path}')
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+ video_data.append(torch.from_numpy(frame).permute(2, 0, 1))
+ cv2_vr.release()
+ video_data = torch.stack(video_data, dim=1)
+ video_outputs = transform(video_data)
+ else:
+        raise NameError("video_decode_backend should be one of ('pytorchvideo', 'decord', 'opencv')")
+ return video_outputs
+
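+# Usage sketch (the path and frame count below are placeholders):
+# >>> transform = get_video_transform(config)
+# >>> clip = load_and_transform_video("example.mp4", transform,
+# ...                                 video_decode_backend='decord', num_frames=8)
+# With the decord backend this samples 8 evenly spaced frames, permutes them to (C, T, H, W)
+# and applies the normalization/crop pipeline defined above.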
+class LanguageBindVideoProcessor(ProcessorMixin):
+ attributes = []
+ tokenizer_class = ("LanguageBindVideoTokenizer")
+
+ def __init__(self, config, tokenizer=None, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.transform = get_video_transform(config)
+ self.image_processor = load_and_transform_video
+ self.tokenizer = tokenizer
+
+
+ def __call__(self, images=None, text=None, context_length=77, return_tensors=None, bound=None, **kwargs):
+ if bound is not None:
+ start = bound[0]
+ end = bound[1]
+ else:
+ start = 0.0
+ end = None
+        if text is None and images is None:
+            raise ValueError("You have to specify either text or images. Both cannot be none.")
+
+        if text is not None:
+            encoding = self.tokenizer(text, max_length=context_length, padding='max_length',
+                                      truncation=True, return_tensors=return_tensors, **kwargs)
+
+        if images is not None:
+            images = make_list_of_images(images)
+            image_features = []
+            for image in images:
+                image_features.append(
+                    self.image_processor(image, self.transform,
+                                         video_decode_backend=self.config.vision_config.video_decode_backend,
+                                         clip_start_sec=start, clip_end_sec=end,
+                                         num_frames=self.config.vision_config.num_frames))
+            image_features = torch.stack(image_features)
+
+ if text is not None and images is not None:
+ encoding["pixel_values"] = image_features
+ return encoding
+ elif text is not None:
+ return encoding
+ else:
+ return {"pixel_values": image_features}
+
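+    # Usage sketch (identifiers are illustrative): with a loaded config and tokenizer,
+    # >>> processor = LanguageBindVideoProcessor(config, tokenizer)
+    # >>> batch = processor(images=["clip.mp4"], text=["a person waves"], return_tensors="pt")
+    # >>> batch["pixel_values"].shape   # (1, 3, num_frames, 224, 224)
+    # >>> batch["input_ids"].shape      # (1, 77)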
+ def preprocess(self, images, return_tensors):
+ return self.__call__(images=images, return_tensors=return_tensors)
+
+ def batch_decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+ refer to the docstring of this method for more information.
+ """
+ return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
+
+ def decode(self, skip_special_tokens=True, *args, **kwargs):
+ """
+ This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+ the docstring of this method for more information.
+ """
+ return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
diff --git a/models/multimodal_encoder/languagebind/video/tokenization_video.py b/models/multimodal_encoder/languagebind/video/tokenization_video.py
new file mode 100755
index 0000000..2864429
--- /dev/null
+++ b/models/multimodal_encoder/languagebind/video/tokenization_video.py
@@ -0,0 +1,77 @@
+from transformers import CLIPTokenizer
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+ "vocab_file": "vocab.json",
+ "merges_file": "merges.txt",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+ "vocab_file": {
+ "lb203/LanguageBind-Video": "https://huggingface.co/lb203/LanguageBind-Video/resolve/main/vocab.json",
+ },
+ "merges_file": {
+ "lb203/LanguageBind-Video": "https://huggingface.co/lb203/LanguageBind-Video/resolve/main/merges.txt",
+ },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+ "lb203/LanguageBind-Video": 77,
+}
+
+
+PRETRAINED_INIT_CONFIGURATION = {
+ "lb203/LanguageBind-Video": {},
+}
+
+class LanguageBindVideoTokenizer(CLIPTokenizer):
+ """
+ Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+ unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ bos_token (`str`, *optional*, defaults to `<|startoftext|>`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+ The end of sequence token.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ merges_file,
+ errors="replace",
+ unk_token="<|endoftext|>",
+ bos_token="<|startoftext|>",
+ eos_token="<|endoftext|>",
+ pad_token="<|endoftext|>", # hack to enable padding
+ **kwargs,
+ ):
+        super(LanguageBindVideoTokenizer, self).__init__(
+            vocab_file,
+            merges_file,
+            errors=errors,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,  # hack to enable padding
+            **kwargs,
+        )
\ No newline at end of file
diff --git a/models/multimodal_encoder/mae_encoder.py b/models/multimodal_encoder/mae_encoder.py
new file mode 100755
index 0000000..377883d
--- /dev/null
+++ b/models/multimodal_encoder/mae_encoder.py
@@ -0,0 +1,80 @@
+import torch
+import torch.nn as nn
+
+from transformers import ViTMAEForPreTraining, AutoConfig, AutoImageProcessor
+
+
+class MAEVisionTower(nn.Module):
+ def __init__(self, vision_tower, args, cache_dir='./cache_dir', delay_load=False):
+ super().__init__()
+
+ self.is_loaded = False
+ self.cache_dir = cache_dir
+ self.vision_tower_name = vision_tower
+ self.select_layer = args.mm_vision_select_layer
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+ if not delay_load:
+ self.load_model()
+ else:
+ self.cfg_only = AutoConfig.from_pretrained(self.vision_tower_name, cache_dir=self.cache_dir)
+
+ def load_model(self):
+ self.image_processor = AutoImageProcessor.from_pretrained(self.vision_tower_name, cache_dir=self.cache_dir)
+ vision_tower = ViTMAEForPreTraining.from_pretrained(self.vision_tower_name, cache_dir=self.cache_dir)
+ self.vision_tower = vision_tower.vit
+ self.vision_tower.requires_grad_(False)
+
+ self.is_loaded = True
+
+ def feature_select(self, image_forward_outs):
+ image_features = image_forward_outs.hidden_states[self.select_layer]
+ if self.select_feature == 'patch':
+ image_features = image_features[:, 1:]
+ elif self.select_feature == 'cls_patch':
+ image_features = image_features
+ else:
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
+ # print(image_features.shape)
+ return image_features
+
+ @torch.no_grad()
+ def forward(self, images):
+ if type(images) is list:
+ image_features = []
+ for image in images:
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
+ image_features.append(image_feature)
+ else:
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
+
+ return image_features
+
+ @property
+ def dummy_feature(self):
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+ @property
+ def dtype(self):
+ return self.vision_tower.dtype
+
+ @property
+ def device(self):
+ return self.vision_tower.device
+
+ @property
+ def config(self):
+ if self.is_loaded:
+ return self.vision_tower.config
+ else:
+ return self.cfg_only
+
+ @property
+ def hidden_size(self):
+ return self.config.hidden_size
+
+ @property
+ def num_patches(self):
+ return (self.config.image_size // self.config.patch_size) ** 2
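+    # Example (illustrative): with the common ViT-MAE base setup of image_size = 224 and
+    # patch_size = 16, num_patches evaluates to (224 // 16) ** 2 = 196.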
diff --git a/models/multimodal_projector/builder.py b/models/multimodal_projector/builder.py
new file mode 100755
index 0000000..8cc8e9d
--- /dev/null
+++ b/models/multimodal_projector/builder.py
@@ -0,0 +1,257 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import re
+
+from transformers import PretrainedConfig, Blip2PreTrainedModel, Blip2Config, Blip2QFormerModel
+
+
+class IdentityMap(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, x, *args, **kwargs):
+ return x
+
+ @property
+ def config(self):
+ return {"mm_projector_type": 'identity'}
+
+
+class SimpleResBlock(nn.Module):
+ def __init__(self, channels):
+ super().__init__()
+ self.pre_norm = nn.LayerNorm(channels)
+
+ self.proj = nn.Sequential(
+ nn.Linear(channels, channels),
+ nn.GELU(),
+ nn.Linear(channels, channels)
+ )
+ def forward(self, x):
+ x = self.pre_norm(x)
+ return x + self.proj(x)
+
+
+class Blip2Model(Blip2PreTrainedModel):
+ def __init__(self, config: Blip2Config):
+ super().__init__(config)
+
+ self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
+ self.qformer = Blip2QFormerModel(config.qformer_config)
+
+ # self.proj = nn.Linear(config.mm_hidden_size, config.hidden_size)
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size), nn.GELU(), nn.Linear(config.hidden_size, config.hidden_size)]
+ self.proj = nn.Sequential(*modules)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ):
+ r"""
+ Returns:
+ vision_outputs (`BaseModelOutputWithPooling` or tuple of `torch.FloatTensor`):
+ The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that
+ contains the image features, the pooled image features and the hidden states if
+ `output_hidden_states=True`.
+ Examples:
+ ```python
+ >>> import torch
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import Blip2Processor, Blip2Model
+
+ >>> device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+ >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
+ >>> model.to(device) # doctest: +IGNORE_RESULT
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
+ >>> qformer_outputs = model.get_qformer_features(**inputs)
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Unlike the stock BLIP-2 model, this projector receives pre-computed vision features as
+        # `pixel_values`, so they are used directly as the cross-attention inputs.
+        image_embeds = pixel_values
+
+        # Forward the query tokens through the Q-Former, using the image embeddings for cross-attention.
+ image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+ query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+ query_outputs = self.qformer(
+ query_embeds=query_tokens,
+ encoder_hidden_states=image_embeds,
+ encoder_attention_mask=image_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ ).last_hidden_state
+        # Project the Q-Former queries into the language-model embedding space.
+        query_outputs = self.proj(query_outputs)
+ return query_outputs
+
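+# Shape-flow sketch for Blip2Model.forward (sizes are illustrative): vision features of shape
+# (batch, num_patches, mm_hidden_size) enter as `pixel_values`, the num_query_tokens learnable
+# queries cross-attend to them inside the Q-Former, and the resulting
+# (batch, num_query_tokens, mm_hidden_size) output is projected by the MLP head to
+# (batch, num_query_tokens, hidden_size) tokens for the language model.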
+
+def qformer_config_template(config, projector_type):
+ pattern = r"qformer(\d+)_(\d+)"
+
+    match = re.search(pattern, projector_type)
+    if match is None:
+        raise ValueError(f"Cannot parse projector_type '{projector_type}'; expected a name like 'qformer2_64'")
+    num_hidden_layers = int(match.group(1))
+    num_query_tokens = int(match.group(2))
+
+ qformer_config = type('Blip2Config', (PretrainedConfig,), {
+ "initializer_factor": 1.0,
+ "initializer_range": 0.02,
+ "model_type": "blip-2",
+ "num_query_tokens": num_query_tokens,
+ "hidden_size": config.hidden_size,
+ "mm_hidden_size": config.mm_hidden_size,
+ "qformer_config": type('qformer_config', (PretrainedConfig,), {
+ "_name_or_path": "",
+ "add_cross_attention": False,
+ "architectures": None,
+ "attention_probs_dropout_prob": 0.0,
+ "bad_words_ids": None,
+ "begin_suppress_tokens": None,
+ "bos_token_id": None,
+ "chunk_size_feed_forward": 0,
+ "classifier_dropout": None,
+ "cross_attention_frequency": 1,
+ "cross_attention_hidden_size": None,
+ "decoder_start_token_id": None,
+ "diversity_penalty": 0.0,
+ "do_sample": False,
+ "early_stopping": False,
+ "encoder_hidden_size": config.mm_hidden_size,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": None,
+ "exponential_decay_length_penalty": None,
+ "finetuning_task": None,
+ "forced_bos_token_id": None,
+ "forced_eos_token_id": None,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.0,
+ "hidden_size": config.mm_hidden_size,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "initializer_range": 0.02,
+ "intermediate_size": config.mm_hidden_size * 4,
+ "is_decoder": False,
+ "is_encoder_decoder": False,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "layer_norm_eps": 1e-12,
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "max_position_embeddings": 512,
+ "min_length": 0,
+ "model_type": "blip_2_qformer",
+ "no_repeat_ngram_size": 0,
+ "num_attention_heads": 32,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_hidden_layers": num_hidden_layers,
+ "num_return_sequences": 1,
+ "output_attentions": False,
+ "output_hidden_states": False,
+ "output_scores": False,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "prefix": None,
+ "problem_type": None,
+ "pruned_heads": {},
+ "remove_invalid_values": False,
+ "repetition_penalty": 1.0,
+ "return_dict": True,
+ "return_dict_in_generate": False,
+ "sep_token_id": None,
+ "suppress_tokens": None,
+ "task_specific_params": None,
+ "temperature": 1.0,
+ "tf_legacy_loss": False,
+ "tie_encoder_decoder": False,
+ "tie_word_embeddings": True,
+ "tokenizer_class": None,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": None,
+ "torchscript": False,
+ "transformers_version": "4.27.0.dev0",
+ "typical_p": 1.0,
+ "use_bfloat16": False,
+ "vocab_size": 30522
+ })()
+ })()
+ return qformer_config
+
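+# Example (illustrative): a projector_type of 'qformer2_64' is parsed by the regex above into
+# num_hidden_layers = 2 and num_query_tokens = 64, so build_vision_projector below returns a
+# Blip2Model whose Q-Former has 2 layers and 64 learnable query tokens.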
+def build_vision_projector(config, delay_load=False, **kwargs):
+ projector_type = getattr(config, 'mm_projector_type', 'linear')
+
+ if projector_type == 'linear':
+ return nn.Linear(config.mm_hidden_size, config.hidden_size)
+
+ elif projector_type == 'identity':
+ return IdentityMap()
+
+ elif projector_type.startswith('qformer'): # qformer2_64
+ qformer_config = qformer_config_template(config, projector_type)
+ return Blip2Model(qformer_config)
+ else:
+ mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
+ if mlp_gelu_match:
+ mlp_depth = int(mlp_gelu_match.group(1))
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
+ for _ in range(1, mlp_depth):
+ modules.append(nn.GELU())
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
+ return nn.Sequential(*modules)
+
+ raise ValueError(f'Unknown projector type: {projector_type}')
\ No newline at end of file
diff --git a/models/quantize_cnn.py b/models/quantize_cnn.py
new file mode 100755
index 0000000..8cd3ecd
--- /dev/null
+++ b/models/quantize_cnn.py
@@ -0,0 +1,413 @@
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class QuantizeEMAReset(nn.Module):
+ def __init__(self, nb_code, code_dim, args):
+ super().__init__()
+ self.nb_code = nb_code
+ self.code_dim = code_dim
+ self.mu = args.mu
+ self.reset_codebook()
+
+ def reset_codebook(self):
+ self.init = False
+ self.code_sum = None
+ self.code_count = None
+ self.register_buffer('codebook', torch.zeros(self.nb_code, self.code_dim).cuda())
+
+ def _tile(self, x):
+ nb_code_x, code_dim = x.shape
+ if nb_code_x < self.nb_code:
+ n_repeats = (self.nb_code + nb_code_x - 1) // nb_code_x
+ std = 0.01 / np.sqrt(code_dim)
+ out = x.repeat(n_repeats, 1)
+ out = out + torch.randn_like(out) * std
+ else :
+ out = x
+ return out
+
+ def init_codebook(self, x):
+ out = self._tile(x)
+ self.codebook = out[:self.nb_code]
+ self.code_sum = self.codebook.clone()
+ self.code_count = torch.ones(self.nb_code, device=self.codebook.device)
+ self.init = True
+
+ @torch.no_grad()
+ def compute_perplexity(self, code_idx) :
+ # Calculate new centres
+ code_onehot = torch.zeros(self.nb_code, code_idx.shape[0], device=code_idx.device) # nb_code, N * L
+ code_onehot.scatter_(0, code_idx.view(1, code_idx.shape[0]), 1)
+
+ code_count = code_onehot.sum(dim=-1) # nb_code
+ prob = code_count / torch.sum(code_count)
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
+ return perplexity
+
+ @torch.no_grad()
+ def update_codebook(self, x, code_idx):
+
+ code_onehot = torch.zeros(self.nb_code, x.shape[0], device=x.device) # nb_code, N * L
+ code_onehot.scatter_(0, code_idx.view(1, x.shape[0]), 1)
+
+ code_sum = torch.matmul(code_onehot, x) # nb_code, w
+ code_count = code_onehot.sum(dim=-1) # nb_code
+
+ out = self._tile(x)
+ code_rand = out[:self.nb_code]
+
+ # Update centres
+ self.code_sum = self.mu * self.code_sum + (1. - self.mu) * code_sum # w, nb_code
+ self.code_count = self.mu * self.code_count + (1. - self.mu) * code_count # nb_code
+
+ usage = (self.code_count.view(self.nb_code, 1) >= 1.0).float()
+ code_update = self.code_sum.view(self.nb_code, self.code_dim) / self.code_count.view(self.nb_code, 1)
+
+ self.codebook = usage * code_update + (1 - usage) * code_rand
+ prob = code_count / torch.sum(code_count)
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
+
+
+ return perplexity
+
+ def preprocess(self, x):
+ # NCT -> NTC -> [NT, C]
+ x = x.permute(0, 2, 1).contiguous()
+ x = x.view(-1, x.shape[-1])
+ return x
+
+ def quantize(self, x):
+ # Calculate latent code x_l
+ k_w = self.codebook.t()
+ distance = torch.sum(x ** 2, dim=-1, keepdim=True) - 2 * torch.matmul(x, k_w) + torch.sum(k_w ** 2, dim=0,
+ keepdim=True) # (N * L, b)
+ _, code_idx = torch.min(distance, dim=-1)
+ return code_idx
+
+ def dequantize(self, code_idx):
+ x = F.embedding(code_idx, self.codebook)
+ return x
+
+
+ def forward(self, x):
+ N, width, T = x.shape
+
+ # Preprocess
+ x = self.preprocess(x)
+
+ # Init codebook if not inited
+ if self.training and not self.init:
+ self.init_codebook(x)
+
+ # quantize and dequantize through bottleneck
+ code_idx = self.quantize(x)
+ x_d = self.dequantize(code_idx)
+
+ # Update embeddings
+ if self.training:
+ perplexity = self.update_codebook(x, code_idx)
+ else :
+ perplexity = self.compute_perplexity(code_idx)
+
+ # Loss
+ commit_loss = F.mse_loss(x, x_d.detach())
+
+ # Passthrough
+ x_d = x + (x_d - x).detach()
+
+ # Postprocess
+ x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous() #(N, DIM, T)
+
+ return x_d, commit_loss, perplexity
+
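+# Usage sketch (assumes `args` exposes an EMA decay `mu`, e.g. 0.99, and a CUDA device, since
+# the codebook buffer above is created with .cuda()):
+# >>> quantizer = QuantizeEMAReset(nb_code=512, code_dim=512, args=args)
+# >>> x_d, commit_loss, perplexity = quantizer(torch.randn(4, 512, 64).cuda())   # (N, C, T)
+# x_d keeps the input's (N, C, T) shape; gradients reach the encoder through the
+# straight-through estimator `x + (x_d - x).detach()`, while the codebook itself is updated by
+# the EMA statistics rather than by backpropagation.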
+
+
+class Quantizer(nn.Module):
+ def __init__(self, n_e, e_dim, beta):
+ super(Quantizer, self).__init__()
+
+ self.e_dim = e_dim
+ self.n_e = n_e
+ self.beta = beta
+
+ self.embedding = nn.Embedding(self.n_e, self.e_dim)
+ self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
+
+ def forward(self, z):
+
+ N, width, T = z.shape
+ z = self.preprocess(z)
+ assert z.shape[-1] == self.e_dim
+ z_flattened = z.contiguous().view(-1, self.e_dim)
+
+ # B x V
+ d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \
+ torch.sum(self.embedding.weight**2, dim=1) - 2 * \
+ torch.matmul(z_flattened, self.embedding.weight.t())
+ # B x 1
+ min_encoding_indices = torch.argmin(d, dim=1)
+ z_q = self.embedding(min_encoding_indices).view(z.shape)
+
+ # compute loss for embedding
+ loss = torch.mean((z_q - z.detach())**2) + self.beta * \
+ torch.mean((z_q.detach() - z)**2)
+
+ # preserve gradients
+ z_q = z + (z_q - z).detach()
+ z_q = z_q.view(N, T, -1).permute(0, 2, 1).contiguous() #(N, DIM, T)
+
+ min_encodings = F.one_hot(min_encoding_indices, self.n_e).type(z.dtype)
+ e_mean = torch.mean(min_encodings, dim=0)
+ perplexity = torch.exp(-torch.sum(e_mean*torch.log(e_mean + 1e-10)))
+ return z_q, loss, perplexity
+
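+    # Loss note (sketch): the first term above is the codebook loss ||z_q - sg(z)||^2 and the
+    # second, weighted by beta, is the commitment loss ||sg(z_q) - z||^2, matching the standard
+    # VQ-VAE objective; beta is typically a small constant such as 0.25.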
+ def quantize(self, z):
+
+ assert z.shape[-1] == self.e_dim
+
+ # B x V
+ d = torch.sum(z ** 2, dim=1, keepdim=True) + \
+ torch.sum(self.embedding.weight ** 2, dim=1) - 2 * \
+ torch.matmul(z, self.embedding.weight.t())
+ # B x 1
+ min_encoding_indices = torch.argmin(d, dim=1)
+ return min_encoding_indices
+
+ def dequantize(self, indices):
+
+ index_flattened = indices.view(-1)
+ z_q = self.embedding(index_flattened)
+ z_q = z_q.view(indices.shape + (self.e_dim, )).contiguous()
+ return z_q
+
+ def preprocess(self, x):
+ # NCT -> NTC -> [NT, C]
+ x = x.permute(0, 2, 1).contiguous()
+ x = x.view(-1, x.shape[-1])
+ return x
+
+
+
+class QuantizeReset(nn.Module):
+ def __init__(self, nb_code, code_dim, args):
+ super().__init__()
+ self.nb_code = nb_code
+ self.code_dim = code_dim
+ self.reset_codebook()
+ self.codebook = nn.Parameter(torch.randn(nb_code, code_dim))
+
+ def reset_codebook(self):
+ self.init = False
+ self.code_count = None
+
+ def _tile(self, x):
+ nb_code_x, code_dim = x.shape
+ if nb_code_x < self.nb_code:
+ n_repeats = (self.nb_code + nb_code_x - 1) // nb_code_x
+ std = 0.01 / np.sqrt(code_dim)
+ out = x.repeat(n_repeats, 1)
+ out = out + torch.randn_like(out) * std
+ else :
+ out = x
+ return out
+
+ def init_codebook(self, x):
+ out = self._tile(x)
+ self.codebook = nn.Parameter(out[:self.nb_code])
+ self.code_count = torch.ones(self.nb_code, device=self.codebook.device)
+ self.init = True
+
+ @torch.no_grad()
+ def compute_perplexity(self, code_idx) :
+ # Calculate new centres
+ code_onehot = torch.zeros(self.nb_code, code_idx.shape[0], device=code_idx.device) # nb_code, N * L
+ code_onehot.scatter_(0, code_idx.view(1, code_idx.shape[0]), 1)
+
+ code_count = code_onehot.sum(dim=-1) # nb_code
+ prob = code_count / torch.sum(code_count)
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
+ return perplexity
+
+ def update_codebook(self, x, code_idx):
+
+ code_onehot = torch.zeros(self.nb_code, x.shape[0], device=x.device) # nb_code, N * L
+ code_onehot.scatter_(0, code_idx.view(1, x.shape[0]), 1)
+
+ code_count = code_onehot.sum(dim=-1) # nb_code
+
+ out = self._tile(x)
+ code_rand = out[:self.nb_code]
+
+ # Update centres
+ self.code_count = code_count # nb_code
+ usage = (self.code_count.view(self.nb_code, 1) >= 1.0).float()
+
+ self.codebook.data = usage * self.codebook.data + (1 - usage) * code_rand
+ prob = code_count / torch.sum(code_count)
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
+
+
+ return perplexity
+
+ def preprocess(self, x):
+ # NCT -> NTC -> [NT, C]
+ x = x.permute(0, 2, 1).contiguous()
+ x = x.view(-1, x.shape[-1])
+ return x
+
+ def quantize(self, x):
+ # Calculate latent code x_l
+ k_w = self.codebook.t()
+ distance = torch.sum(x ** 2, dim=-1, keepdim=True) - 2 * torch.matmul(x, k_w) + torch.sum(k_w ** 2, dim=0,
+ keepdim=True) # (N * L, b)
+ _, code_idx = torch.min(distance, dim=-1)
+ return code_idx
+
+ def dequantize(self, code_idx):
+ x = F.embedding(code_idx, self.codebook)
+ return x
+
+
+ def forward(self, x):
+ N, width, T = x.shape
+ # Preprocess
+ x = self.preprocess(x)
+ # Init codebook if not inited
+ if self.training and not self.init:
+ self.init_codebook(x)
+ # quantize and dequantize through bottleneck
+ code_idx = self.quantize(x)
+ x_d = self.dequantize(code_idx)
+ # Update embeddings
+ if self.training:
+ perplexity = self.update_codebook(x, code_idx)
+        else:
+ perplexity = self.compute_perplexity(code_idx)
+
+ # Loss
+ commit_loss = F.mse_loss(x, x_d.detach())
+
+ # Passthrough
+ x_d = x + (x_d - x).detach()
+
+ # Postprocess
+ x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous() #(N, DIM, T)
+
+ return x_d, commit_loss, perplexity
+
+
+class QuantizeEMA(nn.Module):
+ def __init__(self, nb_code, code_dim, args):
+ super().__init__()
+ self.nb_code = nb_code
+ self.code_dim = code_dim
+ self.mu = 0.99
+ self.reset_codebook()
+
+ def reset_codebook(self):
+ self.init = False
+ self.code_sum = None
+ self.code_count = None
+ self.register_buffer('codebook', torch.zeros(self.nb_code, self.code_dim).cuda())
+
+ def _tile(self, x):
+ nb_code_x, code_dim = x.shape
+ if nb_code_x < self.nb_code:
+ n_repeats = (self.nb_code + nb_code_x - 1) // nb_code_x
+ std = 0.01 / np.sqrt(code_dim)
+ out = x.repeat(n_repeats, 1)
+ out = out + torch.randn_like(out) * std
+        else:
+ out = x
+ return out
+
+ def init_codebook(self, x):
+ out = self._tile(x)
+ self.codebook = out[:self.nb_code]
+ self.code_sum = self.codebook.clone()
+ self.code_count = torch.ones(self.nb_code, device=self.codebook.device)
+ self.init = True
+
+ @torch.no_grad()
+    def compute_perplexity(self, code_idx):
+ # Calculate new centres
+ code_onehot = torch.zeros(self.nb_code, code_idx.shape[0], device=code_idx.device) # nb_code, N * L
+ code_onehot.scatter_(0, code_idx.view(1, code_idx.shape[0]), 1)
+
+ code_count = code_onehot.sum(dim=-1) # nb_code
+ prob = code_count / torch.sum(code_count)
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
+ return perplexity
+
+ @torch.no_grad()
+ def update_codebook(self, x, code_idx):
+
+ code_onehot = torch.zeros(self.nb_code, x.shape[0], device=x.device) # nb_code, N * L
+ code_onehot.scatter_(0, code_idx.view(1, x.shape[0]), 1)
+
+ code_sum = torch.matmul(code_onehot, x) # nb_code, w
+ code_count = code_onehot.sum(dim=-1) # nb_code
+
+ # Update centres
+ self.code_sum = self.mu * self.code_sum + (1. - self.mu) * code_sum # w, nb_code
+ self.code_count = self.mu * self.code_count + (1. - self.mu) * code_count # nb_code
+
+ code_update = self.code_sum.view(self.nb_code, self.code_dim) / self.code_count.view(self.nb_code, 1)
+
+ self.codebook = code_update
+ prob = code_count / torch.sum(code_count)
+ perplexity = torch.exp(-torch.sum(prob * torch.log(prob + 1e-7)))
+
+ return perplexity
+
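+    # Editor's note on update_codebook() above: the codebook is maintained with the
+    # standard exponential-moving-average rule (mu = 0.99),
+    #     code_sum_k   <- mu * code_sum_k   + (1 - mu) * sum of encoder outputs assigned to k
+    #     code_count_k <- mu * code_count_k + (1 - mu) * number of assignments to k
+    #     codebook_k    = code_sum_k / code_count_k
+    # so codes track running cluster means under @torch.no_grad() rather than being
+    # trained by backpropagation.
+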
+ def preprocess(self, x):
+ # NCT -> NTC -> [NT, C]
+ x = x.permute(0, 2, 1).contiguous()
+ x = x.view(-1, x.shape[-1])
+ return x
+
+ def quantize(self, x):
+ # Calculate latent code x_l
+ k_w = self.codebook.t()
+ distance = torch.sum(x ** 2, dim=-1, keepdim=True) - 2 * torch.matmul(x, k_w) + torch.sum(k_w ** 2, dim=0,
+ keepdim=True) # (N * L, b)
+ _, code_idx = torch.min(distance, dim=-1)
+ return code_idx
+
+ def dequantize(self, code_idx):
+ x = F.embedding(code_idx, self.codebook)
+ return x
+
+
+ def forward(self, x):
+ N, width, T = x.shape
+
+ # Preprocess
+ x = self.preprocess(x)
+
+ # Init codebook if not inited
+ if self.training and not self.init:
+ self.init_codebook(x)
+
+ # quantize and dequantize through bottleneck
+ code_idx = self.quantize(x)
+ x_d = self.dequantize(code_idx)
+
+ # Update embeddings
+ if self.training:
+ perplexity = self.update_codebook(x, code_idx)
+        else:
+ perplexity = self.compute_perplexity(code_idx)
+
+ # Loss
+ commit_loss = F.mse_loss(x, x_d.detach())
+
+ # Passthrough
+ x_d = x + (x_d - x).detach()
+
+ # Postprocess
+ x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous() #(N, DIM, T)
+
+ return x_d, commit_loss, perplexity
diff --git a/models/resnet.py b/models/resnet.py
new file mode 100755
index 0000000..062346e
--- /dev/null
+++ b/models/resnet.py
@@ -0,0 +1,82 @@
+import torch.nn as nn
+import torch
+
+class nonlinearity(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, x):
+ # swish
+ return x * torch.sigmoid(x)
+
+class ResConv1DBlock(nn.Module):
+ def __init__(self, n_in, n_state, dilation=1, activation='silu', norm=None, dropout=None):
+ super().__init__()
+ padding = dilation
+ self.norm = norm
+ if norm == "LN":
+ self.norm1 = nn.LayerNorm(n_in)
+ self.norm2 = nn.LayerNorm(n_in)
+ elif norm == "GN":
+ self.norm1 = nn.GroupNorm(num_groups=32, num_channels=n_in, eps=1e-6, affine=True)
+ self.norm2 = nn.GroupNorm(num_groups=32, num_channels=n_in, eps=1e-6, affine=True)
+ elif norm == "BN":
+ self.norm1 = nn.BatchNorm1d(num_features=n_in, eps=1e-6, affine=True)
+ self.norm2 = nn.BatchNorm1d(num_features=n_in, eps=1e-6, affine=True)
+
+ else:
+ self.norm1 = nn.Identity()
+ self.norm2 = nn.Identity()
+
+ if activation == "relu":
+ self.activation1 = nn.ReLU()
+ self.activation2 = nn.ReLU()
+
+ elif activation == "silu":
+ self.activation1 = nonlinearity()
+ self.activation2 = nonlinearity()
+
+ elif activation == "gelu":
+ self.activation1 = nn.GELU()
+ self.activation2 = nn.GELU()
+
+
+
+ self.conv1 = nn.Conv1d(n_in, n_state, 3, 1, padding, dilation)
+ self.conv2 = nn.Conv1d(n_state, n_in, 1, 1, 0,)
+
+
+ def forward(self, x):
+ x_orig = x
+ if self.norm == "LN":
+ x = self.norm1(x.transpose(-2, -1))
+ x = self.activation1(x.transpose(-2, -1))
+ else:
+ x = self.norm1(x)
+ x = self.activation1(x)
+
+ x = self.conv1(x)
+
+ if self.norm == "LN":
+ x = self.norm2(x.transpose(-2, -1))
+ x = self.activation2(x.transpose(-2, -1))
+ else:
+ x = self.norm2(x)
+ x = self.activation2(x)
+
+ x = self.conv2(x)
+ x = x + x_orig
+ return x
+
+class Resnet1D(nn.Module):
+ def __init__(self, n_in, n_depth, dilation_growth_rate=1, reverse_dilation=True, activation='relu', norm=None):
+ super().__init__()
+
+ blocks = [ResConv1DBlock(n_in, n_in, dilation=dilation_growth_rate ** depth, activation=activation, norm=norm) for depth in range(n_depth)]
+ if reverse_dilation:
+ blocks = blocks[::-1]
+
+ self.model = nn.Sequential(*blocks)
+
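+    # Editor's note: with the depth=3 and dilation_growth_rate=3 defaults passed in from
+    # options/option.py, the blocks above receive dilations (1, 3, 9), reversed to
+    # (9, 3, 1) when reverse_dilation=True. Each ResConv1DBlock uses kernel size 3 with
+    # stride 1 and padding == dilation, so the temporal length is preserved while the
+    # receptive field grows geometrically with depth.
+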
+ def forward(self, x):
+ return self.model(x)
\ No newline at end of file
diff --git a/models/rotation2xyz.py b/models/rotation2xyz.py
new file mode 100755
index 0000000..44f6cb6
--- /dev/null
+++ b/models/rotation2xyz.py
@@ -0,0 +1,92 @@
+# This code is based on https://github.com/Mathux/ACTOR.git
+import torch
+import utils.rotation_conversions as geometry
+
+
+from models.smpl import SMPL, JOINTSTYPE_ROOT
+# from .get_model import JOINTSTYPES
+JOINTSTYPES = ["a2m", "a2mpl", "smpl", "vibe", "vertices"]
+
+
+class Rotation2xyz:
+ def __init__(self, device, dataset='amass'):
+ self.device = device
+ self.dataset = dataset
+ self.smpl_model = SMPL().eval().to(device)
+
+ def __call__(self, x, mask, pose_rep, translation, glob,
+ jointstype, vertstrans, betas=None, beta=0,
+ glob_rot=None, get_rotations_back=False, **kwargs):
+ if pose_rep == "xyz":
+ return x
+
+ if mask is None:
+ mask = torch.ones((x.shape[0], x.shape[-1]), dtype=bool, device=x.device)
+
+ if not glob and glob_rot is None:
+ raise TypeError("You must specify global rotation if glob is False")
+
+ if jointstype not in JOINTSTYPES:
+ raise NotImplementedError("This jointstype is not implemented.")
+
+ if translation:
+ x_translations = x[:, -1, :3]
+ x_rotations = x[:, :-1]
+ else:
+ x_rotations = x
+
+ x_rotations = x_rotations.permute(0, 3, 1, 2)
+ nsamples, time, njoints, feats = x_rotations.shape
+
+ # Compute rotations (convert only masked sequences output)
+ if pose_rep == "rotvec":
+ rotations = geometry.axis_angle_to_matrix(x_rotations[mask])
+ elif pose_rep == "rotmat":
+ rotations = x_rotations[mask].view(-1, njoints, 3, 3)
+ elif pose_rep == "rotquat":
+ rotations = geometry.quaternion_to_matrix(x_rotations[mask])
+ elif pose_rep == "rot6d":
+ rotations = geometry.rotation_6d_to_matrix(x_rotations[mask])
+ else:
+ raise NotImplementedError("No geometry for this one.")
+
+ if not glob:
+ global_orient = torch.tensor(glob_rot, device=x.device)
+ global_orient = geometry.axis_angle_to_matrix(global_orient).view(1, 1, 3, 3)
+ global_orient = global_orient.repeat(len(rotations), 1, 1, 1)
+ else:
+ global_orient = rotations[:, 0]
+ rotations = rotations[:, 1:]
+
+ if betas is None:
+ betas = torch.zeros([rotations.shape[0], self.smpl_model.num_betas],
+ dtype=rotations.dtype, device=rotations.device)
+ betas[:, 1] = beta
+ # import ipdb; ipdb.set_trace()
+ out = self.smpl_model(body_pose=rotations, global_orient=global_orient, betas=betas)
+
+ # get the desirable joints
+ joints = out[jointstype]
+
+ x_xyz = torch.empty(nsamples, time, joints.shape[1], 3, device=x.device, dtype=x.dtype)
+ x_xyz[~mask] = 0
+ x_xyz[mask] = joints
+
+ x_xyz = x_xyz.permute(0, 2, 3, 1).contiguous()
+
+        # put the root joint of the prediction at the origin (per frame)
+ if jointstype != "vertices":
+ rootindex = JOINTSTYPE_ROOT[jointstype]
+ x_xyz = x_xyz - x_xyz[:, [rootindex], :, :]
+
+ if translation and vertstrans:
+            # make the first frame's translation the origin
+ x_translations = x_translations - x_translations[:, :, [0]]
+
+ # add the translation to all the joints
+ x_xyz = x_xyz + x_translations[:, None, :, :]
+
+ if get_rotations_back:
+ return x_xyz, rotations, global_orient
+ else:
+ return x_xyz
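+
+
+# Editor's sketch (illustrative only; requires the SMPL model files configured in
+# utils/config.py):
+#
+#     rot2xyz = Rotation2xyz(device="cuda")
+#     # x: (batch, 25, 6, nframes) for pose_rep="rot6d" -- 24 rotations (global orient +
+#     # 23 body joints) plus one row whose first three channels hold the root translation
+#     joints = rot2xyz(x, mask=None, pose_rep="rot6d", translation=True, glob=True,
+#                      jointstype="smpl", vertstrans=True)
+#     # joints: (batch, 24, 3, nframes)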
diff --git a/models/smpl.py b/models/smpl.py
new file mode 100755
index 0000000..587f541
--- /dev/null
+++ b/models/smpl.py
@@ -0,0 +1,97 @@
+# This code is based on https://github.com/Mathux/ACTOR.git
+import numpy as np
+import torch
+
+import contextlib
+
+from smplx import SMPLLayer as _SMPLLayer
+from smplx.lbs import vertices2joints
+
+
+# action2motion_joints = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 21, 24, 38]
+# change 0 and 8
+action2motion_joints = [8, 1, 2, 3, 4, 5, 6, 7, 0, 9, 10, 11, 12, 13, 14, 21, 24, 38]
+
+from utils.config import SMPL_MODEL_PATH, JOINT_REGRESSOR_TRAIN_EXTRA
+
+JOINTSTYPE_ROOT = {"a2m": 0, # action2motion
+ "smpl": 0,
+ "a2mpl": 0, # set(smpl, a2m)
+                   "vibe": 8}  # root joint is index 8: 'OP MidHip' in JOINT_MAP below
+
+JOINT_MAP = {
+ 'OP Nose': 24, 'OP Neck': 12, 'OP RShoulder': 17,
+ 'OP RElbow': 19, 'OP RWrist': 21, 'OP LShoulder': 16,
+ 'OP LElbow': 18, 'OP LWrist': 20, 'OP MidHip': 0,
+ 'OP RHip': 2, 'OP RKnee': 5, 'OP RAnkle': 8,
+ 'OP LHip': 1, 'OP LKnee': 4, 'OP LAnkle': 7,
+ 'OP REye': 25, 'OP LEye': 26, 'OP REar': 27,
+ 'OP LEar': 28, 'OP LBigToe': 29, 'OP LSmallToe': 30,
+ 'OP LHeel': 31, 'OP RBigToe': 32, 'OP RSmallToe': 33, 'OP RHeel': 34,
+ 'Right Ankle': 8, 'Right Knee': 5, 'Right Hip': 45,
+ 'Left Hip': 46, 'Left Knee': 4, 'Left Ankle': 7,
+ 'Right Wrist': 21, 'Right Elbow': 19, 'Right Shoulder': 17,
+ 'Left Shoulder': 16, 'Left Elbow': 18, 'Left Wrist': 20,
+ 'Neck (LSP)': 47, 'Top of Head (LSP)': 48,
+ 'Pelvis (MPII)': 49, 'Thorax (MPII)': 50,
+ 'Spine (H36M)': 51, 'Jaw (H36M)': 52,
+ 'Head (H36M)': 53, 'Nose': 24, 'Left Eye': 26,
+ 'Right Eye': 25, 'Left Ear': 28, 'Right Ear': 27
+}
+
+JOINT_NAMES = [
+ 'OP Nose', 'OP Neck', 'OP RShoulder',
+ 'OP RElbow', 'OP RWrist', 'OP LShoulder',
+ 'OP LElbow', 'OP LWrist', 'OP MidHip',
+ 'OP RHip', 'OP RKnee', 'OP RAnkle',
+ 'OP LHip', 'OP LKnee', 'OP LAnkle',
+ 'OP REye', 'OP LEye', 'OP REar',
+ 'OP LEar', 'OP LBigToe', 'OP LSmallToe',
+ 'OP LHeel', 'OP RBigToe', 'OP RSmallToe', 'OP RHeel',
+ 'Right Ankle', 'Right Knee', 'Right Hip',
+ 'Left Hip', 'Left Knee', 'Left Ankle',
+ 'Right Wrist', 'Right Elbow', 'Right Shoulder',
+ 'Left Shoulder', 'Left Elbow', 'Left Wrist',
+ 'Neck (LSP)', 'Top of Head (LSP)',
+ 'Pelvis (MPII)', 'Thorax (MPII)',
+ 'Spine (H36M)', 'Jaw (H36M)',
+ 'Head (H36M)', 'Nose', 'Left Eye',
+ 'Right Eye', 'Left Ear', 'Right Ear'
+]
+
+
+# adapted from VIBE/SPIN to output smpl_joints, vibe joints and action2motion joints
+class SMPL(_SMPLLayer):
+ """ Extension of the official SMPL implementation to support more joints """
+
+ def __init__(self, model_path=SMPL_MODEL_PATH, **kwargs):
+ kwargs["model_path"] = model_path
+
+        # silence the warning printed about the 10 shape (beta) parameters
+ with contextlib.redirect_stdout(None):
+ super(SMPL, self).__init__(**kwargs)
+
+ J_regressor_extra = np.load(JOINT_REGRESSOR_TRAIN_EXTRA)
+ self.register_buffer('J_regressor_extra', torch.tensor(J_regressor_extra, dtype=torch.float32))
+ vibe_indexes = np.array([JOINT_MAP[i] for i in JOINT_NAMES])
+ a2m_indexes = vibe_indexes[action2motion_joints]
+ smpl_indexes = np.arange(24)
+ a2mpl_indexes = np.unique(np.r_[smpl_indexes, a2m_indexes])
+
+ self.maps = {"vibe": vibe_indexes,
+ "a2m": a2m_indexes,
+ "smpl": smpl_indexes,
+ "a2mpl": a2mpl_indexes}
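+
+        # Editor's note: `self.maps` stores index sets into the concatenated
+        # [SMPL joints, extra regressed joints] tensor built in forward():
+        #   - "smpl":  the 24 native SMPL joints,
+        #   - "vibe":  the 49 VIBE/SPIN joints (25 OpenPose + 24 extra, via JOINT_MAP),
+        #   - "a2m":   the 18 Action2Motion joints (a reordered subset of "vibe"),
+        #   - "a2mpl": the union of the "smpl" and "a2m" index sets.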
+
+ def forward(self, *args, **kwargs):
+ smpl_output = super(SMPL, self).forward(*args, **kwargs)
+
+ extra_joints = vertices2joints(self.J_regressor_extra, smpl_output.vertices)
+ all_joints = torch.cat([smpl_output.joints, extra_joints], dim=1)
+
+ output = {"vertices": smpl_output.vertices}
+
+        for jointstype, indexes in self.maps.items():
+            output[jointstype] = all_joints[:, indexes]
+
+ return output
\ No newline at end of file
diff --git a/models/vqvae.py b/models/vqvae.py
new file mode 100755
index 0000000..a01a747
--- /dev/null
+++ b/models/vqvae.py
@@ -0,0 +1,134 @@
+# This code is based on https://github.com/Mael-zys/T2M-GPT.git
+import torch.nn as nn
+from models.encdec import Encoder, Decoder
+from models.quantize_cnn import QuantizeEMAReset, Quantizer, QuantizeEMA, QuantizeReset
+
+
+class VQVAE_251(nn.Module):
+ def __init__(self,
+ args,
+ nb_code=1024,
+ code_dim=512,
+ output_emb_width=512,
+ down_t=3,
+ stride_t=2,
+ width=512,
+ depth=3,
+ dilation_growth_rate=3,
+ activation='relu',
+ norm=None):
+
+ super().__init__()
+ self.code_dim = code_dim
+ self.num_code = nb_code
+ self.quant = args.quantizer
+ self.encoder = Encoder(251 if args.dataname == 'kit' else 263, output_emb_width, down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
+ self.decoder = Decoder(251 if args.dataname == 'kit' else 263, output_emb_width, down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
+ if args.quantizer == "ema_reset":
+ self.quantizer = QuantizeEMAReset(nb_code, code_dim, args)
+ elif args.quantizer == "orig":
+ self.quantizer = Quantizer(nb_code, code_dim, 1.0)
+ elif args.quantizer == "ema":
+ self.quantizer = QuantizeEMA(nb_code, code_dim, args)
+ elif args.quantizer == "reset":
+ self.quantizer = QuantizeReset(nb_code, code_dim, args)
+
+
+ def preprocess(self, x):
+ # (bs, T, Jx3) -> (bs, Jx3, T)
+ x = x.permute(0,2,1).float()
+ return x
+
+
+ def postprocess(self, x):
+ # (bs, Jx3, T) -> (bs, T, Jx3)
+ x = x.permute(0,2,1)
+ return x
+
+
+ def encode(self, x):
+ N, T, _ = x.shape
+ x_in = self.preprocess(x)
+ x_encoder = self.encoder(x_in)
+ # import pdb; pdb.set_trace()
+ x_encoder = self.postprocess(x_encoder)
+ x_encoder = x_encoder.contiguous().view(-1, x_encoder.shape[-1]) # (NT, C)
+ code_idx = self.quantizer.quantize(x_encoder)
+ code_idx = code_idx.view(N, -1)
+ return code_idx
+
+
+ def encode_x(self, x):
+ N, T, _ = x.shape
+ x_in = self.preprocess(x)
+ x_encoder = self.encoder(x_in)
+ # import pdb; pdb.set_trace()
+ x_encoder = self.postprocess(x_encoder)
+ x_encoder = x_encoder.contiguous().view(-1, x_encoder.shape[-1]) # (NT, C)
+ return x_encoder # (B*T, 512)
+
+ def forward(self, x):
+
+ x_in = self.preprocess(x)
+ # Encode
+ x_encoder = self.encoder(x_in)
+
+ ## quantization
+ x_quantized, loss, perplexity = self.quantizer(x_encoder)
+
+ ## decoder
+ x_decoder = self.decoder(x_quantized)
+ x_out = self.postprocess(x_decoder)
+ return x_out, loss, perplexity
+
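+    # Editor's note on forward() above (shapes are indicative, assuming the default
+    # down_t=2 and stride_t=2 from options/option.py): a HumanML3D batch of shape
+    # (bs, T, 263) is encoded to (bs, 512, T/4), quantised against the codebook, decoded
+    # back to (bs, T, 263), and returned together with the quantisation loss and the
+    # codebook perplexity.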
+
+ def forward_decoder(self, x):
+ x_d = self.quantizer.dequantize(x)
+ x_d = x_d.view(1, -1, self.code_dim).permute(0, 2, 1).contiguous()
+
+ # decoder
+ x_decoder = self.decoder(x_d)
+ x_out = self.postprocess(x_decoder)
+ return x_out
+
+
+
+class HumanVQVAE(nn.Module):
+ def __init__(self,
+ args,
+ nb_code=512,
+ code_dim=512,
+ output_emb_width=512,
+ down_t=3,
+ stride_t=2,
+ width=512,
+ depth=3,
+ dilation_growth_rate=3,
+ activation='relu',
+ norm=None):
+
+ super().__init__()
+
+ self.nb_joints = 21 if args.dataname == 'kit' else 22
+ self.vqvae = VQVAE_251(args, nb_code, code_dim, output_emb_width, down_t, stride_t, width, depth, dilation_growth_rate, activation=activation, norm=norm)
+
+ def encode(self, x):
+ b, t, c = x.size()
+ quants = self.vqvae.encode(x) # (N, T)
+ return quants
+
+ def encode_x(self, x):
+ b, t, c = x.size()
+        quants = self.vqvae.encode_x(x)  # (B*T, C)
+ return quants
+
+ def forward(self, x):
+
+ x_out, loss, perplexity = self.vqvae(x)
+
+ return x_out, loss, perplexity
+
+ def forward_decoder(self, x):
+ x_out = self.vqvae.forward_decoder(x)
+ return x_out
+
\ No newline at end of file
diff --git a/options/get_eval_option.py b/options/get_eval_option.py
new file mode 100755
index 0000000..d0989ba
--- /dev/null
+++ b/options/get_eval_option.py
@@ -0,0 +1,83 @@
+from argparse import Namespace
+import re
+from os.path import join as pjoin
+
+
+def is_float(numStr):
+ flag = False
+ numStr = str(numStr).strip().lstrip('-').lstrip('+')
+ try:
+ reg = re.compile(r'^[-+]?[0-9]+\.[0-9]+$')
+ res = reg.match(str(numStr))
+ if res:
+ flag = True
+ except Exception as ex:
+ print("is_float() - error: " + str(ex))
+ return flag
+
+
+def is_number(numStr):
+ flag = False
+ numStr = str(numStr).strip().lstrip('-').lstrip('+')
+ if str(numStr).isdigit():
+ flag = True
+ return flag
+
+
+def get_opt(opt_path, device):
+ opt = Namespace()
+ opt_dict = vars(opt)
+
+ skip = ('-------------- End ----------------',
+ '------------ Options -------------',
+ '\n')
+ print('Reading', opt_path)
+ with open(opt_path) as f:
+ for line in f:
+ if line.strip() not in skip:
+ # print(line.strip())
+ key, value = line.strip().split(': ')
+ if value in ('True', 'False'):
+ opt_dict[key] = (value == 'True')
+ # print(key, value)
+ elif is_float(value):
+ opt_dict[key] = float(value)
+ elif is_number(value):
+ opt_dict[key] = int(value)
+ else:
+ opt_dict[key] = str(value)
+
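+    # Editor's note: the file parsed above is expected to follow the "key: value" dump
+    # format of the text-to-motion evaluator options, roughly (hypothetical example):
+    #
+    #     ------------ Options -------------
+    #     dataset_name: t2m
+    #     checkpoints_dir: ./checkpoints
+    #     name: Comp_v6_KLD01
+    #     unit_length: 4
+    #     -------------- End ----------------
+    #
+    # Booleans, floats and ints are converted from their string form; anything else is
+    # kept as a string.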
+ # print(opt)
+ opt_dict['which_epoch'] = 'finest'
+ opt.save_root = pjoin(opt.checkpoints_dir, opt.dataset_name, opt.name)
+ opt.model_dir = pjoin(opt.save_root, 'model')
+ opt.meta_dir = pjoin(opt.save_root, 'meta')
+
+ if opt.dataset_name == 't2m':
+ opt.data_root = './dataset/HumanML3D/'
+ opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs')
+ opt.text_dir = pjoin(opt.data_root, 'texts')
+ opt.joints_num = 22
+ opt.dim_pose = 263
+ opt.max_motion_length = 196
+ opt.max_motion_frame = 196
+ opt.max_motion_token = 55
+ elif opt.dataset_name == 'kit':
+ opt.data_root = './dataset/KIT-ML/'
+ opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs')
+ opt.text_dir = pjoin(opt.data_root, 'texts')
+ opt.joints_num = 21
+ opt.dim_pose = 251
+ opt.max_motion_length = 196
+ opt.max_motion_frame = 196
+ opt.max_motion_token = 55
+ else:
+ raise KeyError('Dataset not recognized')
+
+ opt.dim_word = 300
+ opt.num_classes = 200 // opt.unit_length
+ opt.is_train = False
+ opt.is_continue = False
+ opt.device = device
+
+ return opt
\ No newline at end of file
diff --git a/options/option.py b/options/option.py
new file mode 100755
index 0000000..c7909bb
--- /dev/null
+++ b/options/option.py
@@ -0,0 +1,84 @@
+import argparse
+
+def get_args_parser():
+ parser = argparse.ArgumentParser(description='Optimal Transport AutoEncoder training for Amass',
+ add_help=True,
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ ## dataloader
+ parser.add_argument('--prompt', type=str, default="Generate a textual description corresponding to the given sequence of human motion tokens.", help='task description')
+    parser.add_argument('--input', type=str, help='generation conditions')
+    parser.add_argument('--dataname', type=str, default='t2m', help='dataset name (t2m or kit)')
+ parser.add_argument('--pretrained_llama', type=str, default="13B")
+ parser.add_argument('--out_dir', type=str, default='./out/', help='output directory')
+ parser.add_argument('--vqvae_pth', type=str, default='/comp_robot/lushunlin/MotionGPT/checkpoints/pretrained_vqvae/t2m.pth', help='path to the pretrained vqvae pth')
+ parser.add_argument('--resume_pth', type=str, help='path to saved finetuned model')
+    parser.add_argument('--lora_path', type=str, help='path to finetuned model for evaluation')
+ parser.add_argument('--mlp_path', type=str, help='mlp path')
+ parser.add_argument('--data_dir', type=str, default='./data/', help='dataset directory')
+
+
+ ## lora
+ parser.add_argument('--lora_r', type=int, default=64)
+ parser.add_argument('--lora_alpha', type=int, default=16)
+ parser.add_argument('--lora_dropout', type=float, default=0.05)
+
+ ## llama
+ parser.add_argument('--block_size', type=int, default=512)
+
+ ## train
+ parser.add_argument('--batch_size', type=int, default=256, help='batch size')
+ parser.add_argument('--micro_batch_size', type=int, default=4, help='micro batch size')
+ # parser.add_argument('--learning_rate', type=float, default=3e-3, help='learning rate')
+ parser.add_argument('--learning_rate_lora', type=float, default=3e-3, help='learning rate of lora')
+ parser.add_argument('--learning_rate_mlp', type=float, default=3e-3, help='learning rate of mlp')
+ parser.add_argument('--weight_decay', type=float, default=0.01, help='weight decay')
+ parser.add_argument('--warmup_steps', type=int, default=100, help='warmup steps')
+ parser.add_argument('--eval_interval', type=int, default=100, help='evaluation frequency')
+ parser.add_argument('--save_interval', type=int, default=100, help='model save frequency')
+    parser.add_argument('--eval_iters', type=int, default=100, help='number of evaluation iterations')
+ parser.add_argument('--log_interval', type=int, default=1, help='log frequency')
+
+ ## vqvae
+ parser.add_argument("--code_dim", type=int, default=512, help="embedding dimension")
+ parser.add_argument("--nb_code", type=int, default=512, help="nb of embedding")
+ parser.add_argument("--mu", type=float, default=0.99, help="exponential moving average to update the codebook")
+ parser.add_argument("--down_t", type=int, default=2, help="downsampling rate")
+ parser.add_argument("--stride_t", type=int, default=2, help="stride size")
+ parser.add_argument("--width", type=int, default=512, help="width of the network")
+ parser.add_argument("--depth", type=int, default=3, help="depth of the network")
+ parser.add_argument("--dilation_growth_rate", type=int, default=3, help="dilation growth rate")
+ parser.add_argument("--output_emb_width", type=int, default=512, help="output embedding width")
+    parser.add_argument('--vq_act', type=str, default='relu', choices = ['relu', 'silu', 'gelu'], help='activation function for the VQ-VAE')
+ parser.add_argument('--seed', default=123, type=int, help='seed for initializing vqvae training.')
+ parser.add_argument('--window_size', type=int, default=64, help='training motion length')
+
+ ## quantizer
+ parser.add_argument("--quantizer", type=str, default='ema_reset', choices = ['ema', 'orig', 'ema_reset', 'reset'], help="eps for optimal transport")
+ parser.add_argument('--quantbeta', type=float, default=1.0, help='dataset directory')
+
+ ## visualization
+ parser.add_argument("--render", action='store_true', help='render smpl')
+ parser.add_argument("--motion_vq_token_path", type=str, help='vq token path for motion visualization')
+
+
+ ## for motionx zero shot
+ parser.add_argument('--motionx_zero_shot_path', type=str, help='zero shot motion dataset directory')
+
+ parser.add_argument("--projectionnn", action='store_true', help='MLP projection')
+ parser.add_argument("--diverse", action='store_true', help='diverse description')
+ parser.add_argument("--vinilla", action='store_true', help='vinilla motion')
+
+
+ # for video llava
+    parser.add_argument('--image_tower', type=str, default='LanguageBind/LanguageBind_Image', help='multimodal image tower')
+    parser.add_argument('--video_tower', type=str, default='LanguageBind/LanguageBind_Video_merge', help='multimodal video tower')
+    parser.add_argument('--mm_vision_select_layer', type=int, default=-2, help='which vision tower layer to take features from')
+    parser.add_argument('--mm_projector_type', type=str, default='mlp2x_gelu', help='multimodal projector type')
+    parser.add_argument('--mm_hidden_size', type=int, default=1024, help='feature dimension of the vision tower')
+    parser.add_argument('--hidden_size', type=int, default=4096, help='dimension the multimodal projector maps visual features to')
+
+ # for mvbench save
+    parser.add_argument('--model_type', type=str, default=None, help='model type tag used when saving MVBench results')
+
+ return parser.parse_args()
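+
+# Editor's note: a typical (hypothetical) invocation of a script that calls
+# get_args_parser() might look like
+#
+#     python <your_entry_script>.py --dataname t2m \
+#         --vqvae_pth ./checkpoints/pretrained_vqvae/t2m.pth \
+#         --lora_path ./out/lora.pth --mlp_path ./out/mlp.pth --pretrained_llama 13B
+#
+# Unspecified arguments fall back to the defaults declared above.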
diff --git a/options/option_video.py b/options/option_video.py
new file mode 100755
index 0000000..3770388
--- /dev/null
+++ b/options/option_video.py
@@ -0,0 +1,80 @@
+import argparse
+
+def get_args_parser():
+ parser = argparse.ArgumentParser(description='Optimal Transport AutoEncoder training for Amass',
+ add_help=True,
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ ## dataloader
+ parser.add_argument('--prompt', type=str, default="Generate a textual description corresponding to the given sequence of human motion tokens.", help='task description')
+    parser.add_argument('--input', type=str, help='generation conditions')
+    parser.add_argument('--dataname', type=str, default='t2m', help='dataset name (t2m or kit)')
+ parser.add_argument('--pretrained_llama', type=str, default="13B")
+ parser.add_argument('--out_dir', type=str, default='./out/', help='output directory')
+ parser.add_argument('--vqvae_pth', type=str, default='/comp_robot/lushunlin/MotionGPT/checkpoints/pretrained_vqvae/t2m.pth', help='path to the pretrained vqvae pth')
+ parser.add_argument('--resume_pth', type=str, help='path to saved finetuned model')
+    parser.add_argument('--lora_path', type=str, help='path to finetuned model for evaluation')
+ parser.add_argument('--data_dir', type=str, default='./data/', help='dataset directory')
+
+ ## lora
+ parser.add_argument('--lora_r', type=int, default=64)
+ parser.add_argument('--lora_alpha', type=int, default=16)
+ parser.add_argument('--lora_dropout', type=float, default=0.05)
+
+ ## llama
+ parser.add_argument('--block_size', type=int, default=512)
+
+ ## train
+ parser.add_argument('--batch_size', type=int, default=256, help='batch size')
+ parser.add_argument('--micro_batch_size', type=int, default=4, help='micro batch size')
+ parser.add_argument('--learning_rate', type=float, default=3e-3, help='learning rate')
+ parser.add_argument('--weight_decay', type=float, default=0.01, help='weight decay')
+ parser.add_argument('--warmup_steps', type=int, default=100, help='warmup steps')
+ parser.add_argument('--eval_interval', type=int, default=100, help='evaluation frequency')
+ parser.add_argument('--save_interval', type=int, default=100, help='model save frequency')
+    parser.add_argument('--eval_iters', type=int, default=100, help='number of evaluation iterations')
+ parser.add_argument('--log_interval', type=int, default=1, help='log frequency')
+
+ ## vqvae
+ parser.add_argument("--code_dim", type=int, default=512, help="embedding dimension")
+ parser.add_argument("--nb_code", type=int, default=512, help="nb of embedding")
+ parser.add_argument("--mu", type=float, default=0.99, help="exponential moving average to update the codebook")
+ parser.add_argument("--down_t", type=int, default=2, help="downsampling rate")
+ parser.add_argument("--stride_t", type=int, default=2, help="stride size")
+ parser.add_argument("--width", type=int, default=512, help="width of the network")
+ parser.add_argument("--depth", type=int, default=3, help="depth of the network")
+ parser.add_argument("--dilation_growth_rate", type=int, default=3, help="dilation growth rate")
+ parser.add_argument("--output_emb_width", type=int, default=512, help="output embedding width")
+    parser.add_argument('--vq_act', type=str, default='relu', choices = ['relu', 'silu', 'gelu'], help='activation function for the VQ-VAE')
+ parser.add_argument('--seed', default=123, type=int, help='seed for initializing vqvae training.')
+ parser.add_argument('--window_size', type=int, default=64, help='training motion length')
+
+ ## quantizer
+ parser.add_argument("--quantizer", type=str, default='ema_reset', choices = ['ema', 'orig', 'ema_reset', 'reset'], help="eps for optimal transport")
+ parser.add_argument('--quantbeta', type=float, default=1.0, help='dataset directory')
+
+ ## visualization
+ parser.add_argument("--render", action='store_true', help='render smpl')
+ parser.add_argument("--motion_vq_token_path", type=str, help='vq token path for motion visualization')
+
+
+ ## for motionx zero shot
+ parser.add_argument('--motionx_zero_shot_path', type=str, help='zero shot motion dataset directory')
+
+ parser.add_argument("--projectionnn", action='store_true', help='MLP projection')
+ parser.add_argument("--diverse", action='store_true', help='diverse description')
+ parser.add_argument("--vinilla", action='store_true', help='vinilla motion')
+
+ # subparsers = parser.add_subparsers(help='sub-command help')
+ # model_subparser = subparsers.add_parser('model_config', help='subparser1 help')
+    parser.add_argument('--image_tower', type=str, default='LanguageBind/LanguageBind_Image', help='multimodal image tower')
+    parser.add_argument('--video_tower', type=str, default='LanguageBind/LanguageBind_Video_merge', help='multimodal video tower')
+    parser.add_argument('--mm_vision_select_layer', type=int, default=-2, help='which vision tower layer to take features from')
+    parser.add_argument('--mm_projector_type', type=str, default='mlp2x_gelu', help='multimodal projector type')
+    parser.add_argument('--mm_hidden_size', type=int, default=1024, help='feature dimension of the vision tower')
+    parser.add_argument('--hidden_size', type=int, default=4096, help='dimension the multimodal projector maps visual features to')
+
+    return parser.parse_args()
diff --git a/options/option_video_model.py b/options/option_video_model.py
new file mode 100755
index 0000000..a07892c
--- /dev/null
+++ b/options/option_video_model.py
@@ -0,0 +1,11 @@
+import argparse
+
+def get_args_parser():
+ parser = argparse.ArgumentParser(description='Optimal Transport AutoEncoder training for Amass',
+ add_help=True,
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument('--mm_image_tower', action='store_true', default=True, help='whether to use the multimodal image tower')
+    parser.add_argument('--mm_video_tower', action='store_true', default=True, help='whether to use the multimodal video tower')
+
+ return parser.parse_args()
diff --git a/options/option_vqvae.py b/options/option_vqvae.py
new file mode 100755
index 0000000..244dcdf
--- /dev/null
+++ b/options/option_vqvae.py
@@ -0,0 +1,47 @@
+import argparse
+
+def get_args_parser():
+ parser = argparse.ArgumentParser(description='Optimal Transport AutoEncoder training for Amass',
+ add_help=True,
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ ## dataloader
+    parser.add_argument('--dataname', type=str, default='t2m', help='dataset name (t2m or kit)')
+ parser.add_argument('--out_dir', type=str, default='./out/', help='output directory')
+ parser.add_argument('--resume_pth', type=str, help='path to saved vqvae model')
+ parser.add_argument('--window_size', type=int, default=64, help='training motion length')
+
+ ## train
+ parser.add_argument('--batch_size', type=int, default=256, help='batch size')
+ parser.add_argument('--learning_rate', type=float, default=2e-4, help='learning rate')
+ parser.add_argument('--weight_decay', type=float, default=0.0, help='weight decay')
+ parser.add_argument('--warmup_steps', type=int, default=1000, help='number of total iterations for warmup')
+ parser.add_argument('--total_iter', default=300000, type=int, help='number of total iterations to run')
+ parser.add_argument('--lr', default=2e-4, type=float, help='max learning rate')
+ parser.add_argument('--lr_scheduler', default=[200000], nargs="+", type=int, help="learning rate schedule (iterations)")
+ parser.add_argument('--gamma', default=0.05, type=float, help="learning rate decay")
+ parser.add_argument("--commit", type=float, default=0.02, help="hyper-parameter for the commitment loss")
+ parser.add_argument('--loss_vel', type=float, default=0.5, help='hyper-parameter for the velocity loss')
+ parser.add_argument('--recons_loss', type=str, default='l1_smooth', help='reconstruction loss')
+ parser.add_argument('--print_iter', default=200, type=int, help='print frequency')
+ parser.add_argument('--eval_iter', default=1000, type=int, help='evaluation frequency')
+ parser.add_argument('--seed', default=123, type=int, help='seed for initializing training.')
+
+ ## model
+ parser.add_argument("--code_dim", type=int, default=512, help="embedding dimension")
+ parser.add_argument("--nb_code", type=int, default=512, help="nb of embedding")
+ parser.add_argument("--mu", type=float, default=0.99, help="exponential moving average to update the codebook")
+ parser.add_argument("--down_t", type=int, default=2, help="downsampling rate")
+ parser.add_argument("--stride_t", type=int, default=2, help="stride size")
+ parser.add_argument("--width", type=int, default=512, help="width of the network")
+ parser.add_argument("--depth", type=int, default=3, help="depth of the network")
+ parser.add_argument("--dilation_growth_rate", type=int, default=3, help="dilation growth rate")
+ parser.add_argument("--output_emb_width", type=int, default=512, help="output embedding width")
+    parser.add_argument('--vq_act', type=str, default='relu', choices = ['relu', 'silu', 'gelu'], help='activation function for the VQ-VAE')
+    parser.add_argument('--vq_norm', type=str, default=None, help='normalization layer used in the VQ-VAE (LN, GN, or BN)')
+
+ ## quantizer
+ parser.add_argument("--quantizer", type=str, default='ema_reset', choices = ['ema', 'orig', 'ema_reset', 'reset'], help="eps for optimal transport")
+ parser.add_argument('--beta', type=float, default=1.0, help='commitment loss in standard VQ')
+
+ return parser.parse_args()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..424e68f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,48 @@
+bitsandbytes==0.41.3.post2
+dataloader==2.0
+decord==0.6.0
+deepspeed==0.9.5
+editdistance==0.8.1
+einops==0.8.0
+fastapi==0.111.0
+fire==0.5.0
+flash_attn==2.4.2
+gradio==4.31.5
+huggingface_hub==0.22.2
+imageio==2.13.5
+jsonargparse==4.26.1
+lightning==2.2.0rc0
+lightning_utilities==0.9.0
+matplotlib==3.5.1
+nlg_metricverse==0.9.9
+numpy==1.23.0
+openai==0.28.0
+opencv_python==4.5.5.64
+packaging==21.3
+pandas==1.3.4
+peft==0.8.2
+Pillow==9.0.0
+pycocoevalcap==1.2
+pyrender==0.1.45
+pytorchvideo==0.1.5
+quantize==0.0.4
+ray==2.23.0
+Requests==2.32.2
+scipy==1.13.1
+sentence_transformers==2.2.2
+sentencepiece==0.1.99
+Shapely==2.0.4
+shortuuid==1.0.13
+smplx==0.1.26
+tokenizers==0.13.3
+torch==2.0.0
+torch_xla==2.3.0
+torchvision==0.15.1+cu117
+tqdm==4.66.1
+transformers==4.28.1
+trimesh==3.22.1
+typing_extensions==4.12.0
+uvicorn==0.30.0
+visualize==0.5.1
+xformers==0.0.22
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/video_dataset/prepare_video_dataset_intern_video.py b/scripts/video_dataset/prepare_video_dataset_intern_video.py
new file mode 100755
index 0000000..c60ae3f
--- /dev/null
+++ b/scripts/video_dataset/prepare_video_dataset_intern_video.py
@@ -0,0 +1,155 @@
+"""Implementation derived from https://github.com/tloen/alpaca-lora"""
+import os
+import sys
+from pathlib import Path
+
+# support running without installing as a package
+wd = Path(__file__).parent.parent.resolve()
+sys.path.append(str(wd))
+
+import torch
+import requests
+import json
+from torch.utils.data import random_split
+sys.path.append(os.getcwd())
+from lit_llama.tokenizer import Tokenizer
+from tqdm import tqdm
+import numpy as np
+
+from options import option
+
+IGNORE_INDEX = -1
+
+def prepare(
+ destination_path: Path = Path("./data"),
+ tokenizer_path: Path = Path("./checkpoints/lit-llama/tokenizer.model"),
+ max_seq_length: int = 2560,
+ seed: int = 42,
+ mask_inputs: bool = False, # as in alpaca-lora
+ split: str = "train"
+):
+ """Prepare the Alpaca dataset for instruction tuning.
+ The output is a training and validation dataset saved as `train.pt` and `val.pt`,
+ which stores the preprocessed and tokenized prompts and labels.
+ """
+
+ destination_path.mkdir(parents=True, exist_ok=True)
+
+ file_path = f'/comp_robot/lushunlin/MotionGPT/data/video_dataset/{split}.json'
+
+ # TODO: If we don't have the Meta weights, where do we get the tokenizer from?
+ tokenizer = Tokenizer(tokenizer_path)
+
+ with open(file_path, "r") as file:
+ data = json.load(file)
+ data_set = list(data)
+
+ print(f"{split} set has {len(data_set):,} samples")
+
+ print(f"Processing {split} split ...")
+ data_set_new = []
+ for sample in tqdm(data_set):
+ # try:
+ data_set_new.append(prepare_sample(sample, tokenizer, max_seq_length, mask_inputs))
+ # import pdb; pdb.set_trace()
+
+ data_set = data_set_new
+
+ save_pt = f'/comp_robot/lushunlin/MotionGPT/data/video_dataset/{split}.pt'
+ torch.save(data_set, save_pt)
+
+
+def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool = True):
+ """Processes a single sample.
+ Each sample in the dataset consists of:
+ - instruction: A string describing the task
+ - input: A string holding a special input value for the instruction.
+ This only applies to some samples, and in others this is empty.
+ - output: The response string
+
+ This function processes this data to produce a prompt text and a label for
+ supervised training. The prompt text is formed as a single message including both
+ the instruction and the input. The label/target is the same message but with the
+ response attached.
+
+ Finally, both the prompt and the label get tokenized. If desired, all tokens
+ in the label that correspond to the original input prompt get masked out (default).
+    """
+ # import pdb; pdb.set_trace()
+ # full_prompt = generate_prompt(example)
+ # import pdb; pdb.set_trace()
+ full_prompt = generate_prompt_mlp(example)
+ full_prompt_and_response = full_prompt + example['output']
+ # import pdb; pdb.set_trace()
+ encoded_full_prompt = tokenize(tokenizer, full_prompt, max_length=max_length, eos=False)
+ encoded_full_prompt_and_response = tokenize(tokenizer, full_prompt_and_response, eos=True, max_length=max_length)
+
+ # extendedQA = example['QA'][1:]
+ # for qa_item in extendedQA:
+ # q, a = qa_item["Q"], qa_item["A"]
+ # new_concat = "USER: " + q + "ASSISTANT: " + a
+ # full_prompt_and_response = full_prompt_and_response + new_concat
+ # encoded_new_concat = tokenize(tokenizer, new_concat, eos=True, max_length=max_length)
+ # encoded_full_prompt_and_response = torch.cat((encoded_full_prompt_and_response, encoded_new_concat))
+
+
+ # The labels are the full prompt with response, but with the prompt masked out
+ labels = encoded_full_prompt_and_response.clone()
+ if mask_inputs:
+ labels[:len(encoded_full_prompt)] = IGNORE_INDEX
+
+ # import pdb; pdb.set_trace()
+
+ return {**example, "sys_command": generate_system_command(), "input_ids": encoded_full_prompt_and_response, "input_ids_no_response": encoded_full_prompt, "labels": labels}
+
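+# Editor's note (illustrative): a raw sample is expected to carry at least the fields
+# used by prepare_sample above, e.g. (hypothetical values)
+#
+#     {"instruction": "Describe the action shown in the video.",
+#      "input": "<video identifier or path>",
+#      "output": "A person waves with the right hand."}
+#
+# The returned dict keeps those fields and adds "sys_command", the tokenised
+# "input_ids" / "input_ids_no_response" tensors, and the (optionally masked) "labels".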
+
+def tokenize(tokenizer: Tokenizer, string: str, max_length: int, eos=True) -> torch.Tensor:
+ return tokenizer.encode(string, bos=True, eos=eos, max_length=max_length)
+
+def detokenizer(tokenizer: Tokenizer, tensor: torch.Tensor):
+ '''
+ tokenizer.decode(torch.tensor([13866, 338]))
+ '''
+ return tokenizer.decode(tensor)
+
+
+def generate_prompt_mlp(example):
+ """Generates a standardized message to prompt the model with an instruction, optional input and a
+ 'response' field."""
+ # import pdb; pdb.set_trace()
+ # try:
+ # x = f"A chat between a curious user and an artificial intelligence assistant, paired with an input that provides further context. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['QA'][0]['Q']} INPUT_MOTION_TOKENS: {example['input']}. \nASSISTANT: "
+ # except:
+ # import pdb; pdb.set_trace()
+ if example["input"]:
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant, paired with an input that provides further context. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['instruction']} INPUT_VIDEO: {example['input']}. \nASSISTANT: "
+ )
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['instruction']} ASSISTANT: "
+ )
+
+ # return (
+ # "Below is an instruction that describes a task, paired with an input that provides further context. "
+ # "Write a response that appropriately completes the request.\n\n"
+ # f"### Instruction:\n{example['instruction']}\n\n### Input:\n", "\n\n### Response:"
+ # )
+
+def generate_system_command():
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. "
+ )
+
+
+def main():
+ args = option.get_args_parser()
+ # prepare(split='train')
+ # prepare(split='val')
+ prepare(split='train_intern_human_2M_stage1_caption')
+ prepare(split='val_intern_human_2M_stage1_caption')
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/video_dataset/prepare_video_dataset_video_llava.py b/scripts/video_dataset/prepare_video_dataset_video_llava.py
new file mode 100755
index 0000000..05248e5
--- /dev/null
+++ b/scripts/video_dataset/prepare_video_dataset_video_llava.py
@@ -0,0 +1,178 @@
+"""Implementation derived from https://github.com/tloen/alpaca-lora"""
+import os
+import sys
+from pathlib import Path
+
+# support running without installing as a package
+wd = Path(__file__).parent.parent.resolve()
+sys.path.append(str(wd))
+
+import torch
+import requests
+import json
+from torch.utils.data import random_split
+sys.path.append(os.getcwd())
+from lit_llama.tokenizer import Tokenizer
+from tqdm import tqdm
+import numpy as np
+
+from options import option
+
+IGNORE_INDEX = -1
+
+def prepare(
+ destination_path: Path = Path("./data"),
+ tokenizer_path: Path = Path("./checkpoints/lit-llama/tokenizer.model"),
+ max_seq_length: int = 2560,
+ seed: int = 42,
+ mask_inputs: bool = False, # as in alpaca-lora
+ split: str = "train"
+):
+ """Prepare the Alpaca dataset for instruction tuning.
+ The output is a training and validation dataset saved as `train.pt` and `val.pt`,
+ which stores the preprocessed and tokenized prompts and labels.
+ """
+
+ destination_path.mkdir(parents=True, exist_ok=True)
+
+ file_path = f'/comp_robot/lushunlin/MotionGPT/data/video_dataset/video_llava_{split}.json'
+
+ # TODO: If we don't have the Meta weights, where do we get the tokenizer from?
+ tokenizer = Tokenizer(tokenizer_path)
+
+ with open(file_path, "r") as file:
+ data = json.load(file)
+ data_set = list(data)
+
+ print(f"{split} set has {len(data_set):,} samples")
+
+ print(f"Processing {split} split ...")
+ data_set_new = []
+ for sample in tqdm(data_set):
+ # try:
+ data_set_new.append(prepare_sample(sample, tokenizer, max_seq_length, mask_inputs))
+ # import pdb; pdb.set_trace()
+
+ data_set = data_set_new
+
+ save_pt = f'/comp_robot/lushunlin/MotionGPT/data/video_dataset/video_llava_{split}.pt'
+ torch.save(data_set, save_pt)
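+
+    # Editor's note: the saved .pt file is just a Python list of per-sample dicts (the
+    # output of prepare_sample below); it can later be reloaded with a plain
+    # `torch.load(save_pt)` and wrapped in a Dataset / DataLoader without further parsing.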
+
+
+def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool = True):
+ """Processes a single sample.
+ Each sample in the dataset consists of:
+ - instruction: A string describing the task
+ - input: A string holding a special input value for the instruction.
+ This only applies to some samples, and in others this is empty.
+ - output: The response string
+
+ This function processes this data to produce a prompt text and a label for
+ supervised training. The prompt text is formed as a single message including both
+ the instruction and the input. The label/target is the same message but with the
+ response attached.
+
+ Finally, both the prompt and the label get tokenized. If desired, all tokens
+ in the label that correspond to the original input prompt get masked out (default).
+    """
+ # import pdb; pdb.set_trace()
+ # full_prompt = generate_prompt(example)
+ # import pdb; pdb.set_trace()
+ full_prompt = generate_prompt_mlp(example)
+ full_prompt_and_response = full_prompt + example['output']
+
+ encoded_full_prompt = tokenize(tokenizer, full_prompt, max_length=max_length, eos=False)
+ encoded_full_prompt_and_response = tokenize(tokenizer, full_prompt_and_response, eos=True, max_length=max_length)
+
+ # extendedQA = example['QA'][1:]
+ # for qa_item in extendedQA:
+ # q, a = qa_item["Q"], qa_item["A"]
+ # new_concat = "USER: " + q + "ASSISTANT: " + a
+ # full_prompt_and_response = full_prompt_and_response + new_concat
+ # encoded_new_concat = tokenize(tokenizer, new_concat, eos=True, max_length=max_length)
+ # encoded_full_prompt_and_response = torch.cat((encoded_full_prompt_and_response, encoded_new_concat))
+
+
+ # The labels are the full prompt with response, but with the prompt masked out
+ labels = encoded_full_prompt_and_response.clone()
+ if mask_inputs:
+ labels[:len(encoded_full_prompt)] = IGNORE_INDEX
+
+ # import pdb; pdb.set_trace()
+
+ return {**example, "sys_command": generate_system_command(), "input_ids": encoded_full_prompt_and_response, "input_ids_no_response": encoded_full_prompt, "labels": labels}
+
+
+def tokenize(tokenizer: Tokenizer, string: str, max_length: int, eos=True) -> torch.Tensor:
+ return tokenizer.encode(string, bos=True, eos=eos, max_length=max_length)
+
+def detokenizer(tokenizer: Tokenizer, tensor: torch.Tensor):
+ '''
+ tokenizer.decode(torch.tensor([13866, 338]))
+ '''
+ return tokenizer.decode(tensor)
+
+
+def generate_prompt_mlp(example):
+ """Generates a standardized message to prompt the model with an instruction, optional input and a
+ 'response' field."""
+ # import pdb; pdb.set_trace()
+ # try:
+ # x = f"A chat between a curious user and an artificial intelligence assistant, paired with an input that provides further context. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['QA'][0]['Q']} INPUT_MOTION_TOKENS: {example['input']}. \nASSISTANT: "
+ # except:
+ # import pdb; pdb.set_trace()
+ if example["input"]:
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant, paired with an input that provides further context. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['instruction']} INPUT_VIDEO: {example['input']}. \nASSISTANT: "
+ )
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['instruction']} ASSISTANT: "
+ )
+
+ # return (
+ # "Below is an instruction that describes a task, paired with an input that provides further context. "
+ # "Write a response that appropriately completes the request.\n\n"
+ # f"### Instruction:\n{example['instruction']}\n\n### Input:\n", "\n\n### Response:"
+ # )
+
+def generate_prompt_mlp_mv_bench(example):
+ """Generates a standardized message to prompt the model with an instruction, optional input and a
+ 'response' field."""
+ # import pdb; pdb.set_trace()
+ # try:
+ # x = f"A chat between a curious user and an artificial intelligence assistant, paired with an input that provides further context. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['QA'][0]['Q']} INPUT_MOTION_TOKENS: {example['input']}. \nASSISTANT: "
+ # except:
+ # import pdb; pdb.set_trace()
+ if example["input"]:
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant, paired with an input that provides further context. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['instruction']} INPUT_VIDEO: {example['input']}. \nASSISTANT: "
+ )
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {example['instruction']} ASSISTANT: "
+ )
+
+ # return (
+ # "Below is an instruction that describes a task, paired with an input that provides further context. "
+ # "Write a response that appropriately completes the request.\n\n"
+ # f"### Instruction:\n{example['instruction']}\n\n### Input:\n", "\n\n### Response:"
+ # )
+
+
+def generate_system_command():
+ return (
+ f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. "
+ )
+
+
+def main():
+ args = option.get_args_parser()
+ # prepare(split='train')
+ # prepare(split='val')
+ prepare(split='train_filter_wrong_decord_videos')
+ prepare(split='val_filter_wrong_decord_videos')
+
+
+if __name__ == "__main__":
+ main()